Skip to content

Commit

Permalink
#60 - Implement variable disambiguation component using dkpro similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
maxxkia committed Dec 14, 2017
1 parent 560181f commit 580a0c8
Show file tree
Hide file tree
Showing 14 changed files with 6,227 additions and 0 deletions.
75 changes: 75 additions & 0 deletions ss-variable-detection/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lsr.version>0.8.1</lsr.version>
<jena.version>3.4.0</jena.version>
<dkpro.similarity.version>2.3.0-SNAPSHOT</dkpro.similarity.version>
</properties>

<dependencyManagement>
Expand All @@ -35,6 +36,11 @@
</dependencyManagement>

<dependencies>
<dependency>
<groupId>eu.openminted.uc-tdm-socialsciences</groupId>
<artifactId>ss-common</artifactId>
</dependency>

<!-- OMTD-SHARE annotations -->
<dependency>
<groupId>eu.openminted.share.annotations</groupId>
Expand Down Expand Up @@ -138,6 +144,75 @@
<version>${jena.version}</version>
</dependency>

<!-- DKPro Similarity -->
<!--
NOTE(review): Maven honours scope "import" only for type "pom" dependencies
declared inside <dependencyManagement>; declared here it does not import any
managed versions. Consider moving this entry into <dependencyManagement>.
-->
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-asl</artifactId>
<version>${dkpro.similarity.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-ml-core-gpl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<!-- Declared exactly once: Maven requires groupId:artifactId:type:classifier to be unique. -->
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-algorithms-lexical-asl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-algorithms-lsr-asl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-uima-data-asl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-uima-io-asl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-algorithms-vsm-asl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-uima-core-asl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-ml-io-gpl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-uima-core-gpl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-algorithms-api-asl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.similarity</groupId>
<artifactId>dkpro-similarity-uima-api-asl</artifactId>
<version>${dkpro.similarity.version}</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/**
* Copyright 2012-2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package eu.openminted.uc.socialsciences.similarity.algorithms.ml;

import java.io.File;
import java.util.Map;

import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.dkpro.similarity.uima.resource.JCasTextSimilarityResourceBase;

import eu.openminted.uc.socialsciences.similarity.algorithms.ml.LinearRegressionSimilarityMeasure;

/**
 * Copied from dkpro-similarity project https://github.com/dkpro/dkpro-similarity
 * <p>
 * Original class org.dkpro.similarity.uima.resource.ml.LinearRegressionResource
 * <p>
 * Resource that, on initialization, switches itself to JCas mode and installs a
 * {@link LinearRegressionSimilarityMeasure} built from the configured training
 * ARFF file and log-filter flag.
 */
public class LinearRegressionResource
    extends JCasTextSimilarityResourceBase
{
    /** Name of the parameter that toggles the log filter. */
    public static final String PARAM_LOG_FILTER = "LogFilter";

    /** Name of the parameter that points to the training ARFF file. */
    public static final String PARAM_TRAIN_ARFF = "TRAIN_ARFF";

    @ConfigurationParameter(name = PARAM_LOG_FILTER, mandatory = true, defaultValue = "true")
    private boolean logFilter;

    @ConfigurationParameter(name = PARAM_TRAIN_ARFF, mandatory = true)
    private File trainArff;

    /**
     * Delegates to the superclass and, when that succeeds, creates the similarity measure.
     *
     * @param aSpecifier        resource specifier handed through to the superclass
     * @param aAdditionalParams additional parameters handed through to the superclass
     * @return {@code false} if the superclass initialization returned {@code false},
     *         {@code true} once the measure has been created
     * @throws ResourceInitializationException if setting the mode or constructing the
     *         measure fails; the original exception is attached as the cause
     */
    @SuppressWarnings("unchecked")
    @Override
    public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams)
        throws ResourceInitializationException
    {
        final boolean parentInitialized = super.initialize(aSpecifier, aAdditionalParams);

        if (parentInitialized) {
            try {
                setMode(TextSimilarityResourceMode.jcas);
                measure = new LinearRegressionSimilarityMeasure(trainArff, logFilter);
            }
            catch (Exception e) {
                throw new ResourceInitializationException(e);
            }
        }

        return parentInitialized;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/**
* Copyright 2012-2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package eu.openminted.uc.socialsciences.similarity.algorithms.ml;

import java.io.File;
import java.util.List;

import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.dkpro.similarity.algorithms.api.JCasTextSimilarityMeasureBase;
import org.dkpro.similarity.algorithms.api.SimilarityException;
import org.dkpro.similarity.ml.filters.LogFilter;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.functions.LinearRegression;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;


/**
 * Copied from dkpro-similarity project https://github.com/dkpro/dkpro-similarity
 * <p>
 * Original class org.dkpro.similarity.algorithms.ml.LinearRegressionSimilarityMeasure
 * <p>
 * Trains a copy of the shared {@link #CLASSIFIER} on the instances read from a
 * training ARFF file and scores individual Weka instances through
 * {@link #getSimilarity(Instance)}. The last attribute of every ARFF file read by
 * this class is used as the class attribute.
 * <p>
 * The upstream description of a lookup based on the {@code DocumentID} of the
 * {@code DocumentMetaData} does not apply to this copy: the JCas-based
 * {@code getSimilarity} method is not implemented here and always throws
 * {@link UnsupportedOperationException}.
 */
public class LinearRegressionSimilarityMeasure
    extends JCasTextSimilarityMeasureBase
{
    /** Prototype classifier; every measure trains its own copy of it. */
    public static final Classifier CLASSIFIER = new LinearRegression();

    Classifier filteredClassifier;
    List<String> features;
    boolean useLogFilter;

    /**
     * Reads the training data, optionally log-filters it, and builds the classifier.
     *
     * @param trainArff     ARFF file holding the training instances
     * @param aUseLogFilter whether a {@link LogFilter} is applied to the training data
     *                      and to every instance later read via {@link #getInstance(File)}
     * @throws Exception if reading or filtering the data fails; failures while copying
     *         or building the classifier are wrapped in a {@link SimilarityException}
     */
    public LinearRegressionSimilarityMeasure(File trainArff, boolean aUseLogFilter)
        throws Exception
    {
        useLogFilter = aUseLogFilter;

        Instances train = getTrainInstances(trainArff);
        if (useLogFilter) {
            train = applyLogFilter(train);
        }

        try {
            // Train a private copy so the shared prototype is never modified.
            filteredClassifier = AbstractClassifier.makeCopy(CLASSIFIER);
            filteredClassifier.buildClassifier(train);

            System.out.println(filteredClassifier.toString());
        }
        catch (Exception e) {
            throw new SimilarityException(e);
        }
    }

    /**
     * Reads all instances from an ARFF file and marks the last attribute as the class.
     *
     * @param trainArff ARFF file to read
     * @return the instances with the class index set
     * @throws SimilarityException if the file cannot be read
     */
    private Instances getTrainInstances(File trainArff)
        throws SimilarityException
    {
        Instances data;
        try {
            data = DataSource.read(trainArff.getAbsolutePath());
        }
        catch (Exception e) {
            throw new SimilarityException(e);
        }

        data.setClassIndex(data.numAttributes() - 1);

        return data;
    }

    /**
     * Runs a fresh {@link LogFilter} over the given instances.
     *
     * @param data instances to filter
     * @return the filtered instances
     * @throws Exception if the filter cannot be configured or applied
     */
    private static Instances applyLogFilter(Instances data)
        throws Exception
    {
        Filter logFilter = new LogFilter();
        logFilter.setInputFormat(data);
        return Filter.useFilter(data, logFilter);
    }

    /**
     * Reads the first instance of an ARFF file, log-filtered if this measure was
     * created with the log filter enabled.
     *
     * @param arff ARFF file to read
     * @return the first instance of the file
     * @throws IllegalArgumentException if the file contains no instances
     * @throws Exception if reading or filtering the data fails
     */
    public Instance getInstance(File arff) throws Exception
    {
        Instances instances = getTrainInstances(arff);
        if (useLogFilter) {
            instances = applyLogFilter(instances);
        }

        // Fail with a descriptive message instead of an index error from get(0).
        if (instances.numInstances() == 0) {
            throw new IllegalArgumentException("ARFF file contains no instances: " + arff);
        }
        return instances.get(0);
    }

    /**
     * Scores a single instance with the trained classifier.
     *
     * @param instance instance to classify
     * @return the value returned by the classifier for the instance
     * @throws Exception if classification fails
     */
    public double getSimilarity(Instance instance) throws Exception
    {
        return filteredClassifier.classifyInstance(instance);
    }

    /**
     * Not implemented in this copy.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public double getSimilarity(JCas jcas1, JCas jcas2, Annotation coveringAnnotation1,
            Annotation coveringAnnotation2)
        throws SimilarityException
    {
        throw new UnsupportedOperationException();
    }
}
Loading

0 comments on commit 580a0c8

Please sign in to comment.