-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#60 - Implement variable disambiguation component using dkpro similarity
- Loading branch information
Showing
14 changed files
with
6,227 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
...va/eu/openminted/uc/socialsciences/similarity/algorithms/ml/LinearRegressionResource.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/** | ||
* Copyright 2012-2016 | ||
* Ubiquitous Knowledge Processing (UKP) Lab | ||
* Technische Universität Darmstadt | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see http://www.gnu.org/licenses/. | ||
*/ | ||
package eu.openminted.uc.socialsciences.similarity.algorithms.ml; | ||
|
||
import java.io.File; | ||
import java.util.Map; | ||
|
||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.resource.ResourceInitializationException; | ||
import org.apache.uima.resource.ResourceSpecifier; | ||
import org.dkpro.similarity.uima.resource.JCasTextSimilarityResourceBase; | ||
|
||
import eu.openminted.uc.socialsciences.similarity.algorithms.ml.LinearRegressionSimilarityMeasure; | ||
|
||
/** | ||
* Copied from dkpro-similarity project https://github.com/dkpro/dkpro-similarity | ||
* | ||
* Original class org.dkpro.similarity.uima.resource.ml.LinearRegressionResource | ||
*/ | ||
public class LinearRegressionResource | ||
extends JCasTextSimilarityResourceBase | ||
{ | ||
public static final String PARAM_LOG_FILTER = "LogFilter"; | ||
@ConfigurationParameter(name=PARAM_LOG_FILTER, mandatory=true, defaultValue="true") | ||
private boolean logFilter; | ||
|
||
public static final String PARAM_TRAIN_ARFF = "TRAIN_ARFF"; | ||
@ConfigurationParameter(name=PARAM_TRAIN_ARFF, mandatory=true) | ||
private File trainArff; | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams) | ||
throws ResourceInitializationException | ||
{ | ||
if (!super.initialize(aSpecifier, aAdditionalParams)) { | ||
return false; | ||
} | ||
|
||
try { | ||
this.setMode(TextSimilarityResourceMode.jcas); | ||
measure = new LinearRegressionSimilarityMeasure(trainArff, logFilter); | ||
} | ||
catch (Exception e) { | ||
throw new ResourceInitializationException(e); | ||
} | ||
|
||
return true; | ||
} | ||
} |
135 changes: 135 additions & 0 deletions
135
...nminted/uc/socialsciences/similarity/algorithms/ml/LinearRegressionSimilarityMeasure.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
/** | ||
* Copyright 2012-2016 | ||
* Ubiquitous Knowledge Processing (UKP) Lab | ||
* Technische Universität Darmstadt | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see http://www.gnu.org/licenses/. | ||
*/ | ||
package eu.openminted.uc.socialsciences.similarity.algorithms.ml; | ||
|
||
import java.io.File; | ||
import java.util.List; | ||
|
||
import org.apache.uima.jcas.JCas; | ||
import org.apache.uima.jcas.tcas.Annotation; | ||
import org.dkpro.similarity.algorithms.api.JCasTextSimilarityMeasureBase; | ||
import org.dkpro.similarity.algorithms.api.SimilarityException; | ||
import org.dkpro.similarity.ml.filters.LogFilter; | ||
|
||
import weka.classifiers.AbstractClassifier; | ||
import weka.classifiers.Classifier; | ||
import weka.classifiers.functions.LinearRegression; | ||
import weka.core.Instance; | ||
import weka.core.Instances; | ||
import weka.core.converters.ConverterUtils.DataSource; | ||
import weka.filters.Filter; | ||
|
||
|
||
/** | ||
* | ||
* Copied from dkpro-similarity project https://github.com/dkpro/dkpro-similarity | ||
* | ||
* Original class org.dkpro.similarity.algorithms.ml.LinearRegressionSimilarityMeasure | ||
* | ||
* | ||
* Runs a linear regression classifier on the provided test data on a model | ||
* that is trained on the given training data. Mind that the | ||
* {@link #getSimilarity(JCas,JCas) getSimilarity} method | ||
* classifies the input texts by their ID, not their textual contents. The | ||
* <pre>DocumentID</pre> of the <pre>DocumentMetaData</pre> is expected to denote | ||
* the corresponding input line in the test data. | ||
*/ | ||
public class LinearRegressionSimilarityMeasure | ||
extends JCasTextSimilarityMeasureBase | ||
{ | ||
public static final Classifier CLASSIFIER = new LinearRegression(); | ||
|
||
Classifier filteredClassifier; | ||
List<String> features; | ||
boolean useLogFilter; | ||
|
||
public LinearRegressionSimilarityMeasure(File trainArff, boolean aUseLogFilter) | ||
throws Exception | ||
{ | ||
// Get all instances | ||
Instances train = getTrainInstances(trainArff); | ||
useLogFilter = aUseLogFilter; | ||
|
||
// Apply log filter | ||
if (useLogFilter) | ||
{ | ||
Filter logFilter = new LogFilter(); | ||
logFilter.setInputFormat(train); | ||
train = Filter.useFilter(train, logFilter); | ||
} | ||
|
||
Classifier clsCopy; | ||
try { | ||
// Copy the classifier | ||
clsCopy = AbstractClassifier.makeCopy(CLASSIFIER); | ||
|
||
// Build the classifier | ||
filteredClassifier = clsCopy; | ||
filteredClassifier.buildClassifier(train); | ||
|
||
System.out.println(filteredClassifier.toString()); | ||
} | ||
catch (Exception e) { | ||
throw new SimilarityException(e); | ||
} | ||
} | ||
|
||
private Instances getTrainInstances(File trainArff) | ||
throws SimilarityException | ||
{ | ||
// Read with Weka | ||
Instances data; | ||
try { | ||
data = DataSource.read(trainArff.getAbsolutePath()); | ||
} | ||
catch (Exception e) { | ||
throw new SimilarityException(e); | ||
} | ||
|
||
// Set the index of the class attribute | ||
data.setClassIndex(data.numAttributes() - 1); | ||
|
||
return data; | ||
} | ||
|
||
public Instance getInstance(File arff) throws Exception | ||
{ | ||
Instances instances = getTrainInstances(arff); | ||
if (useLogFilter) | ||
{ | ||
Filter logFilter = new LogFilter(); | ||
logFilter.setInputFormat(instances); | ||
instances = Filter.useFilter(instances, logFilter); | ||
} | ||
return instances.get(0); | ||
} | ||
|
||
public double getSimilarity(Instance instance) throws Exception | ||
{ | ||
return filteredClassifier.classifyInstance(instance); | ||
} | ||
|
||
@Override | ||
public double getSimilarity(JCas jcas1, JCas jcas2, Annotation coveringAnnotation1, | ||
Annotation coveringAnnotation2) | ||
throws SimilarityException | ||
{ | ||
throw new UnsupportedOperationException(); | ||
} | ||
} |
Oops, something went wrong.