From 5e665c640828687f21481e25fcf4c6b24468d476 Mon Sep 17 00:00:00 2001 From: Luke Slater Date: Sat, 13 Aug 2022 16:04:42 +0100 Subject: [PATCH] fix --classify; use old exclusion distribution --- klarigi/build.gradle | 2 +- klarigi/src/main/groovy/klarigi/App.groovy | 14 ++++-- .../src/main/groovy/klarigi/Classifier.groovy | 3 +- .../src/main/groovy/klarigi/Klarigi.groovy | 46 +++++++++++-------- klarigi/src/main/groovy/klarigi/Scorer.groovy | 9 ---- 5 files changed, 39 insertions(+), 35 deletions(-) diff --git a/klarigi/build.gradle b/klarigi/build.gradle index cf2ceda..94c1ead 100644 --- a/klarigi/build.gradle +++ b/klarigi/build.gradle @@ -95,4 +95,4 @@ jacocoTestReport { } } -version = '0.1.1' +version = '0.1.2' diff --git a/klarigi/src/main/groovy/klarigi/App.groovy b/klarigi/src/main/groovy/klarigi/App.groovy index e9fab56..c21c672 100644 --- a/klarigi/src/main/groovy/klarigi/App.groovy +++ b/klarigi/src/main/groovy/klarigi/App.groovy @@ -122,6 +122,12 @@ class App { } } + def includeAll = o['include-all'] + // Otherwise, variables we use to classify may not be scored (and we need their old nExclusion values). Strictly it should probably be on CWV rather than c and re? 
+ if(o['classify']) { + includeAll = true + } + def k = new Klarigi(o, excludeClasses, threads) if(!o['similarity-mode']) { def allExplanations @@ -134,11 +140,11 @@ class App { System.exit(1) } - allExplanations = k.explainClusters(groups, o['scores-only'], o['output-scores'], o['output-type'], threads, o['debug'], o['include-all']) + allExplanations = k.explainClusters(groups, o['scores-only'], o['output-scores'], o['output-type'], threads, o['debug'], includeAll) } else if(o['group'] && o['group'] != '*') { - allExplanations = k.explainClusters([o['group']], o['scores-only'], o['output-scores'], o['output-type'], threads, o['debug'], o['include-all']) + allExplanations = k.explainClusters([o['group']], o['scores-only'], o['output-scores'], o['output-type'], threads, o['debug'], includeAll) } else { - allExplanations = k.explainAllClusters(o['output-scores'], o['scores-only'], o['output-type'], threads, o['debug'], o['include-all']) + allExplanations = k.explainAllClusters(o['output-scores'], o['scores-only'], o['output-type'], threads, o['debug'], includeAll) } if(o['scores-only']) { @@ -170,7 +176,7 @@ class App { k.reclassify(allExplanations, excludeClasses, o['output-classification-scores'], o['ucm'], o['classify-with-variables'], threads) } if(o['classify']) { - k.classify(o['classify'], allExplanations, o['output-classification-scores'], o['ucm'], o['classify-with-variables'], excludeClasses, threads) + k.classify(o['classify'], allExplanations, o['output-classification-scores'], o['ucm'], o['classify-with-variables'], excludeClasses, threads, o) // pass the whole options map: classify reads several options from it when reloading data } } diff --git a/klarigi/src/main/groovy/klarigi/Classifier.groovy b/klarigi/src/main/groovy/klarigi/Classifier.groovy index b80ae96..a9bd39e 100644 --- a/klarigi/src/main/groovy/klarigi/Classifier.groovy +++ b/klarigi/src/main/groovy/klarigi/Classifier.groovy @@ -29,10 +29,11 @@ public class Classifier { //iterate each entity //GParsPool.withPool(threads) { p -> data.associations.each { 
entity, codes -> - // Iterate each group def scores = [:] + // Iterate each group allExplanations.each { exps -> + // Start from 1 scores[exps.cluster] = new Float(1.0) def rs = sterms[exps.cluster].collect { e -> diff --git a/klarigi/src/main/groovy/klarigi/Klarigi.groovy b/klarigi/src/main/groovy/klarigi/Klarigi.groovy index 2ec6439..93123f8 100644 --- a/klarigi/src/main/groovy/klarigi/Klarigi.groovy +++ b/klarigi/src/main/groovy/klarigi/Klarigi.groovy @@ -153,9 +153,10 @@ public class Klarigi { } } - /*groupings.each { k, v -> - println "$k: ${v.size()}" - }*/ + println "Groupings loaded:" + groupings.each { k, v -> + println " $k: ${v.size()} members" + } // kind of stupid but ok def qa = [:] @@ -164,7 +165,9 @@ public class Klarigi { } def allAssociations = qa.keySet().toList() - //println "all associ: ${allAssociations.size()}" + if(verbose) { + println "Loaded ${allAssociations.size()} entity-term associations." + } data = [ groupings: groupings, @@ -400,15 +403,22 @@ public class Klarigi { ucm = false def assoc = [:] + def allAssoc = [] new File(cwf).splitEachLine('\t') { if(!assoc.containsKey(it[1])) { assoc[it[1]] = [] } - assoc[it[1]] << 'http://purl.obolibrary.org/obo/' + it[0].replace(':','_') + def t = 'http://purl.obolibrary.org/obo/' + it[0].replace(':','_') + assoc[it[1]] << t + allAssoc << t } + // We rescore to ensure we have the scores for all of our given classes, and to get the new incEnts if we've reloaded data (per classify) def reScorer = new Scorer(ontoHelper, coefficients, data, excludeClasses, false, threads) - - allExplanations.each { exps -> + def newScores = [] + allExplanations.each { exps -> exps.results[0] = reScorer.scoreClasses(exps.cluster, threads, assoc[exps.cluster], true) + exps.results[0].each { t -> + t.nExclusion = exps.results[2].find { it.iri == t.iri }.nExclusion + } } } @@ -417,7 +427,7 @@ public class Klarigi { RaiseError("Failed to build reclassifier. 
There may have been too few examples.") } - println 'Reclassification:' + println 'Classification performance:' Classifier.Print(m) println '' @@ -426,21 +436,17 @@ public class Klarigi { } } - def classify(path, allExplanations, outClassScores, ucm, cwf, excludeClasses, threads) { - loadData(path) // TODO I know, i know, this is awful state management and design. i'll fix it later - - def m = Classifier.classify(allExplanations, data, ontoHelper, threads, ucm) - if(!m) { - RaiseError("Failed to build classifier. There may have been too few examples.") + def classify(path, allExplanations, outClassScores, ucm, cwf, excludeClasses, threads, o) { + if(o['verbose']) { + println "Loading new dataset at $path in order to classify ..." } - println 'Classification:' - Classifier.Print(m) - println '' + def saveIc = data.ic + loadData(path, o['pp'], o['group'], o['egl'], threads) + data.ic = saveIc + // holding onto ic saves us a bit of time, but this should be looked at again if we decide to involve IC in classify scoring. 
- if(outClassScores) { - Classifier.WriteScores(m, "classify") - } + reclassify(allExplanations, excludeClasses, outClassScores, ucm, cwf, threads) } def genSim(toFile, group) { diff --git a/klarigi/src/main/groovy/klarigi/Scorer.groovy b/klarigi/src/main/groovy/klarigi/Scorer.groovy index a03953d..e2262d6 100644 --- a/klarigi/src/main/groovy/klarigi/Scorer.groovy +++ b/klarigi/src/main/groovy/klarigi/Scorer.groovy @@ -61,13 +61,6 @@ public class Scorer { } toProcess = toProcess.unique(false) - /*if(manToProcess) { - toProcess = manToProcess - }*/ - - //toProcess = toProcess.findAll { c -> data.ic[c] >= this.c.MIN_IC } - - //println "Processing ${toProcess.size()}" toProcess.each { iri -> ass[iri] = [:] @@ -85,7 +78,6 @@ public class Scorer { def i = 0 GParsPool.withPool(threads) { p -> data.associations.eachParallel { e, terms -> - //println "${++i}" terms.each { t, v -> scMap[t].each { dt -> data.egroups[e].each { g -> @@ -99,7 +91,6 @@ public class Scorer { def z = 0 GParsPool.withPool(threads) { p -> data.groupings.eachParallel { cid, v -> - //println "${++z}" toProcess.each { iri -> ass[iri][cid].inc = ass[iri][cid].incEnts.size() ass[iri][cid].exc = data.groupings[cid].size() - ass[iri][cid].inc