From 5e665c640828687f21481e25fcf4c6b24468d476 Mon Sep 17 00:00:00 2001
From: Luke Slater <tinmachin3@gmail.com>
Date: Sat, 13 Aug 2022 16:04:42 +0100
Subject: [PATCH] fix --classify; use old exclusion distribution

---
 klarigi/build.gradle                          |  2 +-
 klarigi/src/main/groovy/klarigi/App.groovy    | 14 ++++--
 .../src/main/groovy/klarigi/Classifier.groovy |  3 +-
 .../src/main/groovy/klarigi/Klarigi.groovy    | 46 +++++++++++--------
 klarigi/src/main/groovy/klarigi/Scorer.groovy |  9 ----
 5 files changed, 39 insertions(+), 35 deletions(-)

diff --git a/klarigi/build.gradle b/klarigi/build.gradle
index cf2ceda..94c1ead 100644
--- a/klarigi/build.gradle
+++ b/klarigi/build.gradle
@@ -95,4 +95,4 @@ jacocoTestReport {
   }
 }
 
-version = '0.1.1'
+version = '0.1.2'
diff --git a/klarigi/src/main/groovy/klarigi/App.groovy b/klarigi/src/main/groovy/klarigi/App.groovy
index e9fab56..c21c672 100644
--- a/klarigi/src/main/groovy/klarigi/App.groovy
+++ b/klarigi/src/main/groovy/klarigi/App.groovy
@@ -122,6 +122,12 @@ class App {
       }
     }
 
+    def includeAll = o['include-all']
+    // Force include-all when classifying: otherwise the variables we classify with may not be scored (and we need their old nExclusion values). Strictly, this should probably key off classify-with-variables (CWV) rather than classify/reclassify?
+    if(o['classify']) {
+      includeAll = true
+    }
+
     def k = new Klarigi(o, excludeClasses, threads)
     if(!o['similarity-mode']) {
       def allExplanations 
@@ -134,11 +140,11 @@ class App {
           System.exit(1)
         }
 
-        allExplanations = k.explainClusters(groups, o['scores-only'], o['output-scores'], o['output-type'], threads, o['debug'], o['include-all'])
+        allExplanations = k.explainClusters(groups, o['scores-only'], o['output-scores'], o['output-type'], threads, o['debug'], includeAll)
       } else if(o['group'] && o['group'] != '*') {
-        allExplanations = k.explainClusters([o['group']], o['scores-only'], o['output-scores'], o['output-type'], threads, o['debug'], o['include-all'])
+        allExplanations = k.explainClusters([o['group']], o['scores-only'], o['output-scores'], o['output-type'], threads, o['debug'], includeAll)
       } else {
-        allExplanations = k.explainAllClusters(o['output-scores'], o['scores-only'], o['output-type'], threads, o['debug'], o['include-all'])
+        allExplanations = k.explainAllClusters(o['output-scores'], o['scores-only'], o['output-type'], threads, o['debug'], includeAll)
       }
 
       if(o['scores-only']) {
@@ -170,7 +176,7 @@ class App {
           k.reclassify(allExplanations, excludeClasses, o['output-classification-scores'], o['ucm'], o['classify-with-variables'], threads)
         }
         if(o['classify']) {
-          k.classify(o['classify'], allExplanations, o['output-classification-scores'], o['ucm'], o['classify-with-variables'], excludeClasses, threads)
+          k.classify(o['classify'], allExplanations, o['output-classification-scores'], o['ucm'], o['classify-with-variables'], excludeClasses, threads, o) // pass the full option map: classify reads several options from it when reloading the data
         }
       }
 
diff --git a/klarigi/src/main/groovy/klarigi/Classifier.groovy b/klarigi/src/main/groovy/klarigi/Classifier.groovy
index b80ae96..a9bd39e 100644
--- a/klarigi/src/main/groovy/klarigi/Classifier.groovy
+++ b/klarigi/src/main/groovy/klarigi/Classifier.groovy
@@ -29,10 +29,11 @@ public class Classifier {
     //iterate each entity
     //GParsPool.withPool(threads) { p ->
     data.associations.each { entity, codes ->
-      // Iterate each group
       def scores = [:]
       
+      // Iterate each group
       allExplanations.each { exps ->
+        // Start from 1
         scores[exps.cluster] = new Float(1.0)
 
         def rs = sterms[exps.cluster].collect { e ->
diff --git a/klarigi/src/main/groovy/klarigi/Klarigi.groovy b/klarigi/src/main/groovy/klarigi/Klarigi.groovy
index 2ec6439..93123f8 100644
--- a/klarigi/src/main/groovy/klarigi/Klarigi.groovy
+++ b/klarigi/src/main/groovy/klarigi/Klarigi.groovy
@@ -153,9 +153,10 @@ public class Klarigi {
       }
     }
 
-    /*groupings.each { k, v ->
-      println "$k: ${v.size()}" 
-    }*/
+    println "Groupings loaded:"
+    groupings.each { k, v ->
+      println "  $k: ${v.size()} members" 
+    }
 
     // kind of stupid but ok 
     def qa = [:]
@@ -164,7 +165,9 @@ public class Klarigi {
     }
     def allAssociations = qa.keySet().toList()
 
-    //println "all associ: ${allAssociations.size()}"
+    if(verbose) {
+      println "Loaded ${allAssociations.size()} entity-term associations."
+    }
 
     data = [
       groupings: groupings,
@@ -400,15 +403,22 @@ public class Klarigi {
       ucm = false
 
       def assoc = [:]
+      def allAssoc = []
       new File(cwf).splitEachLine('\t') {
         if(!assoc.containsKey(it[1])) { assoc[it[1]] = [] }
-        assoc[it[1]] << 'http://purl.obolibrary.org/obo/' + it[0].replace(':','_')
+        def t = 'http://purl.obolibrary.org/obo/' + it[0].replace(':','_')
+        assoc[it[1]] << t
+        allAssoc << t 
       }
 
+      // We rescore to ensure we have the scores for all of our given classes, and to get the new incEnts if we've reloaded data (per classify)
       def reScorer = new Scorer(ontoHelper, coefficients, data, excludeClasses, false, threads)
-
-       allExplanations.each { exps ->
+      def newScores = []
+      allExplanations.each { exps ->
          exps.results[0] = reScorer.scoreClasses(exps.cluster, threads, assoc[exps.cluster], true)
+         exps.results[0].each { t ->
+          t.nExclusion = exps.results[2].find { it.iri == t.iri }.nExclusion
+         }
       }
     }
 
@@ -417,7 +427,7 @@ public class Klarigi {
       RaiseError("Failed to build reclassifier. There may have been too few examples.")
     }
 
-    println 'Reclassification:'
+    println 'Classification performance:'
     Classifier.Print(m)
     println ''
 
@@ -426,21 +436,17 @@ public class Klarigi {
     }
   }
 
-  def classify(path, allExplanations, outClassScores, ucm, cwf, excludeClasses, threads) {
-    loadData(path) // TODO I know, i know, this is awful state management and design. i'll fix it later
-
-    def m = Classifier.classify(allExplanations, data, ontoHelper, threads, ucm)
-    if(!m) {
-      RaiseError("Failed to build classifier. There may have been too few examples.")
+  def classify(path, allExplanations, outClassScores, ucm, cwf, excludeClasses, threads, o) {
+    if(o['verbose']) {
+      println "Loading new dataset at $path in order to classify ..."
     }
 
-    println 'Classification:'
-    Classifier.Print(m)
-    println ''
+    def saveIc = data.ic
+    loadData(path, o['pp'], o['group'], o['egl'], threads)
+    data.ic = saveIc 
+    // Holding onto ic saves us a bit of time, but this should be looked at again if we decide to involve IC in classify scoring.
 
-    if(outClassScores) {
-      Classifier.WriteScores(m, "classify")
-    }
+    reclassify(allExplanations, excludeClasses, outClassScores, ucm, cwf, threads)
   }
 
   def genSim(toFile, group) {
diff --git a/klarigi/src/main/groovy/klarigi/Scorer.groovy b/klarigi/src/main/groovy/klarigi/Scorer.groovy
index a03953d..e2262d6 100644
--- a/klarigi/src/main/groovy/klarigi/Scorer.groovy
+++ b/klarigi/src/main/groovy/klarigi/Scorer.groovy
@@ -61,13 +61,6 @@ public class Scorer {
     }
 
     toProcess = toProcess.unique(false)
-    /*if(manToProcess) {
-      toProcess = manToProcess
-    }*/
-
-    //toProcess = toProcess.findAll { c -> data.ic[c] >= this.c.MIN_IC } 
-
-    //println "Processing ${toProcess.size()}"
 
     toProcess.each { iri ->
       ass[iri] = [:]
@@ -85,7 +78,6 @@ public class Scorer {
     def i = 0
     GParsPool.withPool(threads) { p ->
     data.associations.eachParallel { e, terms ->
-      //println "${++i}"
       terms.each { t, v ->
         scMap[t].each { dt ->
           data.egroups[e].each { g ->
@@ -99,7 +91,6 @@ public class Scorer {
     def z = 0
     GParsPool.withPool(threads) { p ->
     data.groupings.eachParallel { cid, v ->
-      //println "${++z}"
       toProcess.each { iri ->
         ass[iri][cid].inc = ass[iri][cid].incEnts.size()
         ass[iri][cid].exc = data.groupings[cid].size() - ass[iri][cid].inc