From 400b4df22f0443831629dca6e9545e2775102012 Mon Sep 17 00:00:00 2001
From: arangrhie <arrhie@gmail.com>
Date: Fri, 25 Jun 2021 10:39:59 -0400
Subject: [PATCH] change -vmer to -filter and -polish, make -seqmer optional

---
 src/merfin/merfin-globals.C  |  61 ++++++++++++----
 src/merfin/merfin-globals.H  |  19 ++---
 src/merfin/merfin-variants.C |   6 +-
 src/merfin/merfin.C          | 130 ++++++++++++++++++++---------------
 src/merfin/varMer.C          |  63 +++++++++--------
 src/merfin/varMer.H          |   2 +-
 6 files changed, 172 insertions(+), 109 deletions(-)

diff --git a/src/merfin/merfin-globals.C b/src/merfin/merfin-globals.C
index d0dd0b8..acc18de 100644
--- a/src/merfin/merfin-globals.C
+++ b/src/merfin/merfin-globals.C
@@ -15,7 +15,7 @@
 
 #include "merfin-globals.H"
 #include "strings.H"
-
+#include <libgen.h>
 
 //  Read probabilities lookup table for 1-4 copy kmers.
 void
@@ -24,10 +24,10 @@ merfinGlobal::load_Kmetric(void) {
   if (pLookupTable == nullptr)    //  No input supplied, no lookup table to make.
     return;
 
-  fprintf(stderr, "-- Loading copy-number lookup table '%s'.\n\n", pLookupTable);
+  fprintf(stderr, "-- Loading probability table '%s'.\n\n", pLookupTable);
 
   if (fileExists(pLookupTable) == false) {
-    fprintf(stderr, "ERROR: Lookup table (-lookup) file '%s' doesn't exist!\n", pLookupTable);
+    fprintf(stderr, "ERROR: Probability table (-prob) file '%s' doesn't exist!\n", pLookupTable);
     exit(1);
   }
 
@@ -119,20 +119,26 @@ merfinGlobal::load_Kmers(void) {
     return;
   }
 
-  merylFileReader*  readDB     = new merylFileReader(readDBname);
-  merylFileReader*  asmDB      = new merylFileReader(seqDBname);
-
   double            minMem, minMemTotal = 0;
   double            optMem, optMemTotal = 0;
   bool              useOpt = false;
   bool              useMin = false;
 
+  //  Make readDB first so we know the k size
+  merylFileReader*  readDB     = new merylFileReader(readDBname);
+  
+  //  Open sequence and build seqDBname if not provided
+  load_Sequence();
+
+  fprintf(stderr, "-- Estimating required space for loading '%s'\n", readDBname);
   readLookup = new merylExactLookup();
   readLookup->estimateMemoryUsage(readDB, maxMemory, minMem, optMem, minV, maxV);
   minMemTotal += minMem;
   optMemTotal += optMem;
 
- asmLookup  = new merylExactLookup();
+  fprintf(stderr, "-- Estimating required space for loading '%s'\n", seqDBname);
+  merylFileReader*  asmDB      = new merylFileReader(seqDBname);
+  asmLookup  = new merylExactLookup();
   asmLookup->estimateMemoryUsage(asmDB, maxMemory, minMem, optMem, minV, maxV);
   minMemTotal += minMem;
   optMemTotal += optMem;
@@ -166,27 +172,56 @@ merfinGlobal::load_Kmers(void) {
   delete asmDB;
 }
 
-
-
-
 void
-merfinGlobal::open_Inputs(void) {
+merfinGlobal::load_Sequence(void) {
 
   if (reportType == OP_COMPL) {
     return;
   }
 
-  //  Open input sequences.
+  if (seqDBname == nullptr) {
+    seqDBname = new char[FILENAME_MAX+1];
+    snprintf(seqDBname, FILENAME_MAX, "%s.meryl", basename(seqName));
+    fprintf(stderr, "-- No -seqmer given. Build sequence db as '%s'.\n", seqDBname);
+
+    //  TODO: Replace this hacky counting when meryl has proper APIs
+    char merylcount[FILENAME_MAX+1];
+    char merylpath[FILENAME_MAX+1];
+
+    if (strcmp(execName, "merfin") == 0)
+      snprintf(merylpath, FILENAME_MAX, "meryl");
+    else
+      snprintf(merylpath, FILENAME_MAX, "%s/meryl", dirname(execName));
+
+    snprintf(merylcount, FILENAME_MAX, "%s count k=%d memory=%.3f %s output %s", merylpath, kmer::merSize(), maxMemory, seqName, seqDBname);
 
+    fprintf(stderr, "%s\n\n", merylcount);
+    system(merylcount);
+    fprintf(stderr, "\n");
+
+  }
+
+  //  Open input sequence.
   if (seqName != nullptr) {
     fprintf(stderr, "-- Opening sequences in '%s'.\n", seqName);
     seqFile = new dnaSeqFile(seqName);
+  } 
+
+}
+
+
+
+void
+merfinGlobal::open_Inputs(void) {
+
+  if (reportType == OP_COMPL) {
+    return;
   }
 
   //  Open VCF input.  This is only needed for reportType OP_VAR_MER.
   //  main() checks that we have a vcfName.
 
-  if (reportType == OP_VAR_MER) {
+  if (reportType == OP_FILTER || reportType == OP_POLISH) {
     fprintf(stderr, "-- Opening vcf file '%s'.\n", vcfName);
     inVcf = new vcfFile(vcfName);
 
diff --git a/src/merfin/merfin-globals.H b/src/merfin/merfin-globals.H
index 35c7cea..3d64423 100644
--- a/src/merfin/merfin-globals.H
+++ b/src/merfin/merfin-globals.H
@@ -29,10 +29,10 @@
 
 #define OP_NONE       0
 #define OP_HIST       1
-#define OP_DUMP       2
-#define OP_VAR_MER    3
-#define OP_COMPL      4
-
+#define OP_COMPL      2
+#define OP_DUMP       3
+#define OP_FILTER     4
+#define OP_POLISH     5
 
 
 ////////////////////////////////////////
@@ -129,6 +129,10 @@ public:
 //
 class merfinGlobal {
 public:
+  merfinGlobal(char *const execname) {
+    execName = execname;
+  }
+
   ~merfinGlobal() {
     delete dumpOutFile;
     delete oVCF;
@@ -145,6 +149,7 @@ public:
 
 public:
   void      load_Kmetric(void);
+  void      load_Sequence(void);
   void      load_Kmers(void);
   void      open_Inputs(void);
 
@@ -171,9 +176,6 @@ public:
   compressedFileWriter   *oVCF         = nullptr;
 
 
-
-
-
   //  Output data for histogram and data modes.  Data mode only uses histKasm
   //  and histKmissing.
 public:
@@ -234,8 +236,9 @@ public:
 public:
   double                  peak          = 0;
   bool                    nosplit       = false;
-  bool                    bykstar       = true;
   uint32                  comb          = 15;
+
+  char                   *execName      = nullptr;
 };
 
 
diff --git a/src/merfin/merfin-variants.C b/src/merfin/merfin-variants.C
index 2a1393c..d37aaa7 100644
--- a/src/merfin/merfin-variants.C
+++ b/src/merfin/merfin-variants.C
@@ -278,14 +278,14 @@ processVariants(void *G, void *T, void *S) {
     //  Generate output VCFs.
 
     // Experimental: output vcf according to k*
-    if (g->bykstar) {
+    if (g->reportType == OP_POLISH) {
       //fprintf(t->oVcf->file(), "%s", seqMer->bestVariant().c_str());
       s->result += seqMer->bestVariant();
     }
 
     // Filter vcf and print as it was in the original vcf, conservatively
     else {
-      vector<vcfRecord*> records = seqMer->bestVariantOriginalVCF();
+      vector<vcfRecord*> records = seqMer->bestFilter();
 
       for (uint64 i = 0; i < records.size(); i++)
         //records[i]->save(t->oVcf);
@@ -316,7 +316,7 @@ outputVariants(void *G, void *S) {
   if (g->oVCF == nullptr) {
     char  name[FILENAME_MAX+1];
 
-		if (g->bykstar)
+		if (g->reportType == OP_POLISH)
 	    snprintf(name, FILENAME_MAX, "%s.polish.vcf", g->outName);
 		else
 	    snprintf(name, FILENAME_MAX, "%s.filter.vcf", g->outName);
diff --git a/src/merfin/merfin.C b/src/merfin/merfin.C
index eae0369..645c009 100644
--- a/src/merfin/merfin.C
+++ b/src/merfin/merfin.C
@@ -71,7 +71,7 @@ void computeCompleteness(merfinGlobal *G);
 
 int
 main(int32 argc, char **argv) {
-  merfinGlobal  *G = new merfinGlobal;
+  merfinGlobal  *G = new merfinGlobal(argv[0]);
 
   std::vector<const char *>  err;
   for (int32 arg=1; arg < argc; arg++) {
@@ -87,7 +87,7 @@ main(int32 argc, char **argv) {
     } else if (strcmp(argv[arg], "-peak") == 0) {
       G->peak = strtodouble(argv[++arg]);
 
-    } else if (strcmp(argv[arg], "-lookup") == 0) {
+    } else if (strcmp(argv[arg], "-prob") == 0) {
       G->pLookupTable = argv[++arg];
 
     } else if (strcmp(argv[arg], "-vcf") == 0) {
@@ -111,8 +111,11 @@ main(int32 argc, char **argv) {
     } else if (strcmp(argv[arg], "-nosplit") == 0) {
       G->nosplit = true;
 
-    } else if (strcmp(argv[arg], "-disable-kstar") == 0) {
-      G->bykstar = false;
+    } else if (strcmp(argv[arg], "-filter") == 0) {
+      G->reportType = OP_FILTER;
+
+    } else if (strcmp(argv[arg], "-polish") == 0) {
+      G->reportType = OP_POLISH;
 
     } else if (strcmp(argv[arg], "-hist") == 0) {
       G->reportType = OP_HIST;
@@ -123,9 +126,6 @@ main(int32 argc, char **argv) {
     } else if (strcmp(argv[arg], "-skipMissing") == 0) {
       G->skipMissing = true;
 
-    } else if (strcmp(argv[arg], "-vmer") == 0) {
-      G->reportType = OP_VAR_MER;
-
     } else if (strcmp(argv[arg], "-completeness") == 0) {
       G->reportType = OP_COMPL;
 
@@ -144,34 +144,35 @@ main(int32 argc, char **argv) {
 
   //  Check inputs are present for the various modes.
 
-  if ((G->reportType == OP_HIST) ||
-      (G->reportType == OP_DUMP) ||
-      (G->reportType == OP_VAR_MER)) {
+  if ((G->reportType == OP_HIST)   ||
+      (G->reportType == OP_DUMP)   ||
+      (G->reportType == OP_POLISH) ||
+      (G->reportType == OP_FILTER)) {
     if (G->seqName == nullptr)   err.push_back("No input sequences (-sequence) supplied.\n");
     if (G->outName == nullptr)   err.push_back("No output (-output) supplied.\n");
   }
 
-  if  (G->reportType == OP_VAR_MER) {
-    if (G->vcfName == nullptr)   err.push_back("No variant call input (-vcf) supplied; mandatory for -vmer reports.\n");
+  if  (G->reportType == OP_POLISH || OP_FILTER) {
+    if (G->vcfName == nullptr)   err.push_back("No variant call input (-vcf) supplied; mandatory for -filter or -polish.\n");
+  }
+
+  if (G->reportType != OP_FILTER) {
+    if (G->pLookupTable == nullptr && G->peak == 0)  err.push_back("No probability vector (-prob) nor haploid peak (-peak) supplied.\n");
   }
 
   if  (G->reportType == OP_NONE) {
-    err.push_back("No report type (-hist, -dump, -vmer, -completeness) supplied.\n");
+    err.push_back("No report type (-filter, -polish, -hist, -dump, -completeness) supplied.\n");
   }
 
-  if (G->seqDBname == nullptr)   err.push_back("No sequence meryl database (-seqmers) supplied.\n");
   if (G->readDBname == nullptr)  err.push_back("No read meryl database (-readmers) supplied.\n");
-  if (G->peak == 0)              err.push_back("Peak=0 or no haploid peak (-peak) supplied.\n");
 
-  //  check reportType OP_VAR_MER has vcfName
 
   if (err.size() > 0) {
     fprintf(stderr, "usage: %s <report-type>            \\\n", argv[0]);
     fprintf(stderr, "         -sequence <seq.fasta>     \\\n");
-    fprintf(stderr, "         -seqmers  <seq.meryl>     \\\n");
     fprintf(stderr, "         -readmers <read.meryl>    \\\n");
     fprintf(stderr, "         -peak     <haploid_peak>  \\\n");
-    fprintf(stderr, "         -lookup   <lookup_table>  \\\n");
+    fprintf(stderr, "         -prob     <lookup_table>  \\\n");
     fprintf(stderr, "         -vcf      <input.vcf>     \\\n");
     fprintf(stderr, "         -output   <output>        \n\n");
     fprintf(stderr, "  Predict the kmer consequences of variant calls <input.vcf> given the consensus sequence <seq.fasta>\n");
@@ -179,38 +180,67 @@ main(int32 argc, char **argv) {
     fprintf(stderr, "\n");
     fprintf(stderr, "  Input -sequence and -vcf files can be FASTA or FASTQ; uncompressed, gz, bz2 or xz compressed\n");
     fprintf(stderr, "\n");
-    fprintf(stderr, "  Each input database can be filtered by value.  More advanced filtering\n");
+    fprintf(stderr, "  Each readmers can be filtered by value.  More advanced filtering\n");
     fprintf(stderr, "  requires a new database to be constructed using meryl.\n");
-    fprintf(stderr, "    -min   m    Ignore kmers with value below m\n");
-    fprintf(stderr, "    -max   m    Ignore kmers with value above m\n");
-    fprintf(stderr, "    -threads t  Multithreading for meryl lookup table construction, dump and hist.\n");
+    fprintf(stderr, "    -min     m     Ignore kmers with value below m\n");
+    fprintf(stderr, "    -max     m     Ignore kmers with value above m\n");
+    fprintf(stderr, "    -threads t     Multithreading for meryl lookup table construction, dump and hist.\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  Memory usage can be limited, within reason, by sacrificing kmer lookup\n");
     fprintf(stderr, "  speed.  If the lookup table requires more memory than allowed, the program\n");
     fprintf(stderr, "  exits with an error.\n");
-    fprintf(stderr, "    -memory1 m   Don't use more than m GB memory for loading seqmers\n");
-    fprintf(stderr, "    -memory2 m   Don't use more than m GB memory for loading readmers\n");
+    fprintf(stderr, "    -memory  m     Don't use more than m GB memory for loading mers\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "  For k* based evaluation and polishing, -prob or -peak is required.\n");
+    fprintf(stderr, "    -prob    file  Optional input vector of probabilities. Adjust multiplicity to copy number\n");
+    fprintf(stderr, "    -peak    m     Optional input for hard setting copy 1.\n");
     fprintf(stderr, "\n");
-    fprintf(stderr, "    -lookup file   Optional input vector of probabilities.\n");
+    fprintf(stderr, "  By default, <seq.fasta>.meryl will be generated unless -seqmers is provided.\n");
+    fprintf(stderr, "    -seqmers seq.meryl  Optional input for pre-built sequence meryl db\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "  Exactly one report type must be specified.\n");
     fprintf(stderr, "\n\n");
+    fprintf(stderr, "  -filter\n");
+    fprintf(stderr, "   Filter variants within distance k and their combinations by missing k-mers.\n");
+    fprintf(stderr, "   Assumes the reference (-sequence) is from a different individual.\n");
+    fprintf(stderr, "   Required: -sequence, -readmers, -vcf, and -output\n");
+    fprintf(stderr, "   Optional: -comb <N>  set the max N of combinations of variants to be evaluated (default: 15)\n");
+    fprintf(stderr, "             -nosplit   without this options combinations larger than N are split\n");
+    fprintf(stderr, "             -debug     output a debug log, into <output>.THREAD_ID.debug.gz\n");
+    fprintf(stderr, "\n\n");
+    fprintf(stderr, "  -polish\n");
+    fprintf(stderr, "   Score each variant, or variants within distance k and their combinations by k*.\n");
+    fprintf(stderr, "   Assumes the reference (-sequence) is from the same individual.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "   Required: -sequence, -readmers, -prob or -peak, -vcf, and -output\n");
+    fprintf(stderr, "   Optional: -comb <N>    set the max N of combinations of variants to be evaluated (default: 15)\n");
+//  fprintf(stderr, "             -keep-het    keep het calls (default: only hom calls are evaluated)\n");
+    fprintf(stderr, "             -nosplit     without this options combinations larger than N are split\n");
+    fprintf(stderr, "             -prob <file> use probabilities to adjust multiplicity to copy number (recommended)\n");
+    fprintf(stderr, "             -peak <N>    hard set copy 1 and use to infer multiplicity to compy number.\n");
+    fprintf(stderr, "                            in case both -prob and -peak are provided, -prob takes higher priority.\n");
+    fprintf(stderr, "             -debug       output a debug log, into <output>.THREAD_ID.debug.gz\n");
+    fprintf(stderr, "   Output: <output>.polish.vcf : variants chosen.\n");
+    fprintf(stderr, "     use bcftools view -Oz <output>.polish.vcf and bcftools consensus -H 1 -f <seq.fata> to polish.\n");
+    fprintf(stderr, "     first ALT in heterozygous alleles are usually better supported by avg. |k*|.\n");
+    fprintf(stderr, "\n\n");
     fprintf(stderr, "  -hist\n");
     fprintf(stderr, "   Generate a 0-centered k* histogram for sequences in <input.fasta>.\n");
-    fprintf(stderr, "   Positive k* values are expected collapsed copies.\n");
-    fprintf(stderr, "   Negative k* values are expected expanded  copies.\n");
-    fprintf(stderr, "   Closer to 0 means the expected and found k-mers are well balenced, 1:1.\n");
-    fprintf(stderr, "   Reports QV at the end, in stderr.\n");
-    fprintf(stderr, "   Required: -sequence, -seqmers, -readmers, -peak, and -output.\n");
-    fprintf(stderr, "   Optional: -lookup <probabilities> use probabilities to adjust multiplicity to copy number\n");
+    fprintf(stderr, "     Positive k* values are expected collapsed copies.\n");
+    fprintf(stderr, "     Negative k* values are expected expanded  copies.\n");
+    fprintf(stderr, "     Closer to 0 means the expected and found k-mers are well balenced, 1:1.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "   Required: -sequence, -readmers, -prob or -peak, and -output.\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "   Output: k* <tab> frequency\n");
+    fprintf(stderr, "           Reports QV at the end, in stderr.\n");
     fprintf(stderr, "\n\n");
     fprintf(stderr, "  -dump\n");
     fprintf(stderr, "   Dump readK, asmK, and k* per bases (k-mers) in <input.fasta>.\n");
-    fprintf(stderr, "   Required: -sequence, -seqmers, -readmers, -peak, and -output\n");
-    fprintf(stderr, "   Optional: -skipMissing will skip the missing kmer sites to be printed\n");
-    fprintf(stderr, "             -lookup <probabilities> use probabilities to adjust multiplicity to copy number\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "   Required: -sequence, -readmers, -peak, and -output\n");
+    fprintf(stderr, "   Optional: -skipMissing  skip the missing kmer sites to be printed\n");
+    fprintf(stderr, "             -prob <file>  use probabilities to adjust multiplicity to copy number\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "   Output: seqName <tab> seqPos <tab> readK <tab> asmK <tab> k*\n");
     fprintf(stderr, "      seqName    - name of the sequence this kmer is from\n");
@@ -219,23 +249,12 @@ main(int32 argc, char **argv) {
     fprintf(stderr, "      asmK       - assembly copies as found in <seq.meryl>\n");
     fprintf(stderr, "      k*         - 0-centered k* value\n");
     fprintf(stderr, "\n\n");
-    fprintf(stderr, "  -vmer\n");
-    fprintf(stderr, "   Score each variant, or variants within distance k and their combinations by k*.\n");
-    fprintf(stderr, "   Required: -sequence, -seqmers, -readmers, -peak, -vcf, and -output\n");
-    fprintf(stderr, "   Optional: -comb <N>  set the max N of combinations of variants to be evaluated (default: 15)\n");
-    fprintf(stderr, "             -nosplit   without this options combinations larger than N are split\n");
-    fprintf(stderr, "             -disable-kstar  only missing kmers are considered for filtering.\n");
-    //fprintf(stderr, "                        if chosen, use bcftools to compress and index, and consensus -H 1 -f <seq.fata> to polish.\n");
-    //fprintf(stderr, "                        first ALT in heterozygous alleles are better supported by avg. |k*|.\n");
-    fprintf(stderr, "             -lookup <probabilities> use probabilities to adjust multiplicity to copy number\n");
-    fprintf(stderr, "             -debug     output a debug log, into <output>.THREAD_ID.debug.gz\n");
-    fprintf(stderr, "\n");
     fprintf(stderr, "  -completeness\n");
     fprintf(stderr, "   Compute kmer completeness using expected copy numbers for all kmers.\n");
-    fprintf(stderr, "\n");    
-    fprintf(stderr, "   Output files: <output>.debug and <output>.polish.vcf\n");
-    fprintf(stderr, "    <output>.debug : some useful info for debugging.\n");
-    fprintf(stderr, "                     seqName <tab> varMerStart <tab> varMerEnd <tab> varMerSeq <tab> score <tab> path\n");
+    fprintf(stderr, "\n\n");    
+    fprintf(stderr, "  Optional output from -debug in -filter and -polish:\n");
+    fprintf(stderr, "   <output>.THREAD_ID.debug.gz : some useful info for debugging.\n");
+    fprintf(stderr, "      seqName <tab> varMerStart <tab> varMerEnd <tab> varMerSeq <tab> score <tab> path\n");
     fprintf(stderr, "      varMerID                - unique numbering, starting from 0\n");
     fprintf(stderr, "      varMerRange             - seqName:start-end. position (0-based) of the variant (s), including sequences upstream and downstream of k-1 bp\n");
     fprintf(stderr, "      varMerSeq               - combination of variant sequence to evalute\n");
@@ -246,11 +265,8 @@ main(int32 argc, char **argv) {
     fprintf(stderr, "      avg k*                  - average of all |k*| for non-missing kmers. -1 when all kmers are missing.\n");
     fprintf(stderr, "      avg ref-alt k*          - difference between reference and alternate average k*.\n");
     fprintf(stderr, "      delta kmer multiplicity - cumulative sum of kmer multiplicity variation. Positive values imply recovered kmers. Negative values overrepresented kmers introduced.\n");
-    fprintf(stderr, "      record          - vcf record with <tab> replaced to <space>. only non-reference alleles are printed with GT being 1/1.\n");
+    fprintf(stderr, "      record                  - vcf record with <tab> replaced to <space>. only non-reference alleles are printed with GT being 1/1.\n");
     fprintf(stderr, "\n");
-    fprintf(stderr, "    <output>.polish.vcf : variants chosen.\n");
-    //fprintf(stderr, "     use bcftools view -Oz <output>.polish.vcf and bcftools consensus -H 2 -f <seq.fata> to polish.\n");
-    //fprintf(stderr, "     ALT alleles are favored with more support compared to the REF allele.\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "\n");
 
@@ -267,9 +283,9 @@ main(int32 argc, char **argv) {
 
   //  Open read kmers, build a lookup table.
 
-  G->load_Kmetric();
-  G->load_Kmers();
-  G->open_Inputs();
+  G->load_Kmetric();  // load prob. table
+  G->load_Kmers();    // load lookup tables: asmDB and readDB
+  G->open_Inputs();   // load vcf
 
   //  Configure the sweatShop.
 
@@ -290,7 +306,7 @@ main(int32 argc, char **argv) {
     ss->setInOrderOutput(true);
   }
 
-  if (G->reportType == OP_VAR_MER) {
+  if (G->reportType == OP_POLISH || G->reportType == OP_FILTER) {
     fprintf(stderr, "-- Generate variant mers and score them.\n");
     ss = new sweatShop(loadSequence, processVariants, outputVariants);
     ss->setInOrderOutput(false);
diff --git a/src/merfin/varMer.C b/src/merfin/varMer.C
index 267340e..77b1c19 100644
--- a/src/merfin/varMer.C
+++ b/src/merfin/varMer.C
@@ -84,6 +84,17 @@ varMer::score(merfinGlobal *g) {
         g->getK(kiter.fmer(), kiter.rmer(), readK, asmK, prob);
         //fprintf(stderr, "[ DEBUG ] :: idx %u -- readK=%.0f , asmK=%.0f\n", idx, readK, asmK);
       }
+
+      if (readK == 0) {
+        numM++;
+      }
+    
+      //  We only use the num missings in -filter mode
+      if (g->reportType == OP_FILTER) {
+        idx++;
+        continue;
+      }
+
       // store difference in kmer count accounting for uncertainty in the estimate of readK
       oDeltak = abs(readK - asmK) * prob;
 
@@ -104,7 +115,6 @@ varMer::score(merfinGlobal *g) {
       //  re-define k* given rounded readK and asmK, in absolute values
       if (readK == 0) {
         kMetric = -1;  // use 0 if we are using non-abs k*
-        numM++;
 
       } else if (readK > asmK) {
         kMetric = readK / asmK - 1;
@@ -136,40 +146,40 @@ varMer::score(merfinGlobal *g) {
 }
 
 /***
- * Get the best variant combination, but print the original vcf 
+ * Get the best variant combination in -filter mode, but print the original vcf 
  ***/
 vector<vcfRecord*>
-varMer::bestVariantOriginalVCF() {
+varMer::bestFilter() {
   uint32 numMissing = UINT32_MAX;  //  actual minimum number of missing kmers in the combination with minimum missings
   vector<int> idxs;
-  vector<vcfRecord*> records;	// empty vector with no records
+  vector<vcfRecord*> records;      // empty vector with no records
 
   for ( int ii = 0; ii < numMs.size(); ii++ ) {
 
-		//  ignore when all kmers are 'missings'
-		if ( numMs.at(ii)  == seqs.at(ii).size() - kmer::merSize() + 1)  continue;
-
-		// 1) path with no missings? add to report
-		if ( numMs.at(ii) == 0 ) {
-			idxs.push_back(ii);
-			numMissing = 0;
-		}
-
-		// 2) lowest num. missings? add to report
-		//    only if no paths satisfying 1) are found
-		if ( numMs.at(ii) < numMissing ) {
-			numMissing = numMs.at(ii);
-		  idxs.clear();
-		  idxs.push_back(ii);
-
-		} else if ( numMs.at(ii) == numMissing ) {
-	    //  has the same numMissing
-		  idxs.push_back(ii);
-		} // else : ignore
-		
+    //  ignore when all kmers are 'missings'
+    if ( numMs.at(ii)  == seqs.at(ii).size() - kmer::merSize() + 1)  continue;
+
+    // 1) path with no missings? add to report
+    if ( numMs.at(ii) == 0 ) {
+      idxs.push_back(ii);
+      numMissing = 0;
+    }
+
+    // 2) lowest num. missings? add to report
+    //    only if no paths satisfying 1) are found
+    if ( numMs.at(ii) < numMissing ) {
+      numMissing = numMs.at(ii);
+      idxs.clear();
+      idxs.push_back(ii);
+
+    } else if ( numMs.at(ii) == numMissing ) {
+      //  has the same numMissing
+      idxs.push_back(ii);
+    } // else : ignore
+
   }
 
-  //  ignore if all kmers creats only missings
+  //  ignore if all kmers only increase missing kmers
   if ( idxs.size() == 0 ) return records;
 
   //  report all the rest
@@ -187,7 +197,6 @@ varMer::bestVariantOriginalVCF() {
   for (list<int>::iterator it=gtIdxs.begin(); it!=gtIdxs.end(); ++it)
     records.push_back(posGt->_gts[*it]->_record);
 
-  // no best?
   return records;
 }
 
diff --git a/src/merfin/varMer.H b/src/merfin/varMer.H
index 4eb5bdd..8488391 100644
--- a/src/merfin/varMer.H
+++ b/src/merfin/varMer.H
@@ -41,7 +41,7 @@ public:
 
 public:
   string  bestVariant();
-  vector<vcfRecord*>  bestVariantOriginalVCF();
+  vector<vcfRecord*>  bestFilter();
 
 private:
   vector<vcfRecord*>  getOriginalVCF(int idx);