From 5e211493a4d586a6512c93fb41e149caa7c571c3 Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Fri, 14 Mar 2014 18:07:38 -0400
Subject: [PATCH 01/13] Added files for z-algorithm computation. Created method
 templates for one additional alignment filter

---
 src/AlignmentFilters.cpp             | 19 +++++++++++++++
 src/AlignmentFilters.h               |  3 +++
 src/Makefile.am                      |  7 ++++++
 src/tests/NWNoRefEndPenalty_test.cpp | 35 ++++++----------------------
 src/tests/NWNoRefEndPenalty_test.h   |  2 +-
 src/tests/main_test.cpp              |  2 ++
 6 files changed, 39 insertions(+), 29 deletions(-)
diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp
index a569aca..5855ccf 100644
--- a/src/AlignmentFilters.cpp
+++ b/src/AlignmentFilters.cpp
@@ -24,6 +24,7 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 
 #include "src/AlignmentFilters.h"
 #include "src/common.h"
+#include "src/ZAlgorithm.h"
 
 using namespace std;
 
@@ -147,4 +148,22 @@ namespace AlignmentFilters {
     else
       return pair<int,int>(head_match, match_run);
   } 
+
+
+
+  bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream){
+    /*
+      TO DO: Fill in this function
+
+      Extract read nucleotides
+      Remove  any soft-clipped bases from nucleotide string
+      
+    vector<int> match_counts;
+    ZAlgorithm::GetPrefixMatchCounts(string& s1, string& s2, match_counts);
+    
+    ZAlgorithm::GetSuffixMatchCounts(string& s1, string& s2, match_counts);
+
+    */
+    return true;
+  }
 }
diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h
index e4c4d8a..f28e49b 100644
--- a/src/AlignmentFilters.h
+++ b/src/AlignmentFilters.h
@@ -41,6 +41,9 @@ namespace AlignmentFilters {
   
   /* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */
   pair<int,int> GetEndDistToIndel(AlignedRead* aln);
+
+  /* Returns true iff the alignment ends match maximally compared to other positions within the specified window. */
+  bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream);
 }
 
 #endif
diff --git a/src/Makefile.am b/src/Makefile.am
index a22cd01..f7dce80 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -201,6 +201,7 @@ liballelotype_a_SOURCES = \
 	TextFileReader.cpp TextFileReader.h \
 	TextFileWriter.cpp TextFileWriter.h \
 	VCFWriter.cpp VCFWriter.h \
+	ZAlgorithm.cpp ZAlgorithm.h \
 	ZippedFastaFileReader.cpp ZippedFastaFileReader.h \
 	ZippedFastqFileReader.cpp ZippedFastqFileReader.h \
 	ZippedTextFileReader.cpp ZippedTextFileReader.h \
@@ -245,6 +246,8 @@ lobSTRTests_SOURCES = \
 	tests/BWAReadAligner_test.cpp \
 	tests/common_test.h \
 	tests/common_test.cpp \
+	tests/DNATools.h \
+	tests/DNATools.cpp \
 	tests/NWNoRefEndPenalty_test.h \
 	tests/NWNoRefEndPenalty_test.cpp \
 	tests/ReadContainer_test.h \
@@ -253,6 +256,8 @@ lobSTRTests_SOURCES = \
 	tests/RemoveDuplicates_test.cpp \
 	tests/VCFWriter_test.h \
 	tests/VCFWriter_test.cpp \
+	tests/ZAlgorithm_test.h \
+	tests/ZAlgorithm_test.cpp \
 	AlignmentFilters.cpp \
 	BamFileReader.cpp \
 	BamPairedFileReader.cpp \
@@ -281,6 +286,8 @@ lobSTRTests_SOURCES = \
 	ReadContainer.cpp \
 	RemoveDuplicates.cpp \
 	VCFWriter.cpp \
+	ZAlgorithm.h \
+	ZAlgorithm.cpp \
 	ZippedFastaFileReader.cpp \
 	ZippedFastqFileReader.cpp \
 	ZippedTextFileReader.cpp \
diff --git a/src/tests/NWNoRefEndPenalty_test.cpp b/src/tests/NWNoRefEndPenalty_test.cpp
index bf60dec..0dc1038 100644
--- a/src/tests/NWNoRefEndPenalty_test.cpp
+++ b/src/tests/NWNoRefEndPenalty_test.cpp
@@ -25,36 +25,15 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 #include <cppunit/extensions/TestFactoryRegistry.h>
 #include <cppunit/ui/text/TestRunner.h>
 
-#include "src/tests/NWNoRefEndPenalty_test.h"
 #include "src/common.h"
-
-// Registers the fixture into the 'registry'
-CPPUNIT_TEST_SUITE_REGISTRATION(NWNoRefEndPenaltyTest);
+#include "src/tests/DNATools.h"
+#include "src/tests/NWNoRefEndPenalty_test.h"
 
 using namespace std;
 
+// Registers the fixture into the 'registry'
+CPPUNIT_TEST_SUITE_REGISTRATION(NWNoRefEndPenaltyTest);
 
-char GetChar(int index){
-  switch(index){
-  case 0:
-    return 'A';
-  case 1:
-    return 'C';
-  case 2:
-    return 'G';
-  case 3:
-    return 'T';
-  default:
-    return 'N';
-  }
-}
-
-string RandDNA(const int length){
-  stringstream seq;
-  for (int j = 0; j < length; j++)
-    seq << GetChar(rand()%4);
-  return seq.str();
-}
 
 void NWNoRefEndPenaltyTest::setUp() {} 
 void NWNoRefEndPenaltyTest::tearDown() {}
@@ -64,7 +43,7 @@ int NWNoRefEndPenaltyTest::GenAlignments(int num_trials, double mut_prob, double
   int num_correct = 0;
   for (int i = 0; i < num_trials; i++){
     // Create random reference sequence
-    string ref_seq = RandDNA(REF_LEN);
+    string ref_seq = DNATools::RandDNA(REF_LEN);
 
     // Choose the read region
     int read_start  = rand()%(ref_seq.size()-READ_LEN+1);
@@ -75,10 +54,10 @@ int NWNoRefEndPenaltyTest::GenAlignments(int num_trials, double mut_prob, double
     for (unsigned int i = PERFECT_FLANK; i < read_seq.size()-PERFECT_FLANK; i++){
       float prob = static_cast <float> (rand()) / static_cast <float> (RAND_MAX);
       if (prob <= mut_prob)
-	mut_read_seq_ss << RandDNA(1);
+	mut_read_seq_ss << DNATools::RandDNA(1);
       else if (prob - mut_prob <= ins_prob){
 	int ins_size = rand()%(MAX_INS-1) + 1;
-	mut_read_seq_ss << RandDNA(ins_size);
+	mut_read_seq_ss << DNATools::RandDNA(ins_size);
       }
       else if (prob - mut_prob - ins_prob <= del_prob){
 	int del_size = rand()%(MAX_DEL-1) + 1;
diff --git a/src/tests/NWNoRefEndPenalty_test.h b/src/tests/NWNoRefEndPenalty_test.h
index 967dffa..37cdb14 100644
--- a/src/tests/NWNoRefEndPenalty_test.h
+++ b/src/tests/NWNoRefEndPenalty_test.h
@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2011 Melissa Gymrek <mgymrek@mit.edu>
+Copyright (C) 2014 Thomas Willems <twillems@mit.edu>
 
 This file is part of lobSTR.
 
diff --git a/src/tests/main_test.cpp b/src/tests/main_test.cpp
index 1026a95..4cc5bc6 100644
--- a/src/tests/main_test.cpp
+++ b/src/tests/main_test.cpp
@@ -27,6 +27,7 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 #include "src/tests/ReadContainer_test.h"
 #include "src/tests/RemoveDuplicates_test.h"
 #include "src/tests/VCFWriter_test.h"
+#include "src/tests/ZAlgorithm_test.h"
 
 int main( int argc, char **argv) {
   // Adds the test to the list of tests to run
@@ -38,6 +39,7 @@ int main( int argc, char **argv) {
   runner.addTest(ReadContainerTest::suite());
   runner.addTest(RemoveDuplicatesTest::suite());
   runner.addTest(VCFWriterTest::suite());
+  runner.addTest(ZAlgorithmTest::suite());
 
   // Run the tests
   bool wasSucessful = runner.run();

From 7b17ce7a3dc08bc181dc880b523bb20d570a4100 Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Fri, 14 Mar 2014 18:08:50 -0400
Subject: [PATCH 02/13] Added unit tests for z-algorithm prefix and suffix
 methods

---
 src/ZAlgorithm.cpp            | 99 +++++++++++++++++++++++++++++++++++
 src/ZAlgorithm.h              | 10 ++++
 src/tests/DNATools.cpp        | 31 +++++++++++
 src/tests/DNATools.h          |  9 ++++
 src/tests/ZAlgorithm_test.cpp | 50 ++++++++++++++++++
 src/tests/ZAlgorithm_test.h   | 46 ++++++++++++++++
 6 files changed, 245 insertions(+)
 create mode 100644 src/ZAlgorithm.cpp
 create mode 100644 src/ZAlgorithm.h
 create mode 100644 src/tests/DNATools.cpp
 create mode 100644 src/tests/DNATools.h
 create mode 100644 src/tests/ZAlgorithm_test.cpp
 create mode 100644 src/tests/ZAlgorithm_test.h

diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp
new file mode 100644
index 0000000..1682278
--- /dev/null
+++ b/src/ZAlgorithm.cpp
@@ -0,0 +1,99 @@
+#include <iostream>
+#include <sstream>
+
+#include <stdlib.h>
+
+#include "ZAlgorithm.h"
+
+
+namespace ZAlgorithm{
+  void suffix_helper(int start, string& s1, string& s2, 
+		     vector<int>& s1_matches, vector<int>& num_matches){
+    num_matches  = vector<int>(s2.size(), 0);
+    int leftmost = s2.size(), right_index = s2.size();
+    for (int i = start; i >= 0; i--){
+      if (i <= leftmost){
+	int index_a = s1.size()-1, index_b = i;
+	while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){
+	  index_a--;
+	  index_b--;
+	}
+	num_matches[i] = i - index_b;
+	if (index_b < i){
+	  right_index = i;
+	  leftmost    = index_b + 1;
+	}
+      }
+      else {
+	int twin     = i - right_index + s1.size()-1;
+	int new_left = i - s1_matches[twin] + 1;
+	if (new_left > leftmost)
+	  num_matches[i] = s1_matches[twin];
+	else if (new_left < leftmost)
+	  num_matches[i] = i-leftmost+1;
+	else {
+	  int index_a = s1.size()-2-i+leftmost, index_b = leftmost-1;
+	  while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){
+	    index_a--;
+	    index_b--;
+	  }
+	  num_matches[i] = i-index_b;
+	  right_index    = i;
+	  leftmost       = index_b + 1;
+	}
+      }
+    }
+  }
+
+
+  void prefix_helper(unsigned int start, string& s1, string& s2, 
+		     vector<int>& s1_matches, vector<int>& num_matches){
+    num_matches = vector<int>(s2.size(), 0);
+    int rightmost = 0, left_index = 0;
+    for (int i = start; i < s2.size(); i++){
+      if (i >= rightmost){
+	int index_a = 0, index_b = i;
+	while (index_a < s1.size() && index_b < s1.size() && s1[index_a] == s2[index_b]){
+	  index_a++;
+	  index_b++;
+	}
+	num_matches[i] = index_b - i;
+	if (index_b > i){
+	  left_index = i;
+	  rightmost  = index_b - 1;
+	}
+      }
+      else {
+	int twin      = i - left_index;
+	int new_right = i + s1_matches[twin] - 1;
+	if (new_right < rightmost)
+	  num_matches[i] = s1_matches[twin];
+	else if (new_right > rightmost)
+	  num_matches[i] = rightmost-i+1;
+	else {
+	  int index_a = rightmost+1-i, index_b = rightmost+1;
+	  while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){
+	    index_a++;
+	    index_b++;
+	  }
+	  num_matches[i] = index_b - i;
+	  left_index     = i;
+	  rightmost      = index_b - 1;
+	}
+      }
+    }
+  }
+
+  void GetPrefixMatchCounts(string& s1, string& s2, vector<int>& num_matches) {
+    vector<int> s1_matches;
+    prefix_helper(1, s1, s1, s1_matches, s1_matches);
+    prefix_helper(0, s1, s2, s1_matches, num_matches);
+  }  
+
+  void GetSuffixMatchCounts(string& s1, string& s2, vector<int>& num_matches) {
+    vector<int> s1_matches;
+    suffix_helper(s1.size()-2, s1, s1, s1_matches, s1_matches);
+    suffix_helper(s2.size()-1, s1, s2, s1_matches, num_matches);
+  }  
+}
+
diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h
new file mode 100644
index 0000000..5dbb94e
--- /dev/null
+++ b/src/ZAlgorithm.h
@@ -0,0 +1,10 @@
+#include <string>
+#include <vector>
+
+using namespace std;
+
+
+namespace ZAlgorithm{
+  void GetPrefixMatchCounts(string& s1, string& s2, vector<int>& num_matches);
+  void GetSuffixMatchCounts(string& s1, string& s2, vector<int>& num_matches);
+}
diff --git a/src/tests/DNATools.cpp b/src/tests/DNATools.cpp
new file mode 100644
index 0000000..73ec7eb
--- /dev/null
+++ b/src/tests/DNATools.cpp
@@ -0,0 +1,31 @@
+#include <sstream>
+#include <string>
+
+#include <stdlib.h>
+
+#include "src/tests/DNATools.h"
+
+namespace DNATools {
+  char GetChar(int index){
+    switch(index){
+  case 0:
+    return 'A';
+    case 1:
+      return 'C';
+    case 2:
+      return 'G';
+    case 3:
+      return 'T';
+    default:
+      return 'N';
+    }
+  }
+
+  std::string RandDNA(const int length){
+    std::stringstream seq;
+    for (int j = 0; j < length; j++)
+      seq << GetChar(rand()%4);
+    return seq.str();
+  }
+}
+
diff --git a/src/tests/DNATools.h b/src/tests/DNATools.h
new file mode 100644
index 0000000..0540313
--- /dev/null
+++ b/src/tests/DNATools.h
@@ -0,0 +1,9 @@
+#ifndef DNATOOLS_H
+#define DNATOOLS_H
+
+#include <string>
+
+namespace DNATools {
+  std::string RandDNA(const int length);
+}
+#endif // DNATOOLS_H
diff --git a/src/tests/ZAlgorithm_test.cpp b/src/tests/ZAlgorithm_test.cpp
new file mode 100644
index 0000000..88904aa
--- /dev/null
+++ b/src/tests/ZAlgorithm_test.cpp
@@ -0,0 +1,50 @@
+
+#include <string>
+#include <vector>
+
+#include "src/tests/DNATools.h"
+#include "src/tests/ZAlgorithm_test.h"
+#include "src/common.h"
+#include "src/ZAlgorithm.h"
+
+
+using namespace std;
+
+void ZAlgorithmTest::setUp(){}
+void ZAlgorithmTest::tearDown(){}
+
+void ZAlgorithmTest::test_Prefix(){
+  vector<int> num_matches;
+  string s1, s2;
+  for (int i = 0; i < NUM_TRIALS; i++){
+    s1 = DNATools::RandDNA(LENGTH);
+    s2 = DNATools::RandDNA(LENGTH);
+    ZAlgorithm::GetPrefixMatchCounts(s1, s2, num_matches);
+    for (unsigned int j = 0; j < s2.size(); j++){
+      bool match  = (s1.substr(0, num_matches[j]).compare(s2.substr(j, num_matches[j])) == 0);
+      bool longer = (s1.size() > num_matches[j] && s2.size() > (j + num_matches[j]) && s2[j+num_matches[j]] == s1[num_matches[j]]);
+      CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer)));
+    }
+  }
+}
+
+void ZAlgorithmTest::test_Suffix(){
+  vector<int> num_matches;
+  string s1, s2;
+  for (int i = 0; i < NUM_TRIALS; i++){
+    s1 = DNATools::RandDNA(LENGTH);
+    s2 = DNATools::RandDNA(LENGTH);
+    ZAlgorithm::GetSuffixMatchCounts(s1, s2, num_matches);
+    for (int j = s2.size()-1; j >= 0; j--){
+      string sub_1 = s1.substr(s1.size()-num_matches[j], num_matches[j]);
+      string sub_2 = s2.substr(j-num_matches[j]+1, num_matches[j]);
+      bool match   = (sub_1.compare(sub_2) == 0);
+      bool longer  = ((j-num_matches[j]) > 0 && (s1.size()-num_matches[j]) > 0 && s1[s1.size()-num_matches[j]-1] == s2[j-num_matches[j]]);
+      CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer)));
+    }
+  }
+}
+
+
+
+
diff --git a/src/tests/ZAlgorithm_test.h b/src/tests/ZAlgorithm_test.h
new file mode 100644
index 0000000..fc776f7
--- /dev/null
+++ b/src/tests/ZAlgorithm_test.h
@@ -0,0 +1,46 @@
+/*
+Copyright (C) 2014 Thomas Willems <twillems@mit.edu>
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifndef SRC_TESTS_ZALGORITHM_H__
+#define SRC_TESTS_ZALGORITHM_H__
+
+#include <cppunit/extensions/HelperMacros.h>
+
+#include "src/ZAlgorithm.h"
+
+class ZAlgorithmTest :
+public CppUnit::TestFixture {
+  CPPUNIT_TEST_SUITE(ZAlgorithmTest);
+  CPPUNIT_TEST(test_Prefix);
+  CPPUNIT_TEST(test_Suffix);
+  CPPUNIT_TEST_SUITE_END();
+
+ public:
+  const static int NUM_TRIALS=1000;
+  const static int LENGTH=100;
+  void setUp();
+  void tearDown();
+  void test_Prefix();
+  void test_Suffix();
+    
+ private:
+};
+
+#endif //  SRC_TESTS_ZALGORITHM_H_

From e55c69d739499a8bf7f1bf7e0ef0212555008443 Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Mon, 17 Mar 2014 14:58:47 -0400
Subject: [PATCH 03/13] Completed method to check for maximal matching suffixes
 and prefixes. Added preliminary unit tests for this function

---
 src/AlignmentFilters.cpp            | 89 ++++++++++++++++++++++++++---
 src/ZAlgorithm.cpp                  | 28 +++++++--
 src/ZAlgorithm.h                    | 24 +++++++-
 src/tests/AlignmentFilters_test.cpp | 45 +++++++++++++++
 src/tests/AlignmentFilters_test.h   |  2 +
 5 files changed, 173 insertions(+), 15 deletions(-)

diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp
index 5855ccf..fa8fcef 100644
--- a/src/AlignmentFilters.cpp
+++ b/src/AlignmentFilters.cpp
@@ -150,20 +150,91 @@ namespace AlignmentFilters {
   } 
 
 
+  /* 
+     Stores the sequence, start and end position of the read after removing clipped bases
+     using the provided references
+   */
+  void GetUnclippedInfo(AlignedRead* aln, string& bases, int& unclipped_start, int& unclipped_end){
+    unclipped_start = aln->read_start;
+    unclipped_end   = aln->read_start-1;
+    bool begin      = true;
+    int start_index = 0, num_bases = 0;
+    for(vector<BamTools::CigarOp>::iterator cigar_iter = aln->cigar_ops.begin(); cigar_iter != aln->cigar_ops.end(); cigar_iter++){
+      switch(cigar_iter->Type) {
+      case 'D':
+	unclipped_end += cigar_iter->Length;
+	begin          = false;
+	break;
+      case 'H':
+	break;
+      case 'S':
+	if (begin) start_index += cigar_iter->Length;
+	break;
+      case 'M':
+	unclipped_end += cigar_iter->Length;
+	num_bases     += cigar_iter->Length;
+	begin          = false;
+	break;
+      case 'I':
+	num_bases += cigar_iter->Length;
+	begin      = false;
+	break;
+      default:
+	PrintMessageDieOnError("Invalid CIGAR char " + cigar_iter->Type, ERROR);
+	break;
+      }
+    }
+    bases = aln->nucleotides.substr(start_index, num_bases);
+  }
+
 
-  bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream){
-    /*
-      TO DO: Fill in this function
 
-      Extract read nucleotides
-      Remove  any soft-clipped bases from nucleotide string
-      
+  /*
+    
+   */
+  bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){
+    // Extract sequence, start and end coordinates of read after clipping
+    string bases;
+    int start, end;
+    GetUnclippedInfo(aln, bases, start, end);
+
+    std::cerr << "Unclipped info: " << bases << " " << start << " " << end << std::endl;
+
+    // Check that the prefix match is the longest
     vector<int> match_counts;
-    ZAlgorithm::GetPrefixMatchCounts(string& s1, string& s2, match_counts);
+    ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, match_counts);
+    if (ref_seq.size() != match_counts.size())
+      PrintMessageDieOnError("Length of Z-algorithm output must match that of the reference sequence", ERROR);
+    if (start >= ref_seq_start && start < ref_seq_start + ref_seq.size()){
+      int start_index = start - ref_seq_start;
+      int num_matches = match_counts[start_index];
+      for (int i = max(0, start_index-max_external); i < ref_seq.size() && i <= start_index + max_internal; i++){
+	if (i == start_index)
+	  continue;
+	if (match_counts[i] >= num_matches){
+	  std::cerr << "Prefix fail: " << start_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; 
+	  return false;
+	}
+      }
+    }
     
-    ZAlgorithm::GetSuffixMatchCounts(string& s1, string& s2, match_counts);
+    // Check that the suffix match is the longest
+    ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, match_counts);
+    if (ref_seq.size() != match_counts.size())
+      PrintMessageDieOnError("Length of Z-algorithm output must match that of the reference sequence", ERROR);
+    if (end >= ref_seq_start && end < ref_seq_start + ref_seq.size()){
+      int end_index   = end - ref_seq_start;
+      int num_matches = match_counts[end_index];
+      for (int i = max(0, end_index-max_internal); i < ref_seq.size() && i <= end_index + max_external; i++){
+	if (i == end_index)
+	  continue;
+	if (match_counts[i] >= num_matches){
+	  std::cerr << "Suffix fail: " << end_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; 
+	  return false;
+	}
+      }
+    }
 
-    */
     return true;
   }
 }
diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp
index 1682278..018ca73 100644
--- a/src/ZAlgorithm.cpp
+++ b/src/ZAlgorithm.cpp
@@ -1,3 +1,23 @@
+/*
+Copyright (C) 2014 Thomas Willems <twillems@mit.edu>
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
 #include <iostream>
 #include <sstream>
 
@@ -7,7 +27,7 @@
 
 
 namespace ZAlgorithm{
-  void suffix_helper(int start, string& s1, string& s2, 
+  void suffix_helper(int start, const string& s1, const string& s2, 
 		     vector<int>& s1_matches, vector<int>& num_matches){
     num_matches  = vector<int>(s2.size(), 0);
     int leftmost = s2.size(), right_index = s2.size();
@@ -46,7 +66,7 @@ namespace ZAlgorithm{
   }
 
 
-  void prefix_helper(unsigned int start, string& s1, string& s2, 
+  void prefix_helper(unsigned int start, const string& s1, const string& s2, 
 		     vector<int>& s1_matches, vector<int>& num_matches){
     num_matches = vector<int>(s2.size(), 0);
     int rightmost = 0, left_index = 0;
@@ -84,13 +104,13 @@ namespace ZAlgorithm{
     }
   }
 
-  void GetPrefixMatchCounts(string& s1, string& s2, vector<int>& num_matches) {
+  void GetPrefixMatchCounts(const string& s1, const string& s2, vector<int>& num_matches) {
     vector<int> s1_matches;
     prefix_helper(1, s1, s1, s1_matches, s1_matches);
     prefix_helper(0, s1, s2, s1_matches, num_matches);
   }  
 
-  void GetSuffixMatchCounts(string& s1, string& s2, vector<int>& num_matches) {
+  void GetSuffixMatchCounts(const string& s1, const string& s2, vector<int>& num_matches) {
     vector<int> s1_matches;
     suffix_helper(s1.size()-2, s1, s1, s1_matches, s1_matches);
     suffix_helper(s2.size()-1, s1, s2, s1_matches, num_matches);
diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h
index 5dbb94e..6db8886 100644
--- a/src/ZAlgorithm.h
+++ b/src/ZAlgorithm.h
@@ -1,3 +1,23 @@
+/*
+Copyright (C) 2014 Thomas Willems <twillems@mit.edu>
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
 #include <string>
 #include <vector>
 
@@ -5,6 +25,6 @@ using namespace std;
 
 
 namespace ZAlgorithm{
-  void GetPrefixMatchCounts(string& s1, string& s2, vector<int>& num_matches);
-  void GetSuffixMatchCounts(string& s1, string& s2, vector<int>& num_matches);
+  void GetPrefixMatchCounts(const string& s1, const string& s2, vector<int>& num_matches);
+  void GetSuffixMatchCounts(const string& s1, const string& s2, vector<int>& num_matches);
 }
diff --git a/src/tests/AlignmentFilters_test.cpp b/src/tests/AlignmentFilters_test.cpp
index d4fa929..3384497 100644
--- a/src/tests/AlignmentFilters_test.cpp
+++ b/src/tests/AlignmentFilters_test.cpp
@@ -128,3 +128,48 @@ void AlignmentFiltersTest::test_GetNumEndMatches(){
 }
 
 
+void AlignmentFiltersTest::test_HasLargestEndMatches(){
+  AlignedRead aln;
+  string ref_seq;
+  int    ref_start;
+
+  // Simple case with perfect maximal match
+  aln.cigar_ops   = GetCigarOps("9M");
+  aln.read_start  = 102;
+  aln.nucleotides =   "ACACATACAC";
+  ref_seq         = "ACACACATACACACACAC";
+  ref_start       = 100;
+  CPPUNIT_ASSERT(AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+
+  // Perfect maximal match and soft clipping
+  aln.cigar_ops   = GetCigarOps("2S9M2S");
+  aln.read_start  = 102;
+  aln.nucleotides = "NNACACATACACNN";
+  ref_seq         = "ACACACATACACACACAC";
+  ref_start       = 100;
+  CPPUNIT_ASSERT(AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+
+  // Perfect non-maximal match
+  aln.cigar_ops   = GetCigarOps("8M");
+  aln.read_start  = 102;
+  aln.nucleotides =   "ACACACAC";
+  ref_seq         = "ACACACACACACACAC";
+  ref_start       = 100;
+  CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+
+  // Maximal prefix but not maximal suffix
+  aln.cigar_ops   = GetCigarOps("9M");
+  aln.read_start  = 102;
+  aln.nucleotides =   "ACATTACAC";
+  ref_seq         = "ACACATCACACACTACAC";
+  ref_start       = 100;
+  CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+ 
+  // Maximal suffix but not maximal prefix
+  aln.cigar_ops   = GetCigarOps("9M");
+  aln.read_start  = 102;
+  aln.nucleotides =   "ACACTACAC";
+  ref_seq         = "ACACATTACACACACAC";
+  ref_start       = 100;
+  CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+}
diff --git a/src/tests/AlignmentFilters_test.h b/src/tests/AlignmentFilters_test.h
index 5e4f5d6..bae2181 100644
--- a/src/tests/AlignmentFilters_test.h
+++ b/src/tests/AlignmentFilters_test.h
@@ -30,6 +30,7 @@ public CppUnit::TestFixture {
   CPPUNIT_TEST_SUITE(AlignmentFiltersTest);
   CPPUNIT_TEST(test_GetDistToIndel);
   CPPUNIT_TEST(test_GetNumEndMatches);
+  CPPUNIT_TEST(test_HasLargestEndMatches);
   CPPUNIT_TEST_SUITE_END();
 
  public:
@@ -37,6 +38,7 @@ public CppUnit::TestFixture {
   void tearDown();
   void test_GetDistToIndel();
   void test_GetNumEndMatches();
+  void test_HasLargestEndMatches();
  private:
 };
 

From 927870792693f4f0fca820827f8421dba0660a7c Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Mon, 17 Mar 2014 15:19:15 -0400
Subject: [PATCH 04/13] Added command line options for maximal prefix/suffix
 read filter

---
 src/main_allelotype.cpp    | 8 ++++++++
 src/runtime_parameters.cpp | 1 +
 src/runtime_parameters.h   | 1 +
 3 files changed, 10 insertions(+)

diff --git a/src/main_allelotype.cpp b/src/main_allelotype.cpp
index b079394..95243d8 100644
--- a/src/main_allelotype.cpp
+++ b/src/main_allelotype.cpp
@@ -123,6 +123,8 @@ void show_help() {
     "                                from either end of the read.\n"
     "--min-read-end-match <INT>:     Filter reads whose alignments don't exactly match the reference for at least\n"\
     "                                <INT> bp at both ends. \n"
+    "--maximal-end-match <INT>:      Filter reads whose prefix/suffix matches to reference are <= \n"
+    "                                those obtained when shifting the read ends by <INT> bp\n"
     "--exclude-partial:              Do not report any information about partially\n" \
     "                                spanning reads.\n\n"
     "Additional options\n" \
@@ -151,6 +153,7 @@ void parse_commandline_options(int argc, char* argv[]) {
     OPT_MAX_DIFF_REF,
     OPT_MAXMAPQ,
     OPT_MAXMATEDIST,
+    OPT_MAXIMAL_END_MATCH_WINDOW,
     OPT_MIN_BORDER,
     OPT_MIN_BP_BEFORE_INDEL,
     OPT_MIN_HET_FREQ,
@@ -184,6 +187,7 @@ void parse_commandline_options(int argc, char* argv[]) {
     {"max-diff-ref", 1, 0, OPT_MAX_DIFF_REF},
     {"mapq", 1, 0, OPT_MAXMAPQ},
     {"max-matedist", 1, 0, OPT_MAXMATEDIST},
+    {"maximal-end-match", 1, 0, OPT_MAXIMAL_END_MATCH_WINDOW},
     {"min-border", 1, 0, OPT_MIN_BORDER},
     {"min-bp-before-indel", 1, 0, OPT_MIN_BP_BEFORE_INDEL},
     {"min-het-freq", 1, 0, OPT_MIN_HET_FREQ},
@@ -258,6 +262,10 @@ void parse_commandline_options(int argc, char* argv[]) {
       max_diff_ref = atoi(optarg);
       AddOption("max-diff-ref", string(optarg), true, &user_defined_arguments_allelotyper);
       break;
+    case OPT_MAXIMAL_END_MATCH_WINDOW:
+      maximal_end_match_window = atoi(optarg);
+      AddOption("maximal-end-match", string(optarg), true, &user_defined_arguments_allelotyper);
+      break;
     case OPT_MAXMAPQ:
       max_mapq = atoi(optarg);
       AddOption("mapq", string(optarg), true, &user_defined_arguments_allelotyper);
diff --git a/src/runtime_parameters.cpp b/src/runtime_parameters.cpp
index cde4cf9..a4baa79 100644
--- a/src/runtime_parameters.cpp
+++ b/src/runtime_parameters.cpp
@@ -115,6 +115,7 @@ bool include_gl = false;
 bool print_reads = false;
 float min_het_freq = 0;
 int max_matedist = 100000;
+int maximal_end_match_window = 0;
 int min_border   = 0;
 int min_bp_before_indel = 0;
 int min_read_end_match = 0;
diff --git a/src/runtime_parameters.h b/src/runtime_parameters.h
index c0618a5..9879204 100644
--- a/src/runtime_parameters.h
+++ b/src/runtime_parameters.h
@@ -140,6 +140,7 @@ extern int min_border;
 extern int min_bp_before_indel;
 extern int min_read_end_match;
 extern int max_matedist;
+extern int maximal_end_match_window;
 
 // debug
 extern bool profile;

From f9308b856668c3a9b0a66a42c0beefefd09fdac4 Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Mon, 17 Mar 2014 15:33:03 -0400
Subject: [PATCH 05/13] Modified the allelotyper to use the prefix/suffix
 filter, if requested

---
 src/AlignmentFilters.cpp |  6 +++---
 src/ReadContainer.cpp    | 13 +++++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp
index fa8fcef..9cfc7ff 100644
--- a/src/AlignmentFilters.cpp
+++ b/src/AlignmentFilters.cpp
@@ -198,7 +198,7 @@ namespace AlignmentFilters {
     int start, end;
     GetUnclippedInfo(aln, bases, start, end);
 
-    std::cerr << "Unclipped info: " << bases << " " << start << " " << end << std::endl;
+    //std::cerr << "Unclipped info: " << bases << " " << start << " " << end << std::endl;
 
     // Check that the prefix match is the longest
     vector<int> match_counts;
@@ -212,7 +212,7 @@ namespace AlignmentFilters {
 	if (i == start_index)
 	  continue;
 	if (match_counts[i] >= num_matches){
-	  std::cerr << "Prefix fail: " << start_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; 
+	  //std::cerr << "Prefix fail: " << start_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; 
 	  return false;
 	}
       }
@@ -229,7 +229,7 @@ namespace AlignmentFilters {
 	if (i == end_index)
 	  continue;
 	if (match_counts[i] >= num_matches){
-	  std::cerr << "Suffix fail: " << end_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; 
+	  //std::cerr << "Suffix fail: " << end_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; 
 	  return false;
 	}
       }
diff --git a/src/ReadContainer.cpp b/src/ReadContainer.cpp
index 48f6435..ff4e85f 100644
--- a/src/ReadContainer.cpp
+++ b/src/ReadContainer.cpp
@@ -258,6 +258,19 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
       return false;
   }
 
+
+  // check that the prefix and suffix of the read match maximally compared to proximal reference locations
+  if (maximal_end_match_window > 0){
+    map<pair<string,int>, string>::iterator loc_iter = ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
+    if (loc_iter == ref_ext_nucleotides.end())
+      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
+    string ref_ext_seq = loc_iter->second;
+    bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window);
+    if (!maximum_end_matches)
+      return false;
+  }
+
+
   // check that both ends of the aligned read have sufficient bases before the first indel
   if (min_bp_before_indel > 0){
     pair<int, int> num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read);

From 8e032aaaf8b5b647381328ca8c25c45ff602e6ba Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Mon, 17 Mar 2014 18:57:10 -0400
Subject: [PATCH 06/13] Fixed major Z-algorithm bug which caused it to fail on
 strings of unequal length.

---
 src/AlignmentFilters.cpp      | 11 ++---------
 src/ZAlgorithm.cpp            |  6 +++---
 src/main_allelotype.cpp       |  4 ++--
 src/tests/ZAlgorithm_test.cpp | 26 ++++++++++++++++++++++++--
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp
index 9cfc7ff..82a1fbe 100644
--- a/src/AlignmentFilters.cpp
+++ b/src/AlignmentFilters.cpp
@@ -188,7 +188,6 @@ namespace AlignmentFilters {
   }
 
 
-
   /*
     
    */
@@ -198,8 +197,6 @@ namespace AlignmentFilters {
     int start, end;
     GetUnclippedInfo(aln, bases, start, end);
 
-    //std::cerr << "Unclipped info: " << bases << " " << start << " " << end << std::endl;
-
     // Check that the prefix match is the longest
     vector<int> match_counts;
     ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, match_counts);
@@ -211,10 +208,8 @@ namespace AlignmentFilters {
       for (int i = max(0, start_index-max_external); i < ref_seq.size() && i <= start_index + max_internal; i++){
 	if (i == start_index)
 	  continue;
-	if (match_counts[i] >= num_matches){
-	  //std::cerr << "Prefix fail: " << start_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; 
+	if (match_counts[i] >= num_matches)
 	  return false;
-	}
       }
     }
     
@@ -228,10 +223,8 @@ namespace AlignmentFilters {
       for (int i = max(0, end_index-max_internal); i < ref_seq.size() && i <= end_index + max_external; i++){
 	if (i == end_index)
 	  continue;
-	if (match_counts[i] >= num_matches){
-	  //std::cerr << "Suffix fail: " << end_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; 
+	if (match_counts[i] >= num_matches)
 	  return false;
-	}
       }
     }
 
diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp
index 018ca73..6a39f5f 100644
--- a/src/ZAlgorithm.cpp
+++ b/src/ZAlgorithm.cpp
@@ -29,7 +29,7 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 namespace ZAlgorithm{
   void suffix_helper(int start, const string& s1, const string& s2, 
 		     vector<int>& s1_matches, vector<int>& num_matches){
-    num_matches  = vector<int>(s2.size(), 0);
+    num_matches  = vector<int>(s2.size(), -1);
     int leftmost = s2.size(), right_index = s2.size();
     for (int i = start; i >= 0; i--){
       if (i <= leftmost){
@@ -68,12 +68,12 @@ namespace ZAlgorithm{
 
   void prefix_helper(unsigned int start, const string& s1, const string& s2, 
 		     vector<int>& s1_matches, vector<int>& num_matches){
-    num_matches = vector<int>(s2.size(), 0);
+    num_matches = vector<int>(s2.size(), -1);
     int rightmost = 0, left_index = 0;
     for (int i = start; i < s2.size(); i++){
       if (i >= rightmost){
 	int index_a = 0, index_b = i;
-	while (index_a < s1.size() && index_b < s1.size() && s1[index_a] == s2[index_b]){
+	while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){
 	  index_a++;
 	  index_b++;
 	}
diff --git a/src/main_allelotype.cpp b/src/main_allelotype.cpp
index 95243d8..2ebb7af 100644
--- a/src/main_allelotype.cpp
+++ b/src/main_allelotype.cpp
@@ -123,8 +123,8 @@ void show_help() {
     "                                from either end of the read.\n"
     "--min-read-end-match <INT>:     Filter reads whose alignments don't exactly match the reference for at least\n"\
     "                                <INT> bp at both ends. \n"
-    "--maximal-end-match <INT>:      Filter reads whose prefix/suffix matches to reference are <= \n"
-    "                                those obtained when shifting the read ends by <INT> bp\n"
+    "--maximal-end-match <INT>:      Filter reads whose prefix/suffix matches to reference are <= those \n"
+    "                                obtained when shifting the read ends by distances within <INT> bp\n"
     "--exclude-partial:              Do not report any information about partially\n" \
     "                                spanning reads.\n\n"
     "Additional options\n" \
diff --git a/src/tests/ZAlgorithm_test.cpp b/src/tests/ZAlgorithm_test.cpp
index 88904aa..ef215e1 100644
--- a/src/tests/ZAlgorithm_test.cpp
+++ b/src/tests/ZAlgorithm_test.cpp
@@ -1,7 +1,28 @@
+/*
+Copyright (C) 2014 Thomas Willems <twillems@mit.edu>
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
 
 #include <string>
 #include <vector>
 
+#include <stdlib.h>
+
 #include "src/tests/DNATools.h"
 #include "src/tests/ZAlgorithm_test.h"
 #include "src/common.h"
@@ -17,8 +38,9 @@ void ZAlgorithmTest::test_Prefix(){
   vector<int> num_matches;
   string s1, s2;
   for (int i = 0; i < NUM_TRIALS; i++){
-    s1 = DNATools::RandDNA(LENGTH);
-    s2 = DNATools::RandDNA(LENGTH);
+    s1 = DNATools::RandDNA(rand()%LENGTH);
+    s2 = DNATools::RandDNA(rand()%LENGTH);
+
     ZAlgorithm::GetPrefixMatchCounts(s1, s2, num_matches);
     for (unsigned int j = 0; j < s2.size(); j++){
       bool match  = (s1.substr(0, num_matches[j]).compare(s2.substr(j, num_matches[j])) == 0);

From f11d2e227f3aaf68a15104253c0bbaf24785de91 Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Tue, 18 Mar 2014 17:05:44 -0400
Subject: [PATCH 07/13] Added functionality to count the number of reads
 filtered out by each filter. Statistics are printed at the end of
 allelotyping to the stats file

---
 src/AlignedRead.h          |  5 +--
 src/AlignmentFilters.cpp   |  1 -
 src/AlignmentFilters.h     | 14 ++++----
 src/FilterCounter.cpp      | 74 ++++++++++++++++++++++++++++++++++++++
 src/FilterCounter.h        | 56 +++++++++++++++++++++++++++++
 src/Makefile.am            |  3 ++
 src/ReadContainer.cpp      | 33 ++++++++++++-----
 src/RemoveDuplicates.h     |  4 +--
 src/RunInfo.h              |  7 +++-
 src/common.cpp             |  2 +-
 src/runtime_parameters.cpp |  1 +
 src/runtime_parameters.h   |  4 +++
 12 files changed, 179 insertions(+), 25 deletions(-)
 create mode 100644 src/FilterCounter.cpp
 create mode 100644 src/FilterCounter.h

diff --git a/src/AlignedRead.h b/src/AlignedRead.h
index 87650d6..ab32091 100644
--- a/src/AlignedRead.h
+++ b/src/AlignedRead.h
@@ -22,9 +22,6 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 
 #include "src/api/BamAlignment.h"
 
-using namespace std;
-using BamTools::CigarOp;
-
 /*
   Struct to keep track of aligned read used in allelotyping
  */
@@ -37,7 +34,7 @@ struct AlignedRead {
   int read_start;
   std::string nucleotides;
   std::string qualities;
-  vector<BamTools::CigarOp> cigar_ops;
+  std::vector<BamTools::CigarOp> cigar_ops;
   std::string repseq;
   int period;
   int diffFromRef;
diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp
index 82a1fbe..f277a9f 100644
--- a/src/AlignmentFilters.cpp
+++ b/src/AlignmentFilters.cpp
@@ -29,7 +29,6 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 using namespace std;
 
 namespace AlignmentFilters {
-
   template<typename CigarIterator> int GetDistToIndel(CigarIterator iter, CigarIterator end){
     // Process leading clipping ops
     if (iter != end && iter->Type == 'H')
diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h
index f28e49b..a2d6b78 100644
--- a/src/AlignmentFilters.h
+++ b/src/AlignmentFilters.h
@@ -19,8 +19,8 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 */
 
 
-#ifndef SRC_ALIGNMENTFILTERS_H
-#define SRC_ALIGNMENTFILTERS_H
+#ifndef SRC_ALIGNMENTFILTERS_H_
+#define SRC_ALIGNMENTFILTERS_H_
 
 #include <string>
 #include <vector>
@@ -31,19 +31,19 @@ using namespace std;
 
 namespace AlignmentFilters {
   /* Returns the vector of CigarOps corresponding to the CIGAR string. */
-  vector<BamTools::CigarOp> GetCigarOps(string cigar_string);
+  std::vector<BamTools::CigarOp> GetCigarOps(std::string cigar_string);
   
   /* Returns the CIGAR string corresponding to the vector of CigarOps. */
-  string GetCigarString(vector<BamTools::CigarOp>& cigar_ops); 
+  std::string GetCigarString(std::vector<BamTools::CigarOp>& cigar_ops); 
   
   /* Length of perfect base matches at 5' and 3' end of read. */
-  pair<int,int> GetNumEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start);
+  std::pair<int,int> GetNumEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start);
   
   /* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */
-  pair<int,int> GetEndDistToIndel(AlignedRead* aln);
+  std::pair<int,int> GetEndDistToIndel(AlignedRead* aln);
 
   /* Returns true iff the alignment ends match maximally compared to other positions within the specified window. */
-  bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream);
+  bool HasLargestEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream);
 }
 
 #endif
diff --git a/src/FilterCounter.cpp b/src/FilterCounter.cpp
new file mode 100644
index 0000000..4148278
--- /dev/null
+++ b/src/FilterCounter.cpp
@@ -0,0 +1,74 @@
+/*
+Copyright (C) 2014 Thomas Willems <twillems@mit.edu>
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#include "src/common.h"
+#include "src/FilterCounter.h"
+
+FilterCounter::FilterCounter(){
+  counts = new uint64_t[NUM_FILTERS];
+  for (int i = 0; i < NUM_FILTERS; i++)
+    counts[i] = 0;
+}
+  
+void FilterCounter::increment(const int type){
+  if (type > NUM_FILTERS || type < 0)
+    PrintMessageDieOnError("Invalid filter type", ERROR);
+  counts[type]++;
+}
+ 
+std::string FilterCounter::GetFilterType(const int type){
+  switch(type){
+  case NOT_UNIT:
+    return "NOT_UNIT";
+  case DIFF_FROM_REF:
+    return "DIFF_FROM_REF";
+  case MAPPING_QUALITY:
+    return "MAPPING_QUALITY";
+  case MATE_DIST:
+    return "MATE_DIST";
+  case ALLELE_SIZE:
+    return "ALLELE_SIZE";
+  case SPANNING_AMOUNT:
+    return "SPANNING_AMOUNT";
+  case NUM_END_MATCHES:
+    return "NUM_END_MATCHES";
+  case NOT_MAXIMAL_END:
+    return "NOT_MAXIMAL_END";
+  case BP_BEFORE_INDEL:
+    return "BP_BEFORE_INDEL";
+  default:
+    PrintMessageDieOnError("Invalid filter type", ERROR);
+  }
+}
+
+uint64_t FilterCounter::GetFilterCount(const int type){
+  if (type > NUM_FILTERS || type < 0)
+    PrintMessageDieOnError("Invalid filter type", ERROR);
+  return counts[type];
+}
+
+ 
+FilterCounter::~FilterCounter(){
+  delete [] counts;
+}
+
+
+  
+  
diff --git a/src/FilterCounter.h b/src/FilterCounter.h
new file mode 100644
index 0000000..23341b9
--- /dev/null
+++ b/src/FilterCounter.h
@@ -0,0 +1,56 @@
+/*
+Copyright (C) 2014 Thomas Willems <twillems@mit.edu>
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#ifndef SRC_FILTER_COUNTER_H_
+#define SRC_FILTER_COUNTER_H_
+
+#include <stdint.h>
+
+#include <string>
+
+class FilterCounter {
+ private:
+  uint64_t* counts;
+  
+ public:
+  const static int NUM_FILTERS     = 9;
+  const static int NOT_UNIT        = 0;
+  const static int DIFF_FROM_REF   = 1;
+  const static int MAPPING_QUALITY = 2;
+  const static int MATE_DIST       = 3;
+  const static int ALLELE_SIZE     = 4;
+  const static int SPANNING_AMOUNT = 5;
+  const static int NUM_END_MATCHES = 6;
+  const static int NOT_MAXIMAL_END = 7;
+  const static int BP_BEFORE_INDEL = 8;
+  
+  FilterCounter();
+  
+  void increment(const int type);
+  
+  std::string GetFilterType(const int type);
+
+  uint64_t GetFilterCount(const int type);
+
+  ~FilterCounter();
+};
+
+#endif
diff --git a/src/Makefile.am b/src/Makefile.am
index f7dce80..57b6c1c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -121,6 +121,7 @@ liblobstr_a_SOURCES = \
 	FastqPairedFileReader.cpp FastqPairedFileReader.h \
 	FFT_four_nuc_vectors.cpp FFT_four_nuc_vectors.h \
 	FFT_nuc_vectors.cpp FFT_nuc_vectors.h \
+	FilterCounter.cpp FilterCounter.h \
 	gzstream.cpp gzstream.h \
 	STRRecord.h \
 	HammingWindowGenerator.cpp HammingWindowGenerator.h \
@@ -186,6 +187,7 @@ liballelotype_a_SOURCES = \
 	FastqFileReader.cpp FastqFileReader.h \
 	FastaPairedFileReader.cpp FastaPairedFileReader.h \
 	FastqPairedFileReader.cpp FastqPairedFileReader.h \
+	FilterCounter.cpp FilterCounter.h \
 	Genotyper.cpp Genotyper.h \
 	gzstream.cpp gzstream.h \
 	NoiseModel.cpp NoiseModel.h \
@@ -271,6 +273,7 @@ lobSTRTests_SOURCES = \
 	FastqPairedFileReader.cpp \
 	FFT_four_nuc_vectors.cpp \
 	FFT_nuc_vectors.cpp \
+	FilterCounter.h FilterCounter.cpp \
 	gzstream.cpp \
 	HammingWindowGenerator.cpp \
 	MultithreadData.cpp \
diff --git a/src/ReadContainer.cpp b/src/ReadContainer.cpp
index ff4e85f..350d503 100644
--- a/src/ReadContainer.cpp
+++ b/src/ReadContainer.cpp
@@ -225,27 +225,36 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
   }
   // apply filters
   if (unit) {
-    if (aligned_read->diffFromRef % aligned_read->period != 0) return false;
+    if (aligned_read->diffFromRef % aligned_read->period != 0){ 
+      filter_counter.increment(FilterCounter::NOT_UNIT);
+      return false;
+    }
   }
   if (abs(aligned_read->diffFromRef) > max_diff_ref) {
+    filter_counter.increment(FilterCounter::DIFF_FROM_REF);
     return false;
   }
   if (aligned_read->mapq > max_mapq) {
+    filter_counter.increment(FilterCounter::MAPPING_QUALITY);
     return false;
   }
   if (aligned_read->matedist > max_matedist) {
+    filter_counter.increment(FilterCounter::MATE_DIST);
     return false;
   }
   // Check if the allele length is valid
   if (aligned_read->diffFromRef + (aligned_read->refCopyNum*aligned_read->period) < MIN_ALLELE_SIZE) {
+    filter_counter.increment(FilterCounter::ALLELE_SIZE);
     return false;
   }
 
   // check that read sufficiently spans STR
   int max_read_start = aligned_read->msStart - min_border;
   int min_read_stop  = aligned_read->msEnd   + min_border;
-  if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop)
-    return false;
+  if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop){
+    filter_counter.increment(FilterCounter::SPANNING_AMOUNT);
+    return false; 
+  }
   
   // check that both ends of the read contain sufficient perfect matches
   if (min_read_end_match > 0){
@@ -254,11 +263,12 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
       PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
     string ref_ext_seq = loc_iter->second;
     pair<int,int> num_end_matches = AlignmentFilters::GetNumEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend);
-    if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match)
+    if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match){
+      filter_counter.increment(FilterCounter::NUM_END_MATCHES);
       return false;
+    }
   }
 
-
   // check that the prefix and suffix of the read match maximally compared to proximal reference locations
   if (maximal_end_match_window > 0){
     map<pair<string,int>, string>::iterator loc_iter = ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
@@ -266,18 +276,23 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
       PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
     string ref_ext_seq = loc_iter->second;
     bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window);
-    if (!maximum_end_matches)
+    if (!maximum_end_matches){
+      filter_counter.increment(FilterCounter::NOT_MAXIMAL_END);
       return false;
+    }
   }
 
-
   // check that both ends of the aligned read have sufficient bases before the first indel
   if (min_bp_before_indel > 0){
     pair<int, int> num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read);
-    if (num_bps.first != -1 && num_bps.first < min_bp_before_indel)
+    if (num_bps.first != -1 && num_bps.first < min_bp_before_indel){
+      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
       return false;
-    if (num_bps.second != -1 && num_bps.second < min_bp_before_indel)
+    }
+    if (num_bps.second != -1 && num_bps.second < min_bp_before_indel){
+      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
       return false;
+    }
   }
   
   return true;
diff --git a/src/RemoveDuplicates.h b/src/RemoveDuplicates.h
index 4560740..0517b6f 100644
--- a/src/RemoveDuplicates.h
+++ b/src/RemoveDuplicates.h
@@ -34,10 +34,10 @@ namespace RemoveDuplicates {
   void RemovePCRDuplicates(std::list<AlignedRead>* aligned_reads);
 
   /* Get representative read from group of duplicates, read with highest qual score */
-  void GetRepRead(const list<AlignedRead>& aligned_reads, AlignedRead* rep_alignment);
+  void GetRepRead(const std::list<AlignedRead>& aligned_reads, AlignedRead* rep_alignment);
 
   /* Get quality score of a read */
-  float GetScore(const string& quality_string);
+  float GetScore(const std::string& quality_string);
 }
 
 #endif  // SRC_REMOVEDUPLICATES_H_
diff --git a/src/RunInfo.h b/src/RunInfo.h
index 6f76e08..362bb31 100644
--- a/src/RunInfo.h
+++ b/src/RunInfo.h
@@ -25,6 +25,8 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 #include <string>
 #include <vector>
 
+#include "src/FilterCounter.h"
+
 /*
   RunInfo stores information about the run that will be uploaded to Amazon for further analysis
  */
@@ -87,7 +89,7 @@ class RunInfo {
   }
   
   // Print to string
-  std::string PrintToString(int program) {
+  std::string PrintToString(int program, FilterCounter& filter_counter) {
     std::stringstream ss;
     ss << "Starttime: " << starttime << std::endl;
     ss << "Endtime: " << endtime << std::endl;
@@ -119,6 +121,9 @@ class RunInfo {
 	ss << "Num calls >=5x\t" << num_calls5x.at(i) << std::endl;
 	ss << "Mean coverage\t" << static_cast<float>(total_coverage.at(i))/static_cast<float>(num_calls.at(i)) << std::endl;
 	ss << "Mean perc. agree\t" << static_cast<float>(total_agree.at(i))/static_cast<float>(num_calls.at(i)) << std::endl;
+	ss << "Read filter stats:" << std::endl;
+	for (int i = 0; i < FilterCounter::NUM_FILTERS; i++)
+	  ss << "\t" << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl;
 	ss << std::endl;
       }
       ss << "Call type by period (0/0, 0/1, 1/1, 1/2)" << std::endl;
diff --git a/src/common.cpp b/src/common.cpp
index c723ef1..f6676af 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -84,7 +84,7 @@ void OutputRunStatistics() {
   PrintMessageDieOnError("Outputting run statistics", PROGRESS);
   TextFileWriter sWriter(output_prefix + (program == LOBSTR? ".aligned.stats":".allelotype.stats"));
   // Output run statistics to stats file
-  string stats_string = run_info.PrintToString((program == LOBSTR ? 0: 1));
+  string stats_string = run_info.PrintToString((program == LOBSTR ? 0: 1), filter_counter);
   sWriter.Write(stats_string);
   // Upload to AWS S3
   if (!noweb) {
diff --git a/src/runtime_parameters.cpp b/src/runtime_parameters.cpp
index a4baa79..9f020ee 100644
--- a/src/runtime_parameters.cpp
+++ b/src/runtime_parameters.cpp
@@ -23,6 +23,7 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 #include "src/runtime_parameters.h"
 
 RunInfo run_info;
+FilterCounter filter_counter;
 
 int MIN_PERIOD=1;
 int MAX_PERIOD=6;
diff --git a/src/runtime_parameters.h b/src/runtime_parameters.h
index 9879204..01853a1 100644
--- a/src/runtime_parameters.h
+++ b/src/runtime_parameters.h
@@ -25,6 +25,7 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 #include <vector>
 
 #include "src/RunInfo.h"
+#include "src/FilterCounter.h"
 
 // period range for genotyper
 extern int MIN_PERIOD;
@@ -33,6 +34,9 @@ extern int MAX_PERIOD;
 // global variables that probably shouldn't be global variables
 extern RunInfo run_info;
 
+// when allelotyping, tracks why filtered reads were omitted
+extern FilterCounter filter_counter;
+
 // stores user arguments for lobSTR alignment
 // used to store the bam header for allelotyping step
 extern std::string user_defined_arguments;

From 329a7cbf621c6536f723f94a4347e7d3b4a31eae Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Tue, 18 Mar 2014 17:23:43 -0400
Subject: [PATCH 08/13] Fixed bug in which filter stats were printed for every
 sample. Added unfiltered filter type

---
 src/FilterCounter.cpp | 2 ++
 src/FilterCounter.h   | 5 +++--
 src/ReadContainer.cpp | 2 +-
 src/RunInfo.h         | 9 +++++----
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/FilterCounter.cpp b/src/FilterCounter.cpp
index 4148278..972a41a 100644
--- a/src/FilterCounter.cpp
+++ b/src/FilterCounter.cpp
@@ -53,6 +53,8 @@ std::string FilterCounter::GetFilterType(const int type){
     return "NOT_MAXIMAL_END";
   case BP_BEFORE_INDEL:
     return "BP_BEFORE_INDEL";
+  case UNFILTERED:
+    return "UNFILTERED";
   default:
     PrintMessageDieOnError("Invalid filter type", ERROR);
   }
diff --git a/src/FilterCounter.h b/src/FilterCounter.h
index 23341b9..218b395 100644
--- a/src/FilterCounter.h
+++ b/src/FilterCounter.h
@@ -31,7 +31,7 @@ class FilterCounter {
   uint64_t* counts;
   
  public:
-  const static int NUM_FILTERS     = 9;
+  const static int NUM_FILTERS     = 10;
   const static int NOT_UNIT        = 0;
   const static int DIFF_FROM_REF   = 1;
   const static int MAPPING_QUALITY = 2;
@@ -41,7 +41,8 @@ class FilterCounter {
   const static int NUM_END_MATCHES = 6;
   const static int NOT_MAXIMAL_END = 7;
   const static int BP_BEFORE_INDEL = 8;
-  
+  const static int UNFILTERED      = 9;
+
   FilterCounter();
   
   void increment(const int type);
diff --git a/src/ReadContainer.cpp b/src/ReadContainer.cpp
index 350d503..c45c766 100644
--- a/src/ReadContainer.cpp
+++ b/src/ReadContainer.cpp
@@ -294,7 +294,7 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
       return false;
     }
   }
-  
+  filter_counter.increment(FilterCounter::UNFILTERED);
   return true;
 }
 
diff --git a/src/RunInfo.h b/src/RunInfo.h
index 362bb31..56d4c4f 100644
--- a/src/RunInfo.h
+++ b/src/RunInfo.h
@@ -121,10 +121,6 @@ class RunInfo {
 	ss << "Num calls >=5x\t" << num_calls5x.at(i) << std::endl;
 	ss << "Mean coverage\t" << static_cast<float>(total_coverage.at(i))/static_cast<float>(num_calls.at(i)) << std::endl;
 	ss << "Mean perc. agree\t" << static_cast<float>(total_agree.at(i))/static_cast<float>(num_calls.at(i)) << std::endl;
-	ss << "Read filter stats:" << std::endl;
-	for (int i = 0; i < FilterCounter::NUM_FILTERS; i++)
-	  ss << "\t" << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl;
-	ss << std::endl;
       }
       ss << "Call type by period (0/0, 0/1, 1/1, 1/2)" << std::endl;
       for (size_t i = 0; i < calltype_by_period.size(); i++) {
@@ -135,6 +131,11 @@ class RunInfo {
 	}
 	ss << std::endl;
       }
+
+      ss << "Read filter stats:" << std::endl;
+      for (int i = 0; i < FilterCounter::NUM_FILTERS; i++)
+	ss << "\t" << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl;
+      ss << std::endl;
     }
 
     return ss.str();

From c8244eca344fe1215e59223a57214597cafca2c8 Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Wed, 19 Mar 2014 11:54:53 -0400
Subject: [PATCH 09/13] Formatted filter stats in allelotype.stats output

---
 src/RunInfo.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/RunInfo.h b/src/RunInfo.h
index 56d4c4f..2dd4de1 100644
--- a/src/RunInfo.h
+++ b/src/RunInfo.h
@@ -21,6 +21,7 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 #ifndef SRC_RUNINFO_H_
 #define SRC_RUNINFO_H_
 
+#include <iomanip>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -134,7 +135,7 @@ class RunInfo {
 
       ss << "Read filter stats:" << std::endl;
       for (int i = 0; i < FilterCounter::NUM_FILTERS; i++)
-	ss << "\t" << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl;
+	ss << "\t" << std::setw(25) << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl;
       ss << std::endl;
     }
 

From 228ae57367c4598773c548d01efe30d55da5992b Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Wed, 19 Mar 2014 14:08:59 -0400
Subject: [PATCH 10/13] Cleaned up namespaces in various header files

---
 src/AlignmentFilters.h |  2 --
 src/ZAlgorithm.cpp     | 20 ++++++++++----------
 src/ZAlgorithm.h       |  6 ++----
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h
index a2d6b78..ec45bf7 100644
--- a/src/AlignmentFilters.h
+++ b/src/AlignmentFilters.h
@@ -27,8 +27,6 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 
 #include "src/AlignedRead.h"
 
-using namespace std;
-
 namespace AlignmentFilters {
   /* Returns the vector of CigarOps corresponding to the CIGAR string. */
   std::vector<BamTools::CigarOp> GetCigarOps(std::string cigar_string);
diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp
index 6a39f5f..04ac95b 100644
--- a/src/ZAlgorithm.cpp
+++ b/src/ZAlgorithm.cpp
@@ -27,9 +27,9 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 
 
 namespace ZAlgorithm{
-  void suffix_helper(int start, const string& s1, const string& s2, 
-		     vector<int>& s1_matches, vector<int>& num_matches){
-    num_matches  = vector<int>(s2.size(), -1);
+  void suffix_helper(int start, const std::string& s1, const std::string& s2, 
+		     std::vector<int>& s1_matches, std::vector<int>& num_matches){
+    num_matches  = std::vector<int>(s2.size(), -1);
     int leftmost = s2.size(), right_index = s2.size();
     for (int i = start; i >= 0; i--){
       if (i <= leftmost){
@@ -66,9 +66,9 @@ namespace ZAlgorithm{
   }
 
 
-  void prefix_helper(unsigned int start, const string& s1, const string& s2, 
-		     vector<int>& s1_matches, vector<int>& num_matches){
-    num_matches = vector<int>(s2.size(), -1);
+  void prefix_helper(unsigned int start, const std::string& s1, const std::string& s2, 
+		     std::vector<int>& s1_matches, std::vector<int>& num_matches){
+    num_matches = std::vector<int>(s2.size(), -1);
     int rightmost = 0, left_index = 0;
     for (int i = start; i < s2.size(); i++){
       if (i >= rightmost){
@@ -104,14 +104,14 @@ namespace ZAlgorithm{
     }
   }
 
-  void GetPrefixMatchCounts(const string& s1, const string& s2, vector<int>& num_matches) {
-    vector<int> s1_matches;
+  void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches) {
+    std::vector<int> s1_matches;
     prefix_helper(1, s1, s1, s1_matches, s1_matches);
     prefix_helper(0, s1, s2, s1_matches, num_matches);
   }  
 
-  void GetSuffixMatchCounts(const string& s1, const string& s2, vector<int>& num_matches) {
-    vector<int> s1_matches;
+  void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches) {
+    std::vector<int> s1_matches;
     suffix_helper(s1.size()-2, s1, s1, s1_matches, s1_matches);
     suffix_helper(s2.size()-1, s1, s2, s1_matches, num_matches);
   }  
diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h
index 6db8886..d4976df 100644
--- a/src/ZAlgorithm.h
+++ b/src/ZAlgorithm.h
@@ -21,10 +21,8 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 #include <string>
 #include <vector>
 
-using namespace std;
-
 
 namespace ZAlgorithm{
-  void GetPrefixMatchCounts(const string& s1, const string& s2, vector<int>& num_matches);
-  void GetSuffixMatchCounts(const string& s1, const string& s2, vector<int>& num_matches);
+  void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches);
+  void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches);
 }

From 91c53505187e4d7b116b9e39fe4389c5b8c0236f Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Wed, 19 Mar 2014 16:33:55 -0400
Subject: [PATCH 11/13] Optimized prefix/suffix filter so that it only
 calculates the values for the reference sequence region of interest

---
 src/AlignmentFilters.cpp      |  41 +++++++------
 src/ZAlgorithm.cpp            | 112 ++++++++++++++++++++++++++++++++--
 src/ZAlgorithm.h              |   2 +
 src/tests/ZAlgorithm_test.cpp |  35 ++++++-----
 4 files changed, 152 insertions(+), 38 deletions(-)

diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp
index f277a9f..3fb3a2c 100644
--- a/src/AlignmentFilters.cpp
+++ b/src/AlignmentFilters.cpp
@@ -189,7 +189,7 @@ namespace AlignmentFilters {
 
   /*
     
-   */
+   */ 
   bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){
     // Extract sequence, start and end coordinates of read after clipping
     string bases;
@@ -197,36 +197,41 @@ namespace AlignmentFilters {
     GetUnclippedInfo(aln, bases, start, end);
 
     // Check that the prefix match is the longest
-    vector<int> match_counts;
-    ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, match_counts);
-    if (ref_seq.size() != match_counts.size())
-      PrintMessageDieOnError("Length of Z-algorithm output must match that of the reference sequence", ERROR);
     if (start >= ref_seq_start && start < ref_seq_start + ref_seq.size()){
       int start_index = start - ref_seq_start;
-      int num_matches = match_counts[start_index];
-      for (int i = max(0, start_index-max_external); i < ref_seq.size() && i <= start_index + max_internal; i++){
-	if (i == start_index)
+      int start       = max(0, start_index - max_external);
+      int stop        = min((int)(ref_seq.size()-1), start_index + max_internal);
+      vector<int> match_counts;
+      ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, start, stop, match_counts);
+
+      int align_index = start_index - start;
+      int num_matches = match_counts[align_index];
+      for (unsigned int i = 0; i < match_counts.size(); i++){
+	if (i == align_index)
 	  continue;
 	if (match_counts[i] >= num_matches)
 	  return false;
       }
     }
-    
+
     // Check that the suffix match is the longest
-    ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, match_counts);
-    if (ref_seq.size() != match_counts.size())
-      PrintMessageDieOnError("Length of Z-algorithm output must match that of the reference sequence", ERROR);
     if (end >= ref_seq_start && end < ref_seq_start + ref_seq.size()){
-      int end_index   = end - ref_seq_start;
-      int num_matches = match_counts[end_index];
-      for (int i = max(0, end_index-max_internal); i < ref_seq.size() && i <= end_index + max_external; i++){
-	if (i == end_index)
+      int end_index = end - ref_seq_start;
+      int start     = max(0, end_index - max_internal);
+      int stop      = min((int)(ref_seq.size()-1), end_index + max_external);
+      vector<int> match_counts;
+      ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, start, stop, match_counts);
+      
+      int align_index = end_index - start;
+      int num_matches = match_counts[align_index];
+      for (unsigned int i = 0; i < match_counts.size(); i++){
+	if (i == align_index)
 	  continue;
 	if (match_counts[i] >= num_matches)
 	  return false;
       }
-    }
-
+    }    
+   
     return true;
   }
 }
diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp
index 04ac95b..d4745c4 100644
--- a/src/ZAlgorithm.cpp
+++ b/src/ZAlgorithm.cpp
@@ -23,10 +23,12 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 
 #include <stdlib.h>
 
-#include "ZAlgorithm.h"
+#include "src/common.h"
+#include "src/ZAlgorithm.h"
 
 
 namespace ZAlgorithm{
+  /*
   void suffix_helper(int start, const std::string& s1, const std::string& s2, 
 		     std::vector<int>& s1_matches, std::vector<int>& num_matches){
     num_matches  = std::vector<int>(s2.size(), -1);
@@ -64,8 +66,51 @@ namespace ZAlgorithm{
       }
     }
   }
+  */
 
 
+  void suffix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right,
+		     std::vector<int>& s1_matches, std::vector<int>& num_matches){
+    num_matches  = std::vector<int>(s2_right - s2_left + 1, -1);
+    int leftmost = s2_right+1, right_index = s2_right+1;
+    for (int i = s2_right; i >= s2_left; i--){
+      if (i <= leftmost){
+	int index_a = s1.size()-1, index_b = i;
+	while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){
+	  index_a--;
+	  index_b--;
+	}
+	num_matches[i-s2_left] = i - index_b;
+	if (index_b < i){
+	  right_index = i;
+	  leftmost    = index_b + 1;
+	}
+      }
+      else {
+	int twin     = i - right_index + s1.size()-1;
+	int new_left = i - s1_matches[twin] + 1;
+	if (new_left > leftmost)
+	  num_matches[i-s2_left] = s1_matches[twin];
+	else if (new_left < leftmost)
+	  num_matches[i-s2_left] = i-leftmost+1;
+	else {
+	  int index_a = s1.size()-2-i+leftmost, index_b = leftmost-1;
+	  while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){
+	    index_a--;
+	    index_b--;
+	  }
+	  num_matches[i-s2_left] = i-index_b;
+	  right_index            = i;
+	  leftmost               = index_b + 1;
+	}
+      }
+    }
+  }
+
+
+
+
+  /*
   void prefix_helper(unsigned int start, const std::string& s1, const std::string& s2, 
 		     std::vector<int>& s1_matches, std::vector<int>& num_matches){
     num_matches = std::vector<int>(s2.size(), -1);
@@ -103,17 +148,74 @@ namespace ZAlgorithm{
       }
     }
   }
+  */
+
+  void prefix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right,
+		     std::vector<int>& s1_matches, std::vector<int>& num_matches, int offset){
+    num_matches = std::vector<int>(s2_right-s2_left+1+offset, -1);
+    int rightmost = -1, left_index = -1;
+    for (int i = s2_left; i <= s2_right; i++){
+      if (i >= rightmost){
+	int index_a = 0, index_b = i;
+	while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){
+	  index_a++;
+	  index_b++;
+	}
+	num_matches[i-s2_left+offset] = index_b - i;
+	if (index_b > i){
+	  left_index = i;
+	  rightmost  = index_b - 1;
+	}
+      }
+      else {
+	int twin      = i - left_index;
+	int new_right = i + s1_matches[twin] - 1;
+	if (new_right < rightmost)
+	  num_matches[i-s2_left+offset] = s1_matches[twin];
+	else if (new_right > rightmost)
+	  num_matches[i-s2_left+offset] = rightmost-i+1;
+	else {
+	  int index_a = rightmost+1-i, index_b = rightmost+1;
+	  while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){
+	    index_a++;
+	    index_b++;
+	  }
+	  num_matches[i-s2_left+offset] = index_b - i;
+	  left_index     = i;
+	  rightmost      = index_b - 1;
+	}
+      }
+    }
+  }
+
+
 
   void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches) {
     std::vector<int> s1_matches;
-    prefix_helper(1, s1, s1, s1_matches, s1_matches);
-    prefix_helper(0, s1, s2, s1_matches, num_matches);
+    prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches,  1);
+    prefix_helper(s1, s2, 0, s2.size()-1, s1_matches, num_matches, 0);
   }  
 
   void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches) {
     std::vector<int> s1_matches;
-    suffix_helper(s1.size()-2, s1, s1, s1_matches, s1_matches);
-    suffix_helper(s2.size()-1, s1, s2, s1_matches, num_matches);
+    suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches);
+    suffix_helper(s1, s2, 0, s2.size()-1, s1_matches, num_matches);
+  }  
+
+  void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector<int>& num_matches) {
+    if (s2_start < 0 or s2_stop >= s2.size())
+	PrintMessageDieOnError("Invalid string indices provided to GetPrefixMatchCounts", ERROR);
+    std::vector<int> s1_matches;
+    prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches,  1);
+    prefix_helper(s1, s2, s2_start, s2_stop, s1_matches, num_matches, 0);
   }  
+
+  void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector<int>& num_matches) {
+     if (s2_start < 0 or s2_stop >= s2.size())
+	PrintMessageDieOnError("Invalid string indices provided to GetSuffixMatchCounts", ERROR);
+    std::vector<int> s1_matches;
+    suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches);
+    suffix_helper(s1, s2, s2_start, s2_stop, s1_matches, num_matches);
+  }
 }
 
diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h
index d4976df..14ce501 100644
--- a/src/ZAlgorithm.h
+++ b/src/ZAlgorithm.h
@@ -25,4 +25,6 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 namespace ZAlgorithm{
   void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches);
   void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches);
+  void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector<int>& num_matches);
+  void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector<int>& num_matches);
 }
diff --git a/src/tests/ZAlgorithm_test.cpp b/src/tests/ZAlgorithm_test.cpp
index ef215e1..6ab4372 100644
--- a/src/tests/ZAlgorithm_test.cpp
+++ b/src/tests/ZAlgorithm_test.cpp
@@ -37,14 +37,16 @@ void ZAlgorithmTest::tearDown(){}
 void ZAlgorithmTest::test_Prefix(){
   vector<int> num_matches;
   string s1, s2;
+  int s2_start, s2_stop;
   for (int i = 0; i < NUM_TRIALS; i++){
-    s1 = DNATools::RandDNA(rand()%LENGTH);
-    s2 = DNATools::RandDNA(rand()%LENGTH);
-
-    ZAlgorithm::GetPrefixMatchCounts(s1, s2, num_matches);
-    for (unsigned int j = 0; j < s2.size(); j++){
-      bool match  = (s1.substr(0, num_matches[j]).compare(s2.substr(j, num_matches[j])) == 0);
-      bool longer = (s1.size() > num_matches[j] && s2.size() > (j + num_matches[j]) && s2[j+num_matches[j]] == s1[num_matches[j]]);
+    s1       = DNATools::RandDNA(1+rand()%(LENGTH-1));
+    s2       = DNATools::RandDNA(1+rand()%(LENGTH-1));
+    s2_start = rand()%s2.size();
+    s2_stop  = s2_start + (rand()%(s2.size()-s2_start));
+    ZAlgorithm::GetPrefixMatchCounts(s1, s2, s2_start, s2_stop, num_matches);
+    for (int j = s2_start; j <= s2_stop; j++){
+      bool match  = (s1.substr(0, num_matches[j-s2_start]).compare(s2.substr(j, num_matches[j-s2_start])) == 0);
+      bool longer = (s1.size() > num_matches[j-s2_start] && s2.size() > (j + num_matches[j-s2_start]) && s2[j+num_matches[j-s2_start]] == s1[num_matches[j-s2_start]]);
       CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer)));
     }
   }
@@ -53,16 +55,19 @@ void ZAlgorithmTest::test_Prefix(){
 void ZAlgorithmTest::test_Suffix(){
   vector<int> num_matches;
   string s1, s2;
+  int s2_start, s2_stop;
   for (int i = 0; i < NUM_TRIALS; i++){
-    s1 = DNATools::RandDNA(LENGTH);
-    s2 = DNATools::RandDNA(LENGTH);
-    ZAlgorithm::GetSuffixMatchCounts(s1, s2, num_matches);
-    for (int j = s2.size()-1; j >= 0; j--){
-      string sub_1 = s1.substr(s1.size()-num_matches[j], num_matches[j]);
-      string sub_2 = s2.substr(j-num_matches[j]+1, num_matches[j]);
+    s1       = DNATools::RandDNA(1+rand()%(LENGTH-1));
+    s2       = DNATools::RandDNA(1+rand()%(LENGTH-1));
+    s2_start = rand()%s2.size();
+    s2_stop  = s2_start + (rand()%(s2.size()-s2_start));
+    ZAlgorithm::GetSuffixMatchCounts(s1, s2, s2_start, s2_stop, num_matches);
+    for (int j = s2_start; j <= s2_stop; j++){
+      string sub_1 = s1.substr(s1.size()-num_matches[j-s2_start], num_matches[j-s2_start]);
+      string sub_2 = s2.substr(j-num_matches[j-s2_start]+1, num_matches[j-s2_start]);
       bool match   = (sub_1.compare(sub_2) == 0);
-      bool longer  = ((j-num_matches[j]) > 0 && (s1.size()-num_matches[j]) > 0 && s1[s1.size()-num_matches[j]-1] == s2[j-num_matches[j]]);
-      CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer)));
+      bool longer  = ((j-num_matches[j-s2_start]) > 0 && (s1.size()-num_matches[j-s2_start]) > 0 && s1[s1.size()-num_matches[j-s2_start]-1] == s2[j-num_matches[j-s2_start]]);
+      CPPUNIT_ASSERT_MESSAGE("ZAlgorithm suffix error", !(!match || (match && longer)));
     }
   }
 }

From fad48b22afda146d6bc883c6cf9a3dda9fb94efe Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Wed, 19 Mar 2014 17:10:53 -0400
Subject: [PATCH 12/13] Added function documentation and integer casts to
 removed signed-unsigned comparison warnings

---
 src/AlignmentFilters.cpp | 20 ++++-----
 src/AlignmentFilters.h   |  6 ++-
 src/ZAlgorithm.cpp       | 93 ++--------------------------------------
 src/ZAlgorithm.h         | 25 +++++++++++
 4 files changed, 42 insertions(+), 102 deletions(-)

diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp
index 3fb3a2c..23cc72a 100644
--- a/src/AlignmentFilters.cpp
+++ b/src/AlignmentFilters.cpp
@@ -186,10 +186,7 @@ namespace AlignmentFilters {
     bases = aln->nucleotides.substr(start_index, num_bases);
   }
 
-
-  /*
-    
-   */ 
+ 
   bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){
     // Extract sequence, start and end coordinates of read after clipping
     string bases;
@@ -197,16 +194,16 @@ namespace AlignmentFilters {
     GetUnclippedInfo(aln, bases, start, end);
 
     // Check that the prefix match is the longest
-    if (start >= ref_seq_start && start < ref_seq_start + ref_seq.size()){
+    if (start >= ref_seq_start && start < ref_seq_start + static_cast<int>(ref_seq.size())){
       int start_index = start - ref_seq_start;
       int start       = max(0, start_index - max_external);
-      int stop        = min((int)(ref_seq.size()-1), start_index + max_internal);
+      int stop        = min(static_cast<int>((ref_seq.size()-1)), start_index + max_internal);
       vector<int> match_counts;
       ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, start, stop, match_counts);
 
       int align_index = start_index - start;
       int num_matches = match_counts[align_index];
-      for (unsigned int i = 0; i < match_counts.size(); i++){
+      for (int i = 0; i < static_cast<int>(match_counts.size()); i++){
 	if (i == align_index)
 	  continue;
 	if (match_counts[i] >= num_matches)
@@ -215,23 +212,22 @@ namespace AlignmentFilters {
     }
 
     // Check that the suffix match is the longest
-    if (end >= ref_seq_start && end < ref_seq_start + ref_seq.size()){
+    if (end >= ref_seq_start && end < ref_seq_start + static_cast<int>(ref_seq.size())){
       int end_index = end - ref_seq_start;
       int start     = max(0, end_index - max_internal);
-      int stop      = min((int)(ref_seq.size()-1), end_index + max_external);
+      int stop      = min(static_cast<int>(ref_seq.size()-1), end_index + max_external);
       vector<int> match_counts;
       ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, start, stop, match_counts);
       
       int align_index = end_index - start;
       int num_matches = match_counts[align_index];
-      for (unsigned int i = 0; i < match_counts.size(); i++){
+      for (int i = 0; i < static_cast<int>(match_counts.size()); i++){
 	if (i == align_index)
 	  continue;
 	if (match_counts[i] >= num_matches)
 	  return false;
       }
-    }    
-   
+    }       
     return true;
   }
 }
diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h
index ec45bf7..e548ae2 100644
--- a/src/AlignmentFilters.h
+++ b/src/AlignmentFilters.h
@@ -40,7 +40,11 @@ namespace AlignmentFilters {
   /* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */
   std::pair<int,int> GetEndDistToIndel(AlignedRead* aln);
 
-  /* Returns true iff the alignment ends match maximally compared to other positions within the specified window. */
+  /* Returns true iff the alignment has: 
+     1) a maximal matching prefix compared to alignments that start [-max_upstream, max_downstream] from the 5' alignment position of the read
+     2) a maximal matching suffix compared to alignments that end   [-max_downstream, max_upstream] from the 3' alignment position of the read
+     Ignores clipped bases when performing these comparions 
+  */
   bool HasLargestEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream);
 }
 
diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp
index d4745c4..9ec1a9b 100644
--- a/src/ZAlgorithm.cpp
+++ b/src/ZAlgorithm.cpp
@@ -28,47 +28,6 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 
 
 namespace ZAlgorithm{
-  /*
-  void suffix_helper(int start, const std::string& s1, const std::string& s2, 
-		     std::vector<int>& s1_matches, std::vector<int>& num_matches){
-    num_matches  = std::vector<int>(s2.size(), -1);
-    int leftmost = s2.size(), right_index = s2.size();
-    for (int i = start; i >= 0; i--){
-      if (i <= leftmost){
-	int index_a = s1.size()-1, index_b = i;
-	while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){
-	  index_a--;
-	  index_b--;
-	}
-	num_matches[i] = i - index_b;
-	if (index_b < i){
-	  right_index = i;
-	  leftmost    = index_b + 1;
-	}
-      }
-      else {
-	int twin     = i - right_index + s1.size()-1;
-	int new_left = i - s1_matches[twin] + 1;
-	if (new_left > leftmost)
-	  num_matches[i] = s1_matches[twin];
-	else if (new_left < leftmost)
-	  num_matches[i] = i-leftmost+1;
-	else {
-	  int index_a = s1.size()-2-i+leftmost, index_b = leftmost-1;
-	  while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){
-	    index_a--;
-	    index_b--;
-	  }
-	  num_matches[i] = i-index_b;
-	  right_index    = i;
-	  leftmost       = index_b + 1;
-	}
-      }
-    }
-  }
-  */
-
-
   void suffix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right,
 		     std::vector<int>& s1_matches, std::vector<int>& num_matches){
     num_matches  = std::vector<int>(s2_right - s2_left + 1, -1);
@@ -107,49 +66,6 @@ namespace ZAlgorithm{
     }
   }
 
-
-
-
-  /*
-  void prefix_helper(unsigned int start, const std::string& s1, const std::string& s2, 
-		     std::vector<int>& s1_matches, std::vector<int>& num_matches){
-    num_matches = std::vector<int>(s2.size(), -1);
-    int rightmost = 0, left_index = 0;
-    for (int i = start; i < s2.size(); i++){
-      if (i >= rightmost){
-	int index_a = 0, index_b = i;
-	while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){
-	  index_a++;
-	  index_b++;
-	}
-	num_matches[i] = index_b - i;
-	if (index_b > i){
-	  left_index = i;
-	  rightmost  = index_b - 1;
-	}
-      }
-      else {
-	int twin      = i - left_index;
-	int new_right = i + s1_matches[twin] - 1;
-	if (new_right < rightmost)
-	  num_matches[i] = s1_matches[twin];
-	else if (new_right > rightmost)
-	  num_matches[i] = rightmost-i+1;
-	else {
-	  int index_a = rightmost+1-i, index_b = rightmost+1;
-	  while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){
-	    index_a++;
-	    index_b++;
-	  }
-	  num_matches[i] = index_b - i;
-	  left_index     = i;
-	  rightmost      = index_b - 1;
-	}
-      }
-    }
-  }
-  */
-
   void prefix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right,
 		     std::vector<int>& s1_matches, std::vector<int>& num_matches, int offset){
     num_matches = std::vector<int>(s2_right-s2_left+1+offset, -1);
@@ -157,7 +73,7 @@ namespace ZAlgorithm{
     for (int i = s2_left; i <= s2_right; i++){
       if (i >= rightmost){
 	int index_a = 0, index_b = i;
-	while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){
+	while (index_a < static_cast<int>(s1.size()) && index_b < static_cast<int>(s2.size()) && s1[index_a] == s2[index_b]){
 	  index_a++;
 	  index_b++;
 	}
@@ -176,7 +92,7 @@ namespace ZAlgorithm{
 	  num_matches[i-s2_left+offset] = rightmost-i+1;
 	else {
 	  int index_a = rightmost+1-i, index_b = rightmost+1;
-	  while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){
+	  while (index_a < static_cast<int>(s1.size()) && index_b < static_cast<int>(s2.size()) && s1[index_a] == s2[index_b]){
 	    index_a++;
 	    index_b++;
 	  }
@@ -189,7 +105,6 @@ namespace ZAlgorithm{
   }
 
 
-
   void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches) {
     std::vector<int> s1_matches;
     prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches,  1);
@@ -203,7 +118,7 @@ namespace ZAlgorithm{
   }  
 
   void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector<int>& num_matches) {
-    if (s2_start < 0 or s2_stop >= s2.size())
+    if (s2_start < 0 or s2_stop >= static_cast<int>(s2.size()))
 	PrintMessageDieOnError("Invalid string indices provided to GetPrefixMatchCounts", ERROR);
     std::vector<int> s1_matches;
     prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches,  1);
@@ -211,7 +126,7 @@ namespace ZAlgorithm{
   }  
 
   void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector<int>& num_matches) {
-     if (s2_start < 0 or s2_stop >= s2.size())
+    if (s2_start < 0 or s2_stop >= static_cast<int>(s2.size()))
 	PrintMessageDieOnError("Invalid string indices provided to GetSuffixMatchCounts", ERROR);
     std::vector<int> s1_matches;
     suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches);
diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h
index 14ce501..777fbc8 100644
--- a/src/ZAlgorithm.h
+++ b/src/ZAlgorithm.h
@@ -23,8 +23,33 @@ along with lobSTR.  If not, see <http://www.gnu.org/licenses/>.
 
 
 namespace ZAlgorithm{
+  /* 
+   * For each position in s2, calculates the length of the matching prefix of s1 and s2[i...]
+   * and stores it in num_matches[i]. The provided vector is cleared and sized appropriately.
+   * Runs in O(length_of_s1 + length_of_s2)
+   */
   void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches);
+  
+  /* 
+   * For each position in s2, calculates the length of the matching suffix of s1 and s2[0...i]
+   * and stores it in num_matches[i]. The provided vector is cleared and sized appropriately.
+   * Runs in O(length_of_s1 + length_of_s2)
+   */
   void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector<int>& num_matches);
+
+  /* 
+   * For each position i in s2 in the range [s2_start, s2_stop], calculates the length of 
+   * the matching prefix of s1 and s2[i...] and stores it in num_matches[i-s2_start]. 
+   * The provided vector is cleared and sized appropriately.
+   * Runs in O(length_of_s1 + size_of_s2_range)
+   */
   void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector<int>& num_matches);
+  
+  /* 
+   * For each position i in s2 in the range [s2_start, s2_stop], calculates the length of 
+   * the matching suffix of s1 and s2[0...i] and stores it in num_matches[i-s2_start]. 
+   * The provided vector is cleared and sized appropriately.
+   * Runs in O(length_of_s1 + size_of_s2_range)
+   */
   void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector<int>& num_matches);
 }

From cb7eb3f6e02f136d7ac801223ddcff2c0139e93a Mon Sep 17 00:00:00 2001
From: Thomas Willems <twillems@club.wi.mit.edu>
Date: Wed, 19 Mar 2014 17:16:18 -0400
Subject: [PATCH 13/13] Added documentation for FilterCounter class

---
 src/FilterCounter.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/FilterCounter.h b/src/FilterCounter.h
index 218b395..60646fa 100644
--- a/src/FilterCounter.h
+++ b/src/FilterCounter.h
@@ -32,6 +32,8 @@ class FilterCounter {
   
  public:
   const static int NUM_FILTERS     = 10;
+
+  // Various filter types
   const static int NOT_UNIT        = 0;
   const static int DIFF_FROM_REF   = 1;
   const static int MAPPING_QUALITY = 2;
@@ -45,10 +47,13 @@ class FilterCounter {
 
   FilterCounter();
   
+  /* Increment the count for the provided filter type */
   void increment(const int type);
   
+  /* Returns the name of the filter associated with the provided filter type */
   std::string GetFilterType(const int type);
 
+  /* Returns the count associated with the provided filter type */
   uint64_t GetFilterCount(const int type);
 
   ~FilterCounter();