From 5e211493a4d586a6512c93fb41e149caa7c571c3 Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Fri, 14 Mar 2014 18:07:38 -0400 Subject: [PATCH 01/13] Added files for z-algorithm computation. Created method templates for one additional alignment filter --- src/AlignmentFilters.cpp | 19 +++++++++++++++ src/AlignmentFilters.h | 3 +++ src/Makefile.am | 7 ++++++ src/tests/NWNoRefEndPenalty_test.cpp | 35 ++++++---------------------- src/tests/NWNoRefEndPenalty_test.h | 2 +- src/tests/main_test.cpp | 2 ++ 6 files changed, 39 insertions(+), 29 deletions(-) diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp index a569aca..5855ccf 100644 --- a/src/AlignmentFilters.cpp +++ b/src/AlignmentFilters.cpp @@ -24,6 +24,7 @@ along with lobSTR. If not, see . #include "src/AlignmentFilters.h" #include "src/common.h" +#include "src/ZAlgorithm.h" using namespace std; @@ -147,4 +148,22 @@ namespace AlignmentFilters { else return pair(head_match, match_run); } + + + + bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream){ + /* + TO DO: Fill in this function + + Extract read nucleotides + Remove any soft-clipped bases from nucleotide string + + vector match_counts; + ZAlgorithm::GetPrefixMatchCounts(string& s1, string& s2, match_counts); + + ZAlgorithm::GetSuffixMatchCounts(string& s1, string& s2, match_counts); + + */ + return true; + } } diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h index e4c4d8a..f28e49b 100644 --- a/src/AlignmentFilters.h +++ b/src/AlignmentFilters.h @@ -41,6 +41,9 @@ namespace AlignmentFilters { /* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */ pair GetEndDistToIndel(AlignedRead* aln); + + /* Returns true iff the alignment ends match maximally compared to other positions within the specified window. */ + bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream); } #endif diff --git a/src/Makefile.am b/src/Makefile.am index a22cd01..f7dce80 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -201,6 +201,7 @@ liballelotype_a_SOURCES = \ TextFileReader.cpp TextFileReader.h \ TextFileWriter.cpp TextFileWriter.h \ VCFWriter.cpp VCFWriter.h \ + ZAlgorithm.cpp ZAlgorithm.h \ ZippedFastaFileReader.cpp ZippedFastaFileReader.h \ ZippedFastqFileReader.cpp ZippedFastqFileReader.h \ ZippedTextFileReader.cpp ZippedTextFileReader.h \ @@ -245,6 +246,8 @@ lobSTRTests_SOURCES = \ tests/BWAReadAligner_test.cpp \ tests/common_test.h \ tests/common_test.cpp \ + tests/DNATools.h \ + tests/DNATools.cpp \ tests/NWNoRefEndPenalty_test.h \ tests/NWNoRefEndPenalty_test.cpp \ tests/ReadContainer_test.h \ @@ -253,6 +256,8 @@ lobSTRTests_SOURCES = \ tests/RemoveDuplicates_test.cpp \ tests/VCFWriter_test.h \ tests/VCFWriter_test.cpp \ + tests/ZAlgorithm_test.h \ + tests/ZAlgorithm_test.cpp \ AlignmentFilters.cpp \ BamFileReader.cpp \ BamPairedFileReader.cpp \ @@ -281,6 +286,8 @@ lobSTRTests_SOURCES = \ ReadContainer.cpp \ RemoveDuplicates.cpp \ VCFWriter.cpp \ + ZAlgorithm.h \ + ZAlgorithm.cpp \ ZippedFastaFileReader.cpp \ ZippedFastqFileReader.cpp \ ZippedTextFileReader.cpp \ diff --git a/src/tests/NWNoRefEndPenalty_test.cpp b/src/tests/NWNoRefEndPenalty_test.cpp index bf60dec..0dc1038 100644 --- a/src/tests/NWNoRefEndPenalty_test.cpp +++ b/src/tests/NWNoRefEndPenalty_test.cpp @@ -25,36 +25,15 @@ along with lobSTR. If not, see . #include #include -#include "src/tests/NWNoRefEndPenalty_test.h" #include "src/common.h" - -// Registers the fixture into the 'registry' -CPPUNIT_TEST_SUITE_REGISTRATION(NWNoRefEndPenaltyTest); +#include "src/tests/DNATools.h" +#include "src/tests/NWNoRefEndPenalty_test.h" using namespace std; +// Registers the fixture into the 'registry' +CPPUNIT_TEST_SUITE_REGISTRATION(NWNoRefEndPenaltyTest); -char GetChar(int index){ - switch(index){ - case 0: - return 'A'; - case 1: - return 'C'; - case 2: - return 'G'; - case 3: - return 'T'; - default: - return 'N'; - } -} - -string RandDNA(const int length){ - stringstream seq; - for (int j = 0; j < length; j++) - seq << GetChar(rand()%4); - return seq.str(); -} void NWNoRefEndPenaltyTest::setUp() {} void NWNoRefEndPenaltyTest::tearDown() {} @@ -64,7 +43,7 @@ int NWNoRefEndPenaltyTest::GenAlignments(int num_trials, double mut_prob, double int num_correct = 0; for (int i = 0; i < num_trials; i++){ // Create random reference sequence - string ref_seq = RandDNA(REF_LEN); + string ref_seq = DNATools::RandDNA(REF_LEN); // Choose the read region int read_start = rand()%(ref_seq.size()-READ_LEN+1); @@ -75,10 +54,10 @@ int NWNoRefEndPenaltyTest::GenAlignments(int num_trials, double mut_prob, double for (unsigned int i = PERFECT_FLANK; i < read_seq.size()-PERFECT_FLANK; i++){ float prob = static_cast (rand()) / static_cast (RAND_MAX); if (prob <= mut_prob) - mut_read_seq_ss << RandDNA(1); + mut_read_seq_ss << DNATools::RandDNA(1); else if (prob - mut_prob <= ins_prob){ int ins_size = rand()%(MAX_INS-1) + 1; - mut_read_seq_ss << RandDNA(ins_size); + mut_read_seq_ss << DNATools::RandDNA(ins_size); } else if (prob - mut_prob - ins_prob <= del_prob){ int del_size = rand()%(MAX_DEL-1) + 1; diff --git a/src/tests/NWNoRefEndPenalty_test.h b/src/tests/NWNoRefEndPenalty_test.h index 967dffa..37cdb14 100644 --- a/src/tests/NWNoRefEndPenalty_test.h +++ b/src/tests/NWNoRefEndPenalty_test.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2011 Melissa Gymrek +Copyright (C) 2014 Thomas Willems This file is part of lobSTR. diff --git a/src/tests/main_test.cpp b/src/tests/main_test.cpp index 1026a95..4cc5bc6 100644 --- a/src/tests/main_test.cpp +++ b/src/tests/main_test.cpp @@ -27,6 +27,7 @@ along with lobSTR. If not, see . #include "src/tests/ReadContainer_test.h" #include "src/tests/RemoveDuplicates_test.h" #include "src/tests/VCFWriter_test.h" +#include "src/tests/ZAlgorithm_test.h" int main( int argc, char **argv) { // Adds the test to the list of tests to run @@ -38,6 +39,7 @@ int main( int argc, char **argv) { runner.addTest(ReadContainerTest::suite()); runner.addTest(RemoveDuplicatesTest::suite()); runner.addTest(VCFWriterTest::suite()); + runner.addTest(ZAlgorithmTest::suite()); // Run the tests bool wasSucessful = runner.run(); From 7b17ce7a3dc08bc181dc880b523bb20d570a4100 Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Fri, 14 Mar 2014 18:08:50 -0400 Subject: [PATCH 02/13] Added unit tests for z-algorithm prefix and suffix methods --- src/ZAlgorithm.cpp | 99 +++++++++++++++++++++++++++++++++++ src/ZAlgorithm.h | 10 ++++ src/tests/DNATools.cpp | 31 +++++++++++ src/tests/DNATools.h | 9 ++++ src/tests/ZAlgorithm_test.cpp | 50 ++++++++++++++++++ src/tests/ZAlgorithm_test.h | 46 ++++++++++++++++ 6 files changed, 245 insertions(+) create mode 100644 src/ZAlgorithm.cpp create mode 100644 src/ZAlgorithm.h create mode 100644 src/tests/DNATools.cpp create mode 100644 src/tests/DNATools.h create mode 100644 src/tests/ZAlgorithm_test.cpp create mode 100644 src/tests/ZAlgorithm_test.h diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp new file mode 100644 index 0000000..1682278 --- /dev/null +++ b/src/ZAlgorithm.cpp @@ -0,0 +1,99 @@ +#include +#include + +#include + +#include "ZAlgorithm.h" + + +namespace ZAlgorithm{ + void suffix_helper(int start, string& s1, string& s2, + vector& s1_matches, vector& num_matches){ + num_matches = vector(s2.size(), 0); + int leftmost = s2.size(), right_index = s2.size(); + for (int i = start; i >= 0; i--){ + if (i <= leftmost){ + int index_a = s1.size()-1, index_b = i; + while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){ + index_a--; + index_b--; + } + num_matches[i] = i - index_b; + if (index_b < i){ + right_index = i; + leftmost = index_b + 1; + } + } + else { + int twin = i - right_index + s1.size()-1; + int new_left = i - s1_matches[twin] + 1; + if (new_left > leftmost) + num_matches[i] = s1_matches[twin]; + else if (new_left < leftmost) + num_matches[i] = i-leftmost+1; + else { + int index_a = s1.size()-2-i+leftmost, index_b = leftmost-1; + while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){ + index_a--; + index_b--; + } + num_matches[i] = i-index_b; + right_index = i; + leftmost = index_b + 1; + } + } + } + } + + + void prefix_helper(unsigned int start, string& s1, string& s2, + vector& s1_matches, vector& num_matches){ + num_matches = vector(s2.size(), 0); + int rightmost = 0, left_index = 0; + for (int i = start; i < s2.size(); i++){ + if (i >= rightmost){ + int index_a = 0, index_b = i; + while (index_a < s1.size() && index_b < s1.size() && s1[index_a] == s2[index_b]){ + index_a++; + index_b++; + } + num_matches[i] = index_b - i; + if (index_b > i){ + left_index = i; + rightmost = index_b - 1; + } + } + else { + int twin = i - left_index; + int new_right = i + s1_matches[twin] - 1; + if (new_right < rightmost) + num_matches[i] = s1_matches[twin]; + else if (new_right > rightmost) + num_matches[i] = rightmost-i+1; + else { + int index_a = rightmost+1-i, index_b = rightmost+1; + while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){ + index_a++; + index_b++; + } + num_matches[i] = index_b - i; + left_index = i; + rightmost = index_b - 1; + } + } + } + } + + void GetPrefixMatchCounts(string& s1, string& s2, vector& num_matches) { + vector s1_matches; + prefix_helper(1, s1, s1, s1_matches, s1_matches); + prefix_helper(0, s1, s2, s1_matches, num_matches); + } + + void GetSuffixMatchCounts(string& s1, string& s2, vector& num_matches) { + vector s1_matches; + suffix_helper(s1.size()-2, s1, s1, s1_matches, s1_matches); + suffix_helper(s2.size()-1, s1, s2, s1_matches, num_matches); + } +} + diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h new file mode 100644 index 0000000..5dbb94e --- /dev/null +++ b/src/ZAlgorithm.h @@ -0,0 +1,10 @@ +#include +#include + +using namespace std; + + +namespace ZAlgorithm{ + void GetPrefixMatchCounts(string& s1, string& s2, vector& num_matches); + void GetSuffixMatchCounts(string& s1, string& s2, vector& num_matches); +} diff --git a/src/tests/DNATools.cpp b/src/tests/DNATools.cpp new file mode 100644 index 0000000..73ec7eb --- /dev/null +++ b/src/tests/DNATools.cpp @@ -0,0 +1,31 @@ +#include +#include + +#include + +#include "src/tests/DNATools.h" + +namespace DNATools { + char GetChar(int index){ + switch(index){ + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + return 'N'; + } + } + + std::string RandDNA(const int length){ + std::stringstream seq; + for (int j = 0; j < length; j++) + seq << GetChar(rand()%4); + return seq.str(); + } +} + diff --git a/src/tests/DNATools.h b/src/tests/DNATools.h new file mode 100644 index 0000000..0540313 --- /dev/null +++ b/src/tests/DNATools.h @@ -0,0 +1,9 @@ +#ifndef DNATOOLS_H +#define DNATOOLS_H + +#include + +namespace DNATools { + std::string RandDNA(const int length); +} +#endif // DNATOOLS_H diff --git a/src/tests/ZAlgorithm_test.cpp b/src/tests/ZAlgorithm_test.cpp new file mode 100644 index 0000000..88904aa --- /dev/null +++ b/src/tests/ZAlgorithm_test.cpp @@ -0,0 +1,50 @@ + +#include +#include + +#include "src/tests/DNATools.h" +#include "src/tests/ZAlgorithm_test.h" +#include "src/common.h" +#include "src/ZAlgorithm.h" + + +using namespace std; + +void ZAlgorithmTest::setUp(){} +void ZAlgorithmTest::tearDown(){} + +void ZAlgorithmTest::test_Prefix(){ + vector num_matches; + string s1, s2; + for (int i = 0; i < NUM_TRIALS; i++){ + s1 = DNATools::RandDNA(LENGTH); + s2 = DNATools::RandDNA(LENGTH); + ZAlgorithm::GetPrefixMatchCounts(s1, s2, num_matches); + for (unsigned int j = 0; j < s2.size(); j++){ + bool match = (s1.substr(0, num_matches[j]).compare(s2.substr(j, num_matches[j])) == 0); + bool longer = (s1.size() > num_matches[j] && s2.size() > (j + num_matches[j]) && s2[j+num_matches[j]] == s1[num_matches[j]]); + CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer))); + } + } +} + +void ZAlgorithmTest::test_Suffix(){ + vector num_matches; + string s1, s2; + for (int i = 0; i < NUM_TRIALS; i++){ + s1 = DNATools::RandDNA(LENGTH); + s2 = DNATools::RandDNA(LENGTH); + ZAlgorithm::GetSuffixMatchCounts(s1, s2, num_matches); + for (int j = s2.size()-1; j >= 0; j--){ + string sub_1 = s1.substr(s1.size()-num_matches[j], num_matches[j]); + string sub_2 = s2.substr(j-num_matches[j]+1, num_matches[j]); + bool match = (sub_1.compare(sub_2) == 0); + bool longer = ((j-num_matches[j]) > 0 && (s1.size()-num_matches[j]) > 0 && s1[s1.size()-num_matches[j]-1] == s2[j-num_matches[j]]); + CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer))); + } + } +} + + + + diff --git a/src/tests/ZAlgorithm_test.h b/src/tests/ZAlgorithm_test.h new file mode 100644 index 0000000..fc776f7 --- /dev/null +++ b/src/tests/ZAlgorithm_test.h @@ -0,0 +1,46 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + +#ifndef SRC_TESTS_ZALGORITHM_H__ +#define SRC_TESTS_ZALGORITHM_H__ + +#include + +#include "src/ZAlgorithm.h" + +class ZAlgorithmTest : +public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE(ZAlgorithmTest); + CPPUNIT_TEST(test_Prefix); + CPPUNIT_TEST(test_Suffix); + CPPUNIT_TEST_SUITE_END(); + + public: + const static int NUM_TRIALS=1000; + const static int LENGTH=100; + void setUp(); + void tearDown(); + void test_Prefix(); + void test_Suffix(); + + private: +}; + +#endif // SRC_TESTS_ZALGORITHM_H_ From e55c69d739499a8bf7f1bf7e0ef0212555008443 Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Mon, 17 Mar 2014 14:58:47 -0400 Subject: [PATCH 03/13] Completed method to check for maximal matching suffixes and prefixes. Added preliminary unit tests for this function --- src/AlignmentFilters.cpp | 89 ++++++++++++++++++++++++++--- src/ZAlgorithm.cpp | 28 +++++++-- src/ZAlgorithm.h | 24 +++++++- src/tests/AlignmentFilters_test.cpp | 45 +++++++++++++++ src/tests/AlignmentFilters_test.h | 2 + 5 files changed, 173 insertions(+), 15 deletions(-) diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp index 5855ccf..fa8fcef 100644 --- a/src/AlignmentFilters.cpp +++ b/src/AlignmentFilters.cpp @@ -150,20 +150,91 @@ namespace AlignmentFilters { } + /* + Stores the sequence, start and end position of the read after removing clipped bases + using the provided references + */ + void GetUnclippedInfo(AlignedRead* aln, string& bases, int& unclipped_start, int& unclipped_end){ + unclipped_start = aln->read_start; + unclipped_end = aln->read_start-1; + bool begin = true; + int start_index = 0, num_bases = 0; + for(vector::iterator cigar_iter = aln->cigar_ops.begin(); cigar_iter != aln->cigar_ops.end(); cigar_iter++){ + switch(cigar_iter->Type) { + case 'D': + unclipped_end += cigar_iter->Length; + begin = false; + break; + case 'H': + break; + case 'S': + if (begin) start_index += cigar_iter->Length; + break; + case 'M': + unclipped_end += cigar_iter->Length; + num_bases += cigar_iter->Length; + begin = false; + break; + case 'I': + num_bases += cigar_iter->Length; + begin = false; + break; + default: + PrintMessageDieOnError("Invalid CIGAR char " + cigar_iter->Type, ERROR); + break; + } + } + bases = aln->nucleotides.substr(start_index, num_bases); + } + - bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream){ - /* - TO DO: Fill in this function - Extract read nucleotides - Remove any soft-clipped bases from nucleotide string - + /* + + */ + bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){ + // Extract sequence, start and end coordinates of read after clipping + string bases; + int start, end; + GetUnclippedInfo(aln, bases, start, end); + + std::cerr << "Unclipped info: " << bases << " " << start << " " << end << std::endl; + + // Check that the prefix match is the longest vector match_counts; - ZAlgorithm::GetPrefixMatchCounts(string& s1, string& s2, match_counts); + ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, match_counts); + if (ref_seq.size() != match_counts.size()) + PrintMessageDieOnError("Length of Z-algorithm output must match that of the reference sequence", ERROR); + if (start >= ref_seq_start && start < ref_seq_start + ref_seq.size()){ + int start_index = start - ref_seq_start; + int num_matches = match_counts[start_index]; + for (int i = max(0, start_index-max_external); i < ref_seq.size() && i <= start_index + max_internal; i++){ + if (i == start_index) + continue; + if (match_counts[i] >= num_matches){ + std::cerr << "Prefix fail: " << start_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; + return false; + } + } + } - ZAlgorithm::GetSuffixMatchCounts(string& s1, string& s2, match_counts); + // Check that the suffix match is the longest + ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, match_counts); + if (ref_seq.size() != match_counts.size()) + PrintMessageDieOnError("Length of Z-algorithm output must match that of the reference sequence", ERROR); + if (end >= ref_seq_start && end < ref_seq_start + ref_seq.size()){ + int end_index = end - ref_seq_start; + int num_matches = match_counts[end_index]; + for (int i = max(0, end_index-max_internal); i < ref_seq.size() && i <= end_index + max_external; i++){ + if (i == end_index) + continue; + if (match_counts[i] >= num_matches){ + std::cerr << "Suffix fail: " << end_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; + return false; + } + } + } - */ return true; } } diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp index 1682278..018ca73 100644 --- a/src/ZAlgorithm.cpp +++ b/src/ZAlgorithm.cpp @@ -1,3 +1,23 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + #include #include @@ -7,7 +27,7 @@ namespace ZAlgorithm{ - void suffix_helper(int start, string& s1, string& s2, + void suffix_helper(int start, const string& s1, const string& s2, vector& s1_matches, vector& num_matches){ num_matches = vector(s2.size(), 0); int leftmost = s2.size(), right_index = s2.size(); @@ -46,7 +66,7 @@ namespace ZAlgorithm{ } - void prefix_helper(unsigned int start, string& s1, string& s2, + void prefix_helper(unsigned int start, const string& s1, const string& s2, vector& s1_matches, vector& num_matches){ num_matches = vector(s2.size(), 0); int rightmost = 0, left_index = 0; @@ -84,13 +104,13 @@ namespace ZAlgorithm{ } } - void GetPrefixMatchCounts(string& s1, string& s2, vector& num_matches) { + void GetPrefixMatchCounts(const string& s1, const string& s2, vector& num_matches) { vector s1_matches; prefix_helper(1, s1, s1, s1_matches, s1_matches); prefix_helper(0, s1, s2, s1_matches, num_matches); } - void GetSuffixMatchCounts(string& s1, string& s2, vector& num_matches) { + void GetSuffixMatchCounts(const string& s1, const string& s2, vector& num_matches) { vector s1_matches; suffix_helper(s1.size()-2, s1, s1, s1_matches, s1_matches); suffix_helper(s2.size()-1, s1, s2, s1_matches, num_matches); diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h index 5dbb94e..6db8886 100644 --- a/src/ZAlgorithm.h +++ b/src/ZAlgorithm.h @@ -1,3 +1,23 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + #include #include @@ -5,6 +25,6 @@ using namespace std; namespace ZAlgorithm{ - void GetPrefixMatchCounts(string& s1, string& s2, vector& num_matches); - void GetSuffixMatchCounts(string& s1, string& s2, vector& num_matches); + void GetPrefixMatchCounts(const string& s1, const string& s2, vector& num_matches); + void GetSuffixMatchCounts(const string& s1, const string& s2, vector& num_matches); } diff --git a/src/tests/AlignmentFilters_test.cpp b/src/tests/AlignmentFilters_test.cpp index d4fa929..3384497 100644 --- a/src/tests/AlignmentFilters_test.cpp +++ b/src/tests/AlignmentFilters_test.cpp @@ -128,3 +128,48 @@ void AlignmentFiltersTest::test_GetNumEndMatches(){ } +void AlignmentFiltersTest::test_HasLargestEndMatches(){ + AlignedRead aln; + string ref_seq; + int ref_start; + + // Simple case with perfect maximal match + aln.cigar_ops = GetCigarOps("9M"); + aln.read_start = 102; + aln.nucleotides = "ACACATACAC"; + ref_seq = "ACACACATACACACACAC"; + ref_start = 100; + CPPUNIT_ASSERT(AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); + + // Perfect maximal match and soft clipping + aln.cigar_ops = GetCigarOps("2S9M2S"); + aln.read_start = 102; + aln.nucleotides = "NNACACATACACNN"; + ref_seq = "ACACACATACACACACAC"; + ref_start = 100; + CPPUNIT_ASSERT(AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); + + // Perfect non-maximal match + aln.cigar_ops = GetCigarOps("8M"); + aln.read_start = 102; + aln.nucleotides = "ACACACAC"; + ref_seq = "ACACACACACACACAC"; + ref_start = 100; + CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); + + // Maximal prefix but not maximal suffix + aln.cigar_ops = GetCigarOps("9M"); + aln.read_start = 102; + aln.nucleotides = "ACATTACAC"; + ref_seq = "ACACATCACACACTACAC"; + ref_start = 100; + CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); + + // Maximal suffix but not maximal prefix + aln.cigar_ops = GetCigarOps("9M"); + aln.read_start = 102; + aln.nucleotides = "ACACTACAC"; + ref_seq = "ACACATTACACACACAC"; + ref_start = 100; + CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); +} diff --git a/src/tests/AlignmentFilters_test.h b/src/tests/AlignmentFilters_test.h index 5e4f5d6..bae2181 100644 --- a/src/tests/AlignmentFilters_test.h +++ b/src/tests/AlignmentFilters_test.h @@ -30,6 +30,7 @@ public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(AlignmentFiltersTest); CPPUNIT_TEST(test_GetDistToIndel); CPPUNIT_TEST(test_GetNumEndMatches); + CPPUNIT_TEST(test_HasLargestEndMatches); CPPUNIT_TEST_SUITE_END(); public: @@ -37,6 +38,7 @@ public CppUnit::TestFixture { void tearDown(); void test_GetDistToIndel(); void test_GetNumEndMatches(); + void test_HasLargestEndMatches(); private: }; From 927870792693f4f0fca820827f8421dba0660a7c Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Mon, 17 Mar 2014 15:19:15 -0400 Subject: [PATCH 04/13] Added command line options for maximal prefix/suffix read filter --- src/main_allelotype.cpp | 8 ++++++++ src/runtime_parameters.cpp | 1 + src/runtime_parameters.h | 1 + 3 files changed, 10 insertions(+) diff --git a/src/main_allelotype.cpp b/src/main_allelotype.cpp index b079394..95243d8 100644 --- a/src/main_allelotype.cpp +++ b/src/main_allelotype.cpp @@ -123,6 +123,8 @@ void show_help() { " from either end of the read.\n" "--min-read-end-match : Filter reads whose alignments don't exactly match the reference for at least\n"\ " bp at both ends. \n" + "--maximal-end-match : Filter reads whose prefix/suffix matches to reference are <= \n" + " those obtained when shifting the read ends by bp\n" "--exclude-partial: Do not report any information about partially\n" \ " spanning reads.\n\n" "Additional options\n" \ @@ -151,6 +153,7 @@ void parse_commandline_options(int argc, char* argv[]) { OPT_MAX_DIFF_REF, OPT_MAXMAPQ, OPT_MAXMATEDIST, + OPT_MAXIMAL_END_MATCH_WINDOW, OPT_MIN_BORDER, OPT_MIN_BP_BEFORE_INDEL, OPT_MIN_HET_FREQ, @@ -184,6 +187,7 @@ void parse_commandline_options(int argc, char* argv[]) { {"max-diff-ref", 1, 0, OPT_MAX_DIFF_REF}, {"mapq", 1, 0, OPT_MAXMAPQ}, {"max-matedist", 1, 0, OPT_MAXMATEDIST}, + {"maximal-end-match", 1, 0, OPT_MAXIMAL_END_MATCH_WINDOW}, {"min-border", 1, 0, OPT_MIN_BORDER}, {"min-bp-before-indel", 1, 0, OPT_MIN_BP_BEFORE_INDEL}, {"min-het-freq", 1, 0, OPT_MIN_HET_FREQ}, @@ -258,6 +262,10 @@ void parse_commandline_options(int argc, char* argv[]) { max_diff_ref = atoi(optarg); AddOption("max-diff-ref", string(optarg), true, &user_defined_arguments_allelotyper); break; + case OPT_MAXIMAL_END_MATCH_WINDOW: + maximal_end_match_window = atoi(optarg); + AddOption("maximal-end-match", string(optarg), true, &user_defined_arguments_allelotyper); + break; case OPT_MAXMAPQ: max_mapq = atoi(optarg); AddOption("mapq", string(optarg), true, &user_defined_arguments_allelotyper); diff --git a/src/runtime_parameters.cpp b/src/runtime_parameters.cpp index cde4cf9..a4baa79 100644 --- a/src/runtime_parameters.cpp +++ b/src/runtime_parameters.cpp @@ -115,6 +115,7 @@ bool include_gl = false; bool print_reads = false; float min_het_freq = 0; int max_matedist = 100000; +int maximal_end_match_window = 0; int min_border = 0; int min_bp_before_indel = 0; int min_read_end_match = 0; diff --git a/src/runtime_parameters.h b/src/runtime_parameters.h index c0618a5..9879204 100644 --- a/src/runtime_parameters.h +++ b/src/runtime_parameters.h @@ -140,6 +140,7 @@ extern int min_border; extern int min_bp_before_indel; extern int min_read_end_match; extern int max_matedist; +extern int maximal_end_match_window; // debug extern bool profile; From f9308b856668c3a9b0a66a42c0beefefd09fdac4 Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Mon, 17 Mar 2014 15:33:03 -0400 Subject: [PATCH 05/13] Modified the allelotyper to use the prefix/suffix filter, if requested --- src/AlignmentFilters.cpp | 6 +++--- src/ReadContainer.cpp | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp index fa8fcef..9cfc7ff 100644 --- a/src/AlignmentFilters.cpp +++ b/src/AlignmentFilters.cpp @@ -198,7 +198,7 @@ namespace AlignmentFilters { int start, end; GetUnclippedInfo(aln, bases, start, end); - std::cerr << "Unclipped info: " << bases << " " << start << " " << end << std::endl; + //std::cerr << "Unclipped info: " << bases << " " << start << " " << end << std::endl; // Check that the prefix match is the longest vector match_counts; @@ -212,7 +212,7 @@ namespace AlignmentFilters { if (i == start_index) continue; if (match_counts[i] >= num_matches){ - std::cerr << "Prefix fail: " << start_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; + //std::cerr << "Prefix fail: " << start_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; return false; } } @@ -229,7 +229,7 @@ namespace AlignmentFilters { if (i == end_index) continue; if (match_counts[i] >= num_matches){ - std::cerr << "Suffix fail: " << end_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; + //std::cerr << "Suffix fail: " << end_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; return false; } } diff --git a/src/ReadContainer.cpp b/src/ReadContainer.cpp index 48f6435..ff4e85f 100644 --- a/src/ReadContainer.cpp +++ b/src/ReadContainer.cpp @@ -258,6 +258,19 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln, return false; } + + // check that the prefix and suffix of the read match maximally compared to proximal reference locations + if (maximal_end_match_window > 0){ + map, string>::iterator loc_iter = ref_ext_nucleotides.find(pair(aligned_read->chrom, aligned_read->msStart)); + if (loc_iter == ref_ext_nucleotides.end()) + PrintMessageDieOnError("No extended reference sequence found for locus", ERROR); + string ref_ext_seq = loc_iter->second; + bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window); + if (!maximum_end_matches) + return false; + } + + // check that both ends of the aligned read have sufficient bases before the first indel if (min_bp_before_indel > 0){ pair num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read); From 8e032aaaf8b5b647381328ca8c25c45ff602e6ba Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Mon, 17 Mar 2014 18:57:10 -0400 Subject: [PATCH 06/13] Fixed major Z-algorithm bug which caused it to fail on strings of unequal length. --- src/AlignmentFilters.cpp | 11 ++--------- src/ZAlgorithm.cpp | 6 +++--- src/main_allelotype.cpp | 4 ++-- src/tests/ZAlgorithm_test.cpp | 26 ++++++++++++++++++++++++-- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp index 9cfc7ff..82a1fbe 100644 --- a/src/AlignmentFilters.cpp +++ b/src/AlignmentFilters.cpp @@ -188,7 +188,6 @@ namespace AlignmentFilters { } - /* */ @@ -198,8 +197,6 @@ namespace AlignmentFilters { int start, end; GetUnclippedInfo(aln, bases, start, end); - //std::cerr << "Unclipped info: " << bases << " " << start << " " << end << std::endl; - // Check that the prefix match is the longest vector match_counts; ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, match_counts); @@ -211,10 +208,8 @@ namespace AlignmentFilters { for (int i = max(0, start_index-max_external); i < ref_seq.size() && i <= start_index + max_internal; i++){ if (i == start_index) continue; - if (match_counts[i] >= num_matches){ - //std::cerr << "Prefix fail: " << start_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; + if (match_counts[i] >= num_matches) return false; - } } } @@ -228,10 +223,8 @@ namespace AlignmentFilters { for (int i = max(0, end_index-max_internal); i < ref_seq.size() && i <= end_index + max_external; i++){ if (i == end_index) continue; - if (match_counts[i] >= num_matches){ - //std::cerr << "Suffix fail: " << end_index << " " << num_matches << " " << i << " " << match_counts[i] << std::endl; + if (match_counts[i] >= num_matches) return false; - } } } diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp index 018ca73..6a39f5f 100644 --- a/src/ZAlgorithm.cpp +++ b/src/ZAlgorithm.cpp @@ -29,7 +29,7 @@ along with lobSTR. If not, see . namespace ZAlgorithm{ void suffix_helper(int start, const string& s1, const string& s2, vector& s1_matches, vector& num_matches){ - num_matches = vector(s2.size(), 0); + num_matches = vector(s2.size(), -1); int leftmost = s2.size(), right_index = s2.size(); for (int i = start; i >= 0; i--){ if (i <= leftmost){ @@ -68,12 +68,12 @@ namespace ZAlgorithm{ void prefix_helper(unsigned int start, const string& s1, const string& s2, vector& s1_matches, vector& num_matches){ - num_matches = vector(s2.size(), 0); + num_matches = vector(s2.size(), -1); int rightmost = 0, left_index = 0; for (int i = start; i < s2.size(); i++){ if (i >= rightmost){ int index_a = 0, index_b = i; - while (index_a < s1.size() && index_b < s1.size() && s1[index_a] == s2[index_b]){ + while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){ index_a++; index_b++; } diff --git a/src/main_allelotype.cpp b/src/main_allelotype.cpp index 95243d8..2ebb7af 100644 --- a/src/main_allelotype.cpp +++ b/src/main_allelotype.cpp @@ -123,8 +123,8 @@ void show_help() { " from either end of the read.\n" "--min-read-end-match : Filter reads whose alignments don't exactly match the reference for at least\n"\ " bp at both ends. \n" - "--maximal-end-match : Filter reads whose prefix/suffix matches to reference are <= \n" - " those obtained when shifting the read ends by bp\n" + "--maximal-end-match : Filter reads whose prefix/suffix matches to reference are <= those \n" + " obtained when shifting the read ends by distances within bp\n" "--exclude-partial: Do not report any information about partially\n" \ " spanning reads.\n\n" "Additional options\n" \ diff --git a/src/tests/ZAlgorithm_test.cpp b/src/tests/ZAlgorithm_test.cpp index 88904aa..ef215e1 100644 --- a/src/tests/ZAlgorithm_test.cpp +++ b/src/tests/ZAlgorithm_test.cpp @@ -1,7 +1,28 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ #include #include +#include + #include "src/tests/DNATools.h" #include "src/tests/ZAlgorithm_test.h" #include "src/common.h" @@ -17,8 +38,9 @@ void ZAlgorithmTest::test_Prefix(){ vector num_matches; string s1, s2; for (int i = 0; i < NUM_TRIALS; i++){ - s1 = DNATools::RandDNA(LENGTH); - s2 = DNATools::RandDNA(LENGTH); + s1 = DNATools::RandDNA(rand()%LENGTH); + s2 = DNATools::RandDNA(rand()%LENGTH); + ZAlgorithm::GetPrefixMatchCounts(s1, s2, num_matches); for (unsigned int j = 0; j < s2.size(); j++){ bool match = (s1.substr(0, num_matches[j]).compare(s2.substr(j, num_matches[j])) == 0); From f11d2e227f3aaf68a15104253c0bbaf24785de91 Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Tue, 18 Mar 2014 17:05:44 -0400 Subject: [PATCH 07/13] Added functionality to count the number of reads filtered out by each filter. Statistics are printed at the end of allelotyping to the stats file --- src/AlignedRead.h | 5 +-- src/AlignmentFilters.cpp | 1 - src/AlignmentFilters.h | 14 ++++---- src/FilterCounter.cpp | 74 ++++++++++++++++++++++++++++++++++++++ src/FilterCounter.h | 56 +++++++++++++++++++++++++++++ src/Makefile.am | 3 ++ src/ReadContainer.cpp | 33 ++++++++++++----- src/RemoveDuplicates.h | 4 +-- src/RunInfo.h | 7 +++- src/common.cpp | 2 +- src/runtime_parameters.cpp | 1 + src/runtime_parameters.h | 4 +++ 12 files changed, 179 insertions(+), 25 deletions(-) create mode 100644 src/FilterCounter.cpp create mode 100644 src/FilterCounter.h diff --git a/src/AlignedRead.h b/src/AlignedRead.h index 87650d6..ab32091 100644 --- a/src/AlignedRead.h +++ b/src/AlignedRead.h @@ -22,9 +22,6 @@ along with lobSTR. If not, see . #include "src/api/BamAlignment.h" -using namespace std; -using BamTools::CigarOp; - /* Struct to keep track of aligned read used in allelotyping */ @@ -37,7 +34,7 @@ struct AlignedRead { int read_start; std::string nucleotides; std::string qualities; - vector cigar_ops; + std::vector cigar_ops; std::string repseq; int period; int diffFromRef; diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp index 82a1fbe..f277a9f 100644 --- a/src/AlignmentFilters.cpp +++ b/src/AlignmentFilters.cpp @@ -29,7 +29,6 @@ along with lobSTR. If not, see . using namespace std; namespace AlignmentFilters { - template int GetDistToIndel(CigarIterator iter, CigarIterator end){ // Process leading clipping ops if (iter != end && iter->Type == 'H') diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h index f28e49b..a2d6b78 100644 --- a/src/AlignmentFilters.h +++ b/src/AlignmentFilters.h @@ -19,8 +19,8 @@ along with lobSTR. If not, see . */ -#ifndef SRC_ALIGNMENTFILTERS_H -#define SRC_ALIGNMENTFILTERS_H +#ifndef SRC_ALIGNMENTFILTERS_H_ +#define SRC_ALIGNMENTFILTERS_H_ #include #include @@ -31,19 +31,19 @@ using namespace std; namespace AlignmentFilters { /* Returns the vector of CigarOps corresponding to the CIGAR string. */ - vector GetCigarOps(string cigar_string); + std::vector GetCigarOps(std::string cigar_string); /* Returns the CIGAR string corresponding to the vector of CigarOps. */ - string GetCigarString(vector& cigar_ops); + std::string GetCigarString(std::vector& cigar_ops); /* Length of perfect base matches at 5' and 3' end of read. */ - pair GetNumEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start); + std::pair GetNumEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start); /* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */ - pair GetEndDistToIndel(AlignedRead* aln); + std::pair GetEndDistToIndel(AlignedRead* aln); /* Returns true iff the alignment ends match maximally compared to other positions within the specified window. */ - bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream); + bool HasLargestEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream); } #endif diff --git a/src/FilterCounter.cpp b/src/FilterCounter.cpp new file mode 100644 index 0000000..4148278 --- /dev/null +++ b/src/FilterCounter.cpp @@ -0,0 +1,74 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + +#include "src/common.h" +#include "src/FilterCounter.h" + +FilterCounter::FilterCounter(){ + counts = new uint64_t[NUM_FILTERS]; + for (int i = 0; i < NUM_FILTERS; i++) + counts[i] = 0; +} + +void FilterCounter::increment(const int type){ + if (type > NUM_FILTERS || type < 0) + PrintMessageDieOnError("Invalid filter type", ERROR); + counts[type]++; +} + +std::string FilterCounter::GetFilterType(const int type){ + switch(type){ + case NOT_UNIT: + return "NOT_UNIT"; + case DIFF_FROM_REF: + return "DIFF_FROM_REF"; + case MAPPING_QUALITY: + return "MAPPING_QUALITY"; + case MATE_DIST: + return "MATE_DIST"; + case ALLELE_SIZE: + return "ALLELE_SIZE"; + case SPANNING_AMOUNT: + return "SPANNING_AMOUNT"; + case NUM_END_MATCHES: + return "NUM_END_MATCHES"; + case NOT_MAXIMAL_END: + return "NOT_MAXIMAL_END"; + case BP_BEFORE_INDEL: + return "BP_BEFORE_INDEL"; + default: + PrintMessageDieOnError("Invalid filter type", ERROR); + } +} + +uint64_t FilterCounter::GetFilterCount(const int type){ + if (type > NUM_FILTERS || type < 0) + PrintMessageDieOnError("Invalid filter type", ERROR); + return counts[type]; +} + + +FilterCounter::~FilterCounter(){ + delete [] counts; +} + + + + diff --git a/src/FilterCounter.h b/src/FilterCounter.h new file mode 100644 index 0000000..23341b9 --- /dev/null +++ b/src/FilterCounter.h @@ -0,0 +1,56 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + + +#ifndef SRC_FILTER_COUNTER_H_ +#define SRC_FILTER_COUNTER_H_ + +#include + +#include + +class FilterCounter { + private: + uint64_t* counts; + + public: + const static int NUM_FILTERS = 9; + const static int NOT_UNIT = 0; + const static int DIFF_FROM_REF = 1; + const static int MAPPING_QUALITY = 2; + const static int MATE_DIST = 3; + const static int ALLELE_SIZE = 4; + const static int SPANNING_AMOUNT = 5; + const static int NUM_END_MATCHES = 6; + const static int NOT_MAXIMAL_END = 7; + const static int BP_BEFORE_INDEL = 8; + + FilterCounter(); + + void increment(const int type); + + std::string GetFilterType(const int type); + + uint64_t GetFilterCount(const int type); + + ~FilterCounter(); +}; + +#endif diff --git a/src/Makefile.am b/src/Makefile.am index f7dce80..57b6c1c 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -121,6 +121,7 @@ liblobstr_a_SOURCES = \ FastqPairedFileReader.cpp FastqPairedFileReader.h \ FFT_four_nuc_vectors.cpp FFT_four_nuc_vectors.h \ FFT_nuc_vectors.cpp FFT_nuc_vectors.h \ + FilterCounter.cpp FilterCounter.h \ gzstream.cpp gzstream.h \ STRRecord.h \ HammingWindowGenerator.cpp HammingWindowGenerator.h \ @@ -186,6 +187,7 @@ liballelotype_a_SOURCES = \ FastqFileReader.cpp FastqFileReader.h \ FastaPairedFileReader.cpp FastaPairedFileReader.h \ FastqPairedFileReader.cpp FastqPairedFileReader.h \ + FilterCounter.cpp FilterCounter.h \ Genotyper.cpp Genotyper.h \ gzstream.cpp gzstream.h \ NoiseModel.cpp NoiseModel.h \ @@ -271,6 +273,7 @@ lobSTRTests_SOURCES = \ FastqPairedFileReader.cpp \ FFT_four_nuc_vectors.cpp \ FFT_nuc_vectors.cpp \ + FilterCounter.h FilterCounter.cpp \ gzstream.cpp \ HammingWindowGenerator.cpp \ MultithreadData.cpp \ diff --git a/src/ReadContainer.cpp b/src/ReadContainer.cpp index ff4e85f..350d503 100644 --- a/src/ReadContainer.cpp +++ b/src/ReadContainer.cpp @@ -225,27 +225,36 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln, } // apply filters if (unit) { - if (aligned_read->diffFromRef % aligned_read->period != 0) return false; + if (aligned_read->diffFromRef % aligned_read->period != 0){ + filter_counter.increment(FilterCounter::NOT_UNIT); + return false; + } } if (abs(aligned_read->diffFromRef) > max_diff_ref) { + filter_counter.increment(FilterCounter::DIFF_FROM_REF); return false; } if (aligned_read->mapq > max_mapq) { + filter_counter.increment(FilterCounter::MAPPING_QUALITY); return false; } if (aligned_read->matedist > max_matedist) { + filter_counter.increment(FilterCounter::MATE_DIST); return false; } // Check if the allele length is valid if (aligned_read->diffFromRef + (aligned_read->refCopyNum*aligned_read->period) < MIN_ALLELE_SIZE) { + filter_counter.increment(FilterCounter::ALLELE_SIZE); return false; } // check that read sufficiently spans STR int max_read_start = aligned_read->msStart - min_border; int min_read_stop = aligned_read->msEnd + min_border; - if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop) - return false; + if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop){ + filter_counter.increment(FilterCounter::SPANNING_AMOUNT); + return false; + } // check that both ends of the read contain sufficient perfect matches if (min_read_end_match > 0){ @@ -254,11 +263,12 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln, PrintMessageDieOnError("No extended reference sequence found for locus", ERROR); string ref_ext_seq = loc_iter->second; pair num_end_matches = AlignmentFilters::GetNumEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend); - if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match) + if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match){ + filter_counter.increment(FilterCounter::NUM_END_MATCHES); return false; + } } - // check that the prefix and suffix of the read match maximally compared to proximal reference locations if (maximal_end_match_window > 0){ map, string>::iterator loc_iter = ref_ext_nucleotides.find(pair(aligned_read->chrom, aligned_read->msStart)); @@ -266,18 +276,23 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln, PrintMessageDieOnError("No extended reference sequence found for locus", ERROR); string ref_ext_seq = loc_iter->second; bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window); - if (!maximum_end_matches) + if (!maximum_end_matches){ + filter_counter.increment(FilterCounter::NOT_MAXIMAL_END); return false; + } } - // check that both ends of the aligned read have sufficient bases before the first indel if (min_bp_before_indel > 0){ pair num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read); - if (num_bps.first != -1 && num_bps.first < min_bp_before_indel) + if (num_bps.first != -1 && num_bps.first < min_bp_before_indel){ + filter_counter.increment(FilterCounter::BP_BEFORE_INDEL); return false; - if (num_bps.second != -1 && num_bps.second < min_bp_before_indel) + } + if (num_bps.second != -1 && num_bps.second < min_bp_before_indel){ + filter_counter.increment(FilterCounter::BP_BEFORE_INDEL); return false; + } } return true; diff --git a/src/RemoveDuplicates.h b/src/RemoveDuplicates.h index 4560740..0517b6f 100644 --- a/src/RemoveDuplicates.h +++ b/src/RemoveDuplicates.h @@ -34,10 +34,10 @@ namespace RemoveDuplicates { void RemovePCRDuplicates(std::list* aligned_reads); /* Get representative read from group of duplicates, read with highest qual score */ - void GetRepRead(const list& aligned_reads, AlignedRead* rep_alignment); + void GetRepRead(const std::list& aligned_reads, AlignedRead* rep_alignment); /* Get quality score of a read */ - float GetScore(const string& quality_string); + float GetScore(const std::string& quality_string); } #endif // SRC_REMOVEDUPLICATES_H_ diff --git a/src/RunInfo.h b/src/RunInfo.h index 6f76e08..362bb31 100644 --- a/src/RunInfo.h +++ b/src/RunInfo.h @@ -25,6 +25,8 @@ along with lobSTR. If not, see . #include #include +#include "src/FilterCounter.h" + /* RunInfo stores information about the run that will be uploaded to Amazon for further analysis */ @@ -87,7 +89,7 @@ class RunInfo { } // Print to string - std::string PrintToString(int program) { + std::string PrintToString(int program, FilterCounter& filter_counter) { std::stringstream ss; ss << "Starttime: " << starttime << std::endl; ss << "Endtime: " << endtime << std::endl; @@ -119,6 +121,9 @@ class RunInfo { ss << "Num calls >=5x\t" << num_calls5x.at(i) << std::endl; ss << "Mean coverage\t" << static_cast(total_coverage.at(i))/static_cast(num_calls.at(i)) << std::endl; ss << "Mean perc. agree\t" << static_cast(total_agree.at(i))/static_cast(num_calls.at(i)) << std::endl; + ss << "Read filter stats:" << std::endl; + for (int i = 0; i < FilterCounter::NUM_FILTERS; i++) + ss << "\t" << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl; ss << std::endl; } ss << "Call type by period (0/0, 0/1, 1/1, 1/2)" << std::endl; diff --git a/src/common.cpp b/src/common.cpp index c723ef1..f6676af 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -84,7 +84,7 @@ void OutputRunStatistics() { PrintMessageDieOnError("Outputting run statistics", PROGRESS); TextFileWriter sWriter(output_prefix + (program == LOBSTR? ".aligned.stats":".allelotype.stats")); // Output run statistics to stats file - string stats_string = run_info.PrintToString((program == LOBSTR ? 0: 1)); + string stats_string = run_info.PrintToString((program == LOBSTR ? 0: 1), filter_counter); sWriter.Write(stats_string); // Upload to AWS S3 if (!noweb) { diff --git a/src/runtime_parameters.cpp b/src/runtime_parameters.cpp index a4baa79..9f020ee 100644 --- a/src/runtime_parameters.cpp +++ b/src/runtime_parameters.cpp @@ -23,6 +23,7 @@ along with lobSTR. If not, see . #include "src/runtime_parameters.h" RunInfo run_info; +FilterCounter filter_counter; int MIN_PERIOD=1; int MAX_PERIOD=6; diff --git a/src/runtime_parameters.h b/src/runtime_parameters.h index 9879204..01853a1 100644 --- a/src/runtime_parameters.h +++ b/src/runtime_parameters.h @@ -25,6 +25,7 @@ along with lobSTR. If not, see . #include #include "src/RunInfo.h" +#include "src/FilterCounter.h" // period range for genotyper extern int MIN_PERIOD; @@ -33,6 +34,9 @@ extern int MAX_PERIOD; // global variables that probably shouldn't be global variables extern RunInfo run_info; +// when allelotyping, tracks why filtered reads were omitted +extern FilterCounter filter_counter; + // stores user arguments for lobSTR alignment // used to store the bam header for allelotyping step extern std::string user_defined_arguments; From 329a7cbf621c6536f723f94a4347e7d3b4a31eae Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Tue, 18 Mar 2014 17:23:43 -0400 Subject: [PATCH 08/13] Fixed bug in which filter stats were printed for every sample. Added unfiltered filter type --- src/FilterCounter.cpp | 2 ++ src/FilterCounter.h | 5 +++-- src/ReadContainer.cpp | 2 +- src/RunInfo.h | 9 +++++---- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/FilterCounter.cpp b/src/FilterCounter.cpp index 4148278..972a41a 100644 --- a/src/FilterCounter.cpp +++ b/src/FilterCounter.cpp @@ -53,6 +53,8 @@ std::string FilterCounter::GetFilterType(const int type){ return "NOT_MAXIMAL_END"; case BP_BEFORE_INDEL: return "BP_BEFORE_INDEL"; + case UNFILTERED: + return "UNFILTERED"; default: PrintMessageDieOnError("Invalid filter type", ERROR); } diff --git a/src/FilterCounter.h b/src/FilterCounter.h index 23341b9..218b395 100644 --- a/src/FilterCounter.h +++ b/src/FilterCounter.h @@ -31,7 +31,7 @@ class FilterCounter { uint64_t* counts; public: - const static int NUM_FILTERS = 9; + const static int NUM_FILTERS = 10; const static int NOT_UNIT = 0; const static int DIFF_FROM_REF = 1; const static int MAPPING_QUALITY = 2; @@ -41,7 +41,8 @@ class FilterCounter { const static int NUM_END_MATCHES = 6; const static int NOT_MAXIMAL_END = 7; const static int BP_BEFORE_INDEL = 8; - + const static int UNFILTERED = 9; + FilterCounter(); void increment(const int type); diff --git a/src/ReadContainer.cpp b/src/ReadContainer.cpp index 350d503..c45c766 100644 --- a/src/ReadContainer.cpp +++ b/src/ReadContainer.cpp @@ -294,7 +294,7 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln, return false; } } - + filter_counter.increment(FilterCounter::UNFILTERED); return true; } diff --git a/src/RunInfo.h b/src/RunInfo.h index 362bb31..56d4c4f 100644 --- a/src/RunInfo.h +++ b/src/RunInfo.h @@ -121,10 +121,6 @@ class RunInfo { ss << "Num calls >=5x\t" << num_calls5x.at(i) << std::endl; ss << "Mean coverage\t" << static_cast(total_coverage.at(i))/static_cast(num_calls.at(i)) << std::endl; ss << "Mean perc. agree\t" << static_cast(total_agree.at(i))/static_cast(num_calls.at(i)) << std::endl; - ss << "Read filter stats:" << std::endl; - for (int i = 0; i < FilterCounter::NUM_FILTERS; i++) - ss << "\t" << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl; - ss << std::endl; } ss << "Call type by period (0/0, 0/1, 1/1, 1/2)" << std::endl; for (size_t i = 0; i < calltype_by_period.size(); i++) { @@ -135,6 +131,11 @@ class RunInfo { } ss << std::endl; } + + ss << "Read filter stats:" << std::endl; + for (int i = 0; i < FilterCounter::NUM_FILTERS; i++) + ss << "\t" << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl; + ss << std::endl; } return ss.str(); From c8244eca344fe1215e59223a57214597cafca2c8 Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Wed, 19 Mar 2014 11:54:53 -0400 Subject: [PATCH 09/13] Formatted filter stats in allelotype.stats output --- src/RunInfo.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/RunInfo.h b/src/RunInfo.h index 56d4c4f..2dd4de1 100644 --- a/src/RunInfo.h +++ b/src/RunInfo.h @@ -21,6 +21,7 @@ along with lobSTR. If not, see . #ifndef SRC_RUNINFO_H_ #define SRC_RUNINFO_H_ +#include #include #include #include @@ -134,7 +135,7 @@ class RunInfo { ss << "Read filter stats:" << std::endl; for (int i = 0; i < FilterCounter::NUM_FILTERS; i++) - ss << "\t" << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl; + ss << "\t" << std::setw(25) << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl; ss << std::endl; } From 228ae57367c4598773c548d01efe30d55da5992b Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Wed, 19 Mar 2014 14:08:59 -0400 Subject: [PATCH 10/13] Cleaned up namespaces in various header files --- src/AlignmentFilters.h | 2 -- src/ZAlgorithm.cpp | 20 ++++++++++---------- src/ZAlgorithm.h | 6 ++---- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h index a2d6b78..ec45bf7 100644 --- a/src/AlignmentFilters.h +++ b/src/AlignmentFilters.h @@ -27,8 +27,6 @@ along with lobSTR. If not, see . #include "src/AlignedRead.h" -using namespace std; - namespace AlignmentFilters { /* Returns the vector of CigarOps corresponding to the CIGAR string. */ std::vector GetCigarOps(std::string cigar_string); diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp index 6a39f5f..04ac95b 100644 --- a/src/ZAlgorithm.cpp +++ b/src/ZAlgorithm.cpp @@ -27,9 +27,9 @@ along with lobSTR. If not, see . namespace ZAlgorithm{ - void suffix_helper(int start, const string& s1, const string& s2, - vector& s1_matches, vector& num_matches){ - num_matches = vector(s2.size(), -1); + void suffix_helper(int start, const std::string& s1, const std::string& s2, + std::vector& s1_matches, std::vector& num_matches){ + num_matches = std::vector(s2.size(), -1); int leftmost = s2.size(), right_index = s2.size(); for (int i = start; i >= 0; i--){ if (i <= leftmost){ @@ -66,9 +66,9 @@ namespace ZAlgorithm{ } - void prefix_helper(unsigned int start, const string& s1, const string& s2, - vector& s1_matches, vector& num_matches){ - num_matches = vector(s2.size(), -1); + void prefix_helper(unsigned int start, const std::string& s1, const std::string& s2, + std::vector& s1_matches, std::vector& num_matches){ + num_matches = std::vector(s2.size(), -1); int rightmost = 0, left_index = 0; for (int i = start; i < s2.size(); i++){ if (i >= rightmost){ @@ -104,14 +104,14 @@ namespace ZAlgorithm{ } } - void GetPrefixMatchCounts(const string& s1, const string& s2, vector& num_matches) { - vector s1_matches; + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) { + std::vector s1_matches; prefix_helper(1, s1, s1, s1_matches, s1_matches); prefix_helper(0, s1, s2, s1_matches, num_matches); } - void GetSuffixMatchCounts(const string& s1, const string& s2, vector& num_matches) { - vector s1_matches; + void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) { + std::vector s1_matches; suffix_helper(s1.size()-2, s1, s1, s1_matches, s1_matches); suffix_helper(s2.size()-1, s1, s2, s1_matches, num_matches); } diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h index 6db8886..d4976df 100644 --- a/src/ZAlgorithm.h +++ b/src/ZAlgorithm.h @@ -21,10 +21,8 @@ along with lobSTR. If not, see . #include #include -using namespace std; - namespace ZAlgorithm{ - void GetPrefixMatchCounts(const string& s1, const string& s2, vector& num_matches); - void GetSuffixMatchCounts(const string& s1, const string& s2, vector& num_matches); + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches); + void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches); } From 91c53505187e4d7b116b9e39fe4389c5b8c0236f Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Wed, 19 Mar 2014 16:33:55 -0400 Subject: [PATCH 11/13] Optimized prefix/suffix filter so that it only calculates the values for the reference sequence region of interest --- src/AlignmentFilters.cpp | 41 +++++++------ src/ZAlgorithm.cpp | 112 ++++++++++++++++++++++++++++++++-- src/ZAlgorithm.h | 2 + src/tests/ZAlgorithm_test.cpp | 35 ++++++----- 4 files changed, 152 insertions(+), 38 deletions(-) diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp index f277a9f..3fb3a2c 100644 --- a/src/AlignmentFilters.cpp +++ b/src/AlignmentFilters.cpp @@ -189,7 +189,7 @@ namespace AlignmentFilters { /* - */ + */ bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){ // Extract sequence, start and end coordinates of read after clipping string bases; @@ -197,36 +197,41 @@ namespace AlignmentFilters { GetUnclippedInfo(aln, bases, start, end); // Check that the prefix match is the longest - vector match_counts; - ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, match_counts); - if (ref_seq.size() != match_counts.size()) - PrintMessageDieOnError("Length of Z-algorithm output must match that of the reference sequence", ERROR); if (start >= ref_seq_start && start < ref_seq_start + ref_seq.size()){ int start_index = start - ref_seq_start; - int num_matches = match_counts[start_index]; - for (int i = max(0, start_index-max_external); i < ref_seq.size() && i <= start_index + max_internal; i++){ - if (i == start_index) + int start = max(0, start_index - max_external); + int stop = min((int)(ref_seq.size()-1), start_index + max_internal); + vector match_counts; + ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, start, stop, match_counts); + + int align_index = start_index - start; + int num_matches = match_counts[align_index]; + for (unsigned int i = 0; i < match_counts.size(); i++){ + if (i == align_index) continue; if (match_counts[i] >= num_matches) return false; } } - + // Check that the suffix match is the longest - ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, match_counts); - if (ref_seq.size() != match_counts.size()) - PrintMessageDieOnError("Length of Z-algorithm output must match that of the reference sequence", ERROR); if (end >= ref_seq_start && end < ref_seq_start + ref_seq.size()){ - int end_index = end - ref_seq_start; - int num_matches = match_counts[end_index]; - for (int i = max(0, end_index-max_internal); i < ref_seq.size() && i <= end_index + max_external; i++){ - if (i == end_index) + int end_index = end - ref_seq_start; + int start = max(0, end_index - max_internal); + int stop = min((int)(ref_seq.size()-1), end_index + max_external); + vector match_counts; + ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, start, stop, match_counts); + + int align_index = end_index - start; + int num_matches = match_counts[align_index]; + for (unsigned int i = 0; i < match_counts.size(); i++){ + if (i == align_index) continue; if (match_counts[i] >= num_matches) return false; } - } - + } + return true; } } diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp index 04ac95b..d4745c4 100644 --- a/src/ZAlgorithm.cpp +++ b/src/ZAlgorithm.cpp @@ -23,10 +23,12 @@ along with lobSTR. If not, see . #include -#include "ZAlgorithm.h" +#include "src/common.h" +#include "src/ZAlgorithm.h" namespace ZAlgorithm{ + /* void suffix_helper(int start, const std::string& s1, const std::string& s2, std::vector& s1_matches, std::vector& num_matches){ num_matches = std::vector(s2.size(), -1); @@ -64,8 +66,51 @@ namespace ZAlgorithm{ } } } + */ + void suffix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right, + std::vector& s1_matches, std::vector& num_matches){ + num_matches = std::vector(s2_right - s2_left + 1, -1); + int leftmost = s2_right+1, right_index = s2_right+1; + for (int i = s2_right; i >= s2_left; i--){ + if (i <= leftmost){ + int index_a = s1.size()-1, index_b = i; + while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){ + index_a--; + index_b--; + } + num_matches[i-s2_left] = i - index_b; + if (index_b < i){ + right_index = i; + leftmost = index_b + 1; + } + } + else { + int twin = i - right_index + s1.size()-1; + int new_left = i - s1_matches[twin] + 1; + if (new_left > leftmost) + num_matches[i-s2_left] = s1_matches[twin]; + else if (new_left < leftmost) + num_matches[i-s2_left] = i-leftmost+1; + else { + int index_a = s1.size()-2-i+leftmost, index_b = leftmost-1; + while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){ + index_a--; + index_b--; + } + num_matches[i-s2_left] = i-index_b; + right_index = i; + leftmost = index_b + 1; + } + } + } + } + + + + + /* void prefix_helper(unsigned int start, const std::string& s1, const std::string& s2, std::vector& s1_matches, std::vector& num_matches){ num_matches = std::vector(s2.size(), -1); @@ -103,17 +148,74 @@ namespace ZAlgorithm{ } } } + */ + + void prefix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right, + std::vector& s1_matches, std::vector& num_matches, int offset){ + num_matches = std::vector(s2_right-s2_left+1+offset, -1); + int rightmost = -1, left_index = -1; + for (int i = s2_left; i <= s2_right; i++){ + if (i >= rightmost){ + int index_a = 0, index_b = i; + while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){ + index_a++; + index_b++; + } + num_matches[i-s2_left+offset] = index_b - i; + if (index_b > i){ + left_index = i; + rightmost = index_b - 1; + } + } + else { + int twin = i - left_index; + int new_right = i + s1_matches[twin] - 1; + if (new_right < rightmost) + num_matches[i-s2_left+offset] = s1_matches[twin]; + else if (new_right > rightmost) + num_matches[i-s2_left+offset] = rightmost-i+1; + else { + int index_a = rightmost+1-i, index_b = rightmost+1; + while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){ + index_a++; + index_b++; + } + num_matches[i-s2_left+offset] = index_b - i; + left_index = i; + rightmost = index_b - 1; + } + } + } + } + + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) { std::vector s1_matches; - prefix_helper(1, s1, s1, s1_matches, s1_matches); - prefix_helper(0, s1, s2, s1_matches, num_matches); + prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches, 1); + prefix_helper(s1, s2, 0, s2.size()-1, s1_matches, num_matches, 0); } void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) { std::vector s1_matches; - suffix_helper(s1.size()-2, s1, s1, s1_matches, s1_matches); - suffix_helper(s2.size()-1, s1, s2, s1_matches, num_matches); + suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches); + suffix_helper(s1, s2, 0, s2.size()-1, s1_matches, num_matches); + } + + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches) { + if (s2_start < 0 or s2_stop >= s2.size()) + PrintMessageDieOnError("Invalid string indices provided to GetPrefixMatchCounts", ERROR); + std::vector s1_matches; + prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches, 1); + prefix_helper(s1, s2, s2_start, s2_stop, s1_matches, num_matches, 0); } + + void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches) { + if (s2_start < 0 or s2_stop >= s2.size()) + PrintMessageDieOnError("Invalid string indices provided to GetSuffixMatchCounts", ERROR); + std::vector s1_matches; + suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches); + suffix_helper(s1, s2, s2_start, s2_stop, s1_matches, num_matches); + } } diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h index d4976df..14ce501 100644 --- a/src/ZAlgorithm.h +++ b/src/ZAlgorithm.h @@ -25,4 +25,6 @@ along with lobSTR. If not, see . namespace ZAlgorithm{ void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches); void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches); + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches); + void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches); } diff --git a/src/tests/ZAlgorithm_test.cpp b/src/tests/ZAlgorithm_test.cpp index ef215e1..6ab4372 100644 --- a/src/tests/ZAlgorithm_test.cpp +++ b/src/tests/ZAlgorithm_test.cpp @@ -37,14 +37,16 @@ void ZAlgorithmTest::tearDown(){} void ZAlgorithmTest::test_Prefix(){ vector num_matches; string s1, s2; + int s2_start, s2_stop; for (int i = 0; i < NUM_TRIALS; i++){ - s1 = DNATools::RandDNA(rand()%LENGTH); - s2 = DNATools::RandDNA(rand()%LENGTH); - - ZAlgorithm::GetPrefixMatchCounts(s1, s2, num_matches); - for (unsigned int j = 0; j < s2.size(); j++){ - bool match = (s1.substr(0, num_matches[j]).compare(s2.substr(j, num_matches[j])) == 0); - bool longer = (s1.size() > num_matches[j] && s2.size() > (j + num_matches[j]) && s2[j+num_matches[j]] == s1[num_matches[j]]); + s1 = DNATools::RandDNA(1+rand()%(LENGTH-1)); + s2 = DNATools::RandDNA(1+rand()%(LENGTH-1)); + s2_start = rand()%s2.size(); + s2_stop = s2_start + (rand()%(s2.size()-s2_start)); + ZAlgorithm::GetPrefixMatchCounts(s1, s2, s2_start, s2_stop, num_matches); + for (int j = s2_start; j <= s2_stop; j++){ + bool match = (s1.substr(0, num_matches[j-s2_start]).compare(s2.substr(j, num_matches[j-s2_start])) == 0); + bool longer = (s1.size() > num_matches[j-s2_start] && s2.size() > (j + num_matches[j-s2_start]) && s2[j+num_matches[j-s2_start]] == s1[num_matches[j-s2_start]]); CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer))); } } @@ -53,16 +55,19 @@ void ZAlgorithmTest::test_Prefix(){ void ZAlgorithmTest::test_Suffix(){ vector num_matches; string s1, s2; + int s2_start, s2_stop; for (int i = 0; i < NUM_TRIALS; i++){ - s1 = DNATools::RandDNA(LENGTH); - s2 = DNATools::RandDNA(LENGTH); - ZAlgorithm::GetSuffixMatchCounts(s1, s2, num_matches); - for (int j = s2.size()-1; j >= 0; j--){ - string sub_1 = s1.substr(s1.size()-num_matches[j], num_matches[j]); - string sub_2 = s2.substr(j-num_matches[j]+1, num_matches[j]); + s1 = DNATools::RandDNA(1+rand()%(LENGTH-1)); + s2 = DNATools::RandDNA(1+rand()%(LENGTH-1)); + s2_start = rand()%s2.size(); + s2_stop = s2_start + (rand()%(s2.size()-s2_start)); + ZAlgorithm::GetSuffixMatchCounts(s1, s2, s2_start, s2_stop, num_matches); + for (int j = s2_start; j <= s2_stop; j++){ + string sub_1 = s1.substr(s1.size()-num_matches[j-s2_start], num_matches[j-s2_start]); + string sub_2 = s2.substr(j-num_matches[j-s2_start]+1, num_matches[j-s2_start]); bool match = (sub_1.compare(sub_2) == 0); - bool longer = ((j-num_matches[j]) > 0 && (s1.size()-num_matches[j]) > 0 && s1[s1.size()-num_matches[j]-1] == s2[j-num_matches[j]]); - CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer))); + bool longer = ((j-num_matches[j-s2_start]) > 0 && (s1.size()-num_matches[j-s2_start]) > 0 && s1[s1.size()-num_matches[j-s2_start]-1] == s2[j-num_matches[j-s2_start]]); + CPPUNIT_ASSERT_MESSAGE("ZAlgorithm suffix error", !(!match || (match && longer))); } } } From fad48b22afda146d6bc883c6cf9a3dda9fb94efe Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Wed, 19 Mar 2014 17:10:53 -0400 Subject: [PATCH 12/13] Added function documentation and integer casts to removed signed-unsigned comparison warnings --- src/AlignmentFilters.cpp | 20 ++++----- src/AlignmentFilters.h | 6 ++- src/ZAlgorithm.cpp | 93 ++-------------------------------------- src/ZAlgorithm.h | 25 +++++++++++ 4 files changed, 42 insertions(+), 102 deletions(-) diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp index 3fb3a2c..23cc72a 100644 --- a/src/AlignmentFilters.cpp +++ b/src/AlignmentFilters.cpp @@ -186,10 +186,7 @@ namespace AlignmentFilters { bases = aln->nucleotides.substr(start_index, num_bases); } - - /* - - */ + bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){ // Extract sequence, start and end coordinates of read after clipping string bases; @@ -197,16 +194,16 @@ namespace AlignmentFilters { GetUnclippedInfo(aln, bases, start, end); // Check that the prefix match is the longest - if (start >= ref_seq_start && start < ref_seq_start + ref_seq.size()){ + if (start >= ref_seq_start && start < ref_seq_start + static_cast(ref_seq.size())){ int start_index = start - ref_seq_start; int start = max(0, start_index - max_external); - int stop = min((int)(ref_seq.size()-1), start_index + max_internal); + int stop = min(static_cast((ref_seq.size()-1)), start_index + max_internal); vector match_counts; ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, start, stop, match_counts); int align_index = start_index - start; int num_matches = match_counts[align_index]; - for (unsigned int i = 0; i < match_counts.size(); i++){ + for (int i = 0; i < static_cast(match_counts.size()); i++){ if (i == align_index) continue; if (match_counts[i] >= num_matches) @@ -215,23 +212,22 @@ namespace AlignmentFilters { } // Check that the suffix match is the longest - if (end >= ref_seq_start && end < ref_seq_start + ref_seq.size()){ + if (end >= ref_seq_start && end < ref_seq_start + static_cast(ref_seq.size())){ int end_index = end - ref_seq_start; int start = max(0, end_index - max_internal); - int stop = min((int)(ref_seq.size()-1), end_index + max_external); + int stop = min(static_cast(ref_seq.size()-1), end_index + max_external); vector match_counts; ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, start, stop, match_counts); int align_index = end_index - start; int num_matches = match_counts[align_index]; - for (unsigned int i = 0; i < match_counts.size(); i++){ + for (int i = 0; i < static_cast(match_counts.size()); i++){ if (i == align_index) continue; if (match_counts[i] >= num_matches) return false; } - } - + } return true; } } diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h index ec45bf7..e548ae2 100644 --- a/src/AlignmentFilters.h +++ b/src/AlignmentFilters.h @@ -40,7 +40,11 @@ namespace AlignmentFilters { /* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */ std::pair GetEndDistToIndel(AlignedRead* aln); - /* Returns true iff the alignment ends match maximally compared to other positions within the specified window. */ + /* Returns true iff the alignment has: + 1) a maximal matching prefix compared to alignments that start [-max_upstream, max_downstream] from the 5' alignment position of the read + 2) a maximal matching suffix compared to alignments that end [-max_downstream, max_upstream] from the 3' alignment position of the read + Ignores clipped bases when performing these comparions + */ bool HasLargestEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream); } diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp index d4745c4..9ec1a9b 100644 --- a/src/ZAlgorithm.cpp +++ b/src/ZAlgorithm.cpp @@ -28,47 +28,6 @@ along with lobSTR. If not, see . namespace ZAlgorithm{ - /* - void suffix_helper(int start, const std::string& s1, const std::string& s2, - std::vector& s1_matches, std::vector& num_matches){ - num_matches = std::vector(s2.size(), -1); - int leftmost = s2.size(), right_index = s2.size(); - for (int i = start; i >= 0; i--){ - if (i <= leftmost){ - int index_a = s1.size()-1, index_b = i; - while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){ - index_a--; - index_b--; - } - num_matches[i] = i - index_b; - if (index_b < i){ - right_index = i; - leftmost = index_b + 1; - } - } - else { - int twin = i - right_index + s1.size()-1; - int new_left = i - s1_matches[twin] + 1; - if (new_left > leftmost) - num_matches[i] = s1_matches[twin]; - else if (new_left < leftmost) - num_matches[i] = i-leftmost+1; - else { - int index_a = s1.size()-2-i+leftmost, index_b = leftmost-1; - while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){ - index_a--; - index_b--; - } - num_matches[i] = i-index_b; - right_index = i; - leftmost = index_b + 1; - } - } - } - } - */ - - void suffix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right, std::vector& s1_matches, std::vector& num_matches){ num_matches = std::vector(s2_right - s2_left + 1, -1); @@ -107,49 +66,6 @@ namespace ZAlgorithm{ } } - - - - /* - void prefix_helper(unsigned int start, const std::string& s1, const std::string& s2, - std::vector& s1_matches, std::vector& num_matches){ - num_matches = std::vector(s2.size(), -1); - int rightmost = 0, left_index = 0; - for (int i = start; i < s2.size(); i++){ - if (i >= rightmost){ - int index_a = 0, index_b = i; - while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){ - index_a++; - index_b++; - } - num_matches[i] = index_b - i; - if (index_b > i){ - left_index = i; - rightmost = index_b - 1; - } - } - else { - int twin = i - left_index; - int new_right = i + s1_matches[twin] - 1; - if (new_right < rightmost) - num_matches[i] = s1_matches[twin]; - else if (new_right > rightmost) - num_matches[i] = rightmost-i+1; - else { - int index_a = rightmost+1-i, index_b = rightmost+1; - while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){ - index_a++; - index_b++; - } - num_matches[i] = index_b - i; - left_index = i; - rightmost = index_b - 1; - } - } - } - } - */ - void prefix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right, std::vector& s1_matches, std::vector& num_matches, int offset){ num_matches = std::vector(s2_right-s2_left+1+offset, -1); @@ -157,7 +73,7 @@ namespace ZAlgorithm{ for (int i = s2_left; i <= s2_right; i++){ if (i >= rightmost){ int index_a = 0, index_b = i; - while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){ + while (index_a < static_cast(s1.size()) && index_b < static_cast(s2.size()) && s1[index_a] == s2[index_b]){ index_a++; index_b++; } @@ -176,7 +92,7 @@ namespace ZAlgorithm{ num_matches[i-s2_left+offset] = rightmost-i+1; else { int index_a = rightmost+1-i, index_b = rightmost+1; - while (index_a < s1.size() && index_b < s2.size() && s1[index_a] == s2[index_b]){ + while (index_a < static_cast(s1.size()) && index_b < static_cast(s2.size()) && s1[index_a] == s2[index_b]){ index_a++; index_b++; } @@ -189,7 +105,6 @@ namespace ZAlgorithm{ } - void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) { std::vector s1_matches; prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches, 1); @@ -203,7 +118,7 @@ namespace ZAlgorithm{ } void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches) { - if (s2_start < 0 or s2_stop >= s2.size()) + if (s2_start < 0 or s2_stop >= static_cast(s2.size())) PrintMessageDieOnError("Invalid string indices provided to GetPrefixMatchCounts", ERROR); std::vector s1_matches; prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches, 1); @@ -211,7 +126,7 @@ namespace ZAlgorithm{ } void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches) { - if (s2_start < 0 or s2_stop >= s2.size()) + if (s2_start < 0 or s2_stop >= static_cast(s2.size())) PrintMessageDieOnError("Invalid string indices provided to GetSuffixMatchCounts", ERROR); std::vector s1_matches; suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches); diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h index 14ce501..777fbc8 100644 --- a/src/ZAlgorithm.h +++ b/src/ZAlgorithm.h @@ -23,8 +23,33 @@ along with lobSTR. If not, see . namespace ZAlgorithm{ + /* + * For each position in s2, calculates the length of the matching prefix of s1 and s2[i...] + * and stores it in num_matches[i]. The provided vector is cleared and sized appropriately. + * Runs in O(length_of_s1 + length_of_s2) + */ void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches); + + /* + * For each position in s2, calculates the length of the matching suffix of s1 and s2[0...i] + * and stores it in num_matches[i]. The provided vector is cleared and sized appropriately. + * Runs in O(length_of_s1 + length_of_s2) + */ void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches); + + /* + * For each position i in s2 in the range [s2_start, s2_stop], calculates the length of + * the matching prefix of s1 and s2[i...] and stores it in num_matches[i-s2_start]. + * The provided vector is cleared and sized appropriately. + * Runs in O(length_of_s1 + size_of_s2_range) + */ void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches); + + /* + * For each position i in s2 in the range [s2_start, s2_stop], calculates the length of + * the matching suffix of s1 and s2[0...i] and stores it in num_matches[i-s2_start]. + * The provided vector is cleared and sized appropriately. + * Runs in O(length_of_s1 + size_of_s2_range) + */ void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches); } From cb7eb3f6e02f136d7ac801223ddcff2c0139e93a Mon Sep 17 00:00:00 2001 From: Thomas Willems Date: Wed, 19 Mar 2014 17:16:18 -0400 Subject: [PATCH 13/13] Added documentation for FilterCounter class --- src/FilterCounter.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/FilterCounter.h b/src/FilterCounter.h index 218b395..60646fa 100644 --- a/src/FilterCounter.h +++ b/src/FilterCounter.h @@ -32,6 +32,8 @@ class FilterCounter { public: const static int NUM_FILTERS = 10; + + // Various filter types const static int NOT_UNIT = 0; const static int DIFF_FROM_REF = 1; const static int MAPPING_QUALITY = 2; @@ -45,10 +47,13 @@ class FilterCounter { FilterCounter(); + /* Increment the count for the provided filter type */ void increment(const int type); + /* Returns the name of the filter associated with the provided filter type */ std::string GetFilterType(const int type); + /* Returns the count associated with the provided filter type */ uint64_t GetFilterCount(const int type); ~FilterCounter();