diff --git a/src/AlignedRead.h b/src/AlignedRead.h
index 87650d6..ab32091 100644
--- a/src/AlignedRead.h
+++ b/src/AlignedRead.h
@@ -22,9 +22,6 @@ along with lobSTR. If not, see .
#include "src/api/BamAlignment.h"
-using namespace std;
-using BamTools::CigarOp;
-
/*
Struct to keep track of aligned read used in allelotyping
*/
@@ -37,7 +34,7 @@ struct AlignedRead {
int read_start;
std::string nucleotides;
std::string qualities;
- vector cigar_ops;
+ std::vector cigar_ops;
std::string repseq;
int period;
int diffFromRef;
diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp
index a569aca..23cc72a 100644
--- a/src/AlignmentFilters.cpp
+++ b/src/AlignmentFilters.cpp
@@ -24,11 +24,11 @@ along with lobSTR. If not, see .
#include "src/AlignmentFilters.h"
#include "src/common.h"
+#include "src/ZAlgorithm.h"
using namespace std;
namespace AlignmentFilters {
-
template int GetDistToIndel(CigarIterator iter, CigarIterator end){
// Process leading clipping ops
if (iter != end && iter->Type == 'H')
@@ -147,4 +147,87 @@ namespace AlignmentFilters {
else
return pair(head_match, match_run);
}
+
+
+ /*
+ Stores the sequence, start and end position of the read after removing clipped bases
+ using the provided references
+ */
+ void GetUnclippedInfo(AlignedRead* aln, string& bases, int& unclipped_start, int& unclipped_end){
+ unclipped_start = aln->read_start;
+ unclipped_end = aln->read_start-1;
+ bool begin = true;
+ int start_index = 0, num_bases = 0;
+ for(vector::iterator cigar_iter = aln->cigar_ops.begin(); cigar_iter != aln->cigar_ops.end(); cigar_iter++){
+ switch(cigar_iter->Type) {
+ case 'D':
+ unclipped_end += cigar_iter->Length;
+ begin = false;
+ break;
+ case 'H':
+ break;
+ case 'S':
+ if (begin) start_index += cigar_iter->Length;
+ break;
+ case 'M':
+ unclipped_end += cigar_iter->Length;
+ num_bases += cigar_iter->Length;
+ begin = false;
+ break;
+ case 'I':
+ num_bases += cigar_iter->Length;
+ begin = false;
+ break;
+ default:
+ PrintMessageDieOnError("Invalid CIGAR char " + cigar_iter->Type, ERROR);
+ break;
+ }
+ }
+ bases = aln->nucleotides.substr(start_index, num_bases);
+ }
+
+
+ bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){
+ // Extract sequence, start and end coordinates of read after clipping
+ string bases;
+ int start, end;
+ GetUnclippedInfo(aln, bases, start, end);
+
+ // Check that the prefix match is the longest
+ if (start >= ref_seq_start && start < ref_seq_start + static_cast(ref_seq.size())){
+ int start_index = start - ref_seq_start;
+ int start = max(0, start_index - max_external);
+ int stop = min(static_cast((ref_seq.size()-1)), start_index + max_internal);
+ vector match_counts;
+ ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, start, stop, match_counts);
+
+ int align_index = start_index - start;
+ int num_matches = match_counts[align_index];
+ for (int i = 0; i < static_cast(match_counts.size()); i++){
+ if (i == align_index)
+ continue;
+ if (match_counts[i] >= num_matches)
+ return false;
+ }
+ }
+
+ // Check that the suffix match is the longest
+ if (end >= ref_seq_start && end < ref_seq_start + static_cast(ref_seq.size())){
+ int end_index = end - ref_seq_start;
+ int start = max(0, end_index - max_internal);
+ int stop = min(static_cast(ref_seq.size()-1), end_index + max_external);
+ vector match_counts;
+ ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, start, stop, match_counts);
+
+ int align_index = end_index - start;
+ int num_matches = match_counts[align_index];
+ for (int i = 0; i < static_cast(match_counts.size()); i++){
+ if (i == align_index)
+ continue;
+ if (match_counts[i] >= num_matches)
+ return false;
+ }
+ }
+ return true;
+ }
}
diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h
index e4c4d8a..e548ae2 100644
--- a/src/AlignmentFilters.h
+++ b/src/AlignmentFilters.h
@@ -19,28 +19,33 @@ along with lobSTR. If not, see .
*/
-#ifndef SRC_ALIGNMENTFILTERS_H
-#define SRC_ALIGNMENTFILTERS_H
+#ifndef SRC_ALIGNMENTFILTERS_H_
+#define SRC_ALIGNMENTFILTERS_H_
#include
#include
#include "src/AlignedRead.h"
-using namespace std;
-
namespace AlignmentFilters {
/* Returns the vector of CigarOps corresponding to the CIGAR string. */
- vector GetCigarOps(string cigar_string);
+ std::vector GetCigarOps(std::string cigar_string);
/* Returns the CIGAR string corresponding to the vector of CigarOps. */
- string GetCigarString(vector& cigar_ops);
+ std::string GetCigarString(std::vector& cigar_ops);
/* Length of perfect base matches at 5' and 3' end of read. */
- pair GetNumEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start);
+ std::pair GetNumEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start);
/* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */
- pair GetEndDistToIndel(AlignedRead* aln);
+ std::pair GetEndDistToIndel(AlignedRead* aln);
+
+ /* Returns true iff the alignment has:
+ 1) a maximal matching prefix compared to alignments that start [-max_upstream, max_downstream] from the 5' alignment position of the read
+ 2) a maximal matching suffix compared to alignments that end [-max_downstream, max_upstream] from the 3' alignment position of the read
+ Ignores clipped bases when performing these comparions
+ */
+ bool HasLargestEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream);
}
#endif
diff --git a/src/FilterCounter.cpp b/src/FilterCounter.cpp
new file mode 100644
index 0000000..972a41a
--- /dev/null
+++ b/src/FilterCounter.cpp
@@ -0,0 +1,76 @@
+/*
+Copyright (C) 2014 Thomas Willems
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR. If not, see .
+
+*/
+
+#include "src/common.h"
+#include "src/FilterCounter.h"
+
+FilterCounter::FilterCounter(){
+ counts = new uint64_t[NUM_FILTERS];
+ for (int i = 0; i < NUM_FILTERS; i++)
+ counts[i] = 0;
+}
+
+void FilterCounter::increment(const int type){
+ if (type > NUM_FILTERS || type < 0)
+ PrintMessageDieOnError("Invalid filter type", ERROR);
+ counts[type]++;
+}
+
+std::string FilterCounter::GetFilterType(const int type){
+ switch(type){
+ case NOT_UNIT:
+ return "NOT_UNIT";
+ case DIFF_FROM_REF:
+ return "DIFF_FROM_REF";
+ case MAPPING_QUALITY:
+ return "MAPPING_QUALITY";
+ case MATE_DIST:
+ return "MATE_DIST";
+ case ALLELE_SIZE:
+ return "ALLELE_SIZE";
+ case SPANNING_AMOUNT:
+ return "SPANNING_AMOUNT";
+ case NUM_END_MATCHES:
+ return "NUM_END_MATCHES";
+ case NOT_MAXIMAL_END:
+ return "NOT_MAXIMAL_END";
+ case BP_BEFORE_INDEL:
+ return "BP_BEFORE_INDEL";
+ case UNFILTERED:
+ return "UNFILTERED";
+ default:
+ PrintMessageDieOnError("Invalid filter type", ERROR);
+ }
+}
+
+uint64_t FilterCounter::GetFilterCount(const int type){
+ if (type > NUM_FILTERS || type < 0)
+ PrintMessageDieOnError("Invalid filter type", ERROR);
+ return counts[type];
+}
+
+
+FilterCounter::~FilterCounter(){
+ delete [] counts;
+}
+
+
+
+
diff --git a/src/FilterCounter.h b/src/FilterCounter.h
new file mode 100644
index 0000000..60646fa
--- /dev/null
+++ b/src/FilterCounter.h
@@ -0,0 +1,62 @@
+/*
+Copyright (C) 2014 Thomas Willems
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR. If not, see .
+
+*/
+
+
+#ifndef SRC_FILTER_COUNTER_H_
+#define SRC_FILTER_COUNTER_H_
+
+#include
+
+#include
+
+class FilterCounter {
+ private:
+ uint64_t* counts;
+
+ public:
+ const static int NUM_FILTERS = 10;
+
+ // Various filter types
+ const static int NOT_UNIT = 0;
+ const static int DIFF_FROM_REF = 1;
+ const static int MAPPING_QUALITY = 2;
+ const static int MATE_DIST = 3;
+ const static int ALLELE_SIZE = 4;
+ const static int SPANNING_AMOUNT = 5;
+ const static int NUM_END_MATCHES = 6;
+ const static int NOT_MAXIMAL_END = 7;
+ const static int BP_BEFORE_INDEL = 8;
+ const static int UNFILTERED = 9;
+
+ FilterCounter();
+
+ /* Increment the count for the provided filter type */
+ void increment(const int type);
+
+ /* Returns the name of the filter associated with the provided filter type */
+ std::string GetFilterType(const int type);
+
+ /* Returns the count associated with the provided filter type */
+ uint64_t GetFilterCount(const int type);
+
+ ~FilterCounter();
+};
+
+#endif
diff --git a/src/Makefile.am b/src/Makefile.am
index a22cd01..57b6c1c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -121,6 +121,7 @@ liblobstr_a_SOURCES = \
FastqPairedFileReader.cpp FastqPairedFileReader.h \
FFT_four_nuc_vectors.cpp FFT_four_nuc_vectors.h \
FFT_nuc_vectors.cpp FFT_nuc_vectors.h \
+ FilterCounter.cpp FilterCounter.h \
gzstream.cpp gzstream.h \
STRRecord.h \
HammingWindowGenerator.cpp HammingWindowGenerator.h \
@@ -186,6 +187,7 @@ liballelotype_a_SOURCES = \
FastqFileReader.cpp FastqFileReader.h \
FastaPairedFileReader.cpp FastaPairedFileReader.h \
FastqPairedFileReader.cpp FastqPairedFileReader.h \
+ FilterCounter.cpp FilterCounter.h \
Genotyper.cpp Genotyper.h \
gzstream.cpp gzstream.h \
NoiseModel.cpp NoiseModel.h \
@@ -201,6 +203,7 @@ liballelotype_a_SOURCES = \
TextFileReader.cpp TextFileReader.h \
TextFileWriter.cpp TextFileWriter.h \
VCFWriter.cpp VCFWriter.h \
+ ZAlgorithm.cpp ZAlgorithm.h \
ZippedFastaFileReader.cpp ZippedFastaFileReader.h \
ZippedFastqFileReader.cpp ZippedFastqFileReader.h \
ZippedTextFileReader.cpp ZippedTextFileReader.h \
@@ -245,6 +248,8 @@ lobSTRTests_SOURCES = \
tests/BWAReadAligner_test.cpp \
tests/common_test.h \
tests/common_test.cpp \
+ tests/DNATools.h \
+ tests/DNATools.cpp \
tests/NWNoRefEndPenalty_test.h \
tests/NWNoRefEndPenalty_test.cpp \
tests/ReadContainer_test.h \
@@ -253,6 +258,8 @@ lobSTRTests_SOURCES = \
tests/RemoveDuplicates_test.cpp \
tests/VCFWriter_test.h \
tests/VCFWriter_test.cpp \
+ tests/ZAlgorithm_test.h \
+ tests/ZAlgorithm_test.cpp \
AlignmentFilters.cpp \
BamFileReader.cpp \
BamPairedFileReader.cpp \
@@ -266,6 +273,7 @@ lobSTRTests_SOURCES = \
FastqPairedFileReader.cpp \
FFT_four_nuc_vectors.cpp \
FFT_nuc_vectors.cpp \
+ FilterCounter.h FilterCounter.cpp \
gzstream.cpp \
HammingWindowGenerator.cpp \
MultithreadData.cpp \
@@ -281,6 +289,8 @@ lobSTRTests_SOURCES = \
ReadContainer.cpp \
RemoveDuplicates.cpp \
VCFWriter.cpp \
+ ZAlgorithm.h \
+ ZAlgorithm.cpp \
ZippedFastaFileReader.cpp \
ZippedFastqFileReader.cpp \
ZippedTextFileReader.cpp \
diff --git a/src/ReadContainer.cpp b/src/ReadContainer.cpp
index 48f6435..c45c766 100644
--- a/src/ReadContainer.cpp
+++ b/src/ReadContainer.cpp
@@ -225,27 +225,36 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
}
// apply filters
if (unit) {
- if (aligned_read->diffFromRef % aligned_read->period != 0) return false;
+ if (aligned_read->diffFromRef % aligned_read->period != 0){
+ filter_counter.increment(FilterCounter::NOT_UNIT);
+ return false;
+ }
}
if (abs(aligned_read->diffFromRef) > max_diff_ref) {
+ filter_counter.increment(FilterCounter::DIFF_FROM_REF);
return false;
}
if (aligned_read->mapq > max_mapq) {
+ filter_counter.increment(FilterCounter::MAPPING_QUALITY);
return false;
}
if (aligned_read->matedist > max_matedist) {
+ filter_counter.increment(FilterCounter::MATE_DIST);
return false;
}
// Check if the allele length is valid
if (aligned_read->diffFromRef + (aligned_read->refCopyNum*aligned_read->period) < MIN_ALLELE_SIZE) {
+ filter_counter.increment(FilterCounter::ALLELE_SIZE);
return false;
}
// check that read sufficiently spans STR
int max_read_start = aligned_read->msStart - min_border;
int min_read_stop = aligned_read->msEnd + min_border;
- if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop)
- return false;
+ if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop){
+ filter_counter.increment(FilterCounter::SPANNING_AMOUNT);
+ return false;
+ }
// check that both ends of the read contain sufficient perfect matches
if (min_read_end_match > 0){
@@ -254,19 +263,38 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
string ref_ext_seq = loc_iter->second;
pair num_end_matches = AlignmentFilters::GetNumEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend);
- if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match)
+ if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match){
+ filter_counter.increment(FilterCounter::NUM_END_MATCHES);
+ return false;
+ }
+ }
+
+ // check that the prefix and suffix of the read match maximally compared to proximal reference locations
+ if (maximal_end_match_window > 0){
+ map, string>::iterator loc_iter = ref_ext_nucleotides.find(pair(aligned_read->chrom, aligned_read->msStart));
+ if (loc_iter == ref_ext_nucleotides.end())
+ PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
+ string ref_ext_seq = loc_iter->second;
+ bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window);
+ if (!maximum_end_matches){
+ filter_counter.increment(FilterCounter::NOT_MAXIMAL_END);
return false;
+ }
}
// check that both ends of the aligned read have sufficient bases before the first indel
if (min_bp_before_indel > 0){
pair num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read);
- if (num_bps.first != -1 && num_bps.first < min_bp_before_indel)
+ if (num_bps.first != -1 && num_bps.first < min_bp_before_indel){
+ filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
return false;
- if (num_bps.second != -1 && num_bps.second < min_bp_before_indel)
+ }
+ if (num_bps.second != -1 && num_bps.second < min_bp_before_indel){
+ filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
return false;
+ }
}
-
+ filter_counter.increment(FilterCounter::UNFILTERED);
return true;
}
diff --git a/src/RemoveDuplicates.h b/src/RemoveDuplicates.h
index 4560740..0517b6f 100644
--- a/src/RemoveDuplicates.h
+++ b/src/RemoveDuplicates.h
@@ -34,10 +34,10 @@ namespace RemoveDuplicates {
void RemovePCRDuplicates(std::list* aligned_reads);
/* Get representative read from group of duplicates, read with highest qual score */
- void GetRepRead(const list& aligned_reads, AlignedRead* rep_alignment);
+ void GetRepRead(const std::list& aligned_reads, AlignedRead* rep_alignment);
/* Get quality score of a read */
- float GetScore(const string& quality_string);
+ float GetScore(const std::string& quality_string);
}
#endif // SRC_REMOVEDUPLICATES_H_
diff --git a/src/RunInfo.h b/src/RunInfo.h
index 6f76e08..2dd4de1 100644
--- a/src/RunInfo.h
+++ b/src/RunInfo.h
@@ -21,10 +21,13 @@ along with lobSTR. If not, see .
#ifndef SRC_RUNINFO_H_
#define SRC_RUNINFO_H_
+#include
#include
#include
#include
+#include "src/FilterCounter.h"
+
/*
RunInfo stores information about the run that will be uploaded to Amazon for further analysis
*/
@@ -87,7 +90,7 @@ class RunInfo {
}
// Print to string
- std::string PrintToString(int program) {
+ std::string PrintToString(int program, FilterCounter& filter_counter) {
std::stringstream ss;
ss << "Starttime: " << starttime << std::endl;
ss << "Endtime: " << endtime << std::endl;
@@ -119,7 +122,6 @@ class RunInfo {
ss << "Num calls >=5x\t" << num_calls5x.at(i) << std::endl;
ss << "Mean coverage\t" << static_cast(total_coverage.at(i))/static_cast(num_calls.at(i)) << std::endl;
ss << "Mean perc. agree\t" << static_cast(total_agree.at(i))/static_cast(num_calls.at(i)) << std::endl;
- ss << std::endl;
}
ss << "Call type by period (0/0, 0/1, 1/1, 1/2)" << std::endl;
for (size_t i = 0; i < calltype_by_period.size(); i++) {
@@ -130,6 +132,11 @@ class RunInfo {
}
ss << std::endl;
}
+
+ ss << "Read filter stats:" << std::endl;
+ for (int i = 0; i < FilterCounter::NUM_FILTERS; i++)
+ ss << "\t" << std::setw(25) << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl;
+ ss << std::endl;
}
return ss.str();
diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp
new file mode 100644
index 0000000..9ec1a9b
--- /dev/null
+++ b/src/ZAlgorithm.cpp
@@ -0,0 +1,136 @@
+/*
+Copyright (C) 2014 Thomas Willems
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR. If not, see .
+
+*/
+
+#include
+#include
+
+#include
+
+#include "src/common.h"
+#include "src/ZAlgorithm.h"
+
+
+namespace ZAlgorithm{
+ void suffix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right,
+ std::vector& s1_matches, std::vector& num_matches){
+ num_matches = std::vector(s2_right - s2_left + 1, -1);
+ int leftmost = s2_right+1, right_index = s2_right+1;
+ for (int i = s2_right; i >= s2_left; i--){
+ if (i <= leftmost){
+ int index_a = s1.size()-1, index_b = i;
+ while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){
+ index_a--;
+ index_b--;
+ }
+ num_matches[i-s2_left] = i - index_b;
+ if (index_b < i){
+ right_index = i;
+ leftmost = index_b + 1;
+ }
+ }
+ else {
+ int twin = i - right_index + s1.size()-1;
+ int new_left = i - s1_matches[twin] + 1;
+ if (new_left > leftmost)
+ num_matches[i-s2_left] = s1_matches[twin];
+ else if (new_left < leftmost)
+ num_matches[i-s2_left] = i-leftmost+1;
+ else {
+ int index_a = s1.size()-2-i+leftmost, index_b = leftmost-1;
+ while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){
+ index_a--;
+ index_b--;
+ }
+ num_matches[i-s2_left] = i-index_b;
+ right_index = i;
+ leftmost = index_b + 1;
+ }
+ }
+ }
+ }
+
+ void prefix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right,
+ std::vector& s1_matches, std::vector& num_matches, int offset){
+ num_matches = std::vector(s2_right-s2_left+1+offset, -1);
+ int rightmost = -1, left_index = -1;
+ for (int i = s2_left; i <= s2_right; i++){
+ if (i >= rightmost){
+ int index_a = 0, index_b = i;
+ while (index_a < static_cast(s1.size()) && index_b < static_cast(s2.size()) && s1[index_a] == s2[index_b]){
+ index_a++;
+ index_b++;
+ }
+ num_matches[i-s2_left+offset] = index_b - i;
+ if (index_b > i){
+ left_index = i;
+ rightmost = index_b - 1;
+ }
+ }
+ else {
+ int twin = i - left_index;
+ int new_right = i + s1_matches[twin] - 1;
+ if (new_right < rightmost)
+ num_matches[i-s2_left+offset] = s1_matches[twin];
+ else if (new_right > rightmost)
+ num_matches[i-s2_left+offset] = rightmost-i+1;
+ else {
+ int index_a = rightmost+1-i, index_b = rightmost+1;
+ while (index_a < static_cast(s1.size()) && index_b < static_cast(s2.size()) && s1[index_a] == s2[index_b]){
+ index_a++;
+ index_b++;
+ }
+ num_matches[i-s2_left+offset] = index_b - i;
+ left_index = i;
+ rightmost = index_b - 1;
+ }
+ }
+ }
+ }
+
+
+ void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) {
+ std::vector s1_matches;
+ prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches, 1);
+ prefix_helper(s1, s2, 0, s2.size()-1, s1_matches, num_matches, 0);
+ }
+
+ void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) {
+ std::vector s1_matches;
+ suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches);
+ suffix_helper(s1, s2, 0, s2.size()-1, s1_matches, num_matches);
+ }
+
+ void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches) {
+ if (s2_start < 0 or s2_stop >= static_cast(s2.size()))
+ PrintMessageDieOnError("Invalid string indices provided to GetPrefixMatchCounts", ERROR);
+ std::vector s1_matches;
+ prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches, 1);
+ prefix_helper(s1, s2, s2_start, s2_stop, s1_matches, num_matches, 0);
+ }
+
+ void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches) {
+ if (s2_start < 0 or s2_stop >= static_cast(s2.size()))
+ PrintMessageDieOnError("Invalid string indices provided to GetSuffixMatchCounts", ERROR);
+ std::vector s1_matches;
+ suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches);
+ suffix_helper(s1, s2, s2_start, s2_stop, s1_matches, num_matches);
+ }
+}
+
diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h
new file mode 100644
index 0000000..777fbc8
--- /dev/null
+++ b/src/ZAlgorithm.h
@@ -0,0 +1,55 @@
+/*
+Copyright (C) 2014 Thomas Willems
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR. If not, see .
+
+*/
+
+#include
+#include
+
+
+namespace ZAlgorithm{
+ /*
+ * For each position in s2, calculates the length of the matching prefix of s1 and s2[i...]
+ * and stores it in num_matches[i]. The provided vector is cleared and sized appropriately.
+ * Runs in O(length_of_s1 + length_of_s2)
+ */
+ void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches);
+
+ /*
+ * For each position in s2, calculates the length of the matching suffix of s1 and s2[0...i]
+ * and stores it in num_matches[i]. The provided vector is cleared and sized appropriately.
+ * Runs in O(length_of_s1 + length_of_s2)
+ */
+ void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches);
+
+ /*
+ * For each position i in s2 in the range [s2_start, s2_stop], calculates the length of
+ * the matching prefix of s1 and s2[i...] and stores it in num_matches[i-s2_start].
+ * The provided vector is cleared and sized appropriately.
+ * Runs in O(length_of_s1 + size_of_s2_range)
+ */
+ void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches);
+
+ /*
+ * For each position i in s2 in the range [s2_start, s2_stop], calculates the length of
+ * the matching suffix of s1 and s2[0...i] and stores it in num_matches[i-s2_start].
+ * The provided vector is cleared and sized appropriately.
+ * Runs in O(length_of_s1 + size_of_s2_range)
+ */
+ void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches);
+}
diff --git a/src/common.cpp b/src/common.cpp
index c723ef1..f6676af 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -84,7 +84,7 @@ void OutputRunStatistics() {
PrintMessageDieOnError("Outputting run statistics", PROGRESS);
TextFileWriter sWriter(output_prefix + (program == LOBSTR? ".aligned.stats":".allelotype.stats"));
// Output run statistics to stats file
- string stats_string = run_info.PrintToString((program == LOBSTR ? 0: 1));
+ string stats_string = run_info.PrintToString((program == LOBSTR ? 0: 1), filter_counter);
sWriter.Write(stats_string);
// Upload to AWS S3
if (!noweb) {
diff --git a/src/main_allelotype.cpp b/src/main_allelotype.cpp
index 73dd6c6..5d4d3cc 100644
--- a/src/main_allelotype.cpp
+++ b/src/main_allelotype.cpp
@@ -123,6 +123,8 @@ void show_help() {
" from either end of the read.\n"
"--min-read-end-match : Filter reads whose alignments don't exactly match the reference for at least\n"\
" bp at both ends. \n"
+ "--maximal-end-match : Filter reads whose prefix/suffix matches to reference are <= those \n"
+ " obtained when shifting the read ends by distances within bp\n"
"--exclude-partial: Do not report any information about partially\n" \
" spanning reads.\n\n"
"Additional options\n" \
@@ -151,6 +153,7 @@ void parse_commandline_options(int argc, char* argv[]) {
OPT_MAX_DIFF_REF,
OPT_MAXMAPQ,
OPT_MAXMATEDIST,
+ OPT_MAXIMAL_END_MATCH_WINDOW,
OPT_MIN_BORDER,
OPT_MIN_BP_BEFORE_INDEL,
OPT_MIN_HET_FREQ,
@@ -184,6 +187,7 @@ void parse_commandline_options(int argc, char* argv[]) {
{"max-diff-ref", 1, 0, OPT_MAX_DIFF_REF},
{"mapq", 1, 0, OPT_MAXMAPQ},
{"max-matedist", 1, 0, OPT_MAXMATEDIST},
+ {"maximal-end-match", 1, 0, OPT_MAXIMAL_END_MATCH_WINDOW},
{"min-border", 1, 0, OPT_MIN_BORDER},
{"min-bp-before-indel", 1, 0, OPT_MIN_BP_BEFORE_INDEL},
{"min-het-freq", 1, 0, OPT_MIN_HET_FREQ},
@@ -258,6 +262,10 @@ void parse_commandline_options(int argc, char* argv[]) {
max_diff_ref = atoi(optarg);
AddOption("max-diff-ref", string(optarg), true, &user_defined_arguments_allelotyper);
break;
+ case OPT_MAXIMAL_END_MATCH_WINDOW:
+ maximal_end_match_window = atoi(optarg);
+ AddOption("maximal-end-match", string(optarg), true, &user_defined_arguments_allelotyper);
+ break;
case OPT_MAXMAPQ:
max_mapq = atoi(optarg);
AddOption("mapq", string(optarg), true, &user_defined_arguments_allelotyper);
diff --git a/src/runtime_parameters.cpp b/src/runtime_parameters.cpp
index cde4cf9..9f020ee 100644
--- a/src/runtime_parameters.cpp
+++ b/src/runtime_parameters.cpp
@@ -23,6 +23,7 @@ along with lobSTR. If not, see .
#include "src/runtime_parameters.h"
RunInfo run_info;
+FilterCounter filter_counter;
int MIN_PERIOD=1;
int MAX_PERIOD=6;
@@ -115,6 +116,7 @@ bool include_gl = false;
bool print_reads = false;
float min_het_freq = 0;
int max_matedist = 100000;
+int maximal_end_match_window = 0;
int min_border = 0;
int min_bp_before_indel = 0;
int min_read_end_match = 0;
diff --git a/src/runtime_parameters.h b/src/runtime_parameters.h
index c0618a5..01853a1 100644
--- a/src/runtime_parameters.h
+++ b/src/runtime_parameters.h
@@ -25,6 +25,7 @@ along with lobSTR. If not, see .
#include
#include "src/RunInfo.h"
+#include "src/FilterCounter.h"
// period range for genotyper
extern int MIN_PERIOD;
@@ -33,6 +34,9 @@ extern int MAX_PERIOD;
// global variables that probably shouldn't be global variables
extern RunInfo run_info;
+// when allelotyping, tracks why filtered reads were omitted
+extern FilterCounter filter_counter;
+
// stores user arguments for lobSTR alignment
// used to store the bam header for allelotyping step
extern std::string user_defined_arguments;
@@ -140,6 +144,7 @@ extern int min_border;
extern int min_bp_before_indel;
extern int min_read_end_match;
extern int max_matedist;
+extern int maximal_end_match_window;
// debug
extern bool profile;
diff --git a/src/tests/AlignmentFilters_test.cpp b/src/tests/AlignmentFilters_test.cpp
index d4fa929..3384497 100644
--- a/src/tests/AlignmentFilters_test.cpp
+++ b/src/tests/AlignmentFilters_test.cpp
@@ -128,3 +128,48 @@ void AlignmentFiltersTest::test_GetNumEndMatches(){
}
+void AlignmentFiltersTest::test_HasLargestEndMatches(){
+ AlignedRead aln;
+ string ref_seq;
+ int ref_start;
+
+ // Simple case with perfect maximal match
+ aln.cigar_ops = GetCigarOps("9M");
+ aln.read_start = 102;
+ aln.nucleotides = "ACACATACAC";
+ ref_seq = "ACACACATACACACACAC";
+ ref_start = 100;
+ CPPUNIT_ASSERT(AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+
+ // Perfect maximal match and soft clipping
+ aln.cigar_ops = GetCigarOps("2S9M2S");
+ aln.read_start = 102;
+ aln.nucleotides = "NNACACATACACNN";
+ ref_seq = "ACACACATACACACACAC";
+ ref_start = 100;
+ CPPUNIT_ASSERT(AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+
+ // Perfect non-maximal match
+ aln.cigar_ops = GetCigarOps("8M");
+ aln.read_start = 102;
+ aln.nucleotides = "ACACACAC";
+ ref_seq = "ACACACACACACACAC";
+ ref_start = 100;
+ CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+
+ // Maximal prefix but not maximal suffix
+ aln.cigar_ops = GetCigarOps("9M");
+ aln.read_start = 102;
+ aln.nucleotides = "ACATTACAC";
+ ref_seq = "ACACATCACACACTACAC";
+ ref_start = 100;
+ CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+
+ // Maximal suffix but not maximal prefix
+ aln.cigar_ops = GetCigarOps("9M");
+ aln.read_start = 102;
+ aln.nucleotides = "ACACTACAC";
+ ref_seq = "ACACATTACACACACAC";
+ ref_start = 100;
+ CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10));
+}
diff --git a/src/tests/AlignmentFilters_test.h b/src/tests/AlignmentFilters_test.h
index 5e4f5d6..bae2181 100644
--- a/src/tests/AlignmentFilters_test.h
+++ b/src/tests/AlignmentFilters_test.h
@@ -30,6 +30,7 @@ public CppUnit::TestFixture {
CPPUNIT_TEST_SUITE(AlignmentFiltersTest);
CPPUNIT_TEST(test_GetDistToIndel);
CPPUNIT_TEST(test_GetNumEndMatches);
+ CPPUNIT_TEST(test_HasLargestEndMatches);
CPPUNIT_TEST_SUITE_END();
public:
@@ -37,6 +38,7 @@ public CppUnit::TestFixture {
void tearDown();
void test_GetDistToIndel();
void test_GetNumEndMatches();
+ void test_HasLargestEndMatches();
private:
};
diff --git a/src/tests/DNATools.cpp b/src/tests/DNATools.cpp
new file mode 100644
index 0000000..73ec7eb
--- /dev/null
+++ b/src/tests/DNATools.cpp
@@ -0,0 +1,31 @@
+#include
+#include
+
+#include
+
+#include "src/tests/DNATools.h"
+
+namespace DNATools {
+ char GetChar(int index){
+ switch(index){
+ case 0:
+ return 'A';
+ case 1:
+ return 'C';
+ case 2:
+ return 'G';
+ case 3:
+ return 'T';
+ default:
+ return 'N';
+ }
+ }
+
+ std::string RandDNA(const int length){
+ std::stringstream seq;
+ for (int j = 0; j < length; j++)
+ seq << GetChar(rand()%4);
+ return seq.str();
+ }
+}
+
diff --git a/src/tests/DNATools.h b/src/tests/DNATools.h
new file mode 100644
index 0000000..0540313
--- /dev/null
+++ b/src/tests/DNATools.h
@@ -0,0 +1,9 @@
+#ifndef DNATOOLS_H
+#define DNATOOLS_H
+
+#include
+
+namespace DNATools {
+ std::string RandDNA(const int length);
+}
+#endif // DNATOOLS_H
diff --git a/src/tests/NWNoRefEndPenalty_test.cpp b/src/tests/NWNoRefEndPenalty_test.cpp
index bf60dec..0dc1038 100644
--- a/src/tests/NWNoRefEndPenalty_test.cpp
+++ b/src/tests/NWNoRefEndPenalty_test.cpp
@@ -25,36 +25,15 @@ along with lobSTR. If not, see .
#include
#include
-#include "src/tests/NWNoRefEndPenalty_test.h"
#include "src/common.h"
-
-// Registers the fixture into the 'registry'
-CPPUNIT_TEST_SUITE_REGISTRATION(NWNoRefEndPenaltyTest);
+#include "src/tests/DNATools.h"
+#include "src/tests/NWNoRefEndPenalty_test.h"
using namespace std;
+// Registers the fixture into the 'registry'
+CPPUNIT_TEST_SUITE_REGISTRATION(NWNoRefEndPenaltyTest);
-char GetChar(int index){
- switch(index){
- case 0:
- return 'A';
- case 1:
- return 'C';
- case 2:
- return 'G';
- case 3:
- return 'T';
- default:
- return 'N';
- }
-}
-
-string RandDNA(const int length){
- stringstream seq;
- for (int j = 0; j < length; j++)
- seq << GetChar(rand()%4);
- return seq.str();
-}
void NWNoRefEndPenaltyTest::setUp() {}
void NWNoRefEndPenaltyTest::tearDown() {}
@@ -64,7 +43,7 @@ int NWNoRefEndPenaltyTest::GenAlignments(int num_trials, double mut_prob, double
int num_correct = 0;
for (int i = 0; i < num_trials; i++){
// Create random reference sequence
- string ref_seq = RandDNA(REF_LEN);
+ string ref_seq = DNATools::RandDNA(REF_LEN);
// Choose the read region
int read_start = rand()%(ref_seq.size()-READ_LEN+1);
@@ -75,10 +54,10 @@ int NWNoRefEndPenaltyTest::GenAlignments(int num_trials, double mut_prob, double
for (unsigned int i = PERFECT_FLANK; i < read_seq.size()-PERFECT_FLANK; i++){
float prob = static_cast (rand()) / static_cast (RAND_MAX);
if (prob <= mut_prob)
- mut_read_seq_ss << RandDNA(1);
+ mut_read_seq_ss << DNATools::RandDNA(1);
else if (prob - mut_prob <= ins_prob){
int ins_size = rand()%(MAX_INS-1) + 1;
- mut_read_seq_ss << RandDNA(ins_size);
+ mut_read_seq_ss << DNATools::RandDNA(ins_size);
}
else if (prob - mut_prob - ins_prob <= del_prob){
int del_size = rand()%(MAX_DEL-1) + 1;
diff --git a/src/tests/NWNoRefEndPenalty_test.h b/src/tests/NWNoRefEndPenalty_test.h
index 967dffa..37cdb14 100644
--- a/src/tests/NWNoRefEndPenalty_test.h
+++ b/src/tests/NWNoRefEndPenalty_test.h
@@ -1,5 +1,5 @@
/*
-Copyright (C) 2011 Melissa Gymrek
+Copyright (C) 2014 Thomas Willems
This file is part of lobSTR.
diff --git a/src/tests/ZAlgorithm_test.cpp b/src/tests/ZAlgorithm_test.cpp
new file mode 100644
index 0000000..6ab4372
--- /dev/null
+++ b/src/tests/ZAlgorithm_test.cpp
@@ -0,0 +1,77 @@
+/*
+Copyright (C) 2014 Thomas Willems
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR. If not, see .
+
+*/
+
+#include
+#include
+
+#include
+
+#include "src/tests/DNATools.h"
+#include "src/tests/ZAlgorithm_test.h"
+#include "src/common.h"
+#include "src/ZAlgorithm.h"
+
+
+using namespace std;
+
+void ZAlgorithmTest::setUp(){}
+void ZAlgorithmTest::tearDown(){}
+
+void ZAlgorithmTest::test_Prefix(){
+ vector num_matches;
+ string s1, s2;
+ int s2_start, s2_stop;
+ for (int i = 0; i < NUM_TRIALS; i++){
+ s1 = DNATools::RandDNA(1+rand()%(LENGTH-1));
+ s2 = DNATools::RandDNA(1+rand()%(LENGTH-1));
+ s2_start = rand()%s2.size();
+ s2_stop = s2_start + (rand()%(s2.size()-s2_start));
+ ZAlgorithm::GetPrefixMatchCounts(s1, s2, s2_start, s2_stop, num_matches);
+ for (int j = s2_start; j <= s2_stop; j++){
+ bool match = (s1.substr(0, num_matches[j-s2_start]).compare(s2.substr(j, num_matches[j-s2_start])) == 0);
+ bool longer = (s1.size() > num_matches[j-s2_start] && s2.size() > (j + num_matches[j-s2_start]) && s2[j+num_matches[j-s2_start]] == s1[num_matches[j-s2_start]]);
+ CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer)));
+ }
+ }
+}
+
+void ZAlgorithmTest::test_Suffix(){
+ vector num_matches;
+ string s1, s2;
+ int s2_start, s2_stop;
+ for (int i = 0; i < NUM_TRIALS; i++){
+ s1 = DNATools::RandDNA(1+rand()%(LENGTH-1));
+ s2 = DNATools::RandDNA(1+rand()%(LENGTH-1));
+ s2_start = rand()%s2.size();
+ s2_stop = s2_start + (rand()%(s2.size()-s2_start));
+ ZAlgorithm::GetSuffixMatchCounts(s1, s2, s2_start, s2_stop, num_matches);
+ for (int j = s2_start; j <= s2_stop; j++){
+ string sub_1 = s1.substr(s1.size()-num_matches[j-s2_start], num_matches[j-s2_start]);
+ string sub_2 = s2.substr(j-num_matches[j-s2_start]+1, num_matches[j-s2_start]);
+ bool match = (sub_1.compare(sub_2) == 0);
+ bool longer = ((j-num_matches[j-s2_start]) > 0 && (s1.size()-num_matches[j-s2_start]) > 0 && s1[s1.size()-num_matches[j-s2_start]-1] == s2[j-num_matches[j-s2_start]]);
+ CPPUNIT_ASSERT_MESSAGE("ZAlgorithm suffix error", !(!match || (match && longer)));
+ }
+ }
+}
+
+
+
+
diff --git a/src/tests/ZAlgorithm_test.h b/src/tests/ZAlgorithm_test.h
new file mode 100644
index 0000000..fc776f7
--- /dev/null
+++ b/src/tests/ZAlgorithm_test.h
@@ -0,0 +1,46 @@
+/*
+Copyright (C) 2014 Thomas Willems
+
+This file is part of lobSTR.
+
+lobSTR is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+lobSTR is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with lobSTR. If not, see .
+
+*/
+
+#ifndef SRC_TESTS_ZALGORITHM_H__
+#define SRC_TESTS_ZALGORITHM_H__
+
+#include
+
+#include "src/ZAlgorithm.h"
+
+class ZAlgorithmTest :
+public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(ZAlgorithmTest);
+ CPPUNIT_TEST(test_Prefix);
+ CPPUNIT_TEST(test_Suffix);
+ CPPUNIT_TEST_SUITE_END();
+
+ public:
+ const static int NUM_TRIALS=1000;
+ const static int LENGTH=100;
+ void setUp();
+ void tearDown();
+ void test_Prefix();
+ void test_Suffix();
+
+ private:
+};
+
+#endif // SRC_TESTS_ZALGORITHM_H_
diff --git a/src/tests/main_test.cpp b/src/tests/main_test.cpp
index 1026a95..4cc5bc6 100644
--- a/src/tests/main_test.cpp
+++ b/src/tests/main_test.cpp
@@ -27,6 +27,7 @@ along with lobSTR. If not, see .
#include "src/tests/ReadContainer_test.h"
#include "src/tests/RemoveDuplicates_test.h"
#include "src/tests/VCFWriter_test.h"
+#include "src/tests/ZAlgorithm_test.h"
int main( int argc, char **argv) {
// Adds the test to the list of tests to run
@@ -38,6 +39,7 @@ int main( int argc, char **argv) {
runner.addTest(ReadContainerTest::suite());
runner.addTest(RemoveDuplicatesTest::suite());
runner.addTest(VCFWriterTest::suite());
+ runner.addTest(ZAlgorithmTest::suite());
// Run the tests
bool wasSucessful = runner.run();