diff --git a/src/AlignedRead.h b/src/AlignedRead.h index 87650d6..ab32091 100644 --- a/src/AlignedRead.h +++ b/src/AlignedRead.h @@ -22,9 +22,6 @@ along with lobSTR. If not, see . #include "src/api/BamAlignment.h" -using namespace std; -using BamTools::CigarOp; - /* Struct to keep track of aligned read used in allelotyping */ @@ -37,7 +34,7 @@ struct AlignedRead { int read_start; std::string nucleotides; std::string qualities; - vector cigar_ops; + std::vector cigar_ops; std::string repseq; int period; int diffFromRef; diff --git a/src/AlignmentFilters.cpp b/src/AlignmentFilters.cpp index a569aca..23cc72a 100644 --- a/src/AlignmentFilters.cpp +++ b/src/AlignmentFilters.cpp @@ -24,11 +24,11 @@ along with lobSTR. If not, see . #include "src/AlignmentFilters.h" #include "src/common.h" +#include "src/ZAlgorithm.h" using namespace std; namespace AlignmentFilters { - template int GetDistToIndel(CigarIterator iter, CigarIterator end){ // Process leading clipping ops if (iter != end && iter->Type == 'H') @@ -147,4 +147,87 @@ namespace AlignmentFilters { else return pair(head_match, match_run); } + + + /* + Stores the sequence, start and end position of the read after removing clipped bases + using the provided references + */ + void GetUnclippedInfo(AlignedRead* aln, string& bases, int& unclipped_start, int& unclipped_end){ + unclipped_start = aln->read_start; + unclipped_end = aln->read_start-1; + bool begin = true; + int start_index = 0, num_bases = 0; + for(vector::iterator cigar_iter = aln->cigar_ops.begin(); cigar_iter != aln->cigar_ops.end(); cigar_iter++){ + switch(cigar_iter->Type) { + case 'D': + unclipped_end += cigar_iter->Length; + begin = false; + break; + case 'H': + break; + case 'S': + if (begin) start_index += cigar_iter->Length; + break; + case 'M': + unclipped_end += cigar_iter->Length; + num_bases += cigar_iter->Length; + begin = false; + break; + case 'I': + num_bases += cigar_iter->Length; + begin = false; + break; + default: + PrintMessageDieOnError("Invalid CIGAR char " + cigar_iter->Type, ERROR); + break; + } + } + bases = aln->nucleotides.substr(start_index, num_bases); + } + + + bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){ + // Extract sequence, start and end coordinates of read after clipping + string bases; + int start, end; + GetUnclippedInfo(aln, bases, start, end); + + // Check that the prefix match is the longest + if (start >= ref_seq_start && start < ref_seq_start + static_cast(ref_seq.size())){ + int start_index = start - ref_seq_start; + int start = max(0, start_index - max_external); + int stop = min(static_cast((ref_seq.size()-1)), start_index + max_internal); + vector match_counts; + ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, start, stop, match_counts); + + int align_index = start_index - start; + int num_matches = match_counts[align_index]; + for (int i = 0; i < static_cast(match_counts.size()); i++){ + if (i == align_index) + continue; + if (match_counts[i] >= num_matches) + return false; + } + } + + // Check that the suffix match is the longest + if (end >= ref_seq_start && end < ref_seq_start + static_cast(ref_seq.size())){ + int end_index = end - ref_seq_start; + int start = max(0, end_index - max_internal); + int stop = min(static_cast(ref_seq.size()-1), end_index + max_external); + vector match_counts; + ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, start, stop, match_counts); + + int align_index = end_index - start; + int num_matches = match_counts[align_index]; + for (int i = 0; i < static_cast(match_counts.size()); i++){ + if (i == align_index) + continue; + if (match_counts[i] >= num_matches) + return false; + } + } + return true; + } } diff --git a/src/AlignmentFilters.h b/src/AlignmentFilters.h index e4c4d8a..e548ae2 100644 --- a/src/AlignmentFilters.h +++ b/src/AlignmentFilters.h @@ -19,28 +19,33 @@ along with lobSTR. If not, see . */ -#ifndef SRC_ALIGNMENTFILTERS_H -#define SRC_ALIGNMENTFILTERS_H +#ifndef SRC_ALIGNMENTFILTERS_H_ +#define SRC_ALIGNMENTFILTERS_H_ #include #include #include "src/AlignedRead.h" -using namespace std; - namespace AlignmentFilters { /* Returns the vector of CigarOps corresponding to the CIGAR string. */ - vector GetCigarOps(string cigar_string); + std::vector GetCigarOps(std::string cigar_string); /* Returns the CIGAR string corresponding to the vector of CigarOps. */ - string GetCigarString(vector& cigar_ops); + std::string GetCigarString(std::vector& cigar_ops); /* Length of perfect base matches at 5' and 3' end of read. */ - pair GetNumEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start); + std::pair GetNumEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start); /* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */ - pair GetEndDistToIndel(AlignedRead* aln); + std::pair GetEndDistToIndel(AlignedRead* aln); + + /* Returns true iff the alignment has: + 1) a maximal matching prefix compared to alignments that start [-max_upstream, max_downstream] from the 5' alignment position of the read + 2) a maximal matching suffix compared to alignments that end [-max_downstream, max_upstream] from the 3' alignment position of the read + Ignores clipped bases when performing these comparions + */ + bool HasLargestEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream); } #endif diff --git a/src/FilterCounter.cpp b/src/FilterCounter.cpp new file mode 100644 index 0000000..972a41a --- /dev/null +++ b/src/FilterCounter.cpp @@ -0,0 +1,76 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + +#include "src/common.h" +#include "src/FilterCounter.h" + +FilterCounter::FilterCounter(){ + counts = new uint64_t[NUM_FILTERS]; + for (int i = 0; i < NUM_FILTERS; i++) + counts[i] = 0; +} + +void FilterCounter::increment(const int type){ + if (type > NUM_FILTERS || type < 0) + PrintMessageDieOnError("Invalid filter type", ERROR); + counts[type]++; +} + +std::string FilterCounter::GetFilterType(const int type){ + switch(type){ + case NOT_UNIT: + return "NOT_UNIT"; + case DIFF_FROM_REF: + return "DIFF_FROM_REF"; + case MAPPING_QUALITY: + return "MAPPING_QUALITY"; + case MATE_DIST: + return "MATE_DIST"; + case ALLELE_SIZE: + return "ALLELE_SIZE"; + case SPANNING_AMOUNT: + return "SPANNING_AMOUNT"; + case NUM_END_MATCHES: + return "NUM_END_MATCHES"; + case NOT_MAXIMAL_END: + return "NOT_MAXIMAL_END"; + case BP_BEFORE_INDEL: + return "BP_BEFORE_INDEL"; + case UNFILTERED: + return "UNFILTERED"; + default: + PrintMessageDieOnError("Invalid filter type", ERROR); + } +} + +uint64_t FilterCounter::GetFilterCount(const int type){ + if (type > NUM_FILTERS || type < 0) + PrintMessageDieOnError("Invalid filter type", ERROR); + return counts[type]; +} + + +FilterCounter::~FilterCounter(){ + delete [] counts; +} + + + + diff --git a/src/FilterCounter.h b/src/FilterCounter.h new file mode 100644 index 0000000..60646fa --- /dev/null +++ b/src/FilterCounter.h @@ -0,0 +1,62 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + + +#ifndef SRC_FILTER_COUNTER_H_ +#define SRC_FILTER_COUNTER_H_ + +#include + +#include + +class FilterCounter { + private: + uint64_t* counts; + + public: + const static int NUM_FILTERS = 10; + + // Various filter types + const static int NOT_UNIT = 0; + const static int DIFF_FROM_REF = 1; + const static int MAPPING_QUALITY = 2; + const static int MATE_DIST = 3; + const static int ALLELE_SIZE = 4; + const static int SPANNING_AMOUNT = 5; + const static int NUM_END_MATCHES = 6; + const static int NOT_MAXIMAL_END = 7; + const static int BP_BEFORE_INDEL = 8; + const static int UNFILTERED = 9; + + FilterCounter(); + + /* Increment the count for the provided filter type */ + void increment(const int type); + + /* Returns the name of the filter associated with the provided filter type */ + std::string GetFilterType(const int type); + + /* Returns the count associated with the provided filter type */ + uint64_t GetFilterCount(const int type); + + ~FilterCounter(); +}; + +#endif diff --git a/src/Makefile.am b/src/Makefile.am index a22cd01..57b6c1c 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -121,6 +121,7 @@ liblobstr_a_SOURCES = \ FastqPairedFileReader.cpp FastqPairedFileReader.h \ FFT_four_nuc_vectors.cpp FFT_four_nuc_vectors.h \ FFT_nuc_vectors.cpp FFT_nuc_vectors.h \ + FilterCounter.cpp FilterCounter.h \ gzstream.cpp gzstream.h \ STRRecord.h \ HammingWindowGenerator.cpp HammingWindowGenerator.h \ @@ -186,6 +187,7 @@ liballelotype_a_SOURCES = \ FastqFileReader.cpp FastqFileReader.h \ FastaPairedFileReader.cpp FastaPairedFileReader.h \ FastqPairedFileReader.cpp FastqPairedFileReader.h \ + FilterCounter.cpp FilterCounter.h \ Genotyper.cpp Genotyper.h \ gzstream.cpp gzstream.h \ NoiseModel.cpp NoiseModel.h \ @@ -201,6 +203,7 @@ liballelotype_a_SOURCES = \ TextFileReader.cpp TextFileReader.h \ TextFileWriter.cpp TextFileWriter.h \ VCFWriter.cpp VCFWriter.h \ + ZAlgorithm.cpp ZAlgorithm.h \ ZippedFastaFileReader.cpp ZippedFastaFileReader.h \ ZippedFastqFileReader.cpp ZippedFastqFileReader.h \ ZippedTextFileReader.cpp ZippedTextFileReader.h \ @@ -245,6 +248,8 @@ lobSTRTests_SOURCES = \ tests/BWAReadAligner_test.cpp \ tests/common_test.h \ tests/common_test.cpp \ + tests/DNATools.h \ + tests/DNATools.cpp \ tests/NWNoRefEndPenalty_test.h \ tests/NWNoRefEndPenalty_test.cpp \ tests/ReadContainer_test.h \ @@ -253,6 +258,8 @@ lobSTRTests_SOURCES = \ tests/RemoveDuplicates_test.cpp \ tests/VCFWriter_test.h \ tests/VCFWriter_test.cpp \ + tests/ZAlgorithm_test.h \ + tests/ZAlgorithm_test.cpp \ AlignmentFilters.cpp \ BamFileReader.cpp \ BamPairedFileReader.cpp \ @@ -266,6 +273,7 @@ lobSTRTests_SOURCES = \ FastqPairedFileReader.cpp \ FFT_four_nuc_vectors.cpp \ FFT_nuc_vectors.cpp \ + FilterCounter.h FilterCounter.cpp \ gzstream.cpp \ HammingWindowGenerator.cpp \ MultithreadData.cpp \ @@ -281,6 +289,8 @@ lobSTRTests_SOURCES = \ ReadContainer.cpp \ RemoveDuplicates.cpp \ VCFWriter.cpp \ + ZAlgorithm.h \ + ZAlgorithm.cpp \ ZippedFastaFileReader.cpp \ ZippedFastqFileReader.cpp \ ZippedTextFileReader.cpp \ diff --git a/src/ReadContainer.cpp b/src/ReadContainer.cpp index 48f6435..c45c766 100644 --- a/src/ReadContainer.cpp +++ b/src/ReadContainer.cpp @@ -225,27 +225,36 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln, } // apply filters if (unit) { - if (aligned_read->diffFromRef % aligned_read->period != 0) return false; + if (aligned_read->diffFromRef % aligned_read->period != 0){ + filter_counter.increment(FilterCounter::NOT_UNIT); + return false; + } } if (abs(aligned_read->diffFromRef) > max_diff_ref) { + filter_counter.increment(FilterCounter::DIFF_FROM_REF); return false; } if (aligned_read->mapq > max_mapq) { + filter_counter.increment(FilterCounter::MAPPING_QUALITY); return false; } if (aligned_read->matedist > max_matedist) { + filter_counter.increment(FilterCounter::MATE_DIST); return false; } // Check if the allele length is valid if (aligned_read->diffFromRef + (aligned_read->refCopyNum*aligned_read->period) < MIN_ALLELE_SIZE) { + filter_counter.increment(FilterCounter::ALLELE_SIZE); return false; } // check that read sufficiently spans STR int max_read_start = aligned_read->msStart - min_border; int min_read_stop = aligned_read->msEnd + min_border; - if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop) - return false; + if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop){ + filter_counter.increment(FilterCounter::SPANNING_AMOUNT); + return false; + } // check that both ends of the read contain sufficient perfect matches if (min_read_end_match > 0){ @@ -254,19 +263,38 @@ bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln, PrintMessageDieOnError("No extended reference sequence found for locus", ERROR); string ref_ext_seq = loc_iter->second; pair num_end_matches = AlignmentFilters::GetNumEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend); - if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match) + if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match){ + filter_counter.increment(FilterCounter::NUM_END_MATCHES); + return false; + } + } + + // check that the prefix and suffix of the read match maximally compared to proximal reference locations + if (maximal_end_match_window > 0){ + map, string>::iterator loc_iter = ref_ext_nucleotides.find(pair(aligned_read->chrom, aligned_read->msStart)); + if (loc_iter == ref_ext_nucleotides.end()) + PrintMessageDieOnError("No extended reference sequence found for locus", ERROR); + string ref_ext_seq = loc_iter->second; + bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window); + if (!maximum_end_matches){ + filter_counter.increment(FilterCounter::NOT_MAXIMAL_END); return false; + } } // check that both ends of the aligned read have sufficient bases before the first indel if (min_bp_before_indel > 0){ pair num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read); - if (num_bps.first != -1 && num_bps.first < min_bp_before_indel) + if (num_bps.first != -1 && num_bps.first < min_bp_before_indel){ + filter_counter.increment(FilterCounter::BP_BEFORE_INDEL); return false; - if (num_bps.second != -1 && num_bps.second < min_bp_before_indel) + } + if (num_bps.second != -1 && num_bps.second < min_bp_before_indel){ + filter_counter.increment(FilterCounter::BP_BEFORE_INDEL); return false; + } } - + filter_counter.increment(FilterCounter::UNFILTERED); return true; } diff --git a/src/RemoveDuplicates.h b/src/RemoveDuplicates.h index 4560740..0517b6f 100644 --- a/src/RemoveDuplicates.h +++ b/src/RemoveDuplicates.h @@ -34,10 +34,10 @@ namespace RemoveDuplicates { void RemovePCRDuplicates(std::list* aligned_reads); /* Get representative read from group of duplicates, read with highest qual score */ - void GetRepRead(const list& aligned_reads, AlignedRead* rep_alignment); + void GetRepRead(const std::list& aligned_reads, AlignedRead* rep_alignment); /* Get quality score of a read */ - float GetScore(const string& quality_string); + float GetScore(const std::string& quality_string); } #endif // SRC_REMOVEDUPLICATES_H_ diff --git a/src/RunInfo.h b/src/RunInfo.h index 6f76e08..2dd4de1 100644 --- a/src/RunInfo.h +++ b/src/RunInfo.h @@ -21,10 +21,13 @@ along with lobSTR. If not, see . #ifndef SRC_RUNINFO_H_ #define SRC_RUNINFO_H_ +#include #include #include #include +#include "src/FilterCounter.h" + /* RunInfo stores information about the run that will be uploaded to Amazon for further analysis */ @@ -87,7 +90,7 @@ class RunInfo { } // Print to string - std::string PrintToString(int program) { + std::string PrintToString(int program, FilterCounter& filter_counter) { std::stringstream ss; ss << "Starttime: " << starttime << std::endl; ss << "Endtime: " << endtime << std::endl; @@ -119,7 +122,6 @@ class RunInfo { ss << "Num calls >=5x\t" << num_calls5x.at(i) << std::endl; ss << "Mean coverage\t" << static_cast(total_coverage.at(i))/static_cast(num_calls.at(i)) << std::endl; ss << "Mean perc. agree\t" << static_cast(total_agree.at(i))/static_cast(num_calls.at(i)) << std::endl; - ss << std::endl; } ss << "Call type by period (0/0, 0/1, 1/1, 1/2)" << std::endl; for (size_t i = 0; i < calltype_by_period.size(); i++) { @@ -130,6 +132,11 @@ class RunInfo { } ss << std::endl; } + + ss << "Read filter stats:" << std::endl; + for (int i = 0; i < FilterCounter::NUM_FILTERS; i++) + ss << "\t" << std::setw(25) << filter_counter.GetFilterType(i) << ":\t" << filter_counter.GetFilterCount(i) << std::endl; + ss << std::endl; } return ss.str(); diff --git a/src/ZAlgorithm.cpp b/src/ZAlgorithm.cpp new file mode 100644 index 0000000..9ec1a9b --- /dev/null +++ b/src/ZAlgorithm.cpp @@ -0,0 +1,136 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + +#include +#include + +#include + +#include "src/common.h" +#include "src/ZAlgorithm.h" + + +namespace ZAlgorithm{ + void suffix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right, + std::vector& s1_matches, std::vector& num_matches){ + num_matches = std::vector(s2_right - s2_left + 1, -1); + int leftmost = s2_right+1, right_index = s2_right+1; + for (int i = s2_right; i >= s2_left; i--){ + if (i <= leftmost){ + int index_a = s1.size()-1, index_b = i; + while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){ + index_a--; + index_b--; + } + num_matches[i-s2_left] = i - index_b; + if (index_b < i){ + right_index = i; + leftmost = index_b + 1; + } + } + else { + int twin = i - right_index + s1.size()-1; + int new_left = i - s1_matches[twin] + 1; + if (new_left > leftmost) + num_matches[i-s2_left] = s1_matches[twin]; + else if (new_left < leftmost) + num_matches[i-s2_left] = i-leftmost+1; + else { + int index_a = s1.size()-2-i+leftmost, index_b = leftmost-1; + while (index_a >= 0 && index_b >= 0 && s1[index_a] == s2[index_b]){ + index_a--; + index_b--; + } + num_matches[i-s2_left] = i-index_b; + right_index = i; + leftmost = index_b + 1; + } + } + } + } + + void prefix_helper(const std::string& s1, const std::string& s2, int s2_left, int s2_right, + std::vector& s1_matches, std::vector& num_matches, int offset){ + num_matches = std::vector(s2_right-s2_left+1+offset, -1); + int rightmost = -1, left_index = -1; + for (int i = s2_left; i <= s2_right; i++){ + if (i >= rightmost){ + int index_a = 0, index_b = i; + while (index_a < static_cast(s1.size()) && index_b < static_cast(s2.size()) && s1[index_a] == s2[index_b]){ + index_a++; + index_b++; + } + num_matches[i-s2_left+offset] = index_b - i; + if (index_b > i){ + left_index = i; + rightmost = index_b - 1; + } + } + else { + int twin = i - left_index; + int new_right = i + s1_matches[twin] - 1; + if (new_right < rightmost) + num_matches[i-s2_left+offset] = s1_matches[twin]; + else if (new_right > rightmost) + num_matches[i-s2_left+offset] = rightmost-i+1; + else { + int index_a = rightmost+1-i, index_b = rightmost+1; + while (index_a < static_cast(s1.size()) && index_b < static_cast(s2.size()) && s1[index_a] == s2[index_b]){ + index_a++; + index_b++; + } + num_matches[i-s2_left+offset] = index_b - i; + left_index = i; + rightmost = index_b - 1; + } + } + } + } + + + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) { + std::vector s1_matches; + prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches, 1); + prefix_helper(s1, s2, 0, s2.size()-1, s1_matches, num_matches, 0); + } + + void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches) { + std::vector s1_matches; + suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches); + suffix_helper(s1, s2, 0, s2.size()-1, s1_matches, num_matches); + } + + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches) { + if (s2_start < 0 or s2_stop >= static_cast(s2.size())) + PrintMessageDieOnError("Invalid string indices provided to GetPrefixMatchCounts", ERROR); + std::vector s1_matches; + prefix_helper(s1, s1, 1, s1.size()-1, s1_matches, s1_matches, 1); + prefix_helper(s1, s2, s2_start, s2_stop, s1_matches, num_matches, 0); + } + + void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches) { + if (s2_start < 0 or s2_stop >= static_cast(s2.size())) + PrintMessageDieOnError("Invalid string indices provided to GetSuffixMatchCounts", ERROR); + std::vector s1_matches; + suffix_helper(s1, s1, 0, s1.size()-2, s1_matches, s1_matches); + suffix_helper(s1, s2, s2_start, s2_stop, s1_matches, num_matches); + } +} + diff --git a/src/ZAlgorithm.h b/src/ZAlgorithm.h new file mode 100644 index 0000000..777fbc8 --- /dev/null +++ b/src/ZAlgorithm.h @@ -0,0 +1,55 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + +#include +#include + + +namespace ZAlgorithm{ + /* + * For each position in s2, calculates the length of the matching prefix of s1 and s2[i...] + * and stores it in num_matches[i]. The provided vector is cleared and sized appropriately. + * Runs in O(length_of_s1 + length_of_s2) + */ + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches); + + /* + * For each position in s2, calculates the length of the matching suffix of s1 and s2[0...i] + * and stores it in num_matches[i]. The provided vector is cleared and sized appropriately. + * Runs in O(length_of_s1 + length_of_s2) + */ + void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, std::vector& num_matches); + + /* + * For each position i in s2 in the range [s2_start, s2_stop], calculates the length of + * the matching prefix of s1 and s2[i...] and stores it in num_matches[i-s2_start]. + * The provided vector is cleared and sized appropriately. + * Runs in O(length_of_s1 + size_of_s2_range) + */ + void GetPrefixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches); + + /* + * For each position i in s2 in the range [s2_start, s2_stop], calculates the length of + * the matching suffix of s1 and s2[0...i] and stores it in num_matches[i-s2_start]. + * The provided vector is cleared and sized appropriately. + * Runs in O(length_of_s1 + size_of_s2_range) + */ + void GetSuffixMatchCounts(const std::string& s1, const std::string& s2, int s2_start, int s2_stop, std::vector& num_matches); +} diff --git a/src/common.cpp b/src/common.cpp index c723ef1..f6676af 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -84,7 +84,7 @@ void OutputRunStatistics() { PrintMessageDieOnError("Outputting run statistics", PROGRESS); TextFileWriter sWriter(output_prefix + (program == LOBSTR? ".aligned.stats":".allelotype.stats")); // Output run statistics to stats file - string stats_string = run_info.PrintToString((program == LOBSTR ? 0: 1)); + string stats_string = run_info.PrintToString((program == LOBSTR ? 0: 1), filter_counter); sWriter.Write(stats_string); // Upload to AWS S3 if (!noweb) { diff --git a/src/main_allelotype.cpp b/src/main_allelotype.cpp index 73dd6c6..5d4d3cc 100644 --- a/src/main_allelotype.cpp +++ b/src/main_allelotype.cpp @@ -123,6 +123,8 @@ void show_help() { " from either end of the read.\n" "--min-read-end-match : Filter reads whose alignments don't exactly match the reference for at least\n"\ " bp at both ends. \n" + "--maximal-end-match : Filter reads whose prefix/suffix matches to reference are <= those \n" + " obtained when shifting the read ends by distances within bp\n" "--exclude-partial: Do not report any information about partially\n" \ " spanning reads.\n\n" "Additional options\n" \ @@ -151,6 +153,7 @@ void parse_commandline_options(int argc, char* argv[]) { OPT_MAX_DIFF_REF, OPT_MAXMAPQ, OPT_MAXMATEDIST, + OPT_MAXIMAL_END_MATCH_WINDOW, OPT_MIN_BORDER, OPT_MIN_BP_BEFORE_INDEL, OPT_MIN_HET_FREQ, @@ -184,6 +187,7 @@ void parse_commandline_options(int argc, char* argv[]) { {"max-diff-ref", 1, 0, OPT_MAX_DIFF_REF}, {"mapq", 1, 0, OPT_MAXMAPQ}, {"max-matedist", 1, 0, OPT_MAXMATEDIST}, + {"maximal-end-match", 1, 0, OPT_MAXIMAL_END_MATCH_WINDOW}, {"min-border", 1, 0, OPT_MIN_BORDER}, {"min-bp-before-indel", 1, 0, OPT_MIN_BP_BEFORE_INDEL}, {"min-het-freq", 1, 0, OPT_MIN_HET_FREQ}, @@ -258,6 +262,10 @@ void parse_commandline_options(int argc, char* argv[]) { max_diff_ref = atoi(optarg); AddOption("max-diff-ref", string(optarg), true, &user_defined_arguments_allelotyper); break; + case OPT_MAXIMAL_END_MATCH_WINDOW: + maximal_end_match_window = atoi(optarg); + AddOption("maximal-end-match", string(optarg), true, &user_defined_arguments_allelotyper); + break; case OPT_MAXMAPQ: max_mapq = atoi(optarg); AddOption("mapq", string(optarg), true, &user_defined_arguments_allelotyper); diff --git a/src/runtime_parameters.cpp b/src/runtime_parameters.cpp index cde4cf9..9f020ee 100644 --- a/src/runtime_parameters.cpp +++ b/src/runtime_parameters.cpp @@ -23,6 +23,7 @@ along with lobSTR. If not, see . #include "src/runtime_parameters.h" RunInfo run_info; +FilterCounter filter_counter; int MIN_PERIOD=1; int MAX_PERIOD=6; @@ -115,6 +116,7 @@ bool include_gl = false; bool print_reads = false; float min_het_freq = 0; int max_matedist = 100000; +int maximal_end_match_window = 0; int min_border = 0; int min_bp_before_indel = 0; int min_read_end_match = 0; diff --git a/src/runtime_parameters.h b/src/runtime_parameters.h index c0618a5..01853a1 100644 --- a/src/runtime_parameters.h +++ b/src/runtime_parameters.h @@ -25,6 +25,7 @@ along with lobSTR. If not, see . #include #include "src/RunInfo.h" +#include "src/FilterCounter.h" // period range for genotyper extern int MIN_PERIOD; @@ -33,6 +34,9 @@ extern int MAX_PERIOD; // global variables that probably shouldn't be global variables extern RunInfo run_info; +// when allelotyping, tracks why filtered reads were omitted +extern FilterCounter filter_counter; + // stores user arguments for lobSTR alignment // used to store the bam header for allelotyping step extern std::string user_defined_arguments; @@ -140,6 +144,7 @@ extern int min_border; extern int min_bp_before_indel; extern int min_read_end_match; extern int max_matedist; +extern int maximal_end_match_window; // debug extern bool profile; diff --git a/src/tests/AlignmentFilters_test.cpp b/src/tests/AlignmentFilters_test.cpp index d4fa929..3384497 100644 --- a/src/tests/AlignmentFilters_test.cpp +++ b/src/tests/AlignmentFilters_test.cpp @@ -128,3 +128,48 @@ void AlignmentFiltersTest::test_GetNumEndMatches(){ } +void AlignmentFiltersTest::test_HasLargestEndMatches(){ + AlignedRead aln; + string ref_seq; + int ref_start; + + // Simple case with perfect maximal match + aln.cigar_ops = GetCigarOps("9M"); + aln.read_start = 102; + aln.nucleotides = "ACACATACAC"; + ref_seq = "ACACACATACACACACAC"; + ref_start = 100; + CPPUNIT_ASSERT(AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); + + // Perfect maximal match and soft clipping + aln.cigar_ops = GetCigarOps("2S9M2S"); + aln.read_start = 102; + aln.nucleotides = "NNACACATACACNN"; + ref_seq = "ACACACATACACACACAC"; + ref_start = 100; + CPPUNIT_ASSERT(AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); + + // Perfect non-maximal match + aln.cigar_ops = GetCigarOps("8M"); + aln.read_start = 102; + aln.nucleotides = "ACACACAC"; + ref_seq = "ACACACACACACACAC"; + ref_start = 100; + CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); + + // Maximal prefix but not maximal suffix + aln.cigar_ops = GetCigarOps("9M"); + aln.read_start = 102; + aln.nucleotides = "ACATTACAC"; + ref_seq = "ACACATCACACACTACAC"; + ref_start = 100; + CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); + + // Maximal suffix but not maximal prefix + aln.cigar_ops = GetCigarOps("9M"); + aln.read_start = 102; + aln.nucleotides = "ACACTACAC"; + ref_seq = "ACACATTACACACACAC"; + ref_start = 100; + CPPUNIT_ASSERT(!AlignmentFilters::HasLargestEndMatches(&aln, ref_seq, ref_start, 10, 10)); +} diff --git a/src/tests/AlignmentFilters_test.h b/src/tests/AlignmentFilters_test.h index 5e4f5d6..bae2181 100644 --- a/src/tests/AlignmentFilters_test.h +++ b/src/tests/AlignmentFilters_test.h @@ -30,6 +30,7 @@ public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(AlignmentFiltersTest); CPPUNIT_TEST(test_GetDistToIndel); CPPUNIT_TEST(test_GetNumEndMatches); + CPPUNIT_TEST(test_HasLargestEndMatches); CPPUNIT_TEST_SUITE_END(); public: @@ -37,6 +38,7 @@ public CppUnit::TestFixture { void tearDown(); void test_GetDistToIndel(); void test_GetNumEndMatches(); + void test_HasLargestEndMatches(); private: }; diff --git a/src/tests/DNATools.cpp b/src/tests/DNATools.cpp new file mode 100644 index 0000000..73ec7eb --- /dev/null +++ b/src/tests/DNATools.cpp @@ -0,0 +1,31 @@ +#include +#include + +#include + +#include "src/tests/DNATools.h" + +namespace DNATools { + char GetChar(int index){ + switch(index){ + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + return 'N'; + } + } + + std::string RandDNA(const int length){ + std::stringstream seq; + for (int j = 0; j < length; j++) + seq << GetChar(rand()%4); + return seq.str(); + } +} + diff --git a/src/tests/DNATools.h b/src/tests/DNATools.h new file mode 100644 index 0000000..0540313 --- /dev/null +++ b/src/tests/DNATools.h @@ -0,0 +1,9 @@ +#ifndef DNATOOLS_H +#define DNATOOLS_H + +#include + +namespace DNATools { + std::string RandDNA(const int length); +} +#endif // DNATOOLS_H diff --git a/src/tests/NWNoRefEndPenalty_test.cpp b/src/tests/NWNoRefEndPenalty_test.cpp index bf60dec..0dc1038 100644 --- a/src/tests/NWNoRefEndPenalty_test.cpp +++ b/src/tests/NWNoRefEndPenalty_test.cpp @@ -25,36 +25,15 @@ along with lobSTR. If not, see . #include #include -#include "src/tests/NWNoRefEndPenalty_test.h" #include "src/common.h" - -// Registers the fixture into the 'registry' -CPPUNIT_TEST_SUITE_REGISTRATION(NWNoRefEndPenaltyTest); +#include "src/tests/DNATools.h" +#include "src/tests/NWNoRefEndPenalty_test.h" using namespace std; +// Registers the fixture into the 'registry' +CPPUNIT_TEST_SUITE_REGISTRATION(NWNoRefEndPenaltyTest); -char GetChar(int index){ - switch(index){ - case 0: - return 'A'; - case 1: - return 'C'; - case 2: - return 'G'; - case 3: - return 'T'; - default: - return 'N'; - } -} - -string RandDNA(const int length){ - stringstream seq; - for (int j = 0; j < length; j++) - seq << GetChar(rand()%4); - return seq.str(); -} void NWNoRefEndPenaltyTest::setUp() {} void NWNoRefEndPenaltyTest::tearDown() {} @@ -64,7 +43,7 @@ int NWNoRefEndPenaltyTest::GenAlignments(int num_trials, double mut_prob, double int num_correct = 0; for (int i = 0; i < num_trials; i++){ // Create random reference sequence - string ref_seq = RandDNA(REF_LEN); + string ref_seq = DNATools::RandDNA(REF_LEN); // Choose the read region int read_start = rand()%(ref_seq.size()-READ_LEN+1); @@ -75,10 +54,10 @@ int NWNoRefEndPenaltyTest::GenAlignments(int num_trials, double mut_prob, double for (unsigned int i = PERFECT_FLANK; i < read_seq.size()-PERFECT_FLANK; i++){ float prob = static_cast (rand()) / static_cast (RAND_MAX); if (prob <= mut_prob) - mut_read_seq_ss << RandDNA(1); + mut_read_seq_ss << DNATools::RandDNA(1); else if (prob - mut_prob <= ins_prob){ int ins_size = rand()%(MAX_INS-1) + 1; - mut_read_seq_ss << RandDNA(ins_size); + mut_read_seq_ss << DNATools::RandDNA(ins_size); } else if (prob - mut_prob - ins_prob <= del_prob){ int del_size = rand()%(MAX_DEL-1) + 1; diff --git a/src/tests/NWNoRefEndPenalty_test.h b/src/tests/NWNoRefEndPenalty_test.h index 967dffa..37cdb14 100644 --- a/src/tests/NWNoRefEndPenalty_test.h +++ b/src/tests/NWNoRefEndPenalty_test.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2011 Melissa Gymrek +Copyright (C) 2014 Thomas Willems This file is part of lobSTR. diff --git a/src/tests/ZAlgorithm_test.cpp b/src/tests/ZAlgorithm_test.cpp new file mode 100644 index 0000000..6ab4372 --- /dev/null +++ b/src/tests/ZAlgorithm_test.cpp @@ -0,0 +1,77 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + +#include +#include + +#include + +#include "src/tests/DNATools.h" +#include "src/tests/ZAlgorithm_test.h" +#include "src/common.h" +#include "src/ZAlgorithm.h" + + +using namespace std; + +void ZAlgorithmTest::setUp(){} +void ZAlgorithmTest::tearDown(){} + +void ZAlgorithmTest::test_Prefix(){ + vector num_matches; + string s1, s2; + int s2_start, s2_stop; + for (int i = 0; i < NUM_TRIALS; i++){ + s1 = DNATools::RandDNA(1+rand()%(LENGTH-1)); + s2 = DNATools::RandDNA(1+rand()%(LENGTH-1)); + s2_start = rand()%s2.size(); + s2_stop = s2_start + (rand()%(s2.size()-s2_start)); + ZAlgorithm::GetPrefixMatchCounts(s1, s2, s2_start, s2_stop, num_matches); + for (int j = s2_start; j <= s2_stop; j++){ + bool match = (s1.substr(0, num_matches[j-s2_start]).compare(s2.substr(j, num_matches[j-s2_start])) == 0); + bool longer = (s1.size() > num_matches[j-s2_start] && s2.size() > (j + num_matches[j-s2_start]) && s2[j+num_matches[j-s2_start]] == s1[num_matches[j-s2_start]]); + CPPUNIT_ASSERT_MESSAGE("ZAlgorithm prefix error", !(!match || (match && longer))); + } + } +} + +void ZAlgorithmTest::test_Suffix(){ + vector num_matches; + string s1, s2; + int s2_start, s2_stop; + for (int i = 0; i < NUM_TRIALS; i++){ + s1 = DNATools::RandDNA(1+rand()%(LENGTH-1)); + s2 = DNATools::RandDNA(1+rand()%(LENGTH-1)); + s2_start = rand()%s2.size(); + s2_stop = s2_start + (rand()%(s2.size()-s2_start)); + ZAlgorithm::GetSuffixMatchCounts(s1, s2, s2_start, s2_stop, num_matches); + for (int j = s2_start; j <= s2_stop; j++){ + string sub_1 = s1.substr(s1.size()-num_matches[j-s2_start], num_matches[j-s2_start]); + string sub_2 = s2.substr(j-num_matches[j-s2_start]+1, num_matches[j-s2_start]); + bool match = (sub_1.compare(sub_2) == 0); + bool longer = ((j-num_matches[j-s2_start]) > 0 && (s1.size()-num_matches[j-s2_start]) > 0 && s1[s1.size()-num_matches[j-s2_start]-1] == s2[j-num_matches[j-s2_start]]); + CPPUNIT_ASSERT_MESSAGE("ZAlgorithm suffix error", !(!match || (match && longer))); + } + } +} + + + + diff --git a/src/tests/ZAlgorithm_test.h b/src/tests/ZAlgorithm_test.h new file mode 100644 index 0000000..fc776f7 --- /dev/null +++ b/src/tests/ZAlgorithm_test.h @@ -0,0 +1,46 @@ +/* +Copyright (C) 2014 Thomas Willems + +This file is part of lobSTR. + +lobSTR is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +lobSTR is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with lobSTR. If not, see . + +*/ + +#ifndef SRC_TESTS_ZALGORITHM_H__ +#define SRC_TESTS_ZALGORITHM_H__ + +#include + +#include "src/ZAlgorithm.h" + +class ZAlgorithmTest : +public CppUnit::TestFixture { + CPPUNIT_TEST_SUITE(ZAlgorithmTest); + CPPUNIT_TEST(test_Prefix); + CPPUNIT_TEST(test_Suffix); + CPPUNIT_TEST_SUITE_END(); + + public: + const static int NUM_TRIALS=1000; + const static int LENGTH=100; + void setUp(); + void tearDown(); + void test_Prefix(); + void test_Suffix(); + + private: +}; + +#endif // SRC_TESTS_ZALGORITHM_H_ diff --git a/src/tests/main_test.cpp b/src/tests/main_test.cpp index 1026a95..4cc5bc6 100644 --- a/src/tests/main_test.cpp +++ b/src/tests/main_test.cpp @@ -27,6 +27,7 @@ along with lobSTR. If not, see . #include "src/tests/ReadContainer_test.h" #include "src/tests/RemoveDuplicates_test.h" #include "src/tests/VCFWriter_test.h" +#include "src/tests/ZAlgorithm_test.h" int main( int argc, char **argv) { // Adds the test to the list of tests to run @@ -38,6 +39,7 @@ int main( int argc, char **argv) { runner.addTest(ReadContainerTest::suite()); runner.addTest(RemoveDuplicatesTest::suite()); runner.addTest(VCFWriterTest::suite()); + runner.addTest(ZAlgorithmTest::suite()); // Run the tests bool wasSucessful = runner.run();