Skip to content

Commit

Permalink
Merge pull request #31 from tfwillems/master
Browse files Browse the repository at this point in the history
New Alignment Filter
  • Loading branch information
Melissa Gymrek committed Mar 19, 2014
2 parents 29cfb0c + cb7eb3f commit cb24c2d
Show file tree
Hide file tree
Showing 24 changed files with 719 additions and 54 deletions.
5 changes: 1 addition & 4 deletions src/AlignedRead.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ along with lobSTR. If not, see <http://www.gnu.org/licenses/>.

#include "src/api/BamAlignment.h"

using namespace std;
using BamTools::CigarOp;

/*
Struct to keep track of aligned read used in allelotyping
*/
Expand All @@ -37,7 +34,7 @@ struct AlignedRead {
int read_start;
std::string nucleotides;
std::string qualities;
vector<BamTools::CigarOp> cigar_ops;
std::vector<BamTools::CigarOp> cigar_ops;
std::string repseq;
int period;
int diffFromRef;
Expand Down
85 changes: 84 additions & 1 deletion src/AlignmentFilters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ along with lobSTR. If not, see <http://www.gnu.org/licenses/>.

#include "src/AlignmentFilters.h"
#include "src/common.h"
#include "src/ZAlgorithm.h"

using namespace std;

namespace AlignmentFilters {

template<typename CigarIterator> int GetDistToIndel(CigarIterator iter, CigarIterator end){
// Process leading clipping ops
if (iter != end && iter->Type == 'H')
Expand Down Expand Up @@ -147,4 +147,87 @@ namespace AlignmentFilters {
else
return pair<int,int>(head_match, match_run);
}


/*
Stores the sequence, start and end position of the read after removing clipped bases
using the provided references
*/
void GetUnclippedInfo(AlignedRead* aln, string& bases, int& unclipped_start, int& unclipped_end){
unclipped_start = aln->read_start;
unclipped_end = aln->read_start-1;
bool begin = true;
int start_index = 0, num_bases = 0;
for(vector<BamTools::CigarOp>::iterator cigar_iter = aln->cigar_ops.begin(); cigar_iter != aln->cigar_ops.end(); cigar_iter++){
switch(cigar_iter->Type) {
case 'D':
unclipped_end += cigar_iter->Length;
begin = false;
break;
case 'H':
break;
case 'S':
if (begin) start_index += cigar_iter->Length;
break;
case 'M':
unclipped_end += cigar_iter->Length;
num_bases += cigar_iter->Length;
begin = false;
break;
case 'I':
num_bases += cigar_iter->Length;
begin = false;
break;
default:
PrintMessageDieOnError("Invalid CIGAR char " + cigar_iter->Type, ERROR);
break;
}
}
bases = aln->nucleotides.substr(start_index, num_bases);
}


bool HasLargestEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start, int max_external, int max_internal){
// Extract sequence, start and end coordinates of read after clipping
string bases;
int start, end;
GetUnclippedInfo(aln, bases, start, end);

// Check that the prefix match is the longest
if (start >= ref_seq_start && start < ref_seq_start + static_cast<int>(ref_seq.size())){
int start_index = start - ref_seq_start;
int start = max(0, start_index - max_external);
int stop = min(static_cast<int>((ref_seq.size()-1)), start_index + max_internal);
vector<int> match_counts;
ZAlgorithm::GetPrefixMatchCounts(bases, ref_seq, start, stop, match_counts);

int align_index = start_index - start;
int num_matches = match_counts[align_index];
for (int i = 0; i < static_cast<int>(match_counts.size()); i++){
if (i == align_index)
continue;
if (match_counts[i] >= num_matches)
return false;
}
}

// Check that the suffix match is the longest
if (end >= ref_seq_start && end < ref_seq_start + static_cast<int>(ref_seq.size())){
int end_index = end - ref_seq_start;
int start = max(0, end_index - max_internal);
int stop = min(static_cast<int>(ref_seq.size()-1), end_index + max_external);
vector<int> match_counts;
ZAlgorithm::GetSuffixMatchCounts(bases, ref_seq, start, stop, match_counts);

int align_index = end_index - start;
int num_matches = match_counts[align_index];
for (int i = 0; i < static_cast<int>(match_counts.size()); i++){
if (i == align_index)
continue;
if (match_counts[i] >= num_matches)
return false;
}
}
return true;
}
}
21 changes: 13 additions & 8 deletions src/AlignmentFilters.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,33 @@ along with lobSTR. If not, see <http://www.gnu.org/licenses/>.
*/


#ifndef SRC_ALIGNMENTFILTERS_H
#define SRC_ALIGNMENTFILTERS_H
#ifndef SRC_ALIGNMENTFILTERS_H_
#define SRC_ALIGNMENTFILTERS_H_

#include <string>
#include <vector>

#include "src/AlignedRead.h"

using namespace std;

namespace AlignmentFilters {
/* Returns the vector of CigarOps corresponding to the CIGAR string. */
vector<BamTools::CigarOp> GetCigarOps(string cigar_string);
std::vector<BamTools::CigarOp> GetCigarOps(std::string cigar_string);

/* Returns the CIGAR string corresponding to the vector of CigarOps. */
string GetCigarString(vector<BamTools::CigarOp>& cigar_ops);
std::string GetCigarString(std::vector<BamTools::CigarOp>& cigar_ops);

/* Length of perfect base matches at 5' and 3' end of read. */
pair<int,int> GetNumEndMatches(AlignedRead* aln, const string& ref_seq, int ref_seq_start);
std::pair<int,int> GetNumEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start);

/* Minimum distances from 5' and 3' end of reads to first indel. If no such indel exists, returns (-1,-1). */
pair<int,int> GetEndDistToIndel(AlignedRead* aln);
std::pair<int,int> GetEndDistToIndel(AlignedRead* aln);

/* Returns true iff the alignment has:
1) a maximal matching prefix compared to alignments that start [-max_upstream, max_downstream] from the 5' alignment position of the read
2) a maximal matching suffix compared to alignments that end [-max_downstream, max_upstream] from the 3' alignment position of the read
Ignores clipped bases when performing these comparions
*/
bool HasLargestEndMatches(AlignedRead* aln, const std::string& ref_seq, int ref_seq_start, int max_upstream, int max_downstream);
}

#endif
76 changes: 76 additions & 0 deletions src/FilterCounter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
Copyright (C) 2014 Thomas Willems <[email protected]>
This file is part of lobSTR.
lobSTR is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
lobSTR is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with lobSTR. If not, see <http://www.gnu.org/licenses/>.
*/

#include "src/common.h"
#include "src/FilterCounter.h"

FilterCounter::FilterCounter(){
counts = new uint64_t[NUM_FILTERS];
for (int i = 0; i < NUM_FILTERS; i++)
counts[i] = 0;
}

void FilterCounter::increment(const int type){
if (type > NUM_FILTERS || type < 0)
PrintMessageDieOnError("Invalid filter type", ERROR);
counts[type]++;
}

std::string FilterCounter::GetFilterType(const int type){
switch(type){
case NOT_UNIT:
return "NOT_UNIT";
case DIFF_FROM_REF:
return "DIFF_FROM_REF";
case MAPPING_QUALITY:
return "MAPPING_QUALITY";
case MATE_DIST:
return "MATE_DIST";
case ALLELE_SIZE:
return "ALLELE_SIZE";
case SPANNING_AMOUNT:
return "SPANNING_AMOUNT";
case NUM_END_MATCHES:
return "NUM_END_MATCHES";
case NOT_MAXIMAL_END:
return "NOT_MAXIMAL_END";
case BP_BEFORE_INDEL:
return "BP_BEFORE_INDEL";
case UNFILTERED:
return "UNFILTERED";
default:
PrintMessageDieOnError("Invalid filter type", ERROR);
}
}

uint64_t FilterCounter::GetFilterCount(const int type){
if (type > NUM_FILTERS || type < 0)
PrintMessageDieOnError("Invalid filter type", ERROR);
return counts[type];
}


FilterCounter::~FilterCounter(){
delete [] counts;
}




62 changes: 62 additions & 0 deletions src/FilterCounter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
Copyright (C) 2014 Thomas Willems <[email protected]>
This file is part of lobSTR.
lobSTR is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
lobSTR is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with lobSTR. If not, see <http://www.gnu.org/licenses/>.
*/


#ifndef SRC_FILTER_COUNTER_H_
#define SRC_FILTER_COUNTER_H_

#include <stdint.h>

#include <string>

class FilterCounter {
private:
uint64_t* counts;

public:
const static int NUM_FILTERS = 10;

// Various filter types
const static int NOT_UNIT = 0;
const static int DIFF_FROM_REF = 1;
const static int MAPPING_QUALITY = 2;
const static int MATE_DIST = 3;
const static int ALLELE_SIZE = 4;
const static int SPANNING_AMOUNT = 5;
const static int NUM_END_MATCHES = 6;
const static int NOT_MAXIMAL_END = 7;
const static int BP_BEFORE_INDEL = 8;
const static int UNFILTERED = 9;

FilterCounter();

/* Increment the count for the provided filter type */
void increment(const int type);

/* Returns the name of the filter associated with the provided filter type */
std::string GetFilterType(const int type);

/* Returns the count associated with the provided filter type */
uint64_t GetFilterCount(const int type);

~FilterCounter();
};

#endif
10 changes: 10 additions & 0 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ liblobstr_a_SOURCES = \
FastqPairedFileReader.cpp FastqPairedFileReader.h \
FFT_four_nuc_vectors.cpp FFT_four_nuc_vectors.h \
FFT_nuc_vectors.cpp FFT_nuc_vectors.h \
FilterCounter.cpp FilterCounter.h \
gzstream.cpp gzstream.h \
STRRecord.h \
HammingWindowGenerator.cpp HammingWindowGenerator.h \
Expand Down Expand Up @@ -186,6 +187,7 @@ liballelotype_a_SOURCES = \
FastqFileReader.cpp FastqFileReader.h \
FastaPairedFileReader.cpp FastaPairedFileReader.h \
FastqPairedFileReader.cpp FastqPairedFileReader.h \
FilterCounter.cpp FilterCounter.h \
Genotyper.cpp Genotyper.h \
gzstream.cpp gzstream.h \
NoiseModel.cpp NoiseModel.h \
Expand All @@ -201,6 +203,7 @@ liballelotype_a_SOURCES = \
TextFileReader.cpp TextFileReader.h \
TextFileWriter.cpp TextFileWriter.h \
VCFWriter.cpp VCFWriter.h \
ZAlgorithm.cpp ZAlgorithm.h \
ZippedFastaFileReader.cpp ZippedFastaFileReader.h \
ZippedFastqFileReader.cpp ZippedFastqFileReader.h \
ZippedTextFileReader.cpp ZippedTextFileReader.h \
Expand Down Expand Up @@ -245,6 +248,8 @@ lobSTRTests_SOURCES = \
tests/BWAReadAligner_test.cpp \
tests/common_test.h \
tests/common_test.cpp \
tests/DNATools.h \
tests/DNATools.cpp \
tests/NWNoRefEndPenalty_test.h \
tests/NWNoRefEndPenalty_test.cpp \
tests/ReadContainer_test.h \
Expand All @@ -253,6 +258,8 @@ lobSTRTests_SOURCES = \
tests/RemoveDuplicates_test.cpp \
tests/VCFWriter_test.h \
tests/VCFWriter_test.cpp \
tests/ZAlgorithm_test.h \
tests/ZAlgorithm_test.cpp \
AlignmentFilters.cpp \
BamFileReader.cpp \
BamPairedFileReader.cpp \
Expand All @@ -266,6 +273,7 @@ lobSTRTests_SOURCES = \
FastqPairedFileReader.cpp \
FFT_four_nuc_vectors.cpp \
FFT_nuc_vectors.cpp \
FilterCounter.h FilterCounter.cpp \
gzstream.cpp \
HammingWindowGenerator.cpp \
MultithreadData.cpp \
Expand All @@ -281,6 +289,8 @@ lobSTRTests_SOURCES = \
ReadContainer.cpp \
RemoveDuplicates.cpp \
VCFWriter.cpp \
ZAlgorithm.h \
ZAlgorithm.cpp \
ZippedFastaFileReader.cpp \
ZippedFastqFileReader.cpp \
ZippedTextFileReader.cpp \
Expand Down
Loading

0 comments on commit cb24c2d

Please sign in to comment.