Skip to content

Commit

Permalink
Fixed issues with overhanging bases when aligning near a gap while us…
Browse files Browse the repository at this point in the history
…ing anchored alignment. The first issue was that the incremental hash key should have been reset when an unknown base was met, because it couldn't be 2bitpacked. The other one was that SeqAn, unlike EDlib, reports alignments even if all bases are Xs. I placed a threshold on the edit distance on alignment of the front and back part of a read (before first and after last alignment). These parts should not have edit distance larger than half their length. Otherwise, the read will be clipped on that end.
  • Loading branch information
isovic committed Nov 16, 2015
1 parent 8ef2eb0 commit db1362c
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 9 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ obj_debug/
obj_linux/
obj_mac/
obj_test/
obj_testext/
obj_extcigar/
# bin/graphmap-not_release
# bin/graphmap-debug
Expand Down
24 changes: 20 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ BIN = ./bin/graphmap-not_release
BIN_DEBUG = ./bin/graphmap-debug
BIN_LINUX = ./bin/Linux-x64/graphmap
BIN_MAC = ./bin/Mac/graphmap
OBJ = ./obj_test
OBJ_TESTING = ./obj_test
OBJ_TESTING_EXT = ./obj_testext
OBJ_DEBUG = ./obj_debug
OBJ_LINUX = ./obj_linux
OBJ_EXTCIGAR = ./obj_extcigar
Expand All @@ -22,7 +23,8 @@ CC_FILES := $(wildcard $(SOURCE)/*/*.cc) $(wildcard $(SOURCE)/*.cc)
H_FILES := $(wildcard $(SOURCE)/*/*.h) $(wildcard $(SOURCE)/*.h)

OBJ_FILES := $(CPP_FILES:.cpp=.o) $(CC_FILES:.cc=.o)
OBJ_FILES_FOLDER := $(addprefix $(OBJ)/,$(OBJ_FILES))
OBJ_FILES_FOLDER_TESTING := $(addprefix $(OBJ_TESTING)/,$(OBJ_FILES))
OBJ_FILES_FOLDER_TESTING_EXT := $(addprefix $(OBJ_TESTING_EXT)/,$(OBJ_FILES))
OBJ_FILES_FOLDER_DEBUG := $(addprefix $(OBJ_DEBUG)/,$(OBJ_FILES))
OBJ_FILES_FOLDER_LINUX := $(addprefix $(OBJ_LINUX)/,$(OBJ_FILES))
OBJ_FILES_FOLDER_EXTCIGAR := $(addprefix $(OBJ_EXTCIGAR)/,$(OBJ_FILES))
Expand All @@ -36,6 +38,7 @@ CC_FLAGS_DEBUG = -O0 -g -rdynamic -c -fmessage-length=0 -ffreestanding -fopenmp
CC_FLAGS_RELEASE = -DRELEASE_VERSION -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread # -march=native
CC_FLAGS_EXTCIGAR = -DRELEASE_VERSION -DUSE_EXTENDED_CIGAR_FORMAT -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -pthread -march=native
CC_FLAGS_NOT_RELEASE = -O3 -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native
CC_FLAGS_NOT_RELEASE_EXT = -O3 -DUSE_EXTENDED_CIGAR_FORMAT -fdata-sections -ffunction-sections -c -fmessage-length=0 -ffreestanding -fopenmp -m64 -std=c++11 -Werror=return-type -Wuninitialized -pthread -march=native
LD_FLAGS = -static-libgcc -static-libstdc++ -m64 -ffreestanding
LD_LIBS = -lpthread -lgomp -lm -lz -ldivsufsort64

Expand All @@ -45,9 +48,9 @@ all: gcc_version_check linux



testing: $(OBJ_FILES_FOLDER)
testing: $(OBJ_FILES_FOLDER_TESTING)
mkdir -p $(dir $(BIN))
$(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER) $(LD_LIBS)
$(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING) $(LD_LIBS)

obj_test/%.o: %.cc $(H_FILES)
mkdir -p $(dir $@)
Expand All @@ -57,6 +60,19 @@ obj_test/%.o: %.cpp $(H_FILES)
mkdir -p $(dir $@)
$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE) -o $@ $<

testingext: $(OBJ_FILES_FOLDER_TESTING_EXT)
mkdir -p $(dir $(BIN))
$(GCC) $(LD_FLAGS) $(LIB_DIRS) -o $(BIN) $(OBJ_FILES_FOLDER_TESTING_EXT) $(LD_LIBS)

obj_testext/%.o: %.cc $(H_FILES)
mkdir -p $(dir $@)
$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $<

obj_testext/%.o: %.cpp $(H_FILES)
mkdir -p $(dir $@)
$(GCC) $(CC_LIBS) $(INCLUDE) $(CC_FLAGS_NOT_RELEASE_EXT) -o $@ $<



gcc_version_check:
ifneq ($(GCC_MAJOR_VERSION_GE_4), 1)
Expand Down
7 changes: 7 additions & 0 deletions src/alignment/local_realignment_wrappers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align

std::vector<unsigned char> alignment;
// alignment.reserve(align.data_rows[0].
int64_t aln_edit_distance = 0;

// Reinitilaize the iterators.
TGapsIterator itGapsText = seqan::begin(gapsText);
Expand All @@ -141,6 +142,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align

if (alignment_type == ALIGNMENT_TYPE_NW || alignment_type == ALIGNMENT_TYPE_SHW) {
alignment.insert(alignment.begin(), (*ret_start_offset), (char) EDLIB_D);
aln_edit_distance += (*ret_start_offset);
*ret_start_offset = 0;
}

Expand All @@ -155,6 +157,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align
alignment.insert(alignment.end(), num_gaps, (char) EDLIB_D);
itGapsText += num_gaps;
itGapsPattern += num_gaps;
aln_edit_distance += num_gaps;
continue;
}
// Count deletions.
Expand All @@ -163,6 +166,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align
alignment.insert(alignment.end(), num_gaps, (char) EDLIB_I);
itGapsText += num_gaps;
itGapsPattern += num_gaps;
aln_edit_distance += num_gaps;
continue;
}

Expand All @@ -184,6 +188,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align
{
++numChar;
alignment.insert(alignment.end(), 1, (char) EDLIB_X);
aln_edit_distance += 1;
++itGapsText;
++itGapsPattern;
}
Expand All @@ -196,6 +201,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align

if (alignment_type == ALIGNMENT_TYPE_NW) {
alignment.insert(alignment.end(), (*ret_end_offset), (char) EDLIB_D);
aln_edit_distance += (*ret_end_offset);
*ret_end_offset = 0;
}

Expand All @@ -214,6 +220,7 @@ int SeqAnAlignmentToEdlibAlignmentNoCigar(seqan::Align<seqan::Dna5String> &align
// *ret_end_offset = 0;

ret_alignment = alignment;
*edit_distance = aln_edit_distance;

// if (alignment.size() > 0)
// cigar = AlignmentToCigar((unsigned char *) &alignment[0], alignment.size());
Expand Down
12 changes: 10 additions & 2 deletions src/graphmap/anchored_mapping.cc
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,7 @@ int AnchoredAlignment(bool is_linear, bool end_to_end, AlignmentFunctionType Ali
int64_t query_start = best_path->get_mapping_data().clusters.front().query.start; // - clip_count_front;
int64_t query_end = best_path->get_mapping_data().clusters.back().query.end; // + clip_count_back; /////I

/// Aligning the begining of the read (in front of the first anchor).
if (clip_count_front > 0) {
/// Check if we need to extend the alignment to the left boundary. Also, even if the user specified it, if we are to close to the boundary, just clip it.
if (end_to_end == false || ((alignment_position_start - clip_count_front*2) < reference_start)) {
Expand All @@ -491,7 +492,10 @@ int AnchoredAlignment(bool is_linear, bool end_to_end, AlignmentFunctionType Ali
-1, parameters.match_score, -parameters.mismatch_penalty, -parameters.gap_open_penalty, -parameters.gap_extend_penalty,
&leftover_left_start, &leftover_left_end,
&leftover_left_edit_distance, leftover_left_alignment);
if (ret_code_right != 0) {
/// Check if the return code is ok. Otherwise, just clip the front.
/// Added on 15.11.2015.: check if the edit distance of the front part is too high. EDlib will automatically return an error code, but SeqAn won't.
/// An example is when the entire front part does not match (e.g. alignment of a read to a part of the reference consisted of N bases).
if (ret_code_right != 0 || leftover_left_edit_distance > clip_count_front/2) {
// TODO: This is a nasty hack. EDlib used to crash when query and target are extremely small, e.g. query = "C" and target = "TC".
// In this manner we just ignore the leading part, and clip it.
std::vector<unsigned char> insertions_front(clip_count_front, EDLIB_I);
Expand Down Expand Up @@ -718,6 +722,7 @@ int AnchoredAlignment(bool is_linear, bool end_to_end, AlignmentFunctionType Ali



/// Aligning the end of the read.
if (clip_count_back > 0) {
/// Handle the clipping at the end, or extend alignment to the end of the sequence.
if (end_to_end == false || (alignment_position_end + 1 + clip_count_back * 2) >= (reference_start + reference_length)) {
Expand All @@ -737,7 +742,10 @@ int AnchoredAlignment(bool is_linear, bool end_to_end, AlignmentFunctionType Ali
-1, parameters.match_score, -parameters.mismatch_penalty, -parameters.gap_open_penalty, -parameters.gap_extend_penalty,
&leftover_right_start, &leftover_right_end,
&leftover_right_edit_distance, leftover_right_alignment);
if (ret_code_right != 0) {
/// Check if the return code is ok. Otherwise, just clip the back.
/// Added on 15.11.2015.: check if the edit distance of the back part is too high. EDlib will automatically return an error code, but SeqAn won't.
/// An example is when the entire back part does not match (e.g. alignment of a read to a part of the reference consisted of N bases).
if (ret_code_right != 0 || leftover_right_edit_distance > clip_count_back/2) {
// TODO: This is a nasty hack. EDlib used to crash when query and target are extremely small, e.g. query = "C" and target = "TC".
// In this manner we just ignore the trailing part, and clip it.
std::vector<unsigned char> insertions_back(clip_count_back, EDLIB_I);
Expand Down
3 changes: 2 additions & 1 deletion src/graphmap/core_graphmap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,14 @@ int GraphMap::ProcessKmerCacheFriendly_(int8_t *kmer, int64_t kmer_start_positio
uint64_t hits_start = -1;
uint64_t num_hits = 0;
int64_t *hits = NULL;

int ret_search = ((IndexHash *) index_read)->FindAllRawPositionsOfIncrementalSeed(kmer, (uint64_t) k, (uint64_t) parameters->max_num_hits, &hits, &hits_start, &num_hits);

if (ret_search == 1) { // There are no hits for the current kmer.
return 1;
} else if (ret_search == 2) { // There are too many seeds for the current kmer.
// We ignore this case, and we'll use all the hits.
} else if (ret_search > 2) { // Something other went wrong.
} else if (ret_search > 2 || ret_search < 0) { // Something other went wrong.
return 2;
}

Expand Down
9 changes: 7 additions & 2 deletions src/index/index_hash.cc
Original file line number Diff line number Diff line change
Expand Up @@ -225,19 +225,23 @@ int IndexHash::GenerateFromSingleSequenceOnlyForward(const SingleSequence& seque
// }

int64_t hash_key = -1;
bool last_skipped = true;

for (uint64_t i=0; i<(sequence.get_sequence_length() - k_ + 1); i++) {
int8_t *seed_start = &(data_[i]);
// int64_t hash_key = GenerateHashKey(seed_start, k_);

if (i == 0) {
if (i == 0 || last_skipped == true) {
hash_key = GenerateHashKey(seed_start, k_);
last_skipped = false;
} else {
hash_key = UpdateHashKey(seed_start, k_, hash_key);
}

if (hash_key < 0)
if (hash_key < 0) {
last_skipped = true;
continue;
}

// kmer_hash_[hash_key].push_back(((int64_t) i));
// printf ("\nkmer_counts[%ld] = %ld\n", hash_key, kmer_counts[hash_key]);
Expand Down Expand Up @@ -319,6 +323,7 @@ int IndexHash::FindAllRawPositionsOfIncrementalSeed(int8_t* seed, uint64_t seed_
}

if (hash_key < 0 || hash_key >= kmer_hash_size_) {
kmer_hash_last_key_initialized_ = false;
return 3;
}

Expand Down

0 comments on commit db1362c

Please sign in to comment.