From deb2d9515ee21afa1ff00b0d4c06ab0b7fc95695 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 21 Sep 2023 16:48:26 -0700 Subject: [PATCH] avoid direct use of coordinates mapping in has_reading_frames --- intact/intact.py | 30 +++++++++--------------------- util/aligned_sequence.py | 13 ++++++++++--- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/intact/intact.py b/intact/intact.py index 9423ea9..c0042e4 100644 --- a/intact/intact.py +++ b/intact/intact.py @@ -505,18 +505,11 @@ def translate(seq, frame = 0, to_stop = False): def has_reading_frames( - alignment, sequence, reference, is_small, + aligned_sequence, is_small, expected, error_bar, reverse = False ): - """ - Check for presence of small reading frames - """ - - coordinates_mapping = coords.map_positions(alignment[0], alignment[1].seq) - reverse_coordinates_mapping = coords.map_positions(alignment[1], alignment[0].seq) - - reference_aligned_mapping = coords.map_nonaligned_to_aligned_positions(reference, alignment[0].seq) - query_aligned_mapping = coords.map_nonaligned_to_aligned_positions(sequence, alignment[1].seq) + sequence = aligned_sequence.this + reference = aligned_sequence.reference errors = [] matches = [] @@ -543,14 +536,11 @@ def find_closest(aminoacids, start, direction, target): else: return 0 - def find_candidate_positions(e, q_start, q_end): - expected_nucleotides = e.nucleotides + def find_candidate_positions(e): + q_start = aligned_sequence.map_index(e.start) + q_end = aligned_sequence.map_index(e.end) expected_aminoacids = e.aminoacids expected_protein = expected_aminoacids.strip("*") - q_start = coordinates_mapping[e.start] - q_end = coordinates_mapping[e.end - 1 if e.end == len(coordinates_mapping) else e.end] - got_nucleotides = sequence.seq[q_start:q_end] - got_aminoacids = translate(got_nucleotides) q_start_a = q_start // 3 q_end_a = q_end // 3 n = len(sequence.seq) - 1 @@ -576,9 +566,7 @@ def find_candidate_positions(e, q_start, q_end): "forward", dist, got_protein, got_aminoacids) def find_real_correspondence(e): - q_start = coordinates_mapping[e.start] - q_end = coordinates_mapping[e.end - 1 if e.end == len(coordinates_mapping) else e.end] - candidates = find_candidate_positions(e, q_start, q_end) + candidates = find_candidate_positions(e) return min(candidates, key=lambda x: x.distance) def get_indel_impact(alignment): @@ -895,12 +883,12 @@ def intact( working_dir, alignment = aligned_sequence.get_alignment() sequence_orfs, orf_errors = has_reading_frames( - alignment, sequence, reference, False, + aligned_sequence, False, forward_orfs, error_bar) sequence_errors.extend(orf_errors) sequence_small_orfs, small_orf_errors = has_reading_frames( - alignment, sequence, reference, True, + aligned_sequence, True, small_orfs, error_bar, reverse = False) if include_small_orfs: sequence_errors.extend(small_orf_errors) diff --git a/util/aligned_sequence.py b/util/aligned_sequence.py index 949293f..844530d 100644 --- a/util/aligned_sequence.py +++ b/util/aligned_sequence.py @@ -33,14 +33,21 @@ def aligned_this(self): return self.get_alignment()[1] - def map_index(self, index): + def get_coordinates_mapping(self): if not self.coordinates_mapping: self.coordinates_mapping = coords.map_positions(self.aligned_reference(), self.aligned_this()) + return self.coordinates_mapping + + + def map_index(self, index): + if isinstance(index, ReferenceIndex): + index = index.value + if not isinstance(index, int): raise TypeError(f"Expected integer as index", index) - return self.coordinates_mapping[index] + return self.get_coordinates_mapping()[index] def index(self, index): @@ -59,7 +66,7 @@ def slice(self, first, last): newthis = self.this[first:(last + 1)] newreference = self.reference[self.map_index(first):(self.map_index(last) + 1)] # TODO: calculate new "coordinates_mapping" and new "alignment" from these indexes. - return Sequence(this=newthis, reference=newreference) + return AlignedSequence(this=newthis, reference=newreference) def reverse(self):