From 87e361fda9d6eddbcb39368d8ea4bd5eaf04db6a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 21 Sep 2023 17:52:52 -0700 Subject: [PATCH] move find_orf into its own source file --- intact/intact.py | 66 +----------------------------- util/aligned_sequence.py | 1 + util/find_orf.py | 66 ++++++++++++++++++++++++++++++ util/get_query_aminoacids_table.py | 10 +++++ 4 files changed, 79 insertions(+), 64 deletions(-) create mode 100644 util/find_orf.py create mode 100644 util/get_query_aminoacids_table.py diff --git a/intact/intact.py b/intact/intact.py index c8fd23f..c5196ea 100644 --- a/intact/intact.py +++ b/intact/intact.py @@ -22,6 +22,7 @@ from util.expected_orf import ExpectedORF from util.translate_to_aminoacids import translate_to_aminoacids from util.get_biggest_protein import get_biggest_protein +from util.find_orf import find_orf WRONGORFNUMBER_ERROR = "WrongORFNumber" @@ -432,14 +433,6 @@ def has_rev_response_element(alignment, rre_locus, rre_tolerance): #/end def has_rev_response_element -def has_start_codon(orf): - return orf.aminoacids[0] == "M" - - -def has_stop_codon(orf): - return orf.aminoacids[-1] == "*" - - def has_reading_frames( aligned_sequence, is_small, expected, error_bar, reverse = False @@ -450,61 +443,6 @@ def has_reading_frames( errors = [] matches = [] - try: - query_aminoacids_table = [translate_to_aminoacids(sequence.seq, i) for i in range(3)] - except Bio.Data.CodonTable.TranslationError as e: - log.error(e) - err = IntactnessError(sequence.id, INVALID_CODON, e.args[0]) - errors.append(err) - return matches, errors - - def find_closest(aminoacids, start, direction, target): - distance = 0 - n = len(aminoacids) - 1 - - while start + distance >= 0 and start + distance <= n: - if aminoacids[start + distance] == target: - return start + distance - distance += direction - - if target == '*': - return n - else: - return 0 - - def find_candidate_positions(e): - q_start = ReferenceIndex(e.start).mapto(aligned_sequence) - q_end = ReferenceIndex(e.end).mapto(aligned_sequence) - expected_aminoacids = e.aminoacids - expected_protein = expected_aminoacids.strip("*") - q_start_a = q_start // 3 - q_end_a = q_end // 3 - n = len(sequence.seq) - 1 - visited_set = set() - - for frame in range(3): - aminoacids = query_aminoacids_table[frame] - for start_direction in [-1, +1]: - for end_direction in [-1, +1]: - closest_start_a = q_start_a if not has_start_codon(e) else find_closest(aminoacids, q_start_a, start_direction, 'M') - closest_end_a = q_end_a if not has_stop_codon(e) else find_closest(aminoacids, q_end_a, end_direction, '*') - got_aminoacids = aminoacids[closest_start_a:closest_end_a + 1] - if got_aminoacids in visited_set: - continue - else: - visited_set.add(got_aminoacids) - - closest_start = min(n, (closest_start_a * 3) + frame) - closest_end = min(n + 1, (closest_end_a * 3) + 3 + frame) - got_protein = get_biggest_protein(has_start_codon(e), got_aminoacids) - dist = detailed_aligner.align(got_protein, expected_protein).distance() - yield CandidateORF(e.name, closest_start, closest_end, e.start, e.end, - "forward", dist, got_protein, got_aminoacids) - - def find_real_correspondence(e): - candidates = find_candidate_positions(e) - return min(candidates, key=lambda x: x.distance) - def get_indel_impact(alignment): shift = 0 impacted = 0 @@ -528,7 +466,7 @@ def get_indel_impact(alignment): return impacted for e in expected: - best_match = find_real_correspondence(e) + best_match = find_orf(aligned_sequence, e) matches.append(best_match) got_protein = best_match.protein diff --git a/util/aligned_sequence.py b/util/aligned_sequence.py index 2e8bcd4..fd7bb40 100644 --- a/util/aligned_sequence.py +++ b/util/aligned_sequence.py @@ -7,6 +7,7 @@ from util.candidate_orf import CandidateORF from util.expected_orf import ExpectedORF from util.reference_index import ReferenceIndex +from util.find_orf import find_orf @dataclass class AlignedSequence: diff --git a/util/find_orf.py b/util/find_orf.py new file mode 100644 index 0000000..bfe976d --- /dev/null +++ b/util/find_orf.py @@ -0,0 +1,66 @@ +from util.reference_index import ReferenceIndex +from util.get_query_aminoacids_table import get_query_aminoacids_table +from util.get_biggest_protein import get_biggest_protein +from util.candidate_orf import CandidateORF + +import util.detailed_aligner as detailed_aligner + +def has_start_codon(orf): + return orf.aminoacids[0] == "M" + + +def has_stop_codon(orf): + return orf.aminoacids[-1] == "*" + + +def find_closest(aminoacids, start, direction, target): + distance = 0 + n = len(aminoacids) - 1 + + while start + distance >= 0 and start + distance <= n: + if aminoacids[start + distance] == target: + return start + distance + distance += direction + + if target == '*': + return n + else: + return 0 + + +def find_candidate_positions(aligned_sequence, e): + q_start = ReferenceIndex(e.start).mapto(aligned_sequence) + q_end = ReferenceIndex(e.end).mapto(aligned_sequence) + expected_aminoacids = e.aminoacids + expected_protein = expected_aminoacids.strip("*") + q_start_a = q_start // 3 + q_end_a = q_end // 3 + n = len(aligned_sequence.this.seq) - 1 + visited_set = set() + query_aminoacids_table = get_query_aminoacids_table(aligned_sequence.this) + if isinstance(query_aminoacids_table, Exception): + raise query_aminoacids_table + + for frame in range(3): + aminoacids = query_aminoacids_table[frame] + for start_direction in [-1, +1]: + for end_direction in [-1, +1]: + closest_start_a = q_start_a if not has_start_codon(e) else find_closest(aminoacids, q_start_a, start_direction, 'M') + closest_end_a = q_end_a if not has_stop_codon(e) else find_closest(aminoacids, q_end_a, end_direction, '*') + got_aminoacids = aminoacids[closest_start_a:closest_end_a + 1] + if got_aminoacids in visited_set: + continue + else: + visited_set.add(got_aminoacids) + + closest_start = min(n, (closest_start_a * 3) + frame) + closest_end = min(n + 1, (closest_end_a * 3) + 3 + frame) + got_protein = get_biggest_protein(has_start_codon(e), got_aminoacids) + dist = detailed_aligner.align(got_protein, expected_protein).distance() + yield CandidateORF(e.name, closest_start, closest_end, e.start, e.end, + "forward", dist, got_protein, got_aminoacids) + + +def find_orf(aligned_sequence, e): + candidates = find_candidate_positions(aligned_sequence, e) + return min(candidates, key=lambda x: x.distance) diff --git a/util/get_query_aminoacids_table.py b/util/get_query_aminoacids_table.py new file mode 100644 index 0000000..09cd412 --- /dev/null +++ b/util/get_query_aminoacids_table.py @@ -0,0 +1,10 @@ +from util.translate_to_aminoacids import translate_to_aminoacids + +def get_query_aminoacids_table(sequence): + if not hasattr(sequence, "query_aminoacids_table"): + try: + sequence.query_aminoacids_table = [translate_to_aminoacids(sequence.seq, i) for i in range(3)] + except Bio.Data.CodonTable.TranslationError as e: + sequence.query_aminoacids_table = e + + return sequence.query_aminoacids_table