From 3a216a2e86f2c72e66d49e178928889f9ccbee1e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 21 Sep 2023 16:57:38 -0700 Subject: [PATCH] move shared classes to their own source files --- intact/intact.py | 78 +++------------------------------ util/aligned_sequence.py | 20 +++++---- util/candidate_orf.py | 13 ++++++ util/expected_orf.py | 40 +++++++++++++++++ util/get_biggest_protein.py | 13 ++++++ util/reference_index.py | 8 ++++ util/translate_to_aminoacids.py | 6 +++ 7 files changed, 99 insertions(+), 79 deletions(-) create mode 100644 util/candidate_orf.py create mode 100644 util/expected_orf.py create mode 100644 util/get_biggest_protein.py create mode 100644 util/reference_index.py create mode 100644 util/translate_to_aminoacids.py diff --git a/intact/intact.py b/intact/intact.py index 40cbcdc..c8fd23f 100644 --- a/intact/intact.py +++ b/intact/intact.py @@ -15,8 +15,13 @@ import util.log as log import util.coordinates as coords import util.detailed_aligner as detailed_aligner -from util.aligned_sequence import AlignedSequence, ReferenceIndex +from util.aligned_sequence import AlignedSequence +from util.reference_index import ReferenceIndex from util.blastrow import BlastRow +from util.candidate_orf import CandidateORF +from util.expected_orf import ExpectedORF +from util.translate_to_aminoacids import translate_to_aminoacids +from util.get_biggest_protein import get_biggest_protein WRONGORFNUMBER_ERROR = "WrongORFNumber" @@ -46,54 +51,6 @@ class IntactnessError: message: str -@dataclass -class ExpectedORF: - name: str - start: int - end: int - deletion_tolerence: int - nucleotides: str - aminoacids: str - protein: str - - - @staticmethod - def subtyped(aligned_sequence, name, start, end, deletion_tolerence): - vpr_defective_insertion_pos = 5772 - start = start if start < vpr_defective_insertion_pos else start - 1 - end = end if end < vpr_defective_insertion_pos else end - 1 - - start_s = ReferenceIndex(start - 1).mapto(aligned_sequence) # decrement is needed because original "start" is 1-based. - end_s = ReferenceIndex(end).mapto(aligned_sequence) - - nucleotides = str(aligned_sequence.this.seq[start_s:end_s]) - aminoacids = translate(nucleotides) - has_start_codon = translate(aligned_sequence.this.seq[(start - 1):end]).startswith("M") - protein = get_biggest_protein(has_start_codon, aminoacids) - - return ExpectedORF(name=name, - start=start_s, - end=end_s, - deletion_tolerence=deletion_tolerence, - nucleotides=nucleotides, - aminoacids=aminoacids, - protein=protein, - ) - - -@dataclass -class CandidateORF: - name: str - start: int - end: int - subtype_start: int - subtype_end: int - orientation: str - distance: float - protein: str - aminoacids: str - - @dataclass class FoundORF: name: str @@ -475,21 +432,6 @@ def has_rev_response_element(alignment, rre_locus, rre_tolerance): #/end def has_rev_response_element -def get_biggest_protein(has_start_codon, aminoacids): - def skip_to_startcodon(x): - index = x.find("M") - if index >= 0: - return x[index:] - else: - return "" - - parts = aminoacids.split("*") - subparts = [skip_to_startcodon(x) for x in parts] if has_start_codon else parts - longest = max(subparts, key=len) - return longest - - - def has_start_codon(orf): return orf.aminoacids[0] == "M" @@ -498,12 +440,6 @@ def has_stop_codon(orf): return orf.aminoacids[-1] == "*" -def translate(seq, frame = 0, to_stop = False): - for_translation = seq[frame:] - for_translation += 'N' * ({0: 0, 1: 2, 2: 1}[len(for_translation) % 3]) - return Seq.translate(for_translation, to_stop = to_stop) - - def has_reading_frames( aligned_sequence, is_small, expected, error_bar, reverse = False @@ -515,7 +451,7 @@ def has_reading_frames( matches = [] try: - query_aminoacids_table = [translate(sequence.seq, i) for i in range(3)] + query_aminoacids_table = [translate_to_aminoacids(sequence.seq, i) for i in range(3)] except Bio.Data.CodonTable.TranslationError as e: log.error(e) err = IntactnessError(sequence.id, INVALID_CODON, e.args[0]) diff --git a/util/aligned_sequence.py b/util/aligned_sequence.py index f828f33..2e8bcd4 100644 --- a/util/aligned_sequence.py +++ b/util/aligned_sequence.py @@ -4,19 +4,16 @@ import util.coordinates as coords import util.wrappers as wrappers - -@dataclass -class ReferenceIndex: - value: int - - def mapto(self, aligned): - return aligned.map_index(self.value) +from util.candidate_orf import CandidateORF +from util.expected_orf import ExpectedORF +from util.reference_index import ReferenceIndex @dataclass class AlignedSequence: this: Seq reference: Seq - alignment: (str, str) = dataclasses.field(default=None) + orfs: dict[str, CandidateORF] = dataclasses.field(default=None) + alignment: (str, str) = dataclasses.field(default=None) coordinates_mapping: list[int] = dataclasses.field(default=None) @@ -82,3 +79,10 @@ def reverse(self): def alignment_score(self): return sum([a==b for a, b in zip(self.get_alignment()[0].seq, self.get_alignment()[1].seq)]) + + + def get_orf(self, expected_orf: ExpectedORF): + if expected_orf.name not in self.orfs: + self.orfs[expected_orf.name] = find_orf(expected_orf) + + return self.orfs[expected_orf.name] diff --git a/util/candidate_orf.py b/util/candidate_orf.py new file mode 100644 index 0000000..d535a26 --- /dev/null +++ b/util/candidate_orf.py @@ -0,0 +1,13 @@ +from dataclasses import dataclass + +@dataclass +class CandidateORF: + name: str + start: int + end: int + subtype_start: int + subtype_end: int + orientation: str + distance: float + protein: str + aminoacids: str diff --git a/util/expected_orf.py b/util/expected_orf.py new file mode 100644 index 0000000..ac76de1 --- /dev/null +++ b/util/expected_orf.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass +from util.reference_index import ReferenceIndex +from util.translate_to_aminoacids import translate_to_aminoacids +from util.get_biggest_protein import get_biggest_protein + + +@dataclass +class ExpectedORF: + name: str + start: int + end: int + deletion_tolerence: int + nucleotides: str + aminoacids: str + protein: str + + + @staticmethod + def subtyped(aligned_sequence, name, start, end, deletion_tolerence): + vpr_defective_insertion_pos = 5772 + start = start if start < vpr_defective_insertion_pos else start - 1 + end = end if end < vpr_defective_insertion_pos else end - 1 + + start_s = ReferenceIndex(start - 1).mapto(aligned_sequence) # decrement is needed because original "start" is 1-based. + end_s = ReferenceIndex(end).mapto(aligned_sequence) + + nucleotides = str(aligned_sequence.this.seq[start_s:end_s]) + aminoacids = translate_to_aminoacids(nucleotides) + has_start_codon = translate_to_aminoacids(aligned_sequence.this.seq[(start - 1):end]).startswith("M") + protein = get_biggest_protein(has_start_codon, aminoacids) + + return ExpectedORF(name=name, + start=start_s, + end=end_s, + deletion_tolerence=deletion_tolerence, + nucleotides=nucleotides, + aminoacids=aminoacids, + protein=protein, + ) + diff --git a/util/get_biggest_protein.py b/util/get_biggest_protein.py new file mode 100644 index 0000000..bfaf270 --- /dev/null +++ b/util/get_biggest_protein.py @@ -0,0 +1,13 @@ + +def get_biggest_protein(has_start_codon, aminoacids): + def skip_to_startcodon(x): + index = x.find("M") + if index >= 0: + return x[index:] + else: + return "" + + parts = aminoacids.split("*") + subparts = [skip_to_startcodon(x) for x in parts] if has_start_codon else parts + longest = max(subparts, key=len) + return longest diff --git a/util/reference_index.py b/util/reference_index.py new file mode 100644 index 0000000..7ce01b7 --- /dev/null +++ b/util/reference_index.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass + +@dataclass +class ReferenceIndex: + value: int + + def mapto(self, aligned): + return aligned.map_index(self.value) diff --git a/util/translate_to_aminoacids.py b/util/translate_to_aminoacids.py new file mode 100644 index 0000000..69af323 --- /dev/null +++ b/util/translate_to_aminoacids.py @@ -0,0 +1,6 @@ +from Bio import Seq + +def translate_to_aminoacids(seq, frame = 0, to_stop = False): + for_translation = seq[frame:] + for_translation += 'N' * ({0: 0, 1: 2, 2: 1}[len(for_translation) % 3]) + return Seq.translate(for_translation, to_stop = to_stop)