From 3a216a2e86f2c72e66d49e178928889f9ccbee1e Mon Sep 17 00:00:00 2001
From: Vitaliy Mysak <vmysak@bccfe.ca>
Date: Thu, 21 Sep 2023 16:57:38 -0700
Subject: [PATCH] move shared classes to their own source files

---
 intact/intact.py                | 78 +++------------------------------
 util/aligned_sequence.py        | 20 +++++----
 util/candidate_orf.py           | 13 ++++++
 util/expected_orf.py            | 40 +++++++++++++++++
 util/get_biggest_protein.py     | 13 ++++++
 util/reference_index.py         |  8 ++++
 util/translate_to_aminoacids.py |  6 +++
 7 files changed, 99 insertions(+), 79 deletions(-)
 create mode 100644 util/candidate_orf.py
 create mode 100644 util/expected_orf.py
 create mode 100644 util/get_biggest_protein.py
 create mode 100644 util/reference_index.py
 create mode 100644 util/translate_to_aminoacids.py

diff --git a/intact/intact.py b/intact/intact.py
index 40cbcdc..c8fd23f 100644
--- a/intact/intact.py
+++ b/intact/intact.py
@@ -15,8 +15,13 @@
 import util.log as log
 import util.coordinates as coords
 import util.detailed_aligner as detailed_aligner
-from util.aligned_sequence import AlignedSequence, ReferenceIndex
+from util.aligned_sequence import AlignedSequence
+from util.reference_index import ReferenceIndex
 from util.blastrow import BlastRow
+from util.candidate_orf import CandidateORF
+from util.expected_orf import ExpectedORF
+from util.translate_to_aminoacids import translate_to_aminoacids
+from util.get_biggest_protein import get_biggest_protein
 
 
 WRONGORFNUMBER_ERROR    = "WrongORFNumber"
@@ -46,54 +51,6 @@ class IntactnessError:
     message: str
 
 
-@dataclass
-class ExpectedORF:
-    name: str
-    start: int
-    end: int
-    deletion_tolerence: int
-    nucleotides: str
-    aminoacids: str
-    protein: str
-
-
-    @staticmethod
-    def subtyped(aligned_sequence, name, start, end, deletion_tolerence):
-        vpr_defective_insertion_pos = 5772
-        start = start if start < vpr_defective_insertion_pos else start - 1
-        end = end if end < vpr_defective_insertion_pos else end - 1
-
-        start_s = ReferenceIndex(start - 1).mapto(aligned_sequence) # decrement is needed because original "start" is 1-based.
-        end_s = ReferenceIndex(end).mapto(aligned_sequence)
-
-        nucleotides = str(aligned_sequence.this.seq[start_s:end_s])
-        aminoacids = translate(nucleotides)
-        has_start_codon = translate(aligned_sequence.this.seq[(start - 1):end]).startswith("M")
-        protein = get_biggest_protein(has_start_codon, aminoacids)
-
-        return ExpectedORF(name=name,
-                           start=start_s,
-                           end=end_s,
-                           deletion_tolerence=deletion_tolerence,
-                           nucleotides=nucleotides,
-                           aminoacids=aminoacids,
-                           protein=protein,
-                           )
-
-
-@dataclass
-class CandidateORF:
-    name: str
-    start: int
-    end: int
-    subtype_start: int
-    subtype_end: int
-    orientation: str
-    distance: float
-    protein: str
-    aminoacids: str
-
-
 @dataclass
 class FoundORF:
     name: str
@@ -475,21 +432,6 @@ def has_rev_response_element(alignment, rre_locus, rre_tolerance):
 #/end def has_rev_response_element
 
 
-def get_biggest_protein(has_start_codon, aminoacids):
-    def skip_to_startcodon(x):
-        index = x.find("M")
-        if index >= 0:
-            return x[index:]
-        else:
-            return ""
-
-    parts = aminoacids.split("*")
-    subparts = [skip_to_startcodon(x) for x in parts] if has_start_codon else parts
-    longest = max(subparts, key=len)
-    return longest
-
-
-
 def has_start_codon(orf):
     return orf.aminoacids[0] == "M"
 
@@ -498,12 +440,6 @@ def has_stop_codon(orf):
     return orf.aminoacids[-1] == "*"
 
 
-def translate(seq, frame = 0, to_stop = False):
-    for_translation = seq[frame:]
-    for_translation += 'N' * ({0: 0, 1: 2, 2: 1}[len(for_translation) % 3])
-    return Seq.translate(for_translation, to_stop = to_stop)
-
-
 def has_reading_frames(
     aligned_sequence, is_small,
     expected, error_bar, reverse = False
@@ -515,7 +451,7 @@ def has_reading_frames(
     matches = []
 
     try:
-        query_aminoacids_table = [translate(sequence.seq, i) for i in range(3)]
+        query_aminoacids_table = [translate_to_aminoacids(sequence.seq, i) for i in range(3)]
     except Bio.Data.CodonTable.TranslationError as e:
         log.error(e)
         err = IntactnessError(sequence.id, INVALID_CODON, e.args[0])
diff --git a/util/aligned_sequence.py b/util/aligned_sequence.py
index f828f33..2e8bcd4 100644
--- a/util/aligned_sequence.py
+++ b/util/aligned_sequence.py
@@ -4,19 +4,16 @@
 
 import util.coordinates as coords
 import util.wrappers as wrappers
-
-@dataclass
-class ReferenceIndex:
-    value: int
-
-    def mapto(self, aligned):
-        return aligned.map_index(self.value)
+from util.candidate_orf import CandidateORF
+from util.expected_orf import ExpectedORF
+from util.reference_index import ReferenceIndex
 
 @dataclass
 class AlignedSequence:
     this: Seq
     reference: Seq
-    alignment: (str, str) = dataclasses.field(default=None)
+    orfs: dict[str, CandidateORF]  = dataclasses.field(default=None)
+    alignment: (str, str)          = dataclasses.field(default=None)
     coordinates_mapping: list[int] = dataclasses.field(default=None)
 
 
@@ -82,3 +79,10 @@ def reverse(self):
 
     def alignment_score(self):
         return sum([a==b for a, b in zip(self.get_alignment()[0].seq, self.get_alignment()[1].seq)])
+
+
+    def get_orf(self, expected_orf: ExpectedORF):
+        if expected_orf.name not in self.orfs:
+            self.orfs[expected_orf.name] = find_orf(expected_orf)
+
+        return self.orfs[expected_orf.name]
diff --git a/util/candidate_orf.py b/util/candidate_orf.py
new file mode 100644
index 0000000..d535a26
--- /dev/null
+++ b/util/candidate_orf.py
@@ -0,0 +1,13 @@
+from dataclasses import dataclass
+
+@dataclass
+class CandidateORF:
+    name: str
+    start: int
+    end: int
+    subtype_start: int
+    subtype_end: int
+    orientation: str
+    distance: float
+    protein: str
+    aminoacids: str
diff --git a/util/expected_orf.py b/util/expected_orf.py
new file mode 100644
index 0000000..ac76de1
--- /dev/null
+++ b/util/expected_orf.py
@@ -0,0 +1,40 @@
+from dataclasses import dataclass
+from util.reference_index import ReferenceIndex
+from util.translate_to_aminoacids import translate_to_aminoacids
+from util.get_biggest_protein import get_biggest_protein
+
+
+@dataclass
+class ExpectedORF:
+    name: str
+    start: int
+    end: int
+    deletion_tolerence: int
+    nucleotides: str
+    aminoacids: str
+    protein: str
+
+
+    @staticmethod
+    def subtyped(aligned_sequence, name, start, end, deletion_tolerence):
+        vpr_defective_insertion_pos = 5772
+        start = start if start < vpr_defective_insertion_pos else start - 1
+        end = end if end < vpr_defective_insertion_pos else end - 1
+
+        start_s = ReferenceIndex(start - 1).mapto(aligned_sequence) # decrement is needed because original "start" is 1-based.
+        end_s = ReferenceIndex(end).mapto(aligned_sequence)
+
+        nucleotides = str(aligned_sequence.this.seq[start_s:end_s])
+        aminoacids = translate_to_aminoacids(nucleotides)
+        has_start_codon = translate_to_aminoacids(aligned_sequence.this.seq[(start - 1):end]).startswith("M")
+        protein = get_biggest_protein(has_start_codon, aminoacids)
+
+        return ExpectedORF(name=name,
+                           start=start_s,
+                           end=end_s,
+                           deletion_tolerence=deletion_tolerence,
+                           nucleotides=nucleotides,
+                           aminoacids=aminoacids,
+                           protein=protein,
+                           )
+
diff --git a/util/get_biggest_protein.py b/util/get_biggest_protein.py
new file mode 100644
index 0000000..bfaf270
--- /dev/null
+++ b/util/get_biggest_protein.py
@@ -0,0 +1,13 @@
+
+def get_biggest_protein(has_start_codon, aminoacids):
+    def skip_to_startcodon(x):
+        index = x.find("M")
+        if index >= 0:
+            return x[index:]
+        else:
+            return ""
+
+    parts = aminoacids.split("*")
+    subparts = [skip_to_startcodon(x) for x in parts] if has_start_codon else parts
+    longest = max(subparts, key=len)
+    return longest
diff --git a/util/reference_index.py b/util/reference_index.py
new file mode 100644
index 0000000..7ce01b7
--- /dev/null
+++ b/util/reference_index.py
@@ -0,0 +1,8 @@
+from dataclasses import dataclass
+
+@dataclass
+class ReferenceIndex:
+    value: int
+
+    def mapto(self, aligned):
+        return aligned.map_index(self.value)
diff --git a/util/translate_to_aminoacids.py b/util/translate_to_aminoacids.py
new file mode 100644
index 0000000..69af323
--- /dev/null
+++ b/util/translate_to_aminoacids.py
@@ -0,0 +1,6 @@
+from Bio import Seq
+
+def translate_to_aminoacids(seq, frame = 0, to_stop = False):
+    for_translation = seq[frame:]
+    for_translation += 'N' * ({0: 0, 1: 2, 2: 1}[len(for_translation) % 3])
+    return Seq.translate(for_translation, to_stop = to_stop)