factor out reading of HXB2 ORFs

cfe-lab · Sep 22, 2023 · 3b24f43 · 3b24f43
1 parent c399add
commit 3b24f43
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 18 deletions.
diff --git a/intact/intact.py b/intact/intact.py
@@ -638,10 +638,16 @@ def write(self, sequence, is_intact, orfs, errors, holistic):
             for error in errors:
                 self.errors_writer.writerow([error[key] for key in self.errors_header])
 
-def strip_sequence_dashes(seq):
-    return SeqRecord.SeqRecord(
-        Seq.Seq(str(seq.seq).replace("-","").replace("\n", "")),
-        id = seq.id, name = seq.name)
+def read_hxb2_orfs(aligned_subtype, orfs):
+    for (name, start, end, delta) in orfs:
+        vpr_defective_insertion_pos = 5772
+        start = start if start < vpr_defective_insertion_pos else start - 1
+        end = end if end < vpr_defective_insertion_pos else end - 1
+
+        # decrement is needed because original coordinates are 1-based.
+        start = start - 1
+
+        yield ExpectedORF.subtyped(aligned_subtype, name, start, end, delta)
 
 def intact( working_dir,
             input_file,
@@ -677,7 +683,7 @@ def intact( working_dir,
     subtype_choices = {}
     with open(st.alignment_file(subtype), 'r') as in_handle:
         for sequence in SeqIO.parse(in_handle, "fasta"):
-            subtype_choices[sequence.id] = strip_sequence_dashes(sequence)
+            subtype_choices[sequence.id] = sequence
 
     def analyse_single_sequence(holistic, sequence, blast_rows):
         sequence_errors = []
@@ -724,13 +730,9 @@ def analyse_single_sequence(holistic, sequence, blast_rows):
             sequence = aligned_sequence.this
 
         # convert ORF positions to appropriate subtype
-        forward_orfs, reverse_orfs, small_orfs = [
-        [
-            ExpectedORF.subtyped(aligned_subtype, n, s, e, delta) \
-            for (n, s, e, delta) in orfs
-        ] \
-        for orfs in [hxb2_forward_orfs, hxb2_reverse_orfs, hxb2_small_orfs]
-        ]
+        forward_orfs, reverse_orfs, small_orfs = \
+            [list(read_hxb2_orfs(aligned_subtype, orfs)) \
+             for orfs in [hxb2_forward_orfs, hxb2_reverse_orfs, hxb2_small_orfs]]
 
         holistic.orfs_start = min(forward_orfs, key=lambda e: e.start).start
         holistic.orfs_end = max(forward_orfs, key=lambda e: e.end).end

diff --git a/util/expected_orf.py b/util/expected_orf.py
@@ -17,16 +17,12 @@ class ExpectedORF:
 
     @staticmethod
     def subtyped(aligned_sequence, name, start, end, deletion_tolerence):
-        vpr_defective_insertion_pos = 5772
-        start = start if start < vpr_defective_insertion_pos else start - 1
-        end = end if end < vpr_defective_insertion_pos else end - 1
-
-        start_s = ReferenceIndex(start - 1).mapto(aligned_sequence) # decrement is needed because original "start" is 1-based.
+        start_s = ReferenceIndex(start).mapto(aligned_sequence)
         end_s = ReferenceIndex(end).mapto(aligned_sequence)
 
         nucleotides = str(aligned_sequence.this.seq[start_s:end_s])
         aminoacids = translate_to_aminoacids(nucleotides)
-        has_start_codon = translate_to_aminoacids(aligned_sequence.this.seq[(start - 1):end]).startswith("M")
+        has_start_codon = translate_to_aminoacids(aligned_sequence.this.seq[start:end]).startswith("M")
         protein = get_biggest_protein(has_start_codon, aminoacids)
 
         return ExpectedORF(name=name,