diff --git a/intact/intact.py b/intact/intact.py index 485586e..a79c815 100644 --- a/intact/intact.py +++ b/intact/intact.py @@ -638,10 +638,16 @@ def write(self, sequence, is_intact, orfs, errors, holistic): for error in errors: self.errors_writer.writerow([error[key] for key in self.errors_header]) -def strip_sequence_dashes(seq): - return SeqRecord.SeqRecord( - Seq.Seq(str(seq.seq).replace("-","").replace("\n", "")), - id = seq.id, name = seq.name) +def read_hxb2_orfs(aligned_subtype, orfs): + for (name, start, end, delta) in orfs: + vpr_defective_insertion_pos = 5772 + start = start if start < vpr_defective_insertion_pos else start - 1 + end = end if end < vpr_defective_insertion_pos else end - 1 + + # decrement is needed because original coordinates are 1-based. + start = start - 1 + + yield ExpectedORF.subtyped(aligned_subtype, name, start, end, delta) def intact( working_dir, input_file, @@ -677,7 +683,7 @@ def intact( working_dir, subtype_choices = {} with open(st.alignment_file(subtype), 'r') as in_handle: for sequence in SeqIO.parse(in_handle, "fasta"): - subtype_choices[sequence.id] = strip_sequence_dashes(sequence) + subtype_choices[sequence.id] = sequence def analyse_single_sequence(holistic, sequence, blast_rows): sequence_errors = [] @@ -724,13 +730,9 @@ def analyse_single_sequence(holistic, sequence, blast_rows): sequence = aligned_sequence.this # convert ORF positions to appropriate subtype - forward_orfs, reverse_orfs, small_orfs = [ - [ - ExpectedORF.subtyped(aligned_subtype, n, s, e, delta) \ - for (n, s, e, delta) in orfs - ] \ - for orfs in [hxb2_forward_orfs, hxb2_reverse_orfs, hxb2_small_orfs] - ] + forward_orfs, reverse_orfs, small_orfs = \ + [list(read_hxb2_orfs(aligned_subtype, orfs)) \ + for orfs in [hxb2_forward_orfs, hxb2_reverse_orfs, hxb2_small_orfs]] holistic.orfs_start = min(forward_orfs, key=lambda e: e.start).start holistic.orfs_end = max(forward_orfs, key=lambda e: e.end).end diff --git a/util/expected_orf.py b/util/expected_orf.py index ac76de1..cbf987c 100644 --- a/util/expected_orf.py +++ b/util/expected_orf.py @@ -17,16 +17,12 @@ class ExpectedORF: @staticmethod def subtyped(aligned_sequence, name, start, end, deletion_tolerence): - vpr_defective_insertion_pos = 5772 - start = start if start < vpr_defective_insertion_pos else start - 1 - end = end if end < vpr_defective_insertion_pos else end - 1 - - start_s = ReferenceIndex(start - 1).mapto(aligned_sequence) # decrement is needed because original "start" is 1-based. + start_s = ReferenceIndex(start).mapto(aligned_sequence) end_s = ReferenceIndex(end).mapto(aligned_sequence) nucleotides = str(aligned_sequence.this.seq[start_s:end_s]) aminoacids = translate_to_aminoacids(nucleotides) - has_start_codon = translate_to_aminoacids(aligned_sequence.this.seq[(start - 1):end]).startswith("M") + has_start_codon = translate_to_aminoacids(aligned_sequence.this.seq[start:end]).startswith("M") protein = get_biggest_protein(has_start_codon, aminoacids) return ExpectedORF(name=name,