From 7c4eedea5351b41919ab7d9532916f586af0fbd8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 22 Sep 2023 11:29:36 -0700 Subject: [PATCH] factor out reading of hbx2 orfs coordinates --- intact/intact.py | 19 ++++++++++++------- util/expected_orf.py | 6 +----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/intact/intact.py b/intact/intact.py index 485586e..761fd8d 100644 --- a/intact/intact.py +++ b/intact/intact.py @@ -643,6 +643,15 @@ def strip_sequence_dashes(seq): Seq.Seq(str(seq.seq).replace("-","").replace("\n", "")), id = seq.id, name = seq.name) +def read_hxb2_orfs(aligned_subtype, orfs): + for (name, start, end, delta) in orfs: + vpr_defective_insertion_pos = 5771 + start = start - 1 # Decrement is needed because the original "start" is 1-based. + start = start if start < vpr_defective_insertion_pos else start - 1 + end = end if end < vpr_defective_insertion_pos else end - 1 + + yield ExpectedORF.subtyped(aligned_subtype, name, start, end, delta) + def intact( working_dir, input_file, subtype, @@ -724,13 +733,9 @@ def analyse_single_sequence(holistic, sequence, blast_rows): sequence = aligned_sequence.this # convert ORF positions to appropriate subtype - forward_orfs, reverse_orfs, small_orfs = [ - [ - ExpectedORF.subtyped(aligned_subtype, n, s, e, delta) \ - for (n, s, e, delta) in orfs - ] \ - for orfs in [hxb2_forward_orfs, hxb2_reverse_orfs, hxb2_small_orfs] - ] + forward_orfs, reverse_orfs, small_orfs = \ + [list(read_hxb2_orfs(aligned_subtype, orfs)) \ + for orfs in [hxb2_forward_orfs, hxb2_reverse_orfs, hxb2_small_orfs]] holistic.orfs_start = min(forward_orfs, key=lambda e: e.start).start holistic.orfs_end = max(forward_orfs, key=lambda e: e.end).end diff --git a/util/expected_orf.py b/util/expected_orf.py index ac76de1..17cb00a 100644 --- a/util/expected_orf.py +++ b/util/expected_orf.py @@ -17,11 +17,7 @@ class ExpectedORF: @staticmethod def subtyped(aligned_sequence, name, start, end, deletion_tolerence): - vpr_defective_insertion_pos = 5772 - start = start if start < vpr_defective_insertion_pos else start - 1 - end = end if end < vpr_defective_insertion_pos else end - 1 - - start_s = ReferenceIndex(start - 1).mapto(aligned_sequence) # decrement is needed because original "start" is 1-based. + start_s = ReferenceIndex(start).mapto(aligned_sequence) end_s = ReferenceIndex(end).mapto(aligned_sequence) nucleotides = str(aligned_sequence.this.seq[start_s:end_s])