Skip to content

Commit

Permalink
factor out analyse_single_sequence function
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Sep 22, 2023
1 parent 43efacd commit 47fde59
Showing 1 changed file with 154 additions and 150 deletions.
304 changes: 154 additions & 150 deletions intact/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,173 +679,177 @@ def intact( working_dir,
for sequence in SeqIO.parse(in_handle, "fasta"):
subtype_choices[sequence.id] = strip_sequence_dashes(sequence)

hxb2_reference = st.HXB2()
def analyse_single_sequence(holistic, sequence, blast_rows):
sequence_errors = []

with OutputWriter(working_dir, "csv" if output_csv else "json") as writer:

blast_it = blast_iterate_inf(subtype, input_file, working_dir) if check_internal_inversion or check_nonhiv or check_scramble or 1 < len(subtype_choices) else iterate_empty_lists()
for (sequence, blast_rows) in with_blast_rows(blast_it, iterate_sequences(input_file)):
holistic.qlen = len(sequence)
holistic.blast_n_conseqs = len(blast_rows)

blast_rows_statistics = {}
for blast_row in blast_rows:
blast_rows_statistics[blast_row.sseqid] = blast_rows_statistics.get(blast_row.sseqid, 0) + abs(blast_row.qend - blast_row.qstart)
aligned_reference_length = sum(abs(x.send - x.sstart) + 1 for x in blast_rows)
blast_matched_slen = blast_rows[0].slen if blast_rows else 1
holistic.blast_sseq_coverage = aligned_reference_length / blast_matched_slen

if blast_rows_statistics:
reference_name = max(blast_rows_statistics, key=blast_rows_statistics.get)
else:
reference_name = sorted(subtype_choices.keys())[0]
blast_rows_statistics = {}
for blast_row in blast_rows:
blast_rows_statistics[blast_row.sseqid] = blast_rows_statistics.get(blast_row.sseqid, 0) + abs(blast_row.qend - blast_row.qstart)

reference = subtype_choices[reference_name]
aligned_subtype = AlignedSequence(this=reference, reference=hxb2_reference)
if blast_rows_statistics:
reference_name = max(blast_rows_statistics, key=blast_rows_statistics.get)
else:
reference_name = sorted(subtype_choices.keys())[0]

holistic.inferred_subtype = reference_name
reference = subtype_choices[reference_name]
aligned_subtype = AlignedSequence(this=reference, reference=st.HXB2())

# convert ORF positions to appropriate subtype
forward_orfs, reverse_orfs, small_orfs = [
[
ExpectedORF.subtyped(aligned_subtype, n, s, e, delta) \
for (n, s, e, delta) in orfs
] \
for orfs in [hxb2_forward_orfs, hxb2_reverse_orfs, hxb2_small_orfs]
]

# convert PSI locus and RRE locus to appropriate subtype
psi_locus = [ReferenceIndex(x).mapto(aligned_subtype) for x in hxb2_psi_locus]
rre_locus = [ReferenceIndex(x).mapto(aligned_subtype) for x in hxb2_rre_locus]

holistic.orfs_start = min(forward_orfs, key=lambda e: e.start).start
holistic.orfs_end = max(forward_orfs, key=lambda e: e.end).end
clamp = lambda p: max(min(p, holistic.orfs_end), holistic.orfs_start)
aligned_reference_orfs_length = sum(abs(clamp(x.send + 1) - clamp(x.sstart)) for x in blast_rows)
blast_matched_orfs_slen = holistic.orfs_end - holistic.orfs_start
holistic.blast_sseq_orfs_coverage = aligned_reference_orfs_length / blast_matched_orfs_slen

reverse_sequence = SeqRecord.SeqRecord(Seq.reverse_complement(sequence.seq),
id = sequence.id + " [REVERSED]",
name = sequence.name
)

forward_aligned_sequence = AlignedSequence(this=sequence, reference=aligned_subtype.this)
reverse_aligned_sequence = forward_aligned_sequence.reverse()

forward_score = forward_aligned_sequence.alignment_score()
reverse_score = reverse_aligned_sequence.alignment_score()
if forward_score >= reverse_score:
aligned_sequence = forward_aligned_sequence
else:
print("Reversing sequence " + sequence.id + "; forward score "
+ str(forward_score) + "; reverse score " + str(reverse_score))
aligned_sequence = reverse_aligned_sequence
sequence = reverse_sequence

alignment = aligned_sequence.get_alignment()

try:
sequence_orfs, orf_errors = has_reading_frames(
aligned_sequence, False,
forward_orfs, error_bar)
sequence_errors.extend(orf_errors)

sequence_small_orfs, small_orf_errors = has_reading_frames(
aligned_sequence, True,
small_orfs, error_bar, reverse = False)
if include_small_orfs:
sequence_errors.extend(small_orf_errors)

# convert ORF positions to appropriate subtype
forward_orfs, reverse_orfs, small_orfs = [
[
ExpectedORF.subtyped(aligned_subtype, n, s, e, delta) \
for (n, s, e, delta) in orfs
] \
for orfs in [hxb2_forward_orfs, hxb2_reverse_orfs, hxb2_small_orfs]
]
hxb2_found_orfs = [FoundORF(
o.name,
o.start,
o.end,
o.subtype_start,
o.subtype_end,
o.orientation,
o.distance,
str(o.protein),
str(o.aminoacids),
str(sequence[o.start:o.end].seq),
) for o in sorted(sequence_orfs + sequence_small_orfs, key=lambda o: o.start)]

except Bio.Data.CodonTable.TranslationError as e:
log.error(e)
err = IntactnessError(sequence.id, INVALID_CODON, e.args[0])
sequence_errors.append(err)
hxb2_found_orfs = []

if include_packaging_signal:
missing_psi_locus = has_packaging_signal(alignment,
psi_locus,
const.PSI_ERROR_TOLERANCE)
if missing_psi_locus is not None:
sequence_errors.append(missing_psi_locus)

if include_rre:
missing_rre_locus = has_rev_response_element(alignment,
rre_locus,
const.RRE_ERROR_TOLERANCE
)
if missing_rre_locus is not None:
sequence_errors.append(missing_rre_locus)

if check_major_splice_donor_site:
mutated_splice_donor_site = has_mutated_major_splice_donor_site(
alignment,
ReferenceIndex(hxb2_msd_site_locus).mapto(aligned_subtype),
ReferenceIndex(hxb2_msd_site_locus + 1).mapto(aligned_subtype),
const.DEFAULT_MSD_SEQUENCE)
if mutated_splice_donor_site is not None:
sequence_errors.append(mutated_splice_donor_site)

if run_hypermut is not None:
hypermutated = isHypermut(holistic, alignment)

if hypermutated is not None:
sequence_errors.append(hypermutated)

if check_long_deletion is not None:
long_deletion = has_long_deletion(sequence, alignment)
if long_deletion:
sequence_errors.append(long_deletion)

if check_nonhiv:
error = is_nonhiv(holistic, sequence.id, len(sequence), blast_rows)
if error:
sequence_errors.append(error)

if check_scramble:
error = is_scrambled(sequence.id, blast_rows)
if error:
sequence_errors.append(error)

if check_internal_inversion:
error = contains_internal_inversion(sequence.id, blast_rows)
if error:
sequence_errors.append(error)

is_intact = len(sequence_errors) == 0

# add the small orf errors after the intactness check if not included
if not include_small_orfs:
sequence_errors.extend(small_orf_errors)

return is_intact, hxb2_found_orfs, sequence_errors

# convert PSI locus and RRE locus to appropriate subtype
psi_locus = [ReferenceIndex(x).mapto(aligned_subtype) for x in hxb2_psi_locus]
rre_locus = [ReferenceIndex(x).mapto(aligned_subtype) for x in hxb2_rre_locus]
with OutputWriter(working_dir, "csv" if output_csv else "json") as writer:

sequence_errors = []
blast_it = blast_iterate_inf(subtype, input_file, working_dir) if check_internal_inversion or check_nonhiv or check_scramble or 1 < len(subtype_choices) else iterate_empty_lists()
for (sequence, blast_rows) in with_blast_rows(blast_it, iterate_sequences(input_file)):
holistic = HolisticInfo()
holistic.qlen = len(sequence)
holistic.inferred_subtype = reference_name
holistic.blast_n_conseqs = len(blast_rows)

aligned_reference_length = sum(abs(x.send - x.sstart) + 1 for x in blast_rows)
blast_matched_slen = blast_rows[0].slen if blast_rows else 1
holistic.blast_sseq_coverage = aligned_reference_length / blast_matched_slen

holistic.orfs_start = min(forward_orfs, key=lambda e: e.start).start
holistic.orfs_end = max(forward_orfs, key=lambda e: e.end).end
clamp = lambda p: max(min(p, holistic.orfs_end), holistic.orfs_start)
aligned_reference_orfs_length = sum(abs(clamp(x.send + 1) - clamp(x.sstart)) for x in blast_rows)
blast_matched_orfs_slen = holistic.orfs_end - holistic.orfs_start
holistic.blast_sseq_orfs_coverage = aligned_reference_orfs_length / blast_matched_orfs_slen

reverse_sequence = SeqRecord.SeqRecord(Seq.reverse_complement(sequence.seq),
id = sequence.id + " [REVERSED]",
name = sequence.name
)

try:
forward_aligned_sequence = AlignedSequence(this=sequence, reference=aligned_subtype.this)
reverse_aligned_sequence = forward_aligned_sequence.reverse()
is_intact, hxb2_found_orfs, sequence_errors = analyse_single_sequence(holistic, sequence, blast_rows)
except wrappers.AlignmentFailure:
err = IntactnessError(sequence.id, ALIGNMENT_FAILED, "Alignment failed for this sequence. It probably contains invalid symbols.")
sequence_errors.append(err)
errors = [x.__dict__ for x in sequence_errors]
holistic = holistic.__dict__
writer.write(sequence, is_intact=False, orfs=[], errors=errors, holistic=holistic)
continue

forward_score = forward_aligned_sequence.alignment_score()
reverse_score = reverse_aligned_sequence.alignment_score()
if forward_score >= reverse_score:
aligned_sequence = forward_aligned_sequence
else:
print("Reversing sequence " + sequence.id + "; forward score "
+ str(forward_score) + "; reverse score " + str(reverse_score))
aligned_sequence = reverse_aligned_sequence
sequence = reverse_sequence

alignment = aligned_sequence.get_alignment()

try:
sequence_orfs, orf_errors = has_reading_frames(
aligned_sequence, False,
forward_orfs, error_bar)
sequence_errors.extend(orf_errors)

sequence_small_orfs, small_orf_errors = has_reading_frames(
aligned_sequence, True,
small_orfs, error_bar, reverse = False)
if include_small_orfs:
sequence_errors.extend(small_orf_errors)

hxb2_found_orfs = [FoundORF(
o.name,
o.start,
o.end,
o.subtype_start,
o.subtype_end,
o.orientation,
o.distance,
str(o.protein),
str(o.aminoacids),
str(sequence[o.start:o.end].seq),
) for o in sorted(sequence_orfs + sequence_small_orfs, key=lambda o: o.start)]

except Bio.Data.CodonTable.TranslationError as e:
log.error(e)
err = IntactnessError(sequence.id, INVALID_CODON, e.args[0])
sequence_errors.append(err)
is_intact = False
hxb2_found_orfs = []

if include_packaging_signal:
missing_psi_locus = has_packaging_signal(alignment,
psi_locus,
const.PSI_ERROR_TOLERANCE)
if missing_psi_locus is not None:
sequence_errors.append(missing_psi_locus)

if include_rre:
missing_rre_locus = has_rev_response_element(alignment,
rre_locus,
const.RRE_ERROR_TOLERANCE
)
if missing_rre_locus is not None:
sequence_errors.append(missing_rre_locus)

if check_major_splice_donor_site:
mutated_splice_donor_site = has_mutated_major_splice_donor_site(
alignment,
ReferenceIndex(hxb2_msd_site_locus).mapto(aligned_subtype),
ReferenceIndex(hxb2_msd_site_locus + 1).mapto(aligned_subtype),
const.DEFAULT_MSD_SEQUENCE)
if mutated_splice_donor_site is not None:
sequence_errors.append(mutated_splice_donor_site)

if run_hypermut is not None:
hypermutated = isHypermut(holistic, alignment)

if hypermutated is not None:
sequence_errors.append(hypermutated)

if check_long_deletion is not None:
long_deletion = has_long_deletion(sequence, alignment)
if long_deletion:
sequence_errors.append(long_deletion)

if check_nonhiv:
error = is_nonhiv(holistic, sequence.id, len(sequence), blast_rows)
if error:
sequence_errors.append(error)

if check_scramble:
error = is_scrambled(sequence.id, blast_rows)
if error:
sequence_errors.append(error)

if check_internal_inversion:
error = contains_internal_inversion(sequence.id, blast_rows)
if error:
sequence_errors.append(error)

is_intact = len(sequence_errors) == 0

# add the small orf errors after the intactness check if not included
if not include_small_orfs:
sequence_errors.extend(small_orf_errors)
sequence_errors = [err]

orfs = [x.__dict__ for x in hxb2_found_orfs]
errors = [x.__dict__ for x in sequence_errors]
holistic = holistic.__dict__
writer.write(sequence, is_intact, orfs, errors, holistic=holistic)


#/end def intact
#/end intact.py

0 comments on commit 47fde59

Please sign in to comment.