Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
make InvalidCodon error more descriptive
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Sep 25, 2023
1 parent 12cbef1 commit 2849537
Showing 1 changed file with 93 additions and 52 deletions.
145 changes: 93 additions & 52 deletions intact/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,19 @@
import dataclasses
from dataclasses import dataclass
from collections import Counter
import Bio
from Bio import Seq, SeqIO, SeqRecord
from Bio.Data import IUPACData
from scipy.stats import fisher_exact

import util.constants as const
import util.subtypes as st
import util.wrappers as wrappers
import util.log as log
import util.coordinates as coords
import util.detailed_aligner as detailed_aligner
from util.aligned_sequence import AlignedSequence
from util.reference_index import ReferenceIndex
from util.blastrow import BlastRow
from util.mapped_orf import MappedORF
from util.initialize_orf import initialize_orf
from util.get_query_aminoacids_table import get_query_aminoacids_table
from util.get_biggest_protein import get_biggest_protein
from util.find_orf import find_orf


Expand All @@ -35,7 +31,7 @@
NONHIV_ERROR = "NonHIV"
INTERNALINVERSION_ERROR = "InternalInversion"
ALIGNMENT_FAILED = "AlignmentFailed" # Happens when mafft process fails
INVALID_CODON = "InvalidCodon" # Happens when there is an invalid codon in ORF
UNKNOWN_NUCLEOTIDE = "UnknownNucleotide" # Happens when there is an invalid codon anywhere in a sequence

FRAMESHIFTINORF_ERROR = "FrameshiftInOrf"
MSDMUTATED_ERROR = "MajorSpliceDonorSiteMutated"
Expand Down Expand Up @@ -220,7 +216,7 @@ def isHypermut(holistic, aln):
aln -- AlignIO object where aln[0] is the reference and aln[1] is the query.
Internal Args:
newAln -- all non-indel positions (i.e. removes all sites
newAln -- all non-indel positions (i.e. removes all sites
where a '-' exists in either sequence.
ref_str -- joined string from newAln[0]
qry_str -- joined string from newAln[1]
Expand Down Expand Up @@ -281,15 +277,15 @@ def has_long_deletion(sequence, alignment):
#/end has_long_deletion


def has_mutated_major_splice_donor_site(alignment,
splice_donor_start_pos,
def has_mutated_major_splice_donor_site(alignment,
splice_donor_start_pos,
splice_donor_end_pos,
splice_donor_sequence):
"""
Determines whether the major splice donor site is mutated.
Keyword Args:
alignment -- multiple sequence alignment object containing the
alignment -- multiple sequence alignment object containing the
reference and query sequence.
splice_donor_start_pos -- first position of splice donor site
splice_donor_end_pos -- last position of splice donor site
Expand All @@ -298,66 +294,66 @@ def has_mutated_major_splice_donor_site(alignment,

sd_begin = [m.start() for m in re.finditer(r"[^-]",
str(alignment[0].seq))][splice_donor_start_pos]

sd_end = [m.start() for m in re.finditer(r"[^-]",
str(alignment[0].seq))][splice_donor_end_pos]

sd = alignment[1].seq[sd_begin:(sd_end + 1)]

# splice donor site is missing from sequence
if all([x == "-" for x in sd]):
return IntactnessError(
alignment[1].id, MSDMUTATED_ERROR,
"Query sequence has a missing splice donor site, "
"Query sequence has a missing splice donor site, "
+ "".join(sd.upper()) + "."
)

if sd.upper() != splice_donor_sequence.upper():

return IntactnessError(
alignment[1].id, MSDMUTATED_ERROR,
"Query sequence has a mutated splice donor site, "
"Query sequence has a mutated splice donor site, "
+ "".join(sd.upper()) + "."
)

return None









def has_packaging_signal(alignment, psi_locus, psi_tolerance):
"""
Determines presence and possible intactness of HIV
Determines presence and possible intactness of HIV
Packaging Signal Region.
Keyword Args:
alignment -- multiple sequence alignment object containing the
alignment -- multiple sequence alignment object containing the
reference and query sequence.
psi_locus -- tuple containing start and end coordinates of PSI wrt
the reference being used.
psi_tolerance -- number of deletions in query tolerated to be intact
Internal Args:
packaging_begin -- Aligned PSI start position.
packaging_end -- Aligned PSI end position.
query_start -- beginning position of query sequence.
query_psi -- extracted PSI region from query
query_psi_deletions -- number of deletions in query PSI region
Return:
PSINOTFOUND_ERROR -- IntactnessError denoting query does not encompass
the complete PSI region.
PSIDELETION_ERROR -- IntactnessError denoting query likely contains
defective PSI.
None -- Denotes intact PSI.
None -- Denotes intact PSI.
"""
packaging_begin = [m.start() for m in re.finditer(r"[^-]",
str(alignment[0].seq))][psi_locus[0]]
Expand All @@ -378,7 +374,7 @@ def has_packaging_signal(alignment, psi_locus, psi_tolerance):
if query_psi_deletions > psi_tolerance:
return IntactnessError(
alignment[1].id, PSIDELETION_ERROR,
"Query Sequence exceeds maximum deletion tolerance in PSI. " +
"Query Sequence exceeds maximum deletion tolerance in PSI. " +
"Contains " + str(query_psi_deletions) + " deletions with max "
+ "tolerance of " + str(psi_tolerance) + " deletions."
)
Expand All @@ -388,32 +384,32 @@ def has_packaging_signal(alignment, psi_locus, psi_tolerance):

def has_rev_response_element(alignment, rre_locus, rre_tolerance):
"""
Determines presence and possible intactness of HIV
Determines presence and possible intactness of HIV
Packaging Signal Region.
Keyword Args:
alignment -- multiple sequence alignment object containing the
alignment -- multiple sequence alignment object containing the
reference and query sequence.
rre_locus -- tuple containing start and end coordinates of RRE wrt
the reference being used.
RRE_tolerance -- number of deletions in query tolerated to be intact
Internal Args:
rre_begin -- Aligned RRE start position.
rre_end -- Aligned RRE end position.
query_rre -- extracted RRE region from query
query_rre_deletions -- number of deletions in query RRE region
Return:
RREDELETION_ERROR -- IntactnessError denoting query likely contains
defective RRE.
None -- Denotes intact RRE.
None -- Denotes intact RRE.
"""
rre_begin = [m.start() for m in re.finditer(r"[^-]",
str(alignment[0].seq))][rre_locus[0]]
Expand All @@ -424,7 +420,7 @@ def has_rev_response_element(alignment, rre_locus, rre_tolerance):
if query_rre_deletions > rre_tolerance:
return IntactnessError(
alignment[1].id, RREDELETION_ERROR,
"Query Sequence exceeds maximum deletion tolerance in RRE. " +
"Query Sequence exceeds maximum deletion tolerance in RRE. " +
"Contains " + str(query_rre_deletions) + " deletions with max "
+ "tolerance of " + str(rre_tolerance) + " deletions."
)
Expand Down Expand Up @@ -529,8 +525,8 @@ def get_indel_impact(alignment):
errors.append(IntactnessError(
sequence.id, FRAMESHIFTINORF_ERROR,
("Smaller " if is_small else "")
+ "ORF " + str(e.name) + " at " + str(e.start)
+ "-" + str(e.end)
+ "ORF " + str(e.name) + " at " + str(e.start)
+ "-" + str(e.end)
+ " contains out of frame indels that impact " + str(impacted_by_indels)
+ " positions."
))
Expand Down Expand Up @@ -653,6 +649,44 @@ def read_hxb2_orfs(aligned_subtype, orfs):
yield initialize_orf(aligned_subtype, name, start, end, delta)


VALID_DNA_CHARACTERS = IUPACData.ambiguous_dna_letters.upper()

def find_invalid_subsequences(sequence):
invalid_subsequences = []
current_subsequence = []
start_position = None

for idx, symbol in enumerate(sequence.seq):
if symbol not in VALID_DNA_CHARACTERS:
if start_position is None:
start_position = idx
current_subsequence.append(symbol)
else:
if current_subsequence:
end_position = idx - 1
invalid_subsequences.append(
{
'sequence': ''.join(current_subsequence),
'start': start_position,
'end': end_position
}
)
current_subsequence = []
start_position = None

if current_subsequence:
end_position = len(sequence.seq) - 1
invalid_subsequences.append(
{
'sequence': ''.join(current_subsequence),
'start': start_position,
'end': end_position
}
)

return invalid_subsequences


def intact( working_dir,
input_file,
subtype,
Expand Down Expand Up @@ -692,13 +726,20 @@ def intact( working_dir,
def analyse_single_sequence(holistic, sequence, blast_rows):
sequence_errors = []

e = get_query_aminoacids_table(sequence)
if isinstance(e, Exception):
log.error(e)
err = IntactnessError(sequence.id, INVALID_CODON, e.args[0])
invalid_subsequences = find_invalid_subsequences(sequence)
if invalid_subsequences:
error_details = ', '.join(
f"{subseq['sequence']} (start: {subseq['start']}, end: {subseq['end']})" \
for subseq in invalid_subsequences
)

err = IntactnessError(sequence.id, UNKNOWN_NUCLEOTIDE,
f'Sequence contains invalid nucleotides: {error_details}')
sequence_errors.append(err)
sequence = SeqRecord.SeqRecord(Seq.Seq(''.join(x for x in sequence.seq if x in "ACTGN")),
id = sequence.id, name = sequence.name)

sequence = SeqRecord.SeqRecord(
Seq.Seq(''.join(x for x in sequence.seq if x in VALID_DNA_CHARACTERS)),
id=sequence.id, name=sequence.name)

holistic.qlen = len(sequence)
holistic.blast_n_conseqs = len(blast_rows)
Expand Down

0 comments on commit 2849537

Please sign in to comment.