Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
Add another output file: holistic.csv
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Sep 18, 2023
1 parent 76f0401 commit 022ee3d
Show file tree
Hide file tree
Showing 8 changed files with 526 additions and 16 deletions.
51 changes: 41 additions & 10 deletions intact/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,15 @@ class FoundORF:
aminoacids: str
nucleotides: str

@dataclass
class HolisticInfo:
    """Per-sequence summary statistics emitted to the holistic output file.

    Every field defaults to None and is filled in piecemeal by the analysis
    steps (isHypermut, is_nonhiv, and the main intact() loop) before the
    instance's __dict__ is handed to the writer.
    """

    # 1 - p-value of the APOBEC3G/F hypermutation test (set by isHypermut).
    # NOTE(review): "probablility" is misspelled, but the name is part of the
    # CSV/JSON column schema, so renaming it would break downstream consumers.
    hypermutation_probablility: float = None
    # Name of the reference sequence the query was matched against.
    inferred_subtype: str = None
    # Number of query nucleotides matched to a known reference sequence.
    # NOTE(review): is_nonhiv assigns holistic.total_matched_qlen instead of
    # this field, so it appears to stay None in the output — confirm intended.
    blast_matched_qlen: int = None
    # Percentage of reference sequence covered by the query sequence.
    blast_sseq_coverage: float = None
    # Percentage of the query sequence covered by reference sequence.
    blast_qseq_coverage: float = None
    # Number of blast conseqs in the resulting match.
    blast_n_conseqs: int = None


def iterate_values_from_csv(file_path):
with open(file_path, 'r') as csv_file:
Expand Down Expand Up @@ -201,12 +210,16 @@ def is_scrambled(seqid, blast_rows):
f"Sequence is {direction}-scrambled.")


def is_nonhiv(seqid, seqlen, blast_rows):
def is_nonhiv(holistic, seqid, seqlen, blast_rows):
aligned_length = sum(abs(x.qend - x.qstart) + 1 for x in blast_rows)
total_length = blast_rows[0].qlen if blast_rows else 1
ratio = aligned_length / total_length
holistic.total_matched_qlen = blast_rows[0].qlen if blast_rows else 1
holistic.blast_qseq_coverage = aligned_length / holistic.total_matched_qlen

if ratio < 0.8 and seqlen > total_length * 0.6:
aligned_reference_length = sum(abs(x.qend - x.qstart) + 1 for x in blast_rows)
total_matched_slen = blast_rows[0].qlen if blast_rows else 1
holistic.blast_sseq_coverage = aligned_reference_length / total_matched_slen

if holistic.blast_qseq_coverage < 0.8 and seqlen > holistic.total_matched_qlen * 0.6:
return IntactnessError(seqid, NONHIV_ERROR,
"Sequence contains unrecognized parts. "
"It is probably a Human/HIV Chimera sequence.")
Expand All @@ -229,7 +242,7 @@ def _getPositions(pattern, string):
return set(m.start() for m in re.finditer(pattern, string))
#/end _getPositions

def isHypermut(aln):
def isHypermut(holistic, aln):
"""
APOBEC3G/F hypermutation scan and test based on Rose and Korber, Bioinformatics (2000).
Briefly, scans reference for APOBEC possible signatures and non-signatures and performs
Expand Down Expand Up @@ -269,6 +282,8 @@ def isHypermut(aln):
alternative = 'greater'
)

holistic.hypermutation_probablility = 1 - pval

if pval < 0.05:
return IntactnessError(
aln[1].id, HYPERMUTATION_ERROR,
Expand Down Expand Up @@ -698,6 +713,7 @@ def __init__(self, working_dir, fmt):
self.intact_path = os.path.join(working_dir, "intact.fasta")
self.non_intact_path = os.path.join(working_dir, "nonintact.fasta")
self.orf_path = os.path.join(working_dir, f"orfs.{fmt}")
self.holistic_path = os.path.join(working_dir, f"holistic.{fmt}")
self.error_path = os.path.join(working_dir, f"errors.{fmt}")

if fmt not in ("json", "csv"):
Expand All @@ -708,15 +724,20 @@ def __enter__(self, *args):
self.intact_file = open(self.intact_path, 'w')
self.nonintact_file = open(self.non_intact_path, 'w')
self.orfs_file = open(self.orf_path, 'w')
self.holistic_file = open(self.holistic_path, 'w')
self.errors_file = open(self.error_path, 'w')

if self.fmt == "json":
self.orfs = {}
self.holistic = {}
self.errors = {}
elif self.fmt == "csv":
self.orfs_writer = csv.writer(self.orfs_file)
self.orfs_header = ['seqid'] + [field.name for field in dataclasses.fields(CandidateORF)]
self.orfs_writer.writerow(self.orfs_header)
self.holistic_writer = csv.writer(self.holistic_file)
self.holistic_header = ['seqid'] + [field.name for field in dataclasses.fields(HolisticInfo)]
self.holistic_writer.writerow(self.holistic_header)
self.errors_writer = csv.writer(self.errors_file)
self.errors_header = [field.name for field in dataclasses.fields(IntactnessError)]
self.errors_writer.writerow(self.errors_header)
Expand All @@ -727,32 +748,37 @@ def __enter__(self, *args):
def __exit__(self, *args):
    """Finalize all output files: dump buffered JSON, close handles, log paths.

    In CSV mode rows were already written incrementally by write(); in JSON
    mode the per-sequence dicts accumulated in write() are serialized only
    here, when the context manager exits.
    """
    if self.fmt == "json":
        # JSON payloads are buffered in dicts and written out in one shot.
        json.dump(self.orfs, self.orfs_file, indent=4)
        json.dump(self.holistic, self.holistic_file, indent=4)
        json.dump(self.errors, self.errors_file, indent=4)

    self.intact_file.close()
    self.nonintact_file.close()
    self.orfs_file.close()
    self.holistic_file.close()
    self.errors_file.close()

    # Tell the user where each artifact landed.
    log.info('Intact sequences written to ' + self.intact_path)
    log.info('Non-intact sequences written to ' + self.non_intact_path)
    log.info('ORFs for all sequences written to ' + self.orf_path)
    log.info('Holistic info for all sequences written to ' + self.holistic_path)
    log.info('Intactness error information written to ' + self.error_path)
    # blast.csv is only produced on some runs; mention it only if present.
    if os.path.exists(os.path.join(self.working_dir, 'blast.csv')):
        log.info('Blast output written to ' + os.path.join(self.working_dir, 'blast.csv'))


def write(self, sequence, is_intact, orfs, errors):
def write(self, sequence, is_intact, orfs, errors, holistic):
fasta_file = self.intact_file if is_intact else self.nonintact_file
SeqIO.write([sequence], fasta_file, "fasta")
fasta_file.flush()

if self.fmt == "json":
self.orfs[sequence.id] = orfs
self.holistic[sequence.id] = holistic
self.errors[sequence.id] = errors
elif self.fmt == "csv":
for orf in orfs:
self.orfs_writer.writerow([(sequence.id if key == 'seqid' else orf[key]) for key in self.orfs_header])
self.holistic_writer.writerow([(sequence.id if key == 'seqid' else holistic[key]) for key in self.holistic_header])
for error in errors:
self.errors_writer.writerow([error[key] for key in self.errors_header])

Expand Down Expand Up @@ -831,6 +857,9 @@ def intact( working_dir,
rre_locus = [pos_mapping[x] for x in hxb2_rre_locus]

sequence_errors = []
holistic = HolisticInfo()
holistic.inferred_subtype = reference_name
holistic.blast_n_conseqs = len(blast_rows)

reverse_sequence = SeqRecord.SeqRecord(Seq.reverse_complement(sequence.seq),
id = sequence.id + " [REVERSED]",
Expand All @@ -844,7 +873,8 @@ def intact( working_dir,
err = IntactnessError(sequence.id, ALIGNMENT_FAILED, "Alignment failed for this sequence. It probably contains invalid symbols.")
sequence_errors.append(err)
errors = [x.__dict__ for x in sequence_errors]
writer.write(sequence, is_intact=False, orfs=[], errors=errors)
holistic = holistic.__dict__
writer.write(sequence, is_intact=False, orfs=[], errors=errors, holistic=holistic)
continue

forward_score = alignment_score(alignment)
Expand Down Expand Up @@ -902,7 +932,7 @@ def intact( working_dir,
sequence_errors.append(mutated_splice_donor_site)

if run_hypermut is not None:
hypermutated = isHypermut(alignment)
hypermutated = isHypermut(holistic, alignment)

if hypermutated is not None:
sequence_errors.append(hypermutated)
Expand All @@ -913,7 +943,7 @@ def intact( working_dir,
sequence_errors.append(long_deletion)

if check_nonhiv:
error = is_nonhiv(sequence.id, len(sequence), blast_rows)
error = is_nonhiv(holistic, sequence.id, len(sequence), blast_rows)
if error:
sequence_errors.append(error)

Expand All @@ -935,6 +965,7 @@ def intact( working_dir,

orfs = [x.__dict__ for x in hxb2_found_orfs]
errors = [x.__dict__ for x in sequence_errors]
writer.write(sequence, is_intact, orfs, errors)
holistic = holistic.__dict__
writer.write(sequence, is_intact, orfs, errors, holistic=holistic)
#/end def intact
#/end intact.py
42 changes: 42 additions & 0 deletions tests/expected-results-large-csv/holistic.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
seqid,hypermutation_probablility,inferred_subtype,blast_matched_qlen,blast_sseq_coverage,blast_qseq_coverage,blast_n_conseqs
KX505501.1,0.7087072014754221,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.2158237356034052,1.2158237356034052,4
MN691959,0.19593905853945925,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1086063415148004,1.1086063415148004,3
MN692074,0.36378645339477633,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1728099569171853,1.1728099569171853,4
MN692145,0.1661041079701131,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1271545051088863,1.1271545051088863,3
MN090335,0.1754017863888554,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0603153600176425,1.0603153600176425,3
MN090376,0.026007919521734202,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0604340567612687,1.0604340567612687,3
MK115581.1,0.6897199265079494,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0046340179041602,1.0046340179041602,2
MK115690.1,0.05065930954004094,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9949427185468056,0.9949427185468056,2
MK115571.1,0.8012585672082311,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0113902490951672,1.0113902490951672,2
MK115514.1,0.6458974386368621,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0173736943082499,1.0173736943082499,2
MK115488.1,0.6511896911074662,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0325262392185388,1.0325262392185388,6
MK115030.1,0.031598631869680704,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0655270655270654,1.0655270655270654,3
MK115498.1,0.8339748776671196,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0080329774865235,1.0080329774865235,2
MK115211.1,0.11689558806708,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0598981399468557,1.0598981399468557,3
MK115158.1,0.002572269807584293,Ref.47_BF.ES.08.P1942.GQ372987,,0.9699223449633599,0.9699223449633599,1
MK114705.1,0.14449377496074622,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.122622463075125,1.122622463075125,6
MK114856.1,1.0,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0812493405085997,1.0812493405085997,4
MK115009.1,1.0,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0590854784403172,1.0590854784403172,3
MK115387.1,0.5412311092694289,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.040936952714536,1.040936952714536,2
MK115491.1,0.8951015182445495,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0299299511780937,1.0299299511780937,2
MK116110.1,0.07021438897893317,Ref.B.TH.90.BK132.AY173951,,0.9972119995539199,0.9972119995539199,3
MK115527.1,0.7689834393883834,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0056956017297753,1.0056956017297753,2
MK114997.1,0.054959132555391754,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0516841524019878,1.0516841524019878,2
MK115518.1,0.6385326595592609,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9996854356715948,0.9996854356715948,3
MK115065.1,0.033517722768753644,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.069459518124593,1.069459518124593,6
MK115464.1,1.0,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9893407844354756,0.9893407844354756,2
MK115530.1,0.5789377103398377,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9992665549036044,0.9992665549036044,2
MK115520.1,0.5200353682902832,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.987902805297737,0.987902805297737,3
MK115503.1,0.4263025132504157,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9953207861079338,0.9953207861079338,2
MK115570.1,0.738578434638724,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0057986294148655,1.0057986294148655,2
MK115509.1,0.7866198309713798,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0197797498128942,1.0197797498128942,2
MK115702.1,0.14401391767451666,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0596834469114091,1.0596834469114091,4
MK115095.1,1.0,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.060085367188355,1.060085367188355,2
MK115490.1,0.8863248655310947,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0204343639670483,1.0204343639670483,3
MK115576.1,0.818189227062389,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0342110943233327,1.0342110943233327,3
OQ092466,0.3876036547663967,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1192442700805285,1.1192442700805285,3
OQ092463,0.21628713708846803,Ref.B.TH.90.BK132.AY173951,,0.9884435190005205,0.9884435190005205,2
OQ092465,0.02412789935966586,Ref.28_BF.BR.99.BREPM12817.DQ085874,,0.9620043482762191,0.9620043482762191,2
OQ092462,0.10777665573070194,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1301214741610048,1.1301214741610048,3
OQ092464,0.006887768010151674,Ref.28_BF.BR.99.BREPM12817.DQ085874,,0.9678735872750105,0.9678735872750105,2
OQ092467,0.6416537859942263,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0962157809983897,1.0962157809983897,3
Loading

0 comments on commit 022ee3d

Please sign in to comment.