From 022ee3d87d100e7b12c1bd5e2cc394a2ef350d7c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 18 Sep 2023 11:13:15 -0700 Subject: [PATCH] Add another output file: holistic.csv --- intact/intact.py | 51 ++- tests/expected-results-large-csv/holistic.csv | 42 ++ tests/expected-results-large/holistic.json | 371 ++++++++++++++++++ .../expected-results-single-csv/holistic.csv | 2 + tests/expected-results-single/holistic.json | 11 + tests/expected-results-small-csv/holistic.csv | 6 + tests/expected-results-small/holistic.json | 47 +++ tests/test_nonhiv.py | 12 +- 8 files changed, 526 insertions(+), 16 deletions(-) create mode 100644 tests/expected-results-large-csv/holistic.csv create mode 100644 tests/expected-results-large/holistic.json create mode 100644 tests/expected-results-single-csv/holistic.csv create mode 100644 tests/expected-results-single/holistic.json create mode 100644 tests/expected-results-small-csv/holistic.csv create mode 100644 tests/expected-results-small/holistic.json diff --git a/intact/intact.py b/intact/intact.py index 204a9cf..5b8b0f3 100644 --- a/intact/intact.py +++ b/intact/intact.py @@ -98,6 +98,15 @@ class FoundORF: aminoacids: str nucleotides: str +@dataclass +class HolisticInfo: + hypermutation_probablility: float = dataclasses.field(default=None) + inferred_subtype: str = dataclasses.field(default=None) + blast_matched_qlen: int = dataclasses.field(default=None) # number of query nucleotides matched to a known reference sequence + blast_sseq_coverage: float = dataclasses.field(default=None) # percentage of reference sequence covered by the query sequence + blast_qseq_coverage: float = dataclasses.field(default=None) # percentage of the query sequence covered by reference sequence + blast_n_conseqs: int = dataclasses.field(default=None) # number of blast conseqs in the resulting match + def iterate_values_from_csv(file_path): with open(file_path, 'r') as csv_file: @@ -201,12 +210,16 @@ def is_scrambled(seqid, blast_rows): f"Sequence is {direction}-scrambled.") -def is_nonhiv(seqid, seqlen, blast_rows): +def is_nonhiv(holistic, seqid, seqlen, blast_rows): aligned_length = sum(abs(x.qend - x.qstart) + 1 for x in blast_rows) - total_length = blast_rows[0].qlen if blast_rows else 1 - ratio = aligned_length / total_length + holistic.total_matched_qlen = blast_rows[0].qlen if blast_rows else 1 + holistic.blast_qseq_coverage = aligned_length / holistic.total_matched_qlen - if ratio < 0.8 and seqlen > total_length * 0.6: + aligned_reference_length = sum(abs(x.qend - x.qstart) + 1 for x in blast_rows) + total_matched_slen = blast_rows[0].qlen if blast_rows else 1 + holistic.blast_sseq_coverage = aligned_reference_length / total_matched_slen + + if holistic.blast_qseq_coverage < 0.8 and seqlen > holistic.total_matched_qlen * 0.6: return IntactnessError(seqid, NONHIV_ERROR, "Sequence contains unrecognized parts. " "It is probably a Human/HIV Chimera sequence.") @@ -229,7 +242,7 @@ def _getPositions(pattern, string): return set(m.start() for m in re.finditer(pattern, string)) #/end _getPositions -def isHypermut(aln): +def isHypermut(holistic, aln): """ APOBEC3G/F hypermutation scan and test based on Rose and Korber, Bioinformatics (2000). Briefly, scans reference for APOBEC possible signatures and non-signatures and performs @@ -269,6 +282,8 @@ def isHypermut(aln): alternative = 'greater' ) + holistic.hypermutation_probablility = 1 - pval + if pval < 0.05: return IntactnessError( aln[1].id, HYPERMUTATION_ERROR, @@ -698,6 +713,7 @@ def __init__(self, working_dir, fmt): self.intact_path = os.path.join(working_dir, "intact.fasta") self.non_intact_path = os.path.join(working_dir, "nonintact.fasta") self.orf_path = os.path.join(working_dir, f"orfs.{fmt}") + self.holistic_path = os.path.join(working_dir, f"holistic.{fmt}") self.error_path = os.path.join(working_dir, f"errors.{fmt}") if fmt not in ("json", "csv"): @@ -708,15 +724,20 @@ def __enter__(self, *args): self.intact_file = open(self.intact_path, 'w') self.nonintact_file = open(self.non_intact_path, 'w') self.orfs_file = open(self.orf_path, 'w') + self.holistic_file = open(self.holistic_path, 'w') self.errors_file = open(self.error_path, 'w') if self.fmt == "json": self.orfs = {} + self.holistic = {} self.errors = {} elif self.fmt == "csv": self.orfs_writer = csv.writer(self.orfs_file) self.orfs_header = ['seqid'] + [field.name for field in dataclasses.fields(CandidateORF)] self.orfs_writer.writerow(self.orfs_header) + self.holistic_writer = csv.writer(self.holistic_file) + self.holistic_header = ['seqid'] + [field.name for field in dataclasses.fields(HolisticInfo)] + self.holistic_writer.writerow(self.holistic_header) self.errors_writer = csv.writer(self.errors_file) self.errors_header = [field.name for field in dataclasses.fields(IntactnessError)] self.errors_writer.writerow(self.errors_header) @@ -727,32 +748,37 @@ def __enter__(self, *args): def __exit__(self, *args): if self.fmt == "json": json.dump(self.orfs, self.orfs_file, indent=4) + json.dump(self.holistic, self.holistic_file, indent=4) json.dump(self.errors, self.errors_file, indent=4) self.intact_file.close() self.nonintact_file.close() self.orfs_file.close() + self.holistic_file.close() self.errors_file.close() log.info('Intact sequences written to ' + self.intact_path) log.info('Non-intact sequences written to ' + self.non_intact_path) log.info('ORFs for all sequences written to ' + self.orf_path) + log.info('Holistic info for all sequences written to ' + self.holistic_path) log.info('Intactness error information written to ' + self.error_path) if os.path.exists(os.path.join(self.working_dir, 'blast.csv')): log.info('Blast output written to ' + os.path.join(self.working_dir, 'blast.csv')) - def write(self, sequence, is_intact, orfs, errors): + def write(self, sequence, is_intact, orfs, errors, holistic): fasta_file = self.intact_file if is_intact else self.nonintact_file SeqIO.write([sequence], fasta_file, "fasta") fasta_file.flush() if self.fmt == "json": self.orfs[sequence.id] = orfs + self.holistic[sequence.id] = holistic self.errors[sequence.id] = errors elif self.fmt == "csv": for orf in orfs: self.orfs_writer.writerow([(sequence.id if key == 'seqid' else orf[key]) for key in self.orfs_header]) + self.holistic_writer.writerow([(sequence.id if key == 'seqid' else holistic[key]) for key in self.holistic_header]) for error in errors: self.errors_writer.writerow([error[key] for key in self.errors_header]) @@ -831,6 +857,9 @@ def intact( working_dir, rre_locus = [pos_mapping[x] for x in hxb2_rre_locus] sequence_errors = [] + holistic = HolisticInfo() + holistic.inferred_subtype = reference_name + holistic.blast_n_conseqs = len(blast_rows) reverse_sequence = SeqRecord.SeqRecord(Seq.reverse_complement(sequence.seq), id = sequence.id + " [REVERSED]", @@ -844,7 +873,8 @@ def intact( working_dir, err = IntactnessError(sequence.id, ALIGNMENT_FAILED, "Alignment failed for this sequence. It probably contains invalid symbols.") sequence_errors.append(err) errors = [x.__dict__ for x in sequence_errors] - writer.write(sequence, is_intact=False, orfs=[], errors=errors) + holistic = holistic.__dict__ + writer.write(sequence, is_intact=False, orfs=[], errors=errors, holistic=holistic) continue forward_score = alignment_score(alignment) @@ -902,7 +932,7 @@ def intact( working_dir, sequence_errors.append(mutated_splice_donor_site) if run_hypermut is not None: - hypermutated = isHypermut(alignment) + hypermutated = isHypermut(holistic, alignment) if hypermutated is not None: sequence_errors.append(hypermutated) @@ -913,7 +943,7 @@ def intact( working_dir, sequence_errors.append(long_deletion) if check_nonhiv: - error = is_nonhiv(sequence.id, len(sequence), blast_rows) + error = is_nonhiv(holistic, sequence.id, len(sequence), blast_rows) if error: sequence_errors.append(error) @@ -935,6 +965,7 @@ def intact( working_dir, orfs = [x.__dict__ for x in hxb2_found_orfs] errors = [x.__dict__ for x in sequence_errors] - writer.write(sequence, is_intact, orfs, errors) + holistic = holistic.__dict__ + writer.write(sequence, is_intact, orfs, errors, holistic=holistic) #/end def intact #/end intact.py diff --git a/tests/expected-results-large-csv/holistic.csv b/tests/expected-results-large-csv/holistic.csv new file mode 100644 index 0000000..d49fe7f --- /dev/null +++ b/tests/expected-results-large-csv/holistic.csv @@ -0,0 +1,42 @@ +seqid,hypermutation_probablility,inferred_subtype,blast_matched_qlen,blast_sseq_coverage,blast_qseq_coverage,blast_n_conseqs +KX505501.1,0.7087072014754221,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.2158237356034052,1.2158237356034052,4 +MN691959,0.19593905853945925,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1086063415148004,1.1086063415148004,3 +MN692074,0.36378645339477633,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1728099569171853,1.1728099569171853,4 +MN692145,0.1661041079701131,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1271545051088863,1.1271545051088863,3 +MN090335,0.1754017863888554,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0603153600176425,1.0603153600176425,3 +MN090376,0.026007919521734202,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0604340567612687,1.0604340567612687,3 +MK115581.1,0.6897199265079494,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0046340179041602,1.0046340179041602,2 +MK115690.1,0.05065930954004094,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9949427185468056,0.9949427185468056,2 +MK115571.1,0.8012585672082311,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0113902490951672,1.0113902490951672,2 +MK115514.1,0.6458974386368621,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0173736943082499,1.0173736943082499,2 +MK115488.1,0.6511896911074662,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0325262392185388,1.0325262392185388,6 +MK115030.1,0.031598631869680704,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0655270655270654,1.0655270655270654,3 +MK115498.1,0.8339748776671196,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0080329774865235,1.0080329774865235,2 +MK115211.1,0.11689558806708,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0598981399468557,1.0598981399468557,3 +MK115158.1,0.002572269807584293,Ref.47_BF.ES.08.P1942.GQ372987,,0.9699223449633599,0.9699223449633599,1 +MK114705.1,0.14449377496074622,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.122622463075125,1.122622463075125,6 +MK114856.1,1.0,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0812493405085997,1.0812493405085997,4 +MK115009.1,1.0,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0590854784403172,1.0590854784403172,3 +MK115387.1,0.5412311092694289,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.040936952714536,1.040936952714536,2 +MK115491.1,0.8951015182445495,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0299299511780937,1.0299299511780937,2 +MK116110.1,0.07021438897893317,Ref.B.TH.90.BK132.AY173951,,0.9972119995539199,0.9972119995539199,3 +MK115527.1,0.7689834393883834,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0056956017297753,1.0056956017297753,2 +MK114997.1,0.054959132555391754,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0516841524019878,1.0516841524019878,2 +MK115518.1,0.6385326595592609,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9996854356715948,0.9996854356715948,3 +MK115065.1,0.033517722768753644,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.069459518124593,1.069459518124593,6 +MK115464.1,1.0,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9893407844354756,0.9893407844354756,2 +MK115530.1,0.5789377103398377,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9992665549036044,0.9992665549036044,2 +MK115520.1,0.5200353682902832,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.987902805297737,0.987902805297737,3 +MK115503.1,0.4263025132504157,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,0.9953207861079338,0.9953207861079338,2 +MK115570.1,0.738578434638724,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0057986294148655,1.0057986294148655,2 +MK115509.1,0.7866198309713798,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0197797498128942,1.0197797498128942,2 +MK115702.1,0.14401391767451666,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0596834469114091,1.0596834469114091,4 +MK115095.1,1.0,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.060085367188355,1.060085367188355,2 +MK115490.1,0.8863248655310947,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0204343639670483,1.0204343639670483,3 +MK115576.1,0.818189227062389,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0342110943233327,1.0342110943233327,3 +OQ092466,0.3876036547663967,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1192442700805285,1.1192442700805285,3 +OQ092463,0.21628713708846803,Ref.B.TH.90.BK132.AY173951,,0.9884435190005205,0.9884435190005205,2 +OQ092465,0.02412789935966586,Ref.28_BF.BR.99.BREPM12817.DQ085874,,0.9620043482762191,0.9620043482762191,2 +OQ092462,0.10777665573070194,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1301214741610048,1.1301214741610048,3 +OQ092464,0.006887768010151674,Ref.28_BF.BR.99.BREPM12817.DQ085874,,0.9678735872750105,0.9678735872750105,2 +OQ092467,0.6416537859942263,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0962157809983897,1.0962157809983897,3 diff --git a/tests/expected-results-large/holistic.json b/tests/expected-results-large/holistic.json new file mode 100644 index 0000000..8abd52c --- /dev/null +++ b/tests/expected-results-large/holistic.json @@ -0,0 +1,371 @@ +{ + "KX505501.1": { + "hypermutation_probablility": 0.7087072014754221, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.2158237356034052, + "blast_qseq_coverage": 1.2158237356034052, + "blast_n_conseqs": 4, + "total_matched_qlen": 1997 + }, + "MN691959": { + "hypermutation_probablility": 0.19593905853945925, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1086063415148004, + "blast_qseq_coverage": 1.1086063415148004, + "blast_n_conseqs": 3, + "total_matched_qlen": 9493 + }, + "MN692074": { + "hypermutation_probablility": 0.36378645339477633, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1728099569171853, + "blast_qseq_coverage": 1.1728099569171853, + "blast_n_conseqs": 4, + "total_matched_qlen": 4178 + }, + "MN692145": { + "hypermutation_probablility": 0.1661041079701131, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1271545051088863, + "blast_qseq_coverage": 1.1271545051088863, + "blast_n_conseqs": 3, + "total_matched_qlen": 9689 + }, + "MN090335": { + "hypermutation_probablility": 0.1754017863888554, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0603153600176425, + "blast_qseq_coverage": 1.0603153600176425, + "blast_n_conseqs": 3, + "total_matched_qlen": 9069 + }, + "MN090376": { + "hypermutation_probablility": 0.026007919521734202, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0604340567612687, + "blast_qseq_coverage": 1.0604340567612687, + "blast_n_conseqs": 3, + "total_matched_qlen": 8985 + }, + "MK115581.1": { + "hypermutation_probablility": 0.6897199265079494, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0046340179041602, + "blast_qseq_coverage": 1.0046340179041602, + "blast_n_conseqs": 2, + "total_matched_qlen": 9495 + }, + "MK115690.1": { + "hypermutation_probablility": 0.05065930954004094, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9949427185468056, + "blast_qseq_coverage": 0.9949427185468056, + "blast_n_conseqs": 2, + "total_matched_qlen": 9689 + }, + "MK115571.1": { + "hypermutation_probablility": 0.8012585672082311, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0113902490951672, + "blast_qseq_coverage": 1.0113902490951672, + "blast_n_conseqs": 2, + "total_matched_qlen": 9394 + }, + "MK115514.1": { + "hypermutation_probablility": 0.6458974386368621, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0173736943082499, + "blast_qseq_coverage": 1.0173736943082499, + "blast_n_conseqs": 2, + "total_matched_qlen": 9382 + }, + "MK115488.1": { + "hypermutation_probablility": 0.6511896911074662, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0325262392185388, + "blast_qseq_coverage": 1.0325262392185388, + "blast_n_conseqs": 6, + "total_matched_qlen": 9623 + }, + "MK115030.1": { + "hypermutation_probablility": 0.031598631869680704, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0655270655270654, + "blast_qseq_coverage": 1.0655270655270654, + "blast_n_conseqs": 3, + "total_matched_qlen": 9126 + }, + "MK115498.1": { + "hypermutation_probablility": 0.8339748776671196, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0080329774865235, + "blast_qseq_coverage": 1.0080329774865235, + "blast_n_conseqs": 2, + "total_matched_qlen": 9461 + }, + "MK115211.1": { + "hypermutation_probablility": 0.11689558806708, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0598981399468557, + "blast_qseq_coverage": 1.0598981399468557, + "blast_n_conseqs": 3, + "total_matched_qlen": 9032 + }, + "MK115158.1": { + "hypermutation_probablility": 0.002572269807584293, + "inferred_subtype": "Ref.47_BF.ES.08.P1942.GQ372987", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9699223449633599, + "blast_qseq_coverage": 0.9699223449633599, + "blast_n_conseqs": 1, + "total_matched_qlen": 9143 + }, + "MK114705.1": { + "hypermutation_probablility": 0.14449377496074622, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.122622463075125, + "blast_qseq_coverage": 1.122622463075125, + "blast_n_conseqs": 6, + "total_matched_qlen": 9411 + }, + "MK114856.1": { + "hypermutation_probablility": 1.0, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0812493405085997, + "blast_qseq_coverage": 1.0812493405085997, + "blast_n_conseqs": 4, + "total_matched_qlen": 9477 + }, + "MK115009.1": { + "hypermutation_probablility": 1.0, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0590854784403172, + "blast_qseq_coverage": 1.0590854784403172, + "blast_n_conseqs": 3, + "total_matched_qlen": 9207 + }, + "MK115387.1": { + "hypermutation_probablility": 0.5412311092694289, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.040936952714536, + "blast_qseq_coverage": 1.040936952714536, + "blast_n_conseqs": 2, + "total_matched_qlen": 9136 + }, + "MK115491.1": { + "hypermutation_probablility": 0.8951015182445495, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0299299511780937, + "blast_qseq_coverage": 1.0299299511780937, + "blast_n_conseqs": 2, + "total_matched_qlen": 9422 + }, + "MK116110.1": { + "hypermutation_probablility": 0.07021438897893317, + "inferred_subtype": "Ref.B.TH.90.BK132.AY173951", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9972119995539199, + "blast_qseq_coverage": 0.9972119995539199, + "blast_n_conseqs": 3, + "total_matched_qlen": 8967 + }, + "MK115527.1": { + "hypermutation_probablility": 0.7689834393883834, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0056956017297753, + "blast_qseq_coverage": 1.0056956017297753, + "blast_n_conseqs": 2, + "total_matched_qlen": 9481 + }, + "MK114997.1": { + "hypermutation_probablility": 0.054959132555391754, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0516841524019878, + "blast_qseq_coverage": 1.0516841524019878, + "blast_n_conseqs": 2, + "total_matched_qlen": 9055 + }, + "MK115518.1": { + "hypermutation_probablility": 0.6385326595592609, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9996854356715948, + "blast_qseq_coverage": 0.9996854356715948, + "blast_n_conseqs": 3, + "total_matched_qlen": 9537 + }, + "MK115065.1": { + "hypermutation_probablility": 0.033517722768753644, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.069459518124593, + "blast_qseq_coverage": 1.069459518124593, + "blast_n_conseqs": 6, + "total_matched_qlen": 9214 + }, + "MK115464.1": { + "hypermutation_probablility": 1.0, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9893407844354756, + "blast_qseq_coverage": 0.9893407844354756, + "blast_n_conseqs": 2, + "total_matched_qlen": 9663 + }, + "MK115530.1": { + "hypermutation_probablility": 0.5789377103398377, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9992665549036044, + "blast_qseq_coverage": 0.9992665549036044, + "blast_n_conseqs": 2, + "total_matched_qlen": 9544 + }, + "MK115520.1": { + "hypermutation_probablility": 0.5200353682902832, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.987902805297737, + "blast_qseq_coverage": 0.987902805297737, + "blast_n_conseqs": 3, + "total_matched_qlen": 9589 + }, + "MK115503.1": { + "hypermutation_probablility": 0.4263025132504157, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9953207861079338, + "blast_qseq_coverage": 0.9953207861079338, + "blast_n_conseqs": 2, + "total_matched_qlen": 9617 + }, + "MK115570.1": { + "hypermutation_probablility": 0.738578434638724, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0057986294148655, + "blast_qseq_coverage": 1.0057986294148655, + "blast_n_conseqs": 2, + "total_matched_qlen": 9485 + }, + "MK115509.1": { + "hypermutation_probablility": 0.7866198309713798, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0197797498128942, + "blast_qseq_coverage": 1.0197797498128942, + "blast_n_conseqs": 2, + "total_matched_qlen": 9353 + }, + "MK115702.1": { + "hypermutation_probablility": 0.14401391767451666, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0596834469114091, + "blast_qseq_coverage": 1.0596834469114091, + "blast_n_conseqs": 4, + "total_matched_qlen": 9098 + }, + "MK115095.1": { + "hypermutation_probablility": 1.0, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.060085367188355, + "blast_qseq_coverage": 1.060085367188355, + "blast_n_conseqs": 2, + "total_matched_qlen": 9137 + }, + "MK115490.1": { + "hypermutation_probablility": 0.8863248655310947, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0204343639670483, + "blast_qseq_coverage": 1.0204343639670483, + "blast_n_conseqs": 3, + "total_matched_qlen": 9347 + }, + "MK115576.1": { + "hypermutation_probablility": 0.818189227062389, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0342110943233327, + "blast_qseq_coverage": 1.0342110943233327, + "blast_n_conseqs": 3, + "total_matched_qlen": 9266 + }, + "OQ092466": { + "hypermutation_probablility": 0.3876036547663967, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1192442700805285, + "blast_qseq_coverage": 1.1192442700805285, + "blast_n_conseqs": 3, + "total_matched_qlen": 9686 + }, + "OQ092463": { + "hypermutation_probablility": 0.21628713708846803, + "inferred_subtype": "Ref.B.TH.90.BK132.AY173951", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9884435190005205, + "blast_qseq_coverage": 0.9884435190005205, + "blast_n_conseqs": 2, + "total_matched_qlen": 9605 + }, + "OQ092465": { + "hypermutation_probablility": 0.02412789935966586, + "inferred_subtype": "Ref.28_BF.BR.99.BREPM12817.DQ085874", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9620043482762191, + "blast_qseq_coverage": 0.9620043482762191, + "blast_n_conseqs": 2, + "total_matched_qlen": 9659 + }, + "OQ092462": { + "hypermutation_probablility": 0.10777665573070194, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1301214741610048, + "blast_qseq_coverage": 1.1301214741610048, + "blast_n_conseqs": 3, + "total_matched_qlen": 9714 + }, + "OQ092464": { + "hypermutation_probablility": 0.006887768010151674, + "inferred_subtype": "Ref.28_BF.BR.99.BREPM12817.DQ085874", + "blast_matched_qlen": null, + "blast_sseq_coverage": 0.9678735872750105, + "blast_qseq_coverage": 0.9678735872750105, + "blast_n_conseqs": 2, + "total_matched_qlen": 9556 + }, + "OQ092467": { + "hypermutation_probablility": 0.6416537859942263, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0962157809983897, + "blast_qseq_coverage": 1.0962157809983897, + "blast_n_conseqs": 3, + "total_matched_qlen": 9936 + } +} \ No newline at end of file diff --git a/tests/expected-results-single-csv/holistic.csv b/tests/expected-results-single-csv/holistic.csv new file mode 100644 index 0000000..db0f640 --- /dev/null +++ b/tests/expected-results-single-csv/holistic.csv @@ -0,0 +1,2 @@ +seqid,hypermutation_probablility,inferred_subtype,blast_matched_qlen,blast_sseq_coverage,blast_qseq_coverage,blast_n_conseqs +MN692145,0.1661041079701131,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1271545051088863,1.1271545051088863,3 diff --git a/tests/expected-results-single/holistic.json b/tests/expected-results-single/holistic.json new file mode 100644 index 0000000..2ccb11a --- /dev/null +++ b/tests/expected-results-single/holistic.json @@ -0,0 +1,11 @@ +{ + "MN692145": { + "hypermutation_probablility": 0.1661041079701131, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1271545051088863, + "blast_qseq_coverage": 1.1271545051088863, + "blast_n_conseqs": 3, + "total_matched_qlen": 9689 + } +} \ No newline at end of file diff --git a/tests/expected-results-small-csv/holistic.csv b/tests/expected-results-small-csv/holistic.csv new file mode 100644 index 0000000..a33ac0a --- /dev/null +++ b/tests/expected-results-small-csv/holistic.csv @@ -0,0 +1,6 @@ +seqid,hypermutation_probablility,inferred_subtype,blast_matched_qlen,blast_sseq_coverage,blast_qseq_coverage,blast_n_conseqs +KX505501.1,0.7087072014754221,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.2158237356034052,1.2158237356034052,4 +MN691959,0.19593905853945925,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1086063415148004,1.1086063415148004,3 +MN692074,0.36378645339477633,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1728099569171853,1.1728099569171853,4 +MN692145,0.1661041079701131,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.1271545051088863,1.1271545051088863,3 +MN090335,0.1754017863888554,Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455,,1.0603153600176425,1.0603153600176425,3 diff --git a/tests/expected-results-small/holistic.json b/tests/expected-results-small/holistic.json new file mode 100644 index 0000000..a454390 --- /dev/null +++ b/tests/expected-results-small/holistic.json @@ -0,0 +1,47 @@ +{ + "KX505501.1": { + "hypermutation_probablility": 0.7087072014754221, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.2158237356034052, + "blast_qseq_coverage": 1.2158237356034052, + "blast_n_conseqs": 4, + "total_matched_qlen": 1997 + }, + "MN691959": { + "hypermutation_probablility": 0.19593905853945925, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1086063415148004, + "blast_qseq_coverage": 1.1086063415148004, + "blast_n_conseqs": 3, + "total_matched_qlen": 9493 + }, + "MN692074": { + "hypermutation_probablility": 0.36378645339477633, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1728099569171853, + "blast_qseq_coverage": 1.1728099569171853, + "blast_n_conseqs": 4, + "total_matched_qlen": 4178 + }, + "MN692145": { + "hypermutation_probablility": 0.1661041079701131, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.1271545051088863, + "blast_qseq_coverage": 1.1271545051088863, + "blast_n_conseqs": 3, + "total_matched_qlen": 9689 + }, + "MN090335": { + "hypermutation_probablility": 0.1754017863888554, + "inferred_subtype": "Ref.B.FR.83.HXB2_LAI_IIIB_BRU.K03455", + "blast_matched_qlen": null, + "blast_sseq_coverage": 1.0603153600176425, + "blast_qseq_coverage": 1.0603153600176425, + "blast_n_conseqs": 3, + "total_matched_qlen": 9069 + } +} \ No newline at end of file diff --git a/tests/test_nonhiv.py b/tests/test_nonhiv.py index 7a6b26a..ed0cf2e 100644 --- a/tests/test_nonhiv.py +++ b/tests/test_nonhiv.py @@ -3,7 +3,7 @@ import os import intact.intact as intact -from intact.intact import is_nonhiv, IntactnessError +from intact.intact import is_nonhiv, IntactnessError, HolisticInfo class BlastRow: def __init__(self, qstart, qend, qlen): @@ -14,7 +14,7 @@ def __init__(self, qstart, qend, qlen): def test_is_nonhiv_empty_blast_rows(): blast_rows = [] - result = is_nonhiv("id", 5, blast_rows) + result = is_nonhiv(HolisticInfo(), "id", 5, blast_rows) assert isinstance(result, IntactnessError) assert result.error == intact.NONHIV_ERROR @@ -25,7 +25,7 @@ def test_is_nonhiv_low_ratio(): BlastRow(200, 300, 1000), BlastRow(400, 500, 1000) ] - result = is_nonhiv("id", 1000, blast_rows) + result = is_nonhiv(HolisticInfo(), "id", 1000, blast_rows) assert isinstance(result, IntactnessError) assert result.error == intact.NONHIV_ERROR @@ -36,7 +36,7 @@ def test_is_nonhiv_high_ratio(): BlastRow(200, 400, 1000), BlastRow(400, 900, 1000) ] - assert is_nonhiv("id", 1000, blast_rows) is None + assert is_nonhiv(HolisticInfo(), "id", 1000, blast_rows) is None def test_is_nonhiv_high_ratio_reversed(): @@ -45,11 +45,11 @@ def test_is_nonhiv_high_ratio_reversed(): BlastRow(200, 400, 1000), BlastRow(900, 400, 1000) ] - assert is_nonhiv("id", 1000, blast_rows) is None + assert is_nonhiv(HolisticInfo(), "id", 1000, blast_rows) is None def test_is_nonhiv_single_blast_row(): blast_rows = [BlastRow(0, 200, 1000)] - result = is_nonhiv("id", 1000, blast_rows) + result = is_nonhiv(HolisticInfo(), "id", 1000, blast_rows) assert isinstance(result, IntactnessError) assert result.error == intact.NONHIV_ERROR