From 111905825ce611af4ff8426cd6a62adb3277e39d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 Nov 2023 15:13:34 -0800 Subject: [PATCH] Integrate contig stitcher structures into denovo pipeline --- micall/core/contig_stitcher.py | 21 ++++++++------ micall/core/denovo.py | 50 +++++++++++++++++++++++++--------- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 25bc80dc5..0530b2364 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -22,8 +22,8 @@ class Contig: @dataclass class GenotypedContig(Contig): ref_name: str - ref_seq: str - matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. + ref_seq: Optional[str] # None in cases where the reference organism is unknown. + match_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: """ @@ -36,12 +36,12 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont seq=self.seq[:ceil(cut_point)], ref_seq=self.ref_seq, ref_name=self.ref_name, - matched_fraction=None) + match_fraction=None) right = GenotypedContig(name=f'right({self.name})', seq=self.seq[ceil(cut_point):], ref_seq=self.ref_seq, ref_name=self.ref_name, - matched_fraction=None) + match_fraction=None) return (left, right) @@ -59,7 +59,7 @@ def __init__(self, query: GenotypedContig, alignment: CigarHit): name = query.name, ref_name = query.ref_name, ref_seq = query.ref_seq, - matched_fraction = query.matched_fraction) + match_fraction = query.match_fraction) def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: @@ -171,7 +171,7 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: name=f'{left.name}+{right.name}', ref_name=left.ref_name, ref_seq=left.ref_seq, - matched_fraction=None) + match_fraction=None) left_alignment = left.alignment right_alignment = \ @@ -184,8 +184,11 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: def align_to_reference(contig): - aligner = Aligner(seq=contig.ref_seq, preset='map-ont') - alignments = list(aligner.map(contig.seq)) + if contig.ref_seq is None: + return contig + + aligner = Aligner(seq=str(contig.ref_seq), preset='map-ont') + alignments = list(aligner.map(str(contig.seq))) if not alignments: return contig @@ -278,7 +281,7 @@ def stitch_2_contigs(left, right): # Return something that can be fed back into the loop. overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})', seq=overlap_seq, ref_name=left.ref_name, - ref_seq=left.ref_seq, matched_fraction=None) + ref_seq=left.ref_seq, match_fraction=None) overlap_contig = SyntheticContig(overlap_query, r_st=left_overlap.alignment.r_st, r_ei=right_overlap.alignment.r_ei) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index c3ce3c9c7..bed1f62a5 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -19,6 +19,7 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig +from micall.core.contig_stitcher import GenotypedContig IVA = "iva" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), @@ -28,6 +29,32 @@ logger = logging.getLogger(__name__) +def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> typing.Iterable[GenotypedContig]: + projects = ProjectConfig.loadDefault() + + for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")): + (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0)) + seq = record.seq + if match_fraction < 0: + seq = seq.reverse_complement() + match_fraction *= -1 + + group_ref = group_refs.get(ref_name) + try: + ref_seq = projects.getGenotypeReference(ref_name) + except KeyError: + try: + ref_seq = projects.getReference(ref_name) + except: + ref_seq = None + + yield GenotypedContig(name=record.name, + seq=seq, + ref_name=ref_name, + ref_seq=ref_seq, + match_fraction=match_fraction) + + def write_contig_refs(contigs_fasta_path, contigs_csv, merged_contigs_csv=None, @@ -55,19 +82,16 @@ def write_contig_refs(contigs_fasta_path, genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs) - genotype_count = 0 - for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")): - (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0)) - seq = record.seq - if match_fraction < 0: - seq = seq.reverse_complement() - match_fraction *= -1 - writer.writerow(dict(ref=ref_name, - match=match_fraction, - group_ref=group_refs.get(ref_name), - contig=seq)) - genotype_count += 1 - return genotype_count + + contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path)) + + for contig in contigs: + writer.writerow(dict(ref=contig.ref_name, + match=contig.match_fraction, + group_ref=group_refs.get(contig.ref_name), + contig=contig.seq)) + + return len(contigs) def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None):