Integrate contig stitcher structures into denovo pipeline

cfe-lab · Nov 17, 2023 · 1b0c0ca · 1b0c0ca
1 parent c40ec0e
commit 1b0c0ca
Show file tree

Hide file tree

Showing 3 changed files with 86 additions and 59 deletions.
diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py
@@ -22,8 +22,8 @@ class Contig:
 @dataclass
 class GenotypedContig(Contig):
     ref_name: str
-    ref_seq: str
-    matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`.
+    ref_seq: Optional[str]          # None in cases where the reference organism is unknown.
+    match_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`.
 
     def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']:
         """
@@ -36,12 +36,12 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont
                                seq=self.seq[:ceil(cut_point)],
                                ref_seq=self.ref_seq,
                                ref_name=self.ref_name,
-                               matched_fraction=None)
+                               match_fraction=None)
         right = GenotypedContig(name=f'right({self.name})',
                                 seq=self.seq[ceil(cut_point):],
                                 ref_seq=self.ref_seq,
                                 ref_name=self.ref_name,
-                                matched_fraction=None)
+                                match_fraction=None)
 
         return (left, right)
 
@@ -59,7 +59,7 @@ def __init__(self, query: GenotypedContig, alignment: CigarHit):
             name = query.name,
             ref_name = query.ref_name,
             ref_seq = query.ref_seq,
-            matched_fraction = query.matched_fraction)
+            match_fraction = query.match_fraction)
 
 
     def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']:
@@ -171,7 +171,7 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig:
                                 name=f'{left.name}+{right.name}',
                                 ref_name=left.ref_name,
                                 ref_seq=left.ref_seq,
-                                matched_fraction=None)
+                                match_fraction=None)
 
         left_alignment = left.alignment
         right_alignment = \
@@ -184,6 +184,9 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig:
 
 
 def align_to_reference(contig):
+    if contig.ref_seq is None:
+        return contig
+
     aligner = Aligner(seq=contig.ref_seq, preset='map-ont')
     alignments = list(aligner.map(contig.seq))
     if not alignments:
@@ -278,7 +281,7 @@ def stitch_2_contigs(left, right):
     # Return something that can be fed back into the loop.
     overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})',
                                     seq=overlap_seq, ref_name=left.ref_name,
-                                    ref_seq=left.ref_seq, matched_fraction=None)
+                                    ref_seq=left.ref_seq, match_fraction=None)
     overlap_contig = SyntheticContig(overlap_query,
                                      r_st=left_overlap.alignment.r_st,
                                      r_ei=right_overlap.alignment.r_ei)

diff --git a/micall/core/denovo.py b/micall/core/denovo.py
@@ -19,6 +19,7 @@
 from Bio.SeqRecord import SeqRecord
 
 from micall.core.project_config import ProjectConfig
+from micall.core.contig_stitcher import GenotypedContig
 
 IVA = "iva"
 DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__),
@@ -28,6 +29,32 @@
 logger = logging.getLogger(__name__)
 
 
+def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> typing.Iterable[GenotypedContig]:
+    projects = ProjectConfig.loadDefault()
+
+    for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")):
+        (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0))
+        seq = record.seq
+        if match_fraction < 0:
+            seq = seq.reverse_complement()
+            match_fraction *= -1
+
+        group_ref = group_refs.get(ref_name)
+        try:
+            ref_seq = projects.getGenotypeReference(ref_name)
+        except KeyError:
+            try:
+                ref_seq = projects.getReference(ref_name)
+            except:
+                ref_seq = None
+
+        yield GenotypedContig(name=record.name,
+                              seq=seq,
+                              ref_name=ref_name,
+                              ref_seq=ref_seq,
+                              match_fraction=match_fraction)
+
+
 def write_contig_refs(contigs_fasta_path,
                       contigs_csv,
                       merged_contigs_csv=None,
@@ -55,19 +82,16 @@ def write_contig_refs(contigs_fasta_path,
     genotypes = genotype(contigs_fasta_path,
                          blast_csv=blast_csv,
                          group_refs=group_refs)
-    genotype_count = 0
-    for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")):
-        (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0))
-        seq = record.seq
-        if match_fraction < 0:
-            seq = seq.reverse_complement()
-            match_fraction *= -1
-        writer.writerow(dict(ref=ref_name,
-                             match=match_fraction,
-                             group_ref=group_refs.get(ref_name),
-                             contig=seq))
-        genotype_count += 1
-    return genotype_count
+
+    contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path))
+
+    for contig in contigs:
+        writer.writerow(dict(ref=contig.ref_name,
+                             match=contig.match_fraction,
+                             group_ref=group_refs.get(contig.ref_name),
+                             contig=contig.seq))
+
+    return len(contigs)
 
 
 def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None):