Skip to content

Commit

Permalink
Integrate contig stitcher structures into denovo pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Nov 17, 2023
1 parent c40ec0e commit 1b0c0ca
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 59 deletions.
17 changes: 10 additions & 7 deletions micall/core/contig_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class Contig:
@dataclass
class GenotypedContig(Contig):
ref_name: str
ref_seq: str
matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`.
ref_seq: Optional[str] # None in cases where the reference organism is unknown.
match_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`.

def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']:
"""
Expand All @@ -36,12 +36,12 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont
seq=self.seq[:ceil(cut_point)],
ref_seq=self.ref_seq,
ref_name=self.ref_name,
matched_fraction=None)
match_fraction=None)
right = GenotypedContig(name=f'right({self.name})',
seq=self.seq[ceil(cut_point):],
ref_seq=self.ref_seq,
ref_name=self.ref_name,
matched_fraction=None)
match_fraction=None)

return (left, right)

Expand All @@ -59,7 +59,7 @@ def __init__(self, query: GenotypedContig, alignment: CigarHit):
name = query.name,
ref_name = query.ref_name,
ref_seq = query.ref_seq,
matched_fraction = query.matched_fraction)
match_fraction = query.match_fraction)


def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']:
Expand Down Expand Up @@ -171,7 +171,7 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig:
name=f'{left.name}+{right.name}',
ref_name=left.ref_name,
ref_seq=left.ref_seq,
matched_fraction=None)
match_fraction=None)

left_alignment = left.alignment
right_alignment = \
Expand All @@ -184,6 +184,9 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig:


def align_to_reference(contig):
if contig.ref_seq is None:
return contig

aligner = Aligner(seq=contig.ref_seq, preset='map-ont')
alignments = list(aligner.map(contig.seq))
if not alignments:
Expand Down Expand Up @@ -278,7 +281,7 @@ def stitch_2_contigs(left, right):
# Return something that can be fed back into the loop.
overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})',
seq=overlap_seq, ref_name=left.ref_name,
ref_seq=left.ref_seq, matched_fraction=None)
ref_seq=left.ref_seq, match_fraction=None)
overlap_contig = SyntheticContig(overlap_query,
r_st=left_overlap.alignment.r_st,
r_ei=right_overlap.alignment.r_ei)
Expand Down
50 changes: 37 additions & 13 deletions micall/core/denovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from Bio.SeqRecord import SeqRecord

from micall.core.project_config import ProjectConfig
from micall.core.contig_stitcher import GenotypedContig

IVA = "iva"
DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__),
Expand All @@ -28,6 +29,32 @@
logger = logging.getLogger(__name__)


def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> typing.Iterable[GenotypedContig]:
projects = ProjectConfig.loadDefault()

for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")):
(ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0))
seq = record.seq
if match_fraction < 0:
seq = seq.reverse_complement()
match_fraction *= -1

group_ref = group_refs.get(ref_name)
try:
ref_seq = projects.getGenotypeReference(ref_name)
except KeyError:
try:
ref_seq = projects.getReference(ref_name)
except:
ref_seq = None

yield GenotypedContig(name=record.name,
seq=seq,
ref_name=ref_name,
ref_seq=ref_seq,
match_fraction=match_fraction)


def write_contig_refs(contigs_fasta_path,
contigs_csv,
merged_contigs_csv=None,
Expand Down Expand Up @@ -55,19 +82,16 @@ def write_contig_refs(contigs_fasta_path,
genotypes = genotype(contigs_fasta_path,
blast_csv=blast_csv,
group_refs=group_refs)
genotype_count = 0
for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")):
(ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0))
seq = record.seq
if match_fraction < 0:
seq = seq.reverse_complement()
match_fraction *= -1
writer.writerow(dict(ref=ref_name,
match=match_fraction,
group_ref=group_refs.get(ref_name),
contig=seq))
genotype_count += 1
return genotype_count

contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path))

for contig in contigs:
writer.writerow(dict(ref=contig.ref_name,
match=contig.match_fraction,
group_ref=group_refs.get(contig.ref_name),
contig=contig.seq))

return len(contigs)


def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None):
Expand Down
Loading

0 comments on commit 1b0c0ca

Please sign in to comment.