Skip to content

Commit

Permalink
Contig stitcher: ensure match_fraction value for every contig
Browse files Browse the repository at this point in the history
It is required by remap, so we have to provide something to it.
However, it is not clear how to recalculate the value for subparts
of initial contigs since it is initially set by BLAST.

The easiest would be to rerun BLAST on final contigs,
but that also seems wasteful.

Current solution simply copies the match_fraction from parent
of a subpart, or takes min of two "parents" for contigs
that are munged together.
  • Loading branch information
Donaim committed Nov 17, 2023
1 parent 1b0c0ca commit 7ab3666
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions micall/core/contig_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Contig:
class GenotypedContig(Contig):
ref_name: str
ref_seq: Optional[str] # None in cases where the reference organism is unknown.
match_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`.
match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP.

def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']:
"""
Expand All @@ -32,16 +32,17 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont
"""

cut_point = max(0, cut_point)
match_fraction = self.match_fraction
left = GenotypedContig(name=f'left({self.name})',
seq=self.seq[:ceil(cut_point)],
ref_seq=self.ref_seq,
ref_name=self.ref_name,
match_fraction=None)
match_fraction=match_fraction)
right = GenotypedContig(name=f'right({self.name})',
seq=self.seq[ceil(cut_point):],
ref_seq=self.ref_seq,
ref_name=self.ref_name,
match_fraction=None)
match_fraction=match_fraction)

return (left, right)

Expand Down Expand Up @@ -167,11 +168,12 @@ def rstrip_query(self):
@staticmethod
def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig:
query_seq = left.rstrip_query().seq + right.lstrip_query().seq
match_fraction = min(left.match_fraction, right.match_fraction)
query = GenotypedContig(seq=query_seq,
name=f'{left.name}+{right.name}',
ref_name=left.ref_name,
ref_seq=left.ref_seq,
match_fraction=None)
match_fraction=match_fraction)

left_alignment = left.alignment
right_alignment = \
Expand Down Expand Up @@ -279,9 +281,10 @@ def stitch_2_contigs(left, right):
overlap_seq = ''.join(c for c in aligned_left_part + aligned_right_part if c != '-')

# Return something that can be fed back into the loop.
match_fraction = min(left.match_fraction, right.match_fraction)
overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})',
seq=overlap_seq, ref_name=left.ref_name,
ref_seq=left.ref_seq, match_fraction=None)
ref_seq=left.ref_seq, match_fraction=match_fraction)
overlap_contig = SyntheticContig(overlap_query,
r_st=left_overlap.alignment.r_st,
r_ei=right_overlap.alignment.r_ei)
Expand Down

0 comments on commit 7ab3666

Please sign in to comment.