Skip to content

Commit

Permalink
Contig stitcher: ignore contigs that align in-reverse
Browse files Browse the repository at this point in the history
Also:
* Small improvements to logging in contig stitcher
* Fix MockAlignment is_rev field
  • Loading branch information
Donaim committed Dec 5, 2023
1 parent a08b68f commit b786073
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 32 deletions.
65 changes: 42 additions & 23 deletions micall/core/contig_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,49 +197,61 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig:
return AlignedContig(query, alignment)


def align_to_reference(contig) -> Iterable[GenotypedContig]:
def align_to_reference(contig) -> Iterable[Tuple[GenotypedContig, bool]]:
if contig.ref_seq is None:
logger.info("Contig %r not aligned - no reference.", contig.name,
extra={"action": "alignment", "type": "noref", "contig": contig})
yield contig
yield (contig, False)
return

aligner = Aligner(seq=contig.ref_seq, preset='map-ont')
alignments = list(aligner.map(contig.seq))
if not alignments:
logger.info("Contig %r not aligned - backend choice.", contig.name,
extra={"action": "alignment", "type": "zerohits", "contig": contig})
yield contig
return
reversed_alignments = [alignment for alignment in alignments if alignment.strand == -1]
alignments = [alignment for alignment in alignments if alignment.strand == 1]

logger.info("Contig %r produced %s reverse-complement alignments.", contig.name, len(reversed_alignments),
extra={"action": "alignment", "type": "reversenumber",
"contig": contig, "n": len(reversed_alignments)})

hits_array = [CigarHit(Cigar.coerce(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments]
connected = connect_cigar_hits(hits_array)
connected = connect_cigar_hits(hits_array) if hits_array else []

logger.info("Contig %r aligned in %s parts.", contig.name, len(connected),
logger.info("Contig %r produced %s forward alignments.", contig.name, len(connected),
extra={"action": "alignment", "type": "hitnumber",
"contig": contig, "n": len(connected)})

def logpart(i, part):
logger.info("Part %r of contig %r aligned as [%s, %s]->[%s, %s].",
def logpart(i, part, is_rev):
logger.info("Part %r of contig %r aligned as [%s, %s]->[%s, %s]%s.",
i, contig.name, part.q_st, part.q_ei, part.r_st, part.r_ei,
" (rev)" if is_rev else "",
extra={"action": "alignment", "type": "hit",
"contig": contig, "part": part, "i": i})
logger.debug("Part %r of contig %r aligned as %s.", i, contig.name, part)
logger.debug("Part %r of contig %r aligned as %s.%s", i, contig.name, part,
" (rev)" if is_rev else "")

if len(connected) == 1:
logpart(0, connected[0])
yield AlignedContig(query=contig, alignment=connected[0])
to_return = connected + reversed_alignments
if len(to_return) == 0:
logger.info("Contig %r not aligned - backend choice.", contig.name,
extra={"action": "alignment", "type": "zerohits", "contig": contig})
yield (contig, False)
return

if len(to_return) == 1:
is_rev = to_return[0] in reversed_alignments
logpart(0, to_return[0], is_rev)
yield (AlignedContig(query=contig, alignment=connected[0]), is_rev)
return

for i, single_hit in enumerate(connected):
logpart(i, single_hit)
for i, single_hit in enumerate(connected + reversed_alignments):
query = GenotypedContig(name=f'part({i}, {contig.name})',
seq=contig.seq,
ref_name=contig.ref_name,
group_ref=contig.group_ref,
ref_seq=contig.ref_seq,
match_fraction=contig.match_fraction)
yield AlignedContig(query=query, alignment=single_hit)
is_rev = single_hit in reversed_alignments
logpart(i, single_hit, is_rev)
yield (AlignedContig(query=query, alignment=single_hit), is_rev)


def align_all_to_reference(contigs):
Expand Down Expand Up @@ -514,13 +526,20 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig
logger.info("Introduced contig %r of ref %r, group_ref %r, and length %s.",
contig.name, contig.ref_name, contig.group_ref, len(contig.seq),
extra={"action": "intro", "contig": contig})
logger.debug("Introduced contig %r (seq = %s) of ref %r, group_ref %r (seq = %s), and length %s.",
contig.name, contig.seq, contig.ref_name,
contig.group_ref, contig.ref_seq, len(contig.seq),
extra={"action": "intro", "contig": contig})

aligned = align_all_to_reference(contigs)

maybe_aligned = align_all_to_reference(contigs)
# Contigs aligned in reverse do not need any more processing
yield from (x for (x, is_rev) in aligned if is_rev)
aligned = [x for (x, is_rev) in aligned if not is_rev]

# Contigs that did not align do not need any more processing
yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig))
aligned: List[AlignedContig] = \
[x for x in maybe_aligned if isinstance(x, AlignedContig)]
yield from (x for x in aligned if not isinstance(x, AlignedContig))
aligned = [x for x in aligned if isinstance(x, AlignedContig)]

aligned = split_contigs_with_gaps(aligned)
aligned = drop_completely_covered(aligned)
Expand Down Expand Up @@ -548,7 +567,7 @@ def combine(contigs):

def main(args):
import argparse
from micall.core.denovo import write_contig_refs
from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here.

parser = argparse.ArgumentParser()
parser.add_argument('contigs', type=argparse.FileType('r'))
Expand Down
10 changes: 5 additions & 5 deletions micall/tests/test_contig_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner):
),
]

contigs = align_all_to_reference(contigs)
contigs = [c for c, is_rev in align_all_to_reference(contigs)]
assert len(list(contigs[0].alignment.gaps())) == 1
assert len(list(contigs[1].alignment.gaps())) == 0

Expand Down Expand Up @@ -469,7 +469,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner):
),
]

contigs = align_all_to_reference(contigs)
contigs = [c for c, is_rev in align_all_to_reference(contigs)]
assert len(list(contigs[0].alignment.gaps())) == 1
assert len(list(contigs[1].alignment.gaps())) == 0

Expand Down Expand Up @@ -662,18 +662,18 @@ def test_correct_processing_complex_logs(exact_aligner):
list(stitch_contigs(contigs))

messages = list(iterate_messages())
assert len(messages) == 48
assert len(messages) == 64
assert all(name == "micall.core.contig_stitcher" for name, m in messages)

info_messages = [m for name, m in messages if m.levelname == 'INFO']
debug_messages = [m for name, m in messages if m.levelname == 'DEBUG']
assert len(info_messages) == 32
assert len(info_messages) == 40
assert len(debug_messages) == len(messages) - len(info_messages)

info_actions = [(m.action + ':' + (m.type if hasattr(m, 'type') else '')) for m in info_messages]
assert info_actions == \
['intro:'] * 8 + \
['alignment:hitnumber', 'alignment:hit'] * 8 + \
['alignment:reversenumber', 'alignment:hitnumber', 'alignment:hit'] * 8 + \
['stitch:'] * 2 + ['nooverlap:'] + ['stitch:'] * 2 + ['nooverlap:'] * 3


Expand Down
8 changes: 4 additions & 4 deletions micall/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

@dataclass
class MockAlignment:
is_rev: bool
strand: int # +1 if on the forward strand; -1 if on the reverse strand
mapq: int
cigar: list
cigar_str: str
Expand Down Expand Up @@ -41,17 +41,17 @@ def map(self, seq):
continue

mapq = 60
is_rev = False # Doesn't handle reverse complements in this mock.
strand = 1 # Doesn't handle reverse complements in this mock.
r_st = self.seq.index(substring)
r_en = r_st + len(substring)
q_st = start
q_en = end
cigar = [[q_en - q_st, CigarActions.MATCH]]
cigar_str = f'{(q_en - q_st)}M'
al = MockAlignment(is_rev, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en)
al = MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en)
if (q_st, q_en, r_st, r_en) not in returned:
returned.add((q_st, q_en, r_st, r_en))
yield MockAlignment(is_rev, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en)
yield MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en)

max_matches -= 1
if max_matches < 1:
Expand Down

0 comments on commit b786073

Please sign in to comment.