Skip to content

Commit

Permalink
Contig stitcher: improve handling of reverse complement alignments
Browse files Browse the repository at this point in the history
  • Loading branch information
Donaim committed Jan 18, 2024
1 parent e203e9a commit 4b08a3f
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 49 deletions.
85 changes: 39 additions & 46 deletions micall/core/contig_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from itertools import accumulate, takewhile, tee, islice, chain
from gotoh import align_it
from queue import LifoQueue
from Bio import Seq
import logging

from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit
Expand Down Expand Up @@ -213,60 +214,52 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]:
alignments = list(aligner.map(contig.seq))
hits_array = [(CigarHit(Cigar(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1),
"forward" if x.strand == 1 else "reverse") for x in alignments]
reversed_alignments = [alignment for alignment, strand in hits_array if strand == "reverse"]
alignments = [alignment for alignment, strand in hits_array if strand == "forward"]

logger.info("Contig %r produced %s reverse-complement alignments.",
contig.name, len(reversed_alignments),
extra={"action": "alignment", "type": "reversenumber",
"contig": contig, "n": len(reversed_alignments)})
connected = connect_cigar_hits(list(map(lambda p: p[0], hits_array))) if hits_array else []

connected = connect_cigar_hits(alignments) if alignments else []

logger.info("Contig %r produced %s forward alignments.", contig.name, len(connected),
extra={"action": "alignment", "type": "hitnumber",
"contig": contig, "n": len(connected)})

def logpart(i, part, is_rev):
logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.",
i, contig.name, part.name, part.alignment.q_st,
part.alignment.q_ei, part.alignment.r_st, part.alignment.r_ei,
" (rev)" if is_rev else "",
extra={"action": "alignment", "type": "hit",
"contig": contig, "part": part, "i": i})
logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name,
part.name, part.alignment, " (rev)" if is_rev else "")

def make_aligned(query, alignment, is_rev):
return AlignedContig.make(
query=query,
alignment=alignment,
strand=is_rev)

to_return = connected + reversed_alignments
if len(to_return) == 0:
logger.info("Contig %r not aligned - backend choice.", contig.name,
if not connected:
logger.info("Contig %r not aligned - backend's choice.", contig.name,
extra={"action": "alignment", "type": "zerohits", "contig": contig})
yield contig
return

if len(to_return) == 1:
is_rev = "forward" if to_return[0] in alignments else "reverse"
part = make_aligned(contig, to_return[0], is_rev)
logpart(0, part, is_rev)
yield part
if len(set(map(lambda p: p[1], hits_array))) > 1:
logger.info("Discarding contig %r because it aligned both in forward and reverse sense.", contig.name,
extra={"action": "alignment", "type": "strandconflict", "contig": contig})
yield contig
return

for i, single_hit in enumerate(to_return):
query = GenotypedContig(name=generate_new_name(),
seq=contig.seq,
ref_name=contig.ref_name,
group_ref=contig.group_ref,
ref_seq=contig.ref_seq,
match_fraction=contig.match_fraction)
is_rev = "forward" if single_hit in alignments else "reverse"
part = make_aligned(query, single_hit, is_rev)
logpart(i, part, is_rev)
logger.info("Contig %r produced %s aligner hits. After connecting then, the number became %s.",
contig.name, len(hits_array), len(connected),
extra={"action": "alignment", "type": "hitnumber", "contig": contig,
"initial": hits_array, "connected": connected})

strand = hits_array[0][1]
if strand == "reverse":
rc = str(Seq(contig.seq).reverse_complement())
new_contig = replace(contig, seq=rc)
logger.info("Reverse complemented contig %r.", contig.name,
extra={"action": "alignment", "type": "reversecomplement",
"contig": contig, "result": new_contig})
contig = new_contig

for i, single_hit in enumerate(connected):
if len(connected) == 1:
query = contig
else:
query = replace(contig, name=generate_new_name())

part = AlignedContig.make(query, single_hit, strand)

logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.",
i, contig.name,part.name,part.alignment.q_st,
part.alignment.q_ei,part.alignment.r_st,part.alignment.r_ei,
" (rev)" if strand == "reverse" else "",
extra={"action": "alignment", "type": "hit",
"contig": contig, "part":part, "i": i})
logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name,
part.name,part.alignment, " (rev)" if strand == "reverse" else "")

yield part


Expand Down
4 changes: 3 additions & 1 deletion micall/core/plot_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,8 +532,10 @@ def unwrap_final(contig):
anomaly.append(event.part.name)
elif event.type == "noref":
unknown.append(event.contig.name)
elif event.type == "zerohits":
elif event.type == "zerohits" or event.type == "strandconflict":
anomaly.append(event.contig.name)
elif event.type == "reversecomplement":
record_contig(event.new_contig, [event.contig])
elif event.type in ("hitnumber", "reversenumber"):
pass
else:
Expand Down
4 changes: 2 additions & 2 deletions micall/tests/test_contig_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,11 +653,11 @@ def test_correct_processing_complex_logs(exact_aligner):

assert len(handler.logs) == 0
list(stitch_consensus(contigs))
assert len(handler.logs) == 158
assert len(handler.logs) == 150

info_messages = [m for m in handler.logs if m.levelname == 'INFO']
debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG']
assert len(info_messages) == 40
assert len(info_messages) == 32
assert len(debug_messages) == len(handler.logs) - len(info_messages)


Expand Down

0 comments on commit 4b08a3f

Please sign in to comment.