diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 08de60f8b..a281c8f21 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -7,6 +7,7 @@ from itertools import accumulate, takewhile, tee, islice, chain from gotoh import align_it from queue import LifoQueue +from Bio import Seq import logging from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit @@ -213,61 +214,56 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: alignments = list(aligner.map(contig.seq)) hits_array = [(CigarHit(Cigar(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1), "forward" if x.strand == 1 else "reverse") for x in alignments] - reversed_alignments = [alignment for alignment, strand in hits_array if strand == "reverse"] - alignments = [alignment for alignment, strand in hits_array if strand == "forward"] - logger.info("Contig %r produced %s reverse-complement alignments.", - contig.name, len(reversed_alignments), - extra={"action": "alignment", "type": "reversenumber", - "contig": contig, "n": len(reversed_alignments)}) + connected = connect_cigar_hits(list(map(lambda p: p[0], hits_array))) if hits_array else [] - connected = connect_cigar_hits(alignments) if alignments else [] + if not connected: + logger.info("Contig %r not aligned - backend's choice.", contig.name, + extra={"action": "alignment", "type": "zerohits", "contig": contig}) + yield contig + return + + if len(set(map(lambda p: p[1], hits_array))) > 1: + logger.info("Discarding contig %r because it aligned both in forward and reverse sense.", contig.name, + extra={"action": "alignment", "type": "strandconflict", "contig": contig}) + yield contig + return + + logger.info("Contig %r produced %s aligner hits. After connecting then, the number became %s.", + contig.name, len(hits_array), len(connected), + extra={"action": "alignment", "type": "hitnumber", "contig": contig, + "initial": hits_array, "connected": connected}) - logger.info("Contig %r produced %s forward alignments.", contig.name, len(connected), - extra={"action": "alignment", "type": "hitnumber", - "contig": contig, "n": len(connected)}) + strand = hits_array[0][1] + if strand == "reverse": + rc = str(Seq(contig.seq).reverse_complement()) + new_contig = replace(contig, seq=rc) + logger.info("Reverse complemented contig %r.", contig.name, + extra={"action": "alignment", "type": "reversecomplement", + "contig": contig, "result": new_contig}) + contig = new_contig + + def make_aligned(i, query, alignment): + result = AlignedContig.make(query, alignment, strand) - def logpart(i, part, is_rev): logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", - i, contig.name, part.name, part.alignment.q_st, - part.alignment.q_ei, part.alignment.r_st, part.alignment.r_ei, - " (rev)" if is_rev else "", + i, contig.name, result.name, result.alignment.q_st, + result.alignment.q_ei, result.alignment.r_st, result.alignment.r_ei, + " (rev)" if strand == "reverse" else "", extra={"action": "alignment", "type": "hit", - "contig": contig, "part": part, "i": i}) + "contig": contig, "part": result, "i": i}) logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name, - part.name, part.alignment, " (rev)" if is_rev else "") - - def make_aligned(query, alignment, is_rev): - return AlignedContig.make( - query=query, - alignment=alignment, - strand=is_rev) + result.name, result.alignment, " (rev)" if strand == "reverse" else "") - to_return = connected + reversed_alignments - if len(to_return) == 0: - logger.info("Contig %r not aligned - backend choice.", contig.name, - extra={"action": "alignment", "type": "zerohits", "contig": contig}) - yield contig - return + return result - if len(to_return) == 1: - is_rev = "forward" if to_return[0] in alignments else "reverse" - part = make_aligned(contig, to_return[0], is_rev) - logpart(0, part, is_rev) - yield part + if len(connected) == 1: + yield make_aligned(0, contig, connected[0]) return - for i, single_hit in enumerate(to_return): - query = GenotypedContig(name=generate_new_name(), - seq=contig.seq, - ref_name=contig.ref_name, - group_ref=contig.group_ref, - ref_seq=contig.ref_seq, - match_fraction=contig.match_fraction) - is_rev = "forward" if single_hit in alignments else "reverse" - part = make_aligned(query, single_hit, is_rev) - logpart(i, part, is_rev) - yield part + for i, single_hit in enumerate(connected): + query = replace(contig, name=generate_new_name()) + yield make_aligned(i, query, single_hit) def align_all_to_reference(contigs): diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index cca8444fb..6f86730ab 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -532,8 +532,10 @@ def unwrap_final(contig): anomaly.append(event.part.name) elif event.type == "noref": unknown.append(event.contig.name) - elif event.type == "zerohits": + elif event.type == "zerohits" or event.type == "strandconflict": anomaly.append(event.contig.name) + elif event.type == "reversecomplement": + record_contig(event.new_contig, [event.contig]) elif event.type in ("hitnumber", "reversenumber"): pass else: diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index e8ac45110..49f19b4a5 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -653,11 +653,11 @@ def test_correct_processing_complex_logs(exact_aligner): assert len(handler.logs) == 0 list(stitch_consensus(contigs)) - assert len(handler.logs) == 158 + assert len(handler.logs) == 150 info_messages = [m for m in handler.logs if m.levelname == 'INFO'] debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG'] - assert len(info_messages) == 40 + assert len(info_messages) == 32 assert len(debug_messages) == len(handler.logs) - len(info_messages)