Skip to content

Commit

Permalink
Small improvements to Cigar tools and Contig stitcher
Browse files Browse the repository at this point in the history
* Extracted the `align_to_reference` method from `GenotypedContig`
  class and converted it into a standalone function, streamlining how
  contig alignment is invoked across the application.

* Transformed the `Cigar` data structure from a mutable list to an
  immutable tuple to enforce the immutability of CIGAR sequences after
  creation for safety and reliability.

* Revised the `epsilon` property calculation in `CigarHit` for better
  accuracy and consistency.

* Revised in-code documentation, removing redundant comments and placing
  technical details at the end of the file for cleaner, more focused code.

* Removed the unused `closest_key` method from `IntDict`, simplifying the interface.

* Enhanced comments in `CigarHit` and `Cigar`, making the code easier to understand
  and clarifying the purpose and functionality of core classes.
  • Loading branch information
Donaim committed Nov 15, 2023
1 parent 87d2d4d commit 2fd20f5
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 135 deletions.
28 changes: 15 additions & 13 deletions micall/core/contig_stitcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,6 @@ class GenotypedContig(Contig):
ref_seq: str
matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`.

def align_to_reference(self):
aligner = Aligner(seq=self.ref_seq, preset='map-ont')
alignments = list(aligner.map(self.seq))
if not alignments:
return self

hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments]
single_cigar_hit = connect_cigar_hits(hits_array)
return AlignedContig(query=self, alignment=single_cigar_hit)


def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']:
"""
Cuts this alignment in two parts with cut_point between them.
Expand All @@ -59,6 +48,8 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont

@dataclass
class AlignedContig(GenotypedContig):
query: GenotypedContig
alignment: CigarHit

def __init__(self, query: GenotypedContig, alignment: CigarHit):
self.query = query
Expand Down Expand Up @@ -173,6 +164,17 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig:
return AlignedContig(query, alignment)


def align_to_reference(contig):
aligner = Aligner(seq=contig.ref_seq, preset='map-ont')
alignments = list(aligner.map(contig.seq))
if not alignments:
return contig

hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments]
single_cigar_hit = connect_cigar_hits(hits_array)
return AlignedContig(query=contig, alignment=single_cigar_hit)


def align_queries(seq1: str, seq2: str) -> Tuple[str, str]:
gap_open_penalty = 15
gap_extend_penalty = 3
Expand Down Expand Up @@ -376,7 +378,7 @@ def try_split(contig):
continue

Check warning on line 378 in micall/core/contig_stitcher.py

View check run for this annotation

Codecov / codecov/patch

micall/core/contig_stitcher.py#L378

Added line #L378 was not covered by tests

if covered(contig, gap):
midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + contig.alignment.epsilon
midpoint = gap.r_st + (gap.r_ei - gap.r_st) // 2 + contig.alignment.epsilon
left_part, right_part = contig.cut_reference(midpoint)
left_part = left_part.rstrip_query()
right_part = right_part.lstrip_query()
Expand All @@ -398,7 +400,7 @@ def try_split(contig):


def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig]:
maybe_aligned = list(map(GenotypedContig.align_to_reference, contigs))
maybe_aligned = list(map(align_to_reference, contigs))

# Contigs that did not align do not need any more processing
yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig))
Expand Down
6 changes: 3 additions & 3 deletions micall/tests/test_contig_stitcher.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

import pytest
import random
from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance
from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_to_reference
from micall.tests.utils import MockAligner, fixed_random_seed


Expand Down Expand Up @@ -405,7 +405,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner):
),
]

contigs = [x.align_to_reference() for x in contigs]
contigs = list(map(align_to_reference, contigs))
assert len(list(contigs[0].alignment.gaps())) == 1
assert len(list(contigs[1].alignment.gaps())) == 0

Expand Down Expand Up @@ -434,7 +434,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner):
),
]

contigs = [x.align_to_reference() for x in contigs]
contigs = list(map(align_to_reference, contigs))
assert len(list(contigs[0].alignment.gaps())) == 1
assert len(list(contigs[1].alignment.gaps())) == 0

Expand Down
Loading

0 comments on commit 2fd20f5

Please sign in to comment.