From 827ef059fe0b6a1a823bd101111ff81389d7e067 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 28 Jul 2023 09:52:13 -0700 Subject: [PATCH 001/485] Configure all options through the environment by default Options such as passwords need to be configured through the environment. No option must be configured through the command line. Thus, we are making it simpler by using just the environment in our example. Another benefit is that changing these options is easier because we don't have to re-enable the systemd unit file. --- docs/admin.md | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/docs/admin.md b/docs/admin.md index b039fc166..5841e9dd3 100644 --- a/docs/admin.md +++ b/docs/admin.md @@ -109,11 +109,8 @@ Here's an example configuration, in `/etc/systemd/system/micall_watcher.service` Description=micall_watcher [Service] - ExecStart=/usr/local/share/venv-micall/bin/python3.6 \ - /usr/local/share/MiCall/micall_watcher.py \ - --pipeline_version=8.0 --raw_data=/data/raw \ - --micall_filter_quality_pipeline_id=100 --micall_main_pipeline_id=101 \ - --micall_resistance_pipeline_id=102 + ExecStart=/usr/local/share/venv-micall/bin/python3 \ + /usr/local/share/MiCall/micall_watcher.py EnvironmentFile=/etc/micall/micall.conf User=micall @@ -123,10 +120,11 @@ Here's an example configuration, in `/etc/systemd/system/micall_watcher.service` [Install] WantedBy=multi-user.target -The settings can either be given on the command line or set as -environment variables. Environment variables are a better option for -sensitive parameters like passwords, because the command line is visible to all -users. Environment variables go in the configuration file listed in the +Micall watcher accepts multiple settings which can be passed +directly as command line arguments, or as environment variables. 
+Environment variables are a better option for sensitive parameters like passwords, +because the command line is visible to all users. +Environment variables go in the configuration file listed in the `EnvironmentFile=` setting. In this example, it's `/etc/micall/micall.conf` $ sudo mkdir /etc/micall @@ -136,16 +134,24 @@ users. Environment variables go in the configuration file listed in the Make sure you reduce the read permissions on the `.conf` file so other users can't read it. The environment variable names are the same as the command options, but they add a `MICALL_` prefix, if it's not already there. +To list all the available options, run `python3 micall_watcher.py --help`. +Below is the example config: # This is an example of /etc/micall/micall.conf # You can add comment lines that start with # - MICALL_KIVE_SERVER=https://example.com - MICALL_KIVE_USER=badexample - MICALL_KIVE_PASSWORD=badexample + MICALL_KIVE_SERVER=https://kive.example.com + MICALL_KIVE_USER=kiveusername + MICALL_KIVE_PASSWORD=kivepassword - MICALL_QAI_SERVER=https://example.com - MICALL_QAI_USER=badexample - MICALL_QAI_PASSWORD=badexample + MICALL_QAI_SERVER=https://qai.example.com + MICALL_QAI_USER=qaiuser + MICALL_QAI_PASSWORD=qaipassword + + MICALL_RAW_DATA=/data/raw + + MICALL_MAIN_PIPELINE_ID=100 + MICALL_FILTER_QUALITY_PIPELINE_ID=101 + MICALL_RESISTANCE_PIPELINE_ID=102 Don't put the environment variables directly in the `.service` file, because its contents are visible to all users with `systemctl show micall_watcher`. 
From 8c85ce560d0cf22c335ef4d7ab2e77412e0b6a4d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 31 Aug 2023 16:03:58 -0700 Subject: [PATCH 002/485] fix big deletion test for minimap --- micall/tests/test_consensus_aligner.py | 52 ++++++++++++++++++++------ 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index af13d86d8..1713cedbd 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -1,7 +1,9 @@ import math import typing +import random from io import StringIO from pytest import approx +from contextlib import contextmanager from micall.core.aln2counts import SeedAmino, ReportAmino from micall.utils.consensus_aligner import ConsensusAligner, AlignmentWrapper, CigarActions, AminoAlignment @@ -12,6 +14,29 @@ from micall.utils.report_amino import ReportNucleotide +@contextmanager +def fixed_random_seed(seed): + original_state = random.getstate() + random.seed(seed) + try: + yield + finally: + random.setstate(original_state) + + +def mutate_sequence(rate, seq): + def mutate(x): + if random.random() > rate: + return x + + while True: + y = random.choice(['A', 'C', 'G', 'T']) + if y != x: return y + + with fixed_random_seed(42): + return ''.join(mutate(x) for x in seq) + + def assert_alignments(aligner: ConsensusAligner, *expected_alignments: AlignmentWrapper): __tracebackhide__ = True @@ -411,25 +436,30 @@ def test_start_contig_deletion_minimap2(projects): def test_start_contig_big_deletion_minimap2(projects): seed_name = 'HCV-1a' seed_seq = projects.getReference(seed_name) - consensus = seed_seq[340:920] + seed_seq[3000:9000] + seed_seq = mutate_sequence(seq=seed_seq, rate=0.04) + consensus = seed_seq[290:983] + seed_seq[3000:9269] + expected_alignment = [AlignmentWrapper(ctg='N/A', ctg_len=len(seed_seq), - r_st=340, - r_en=920, + mlen=668, + r_st=290, + r_en=983, q_st=0, - q_en=580, + q_en=693, mapq=60, - cigar=[[580, 
CigarActions.MATCH]], - NM=0), + cigar=[[693, CigarActions.MATCH]], + NM=25), AlignmentWrapper(ctg='N/A', ctg_len=len(seed_seq), + mlen=6013, r_st=3000, - r_en=9000, - q_st=580, - q_en=6580, + r_en=9269, + q_st=693, + q_en=6962, mapq=60, - cigar=[[6000, CigarActions.MATCH]], - NM=0)] + cigar=[[6269, CigarActions.MATCH]], + NM=256)] + aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) From 26d3522cb1ee248360ce92a17e183a365dd690e1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 11 Sep 2023 10:32:54 -0700 Subject: [PATCH 003/485] Fix mutation rate handling in aligner tests --- micall/tests/test_consensus_aligner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 1713cedbd..2483563a8 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -26,7 +26,7 @@ def fixed_random_seed(seed): def mutate_sequence(rate, seq): def mutate(x): - if random.random() > rate: + if random.random() >= rate: return x while True: @@ -34,7 +34,7 @@ def mutate(x): if y != x: return y with fixed_random_seed(42): - return ''.join(mutate(x) for x in seq) + return ''.join(map(mutate, seq)) def assert_alignments(aligner: ConsensusAligner, From ea0b85270769981b6afc713b1cc31fec6fa6f36a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 11 Oct 2023 12:18:15 -0700 Subject: [PATCH 004/485] Add aln2counts_simplify tool This is similar to denovo_simplify, but only runs the postprocessing part. 
--- micall/utils/aln2counts_simplify.py | 146 ++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 micall/utils/aln2counts_simplify.py diff --git a/micall/utils/aln2counts_simplify.py b/micall/utils/aln2counts_simplify.py new file mode 100644 index 000000000..cc5ccc472 --- /dev/null +++ b/micall/utils/aln2counts_simplify.py @@ -0,0 +1,146 @@ +import sys +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter +from collections import defaultdict +import logging +import os +from csv import DictReader + +from micall.core.trim_fastqs import trim +from micall.utils.dd import DD +from micall.core.aln2counts import aln2counts + +logger = logging.getLogger(__name__) + + +def parse_args(argv): + parser = ArgumentParser( + description='Find the simplest list of aligned reads that reproduces a chosen problem.', + formatter_class=ArgumentDefaultsHelpFormatter) + parser.add_argument('filename', + help='Input file with the initial aligned reads') + parser.add_argument('simple', + help='Output file with the simplified aligned reads') + parser.add_argument('--test', + help='name of the test to run', + choices=MicallDD.test_names, + default=MicallDD.test_names[0]) + + return parser.parse_args(argv) + + +class MicallDD(DD): + test_names = ['subseq'] + + def __init__(self, + filename, + simple, + test_name): + super(MicallDD, self).__init__() + self.filename = filename + self.simple = simple + base, ext = os.path.splitext(simple) + self.best = base + '_best' + ext + self.get_result = getattr(self, 'check_' + test_name) + self.reads = read_aligned(self.filename) + + def _test(self, read_indexes): + read_count = len(read_indexes) + self.write_simple_aligned(self.simple, read_indexes) + workdir = os.path.dirname(self.simple) + + def writer(filename): + return open(os.path.join(workdir, filename), 'w+') + + with open(self.simple, 'r') as aligned_csv, \ + writer('stitched.csv') as output: + # noinspection PyBroadException + try: + 
aln2counts(aligned_csv, + # TODO: maybe redirect to os.devnull instead. + writer('nuc.csv'), + writer('amino.csv'), + writer('insertions.csv'), + writer('conseq.csv'), + writer('failed_align.csv'), + # --- # + conseq_stitched_csv=output, + nuc_detail_csv=writer("nuc_detail.csv"), + ) + + exception = None + except Exception as ex: + logger.warning(f'Assembly failed: {ex!r}.', exc_info=True) + print(f'Assembly failed: {ex!r}.') + exception = ex + + output.seek(0) + result = self.get_result(output, read_count, exception) + if result == DD.FAIL: + os.rename(self.simple, self.best) + return result + + @staticmethod + def check_subseq(output, read_count, exception): + if exception is not None: + return DD.UNRESOLVED + + simple_count = len(output.readlines()) - 1 + + logger.debug('Result: %d simplified reads from %d selected reads.', + simple_count, + read_count) + + expected_substring = os.environ.get('MICALL_DD_SUBSEQ', None) + if expected_substring is None: + raise RuntimeError(f"Expected ${'MICALL_DD_SUBSEQ'!r} environment value to be set for the {'subseq'!r} test") + output.seek(0) + success = any((expected_substring in line) for line in output) + + return DD.FAIL if success else DD.PASS + + def write_simple_aligned(self, filename, read_indexes): + selected_reads = (self.reads[i] for i in read_indexes) + with open(filename, 'w') as f: + f.write('refname,qcut,rank,count,offset,seq\n') + for line in selected_reads: + f.write(line) + + +def read_aligned(filename): + """ Load all the reads from an aligned reads file into a dictionary. + + @param filename: the aligned.csv file to open + @param reads: dict({index: line}) + """ + + with open(filename, 'r') as f: + header = next(f) + + # Sanity check that may detect instances where an incorrect file has been passed as input. 
+ if header.strip() != 'refname,qcut,rank,count,offset,seq': + raise ValueError(f'Aligned reads file {filename!r} does not start with a known header') + + return f.readlines() + + +def main(argv): + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s[%(levelname)s]%(module)s:%(lineno)d - %(message)s', + stream=sys.stdout) + args = parse_args(argv) + try: + logger.info('Starting.') + dd = MicallDD(args.filename, + args.simple, + args.test) + read_indexes = list(range(len(dd.reads))) + min_indexes = dd.ddmin(read_indexes) + dd.test(min_indexes) + logger.info('Done.') + except Exception as ex: + logger.error('Failed.', exc_info=ex) + + +if __name__ == '__main__': + main(sys.argv[1:]) From 2dfcd18f9cf5c967aa67596c883401a74a502e22 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 11 Oct 2023 13:48:32 -0700 Subject: [PATCH 005/485] Improve aln2counts simplify logging --- micall/utils/aln2counts_simplify.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/micall/utils/aln2counts_simplify.py b/micall/utils/aln2counts_simplify.py index cc5ccc472..492c48330 100644 --- a/micall/utils/aln2counts_simplify.py +++ b/micall/utils/aln2counts_simplify.py @@ -105,6 +105,19 @@ def write_simple_aligned(self, filename, read_indexes): for line in selected_reads: f.write(line) + def coerce(self, c): + if c is None: + return 'None' + blocks = [] # [[first, last]] indexes for all contiguous blocks + for i in c: + if (not blocks) or blocks[-1][-1] != i-1: + blocks.append([i, i]) + else: + blocks[-1][-1] = i + return '[' + ', '.join(str(block[0]) if block[0] == block[1] + else '{}-{}'.format(*block) + for block in blocks) + ']' + def read_aligned(filename): """ Load all the reads from an aligned reads file into a dictionary. 
From ae78622aab36d1916355f56a0361e8ac68d0bab1 Mon Sep 17 00:00:00 2001 From: Donaim <15235025+Donaim@users.noreply.github.com> Date: Wed, 11 Oct 2023 14:20:59 -0700 Subject: [PATCH 006/485] Fix wording in aln2counts_simplify.py Co-authored-by: Don Kirkby --- micall/utils/aln2counts_simplify.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/utils/aln2counts_simplify.py b/micall/utils/aln2counts_simplify.py index 492c48330..16a1eb23b 100644 --- a/micall/utils/aln2counts_simplify.py +++ b/micall/utils/aln2counts_simplify.py @@ -69,8 +69,8 @@ def writer(filename): exception = None except Exception as ex: - logger.warning(f'Assembly failed: {ex!r}.', exc_info=True) - print(f'Assembly failed: {ex!r}.') + logger.warning(f'Read counting failed: {ex!r}.', exc_info=True) + print(f'Read counting failed: {ex!r}.') exception = ex output.seek(0) From 9981c010f8943b61d7d58a8159a305e71ada0186 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 11 Oct 2023 14:14:11 -0700 Subject: [PATCH 007/485] Make aln2counts simplifier save all best inputs and outputs --- micall/utils/aln2counts_simplify.py | 59 ++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/micall/utils/aln2counts_simplify.py b/micall/utils/aln2counts_simplify.py index 16a1eb23b..5bcccda18 100644 --- a/micall/utils/aln2counts_simplify.py +++ b/micall/utils/aln2counts_simplify.py @@ -4,6 +4,8 @@ import logging import os from csv import DictReader +import typing +from typing import Union from micall.core.trim_fastqs import trim from micall.utils.dd import DD @@ -38,8 +40,6 @@ def __init__(self, super(MicallDD, self).__init__() self.filename = filename self.simple = simple - base, ext = os.path.splitext(simple) - self.best = base + '_best' + ext self.get_result = getattr(self, 'check_' + test_name) self.reads = read_aligned(self.filename) @@ -52,19 +52,26 @@ def writer(filename): return open(os.path.join(workdir, filename), 'w+') with open(self.simple, 
'r') as aligned_csv, \ - writer('stitched.csv') as output: + writer('nuc.csv') as nuc_csv, \ + writer('amino.csv') as amino_csv, \ + writer('insertions.csv') as insertions_csv, \ + writer('conseq.csv') as conseq_csv, \ + writer('failed_align.csv') as failed_align_csv, \ + writer('nuc_detail.csv') as nuc_detail_csv, \ + writer('stitched.csv') as stitched_csv: + # noinspection PyBroadException try: aln2counts(aligned_csv, # TODO: maybe redirect to os.devnull instead. - writer('nuc.csv'), - writer('amino.csv'), - writer('insertions.csv'), - writer('conseq.csv'), - writer('failed_align.csv'), + nuc_csv, + amino_csv, + insertions_csv, + conseq_csv, + failed_align_csv, # --- # - conseq_stitched_csv=output, - nuc_detail_csv=writer("nuc_detail.csv"), + nuc_detail_csv=nuc_detail_csv, + conseq_stitched_csv=stitched_csv, ) exception = None @@ -73,18 +80,25 @@ def writer(filename): print(f'Read counting failed: {ex!r}.') exception = ex - output.seek(0) - result = self.get_result(output, read_count, exception) + stitched_csv.seek(0) + result = self.get_result(stitched_csv, read_count, exception) if result == DD.FAIL: - os.rename(self.simple, self.best) + save_best(aligned_csv) + save_best(nuc_csv) + save_best(amino_csv) + save_best(insertions_csv) + save_best(conseq_csv) + save_best(failed_align_csv) + save_best(stitched_csv) + return result @staticmethod - def check_subseq(output, read_count, exception): + def check_subseq(stitched_csv, read_count, exception): if exception is not None: return DD.UNRESOLVED - simple_count = len(output.readlines()) - 1 + simple_count = len(stitched_csv.readlines()) - 1 logger.debug('Result: %d simplified reads from %d selected reads.', simple_count, @@ -93,8 +107,8 @@ def check_subseq(output, read_count, exception): expected_substring = os.environ.get('MICALL_DD_SUBSEQ', None) if expected_substring is None: raise RuntimeError(f"Expected ${'MICALL_DD_SUBSEQ'!r} environment value to be set for the {'subseq'!r} test") - output.seek(0) - success = 
any((expected_substring in line) for line in output) + stitched_csv.seek(0) + success = any((expected_substring in line) for line in stitched_csv) return DD.FAIL if success else DD.PASS @@ -119,6 +133,17 @@ def coerce(self, c): for block in blocks) + ']' +def save_best(file: Union[str, '_io.TextIOWrapper']): + """ Save the current best version of a file. + """ + + filename = file if type(file) is str else file.name + base, ext = os.path.splitext(filename) + best = base + '_best' + ext + + os.rename(filename, best) + + def read_aligned(filename): """ Load all the reads from an aligned reads file into a dictionary. From cd60a552ed1b730439295ad0e52a6ce9892d0833 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 11 Oct 2023 14:28:31 -0700 Subject: [PATCH 008/485] Factor out commont string constants in aln2counts_simplify --- micall/utils/aln2counts_simplify.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/micall/utils/aln2counts_simplify.py b/micall/utils/aln2counts_simplify.py index 5bcccda18..c40b20b11 100644 --- a/micall/utils/aln2counts_simplify.py +++ b/micall/utils/aln2counts_simplify.py @@ -13,6 +13,8 @@ logger = logging.getLogger(__name__) +ALIGNED_CSV_HEADER = 'refname,qcut,rank,count,offset,seq' +SUBSEQ_ENV_VARNAME = 'MICALL_DD_SUBSEQ' def parse_args(argv): parser = ArgumentParser( @@ -104,9 +106,9 @@ def check_subseq(stitched_csv, read_count, exception): simple_count, read_count) - expected_substring = os.environ.get('MICALL_DD_SUBSEQ', None) + expected_substring = os.environ.get(SUBSEQ_ENV_VARNAME, None) if expected_substring is None: - raise RuntimeError(f"Expected ${'MICALL_DD_SUBSEQ'!r} environment value to be set for the {'subseq'!r} test") + raise RuntimeError(f"Expected ${SUBSEQ_ENV_VARNAME!r} environment variable to be set for the {'subseq'!r} test") stitched_csv.seek(0) success = any((expected_substring in line) for line in stitched_csv) @@ -115,7 +117,8 @@ def check_subseq(stitched_csv, read_count, exception): 
def write_simple_aligned(self, filename, read_indexes): selected_reads = (self.reads[i] for i in read_indexes) with open(filename, 'w') as f: - f.write('refname,qcut,rank,count,offset,seq\n') + f.write(ALIGNED_CSV_HEADER) + f.write('\n') for line in selected_reads: f.write(line) @@ -155,7 +158,7 @@ def read_aligned(filename): header = next(f) # Sanity check that may detect instances where an incorrect file has been passed as input. - if header.strip() != 'refname,qcut,rank,count,offset,seq': + if header.strip() != ALIGNED_CSV_HEADER.strip(): raise ValueError(f'Aligned reads file {filename!r} does not start with a known header') return f.readlines() From 6ca68d5f17332230e62a2414e13627682471dad0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 11 Oct 2023 14:32:39 -0700 Subject: [PATCH 009/485] Cleanup small things in aln2counts_simplify.py --- micall/utils/aln2counts_simplify.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/micall/utils/aln2counts_simplify.py b/micall/utils/aln2counts_simplify.py index c40b20b11..983274ecc 100644 --- a/micall/utils/aln2counts_simplify.py +++ b/micall/utils/aln2counts_simplify.py @@ -4,7 +4,6 @@ import logging import os from csv import DictReader -import typing from typing import Union from micall.core.trim_fastqs import trim @@ -64,14 +63,14 @@ def writer(filename): # noinspection PyBroadException try: - aln2counts(aligned_csv, - # TODO: maybe redirect to os.devnull instead. 
+ aln2counts(# Inputs # + aligned_csv, nuc_csv, amino_csv, insertions_csv, conseq_csv, failed_align_csv, - # --- # + # Outputs # nuc_detail_csv=nuc_detail_csv, conseq_stitched_csv=stitched_csv, ) @@ -79,7 +78,6 @@ def writer(filename): exception = None except Exception as ex: logger.warning(f'Read counting failed: {ex!r}.', exc_info=True) - print(f'Read counting failed: {ex!r}.') exception = ex stitched_csv.seek(0) @@ -102,13 +100,13 @@ def check_subseq(stitched_csv, read_count, exception): simple_count = len(stitched_csv.readlines()) - 1 - logger.debug('Result: %d simplified reads from %d selected reads.', - simple_count, - read_count) + logger.debug('Result: %d stitched sequences from %d selected reads.', + simple_count, read_count) expected_substring = os.environ.get(SUBSEQ_ENV_VARNAME, None) if expected_substring is None: raise RuntimeError(f"Expected ${SUBSEQ_ENV_VARNAME!r} environment variable to be set for the {'subseq'!r} test") + stitched_csv.seek(0) success = any((expected_substring in line) for line in stitched_csv) From 9732ccbc8066c85b241fb0005a5e645b9d1a2afc Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 11 Oct 2023 14:42:27 -0700 Subject: [PATCH 010/485] Allow MICALL_DD_SUBSEQ to be a regex --- micall/utils/aln2counts_simplify.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/micall/utils/aln2counts_simplify.py b/micall/utils/aln2counts_simplify.py index 983274ecc..599ba6a64 100644 --- a/micall/utils/aln2counts_simplify.py +++ b/micall/utils/aln2counts_simplify.py @@ -3,6 +3,7 @@ from collections import defaultdict import logging import os +import re from csv import DictReader from typing import Union @@ -44,6 +45,12 @@ def __init__(self, self.get_result = getattr(self, 'check_' + test_name) self.reads = read_aligned(self.filename) + expected_subsequence = os.environ.get(SUBSEQ_ENV_VARNAME, None) + if expected_subsequence is None: + raise RuntimeError(f"Expected ${SUBSEQ_ENV_VARNAME!r} environment variable 
to be set for the {'subseq'!r} test") + + self.expected_subsequence_re = re.compile(expected_subsequence) + def _test(self, read_indexes): read_count = len(read_indexes) self.write_simple_aligned(self.simple, read_indexes) @@ -93,8 +100,7 @@ def writer(filename): return result - @staticmethod - def check_subseq(stitched_csv, read_count, exception): + def check_subseq(self, stitched_csv, read_count, exception): if exception is not None: return DD.UNRESOLVED @@ -103,12 +109,8 @@ def check_subseq(stitched_csv, read_count, exception): logger.debug('Result: %d stitched sequences from %d selected reads.', simple_count, read_count) - expected_substring = os.environ.get(SUBSEQ_ENV_VARNAME, None) - if expected_substring is None: - raise RuntimeError(f"Expected ${SUBSEQ_ENV_VARNAME!r} environment variable to be set for the {'subseq'!r} test") - stitched_csv.seek(0) - success = any((expected_substring in line) for line in stitched_csv) + success = self.expected_subsequence_re.search(stitched_csv.read()) return DD.FAIL if success else DD.PASS From 89bc57956ea085bf0668a6022ae792d8da03ecf5 Mon Sep 17 00:00:00 2001 From: Donaim <15235025+Donaim@users.noreply.github.com> Date: Wed, 11 Oct 2023 15:51:27 -0700 Subject: [PATCH 011/485] Update micall/utils/aln2counts_simplify.py Co-authored-by: Don Kirkby --- micall/utils/aln2counts_simplify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/utils/aln2counts_simplify.py b/micall/utils/aln2counts_simplify.py index 599ba6a64..8a93f2bdc 100644 --- a/micall/utils/aln2counts_simplify.py +++ b/micall/utils/aln2counts_simplify.py @@ -47,7 +47,7 @@ def __init__(self, expected_subsequence = os.environ.get(SUBSEQ_ENV_VARNAME, None) if expected_subsequence is None: - raise RuntimeError(f"Expected ${SUBSEQ_ENV_VARNAME!r} environment variable to be set for the {'subseq'!r} test") + raise RuntimeError(f"Expected ${SUBSEQ_ENV_VARNAME!r} environment variable to be set for the 'subseq' test") self.expected_subsequence_re = 
re.compile(expected_subsequence) From 182c05054cd7475c35084d3d68a782f4484875eb Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Wed, 11 Oct 2023 21:45:59 +0000 Subject: [PATCH 012/485] Add devcontainer configuration. --- .devcontainer/Dockerfile | 7 +++++++ .devcontainer/devcontainer.json | 29 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 000000000..df3fb2490 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,7 @@ +FROM cfelab/micall:v7.15.13 + +# Uninstall some Python packages that don't upgrade correctly. +RUN python -m pip install --upgrade pip && \ + python -m pip uninstall -y python-Levenshtein Levenshtein genetracks + +ENTRYPOINT bash \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..ebd16c0b4 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,29 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/universal +{ + "name": "MiCall dev", + + // More info: https://containers.dev/guide/dockerfile + "build": { + "dockerfile": "Dockerfile" + }, + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "python -m pip install -r requirements-dev.txt", + + // Configure tool-specific properties. + "customizations": { + "vscode": { + "extensions": ["ms-python.python"] + } + } + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
+ // "remoteUser": "root" +} \ No newline at end of file From 058a3582245c075286e5c7ef3644a9a65b4c42b8 Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Fri, 13 Oct 2023 09:22:32 -0700 Subject: [PATCH 013/485] Force reinstall of pyvdrm in devcontainer. --- .devcontainer/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index df3fb2490..8eb70eabf 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -2,6 +2,6 @@ FROM cfelab/micall:v7.15.13 # Uninstall some Python packages that don't upgrade correctly. RUN python -m pip install --upgrade pip && \ - python -m pip uninstall -y python-Levenshtein Levenshtein genetracks + python -m pip uninstall -y python-Levenshtein Levenshtein genetracks pyvdrm ENTRYPOINT bash \ No newline at end of file From 2c4a70987b8d5492710fddada29aac1f725d4025 Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Fri, 13 Oct 2023 13:22:38 -0700 Subject: [PATCH 014/485] Upgrade matplotlib to avoid a bogus warning. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3da10e7cf..8941a9e11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2 numpy==1.24.3 scipy==1.10.1 -matplotlib==3.7.2 +matplotlib==3.7.3 cutadapt==3.7 python-Levenshtein==0.21.1 PyYAML==6.0.1 From 1eea726e7735b837ba0f37797d43cd38258cf5d4 Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Fri, 13 Oct 2023 14:19:11 -0700 Subject: [PATCH 015/485] Tweak layout option in concordance plots. 
--- micall/core/coverage_plots.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/micall/core/coverage_plots.py b/micall/core/coverage_plots.py index 5773717f5..c225d36dc 100644 --- a/micall/core/coverage_plots.py +++ b/micall/core/coverage_plots.py @@ -223,7 +223,7 @@ def concordance_plot(concordance_csv, plot_path=None, filetype='png', concordanc if plot_path is None: plot_path, _ = os.path.split(concordance_csv.name) reader = DictReader(concordance_csv) - fig, ax = plt.subplots(figsize=(4, 3), dpi=100) + fig, ax = plt.subplots(figsize=(4, 3), dpi=100, layout='tight') paths = [] for (reference, region), group in itertools.groupby(reader, itemgetter('reference', 'region')): @@ -250,7 +250,6 @@ def concordance_plot(concordance_csv, plot_path=None, filetype='png', concordanc plt.ylim([0, 110]) plt.xlabel('Reference coordinates (AA)', fontsize=9) plt.ylabel('20-base window average', fontsize=9) - plt.tight_layout() figname_parts = ['concordance', reference, region, filetype] if concordance_prefix: figname_parts.insert(0, concordance_prefix) From 2d270967ae26f552d086f8a848a45359b448f6ef Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 16 Oct 2023 15:19:38 -0700 Subject: [PATCH 016/485] Use consensus for breaking conflicts in stitching When combining two regions during final stitching, various mismatches can occur due to the fact that those regions are aligned separately. This, in turn, can introduce various defects into the resulting sequence. This change tries to improve conflict resolution by relying on query positions from the initial consensus sequence of the two regions. 
--- micall/core/aln2counts.py | 69 ++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py index ee20ea053..b2d879d8f 100755 --- a/micall/core/aln2counts.py +++ b/micall/core/aln2counts.py @@ -173,23 +173,74 @@ def get_insertion_info(left, report_aminos, report_nucleotides): return insert_behind, insertion_coverage + +def build_consensus_table(nuc_dict, region_nucleotides, region_start): + """Builds a consensus table with pairs of existing and new nucleotides that share a consensus index.""" + + consensus_nucleotides = defaultdict(lambda: {'left': None, 'right': None}) + + for nuc_index, nuclotide in enumerate(region_nucleotides): + position = region_start + nuc_index - 1 + existing_nucleotide = nuc_dict.get(position, None) + new_nucleotide = nuclotide.seed_nucleotide + + if existing_nucleotide and existing_nucleotide.consensus_index is not None: + consensus_nucleotides[existing_nucleotide.consensus_index]['left'] = existing_nucleotide + if new_nucleotide.consensus_index is not None: + consensus_nucleotides[new_nucleotide.consensus_index]['right'] = new_nucleotide + + return consensus_nucleotides + + +def resolve_nucleotide_conflict(left, right, nuc_index, overlap_midpoint, is_amino): + """Resolves a conflict between two nucleotides based on the regions overlap midpoint.""" + + if left and right: + # we prefer the next region after the middle of the overlap with the previous translated region for an amino nucleotide + if nuc_index >= overlap_midpoint and is_amino: + return right + else: + return left + elif left: + return left + elif right: + return right + + def combine_region_nucleotides(nuc_dict, region_nucleotides, region_start, prev_region_end, is_amino, region_name): assert region_start is not None mismatch = False overlap_midpoint = int((prev_region_end - region_start + 1) / 2) if overlap_midpoint < 0: overlap_midpoint = 0 + + consensus_nucleotides = 
build_consensus_table(nuc_dict, region_nucleotides, region_start) + previous_consensus_index = None + for nuc_index, nucleotide in enumerate(region_nucleotides): position = region_start + nuc_index - 1 - if position not in nuc_dict: - # if we have not seen this position before, add it - nuc_dict[position] = nucleotide.seed_nucleotide - else: - if is_amino and nuc_index >= overlap_midpoint: - # after the middle of the overlap with the previous translated region, we use this region - nuc_dict[position] = nucleotide.seed_nucleotide - if nuc_dict[position].counts != nucleotide.seed_nucleotide.counts: - mismatch = True + existing_nucleotide = nuc_dict.get(position, None) + + def consensus_nucleotide(): + if previous_consensus_index is not None: + left = consensus_nucleotides[previous_consensus_index + 1]['left'] + right = consensus_nucleotides[previous_consensus_index + 1]['right'] + return resolve_nucleotide_conflict(left, right, + nuc_index, overlap_midpoint, + is_amino) + + def region_nucleotide(): + return resolve_nucleotide_conflict(existing_nucleotide, + nucleotide.seed_nucleotide, + nuc_index, overlap_midpoint, + is_amino) + + nuc_dict[position] = consensus_nucleotide() or region_nucleotide() + if existing_nucleotide and existing_nucleotide.counts != nuc_dict[position].counts: + mismatch = True + + previous_consensus_index = nuc_dict[position].consensus_index + if mismatch: logger.debug(f"Disagreement in counts for region {region_name}") From 8681e50940587021664296982bb3518597fc39ad Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 18 Oct 2023 10:03:47 -0700 Subject: [PATCH 017/485] Revert "Use consensus for breaking conflicts in stitching" This reverts commit 2d270967ae26f552d086f8a848a45359b448f6ef. 
--- micall/core/aln2counts.py | 69 +++++---------------------------------- 1 file changed, 9 insertions(+), 60 deletions(-) diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py index b2d879d8f..ee20ea053 100755 --- a/micall/core/aln2counts.py +++ b/micall/core/aln2counts.py @@ -173,74 +173,23 @@ def get_insertion_info(left, report_aminos, report_nucleotides): return insert_behind, insertion_coverage - -def build_consensus_table(nuc_dict, region_nucleotides, region_start): - """Builds a consensus table with pairs of existing and new nucleotides that share a consensus index.""" - - consensus_nucleotides = defaultdict(lambda: {'left': None, 'right': None}) - - for nuc_index, nuclotide in enumerate(region_nucleotides): - position = region_start + nuc_index - 1 - existing_nucleotide = nuc_dict.get(position, None) - new_nucleotide = nuclotide.seed_nucleotide - - if existing_nucleotide and existing_nucleotide.consensus_index is not None: - consensus_nucleotides[existing_nucleotide.consensus_index]['left'] = existing_nucleotide - if new_nucleotide.consensus_index is not None: - consensus_nucleotides[new_nucleotide.consensus_index]['right'] = new_nucleotide - - return consensus_nucleotides - - -def resolve_nucleotide_conflict(left, right, nuc_index, overlap_midpoint, is_amino): - """Resolves a conflict between two nucleotides based on the regions overlap midpoint.""" - - if left and right: - # we prefer the next region after the middle of the overlap with the previous translated region for an amino nucleotide - if nuc_index >= overlap_midpoint and is_amino: - return right - else: - return left - elif left: - return left - elif right: - return right - - def combine_region_nucleotides(nuc_dict, region_nucleotides, region_start, prev_region_end, is_amino, region_name): assert region_start is not None mismatch = False overlap_midpoint = int((prev_region_end - region_start + 1) / 2) if overlap_midpoint < 0: overlap_midpoint = 0 - - consensus_nucleotides = 
build_consensus_table(nuc_dict, region_nucleotides, region_start) - previous_consensus_index = None - for nuc_index, nucleotide in enumerate(region_nucleotides): position = region_start + nuc_index - 1 - existing_nucleotide = nuc_dict.get(position, None) - - def consensus_nucleotide(): - if previous_consensus_index is not None: - left = consensus_nucleotides[previous_consensus_index + 1]['left'] - right = consensus_nucleotides[previous_consensus_index + 1]['right'] - return resolve_nucleotide_conflict(left, right, - nuc_index, overlap_midpoint, - is_amino) - - def region_nucleotide(): - return resolve_nucleotide_conflict(existing_nucleotide, - nucleotide.seed_nucleotide, - nuc_index, overlap_midpoint, - is_amino) - - nuc_dict[position] = consensus_nucleotide() or region_nucleotide() - if existing_nucleotide and existing_nucleotide.counts != nuc_dict[position].counts: - mismatch = True - - previous_consensus_index = nuc_dict[position].consensus_index - + if position not in nuc_dict: + # if we have not seen this position before, add it + nuc_dict[position] = nucleotide.seed_nucleotide + else: + if is_amino and nuc_index >= overlap_midpoint: + # after the middle of the overlap with the previous translated region, we use this region + nuc_dict[position] = nucleotide.seed_nucleotide + if nuc_dict[position].counts != nucleotide.seed_nucleotide.counts: + mismatch = True if mismatch: logger.debug(f"Disagreement in counts for region {region_name}") From 70e3f37865b456557c5bf502152919e0155449f7 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 1 Nov 2023 09:28:22 -0700 Subject: [PATCH 018/485] Complete CIGAR string definition --- micall/utils/consensus_aligner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index ab0bbdbc9..66e9ca6bf 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -23,9 +23,10 @@ # Most codons in an insertion or 
deletion that is still aligned in amino acids. MAXIMUM_AMINO_GAP = 10 +# Mapping as defined in https://samtools.github.io/hts-specs/SAMv1.pdf, page 8 CigarActions = IntEnum( 'CigarActions', - 'MATCH INSERT DELETE SKIPPED SOFT_CLIPPED HARD_CLIPPED', + 'MATCH INSERT DELETE SKIPPED SOFT_CLIPPED HARD_CLIPPED PADDING SEQ_MATCH MISMATCH', start=0) From cde06c8533d56910979734dce5d7bbe436154a69 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 1 Nov 2023 09:28:22 -0700 Subject: [PATCH 019/485] Complete CIGAR string definition --- micall/utils/consensus_aligner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index ab0bbdbc9..66e9ca6bf 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -23,9 +23,10 @@ # Most codons in an insertion or deletion that is still aligned in amino acids. MAXIMUM_AMINO_GAP = 10 +# Mapping as defined in https://samtools.github.io/hts-specs/SAMv1.pdf, page 8 CigarActions = IntEnum( 'CigarActions', - 'MATCH INSERT DELETE SKIPPED SOFT_CLIPPED HARD_CLIPPED', + 'MATCH INSERT DELETE SKIPPED SOFT_CLIPPED HARD_CLIPPED PADDING SEQ_MATCH MISMATCH', start=0) From 062a800736c12cffe8c5b4848e09063b6f59b10d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 08:44:03 -0800 Subject: [PATCH 020/485] Add module for CIGAR strings handling Create a module for handling CIGAR strings and their related alignment formats. This includes functions for managing coordinate mapping between the query sequence and the reference sequence, as well as handling CIGAR strings. The added classes incorporate various methods to extend coordinates, convert them and translate them. It also includes a class for managing CIGAR hits, which includes functions to slice CIGAR operations, check for overlap, and converting operations to a multiple sequence alignment (MSA). 
This update helps to provide a more comprehensive set of tools for handling and interpreting CIGAR strings and alignments. --- micall/utils/cigar_tools.py | 475 ++++++++++++++++++++++++++++++++++++ 1 file changed, 475 insertions(+) create mode 100644 micall/utils/cigar_tools.py diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py new file mode 100644 index 000000000..cffab829d --- /dev/null +++ b/micall/utils/cigar_tools.py @@ -0,0 +1,475 @@ +""" +Module for handling CIGAR strings and related alignment formats. +""" + +from math import ceil, floor +import re +from typing import List, Tuple, Iterable, Optional +from collections import OrderedDict +from dataclasses import dataclass +import itertools +import copy +from functools import cached_property + +from micall.utils.consensus_aligner import CigarActions + + +CIGAR_OP_MAPPING = { + 'M': CigarActions.MATCH, + 'I': CigarActions.INSERT, + 'D': CigarActions.DELETE, + 'N': CigarActions.SKIPPED, + 'S': CigarActions.SOFT_CLIPPED, + 'H': CigarActions.HARD_CLIPPED, + 'P': CigarActions.PADDING, + '=': CigarActions.SEQ_MATCH, + 'X': CigarActions.MISMATCH, +} + + +def parse_cigar_operation(operation: str) -> CigarActions: + if operation in CIGAR_OP_MAPPING: + return CIGAR_OP_MAPPING[operation] + else: + raise ValueError(f"Unexpected CIGAR action: {operation}.") + + +def cigar_operation_to_str(op: CigarActions) -> str: + return [k for (k, v) in CIGAR_OP_MAPPING.items() if v == op][0] + + +class CoordinateMapping: + def __init__(self): + self.query_to_ref_d = {} + self.ref_to_query_d = {} + self.ref_to_op_d = {} + self.query_to_op_d = {} + + + def extend(self, + ref_index: Optional[int], + query_index: Optional[int], + op_index: Optional[int]): + + if ref_index is not None and query_index is not None: + self.ref_to_query_d[ref_index] = query_index + self.query_to_ref_d[query_index] = ref_index + + if op_index is not None: + if ref_index is not None: + self.ref_to_op_d[ref_index] = op_index + if query_index is 
not None: + self.query_to_op_d[query_index] = op_index + + + def ref_to_query(self, index) -> Optional[int]: + return self.ref_to_query_d.get(index, None) + + + def query_to_ref(self, index) -> Optional[int]: + return self.query_to_ref_d.get(index, None) + + + @staticmethod + def _find_closest_key(mapping: dict, index: int) -> int: + return min(mapping, key=lambda k: abs(mapping[k] - index)) + + + def ref_to_closest_query(self, index) -> int: + return CoordinateMapping._find_closest_key(self.query_to_ref_d, index) + + + def query_to_closest_ref(self, index) -> int: + return CoordinateMapping._find_closest_key(self.ref_to_query_d, index) + + + def ref_to_leftsup_query(self, index) -> Optional[int]: + left_neihbourhood = (k for (k, v) in self.query_to_ref_d.items() if v <= index) + return max(left_neihbourhood, default=None) + + + def ref_to_rightinf_query(self, index) -> Optional[int]: + right_neihbourhood = (k for (k, v) in self.query_to_ref_d.items() if index <= v) + return min(right_neihbourhood, default=None) + + + def ref_or_query_to_op(self, ref_index: int, query_index: int, conflict): + r = self.ref_to_op_d.get(ref_index, None) + q = self.query_to_op_d.get(query_index, None) + if r is not None and q is not None: + return conflict(r, q) + + return r if q is None else q + + + def translate_coordinates(self, reference_offset: int, query_offset: int) -> 'CoordinateMapping': + ret = CoordinateMapping() + + ret.ref_to_query_d = {k + reference_offset: v + query_offset for (k, v) in self.ref_to_query_d.items()} + ret.query_to_ref_d = {k + query_offset: v + reference_offset for (k, v) in self.query_to_ref_d.items()} + ret.ref_to_op_d = {k + reference_offset: v for (k, v) in self.ref_to_op_d.items()} + ret.query_to_op_d = {k + query_offset: v for (k, v) in self.query_to_op_d.items()} + + return ret + + +class Cigar(list): + """ + A CIGAR string represents a read alignment against a reference sequence. 
+ It is a run-length encoded sequence of alignment operations listed below: + + M: Alignment match (can be a sequence match or mismatch) + D: Deletion from the reference + I: Insertion to the reference + S: Soft clip on the read (ignored region, not aligned but present in the read) + H: Hard clip on the read (ignored region, not present in the read) + N: Skipped region from the reference + P: Padding (silent deletion from padded reference, not applicable for our case) + =: Sequence match + X: Sequence mismatch + + CIGAR strings are defined in the SAM specification + (https://samtools.github.io/hts-specs/SAMv1.pdf). + """ + + + def __init__(self, cigar_lst): + super().__init__([]) + for x in cigar_lst: self.append(x) + + + @staticmethod + def coerce(obj): + if isinstance(obj, Cigar): + return obj + + if isinstance(obj, str): + return Cigar.parse(obj) + + if isinstance(obj, list): + return Cigar(obj) + + raise TypeError(f"Cannot coerce {obj!r} to CIGAR string.") + + + @staticmethod + def parse(string): + data = [] + while string: + match = re.match(r'([0-9]+)([^0-9])', string) + if match: + num, operation = match.groups() + data.append([int(num), parse_cigar_operation(operation)]) + string = string[match.end():] + else: + raise ValueError(f"Invalid CIGAR string. 
Invalid part: {string[:20]}") + + return Cigar(data) + + + def append(self, item: Tuple[int, CigarActions]): + # Type checking + if not isinstance(item, list) and not isinstance(item, tuple): + raise ValueError(f"Invalid CIGAR list: {item!r} is not a tuple.") + if len(item) != 2: + raise ValueError(f"Invalid CIGAR list: {item!r} is has a bad length.") + + num, operation = item + if isinstance(operation, int): + operation = CigarActions(operation) + if not isinstance(num, int) or not isinstance(operation, CigarActions): + raise ValueError(f"Invalid CIGAR list: {item!r} is not a number/operation tuple.") + + # Normalization + if num == 0: + return + + if self: + last_num, last_operation = self[-1] + if operation == last_operation: + self[-1] = (last_num + num, operation) + return + + super().append((num, operation)) + + + def iterate_operations(self) -> Iterable[CigarActions]: + for num, operation in self: + for _ in range(num): + yield operation + + + def iterate_operations_with_pointers(self) -> Iterable[Tuple[CigarActions, Optional[int], Optional[int]]]: + ref_pointer = 0 + query_pointer = 0 + + for operation in self.iterate_operations(): + if operation in (CigarActions.MATCH, CigarActions.SEQ_MATCH, CigarActions.MISMATCH): + yield (operation, ref_pointer, query_pointer) + query_pointer += 1 + ref_pointer += 1 + + elif operation in (CigarActions.INSERT, CigarActions.SOFT_CLIPPED): + yield (operation, None, query_pointer) + query_pointer += 1 + + elif operation in (CigarActions.DELETE, CigarActions.SKIPPED): + yield (operation, ref_pointer, None) + ref_pointer += 1 + + else: + yield (operation, None, None) + + + @cached_property + def query_length(self): + return max((query_pointer + 1 if query_pointer is not None else 0 for (_, _, query_pointer) + in self.iterate_operations_with_pointers()), + default=0) + + + @cached_property + def ref_length(self): + return max((ref_pointer + 1 if ref_pointer is not None else 0 for (_, ref_pointer, _) + in 
self.iterate_operations_with_pointers()), + default=0) + + + def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': + return Cigar([(1, op) for op in self.iterate_operations()] + [start_inclusive:end_noninclusive]) + + + @cached_property + def coordinate_mapping(self) -> CoordinateMapping: + """ + Convert a CIGAR string to coordinate mapping representing a reference-to-query and query-to-reference coordinate mappings. + TODO: describe the domains and holes. + + :param cigar: a CIGAR string. + + :return: Lists of integers representing the mappings of coordinates from the reference + sequence to the query sequence, and back. + """ + + mapping = CoordinateMapping() + + for op_pointer, (operation, ref_pointer, query_pointer) in enumerate(self.iterate_operations_with_pointers()): + mapping.extend(ref_pointer, + query_pointer, + op_pointer) + + return mapping + + + def to_msa(self, reference_seq, query_seq) -> Tuple[str, str]: + reference_msa = '' + query_msa = '' + + for operation, ref_pointer, query_pointer in self.iterate_operations_with_pointers(): + if ref_pointer is None and query_pointer is None: + continue + + try: + if ref_pointer is not None: + reference_msa += reference_seq[ref_pointer] + else: + reference_msa += '-' + + if query_pointer is not None: + query_msa += query_seq[query_pointer] + else: + query_msa += '-' + + except IndexError: + raise ValueError("CIGAR string corresponds to a larger match than either reference or query.") + + return reference_msa, query_msa + + + def __repr__(self): + return f'Cigar({str(self)!r})' + + + def __str__(self): + """ Inverse of Cigar.parse """ + return ''.join('{}{}'.format(num, cigar_operation_to_str(op)) for num, op in self) + + +@dataclass +class CigarHit: + cigar: Cigar + r_st: int + r_ei: int # inclusive + q_st: int + q_ei: int # inclusive + + + def __post_init__(self): + self.cigar = Cigar.coerce(self.cigar) + + if self.r_len != self.cigar.ref_length: + raise ValueError(f"CIGAR string maps 
{self.cigar.ref_length}" + f" reference positions, but CIGAR hit range is {self.r_len}") + + if self.q_len != self.cigar.query_length: + raise ValueError(f"CIGAR string maps {self.cigar.query_length}" + f" query positions, but CIGAR hit range is {self.q_len}") + + + @property + def r_len(self): + return self.r_ei + 1 - self.r_st + + + @property + def q_len(self): + return self.q_ei + 1 - self.q_st + + + def overlaps(self, other) -> bool: + def intervals_overlap(x, y): + """ Check if two intervals [x0, x1] and [y0, y1] overlap. """ + return x[0] <= y[1] and x[1] >= y[0] + + return intervals_overlap((self.r_st, self.r_ei), (other.r_st, other.r_ei)) \ + or intervals_overlap((self.q_st, self.q_ei), (other.q_st, other.q_ei)) + + + def __add__(self, other): + """ + Inserts deletions/insertions between self and other, + then ajusts boundaries appropriately. + """ + + if self.overlaps(other): + raise ValueError("Cannot combine overlapping CIGAR hits") + + if (self.r_st, self.r_ei) < (other.r_st, other.r_ei): + # Note: in cases where one CigarHit is empty, comparing only by a single coordiate is not sufficient. 
+ left = self + right = other + else: + left = other + right = self + + cigar = left.cigar \ + + Cigar.coerce([(right.r_st - left.r_ei - 1, CigarActions.DELETE)]) \ + + Cigar.coerce([(right.q_st - left.q_ei - 1, CigarActions.INSERT)]) \ + + right.cigar + + return CigarHit(cigar=cigar, + r_st=left.r_st, + r_ei=right.r_ei, + q_st=left.q_st, + q_ei=right.q_ei, + ) + + + def _slice(self, r_st, r_ei, q_st, q_ei) -> 'CigarHit': + mapping = self.coordinate_mapping + + o_st = mapping.ref_or_query_to_op(r_st, q_st, min) + o_ei = mapping.ref_or_query_to_op(r_ei, q_ei, max) + if o_st is None or o_ei is None: + cigar = Cigar([]) + else: + cigar = self.cigar.slice_operations(o_st, o_ei + 1) + + return CigarHit(cigar=cigar, + r_st = r_st, + r_ei = r_ei, + q_st = q_st, + q_ei = q_ei, + ) + + + def _ref_cut_to_query_cut(self, cut_point: float): + mapping = self.coordinate_mapping + + left_query_cut_point = mapping.ref_to_leftsup_query(floor(cut_point)) + right_query_cut_point = mapping.ref_to_rightinf_query(ceil(cut_point)) + + if left_query_cut_point is None: + return self.q_st - 0.1 + if right_query_cut_point is None: + return self.q_ei + 0.1 + + lerp = lambda start, end, t: (1 - t) * start + t * end + query_cut_point = lerp(left_query_cut_point, right_query_cut_point, + cut_point - floor(cut_point)) + + if float(query_cut_point).is_integer(): + # Disambiguate to the right. + query_cut_point += 0.1 / (self.r_st + self.r_ei + self.q_st + self.q_ei) + + return query_cut_point + + + def cut_reference(self, cut_point: float) -> 'CigarHit': + """ + Splits alignment in two parts such that cut_point is in between. + Guarantees that the two parts do not share any elements, + and that no element is lost. 
+ """ + + if float(cut_point).is_integer(): + raise ValueError("Cut accepts fractions, not integers") + + if self.r_len == 0 or \ + not (self.r_st - 1 < cut_point < self.r_ei + 1): + raise IndexError("Cut point out of reference bounds") + + query_cut_point = self._ref_cut_to_query_cut(cut_point) + assert (self.q_st - 1 <= query_cut_point <= self.q_ei + 1) + + left = self._slice(self.r_st, floor(cut_point), + self.q_st, floor(query_cut_point)) + right = self._slice(ceil(cut_point), self.r_ei, + ceil(query_cut_point), self.q_ei) + + return left, right + + + @cached_property + def coordinate_mapping(self) -> CoordinateMapping: + return self.cigar.coordinate_mapping.translate_coordinates(self.r_st, self.q_st) + + + def to_msa(self, reference_seq: str, query_seq: str) -> Tuple[str, str]: + return self.cigar.to_msa(reference_seq[self.r_st:], query_seq[self.q_st:]) + + + def __repr__(self): + return f'CigarHit({str(self.cigar)!r}, r_st={self.r_st!r}, r_ei={self.r_ei!r}, q_st={self.q_st!r}, q_ei={self.q_ei!r})' + + +def connect_cigar_hits(cigar_hits: Iterable[CigarHit]) -> CigarHit: + """ + This function exists to deal with the fact that mappy does not always + connect big gaps, and returns surrounding parts as two separate alignment hits. + + For those cases we simply connect all the parts that do not overlap. + + Order of cigar_hits matters because we ignore alignments + that overlap with previously found alignments. + """ + + if not len(cigar_hits) > 0: + raise ValueError("Expected a non-empty list of cigar hits") + + accumulator = [] + + # Collect non-overlaping parts. + # Earlier matches have priority over ones that come after. + for hit in cigar_hits: + if any(earlier.overlaps(hit) for earlier in accumulator): + continue + + accumulator.append(hit) + + # Sort by interval start positions. + sorted_parts = sorted(accumulator, key=lambda p: p.r_st) + + # Collect all intervals back together, connecting them with CigarActions.DELETE. 
+ return sum(sorted_parts[1:], start=sorted_parts[0]) From 647ad391937e66629ef61f247ef74f89b6ecc43b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 08:47:03 -0800 Subject: [PATCH 021/485] Add initial unit tests for CIGAR module --- micall/tests/test_cigar_tools.py | 278 +++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 micall/tests/test_cigar_tools.py diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py new file mode 100644 index 000000000..05eff8db8 --- /dev/null +++ b/micall/tests/test_cigar_tools.py @@ -0,0 +1,278 @@ +import pytest +from typing import List, Tuple + +from micall.utils.consensus_aligner import CigarActions +from micall.utils.cigar_tools import Cigar, CigarHit + + +cigar_mapping_cases: List[Tuple[Cigar, 'mapping', 'closest_mapping']] = [ + # Simple cases + ('3M', {0: 0, 1: 1, 2: 2}, + {0: 0, 1: 1, 2: 2}), + ('1M1D1M', {0: 0, 2: 1}, + {0: 0, 1: 0, 2: 1}), + ('1M1I1M', {0: 0, 1: 2}, + {0: 0, 1: 2}), + ('2M2D2M', {0: 0, 1: 1, 4: 2, 5: 3}, + {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3}), + ('2M2I2M', {0: 0, 1: 1, 2: 4, 3: 5}, + {0: 0, 1: 1, 2: 4, 3: 5}), + ('3M1D3M', {0: 0, 1: 1, 2: 2, 4: 3, 5: 4, 6: 5}, + {0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 5}), + ('3M1I3M', {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6}, + {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6}), + ('7M1I3M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}, + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}), + ('5M2D4M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 7: 5, 8: 6, 9: 7, 10: 8}, + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 5, 7: 5, 8: 6, 9: 7, 10: 8}), + ('5M3I4M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}, + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}), + ('1M1D', {0: 0}, + {0: 0}), + ('1M1I', {0: 0}, + {0: 0}), + ('1I1M', {0: 1}, + {0: 1}), + ('1D1M', {1: 0}, + {1: 0}), + + # Multiple deletions and insertions + ('2M2D2M2I2M', {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, + {0: 0, 1: 1, 2: 1, 
3: 2, 4: 2, 5: 3, 6: 6, 7: 7}), + ('2M2I2M2D2M', {0: 0, 1: 1, 2: 4, 3: 5, 6: 6, 7: 7}, + {0: 0, 1: 1, 2: 4, 3: 5, 4: 5, 5: 6, 6: 6, 7: 7}), + ('2=1X2N1N2=1H2S', {0: 0, 1: 1, 2: 2, 6: 3, 7: 4}, + {0: 0, 1: 1, 2: 2, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4}), + ('2M2D2M2I2M', {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, + {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 6, 7: 7}), + ('3=1X2N1N2=1H2S', {0: 0, 1: 1, 2: 2, 3: 3, 7: 4, 8: 5}, + {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 4, 7: 4, 8: 5}), + + # Edge cases + ('', {}, ValueError()), + ('12I', {}, ValueError()), + ('12D', {}, ValueError()), +] + + +@pytest.mark.parametrize("cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases]) +def test_cigar_to_coordinate_mapping(cigar_str, expected_mapping): + mapping = Cigar.coerce(cigar_str).coordinate_mapping + + assert expected_mapping == mapping.ref_to_query_d + assert expected_mapping == {i: mapping.ref_to_query(i) for i in mapping.ref_to_query_d} + + +@pytest.mark.parametrize("cigar_str", [x[0] for x in cigar_mapping_cases]) +def test_cigar_to_coordinate_bijection_property(cigar_str): + inverse = lambda d: {v: k for k, v in d.items()} + + mapping = Cigar.coerce(cigar_str).coordinate_mapping + + assert mapping.query_to_ref_d == inverse(mapping.ref_to_query_d) + assert mapping.ref_to_query_d == inverse(mapping.query_to_ref_d) + assert mapping.ref_to_query_d == inverse(inverse(mapping.ref_to_query_d)) + assert mapping.query_to_ref_d == inverse(inverse(mapping.query_to_ref_d)) + + +@pytest.mark.parametrize("cigar_str, expected_closest_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) +def test_cigar_to_closest_coordinate_mapping(cigar_str, expected_closest_mapping): + mapping = Cigar.coerce(cigar_str).coordinate_mapping + + if isinstance(expected_closest_mapping, Exception): + with pytest.raises(type(expected_closest_mapping)): + mapping.ref_to_closest_query(0) + + else: + fullrange = {i: mapping.ref_to_closest_query(i) \ + for i in range(min(mapping.ref_to_query_d), 1 + 
max(mapping.ref_to_query_d))} + assert expected_closest_mapping == fullrange + + +@pytest.mark.parametrize("cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases]) +def test_cigar_hit_to_coordinate_mapping(cigar_str, expected_mapping): + cigar = Cigar.coerce(cigar_str) + hit = CigarHit(cigar, r_st=5, r_ei=(5 + cigar.ref_length - 1), q_st=7, q_ei=(7 + cigar.query_length - 1)) + mapping = hit.coordinate_mapping + + # Coordinates are translated by q_st and r_st. + expected_mapping = {k + hit.r_st: v + hit.q_st for (k, v) in expected_mapping.items()} + assert mapping.ref_to_query(0) == None + assert mapping.query_to_ref(0) == None + assert expected_mapping \ + == {i: mapping.ref_to_query(i) for i in mapping.ref_to_query_d} + + +@pytest.mark.parametrize("cigar_str, expected_closest_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) +def test_cigar_hit_to_coordinate_closest_mapping(cigar_str, expected_closest_mapping): + cigar = Cigar.coerce(cigar_str) + hit = CigarHit(cigar, r_st=5, r_ei=(5 + cigar.ref_length - 1), q_st=7, q_ei=(7 + cigar.query_length - 1)) + mapping = hit.coordinate_mapping + + if isinstance(expected_closest_mapping, Exception): + with pytest.raises(type(expected_closest_mapping)): + mapping.ref_to_closest_query(0) + + else: + # Coordinates are translated by q_st and r_st. 
+ expected_closest_mapping = {k + hit.r_st: v + hit.q_st for (k, v) in expected_closest_mapping.items()} + fullrange = {i: mapping.ref_to_closest_query(i) \ + for i in range(min(mapping.ref_to_query_d), 1 + max(mapping.ref_to_query_d))} + assert expected_closest_mapping == fullrange + + +def test_invalid_operation_in_cigar_string(): + with pytest.raises(ValueError): + Cigar.coerce('3M1Z3M') # Z operation is not implemented + + +def test_invalid_operation_in_cigar_list(): + with pytest.raises(ValueError): + Cigar.coerce([(3, 42)]) # Operation code "42" does not exist + + +def test_invalid_cigar_string(): + with pytest.raises(ValueError): + Cigar.coerce('3MMMMMM3M') # Too many Ms + with pytest.raises(ValueError): + Cigar.coerce('3') # Not enough Ms + + +cigar_hit_ref_cut_cases = [ + # Trivial cases + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 3.5, + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('6M', r_st=4, r_ei=9, q_st=4, q_ei=9)]), + + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 4.5, + [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), + CigarHit('5M', r_st=5, r_ei=9, q_st=5, q_ei=9)]), + + (CigarHit('9M', r_st=0, r_ei=8, q_st=0, q_ei=8), 3.5, + [CigarHit('4M', r_st=0, r_ei=3, q_st=0, q_ei=3), + CigarHit('5M', r_st=4, r_ei=8, q_st=4, q_ei=8)]), + + # Simple cases + (CigarHit('9M9D9M', r_st=1, r_ei=27, q_st=1, q_ei=18), 3.5, + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('6M9D9M', r_st=4, r_ei=27, q_st=4, q_ei=18)]), + + (CigarHit('9M9D9M', r_st=1, r_ei=27, q_st=1, q_ei=18), 20.5, + [CigarHit('9M9D2M', r_st=1, r_ei=20, q_st=1, q_ei=11), + CigarHit('7M', r_st=21, r_ei=27, q_st=12, q_ei=18)]), + + (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 3.5, + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('6M9I9M', r_st=4, r_ei=18, q_st=4, q_ei=27)]), + + (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 13.5 or 27/2, + [CigarHit('9M9I4M', r_st=1, r_ei=13, q_st=1, q_ei=22), + CigarHit('5M', r_st=14, r_ei=18, 
q_st=23, q_ei=27)]), + + # Ambigous cases + (CigarHit('9M9D9M', r_st=1, r_ei=27, q_st=1, q_ei=18), 13.5 or 27/2, + [CigarHit('9M4D', r_st=1, r_ei=13, q_st=1, q_ei=9), + CigarHit('5D9M', r_st=14, r_ei=27, q_st=10, q_ei=18)]), + + (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 9.2, + [CigarHit('9M1I', r_st=1, r_ei=9, q_st=1, q_ei=10), + CigarHit('8I9M', r_st=10, r_ei=18, q_st=11, q_ei=27)]), + + # Edge cases + (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 9.5, # no middlepoint + [CigarHit('9M5I', r_st=1, r_ei=9, q_st=1, q_ei=14), + CigarHit('4I9M', r_st=10, r_ei=18, q_st=15, q_ei=27)]), + + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 8.5, # one is singleton + [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8), + CigarHit('1M', r_st=9, r_ei=9, q_st=9, q_ei=9)]), + + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 9.5, # one is empty + [CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), + CigarHit('', r_st=10, r_ei=9, q_st=10, q_ei=9)]), + + (CigarHit('7M', r_st=3, r_ei=9, q_st=3, q_ei=9), 2.5, # one is empty + [CigarHit('', r_st=3, r_ei=2, q_st=3, q_ei=2), + CigarHit('7M', r_st=3, r_ei=9, q_st=3, q_ei=9)]), + + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 0.5, # one is empty around 0 + [CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), + CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9)]), + + (CigarHit('9M', r_st=0, r_ei=8, q_st=0, q_ei=8), -0.5, # another one is empty and negative + [CigarHit('', r_st=0, r_ei=-1, q_st=0, q_ei=-1), + CigarHit('9M', r_st=0, r_ei=8, q_st=0, q_ei=8)]), + + (CigarHit('9D', r_st=1, r_ei=9, q_st=1, q_ei=0), 3.5, + [CigarHit('3D', r_st=1, r_ei=3, q_st=1, q_ei=0), + CigarHit('6D', r_st=4, r_ei=9, q_st=1, q_ei=0)]), + + (CigarHit('9D', r_st=0, r_ei=8, q_st=0, q_ei=-1), -0.5, + [CigarHit('', r_st=0, r_ei=-1, q_st=0, q_ei=-1), + CigarHit('9D', r_st=0, r_ei=8, q_st=0, q_ei=-1)]), + + (CigarHit('2=1X2N1N2=1H2S', r_st=1, r_ei=8, q_st=1, q_ei=7), 3.5, + [CigarHit('2=1X', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('3N2=1H2S', r_st=4, 
r_ei=8, q_st=4, q_ei=7)]), + + # Negative cases + (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 20.5, + IndexError("20.5 is bigger than reference (18)")), + + (CigarHit('', r_st=2, r_ei=1, q_st=2, q_ei=1), 2.5, + IndexError("Empty string cannot be cut")), + + (CigarHit('', r_st=2, r_ei=1, q_st=2, q_ei=1), 1.5, + IndexError("Empty string cannot be cut")), + + (CigarHit('9I', r_st=1, r_ei=0, q_st=1, q_ei=9), 3.5, + IndexError("Out of reference bounds")), + + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 4, + ValueError("Cut point must not be an integer")), + +] + +@pytest.mark.parametrize('hit, cut_point, expected_result', cigar_hit_ref_cut_cases) +def test_cigar_hit_ref_cut(hit, cut_point, expected_result): + if isinstance(expected_result, Exception): + with pytest.raises(type(expected_result)): + hit.cut_reference(cut_point) + + else: + expected_left, expected_right = expected_result + left, right = hit.cut_reference(cut_point) + assert expected_left == left + assert expected_right == right + + +@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception)]) +def test_cigar_hit_ref_cut_add_prop(hit, cut_point): + left, right = hit.cut_reference(cut_point) + assert left + right == hit == right + left + + +@pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ + ('ACTG', 'ACTG', '4M', 'ACTG', 'ACTG'), + ('ACTG', '', '4D', 'ACTG', '----'), + ('', 'ACTG', '4I', '----', 'ACTG'), + ('ACTGAC', 'ACAC', '2M2D2M', 'ACTGAC', 'AC--AC'), + ('ACAC', 'ACTGAC', '2M2I2M', 'AC--AC', 'ACTGAC'), + ('GCTATGGGAA', 'GCTATGGGAA', '5M3D2M', 'GCTATGGGAA', 'GCTAT---GG'), + ('ACTG', 'ACTG', '2M99H77P2M', 'ACTG', 'ACTG'), # Ignores non-consuming operations. 
+]) +def test_cigar_to_msa(reference_seq, query_seq, cigar, expected_reference, expected_query): + assert Cigar.coerce(cigar).to_msa(reference_seq, query_seq) \ + == (expected_reference, expected_query) + + +@pytest.mark.parametrize("cigar, reference_seq, query_seq", [ + ('10M', 'A' * 3, 'A' * 10), # reference is shorter than CIGAR + ('10M', 'A' * 10, 'A' * 3), # query is shorter than CIGAR + ('10D', 'A' * 3, 'A' * 3), + ('10I', 'A' * 3, 'A' * 3), +]) +def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): + with pytest.raises(ValueError): + Cigar.coerce(cigar).to_msa(reference_seq, query_seq) From 0164ba6ca2de85ddff9366a1f8b84d33c73b35af Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 08:47:22 -0800 Subject: [PATCH 022/485] Add initial implementation of the new contig stitcher --- micall/core/contig_stitcher.py | 202 +++++++++++++++++++++++++++ micall/tests/test_contig_stitcher.py | 25 ++++ 2 files changed, 227 insertions(+) create mode 100644 micall/core/contig_stitcher.py create mode 100644 micall/tests/test_contig_stitcher.py diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py new file mode 100644 index 000000000..f36234304 --- /dev/null +++ b/micall/core/contig_stitcher.py @@ -0,0 +1,202 @@ +import argparse +import logging +import os +from typing import Iterable, Optional, Tuple, List +from collections import Counter, deque +from csv import DictWriter, DictReader +from dataclasses import dataclass +from datetime import datetime +from glob import glob +from io import StringIO +from itertools import chain +from operator import itemgetter +from shutil import rmtree +from subprocess import run, PIPE, CalledProcessError, STDOUT +from tempfile import mkdtemp +from mappy import Aligner +from functools import cached_property +from gotoh import align_it + +from micall.utils.cigar_tools import connect_cigar_hits, CigarHit + + +@dataclass +class Contig: + name: str + seq: str + + +@dataclass +class 
GenotypedContig(Contig): + ref_name: str + ref_seq: str + matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. + + @property + def contig(self): + return self + +@dataclass +class AlignedContig: + contig: GenotypedContig + alignment: CigarHit + + def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: + """ Cuts this alignment in two parts with cut_point between them. """ + + alignment_left, alignment_right = self.alignment.cut_reference(cut_point) + return (AlignedContig(self.contig, alignment_left), + AlignedContig(self.contig, alignment_right)) + + + @cached_property + def msa(self): + return self.alignment.to_msa(self.contig.ref_seq, self.contig.seq) + + + @cached_property + def seq(self): + seq_left, ref_seq_left = self.msa + return ''.join((c for c in ref_seq_left if c != '-')) + + +class FrankensteinContig(AlignedContig): + """ Assembled of parts that were not even aligned together, + and of some parts that were not aligned at all. + Yet its .seq string looks like a real contig. 
""" + + def __init__(self, parts: List[GenotypedContig]): + self.parts = [subpart for part in parts for subpart in + (part.parts if isinstance(part, FrankensteinContig) else [part])] + + name = '+'.join(map(lambda acontig: acontig.contig.name, self.parts)) + ref = self.parts[0].contig + contig = GenotypedContig(name=name, seq=self.seq, + ref_name=ref.ref_name, + ref_seq=ref.ref_seq, + matched_fraction=ref.matched_fraction) + + alignment = connect_cigar_hits([part.alignment for part in self.parts + if isinstance(part, AlignedContig)]) + + super().__init__(contig, alignment) + + + @cached_property + def seq(self): + return ''.join(map(lambda part: part.seq, self.parts)) + + + +def align_to_reference(contig: GenotypedContig): + aligner = Aligner(seq=contig.ref_seq, preset='map-ont') + alignments = list(aligner.map(contig.seq)) + if not alignments: + return contig + + hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] + single_cigar_hit = connect_cigar_hits(hits_array) + return AlignedContig(contig=contig, alignment=single_cigar_hit) + + +def align_equal(seq1, seq2) -> Tuple[str, str]: + gap_open_penalty = 15 + gap_extend_penalty = 3 + use_terminal_gap_penalty = 1 + aseq1, aseq2, score = \ + align_it( + seq1, seq2, + gap_open_penalty, + gap_extend_penalty, + use_terminal_gap_penalty) + + return aseq1, aseq2 + + +def interval_contains(x, y): + """ Check if interval (x0, x1) contains interval (y0, y1). 
""" + return x[0] <= y[0] and x[1] >= y[1] + + +def find_all_overlapping_contigs(self, aligned_contigs): + for other in aligned_contigs: + if self.contig.ref_name != other.contig.ref_name: + continue + + if self.alignment.overlaps(other.alignment): + yield other + + +def find_overlapping_contig(self, aligned_contigs): + every = find_all_overlapping_contigs(self, aligned_contigs) + return max(every, key=lambda other: other.alignment.r_ei - other.alignment.r_st if other else 0, + default=None) + + +def calculate_concordance(left: str, right: str) -> Iterable[float]: + window_size = 10 + scores = deque([0] * window_size, maxlen=window_size) + scores_sum = 0 + result = [] + + assert len(left) == len(right), "Can only calculate concordance for same sized sequences" + + for (a, b) in zip(left, right): + current = a == b + scores_sum -= scores.popleft() + scores_sum += (a == b) + scores.append(current) + result.append(scores_sum / window_size) + + return result + + +def stitch_2_contigs(left, right): + # Cut in 4 parts. + left_remainder, left_overlap = left.cut_reference(right.alignment.r_st - 0.5) + right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) + + # Align overlapping parts, then recombine based on concordance. + aligned_left, aligned_right = align_equal(left_overlap.seq, right_overlap.seq) + concordance = calculate_concordance(aligned_left, aligned_right) + max_concordance_index = max(range(len(concordance)), + key=lambda i: concordance[i]) + aligned_left_part = aligned_left[:max_concordance_index] + aligned_right_part = aligned_right[max_concordance_index:] + overlap_seq = ''.join(c for c in aligned_left_part + aligned_right_part if c != '-') + + # Return something that can be fed back into the loop. 
+ overlap_contig = GenotypedContig(name=f'overlap({left.contig.name},{right.contig.name})', + seq=overlap_seq, ref_name=left.contig.ref_name, + ref_seq=left.contig.ref_seq, matched_fraction=None) + return FrankensteinContig([left_remainder, overlap_contig, right_remainder]) + + +def stitch_contigs(contigs: Iterable[GenotypedContig]): + aligned = list(map(align_to_reference, contigs)) + + # Contigs that did not align do not need any more processing + stitched = yield from (x for x in aligned if not isinstance(x, AlignedContig)) + aligned = [x for x in aligned if isinstance(x, AlignedContig)] + + while aligned: + # Going left-to-right through aligned parts. + current = min(aligned, key=lambda x: x.alignment.r_st) + aligned.remove(current) + + # Filter out all contigs that are contained within the current one. + # TODO: actually filter out if covered by multiple contigs + aligned = [x for x in aligned if not \ + interval_contains((current.alignment.r_st, current.alignment.r_ei), + (x.alignment.r_st, x.alignment.r_ei))] + + # Find overlap. If there isn't one - we are done with the current contig. 
+ overlapping_contig = find_overlapping_contig(current, aligned) + if not overlapping_contig: + yield current + continue + + # Get overlaping regions + new_contig = stitch_2_contigs(current, overlapping_contig) + aligned.remove(overlapping_contig) + aligned.append(new_contig) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py new file mode 100644 index 000000000..e330e66d5 --- /dev/null +++ b/micall/tests/test_contig_stitcher.py @@ -0,0 +1,25 @@ +import pytest +from micall.core.contig_stitcher import stitch_contigs, GenotypedContig + + +def test_3(): + ref_seq = 'A' * 100 + 'C' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 50 + 'C' * 20, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 20 + 'C' * 50, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + assert 100 == sum(len(x.seq) for x in result) + assert result[0].contig.name == 'a+overlap(a,b)+b' From d3c4aa7cb383bb24fe1d9e6b18e9b0927210758f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 08:47:42 -0800 Subject: [PATCH 023/485] Add tests for the new contig stitcher --- micall/tests/test_contig_stitcher.py | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index e330e66d5..7501dd6cd 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -2,6 +2,44 @@ from micall.core.contig_stitcher import stitch_contigs, GenotypedContig +def test_1(): + contigs = [ + GenotypedContig(name='a', + seq='ACTGACTG' * 100, + ref_name='testref', + ref_seq='ACTGACTG' * 100, + matched_fraction=1.0, + ), + ] + + result = list(stitch_contigs(contigs)) + assert sorted(map(lambda x: x.seq, contigs)) \ + == sorted(map(lambda x: x.seq, result)) + + +def test_2(): + ref_seq = 'A' * 100 + + contigs = [ + 
GenotypedContig(name='a', + seq=ref_seq, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='C' * 100, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + assert sorted(map(lambda x: x.seq, contigs)) \ + == sorted(map(lambda x: x.seq, result)) + + def test_3(): ref_seq = 'A' * 100 + 'C' * 100 From 3be6566f6a1c25d3de1cfdc23bdee986183339d2 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 09:05:09 -0800 Subject: [PATCH 024/485] Improve concordance calculation by scanning in both directions The updated function now creates a list with the same length as the input strings, initialized with zeros. The function then performs a moving window average comparison on the strings in both forward and reverse directions. This enhancement is designed to provide a more thorough and robust analysis of the sequence comparisons. Also add a docstring to it. --- micall/core/contig_stitcher.py | 42 +++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index f36234304..f4a057ec6 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -133,20 +133,40 @@ def find_overlapping_contig(self, aligned_contigs): default=None) -def calculate_concordance(left: str, right: str) -> Iterable[float]: - window_size = 10 - scores = deque([0] * window_size, maxlen=window_size) - scores_sum = 0 - result = [] +def calculate_concordance(left: str, right: str) -> List[float]: + """ + Calculate concordance for two given sequences using a sliding window method. + + The function compares the two strings from both left to right and then right to left, + calculating for each position the ratio of matching characters in a window around the + current position (10 characters to the left and right). 
+ + It's required that the input strings are of the same length. + + :param left: string representing first sequence + :param right: string representing second sequence + :return: list representing concordance ratio for each position + """ + + result = [0] * len(left) assert len(left) == len(right), "Can only calculate concordance for same sized sequences" - for (a, b) in zip(left, right): - current = a == b - scores_sum -= scores.popleft() - scores_sum += (a == b) - scores.append(current) - result.append(scores_sum / window_size) + def slide(left, right): + window_size = 10 + scores = deque([0] * window_size, maxlen=window_size) + scores_sum = 0 + + for i, (a, b) in enumerate(zip(left, right)): + current = a == b + scores_sum -= scores.popleft() + scores_sum += current + scores.append(current) + result[i] += scores_sum / window_size + + # Slide forward, then in reverse, adding the scores at each position. + slide(left, right) + slide(reversed(left), reversed(right)) return result From c6221d61afe05ca2ad779d1e22767f321596dad7 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 09:16:39 -0800 Subject: [PATCH 025/485] Contig stitcher: ensure the order of stitched contigs Instead of appending the newly stitched part to the end, prepend it at the start. This way we make sure that it will be processed on the next loop cycle. --- micall/core/contig_stitcher.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index f4a057ec6..8f33a7d2d 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -199,13 +199,14 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]): stitched = yield from (x for x in aligned if not isinstance(x, AlignedContig)) aligned = [x for x in aligned if isinstance(x, AlignedContig)] + # Going left-to-right through aligned contigs. 
+ aligned = list(sorted(aligned, key=lambda x: x.alignment.r_st)) while aligned: - # Going left-to-right through aligned parts. - current = min(aligned, key=lambda x: x.alignment.r_st) - aligned.remove(current) + current = aligned.pop(0) # Filter out all contigs that are contained within the current one. # TODO: actually filter out if covered by multiple contigs + # TODO: split contigs that have big gaps in them first, otherwise they will cover too much. aligned = [x for x in aligned if not \ interval_contains((current.alignment.r_st, current.alignment.r_ei), (x.alignment.r_st, x.alignment.r_ei))] @@ -216,7 +217,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]): yield current continue - # Get overlaping regions + # Replace two contigs by their stitched version, then loop with it. new_contig = stitch_2_contigs(current, overlapping_contig) aligned.remove(overlapping_contig) - aligned.append(new_contig) + aligned.insert(0, new_contig) From c2cba4a062374726cab319e7a5c031b4bc3cdbaf Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 09:21:13 -0800 Subject: [PATCH 026/485] Remove unused imports in cigar_tools and contig_sticher --- micall/core/contig_stitcher.py | 14 +------------- micall/utils/cigar_tools.py | 5 +---- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 8f33a7d2d..3d8c6aa08 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,18 +1,6 @@ -import argparse -import logging -import os from typing import Iterable, Optional, Tuple, List -from collections import Counter, deque -from csv import DictWriter, DictReader +from collections import deque from dataclasses import dataclass -from datetime import datetime -from glob import glob -from io import StringIO -from itertools import chain -from operator import itemgetter -from shutil import rmtree -from subprocess import run, PIPE, CalledProcessError, STDOUT -from tempfile 
import mkdtemp from mappy import Aligner from functools import cached_property from gotoh import align_it diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index cffab829d..f39653fc9 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -4,11 +4,8 @@ from math import ceil, floor import re -from typing import List, Tuple, Iterable, Optional -from collections import OrderedDict +from typing import Tuple, Iterable, Optional from dataclasses import dataclass -import itertools -import copy from functools import cached_property from micall.utils.consensus_aligner import CigarActions From 54266cb7a0db487bd179a69a448510b764c74b9f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 12:44:28 -0800 Subject: [PATCH 027/485] CigarHit: add translate method --- micall/utils/cigar_tools.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index f39653fc9..6085fb4c4 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -99,13 +99,13 @@ def ref_or_query_to_op(self, ref_index: int, query_index: int, conflict): return r if q is None else q - def translate_coordinates(self, reference_offset: int, query_offset: int) -> 'CoordinateMapping': + def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMapping': ret = CoordinateMapping() - ret.ref_to_query_d = {k + reference_offset: v + query_offset for (k, v) in self.ref_to_query_d.items()} - ret.query_to_ref_d = {k + query_offset: v + reference_offset for (k, v) in self.query_to_ref_d.items()} - ret.ref_to_op_d = {k + reference_offset: v for (k, v) in self.ref_to_op_d.items()} - ret.query_to_op_d = {k + query_offset: v for (k, v) in self.query_to_op_d.items()} + ret.ref_to_query_d = {k + reference_delta: v + query_delta for (k, v) in self.ref_to_query_d.items()} + ret.query_to_ref_d = {k + query_delta: v + reference_delta for (k, v) in 
self.query_to_ref_d.items()} + ret.ref_to_op_d = {k + reference_delta: v for (k, v) in self.ref_to_op_d.items()} + ret.query_to_op_d = {k + query_delta: v for (k, v) in self.query_to_op_d.items()} return ret @@ -430,13 +430,21 @@ def cut_reference(self, cut_point: float) -> 'CigarHit': @cached_property def coordinate_mapping(self) -> CoordinateMapping: - return self.cigar.coordinate_mapping.translate_coordinates(self.r_st, self.q_st) + return self.cigar.coordinate_mapping.translate(self.r_st, self.q_st) def to_msa(self, reference_seq: str, query_seq: str) -> Tuple[str, str]: return self.cigar.to_msa(reference_seq[self.r_st:], query_seq[self.q_st:]) + def translate(self, reference_delta: int, query_delta: int) -> 'CigarHit': + return CigarHit(cigar=self.cigar, + r_st=self.r_st + reference_delta, + r_ei=self.r_ei + reference_delta, + q_st=self.q_st + query_delta, + q_ei=self.q_ei + query_delta) + + def __repr__(self): return f'CigarHit({str(self.cigar)!r}, r_st={self.r_st!r}, r_ei={self.r_ei!r}, q_st={self.q_st!r}, q_ei={self.q_ei!r})' From ea48997ae5c1188ad750953c9b0e28d05d71ff87 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 15:16:12 -0800 Subject: [PATCH 028/485] Cigar: add type checking for negative op nums --- micall/utils/cigar_tools.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 6085fb4c4..d05767b88 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -176,6 +176,8 @@ def append(self, item: Tuple[int, CigarActions]): operation = CigarActions(operation) if not isinstance(num, int) or not isinstance(operation, CigarActions): raise ValueError(f"Invalid CIGAR list: {item!r} is not a number/operation tuple.") + if num < 0: + raise ValueError(f"Invalid CIGAR list: number of operations is negative.") # Normalization if num == 0: From a612a4c1ecfdd4765a8e1aa33f829b758ee0eb7c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 15:27:15 
-0800 Subject: [PATCH 029/485] Fix overlap check in contig stitcher --- micall/core/contig_stitcher.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 3d8c6aa08..d0828a521 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -106,12 +106,18 @@ def interval_contains(x, y): return x[0] <= y[0] and x[1] >= y[1] +def intervals_overlap(x, y): + """ Check if two intervals [x0, x1] and [y0, y1] overlap. """ + return x[0] <= y[1] and x[1] >= y[0] + + def find_all_overlapping_contigs(self, aligned_contigs): for other in aligned_contigs: if self.contig.ref_name != other.contig.ref_name: continue - if self.alignment.overlaps(other.alignment): + if intervals_overlap((self.alignment.r_st, self.alignment.r_ei), + (other.alignment.r_st, other.alignment.r_ei)): yield other From bf2e50e81aa3cb60067b215844cbe008c885e408 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 08:44:26 -0800 Subject: [PATCH 030/485] Add MockAligner class to test utils --- micall/tests/test_tests_utils.py | 75 ++++++++++++++++++++++++++++++++ micall/tests/utils.py | 56 ++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 micall/tests/test_tests_utils.py create mode 100644 micall/tests/utils.py diff --git a/micall/tests/test_tests_utils.py b/micall/tests/test_tests_utils.py new file mode 100644 index 000000000..047868475 --- /dev/null +++ b/micall/tests/test_tests_utils.py @@ -0,0 +1,75 @@ + +import pytest +from micall.tests.utils import MockAligner, MockAlignment + +def test_basic_mapping(): + + aligner = MockAligner('acgt' + 'a' * 20 + 'acgt') + + alignment = list(aligner.map('a' * 10)) + + assert len(alignment) == 5 + + alignment = alignment[0] + + assert isinstance(alignment, MockAlignment) + assert alignment.mapq == 60 + assert alignment.is_rev == False + assert alignment.r_st == 4 + assert alignment.r_en == 14 + assert alignment.q_st == 
0 + assert alignment.q_en == 10 + + +def test_exact_match(): + aligner = MockAligner("abcdefg") + alignments = list(aligner.map("abc")) + assert len(alignments) == 1 + assert alignments[0].r_st == 0 + assert alignments[0].r_en == 3 + + +def test_no_match(): + aligner = MockAligner("abcdefg") + alignments = list(aligner.map("xyz")) + assert len(alignments) == 0 + + +def test_partial_match(): + aligner = MockAligner("abcdefg") + alignments = list(aligner.map("abxyabc")) + assert len(alignments) == 1 + assert alignments[0].r_st == 0 + assert alignments[0].r_en == 3 + + +def test_multiple_matches(): + aligner = MockAligner("A" * 40) + alignments = list(aligner.map("A" * 20)) + assert len(alignments) == 5 + assert alignments[0].r_st == 0 + assert alignments[0].r_en == 20 + assert alignments[1].r_st == 0 + assert alignments[1].r_en == 19 + + +def test_multiple_matches_bigger_query(): + aligner = MockAligner("A" * 40) + alignments = list(aligner.map("A" * 50)) + assert len(alignments) == 5 + assert alignments[0].r_st == 0 + assert alignments[0].r_en == 40 + assert alignments[1].r_st == 0 + assert alignments[1].r_en == 40 + + +def test_empty_reference(): + aligner = MockAligner("A" * 0) + alignments = list(aligner.map("A" * 20)) + assert len(alignments) == 0 + + +def test_empty_query(): + aligner = MockAligner("A" * 40) + alignments = list(aligner.map("A" * 0)) + assert len(alignments) == 0 diff --git a/micall/tests/utils.py b/micall/tests/utils.py new file mode 100644 index 000000000..1a569e1d5 --- /dev/null +++ b/micall/tests/utils.py @@ -0,0 +1,56 @@ +from dataclasses import dataclass +from math import floor, ceil + +from micall.utils.consensus_aligner import CigarActions + + +@dataclass +class MockAlignment: + is_rev: bool + mapq: int + cigar: list + cigar_str: str + q_st: int + q_en: int + r_st: int + r_en: int + + +class MockAligner: + """ + Mock for the mappy's aligner class. + Only reports exact matches. 
+ """ + + def __init__(self, seq, *args, **kwargs): + self.seq = seq + self.max_matches = 5 + self.min_length = 3 + + + def map(self, seq): + max_matches = self.max_matches + returned = set() + for length in range(len(seq), self.min_length - 2, -1): + for start in range(len(seq) - length): + end = start + length + substring = seq[start:end+1] + if substring not in self.seq: + continue + + mapq = 60 + is_rev = False # Doesn't handle reverse complements in this mock. + r_st = self.seq.index(substring) + r_en = r_st + len(substring) + q_st = start + q_en = end + 1 + cigar = [[q_en - q_st, CigarActions.MATCH]] + cigar_str = f'{(q_en - q_st)}M' + al = MockAlignment(is_rev, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) + if (q_st, q_en, r_st, r_en) not in returned: + returned.add((q_st, q_en, r_st, r_en)) + yield MockAlignment(is_rev, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) + + max_matches -= 1 + if max_matches < 1: + return From 7f7ad3c11960290f440ee59cc7c0e668bbe0e294 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 08:46:37 -0800 Subject: [PATCH 031/485] Use mocked Aligner in contig stitcher --- micall/tests/test_contig_stitcher.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 7501dd6cd..2ba9c00b5 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,5 +1,11 @@ import pytest from micall.core.contig_stitcher import stitch_contigs, GenotypedContig +from micall.tests.utils import MockAligner + + +@pytest.fixture(autouse=True) +def mock_mappy_aligner(monkeypatch): + monkeypatch.setattr('micall.core.contig_stitcher.Aligner', MockAligner) def test_1(): From 2a18db4d85b4475d71c284a285270427897bd8f6 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 09:27:58 -0800 Subject: [PATCH 032/485] Contig stitcher: fix Frankenstein cut_reference implementation --- micall/core/contig_stitcher.py | 84 
+++++++++++++++++++++------- micall/tests/test_contig_stitcher.py | 41 +++++++++++++- 2 files changed, 103 insertions(+), 22 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index d0828a521..982df0b6d 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -3,9 +3,11 @@ from dataclasses import dataclass from mappy import Aligner from functools import cached_property +from itertools import accumulate from gotoh import align_it -from micall.utils.cigar_tools import connect_cigar_hits, CigarHit +from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit +from micall.utils.consensus_aligner import CigarActions @dataclass @@ -13,6 +15,10 @@ class Contig: name: str seq: str + @property + def contig(self): + return self + @dataclass class GenotypedContig(Contig): @@ -20,10 +26,24 @@ class GenotypedContig(Contig): ref_seq: str matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. 
- @property - def contig(self): + + def __add__(self, other): + if self.ref_name != other.ref_name: + raise ValueError("Cannot concatenate contigs that do not belong the they same reference") + + assert self.ref_seq == other.ref_seq, "References that are named the same must be the same sequence" + + return GenotypedContig(name=f'{self.name}+{other.name}', + seq=self.seq + other.seq, + ref_name=self.ref_name, + ref_seq=self.ref_seq, + matched_fraction=None) + + + def narrow_query_to_alignment(self) -> 'GenotypedContig': return self + @dataclass class AlignedContig: contig: GenotypedContig @@ -44,36 +64,58 @@ def msa(self): @cached_property def seq(self): - seq_left, ref_seq_left = self.msa - return ''.join((c for c in ref_seq_left if c != '-')) + ref_msa, query_msa = self.msa + return ''.join((c for c in query_msa if c != '-')) + + + def narrow_query_to_alignment(self) -> 'AlignedContig': + seq = self.contig.seq[self.alignment.q_st:self.alignment.q_ei + 1] + contig = GenotypedContig(name=self.contig.name, + seq=seq, + ref_name=self.contig.ref_name, + ref_seq=self.contig.ref_seq, + matched_fraction=None) + + alignment = self.alignment.translate(0, -1 * self.alignment.q_st) + return AlignedContig(contig, alignment) class FrankensteinContig(AlignedContig): - """ Assembled of parts that were not even aligned together, + """ + Assembled of parts that were not even aligned together, and of some parts that were not aligned at all. - Yet its .seq string looks like a real contig. """ + Yet its self.seq string looks like a real contig. 
+ """ def __init__(self, parts: List[GenotypedContig]): + assert len(parts) > 0, "Empty Frankenstei do not exist" + + # Flatten any possible Frankenstein parts self.parts = [subpart for part in parts for subpart in (part.parts if isinstance(part, FrankensteinContig) else [part])] - name = '+'.join(map(lambda acontig: acontig.contig.name, self.parts)) - ref = self.parts[0].contig - contig = GenotypedContig(name=name, seq=self.seq, - ref_name=ref.ref_name, - ref_seq=ref.ref_seq, - matched_fraction=ref.matched_fraction) + # In the remainder of this function we will try to construct alignment + # that spans over all parts, and its MSA is the sum of all parts MSAs. + narrowed_parts = [part.narrow_query_to_alignment() for part in self.parts] - alignment = connect_cigar_hits([part.alignment for part in self.parts - if isinstance(part, AlignedContig)]) + # Overall contig is just sum of parts contigs. + contigs = [part.contig for part in narrowed_parts] + contig = sum(contigs[1:], start=contigs[0]) - super().__init__(contig, alignment) + # Adjust alignment offsets + offsets = [0] + list(accumulate(len(contig.seq) for contig in contigs[:-1])) + def adjust(offset, alignment): + return alignment.translate(reference_delta=0, query_delta=offset) + aligned_parts = [adjust(offset, part.alignment) for (offset, part) + in zip(offsets, narrowed_parts) + if isinstance(part, AlignedContig)] - @cached_property - def seq(self): - return ''.join(map(lambda part: part.seq, self.parts)) + # Combine all aligned parts to produce overall alignment. + # It will only be reasonable if the ends are aligned. 
+ alignment = connect_cigar_hits(aligned_parts) + super().__init__(contig, alignment) def align_to_reference(contig: GenotypedContig): @@ -87,7 +129,7 @@ def align_to_reference(contig: GenotypedContig): return AlignedContig(contig=contig, alignment=single_cigar_hit) -def align_equal(seq1, seq2) -> Tuple[str, str]: +def align_equal(seq1: str, seq2: str) -> Tuple[str, str]: gap_open_penalty = 15 gap_extend_penalty = 3 use_terminal_gap_penalty = 1 @@ -190,7 +232,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]): aligned = list(map(align_to_reference, contigs)) # Contigs that did not align do not need any more processing - stitched = yield from (x for x in aligned if not isinstance(x, AlignedContig)) + yield from (x for x in aligned if not isinstance(x, AlignedContig)) aligned = [x for x in aligned if isinstance(x, AlignedContig)] # Going left-to-right through aligned contigs. diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 2ba9c00b5..4b09d9930 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -65,5 +65,44 @@ def test_3(): ] result = list(stitch_contigs(contigs)) - assert 100 == sum(len(x.seq) for x in result) + assert len(result) == 1 + + result = result[0] + + assert 100 == len(result.seq) + assert result.seq == 'A' * 50 + 'C' * 50 + assert result.contig.name == 'a+overlap(a,b)+b' + + +def test_4(): + ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 50 + 'C' * 20, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 20 + 'C' * 50, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='c', + seq='C' * 20 + 'T' * 50, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + assert len(result) == 2 + + assert 100 == len(result[0].seq) + assert result[0].seq == 'A' 
* 50 + 'C' * 50 assert result[0].contig.name == 'a+overlap(a,b)+b' + + assert result[1].contig == contigs[2] From 4c2979810271054e1305d51dee52c789236e7e16 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 15:25:34 -0800 Subject: [PATCH 033/485] Mention that CigarHit.overlaps only applicable in same refs --- micall/utils/cigar_tools.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index d05767b88..67624b739 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -327,6 +327,12 @@ def q_len(self): def overlaps(self, other) -> bool: + """ + Checks if this CIGAR hit overlaps with the other CIGAR hit, + in either reference or query space. + NOTE: only applicable if these hits come from the same reference and query. + """ + def intervals_overlap(x, y): """ Check if two intervals [x0, x1] and [y0, y1] overlap. """ return x[0] <= y[1] and x[1] >= y[0] From 00ee9fec8a75eb080f698f81a09b993449d7c4d6 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 15:35:31 -0800 Subject: [PATCH 034/485] Improve basic config stitcher tests --- micall/tests/test_contig_stitcher.py | 123 ++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 4 deletions(-) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 4b09d9930..7946926ef 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -8,7 +8,9 @@ def mock_mappy_aligner(monkeypatch): monkeypatch.setattr('micall.core.contig_stitcher.Aligner', MockAligner) -def test_1(): +def test_identical_stitching_of_one_contig(): + # Scenario: When stitching one contig, it remains the same. 
+ contigs = [ GenotypedContig(name='a', seq='ACTGACTG' * 100, @@ -23,7 +25,9 @@ def test_1(): == sorted(map(lambda x: x.seq, result)) -def test_2(): +def test_separate_stitching_of_non_overlapping_contigs(): + # Scenario: When stitching multiple non-overlapping contigs, the order doesn't matter. + ref_seq = 'A' * 100 contigs = [ @@ -45,8 +49,29 @@ def test_2(): assert sorted(map(lambda x: x.seq, contigs)) \ == sorted(map(lambda x: x.seq, result)) + contigs = [ + GenotypedContig(name='b', + seq='C' * 100, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='a', + seq=ref_seq, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + assert sorted(map(lambda x: x.seq, contigs)) \ + == sorted(map(lambda x: x.seq, result)) + + +def test_correct_stitching_of_two_partially_overlapping_contigs(): + # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. -def test_3(): ref_seq = 'A' * 100 + 'C' * 100 contigs = [ @@ -74,7 +99,9 @@ def test_3(): assert result.contig.name == 'a+overlap(a,b)+b' -def test_4(): +def test_correct_processing_of_two_overlapping_and_one_separate_contig(): + # Scenario: Two overlapping contigs are stitched together, the non-overlapping is kept separate. + ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 contigs = [ @@ -106,3 +133,91 @@ def test_4(): assert result[0].contig.name == 'a+overlap(a,b)+b' assert result[1].contig == contigs[2] + + +def test_stitching_of_all_overlapping_contigs_into_one_sequence(): + # Scenario: All contigs have some overlapping parts, resulting in one continuous sequence after stitching. 
+ + ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 50 + 'C' * 20, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 20 + 'C' * 100 + 'T' * 20, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='c', + seq='C' * 20 + 'T' * 50, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + assert len(result) == 1 + + result = result[0] + + assert 200 == len(result.seq) + assert result.seq == 'A' * 50 + 'C' * 100 + 'T' * 50 + assert result.contig.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + + +def test_stitching_with_empty_contigs(): + # Scenario: The function is able to handle and ignore empty contigs. + + ref_seq = 'A' * 100 + + contigs = [ + GenotypedContig(name='a', + seq=ref_seq, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='', + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + assert sorted(map(lambda x: x.seq, contigs)) \ + == sorted(map(lambda x: x.seq, result)) + + +def test_stitching_of_identical_contigs(): + # Scenario: The function correctly handles and avoids duplication when identical contigs are stitched together. + + ref_seq = 'A' * 100 + contig = \ + GenotypedContig(name='a', + seq='ACTGACTG' * 100, + ref_name='testref', + ref_seq='ACTGACTG' * 100, + matched_fraction=1.0, + ) + + contigs = [contig, contig, contig] + + result = list(stitch_contigs(contigs)) + assert len(result) == 1 + assert result[0].contig == contig + + +def test_stitching_of_zero_contigs(): + # Scenario: The function does not crash if no contigs given. 
+ + contigs = [] + result = list(stitch_contigs(contigs)) + assert result == contigs From 4cc5b9bdf6b49355199167ce583cfabd80cf9d7f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 15:55:53 -0800 Subject: [PATCH 035/485] Fix typo in GenotypeContig error message --- micall/core/contig_stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 982df0b6d..73e0f6617 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -29,7 +29,7 @@ class GenotypedContig(Contig): def __add__(self, other): if self.ref_name != other.ref_name: - raise ValueError("Cannot concatenate contigs that do not belong the they same reference") + raise ValueError("Cannot concatenate contigs that do not belong the same reference") assert self.ref_seq == other.ref_seq, "References that are named the same must be the same sequence" From 72148b568995f091bb68eb5258ec2211346b2d94 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 6 Nov 2023 16:08:52 -0800 Subject: [PATCH 036/485] Improve contig stitcher class hierarchy --- micall/core/contig_stitcher.py | 45 ++++++++++++++++------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 73e0f6617..5d8091df4 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -15,10 +15,6 @@ class Contig: name: str seq: str - @property - def contig(self): - return self - @dataclass class GenotypedContig(Contig): @@ -44,10 +40,22 @@ def narrow_query_to_alignment(self) -> 'GenotypedContig': return self -@dataclass -class AlignedContig: - contig: GenotypedContig - alignment: CigarHit +class AlignedContig(GenotypedContig): + + def __init__(self, contig: GenotypedContig, alignment: CigarHit): + self.alignment = alignment + self.contig = contig + + ref_msa, query_msa = self.alignment.to_msa(self.contig.ref_seq, 
self.contig.seq) + seq = ''.join((c for c in query_msa if c != '-')) + + super().__init__( + seq = seq, + name = contig.name, + ref_name = contig.ref_name, + ref_seq = contig.ref_seq, + matched_fraction = contig.matched_fraction) + def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: """ Cuts this alignment in two parts with cut_point between them. """ @@ -57,17 +65,6 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont AlignedContig(self.contig, alignment_right)) - @cached_property - def msa(self): - return self.alignment.to_msa(self.contig.ref_seq, self.contig.seq) - - - @cached_property - def seq(self): - ref_msa, query_msa = self.msa - return ''.join((c for c in query_msa if c != '-')) - - def narrow_query_to_alignment(self) -> 'AlignedContig': seq = self.contig.seq[self.alignment.q_st:self.alignment.q_ei + 1] contig = GenotypedContig(name=self.contig.name, @@ -99,7 +96,7 @@ def __init__(self, parts: List[GenotypedContig]): narrowed_parts = [part.narrow_query_to_alignment() for part in self.parts] # Overall contig is just sum of parts contigs. - contigs = [part.contig for part in narrowed_parts] + contigs = [part.contig if isinstance(part, AlignedContig) else part for part in narrowed_parts] contig = sum(contigs[1:], start=contigs[0]) # Adjust alignment offsets @@ -155,7 +152,7 @@ def intervals_overlap(x, y): def find_all_overlapping_contigs(self, aligned_contigs): for other in aligned_contigs: - if self.contig.ref_name != other.contig.ref_name: + if self.ref_name != other.ref_name: continue if intervals_overlap((self.alignment.r_st, self.alignment.r_ei), @@ -222,9 +219,9 @@ def stitch_2_contigs(left, right): overlap_seq = ''.join(c for c in aligned_left_part + aligned_right_part if c != '-') # Return something that can be fed back into the loop. 
- overlap_contig = GenotypedContig(name=f'overlap({left.contig.name},{right.contig.name})', - seq=overlap_seq, ref_name=left.contig.ref_name, - ref_seq=left.contig.ref_seq, matched_fraction=None) + overlap_contig = GenotypedContig(name=f'overlap({left.name},{right.name})', + seq=overlap_seq, ref_name=left.ref_name, + ref_seq=left.ref_seq, matched_fraction=None) return FrankensteinContig([left_remainder, overlap_contig, right_remainder]) From 0619581f6dce69ac40a955bfe6b0c45e953104fe Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 13:22:26 -0800 Subject: [PATCH 037/485] Cigar tools: do not enforce commutativity on CigarHits If we assert that addition of cigar strings is commutative, then it is not associative. But for predictability of addition it is more important to have associativity. --- micall/tests/test_cigar_tools.py | 2 +- micall/utils/cigar_tools.py | 24 ++++++++---------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 05eff8db8..b003278cf 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -250,7 +250,7 @@ def test_cigar_hit_ref_cut(hit, cut_point, expected_result): if not isinstance(x[2], Exception)]) def test_cigar_hit_ref_cut_add_prop(hit, cut_point): left, right = hit.cut_reference(cut_point) - assert left + right == hit == right + left + assert left + right == hit @pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 67624b739..e3f3f1fdf 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -350,24 +350,16 @@ def __add__(self, other): if self.overlaps(other): raise ValueError("Cannot combine overlapping CIGAR hits") - if (self.r_st, self.r_ei) < (other.r_st, other.r_ei): - # Note: in cases where one CigarHit is empty, comparing only by a single coordiate is not 
sufficient. - left = self - right = other - else: - left = other - right = self - - cigar = left.cigar \ - + Cigar.coerce([(right.r_st - left.r_ei - 1, CigarActions.DELETE)]) \ - + Cigar.coerce([(right.q_st - left.q_ei - 1, CigarActions.INSERT)]) \ - + right.cigar + cigar = self.cigar \ + + Cigar.coerce([(other.r_st - self.r_ei - 1, CigarActions.DELETE)]) \ + + Cigar.coerce([(other.q_st - self.q_ei - 1, CigarActions.INSERT)]) \ + + other.cigar return CigarHit(cigar=cigar, - r_st=left.r_st, - r_ei=right.r_ei, - q_st=left.q_st, - q_ei=right.q_ei, + r_st=self.r_st, + r_ei=other.r_ei, + q_st=self.q_st, + q_ei=other.q_ei, ) From eba431987b15a12c16513616e993740bd139a726 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 14:42:50 -0800 Subject: [PATCH 038/485] Cigar tools: test associativity of CigarHit addition --- micall/tests/test_cigar_tools.py | 33 +++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index b003278cf..1c31a7e8e 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -1,5 +1,6 @@ import pytest from typing import List, Tuple +from math import floor from micall.utils.consensus_aligner import CigarActions from micall.utils.cigar_tools import Cigar, CigarHit @@ -139,7 +140,11 @@ def test_invalid_cigar_string(): cigar_hit_ref_cut_cases = [ - # Trivial cases + # # Trivial cases + (CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), 2.5, + [CigarHit('2M', r_st=1, r_ei=2, q_st=1, q_ei=2), + CigarHit('2M', r_st=3, r_ei=4, q_st=3, q_ei=4)]), + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 3.5, [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), CigarHit('6M', r_st=4, r_ei=9, q_st=4, q_ei=9)]), @@ -253,6 +258,32 @@ def test_cigar_hit_ref_cut_add_prop(hit, cut_point): assert left + right == hit +@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], 
Exception)]) +def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): + percentage = cut_point - floor(cut_point) + + for cut_point in range(hit.r_st, hit.r_ei + 2): + left, right = hit.cut_reference(cut_point - percentage) + assert left + right == hit + + +@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception)]) +def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): + percentage = cut_point - floor(cut_point) + + for ax_cut in range(hit.r_st, hit.r_ei + 2): + a, x = hit.cut_reference(ax_cut - percentage) + + for bc_cut in range(a.r_ei + 1, hit.r_ei + 2): + if x.r_len == 0: continue + + b, c = x.cut_reference(bc_cut - percentage) + + assert (a + b) + c == a + (b + c) + + @pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ ('ACTG', 'ACTG', '4M', 'ACTG', 'ACTG'), ('ACTG', '', '4D', 'ACTG', '----'), From 24a0cbb6cdac2e894318e91f679620ee800d06fc Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 14:46:46 -0800 Subject: [PATCH 039/485] Cigar tools: add lstrip and rstrip functions --- micall/tests/test_cigar_tools.py | 39 +++++++++++++++++++++++++- micall/tests/test_contig_stitcher.py | 28 +++++++++++++++++++ micall/utils/cigar_tools.py | 41 ++++++++++++++++++++++++++-- 3 files changed, 105 insertions(+), 3 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 1c31a7e8e..f23b02cbc 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -1,9 +1,10 @@ import pytest from typing import List, Tuple from math import floor +import itertools from micall.utils.consensus_aligner import CigarActions -from micall.utils.cigar_tools import Cigar, CigarHit +from micall.utils.cigar_tools import Cigar, CigarHit, parse_cigar_operation, CIGAR_OP_MAPPING cigar_mapping_cases: List[Tuple[Cigar, 'mapping', 'closest_mapping']] = [ @@ -284,6 +285,42 @@ def 
test_cigar_hit_ref_cut_add_associativity(hit, cut_point): assert (a + b) + c == a + (b + c) +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) +def test_cigar_hit_lstrip_is_stringlike(hit): + all_chars = CIGAR_OP_MAPPING.keys() + + actions_of = lambda s: (x for x in s if x in all_chars) + + for r in range(len(all_chars) + 1): + for char_set in itertools.combinations(all_chars, r): + actions = set(map(parse_cigar_operation, char_set)) + chars = ''.join(char_set) + + p = lambda x: ''.join(actions_of(str(x.cigar))) + g = lambda x: x.lstrip(actions) + h = lambda x: x.lstrip(chars) + + assert p(g(hit)) == h(p(hit)) + + +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) +def test_cigar_hit_rstrip_is_stringlike(hit): + all_chars = CIGAR_OP_MAPPING.keys() + + actions_of = lambda s: (x for x in s if x in all_chars) + + for r in range(len(all_chars) + 1): + for char_set in itertools.combinations(all_chars, r): + actions = set(map(parse_cigar_operation, char_set)) + chars = ''.join(char_set) + + p = lambda x: ''.join(actions_of(str(x.cigar))) + g = lambda x: x.rstrip(actions) + h = lambda x: x.rstrip(chars) + + assert p(g(hit)) == h(p(hit)) + + @pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ ('ACTG', 'ACTG', '4M', 'ACTG', 'ACTG'), ('ACTG', '', '4D', 'ACTG', '----'), diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 7946926ef..eed5c540f 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -221,3 +221,31 @@ def test_stitching_of_zero_contigs(): contigs = [] result = list(stitch_contigs(contigs)) assert result == contigs + + +def test_correct_stitching_of_two_partially_overlapping_different_organism_contigs(): + # Scenario: Two partially overlapping contigs, but which come from different organism, + # are not stitched into a single sequence. 
+ + ref_seq = 'A' * 100 + 'C' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 50 + 'C' * 20, + ref_name='testref-1', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 20 + 'C' * 50, + ref_name='testref-2', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + assert len(result) == 2 + + assert sorted(map(lambda x: x.seq, contigs)) \ + == sorted(map(lambda x: x.seq, result)) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index e3f3f1fdf..370949c35 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -4,9 +4,10 @@ from math import ceil, floor import re -from typing import Tuple, Iterable, Optional +from typing import Container, Tuple, Iterable, Optional from dataclasses import dataclass from functools import cached_property +from itertools import chain, dropwhile from micall.utils.consensus_aligner import CigarActions @@ -130,7 +131,7 @@ class Cigar(list): """ - def __init__(self, cigar_lst): + def __init__(self, cigar_lst: Iterable[Tuple[int, CigarActions]]): super().__init__([]) for x in cigar_lst: self.append(x) @@ -239,6 +240,16 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': [start_inclusive:end_noninclusive]) + def lstrip(self, actions: Container[CigarActions]) -> 'Cigar': + """ Return a copy of the Cigar with leading actions removed. """ + return Cigar(dropwhile(lambda tupl: tupl[1] in actions, self)) + + + def rstrip(self, actions: Container[CigarActions]) -> 'Cigar': + """ Return a copy of the Cigar with trailing actions removed. 
""" + return Cigar(reversed(list(dropwhile(lambda tupl: tupl[1] in actions, reversed(self))))) + + @cached_property def coordinate_mapping(self) -> CoordinateMapping: """ @@ -428,6 +439,32 @@ def cut_reference(self, cut_point: float) -> 'CigarHit': return left, right + def lstrip(self, actions: Container[CigarActions]) -> 'CigarHit': + """ Return a copy of the CigarHit with leading actions removed. """ + + cigar = self.cigar.lstrip(actions) + reference_delta = cigar.ref_length - self.cigar.ref_length + query_delta = cigar.query_length - self.cigar.query_length + return CigarHit(cigar, + r_st=self.r_st, + r_ei=self.r_ei + reference_delta, + q_st=self.q_st, + q_ei=self.q_ei + query_delta) + + + def rstrip(self, actions: Container[CigarActions]) -> 'CigarHit': + """ Return a copy of the CigarHit with trailing actions removed. """ + + cigar = self.cigar.rstrip(actions) + reference_delta = cigar.ref_length - self.cigar.ref_length + query_delta = cigar.query_length - self.cigar.query_length + return CigarHit(cigar, + r_st=self.r_st, + r_ei=self.r_ei + reference_delta, + q_st=self.q_st, + q_ei=self.q_ei + query_delta) + + @cached_property def coordinate_mapping(self) -> CoordinateMapping: return self.cigar.coordinate_mapping.translate(self.r_st, self.q_st) From 5789ee632f15b4df5d2d3453ae611b7a5a9d4e73 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 16:30:40 -0800 Subject: [PATCH 040/485] Cigar tools: rename "r_len" and "q_len" --- micall/tests/test_cigar_tools.py | 2 +- micall/utils/cigar_tools.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index f23b02cbc..23bf97a18 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -278,7 +278,7 @@ def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): a, x = hit.cut_reference(ax_cut - percentage) for bc_cut in range(a.r_ei + 1, hit.r_ei + 2): - if x.r_len == 0: continue 
+ if x.ref_length == 0: continue b, c = x.cut_reference(bc_cut - percentage) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 370949c35..4622778eb 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -318,22 +318,22 @@ class CigarHit: def __post_init__(self): self.cigar = Cigar.coerce(self.cigar) - if self.r_len != self.cigar.ref_length: + if self.ref_length != self.cigar.ref_length: raise ValueError(f"CIGAR string maps {self.cigar.ref_length}" - f" reference positions, but CIGAR hit range is {self.r_len}") + f" reference positions, but CIGAR hit range is {self.ref_length}") - if self.q_len != self.cigar.query_length: + if self.query_length != self.cigar.query_length: raise ValueError(f"CIGAR string maps {self.cigar.query_length}" - f" query positions, but CIGAR hit range is {self.q_len}") + f" query positions, but CIGAR hit range is {self.query_length}") @property - def r_len(self): + def ref_length(self): return self.r_ei + 1 - self.r_st @property - def q_len(self): + def query_length(self): return self.q_ei + 1 - self.q_st @@ -424,7 +424,7 @@ def cut_reference(self, cut_point: float) -> 'CigarHit': if float(cut_point).is_integer(): raise ValueError("Cut accepts fractions, not integers") - if self.r_len == 0 or \ + if self.ref_length == 0 or \ not (self.r_st - 1 < cut_point < self.r_ei + 1): raise IndexError("Cut point out of reference bounds") From 9469286c7b368d2f722b5e292d6315f395624c4e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 16:45:29 -0800 Subject: [PATCH 041/485] Cigar tools: add default alignment concept --- micall/utils/cigar_tools.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 4622778eb..3c8beb3a8 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -337,6 +337,16 @@ def query_length(self): return self.q_ei + 1 - self.q_st + @staticmethod + def 
from_default_alignment(r_st, r_ei, q_st, q_ei): + ref_length = r_ei - r_st + 1 + query_length = q_ei - q_st + 1 + cigar = Cigar.coerce([[ref_length, CigarActions.DELETE], + [query_length, CigarActions.INSERT]]) + + return CigarHit(cigar, r_st=r_st, r_ei=r_ei, q_st=q_st, q_ei=q_ei) + + def overlaps(self, other) -> bool: """ Checks if this CIGAR hit overlaps with the other CIGAR hit, @@ -362,8 +372,7 @@ def __add__(self, other): raise ValueError("Cannot combine overlapping CIGAR hits") cigar = self.cigar \ - + Cigar.coerce([(other.r_st - self.r_ei - 1, CigarActions.DELETE)]) \ - + Cigar.coerce([(other.q_st - self.q_ei - 1, CigarActions.INSERT)]) \ + + CigarHit.from_default_alignment(self.r_ei + 1, other.r_st - 1, self.q_ei + 1, other.q_st - 1).cigar \ + other.cigar return CigarHit(cigar=cigar, From 615e8770d5679c58a37355f93d9e1d570cb60683 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 16:50:55 -0800 Subject: [PATCH 042/485] Contig stitcher: fix containment check Account for the ref_name when checking if one contig is contained in another. 
--- micall/core/contig_stitcher.py | 27 ++++++++++---- micall/tests/test_contig_stitcher.py | 54 ++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 5d8091df4..58a30dcf3 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -150,13 +150,27 @@ def intervals_overlap(x, y): return x[0] <= y[1] and x[1] >= y[0] +def contig_overlaps(self, other): + if self.ref_name != other.ref_name: + return False + + if intervals_overlap((self.alignment.r_st, self.alignment.r_ei), + (other.alignment.r_st, other.alignment.r_ei)): + return True + + +def contig_contains(self, other): + if self.ref_name != other.ref_name: + return False + + if interval_contains((self.alignment.r_st, self.alignment.r_ei), + (other.alignment.r_st, other.alignment.r_ei)): + return True + + def find_all_overlapping_contigs(self, aligned_contigs): for other in aligned_contigs: - if self.ref_name != other.ref_name: - continue - - if intervals_overlap((self.alignment.r_st, self.alignment.r_ei), - (other.alignment.r_st, other.alignment.r_ei)): + if contig_overlaps(self, other): yield other @@ -241,8 +255,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]): # TODO: actually filter out if covered by multiple contigs # TODO: split contigs that have big gaps in them first, otherwise they will cover too much. aligned = [x for x in aligned if not \ - interval_contains((current.alignment.r_st, current.alignment.r_ei), - (x.alignment.r_st, x.alignment.r_ei))] + contig_contains(current, x)] # Find overlap. If there isn't one - we are done with the current contig. 
overlapping_contig = find_overlapping_contig(current, aligned) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index eed5c540f..55b23043c 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -249,3 +249,57 @@ def test_correct_stitching_of_two_partially_overlapping_different_organism_conti assert sorted(map(lambda x: x.seq, contigs)) \ == sorted(map(lambda x: x.seq, result)) + + +def test_correct_processing_complex_nogaps(): + # Scenario: There are two reference organisms. + # Each with 4 contigs. + # For each, three overlapping contigs are stitched together, the non-overlapping is kept separate. + # This seems like the most general scenario if no gaps or complete goverage is involved. + + ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 + + contigs = [[ + GenotypedContig(name='a', + seq='A' * 50 + 'C' * 20, + ref_name=ref_name, + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 20 + 'C' * 50, + ref_name=ref_name, + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='c', + seq='C' * 70 + 'T' * 20, + ref_name=ref_name, + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='d', + seq='T' * 20 + 'G' * 50, + ref_name=ref_name, + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] for ref_name in ['testref-1', 'testref-2']] + + contigs = sum(contigs, start=[]) + + result = list(stitch_contigs(contigs)) + assert len(result) == 4 + + assert 170 == len(result[0].seq) + assert result[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 + assert result[0].contig.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + assert result[0].contig.ref_name == 'testref-1' + + assert 170 == len(result[1].seq) + assert result[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 + assert result[1].contig.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + assert result[1].contig.ref_name == 'testref-2' + + assert result[2].contig == 
contigs[3] + assert result[3].contig == contigs[7] From 3c3a95da513d710a3b9d27eecce523b995f8db29 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 7 Nov 2023 20:44:56 -0800 Subject: [PATCH 043/485] Fix off-by-1 errors in MockAligner --- micall/tests/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/micall/tests/utils.py b/micall/tests/utils.py index 1a569e1d5..6a7de309a 100644 --- a/micall/tests/utils.py +++ b/micall/tests/utils.py @@ -31,10 +31,10 @@ def __init__(self, seq, *args, **kwargs): def map(self, seq): max_matches = self.max_matches returned = set() - for length in range(len(seq), self.min_length - 2, -1): - for start in range(len(seq) - length): + for length in range(len(seq), self.min_length - 1, -1): + for start in range(len(seq) - length + 1): end = start + length - substring = seq[start:end+1] + substring = seq[start:end] if substring not in self.seq: continue @@ -43,7 +43,7 @@ def map(self, seq): r_st = self.seq.index(substring) r_en = r_st + len(substring) q_st = start - q_en = end + 1 + q_en = end cigar = [[q_en - q_st, CigarActions.MATCH]] cigar_str = f'{(q_en - q_st)}M' al = MockAlignment(is_rev, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) From 3b29507868dc67dfb7f694b9562ae371076a2abc Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 8 Nov 2023 09:20:31 -0800 Subject: [PATCH 044/485] Contig stitcher: keep nonconflicting parts of contigs intact --- micall/core/contig_stitcher.py | 58 +++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 58a30dcf3..9170d63dc 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -2,7 +2,7 @@ from collections import deque from dataclasses import dataclass from mappy import Aligner -from functools import cached_property +from functools import cached_property, reduce from itertools import accumulate from gotoh import align_it @@ 
-77,6 +77,13 @@ def narrow_query_to_alignment(self) -> 'AlignedContig': return AlignedContig(contig, alignment) +class SyntheticContig(AlignedContig): + def __init__(self, query: GenotypedContig, r_st: int, r_ei: int): + alignment = CigarHit.from_default_alignment(r_st=r_st, r_ei=r_ei, + q_st=0, q_ei=len(query.seq)-1) + super().__init__(query, alignment) + + class FrankensteinContig(AlignedContig): """ Assembled of parts that were not even aligned together, @@ -85,34 +92,37 @@ class FrankensteinContig(AlignedContig): """ def __init__(self, parts: List[GenotypedContig]): - assert len(parts) > 0, "Empty Frankenstei do not exist" + if len(parts) == 0: + raise ValueError("Empty Frankenstei do not exist") # Flatten any possible Frankenstein parts self.parts = [subpart for part in parts for subpart in (part.parts if isinstance(part, FrankensteinContig) else [part])] - # In the remainder of this function we will try to construct alignment - # that spans over all parts, and its MSA is the sum of all parts MSAs. - narrowed_parts = [part.narrow_query_to_alignment() for part in self.parts] + aligned = reduce(FrankensteinContig.munge, self.parts) - # Overall contig is just sum of parts contigs. 
- contigs = [part.contig if isinstance(part, AlignedContig) else part for part in narrowed_parts] - contig = sum(contigs[1:], start=contigs[0]) + super().__init__(aligned.contig, aligned.alignment) - # Adjust alignment offsets - offsets = [0] + list(accumulate(len(contig.seq) for contig in contigs[:-1])) - def adjust(offset, alignment): - return alignment.translate(reference_delta=0, query_delta=offset) - aligned_parts = [adjust(offset, part.alignment) for (offset, part) - in zip(offsets, narrowed_parts) - if isinstance(part, AlignedContig)] + @staticmethod + def munge(left: 'AlignedContig', right: 'AlignedContig') -> 'AlignedContig': + left_query_seq = left.contig.seq[0:left.alignment.q_ei + 1] + right_query_seq = right.contig.seq[right.alignment.q_st:] + query_seq = left_query_seq + right_query_seq - # Combine all aligned parts to produce overall alignment. - # It will only be reasonable if the ends are aligned. - alignment = connect_cigar_hits(aligned_parts) + left_alignment = left.alignment + right_alignment = \ + right.alignment.translate( + query_delta=(-1 * right.alignment.q_st + len(left_query_seq)), + reference_delta=0) + alignment = left_alignment + right_alignment - super().__init__(contig, alignment) + query = GenotypedContig(seq=query_seq, + name=f'{left.name}+{right.name}', + ref_name=left.ref_name, + ref_seq=left.ref_seq, + matched_fraction=None) + return AlignedContig(query, alignment) def align_to_reference(contig: GenotypedContig): @@ -233,9 +243,13 @@ def stitch_2_contigs(left, right): overlap_seq = ''.join(c for c in aligned_left_part + aligned_right_part if c != '-') # Return something that can be fed back into the loop. 
- overlap_contig = GenotypedContig(name=f'overlap({left.name},{right.name})', - seq=overlap_seq, ref_name=left.ref_name, - ref_seq=left.ref_seq, matched_fraction=None) + overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})', + seq=overlap_seq, ref_name=left.ref_name, + ref_seq=left.ref_seq, matched_fraction=None) + overlap_contig = SyntheticContig(overlap_query, + r_st=left_overlap.alignment.r_st, + r_ei=right_overlap.alignment.r_ei) + return FrankensteinContig([left_remainder, overlap_contig, right_remainder]) From ba788b77773a2fe3cd2282a8f7cc416a8e1960aa Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 8 Nov 2023 09:28:29 -0800 Subject: [PATCH 045/485] Small improvements to contig stitcher code * rename AlignedContig.contig to AlignedContig.query * remove redundant asserts * remove narrow_query_to_alignment method * remove default addition of GenotypedContig's --- micall/core/contig_stitcher.py | 107 +++++++++------------------ micall/tests/test_contig_stitcher.py | 23 +++--- 2 files changed, 48 insertions(+), 82 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9170d63dc..fc5bdedaa 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -23,58 +23,51 @@ class GenotypedContig(Contig): matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. 
- def __add__(self, other): - if self.ref_name != other.ref_name: - raise ValueError("Cannot concatenate contigs that do not belong the same reference") - - assert self.ref_seq == other.ref_seq, "References that are named the same must be the same sequence" - - return GenotypedContig(name=f'{self.name}+{other.name}', - seq=self.seq + other.seq, - ref_name=self.ref_name, - ref_seq=self.ref_seq, - matched_fraction=None) - - - def narrow_query_to_alignment(self) -> 'GenotypedContig': - return self - - class AlignedContig(GenotypedContig): - def __init__(self, contig: GenotypedContig, alignment: CigarHit): + def __init__(self, query: GenotypedContig, alignment: CigarHit): self.alignment = alignment - self.contig = contig + self.query = query - ref_msa, query_msa = self.alignment.to_msa(self.contig.ref_seq, self.contig.seq) + ref_msa, query_msa = self.alignment.to_msa(self.query.ref_seq, self.query.seq) seq = ''.join((c for c in query_msa if c != '-')) super().__init__( seq = seq, - name = contig.name, - ref_name = contig.ref_name, - ref_seq = contig.ref_seq, - matched_fraction = contig.matched_fraction) + name = query.name, + ref_name = query.ref_name, + ref_seq = query.ref_seq, + matched_fraction = query.matched_fraction) def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: """ Cuts this alignment in two parts with cut_point between them. 
""" alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - return (AlignedContig(self.contig, alignment_left), - AlignedContig(self.contig, alignment_right)) + return (AlignedContig(self.query, alignment_left), + AlignedContig(self.query, alignment_right)) + + + def overlaps(self, other) -> bool: + def intervals_overlap(x, y): + return x[0] <= y[1] and x[1] >= y[0] + + if self.ref_name != other.ref_name: + return False + + return intervals_overlap((self.alignment.r_st, self.alignment.r_ei), + (other.alignment.r_st, other.alignment.r_ei)) - def narrow_query_to_alignment(self) -> 'AlignedContig': - seq = self.contig.seq[self.alignment.q_st:self.alignment.q_ei + 1] - contig = GenotypedContig(name=self.contig.name, - seq=seq, - ref_name=self.contig.ref_name, - ref_seq=self.contig.ref_seq, - matched_fraction=None) + def contains(self, other) -> bool: + def interval_contains(x, y): + return x[0] <= y[0] and x[1] >= y[1] - alignment = self.alignment.translate(0, -1 * self.alignment.q_st) - return AlignedContig(contig, alignment) + if self.ref_name != other.ref_name: + return False + + return interval_contains((self.alignment.r_st, self.alignment.r_ei), + (other.alignment.r_st, other.alignment.r_ei)) class SyntheticContig(AlignedContig): @@ -96,18 +89,19 @@ def __init__(self, parts: List[GenotypedContig]): raise ValueError("Empty Frankenstei do not exist") # Flatten any possible Frankenstein parts + # This is entirely optional, but may be useful for debugging self.parts = [subpart for part in parts for subpart in (part.parts if isinstance(part, FrankensteinContig) else [part])] aligned = reduce(FrankensteinContig.munge, self.parts) - super().__init__(aligned.contig, aligned.alignment) + super().__init__(aligned.query, aligned.alignment) @staticmethod - def munge(left: 'AlignedContig', right: 'AlignedContig') -> 'AlignedContig': - left_query_seq = left.contig.seq[0:left.alignment.q_ei + 1] - right_query_seq = right.contig.seq[right.alignment.q_st:] + 
def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: + left_query_seq = left.query.seq[0:left.alignment.q_ei + 1] + right_query_seq = right.query.seq[right.alignment.q_st:] query_seq = left_query_seq + right_query_seq left_alignment = left.alignment @@ -133,7 +127,7 @@ def align_to_reference(contig: GenotypedContig): hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] single_cigar_hit = connect_cigar_hits(hits_array) - return AlignedContig(contig=contig, alignment=single_cigar_hit) + return AlignedContig(query=contig, alignment=single_cigar_hit) def align_equal(seq1: str, seq2: str) -> Tuple[str, str]: @@ -150,37 +144,9 @@ def align_equal(seq1: str, seq2: str) -> Tuple[str, str]: return aseq1, aseq2 -def interval_contains(x, y): - """ Check if interval (x0, x1) contains interval (y0, y1). """ - return x[0] <= y[0] and x[1] >= y[1] - - -def intervals_overlap(x, y): - """ Check if two intervals [x0, x1] and [y0, y1] overlap. """ - return x[0] <= y[1] and x[1] >= y[0] - - -def contig_overlaps(self, other): - if self.ref_name != other.ref_name: - return False - - if intervals_overlap((self.alignment.r_st, self.alignment.r_ei), - (other.alignment.r_st, other.alignment.r_ei)): - return True - - -def contig_contains(self, other): - if self.ref_name != other.ref_name: - return False - - if interval_contains((self.alignment.r_st, self.alignment.r_ei), - (other.alignment.r_st, other.alignment.r_ei)): - return True - - def find_all_overlapping_contigs(self, aligned_contigs): for other in aligned_contigs: - if contig_overlaps(self, other): + if self.overlaps(other): yield other @@ -268,8 +234,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]): # Filter out all contigs that are contained within the current one. # TODO: actually filter out if covered by multiple contigs # TODO: split contigs that have big gaps in them first, otherwise they will cover too much. 
- aligned = [x for x in aligned if not \ - contig_contains(current, x)] + aligned = [x for x in aligned if not current.contains(x)] # Find overlap. If there isn't one - we are done with the current contig. overlapping_contig = find_overlapping_contig(current, aligned) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 55b23043c..a00a1cfb5 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,3 +1,4 @@ + import pytest from micall.core.contig_stitcher import stitch_contigs, GenotypedContig from micall.tests.utils import MockAligner @@ -96,7 +97,7 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(): assert 100 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 50 - assert result.contig.name == 'a+overlap(a,b)+b' + assert result.query.name == 'a+overlap(a,b)+b' def test_correct_processing_of_two_overlapping_and_one_separate_contig(): @@ -130,9 +131,9 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(): assert 100 == len(result[0].seq) assert result[0].seq == 'A' * 50 + 'C' * 50 - assert result[0].contig.name == 'a+overlap(a,b)+b' + assert result[0].query.name == 'a+overlap(a,b)+b' - assert result[1].contig == contigs[2] + assert result[1].query == contigs[2] def test_stitching_of_all_overlapping_contigs_into_one_sequence(): @@ -168,7 +169,7 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(): assert 200 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 100 + 'T' * 50 - assert result.contig.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + assert result.query.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' def test_stitching_with_empty_contigs(): @@ -212,7 +213,7 @@ def test_stitching_of_identical_contigs(): result = list(stitch_contigs(contigs)) assert len(result) == 1 - assert result[0].contig == contig + assert result[0].query == contig def test_stitching_of_zero_contigs(): @@ -293,13 +294,13 
@@ def test_correct_processing_complex_nogaps(): assert 170 == len(result[0].seq) assert result[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert result[0].contig.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' - assert result[0].contig.ref_name == 'testref-1' + assert result[0].query.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + assert result[0].query.ref_name == 'testref-1' assert 170 == len(result[1].seq) assert result[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert result[1].contig.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' - assert result[1].contig.ref_name == 'testref-2' + assert result[1].query.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + assert result[1].query.ref_name == 'testref-2' - assert result[2].contig == contigs[3] - assert result[3].contig == contigs[7] + assert result[2].query == contigs[3] + assert result[3].query == contigs[7] From 3772afe3294c220ce556203c6874d097f316a1d1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 8 Nov 2023 10:34:47 -0800 Subject: [PATCH 046/485] Contig stitcher: improve cutting of Frankenstein contigs This solution overcomes the information loss problem of the previous approach and validates if the cut is permissible - confirming it does not slice through a SyntheticContig. This function identifies the segment to be cut, performs the cut, and yields the updated left and right Frankenstein contigs. Also, a `cut_reference` method was added to SyntheticContig to indicate that these contigs cannot be cut due to alignment issues. 
--- micall/core/contig_stitcher.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index fc5bdedaa..79343cae6 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from mappy import Aligner from functools import cached_property, reduce -from itertools import accumulate +from itertools import accumulate, takewhile from gotoh import align_it from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit @@ -77,6 +77,10 @@ def __init__(self, query: GenotypedContig, r_st: int, r_ei: int): super().__init__(query, alignment) + def cut_reference(self, cut_point: float): + raise NotImplementedError("SyntheticContigs cannot be cut because they are not properly aligned") + + class FrankensteinContig(AlignedContig): """ Assembled of parts that were not even aligned together, @@ -89,7 +93,6 @@ def __init__(self, parts: List[GenotypedContig]): raise ValueError("Empty Frankenstei do not exist") # Flatten any possible Frankenstein parts - # This is entirely optional, but may be useful for debugging self.parts = [subpart for part in parts for subpart in (part.parts if isinstance(part, FrankensteinContig) else [part])] @@ -98,6 +101,24 @@ def __init__(self, parts: List[GenotypedContig]): super().__init__(aligned.query, aligned.alignment) + def cut_reference(self, cut_point: float) -> 'FrankensteinContig': + # The cut_reference version of super() works here. + # But it loses information about parts, + # and does not check if the cut is legal + # i.e. whether it slices a SyntheticContig. 
+ + # Search for the part that needs to be cut: + left_parts = list(takewhile(lambda part: cut_point >= part.alignment.r_ei + 1, self.parts)) + target_part = self.parts[len(left_parts)] + right_parts = self.parts[len(left_parts) + 1:] + + target_part_left, target_part_right = target_part.cut_reference(cut_point) + left = FrankensteinContig(left_parts + [target_part_left]) + right = FrankensteinContig([target_part_right] + right_parts) + + return (left, right) + + @staticmethod def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: left_query_seq = left.query.seq[0:left.alignment.q_ei + 1] From e23f7758baf86555a02412edfc2c0d03919f12a3 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 8 Nov 2023 10:45:11 -0800 Subject: [PATCH 047/485] Contig stitcher: normalize concordance score between 0 and 1 --- micall/core/contig_stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 79343cae6..1f05daa72 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -206,7 +206,7 @@ def slide(left, right): scores_sum -= scores.popleft() scores_sum += current scores.append(current) - result[i] += scores_sum / window_size + result[i] += (scores_sum / window_size) / 2 # Slide forward, then in reverse, adding the scores at each position. 
slide(left, right) From 90735734b6d7346d5616daa0cd2f1866b6c30a3d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 8 Nov 2023 11:07:32 -0800 Subject: [PATCH 048/485] Contig stitcher: split overlap handling and coverage handling --- micall/core/contig_stitcher.py | 58 +++++++++++++++++++--------- micall/tests/test_contig_stitcher.py | 9 ++--- 2 files changed, 44 insertions(+), 23 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 1f05daa72..76d95afea 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -240,30 +240,52 @@ def stitch_2_contigs(left, right): return FrankensteinContig([left_remainder, overlap_contig, right_remainder]) -def stitch_contigs(contigs: Iterable[GenotypedContig]): - aligned = list(map(align_to_reference, contigs)) - - # Contigs that did not align do not need any more processing - yield from (x for x in aligned if not isinstance(x, AlignedContig)) - aligned = [x for x in aligned if isinstance(x, AlignedContig)] - +def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: # Going left-to-right through aligned contigs. - aligned = list(sorted(aligned, key=lambda x: x.alignment.r_st)) - while aligned: - current = aligned.pop(0) - - # Filter out all contigs that are contained within the current one. - # TODO: actually filter out if covered by multiple contigs - # TODO: split contigs that have big gaps in them first, otherwise they will cover too much. - aligned = [x for x in aligned if not current.contains(x)] + contigs = list(sorted(contigs, key=lambda x: x.alignment.r_st)) + while contigs: + current = contigs.pop(0) # Find overlap. If there isn't one - we are done with the current contig. - overlapping_contig = find_overlapping_contig(current, aligned) + overlapping_contig = find_overlapping_contig(current, contigs) if not overlapping_contig: yield current continue # Replace two contigs by their stitched version, then loop with it. 
new_contig = stitch_2_contigs(current, overlapping_contig) - aligned.remove(overlapping_contig) - aligned.insert(0, new_contig) + contigs.remove(overlapping_contig) + contigs.insert(0, new_contig) + + +def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig]: + """ Filter out all contigs that are contained within other contigs. """ + + # TODO: filter out if covered by multiple contigs + # TODO: split contigs that have big gaps in them first, otherwise they will cover too much. + + def find_most_covered(contigs) -> Optional[AlignedContig]: + for current in contigs: + if any(x for x in contigs if x != current and x.contains(current)): + return current + + while contigs: + most_covered = find_most_covered(contigs) + if most_covered: + contigs.remove(most_covered) + else: + break + + return contigs + + +def stitch_contigs(contigs: Iterable[GenotypedContig]): + maybe_aligned = list(map(align_to_reference, contigs)) + + # Contigs that did not align do not need any more processing + yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig)) + aligned = [x for x in maybe_aligned if isinstance(x, AlignedContig)] + + aligned = drop_completely_covered(aligned) + + yield from combine_overlaps(aligned) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index a00a1cfb5..234a6e753 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -201,19 +201,18 @@ def test_stitching_of_identical_contigs(): # Scenario: The function correctly handles and avoids duplication when identical contigs are stitched together. 
ref_seq = 'A' * 100 - contig = \ - GenotypedContig(name='a', + contigs = [ + GenotypedContig(name=name, seq='ACTGACTG' * 100, ref_name='testref', ref_seq='ACTGACTG' * 100, matched_fraction=1.0, ) - - contigs = [contig, contig, contig] + for name in ["a", "b", "c"]] result = list(stitch_contigs(contigs)) assert len(result) == 1 - assert result[0].query == contig + assert result[0].query == contigs[2] def test_stitching_of_zero_contigs(): From eeec6a53ac617ced1ed5b26a85e4d26fa52465a3 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 8 Nov 2023 13:36:18 -0800 Subject: [PATCH 049/485] Cigar tools: add getters for coordinate positions --- micall/tests/test_cigar_tools.py | 10 ++++++---- micall/utils/cigar_tools.py | 8 ++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 23bf97a18..9c9db98d1 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -62,7 +62,7 @@ def test_cigar_to_coordinate_mapping(cigar_str, expected_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping assert expected_mapping == mapping.ref_to_query_d - assert expected_mapping == {i: mapping.ref_to_query(i) for i in mapping.ref_to_query_d} + assert expected_mapping == {i: mapping.ref_to_query(i) for i in mapping.reference_coordinates()} @pytest.mark.parametrize("cigar_str", [x[0] for x in cigar_mapping_cases]) @@ -87,7 +87,8 @@ def test_cigar_to_closest_coordinate_mapping(cigar_str, expected_closest_mapping else: fullrange = {i: mapping.ref_to_closest_query(i) \ - for i in range(min(mapping.ref_to_query_d), 1 + max(mapping.ref_to_query_d))} + for i in range(min(mapping.reference_coordinates()), + 1 + max(mapping.reference_coordinates()))} assert expected_closest_mapping == fullrange @@ -102,7 +103,7 @@ def test_cigar_hit_to_coordinate_mapping(cigar_str, expected_mapping): assert mapping.ref_to_query(0) == None assert mapping.query_to_ref(0) == None assert 
expected_mapping \ - == {i: mapping.ref_to_query(i) for i in mapping.ref_to_query_d} + == {i: mapping.ref_to_query(i) for i in mapping.reference_coordinates()} @pytest.mark.parametrize("cigar_str, expected_closest_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) @@ -119,7 +120,8 @@ def test_cigar_hit_to_coordinate_closest_mapping(cigar_str, expected_closest_map # Coordinates are translated by q_st and r_st. expected_closest_mapping = {k + hit.r_st: v + hit.q_st for (k, v) in expected_closest_mapping.items()} fullrange = {i: mapping.ref_to_closest_query(i) \ - for i in range(min(mapping.ref_to_query_d), 1 + max(mapping.ref_to_query_d))} + for i in range(min(mapping.reference_coordinates()), + 1 + max(mapping.reference_coordinates()))} assert expected_closest_mapping == fullrange diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 3c8beb3a8..213ed5ba7 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -60,6 +60,14 @@ def extend(self, self.query_to_op_d[query_index] = op_index + def reference_coordinates(self): + return self.ref_to_query_d.keys() + + + def query_coordinates(self): + return self.query_to_ref_d.keys() + + def ref_to_query(self, index) -> Optional[int]: return self.ref_to_query_d.get(index, None) From be32fd0a135a755b07d6699d2576d172c45ef18f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 8 Nov 2023 16:14:24 -0800 Subject: [PATCH 050/485] Cigar tools: add epsilon quantity for precise cuts --- micall/utils/cigar_tools.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 213ed5ba7..2708a415b 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -391,6 +391,11 @@ def __add__(self, other): ) + @property + def epsilon(self): + return 1 / (self.query_length + self.ref_length + 100) + + def _slice(self, r_st, r_ei, q_st, q_ei) -> 'CigarHit': mapping = self.coordinate_mapping @@ -426,7 
+431,7 @@ def _ref_cut_to_query_cut(self, cut_point: float): if float(query_cut_point).is_integer(): # Disambiguate to the right. - query_cut_point += 0.1 / (self.r_st + self.r_ei + self.q_st + self.q_ei) + query_cut_point += self.epsilon return query_cut_point From b03cbd5957cd94a84d971cba8b91f39fd65eaa6f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 9 Nov 2023 12:56:11 -0800 Subject: [PATCH 051/485] Cigar tools: return coordinates as sets --- micall/utils/cigar_tools.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 2708a415b..89ad2d708 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -4,7 +4,7 @@ from math import ceil, floor import re -from typing import Container, Tuple, Iterable, Optional +from typing import Container, Tuple, Iterable, Optional, Set from dataclasses import dataclass from functools import cached_property from itertools import chain, dropwhile @@ -60,12 +60,12 @@ def extend(self, self.query_to_op_d[query_index] = op_index - def reference_coordinates(self): - return self.ref_to_query_d.keys() + def reference_coordinates(self) -> Set[int]: + return set(self.ref_to_query_d.keys()) - def query_coordinates(self): - return self.query_to_ref_d.keys() + def query_coordinates(self) -> Set[int]: + return set(self.query_to_ref_d.keys()) def ref_to_query(self, index) -> Optional[int]: From 8b8443dd01938bf25a08d4906698d1580a8f8088 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 9 Nov 2023 22:17:16 -0800 Subject: [PATCH 052/485] Contig stitcher: make align_to_reference a class method --- micall/core/contig_stitcher.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 76d95afea..a9cba9d69 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -22,6 +22,16 @@ class 
GenotypedContig(Contig): ref_seq: str matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. + def align_to_reference(self): + aligner = Aligner(seq=self.ref_seq, preset='map-ont') + alignments = list(aligner.map(self.seq)) + if not alignments: + return self + + hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] + single_cigar_hit = connect_cigar_hits(hits_array) + return AlignedContig(query=self, alignment=single_cigar_hit) + class AlignedContig(GenotypedContig): @@ -140,17 +150,6 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: return AlignedContig(query, alignment) -def align_to_reference(contig: GenotypedContig): - aligner = Aligner(seq=contig.ref_seq, preset='map-ont') - alignments = list(aligner.map(contig.seq)) - if not alignments: - return contig - - hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] - single_cigar_hit = connect_cigar_hits(hits_array) - return AlignedContig(query=contig, alignment=single_cigar_hit) - - def align_equal(seq1: str, seq2: str) -> Tuple[str, str]: gap_open_penalty = 15 gap_extend_penalty = 3 @@ -280,7 +279,7 @@ def find_most_covered(contigs) -> Optional[AlignedContig]: def stitch_contigs(contigs: Iterable[GenotypedContig]): - maybe_aligned = list(map(align_to_reference, contigs)) + maybe_aligned = list(map(GenotypedContig.align_to_reference, contigs)) # Contigs that did not align do not need any more processing yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig)) From 4bee555ec631f0c5801e4837ddc8662da0fb0d26 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 9 Nov 2023 13:10:25 -0800 Subject: [PATCH 053/485] Add gaps() method to CigarHit --- micall/tests/test_cigar_tools.py | 13 +++++++++++++ micall/utils/cigar_tools.py | 28 +++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/micall/tests/test_cigar_tools.py 
b/micall/tests/test_cigar_tools.py index 9c9db98d1..b61912207 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -323,6 +323,19 @@ def test_cigar_hit_rstrip_is_stringlike(hit): assert p(g(hit)) == h(p(hit)) +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception)]) +def test_cigar_hit_gaps_no_m_or_i(hit): + gaps = list(hit.gaps()) + + if 'D' in str(hit.cigar): + assert len(gaps) > 0 + + for gap in gaps: + assert 'M' not in str(gap.cigar) + assert 'I' not in str(gap.cigar) + + @pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ ('ACTG', 'ACTG', '4M', 'ACTG', 'ACTG'), ('ACTG', '', '4D', 'ACTG', '----'), diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 89ad2d708..0de61b76f 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -262,7 +262,7 @@ def rstrip(self, actions: Container[CigarActions]) -> 'Cigar': def coordinate_mapping(self) -> CoordinateMapping: """ Convert a CIGAR string to coordinate mapping representing a reference-to-query and query-to-reference coordinate mappings. - TODO: describe the domains and holes. + TODO(vitalik): describe the domains and holes. :param cigar: a CIGAR string. @@ -370,6 +370,32 @@ def intervals_overlap(x, y): or intervals_overlap((self.q_st, self.q_ei), (other.q_st, other.q_ei)) + def gaps(self) -> Iterable['CigarHit']: + # TODO(vitalik): memoize whatever possible. 
+ + covered_coordinates = self.coordinate_mapping.reference_coordinates() + all_coordinates = range(self.r_st, self.r_ei + 1) + + def make_gap(r_st, r_en): + r_ei = r_en - 1 + left, midright = self.cut_reference(r_st - 0.5) + middle, right = midright.cut_reference(r_ei + 0.5) + return middle + + gap_start = None + for coord in all_coordinates: + if coord in covered_coordinates: + if gap_start is not None: + yield make_gap(gap_start, coord) + gap_start = None + else: + if gap_start is None: + gap_start = coord + + if gap_start is not None: + yield make_gap(gap_start, coord) + + def __add__(self, other): """ Inserts deletions/insertions between self and other, From d1f3a9d9a683b7d39625d00380afb0dfb1dab019 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 9 Nov 2023 22:22:56 -0800 Subject: [PATCH 054/485] Contig stitcher: implement basic gap slicing algorithm --- micall/core/contig_stitcher.py | 52 ++++++++++++- micall/tests/test_contig_stitcher.py | 107 ++++++++++++++++++++++++++- 2 files changed, 157 insertions(+), 2 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index a9cba9d69..cf4c382fa 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -5,6 +5,8 @@ from functools import cached_property, reduce from itertools import accumulate, takewhile from gotoh import align_it +from queue import LifoQueue +from math import floor from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.consensus_aligner import CigarActions @@ -80,6 +82,10 @@ def interval_contains(x, y): (other.alignment.r_st, other.alignment.r_ei)) + def gaps(self) -> Iterable[CigarHit]: + return self.alignment.gaps() + + class SyntheticContig(AlignedContig): def __init__(self, query: GenotypedContig, r_st: int, r_ei: int): alignment = CigarHit.from_default_alignment(r_st=r_st, r_ei=r_ei, @@ -98,7 +104,7 @@ class FrankensteinContig(AlignedContig): Yet its self.seq string looks like a real contig. 
""" - def __init__(self, parts: List[GenotypedContig]): + def __init__(self, parts: List[AlignedContig]): if len(parts) == 0: raise ValueError("Empty Frankenstei do not exist") @@ -278,6 +284,49 @@ def find_most_covered(contigs) -> Optional[AlignedContig]: return contigs +def split_contigs_with_gaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: + def covered_by(gap, contig): + # TODO(vitalik): implement the more precise check + possible_reference_coordinates = set(range(gap.r_st, gap.r_ei + 1)) + return possible_reference_coordinates \ + .issubset(contig.alignment.coordinate_mapping.reference_coordinates()) + + def covered(contig, gap): + return any(covered_by(gap, other) for other in contigs + if other != contig) + + def gap_boundaries(gap): + midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + left_slice, right_slice = contig.cut_reference(floor(midpoint) + 0.5) + left_closest_query = left_slice.alignment.coordinate_mapping.ref_to_closest_query(midpoint) + right_closest_query = right_slice.alignment.coordinate_mapping.ref_to_closest_query(midpoint) + left_closest_ref = left_slice.alignment.coordinate_mapping.query_to_ref(left_closest_query) + right_closest_ref = right_slice.alignment.coordinate_mapping.query_to_ref(right_closest_query) + return (left_closest_ref, right_closest_ref) + + def try_split(contig): + for gap in contig.gaps(): + if covered(contig, gap): + left_closest_ref, right_closest_ref = gap_boundaries(gap) + left_part, left_gap = contig.cut_reference(left_closest_ref + contig.alignment.epsilon) + right_gap, right_part = contig.cut_reference(right_closest_ref - contig.alignment.epsilon) + + contigs.remove(contig) + contigs.append(left_part) + contigs.append(right_part) + process_queue.put(right_part) + return + + process_queue = LifoQueue() + for contig in contigs: process_queue.put(contig) + + while not process_queue.empty(): + contig = process_queue.get() + try_split(contig) + + return contigs + + def stitch_contigs(contigs: 
Iterable[GenotypedContig]): maybe_aligned = list(map(GenotypedContig.align_to_reference, contigs)) @@ -285,6 +334,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]): yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig)) aligned = [x for x in maybe_aligned if isinstance(x, AlignedContig)] + aligned = split_contigs_with_gaps(aligned) aligned = drop_completely_covered(aligned) yield from combine_overlaps(aligned) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 234a6e753..b3a4c572d 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,6 +1,6 @@ import pytest -from micall.core.contig_stitcher import stitch_contigs, GenotypedContig +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig from micall.tests.utils import MockAligner @@ -303,3 +303,108 @@ def test_correct_processing_complex_nogaps(): assert result[2].query == contigs[3] assert result[3].query == contigs[7] + + +def test_stitching_when_one_contig_completely_covered_by_another(): + # Scenario: If one contig is completely covered by another contig, + # the completely covered contig must be dropped. + + ref_seq = 'A' * 100 + 'C' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 20 + 'C' * 20, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 50 + 'C' * 50, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + assert len(result) == 1 + + # Test to ensure that the final result contains the contig 'b' and + # does not contain the completely covered contig 'a'. + assert result[0].query.name == 'b' + assert result[0].query == contigs[1] + + +def test_stitching_contig_with_big_noncovered_gap(): + # Scenario: One contig has a big gap, which is however not covered by anything else. 
+ + ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + + contigs = [ + GenotypedContig(name='a', + seq= 'A' * 50 + 'T' * 50, # mind the C gap + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + + assert sorted(map(lambda x: x.seq, contigs)) \ + == sorted(map(lambda x: x.seq, result)) + + +def test_stitching_contig_with_big_noncovered_gap_2(): + # Scenario: One contig has a big gap, which is however not covered by anything else. + + ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 50 + 'T' * 50, # mind the C gap + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='B', + seq='G' * 50, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + result = list(stitch_contigs(contigs)) + + assert sorted(map(lambda x: x.seq, contigs)) \ + == sorted(map(lambda x: x.seq, result)) + + +def test_stitching_contig_with_big_covered_gap(): + # Scenario: If one contig has a big gap covered by another contig. 
+ + ref_seq = 'G' * 100 + 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='G' * 50 + 'A' * 50 + 'T' * 100, # mind the gap + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 50, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + contigs = [x.align_to_reference() for x in contigs] + assert len(list(contigs[0].gaps())) == 1 + assert len(list(contigs[1].gaps())) == 0 + + result = list(split_contigs_with_gaps(contigs)) + assert len(result) == 3 + assert all(list(contig.gaps()) == [] for contig in result) From c389ee4d6e35317d59af625442a08a416940dcea Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 Nov 2023 16:43:29 -0800 Subject: [PATCH 055/485] Cigar tools: distinguish mapped coordinates and all coordinates --- micall/core/contig_stitcher.py | 5 ++--- micall/tests/test_cigar_tools.py | 20 ++++++++++---------- micall/utils/cigar_tools.py | 16 ++++++++++++---- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index cf4c382fa..150022506 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -287,9 +287,8 @@ def find_most_covered(contigs) -> Optional[AlignedContig]: def split_contigs_with_gaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: def covered_by(gap, contig): # TODO(vitalik): implement the more precise check - possible_reference_coordinates = set(range(gap.r_st, gap.r_ei + 1)) - return possible_reference_coordinates \ - .issubset(contig.alignment.coordinate_mapping.reference_coordinates()) + return gap.coordinate_mapping.all_reference_coordinates() \ + .issubset(contig.alignment.coordinate_mapping.mapped_reference_coordinates()) def covered(contig, gap): return any(covered_by(gap, other) for other in contigs diff --git a/micall/tests/test_cigar_tools.py 
b/micall/tests/test_cigar_tools.py index b61912207..65591f539 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -30,13 +30,13 @@ ('5M3I4M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}, {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}), ('1M1D', {0: 0}, - {0: 0}), + {0: 0, 1: 0}), ('1M1I', {0: 0}, {0: 0}), ('1I1M', {0: 1}, {0: 1}), ('1D1M', {1: 0}, - {1: 0}), + {1: 0, 0: 0}), # Multiple deletions and insertions ('2M2D2M2I2M', {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, @@ -62,7 +62,8 @@ def test_cigar_to_coordinate_mapping(cigar_str, expected_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping assert expected_mapping == mapping.ref_to_query_d - assert expected_mapping == {i: mapping.ref_to_query(i) for i in mapping.reference_coordinates()} + assert expected_mapping == {i: mapping.ref_to_query(i) + for i in mapping.mapped_reference_coordinates()} @pytest.mark.parametrize("cigar_str", [x[0] for x in cigar_mapping_cases]) @@ -86,9 +87,8 @@ def test_cigar_to_closest_coordinate_mapping(cigar_str, expected_closest_mapping mapping.ref_to_closest_query(0) else: - fullrange = {i: mapping.ref_to_closest_query(i) \ - for i in range(min(mapping.reference_coordinates()), - 1 + max(mapping.reference_coordinates()))} + fullrange = {i: mapping.ref_to_closest_query(i) + for i in mapping.all_reference_coordinates()} assert expected_closest_mapping == fullrange @@ -103,7 +103,8 @@ def test_cigar_hit_to_coordinate_mapping(cigar_str, expected_mapping): assert mapping.ref_to_query(0) == None assert mapping.query_to_ref(0) == None assert expected_mapping \ - == {i: mapping.ref_to_query(i) for i in mapping.reference_coordinates()} + == {i: mapping.ref_to_query(i) + for i in mapping.mapped_reference_coordinates()} @pytest.mark.parametrize("cigar_str, expected_closest_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) @@ -119,9 +120,8 @@ def test_cigar_hit_to_coordinate_closest_mapping(cigar_str, expected_closest_map else: # 
Coordinates are translated by q_st and r_st. expected_closest_mapping = {k + hit.r_st: v + hit.q_st for (k, v) in expected_closest_mapping.items()} - fullrange = {i: mapping.ref_to_closest_query(i) \ - for i in range(min(mapping.reference_coordinates()), - 1 + max(mapping.reference_coordinates()))} + fullrange = {i: mapping.ref_to_closest_query(i) + for i in mapping.all_reference_coordinates()} assert expected_closest_mapping == fullrange diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 0de61b76f..48b428e03 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -60,14 +60,22 @@ def extend(self, self.query_to_op_d[query_index] = op_index - def reference_coordinates(self) -> Set[int]: + def mapped_reference_coordinates(self) -> Set[int]: return set(self.ref_to_query_d.keys()) - def query_coordinates(self) -> Set[int]: + def all_reference_coordinates(self) -> Set[int]: + return set(self.ref_to_op_d.keys()) + + + def mapped_query_coordinates(self) -> Set[int]: return set(self.query_to_ref_d.keys()) + def all_query_coordinates(self) -> Set[int]: + return set(self.query_to_op_d.keys()) + + def ref_to_query(self, index) -> Optional[int]: return self.ref_to_query_d.get(index, None) @@ -373,8 +381,8 @@ def intervals_overlap(x, y): def gaps(self) -> Iterable['CigarHit']: # TODO(vitalik): memoize whatever possible. 
- covered_coordinates = self.coordinate_mapping.reference_coordinates() - all_coordinates = range(self.r_st, self.r_ei + 1) + covered_coordinates = self.coordinate_mapping.mapped_reference_coordinates() + all_coordinates = self.coordinate_mapping.all_reference_coordinates() def make_gap(r_st, r_en): r_ei = r_en - 1 From 3cba9d4e173d7619c91c627c1dfb3ab6ef5c0544 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 Nov 2023 16:57:07 -0800 Subject: [PATCH 056/485] Contig stitcher: skip insignificant gaps --- micall/core/contig_stitcher.py | 6 ++++++ micall/tests/test_contig_stitcher.py | 30 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 150022506..0719c51ac 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -303,8 +303,14 @@ def gap_boundaries(gap): right_closest_ref = right_slice.alignment.coordinate_mapping.query_to_ref(right_closest_query) return (left_closest_ref, right_closest_ref) + def significant(gap): + return gap.ref_length > 100 + def try_split(contig): for gap in contig.gaps(): + if not significant(gap): + continue + if covered(contig, gap): left_closest_ref, right_closest_ref = gap_boundaries(gap) left_part, left_gap = contig.cut_reference(left_closest_ref + contig.alignment.epsilon) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index b3a4c572d..a92c257c5 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -408,3 +408,33 @@ def test_stitching_contig_with_big_covered_gap(): result = list(split_contigs_with_gaps(contigs)) assert len(result) == 3 assert all(list(contig.gaps()) == [] for contig in result) + + +def test_stitching_contig_with_small_covered_gap(): + # Scenario: If one contig has a small gap covered by another contig. 
+ + ref_seq = 'G' * 100 + 'A' * 9 + 'C' * 100 + 'T' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='G' * 100 + 'A' * 0 + 'C' * 100, # mind the gap + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 9 + 'C' * 100 + 'T' * 100, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.5, + ), + ] + + contigs = [x.align_to_reference() for x in contigs] + assert len(list(contigs[0].gaps())) == 1 + assert len(list(contigs[1].gaps())) == 0 + + result = list(split_contigs_with_gaps(contigs)) + + assert sorted(map(lambda x: x.seq, contigs)) \ + == sorted(map(lambda x: x.seq, result)) From ba17fdc6eab7b1aacd8e07dc14be78da23db2b57 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 Nov 2023 17:02:27 -0800 Subject: [PATCH 057/485] Tests: do not auto-use mock aligner --- micall/tests/test_contig_stitcher.py | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index a92c257c5..0494bc541 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -4,12 +4,12 @@ from micall.tests.utils import MockAligner -@pytest.fixture(autouse=True) -def mock_mappy_aligner(monkeypatch): +@pytest.fixture() +def exact_aligner(monkeypatch): monkeypatch.setattr('micall.core.contig_stitcher.Aligner', MockAligner) -def test_identical_stitching_of_one_contig(): +def test_identical_stitching_of_one_contig(exact_aligner): # Scenario: When stitching one contig, it remains the same. contigs = [ @@ -26,7 +26,7 @@ def test_identical_stitching_of_one_contig(): == sorted(map(lambda x: x.seq, result)) -def test_separate_stitching_of_non_overlapping_contigs(): +def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): # Scenario: When stitching multiple non-overlapping contigs, the order doesn't matter. 
ref_seq = 'A' * 100 @@ -70,7 +70,7 @@ def test_separate_stitching_of_non_overlapping_contigs(): == sorted(map(lambda x: x.seq, result)) -def test_correct_stitching_of_two_partially_overlapping_contigs(): +def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. ref_seq = 'A' * 100 + 'C' * 100 @@ -100,7 +100,7 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(): assert result.query.name == 'a+overlap(a,b)+b' -def test_correct_processing_of_two_overlapping_and_one_separate_contig(): +def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_aligner): # Scenario: Two overlapping contigs are stitched together, the non-overlapping is kept separate. ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 @@ -136,7 +136,7 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(): assert result[1].query == contigs[2] -def test_stitching_of_all_overlapping_contigs_into_one_sequence(): +def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): # Scenario: All contigs have some overlapping parts, resulting in one continuous sequence after stitching. ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 @@ -172,7 +172,7 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(): assert result.query.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' -def test_stitching_with_empty_contigs(): +def test_stitching_with_empty_contigs(exact_aligner): # Scenario: The function is able to handle and ignore empty contigs. ref_seq = 'A' * 100 @@ -197,7 +197,7 @@ def test_stitching_with_empty_contigs(): == sorted(map(lambda x: x.seq, result)) -def test_stitching_of_identical_contigs(): +def test_stitching_of_identical_contigs(exact_aligner): # Scenario: The function correctly handles and avoids duplication when identical contigs are stitched together. 
ref_seq = 'A' * 100 @@ -215,7 +215,7 @@ def test_stitching_of_identical_contigs(): assert result[0].query == contigs[2] -def test_stitching_of_zero_contigs(): +def test_stitching_of_zero_contigs(exact_aligner): # Scenario: The function does not crash if no contigs given. contigs = [] @@ -223,7 +223,7 @@ def test_stitching_of_zero_contigs(): assert result == contigs -def test_correct_stitching_of_two_partially_overlapping_different_organism_contigs(): +def test_correct_stitching_of_two_partially_overlapping_different_organism_contigs(exact_aligner): # Scenario: Two partially overlapping contigs, but which come from different organism, # are not stitched into a single sequence. @@ -251,7 +251,7 @@ def test_correct_stitching_of_two_partially_overlapping_different_organism_conti == sorted(map(lambda x: x.seq, result)) -def test_correct_processing_complex_nogaps(): +def test_correct_processing_complex_nogaps(exact_aligner): # Scenario: There are two reference organisms. # Each with 4 contigs. # For each, three overlapping contigs are stitched together, the non-overlapping is kept separate. @@ -305,7 +305,7 @@ def test_correct_processing_complex_nogaps(): assert result[3].query == contigs[7] -def test_stitching_when_one_contig_completely_covered_by_another(): +def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): # Scenario: If one contig is completely covered by another contig, # the completely covered contig must be dropped. @@ -335,7 +335,7 @@ def test_stitching_when_one_contig_completely_covered_by_another(): assert result[0].query == contigs[1] -def test_stitching_contig_with_big_noncovered_gap(): +def test_stitching_contig_with_big_noncovered_gap(exact_aligner): # Scenario: One contig has a big gap, which is however not covered by anything else. 
ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 @@ -355,7 +355,7 @@ def test_stitching_contig_with_big_noncovered_gap(): == sorted(map(lambda x: x.seq, result)) -def test_stitching_contig_with_big_noncovered_gap_2(): +def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): # Scenario: One contig has a big gap, which is however not covered by anything else. ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 @@ -381,7 +381,7 @@ def test_stitching_contig_with_big_noncovered_gap_2(): == sorted(map(lambda x: x.seq, result)) -def test_stitching_contig_with_big_covered_gap(): +def test_stitching_contig_with_big_covered_gap(exact_aligner): # Scenario: If one contig has a big gap covered by another contig. ref_seq = 'G' * 100 + 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 @@ -410,7 +410,7 @@ def test_stitching_contig_with_big_covered_gap(): assert all(list(contig.gaps()) == [] for contig in result) -def test_stitching_contig_with_small_covered_gap(): +def test_stitching_contig_with_small_covered_gap(exact_aligner): # Scenario: If one contig has a small gap covered by another contig. 
ref_seq = 'G' * 100 + 'A' * 9 + 'C' * 100 + 'T' * 100 From 017a93aac9aa9c6daf4a056cbcfef8a18730867b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 Nov 2023 17:15:19 -0800 Subject: [PATCH 058/485] Tests: move fixed_random_seed to shared utils.py file --- micall/tests/test_consensus_aligner.py | 12 +----------- micall/tests/utils.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 2483563a8..0d6b36e63 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -3,7 +3,6 @@ import random from io import StringIO from pytest import approx -from contextlib import contextmanager from micall.core.aln2counts import SeedAmino, ReportAmino from micall.utils.consensus_aligner import ConsensusAligner, AlignmentWrapper, CigarActions, AminoAlignment @@ -11,19 +10,10 @@ # noinspection PyUnresolvedReferences from micall.tests.test_remap import load_projects +from micall.tests.utils import fixed_random_seed from micall.utils.report_amino import ReportNucleotide -@contextmanager -def fixed_random_seed(seed): - original_state = random.getstate() - random.seed(seed) - try: - yield - finally: - random.setstate(original_state) - - def mutate_sequence(rate, seq): def mutate(x): if random.random() >= rate: diff --git a/micall/tests/utils.py b/micall/tests/utils.py index 6a7de309a..8d823f20c 100644 --- a/micall/tests/utils.py +++ b/micall/tests/utils.py @@ -1,5 +1,7 @@ from dataclasses import dataclass from math import floor, ceil +from contextlib import contextmanager +import random from micall.utils.consensus_aligner import CigarActions @@ -54,3 +56,13 @@ def map(self, seq): max_matches -= 1 if max_matches < 1: return + + +@contextmanager +def fixed_random_seed(seed): + original_state = random.getstate() + random.seed(seed) + try: + yield + finally: + random.setstate(original_state) From 
8f41cf54600bc69206d72643a3a5b3c1b294991b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 Nov 2023 18:00:46 -0800 Subject: [PATCH 059/485] Cigar tools: improve semantics of "closest" It is now based on op index. --- micall/core/contig_stitcher.py | 6 +++-- micall/tests/test_cigar_tools.py | 31 ++++++++++++---------- micall/utils/cigar_tools.py | 44 ++++++++++++++++++++------------ 3 files changed, 49 insertions(+), 32 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 0719c51ac..fe77cd85f 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -297,8 +297,10 @@ def covered(contig, gap): def gap_boundaries(gap): midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 left_slice, right_slice = contig.cut_reference(floor(midpoint) + 0.5) - left_closest_query = left_slice.alignment.coordinate_mapping.ref_to_closest_query(midpoint) - right_closest_query = right_slice.alignment.coordinate_mapping.ref_to_closest_query(midpoint) + left_midpoint_ref = left_slice.alignment.coordinate_mapping.find_closest_ref(midpoint) + left_closest_query = left_slice.alignment.coordinate_mapping.ref_to_closest_query(left_midpoint_ref) + right_midpoint_ref = right_slice.alignment.coordinate_mapping.find_closest_ref(midpoint) + right_closest_query = right_slice.alignment.coordinate_mapping.ref_to_closest_query(right_midpoint_ref) left_closest_ref = left_slice.alignment.coordinate_mapping.query_to_ref(left_closest_query) right_closest_ref = right_slice.alignment.coordinate_mapping.query_to_ref(right_closest_query) return (left_closest_ref, right_closest_ref) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 65591f539..43ef620b7 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -51,8 +51,8 @@ {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 4, 7: 4, 8: 5}), # Edge cases - ('', {}, ValueError()), - ('12I', {}, ValueError()), + ('', {}, {}), + ('12I', {}, 
{}), ('12D', {}, ValueError()), ] @@ -82,15 +82,17 @@ def test_cigar_to_coordinate_bijection_property(cigar_str): def test_cigar_to_closest_coordinate_mapping(cigar_str, expected_closest_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping - if isinstance(expected_closest_mapping, Exception): - with pytest.raises(type(expected_closest_mapping)): - mapping.ref_to_closest_query(0) - - else: + def test(): fullrange = {i: mapping.ref_to_closest_query(i) for i in mapping.all_reference_coordinates()} assert expected_closest_mapping == fullrange + if isinstance(expected_closest_mapping, Exception): + with pytest.raises(type(expected_closest_mapping)): + test() + else: + test() + @pytest.mark.parametrize("cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases]) def test_cigar_hit_to_coordinate_mapping(cigar_str, expected_mapping): @@ -113,16 +115,17 @@ def test_cigar_hit_to_coordinate_closest_mapping(cigar_str, expected_closest_map hit = CigarHit(cigar, r_st=5, r_ei=(5 + cigar.ref_length - 1), q_st=7, q_ei=(7 + cigar.query_length - 1)) mapping = hit.coordinate_mapping - if isinstance(expected_closest_mapping, Exception): - with pytest.raises(type(expected_closest_mapping)): - mapping.ref_to_closest_query(0) - - else: + def test(expected): # Coordinates are translated by q_st and r_st. 
- expected_closest_mapping = {k + hit.r_st: v + hit.q_st for (k, v) in expected_closest_mapping.items()} fullrange = {i: mapping.ref_to_closest_query(i) for i in mapping.all_reference_coordinates()} - assert expected_closest_mapping == fullrange + assert expected == fullrange + + if isinstance(expected_closest_mapping, Exception): + with pytest.raises(type(expected_closest_mapping)): + test(expected_closest_mapping) + else: + test({k + hit.r_st: v + hit.q_st for (k, v) in expected_closest_mapping.items()}) def test_invalid_operation_in_cigar_string(): diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 48b428e03..3f6d40eb3 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -47,17 +47,16 @@ def __init__(self): def extend(self, ref_index: Optional[int], query_index: Optional[int], - op_index: Optional[int]): + op_index: int): if ref_index is not None and query_index is not None: self.ref_to_query_d[ref_index] = query_index self.query_to_ref_d[query_index] = ref_index - if op_index is not None: - if ref_index is not None: - self.ref_to_op_d[ref_index] = op_index - if query_index is not None: - self.query_to_op_d[query_index] = op_index + if ref_index is not None: + self.ref_to_op_d[ref_index] = op_index + if query_index is not None: + self.query_to_op_d[query_index] = op_index def mapped_reference_coordinates(self) -> Set[int]: @@ -84,27 +83,40 @@ def query_to_ref(self, index) -> Optional[int]: return self.query_to_ref_d.get(index, None) + def ref_to_leftsup_query(self, index) -> Optional[int]: + left_neihbourhood = (k for (k, v) in self.query_to_ref_d.items() if v <= index) + return max(left_neihbourhood, default=None) + + + def ref_to_rightinf_query(self, index) -> Optional[int]: + right_neihbourhood = (k for (k, v) in self.query_to_ref_d.items() if index <= v) + return min(right_neihbourhood, default=None) + + + @staticmethod + def _find_closest(collection, value) -> int: + return min(collection, key=lambda x: 
abs(x - value)) + + @staticmethod def _find_closest_key(mapping: dict, index: int) -> int: return min(mapping, key=lambda k: abs(mapping[k] - index)) - def ref_to_closest_query(self, index) -> int: - return CoordinateMapping._find_closest_key(self.query_to_ref_d, index) + def find_closest_ref(self, index) -> int: + return CoordinateMapping._find_closest(self.all_reference_coordinates(), index) - def query_to_closest_ref(self, index) -> int: - return CoordinateMapping._find_closest_key(self.ref_to_query_d, index) + def find_closest_query(self, index) -> int: + return CoordinateMapping._find_closest(self.all_query_coordinates(), index) - def ref_to_leftsup_query(self, index) -> Optional[int]: - left_neihbourhood = (k for (k, v) in self.query_to_ref_d.items() if v <= index) - return max(left_neihbourhood, default=None) + def ref_to_closest_query(self, index) -> int: + return CoordinateMapping._find_closest_key(self.query_to_op_d, self.ref_to_op_d[index]) - def ref_to_rightinf_query(self, index) -> Optional[int]: - right_neihbourhood = (k for (k, v) in self.query_to_ref_d.items() if index <= v) - return min(right_neihbourhood, default=None) + def query_to_closest_ref(self, index) -> int: + return CoordinateMapping._find_closest_key(self.ref_to_op_d, self.query_to_op_d[index]) def ref_or_query_to_op(self, ref_index: int, query_index: int, conflict): From 6b2bbd6997cfc10dc944db29d1028c657e911e06 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 Nov 2023 18:35:26 -0800 Subject: [PATCH 060/485] Tests: add two more edge cases for Cigar tools --- micall/tests/test_cigar_tools.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 43ef620b7..9be5374fa 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -52,6 +52,10 @@ # Edge cases ('', {}, {}), + ('3I3D', {}, + {0: 2, 1: 2, 2: 2}), + ('3D3I', {}, + {0: 0, 1: 0, 2: 0}), ('12I', {}, {}), ('12D', {}, 
ValueError()), ] From f00d2c7be8f6d016dd29789c7f7c5740d840417c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Sat, 11 Nov 2023 11:47:54 -0800 Subject: [PATCH 061/485] Cigar tools: reimplement strip operations --- micall/tests/test_cigar_tools.py | 78 +++++++++++++++++--------------- micall/utils/cigar_tools.py | 52 +++++++++------------ 2 files changed, 63 insertions(+), 67 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 9be5374fa..a42c7326a 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -150,7 +150,7 @@ def test_invalid_cigar_string(): cigar_hit_ref_cut_cases = [ - # # Trivial cases + # Trivial cases (CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), 2.5, [CigarHit('2M', r_st=1, r_ei=2, q_st=1, q_ei=2), CigarHit('2M', r_st=3, r_ei=4, q_st=3, q_ei=4)]), @@ -184,6 +184,34 @@ def test_invalid_cigar_string(): [CigarHit('9M9I4M', r_st=1, r_ei=13, q_st=1, q_ei=22), CigarHit('5M', r_st=14, r_ei=18, q_st=23, q_ei=27)]), + (CigarHit('5M6I', r_st=1, r_ei=5, q_st=1, q_ei=11), 3.5, + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('2M6I', r_st=4, r_ei=5, q_st=4, q_ei=11)]), + + (CigarHit('6I5M', r_st=1, r_ei=5, q_st=1, q_ei=11), 3.5, + [CigarHit('6I3M', r_st=1, r_ei=3, q_st=1, q_ei=9), + CigarHit('2M', r_st=4, r_ei=5, q_st=10, q_ei=11)]), + + (CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5), 3.5, + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('2M6D', r_st=4, r_ei=11, q_st=4, q_ei=5)]), + + (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), 3.5, + [CigarHit('3D', r_st=1, r_ei=3, q_st=1, q_ei=0), + CigarHit('3D5M', r_st=4, r_ei=11, q_st=1, q_ei=5)]), + + (CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5), 7.5, + [CigarHit('5M2D', r_st=1, r_ei=7, q_st=1, q_ei=5), + CigarHit('4D', r_st=8, r_ei=11, q_st=6, q_ei=5)]), + + (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), 7.5, + [CigarHit('6D1M', r_st=1, r_ei=7, q_st=1, q_ei=1), + CigarHit('4M', r_st=8, 
r_ei=11, q_st=2, q_ei=5)]), + + (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), 6.5, + [CigarHit('6D', r_st=1, r_ei=6, q_st=1, q_ei=0), + CigarHit('5M', r_st=7, r_ei=11, q_st=1, q_ei=5)]), + # Ambigous cases (CigarHit('9M9D9M', r_st=1, r_ei=27, q_st=1, q_ei=18), 13.5 or 27/2, [CigarHit('9M4D', r_st=1, r_ei=13, q_st=1, q_ei=9), @@ -278,6 +306,18 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): assert left + right == hit +@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception) + and not 'N' in str(x[0].cigar)]) +def test_cigar_hit_strip_combines_with_add(hit, cut_point): + left, right = hit.cut_reference(cut_point) + + left = left.rstrip_query() + right = right.lstrip_query() + + assert left + right == hit + + @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): @@ -294,42 +334,6 @@ def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): assert (a + b) + c == a + (b + c) -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) -def test_cigar_hit_lstrip_is_stringlike(hit): - all_chars = CIGAR_OP_MAPPING.keys() - - actions_of = lambda s: (x for x in s if x in all_chars) - - for r in range(len(all_chars) + 1): - for char_set in itertools.combinations(all_chars, r): - actions = set(map(parse_cigar_operation, char_set)) - chars = ''.join(char_set) - - p = lambda x: ''.join(actions_of(str(x.cigar))) - g = lambda x: x.lstrip(actions) - h = lambda x: x.lstrip(chars) - - assert p(g(hit)) == h(p(hit)) - - -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) -def test_cigar_hit_rstrip_is_stringlike(hit): - all_chars = CIGAR_OP_MAPPING.keys() - - actions_of = lambda s: (x for x in s if x in all_chars) - - for r in range(len(all_chars) + 1): - for char_set in itertools.combinations(all_chars, r): - actions = 
set(map(parse_cigar_operation, char_set)) - chars = ''.join(char_set) - - p = lambda x: ''.join(actions_of(str(x.cigar))) - g = lambda x: x.rstrip(actions) - h = lambda x: x.rstrip(chars) - - assert p(g(hit)) == h(p(hit)) - - @pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) def test_cigar_hit_gaps_no_m_or_i(hit): diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 3f6d40eb3..36b829262 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -268,16 +268,6 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': [start_inclusive:end_noninclusive]) - def lstrip(self, actions: Container[CigarActions]) -> 'Cigar': - """ Return a copy of the Cigar with leading actions removed. """ - return Cigar(dropwhile(lambda tupl: tupl[1] in actions, self)) - - - def rstrip(self, actions: Container[CigarActions]) -> 'Cigar': - """ Return a copy of the Cigar with trailing actions removed. """ - return Cigar(reversed(list(dropwhile(lambda tupl: tupl[1] in actions, reversed(self))))) - - @cached_property def coordinate_mapping(self) -> CoordinateMapping: """ @@ -507,30 +497,32 @@ def cut_reference(self, cut_point: float) -> 'CigarHit': return left, right - def lstrip(self, actions: Container[CigarActions]) -> 'CigarHit': - """ Return a copy of the CigarHit with leading actions removed. """ + def lstrip_query(self) -> 'CigarHit': + """ Return a copy of the CigarHit with leading (unmatched) query elements removed. 
""" - cigar = self.cigar.lstrip(actions) - reference_delta = cigar.ref_length - self.cigar.ref_length - query_delta = cigar.query_length - self.cigar.query_length - return CigarHit(cigar, - r_st=self.r_st, - r_ei=self.r_ei + reference_delta, - q_st=self.q_st, - q_ei=self.q_ei + query_delta) + if self.query_length == 0: + return self + boundary_ref = self.coordinate_mapping.find_closest_ref(self.r_st - 1) + closest_query = self.coordinate_mapping.ref_to_closest_query(boundary_ref) + closest_ref = self.coordinate_mapping.query_to_ref(closest_query) - def rstrip(self, actions: Container[CigarActions]) -> 'CigarHit': - """ Return a copy of the CigarHit with trailing actions removed. """ + remainder, stripped = self.cut_reference(closest_ref - self.epsilon) + return stripped - cigar = self.cigar.rstrip(actions) - reference_delta = cigar.ref_length - self.cigar.ref_length - query_delta = cigar.query_length - self.cigar.query_length - return CigarHit(cigar, - r_st=self.r_st, - r_ei=self.r_ei + reference_delta, - q_st=self.q_st, - q_ei=self.q_ei + query_delta) + + def rstrip_query(self) -> 'CigarHit': + """ Return a copy of the CigarHit with trailing (unmatched) query elements removed. 
""" + + if self.query_length == 0: + return self + + boundary_ref = self.coordinate_mapping.find_closest_ref(self.r_ei + 1) + closest_query = self.coordinate_mapping.ref_to_closest_query(boundary_ref) + closest_ref = self.coordinate_mapping.query_to_ref(closest_query) + + stripped, remainder = self.cut_reference(closest_ref + self.epsilon) + return stripped @cached_property From a2f2c2c7dd2e535cef1b8aec8c77095690316c9f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Sat, 11 Nov 2023 11:51:19 -0800 Subject: [PATCH 062/485] Contig stitcher: rebase gap splitting on the new strip operation --- micall/core/contig_stitcher.py | 39 +++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index fe77cd85f..86d77ccc7 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -60,6 +60,16 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont AlignedContig(self.query, alignment_right)) + def lstrip_query(self) -> 'AlignedContig': + alignment = self.alignment.lstrip_query() + return AlignedContig(self.query, alignment) + + + def rstrip_query(self) -> 'AlignedContig': + alignment = self.alignment.rstrip_query() + return AlignedContig(self.query, alignment) + + def overlaps(self, other) -> bool: def intervals_overlap(x, y): return x[0] <= y[1] and x[1] >= y[0] @@ -285,38 +295,33 @@ def find_most_covered(contigs) -> Optional[AlignedContig]: def split_contigs_with_gaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: - def covered_by(gap, contig): + def covered_by(gap, other): # TODO(vitalik): implement the more precise check return gap.coordinate_mapping.all_reference_coordinates() \ - .issubset(contig.alignment.coordinate_mapping.mapped_reference_coordinates()) + .issubset(other.alignment.coordinate_mapping.mapped_reference_coordinates()) def covered(contig, gap): return any(covered_by(gap, other) for other in 
contigs if other != contig) - def gap_boundaries(gap): - midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 - left_slice, right_slice = contig.cut_reference(floor(midpoint) + 0.5) - left_midpoint_ref = left_slice.alignment.coordinate_mapping.find_closest_ref(midpoint) - left_closest_query = left_slice.alignment.coordinate_mapping.ref_to_closest_query(left_midpoint_ref) - right_midpoint_ref = right_slice.alignment.coordinate_mapping.find_closest_ref(midpoint) - right_closest_query = right_slice.alignment.coordinate_mapping.ref_to_closest_query(right_midpoint_ref) - left_closest_ref = left_slice.alignment.coordinate_mapping.query_to_ref(left_closest_query) - right_closest_ref = right_slice.alignment.coordinate_mapping.query_to_ref(right_closest_query) - return (left_closest_ref, right_closest_ref) - def significant(gap): - return gap.ref_length > 100 + return gap.ref_length > 5 def try_split(contig): for gap in contig.gaps(): if not significant(gap): + # Really we do not want to split on every little deletion + # because that would mean that we would need to stitch + # overlaps around them. + # And we are likely to lose quality with every stitching operation. + # By skipping we assert that this gap is aligner's fault. 
continue if covered(contig, gap): - left_closest_ref, right_closest_ref = gap_boundaries(gap) - left_part, left_gap = contig.cut_reference(left_closest_ref + contig.alignment.epsilon) - right_gap, right_part = contig.cut_reference(right_closest_ref - contig.alignment.epsilon) + midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + contig.alignment.epsilon + left_part, right_part = contig.cut_reference(midpoint) + left_part = left_part.rstrip_query() + right_part = right_part.lstrip_query() contigs.remove(contig) contigs.append(left_part) From 8888bb7a8ebb873b2afaebbe54e49e09913762fe Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Sat, 11 Nov 2023 11:56:33 -0800 Subject: [PATCH 063/485] Contig stitcher: implement the more precise gap coverage check --- micall/core/contig_stitcher.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 86d77ccc7..96b690454 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -296,9 +296,10 @@ def find_most_covered(contigs) -> Optional[AlignedContig]: def split_contigs_with_gaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: def covered_by(gap, other): - # TODO(vitalik): implement the more precise check - return gap.coordinate_mapping.all_reference_coordinates() \ - .issubset(other.alignment.coordinate_mapping.mapped_reference_coordinates()) + # Check if any 1 reference coordinate in gap is mapped in other. 
+ gap_coords = gap.coordinate_mapping.all_reference_coordinates() + cover_coords = other.alignment.coordinate_mapping.mapped_reference_coordinates() + return not gap_coords.isdisjoint(cover_coords) def covered(contig, gap): return any(covered_by(gap, other) for other in contigs From 6c6786e71a9e3c4a3504fe89a6eca9591885594f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Sat, 11 Nov 2023 12:19:33 -0800 Subject: [PATCH 064/485] Cigar tools: prevent floating point errors in cut_reference --- micall/utils/cigar_tools.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 36b829262..f68765da0 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -8,6 +8,7 @@ from dataclasses import dataclass from functools import cached_property from itertools import chain, dropwhile +from fractions import Fraction from micall.utils.consensus_aligner import CigarActions @@ -429,7 +430,7 @@ def __add__(self, other): @property def epsilon(self): - return 1 / (self.query_length + self.ref_length + 100) + return Fraction(1, self.query_length + self.ref_length + 100) def _slice(self, r_st, r_ei, q_st, q_ei) -> 'CigarHit': @@ -457,9 +458,9 @@ def _ref_cut_to_query_cut(self, cut_point: float): right_query_cut_point = mapping.ref_to_rightinf_query(ceil(cut_point)) if left_query_cut_point is None: - return self.q_st - 0.1 + return self.q_st - 0.5 if right_query_cut_point is None: - return self.q_ei + 0.1 + return self.q_ei + 0.5 lerp = lambda start, end, t: (1 - t) * start + t * end query_cut_point = lerp(left_query_cut_point, right_query_cut_point, @@ -479,7 +480,8 @@ def cut_reference(self, cut_point: float) -> 'CigarHit': and that no element is lost. 
""" - if float(cut_point).is_integer(): + cut_point = Fraction(cut_point) + if cut_point.denominator == 1: raise ValueError("Cut accepts fractions, not integers") if self.ref_length == 0 or \ From 3fb21ff8f98f7c1a2d8ebe7acdcf4b303242ca89 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Sun, 12 Nov 2023 11:47:32 -0800 Subject: [PATCH 065/485] Cigar tools: reimplement CoordinateMapping Base on a new structure called PartialDict --- micall/core/contig_stitcher.py | 4 +- micall/tests/test_cigar_tools.py | 26 +++---- micall/utils/cigar_tools.py | 125 ++++++++++++++----------------- 3 files changed, 73 insertions(+), 82 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 96b690454..6e3a5c7be 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -297,8 +297,8 @@ def find_most_covered(contigs) -> Optional[AlignedContig]: def split_contigs_with_gaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: def covered_by(gap, other): # Check if any 1 reference coordinate in gap is mapped in other. 
- gap_coords = gap.coordinate_mapping.all_reference_coordinates() - cover_coords = other.alignment.coordinate_mapping.mapped_reference_coordinates() + gap_coords = gap.coordinate_mapping.ref_to_query.domain + cover_coords = set(other.alignment.coordinate_mapping.ref_to_query.keys()) return not gap_coords.isdisjoint(cover_coords) def covered(contig, gap): diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index a42c7326a..98e339909 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -65,9 +65,9 @@ def test_cigar_to_coordinate_mapping(cigar_str, expected_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping - assert expected_mapping == mapping.ref_to_query_d - assert expected_mapping == {i: mapping.ref_to_query(i) - for i in mapping.mapped_reference_coordinates()} + assert expected_mapping == mapping.ref_to_query + assert expected_mapping == {i: mapping.ref_to_query[i] + for i in mapping.ref_to_query.keys()} @pytest.mark.parametrize("cigar_str", [x[0] for x in cigar_mapping_cases]) @@ -76,10 +76,10 @@ def test_cigar_to_coordinate_bijection_property(cigar_str): mapping = Cigar.coerce(cigar_str).coordinate_mapping - assert mapping.query_to_ref_d == inverse(mapping.ref_to_query_d) - assert mapping.ref_to_query_d == inverse(mapping.query_to_ref_d) - assert mapping.ref_to_query_d == inverse(inverse(mapping.ref_to_query_d)) - assert mapping.query_to_ref_d == inverse(inverse(mapping.query_to_ref_d)) + assert mapping.query_to_ref == inverse(mapping.ref_to_query) + assert mapping.ref_to_query == inverse(mapping.query_to_ref) + assert mapping.ref_to_query == inverse(inverse(mapping.ref_to_query)) + assert mapping.query_to_ref == inverse(inverse(mapping.query_to_ref)) @pytest.mark.parametrize("cigar_str, expected_closest_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) @@ -88,7 +88,7 @@ def test_cigar_to_closest_coordinate_mapping(cigar_str, expected_closest_mapping def test(): fullrange = {i: 
mapping.ref_to_closest_query(i) - for i in mapping.all_reference_coordinates()} + for i in mapping.ref_to_query.domain} assert expected_closest_mapping == fullrange if isinstance(expected_closest_mapping, Exception): @@ -106,11 +106,11 @@ def test_cigar_hit_to_coordinate_mapping(cigar_str, expected_mapping): # Coordinates are translated by q_st and r_st. expected_mapping = {k + hit.r_st: v + hit.q_st for (k, v) in expected_mapping.items()} - assert mapping.ref_to_query(0) == None - assert mapping.query_to_ref(0) == None + assert mapping.ref_to_query.get(0, None) == None + assert mapping.query_to_ref.get(0, None) == None assert expected_mapping \ - == {i: mapping.ref_to_query(i) - for i in mapping.mapped_reference_coordinates()} + == {i: mapping.ref_to_query[i] + for i in mapping.ref_to_query.keys()} @pytest.mark.parametrize("cigar_str, expected_closest_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) @@ -122,7 +122,7 @@ def test_cigar_hit_to_coordinate_closest_mapping(cigar_str, expected_closest_map def test(expected): # Coordinates are translated by q_st and r_st. 
fullrange = {i: mapping.ref_to_closest_query(i) - for i in mapping.all_reference_coordinates()} + for i in mapping.ref_to_query.domain} assert expected == fullrange if isinstance(expected_closest_mapping, Exception): diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index f68765da0..4d23d7d3a 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -4,7 +4,7 @@ from math import ceil, floor import re -from typing import Container, Tuple, Iterable, Optional, Set +from typing import Container, Tuple, Iterable, Optional, Set, Dict from dataclasses import dataclass from functools import cached_property from itertools import chain, dropwhile @@ -37,66 +37,70 @@ def cigar_operation_to_str(op: CigarActions) -> str: return [k for (k, v) in CIGAR_OP_MAPPING.items() if v == op][0] -class CoordinateMapping: +class PartialDict(dict): def __init__(self): - self.query_to_ref_d = {} - self.ref_to_query_d = {} - self.ref_to_op_d = {} - self.query_to_op_d = {} - + super().__init__() + self.domain = set() # superset of self.keys() + self.codomain = set() # superset of self.values() - def extend(self, - ref_index: Optional[int], - query_index: Optional[int], - op_index: int): - if ref_index is not None and query_index is not None: - self.ref_to_query_d[ref_index] = query_index - self.query_to_ref_d[query_index] = ref_index + def extend(self, key: Optional[int], value: Optional[int]): + if key is not None and value is not None: + self[key] = value - if ref_index is not None: - self.ref_to_op_d[ref_index] = op_index - if query_index is not None: - self.query_to_op_d[query_index] = op_index + if key is not None: + self.domain.add(key) + if value is not None: + self.codomain.add(value) - def mapped_reference_coordinates(self) -> Set[int]: - return set(self.ref_to_query_d.keys()) + def closest_key(self, index) -> int: + return min(self.keys(), key=lambda x: abs(x - index)) - def all_reference_coordinates(self) -> Set[int]: - return 
set(self.ref_to_op_d.keys()) + def left_supremum(self, value) -> Optional[int]: + left_neihbourhood = (k for k in self.keys() if self[k] <= value) + return max(left_neihbourhood, default=None) - def mapped_query_coordinates(self) -> Set[int]: - return set(self.query_to_ref_d.keys()) + def right_infimum(self, value) -> Optional[int]: + right_neihbourhood = (k for k in self.keys() if value <= self[k]) + return min(right_neihbourhood, default=None) - def all_query_coordinates(self) -> Set[int]: - return set(self.query_to_op_d.keys()) + def translate(self, domain_delta: int, codomain_delta: int) -> 'PartialDict': + ret = PartialDict() - def ref_to_query(self, index) -> Optional[int]: - return self.ref_to_query_d.get(index, None) + for k, v in self.items(): + ret.extend(k + domain_delta, v + codomain_delta) + for k in self.domain: + ret.extend(k + domain_delta, None) - def query_to_ref(self, index) -> Optional[int]: - return self.query_to_ref_d.get(index, None) + for v in self.codomain: + ret.extend(None, v + codomain_delta) + return ret - def ref_to_leftsup_query(self, index) -> Optional[int]: - left_neihbourhood = (k for (k, v) in self.query_to_ref_d.items() if v <= index) - return max(left_neihbourhood, default=None) +class CoordinateMapping: + def __init__(self): + self.query_to_ref = PartialDict() + self.ref_to_query = PartialDict() + self.ref_to_op = PartialDict() + self.query_to_op = PartialDict() - def ref_to_rightinf_query(self, index) -> Optional[int]: - right_neihbourhood = (k for (k, v) in self.query_to_ref_d.items() if index <= v) - return min(right_neihbourhood, default=None) + def extend(self, + ref_index: Optional[int], + query_index: Optional[int], + op_index: int): - @staticmethod - def _find_closest(collection, value) -> int: - return min(collection, key=lambda x: abs(x - value)) + self.ref_to_query.extend(ref_index, query_index) + self.query_to_ref.extend(query_index, ref_index) + self.ref_to_op.extend(ref_index, op_index) + 
self.query_to_op.extend(query_index, op_index) @staticmethod @@ -104,25 +108,17 @@ def _find_closest_key(mapping: dict, index: int) -> int: return min(mapping, key=lambda k: abs(mapping[k] - index)) - def find_closest_ref(self, index) -> int: - return CoordinateMapping._find_closest(self.all_reference_coordinates(), index) - - - def find_closest_query(self, index) -> int: - return CoordinateMapping._find_closest(self.all_query_coordinates(), index) - - def ref_to_closest_query(self, index) -> int: - return CoordinateMapping._find_closest_key(self.query_to_op_d, self.ref_to_op_d[index]) + return CoordinateMapping._find_closest_key(self.query_to_op, self.ref_to_op[index]) def query_to_closest_ref(self, index) -> int: - return CoordinateMapping._find_closest_key(self.ref_to_op_d, self.query_to_op_d[index]) + return CoordinateMapping._find_closest_key(self.ref_to_op, self.query_to_op[index]) def ref_or_query_to_op(self, ref_index: int, query_index: int, conflict): - r = self.ref_to_op_d.get(ref_index, None) - q = self.query_to_op_d.get(query_index, None) + r = self.ref_to_op.get(ref_index, None) + q = self.query_to_op.get(query_index, None) if r is not None and q is not None: return conflict(r, q) @@ -132,10 +128,10 @@ def ref_or_query_to_op(self, ref_index: int, query_index: int, conflict): def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMapping': ret = CoordinateMapping() - ret.ref_to_query_d = {k + reference_delta: v + query_delta for (k, v) in self.ref_to_query_d.items()} - ret.query_to_ref_d = {k + query_delta: v + reference_delta for (k, v) in self.query_to_ref_d.items()} - ret.ref_to_op_d = {k + reference_delta: v for (k, v) in self.ref_to_op_d.items()} - ret.query_to_op_d = {k + query_delta: v for (k, v) in self.query_to_op_d.items()} + ret.ref_to_query = self.ref_to_query.translate(reference_delta, query_delta) + ret.query_to_ref = self.query_to_ref.translate(query_delta, reference_delta) + ret.ref_to_op = 
self.ref_to_op.translate(reference_delta, 0) + ret.query_to_op = self.query_to_op.translate(query_delta, 0) return ret @@ -384,8 +380,8 @@ def intervals_overlap(x, y): def gaps(self) -> Iterable['CigarHit']: # TODO(vitalik): memoize whatever possible. - covered_coordinates = self.coordinate_mapping.mapped_reference_coordinates() - all_coordinates = self.coordinate_mapping.all_reference_coordinates() + covered_coordinates = self.coordinate_mapping.ref_to_query.keys() + all_coordinates = self.coordinate_mapping.ref_to_query.domain def make_gap(r_st, r_en): r_ei = r_en - 1 @@ -454,8 +450,9 @@ def _slice(self, r_st, r_ei, q_st, q_ei) -> 'CigarHit': def _ref_cut_to_query_cut(self, cut_point: float): mapping = self.coordinate_mapping - left_query_cut_point = mapping.ref_to_leftsup_query(floor(cut_point)) - right_query_cut_point = mapping.ref_to_rightinf_query(ceil(cut_point)) + # TODO(vitalik): fix this to use ref_to_query + left_query_cut_point = mapping.query_to_ref.left_supremum(floor(cut_point)) + right_query_cut_point = mapping.query_to_ref.right_infimum(ceil(cut_point)) if left_query_cut_point is None: return self.q_st - 0.5 @@ -505,10 +502,7 @@ def lstrip_query(self) -> 'CigarHit': if self.query_length == 0: return self - boundary_ref = self.coordinate_mapping.find_closest_ref(self.r_st - 1) - closest_query = self.coordinate_mapping.ref_to_closest_query(boundary_ref) - closest_ref = self.coordinate_mapping.query_to_ref(closest_query) - + closest_ref = self.coordinate_mapping.ref_to_query.closest_key(self.r_st - 1) remainder, stripped = self.cut_reference(closest_ref - self.epsilon) return stripped @@ -519,10 +513,7 @@ def rstrip_query(self) -> 'CigarHit': if self.query_length == 0: return self - boundary_ref = self.coordinate_mapping.find_closest_ref(self.r_ei + 1) - closest_query = self.coordinate_mapping.ref_to_closest_query(boundary_ref) - closest_ref = self.coordinate_mapping.query_to_ref(closest_query) - + closest_ref = 
self.coordinate_mapping.ref_to_query.closest_key(self.r_ei + 1) stripped, remainder = self.cut_reference(closest_ref + self.epsilon) return stripped From 2ea41b873fca91efe0838865e714fb68b1e23096 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 11:14:09 -0800 Subject: [PATCH 066/485] Cigar tools: fix some more cut edge cases Make the cut based on ref+op indexes instead of ref+query indexes. --- micall/tests/test_cigar_tools.py | 16 ++++++ micall/utils/cigar_tools.py | 88 +++++++++++++------------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 98e339909..c4a7b58c6 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -254,6 +254,22 @@ def test_invalid_cigar_string(): [CigarHit('', r_st=0, r_ei=-1, q_st=0, q_ei=-1), CigarHit('9D', r_st=0, r_ei=8, q_st=0, q_ei=-1)]), + (CigarHit('1M7I1M', r_st=1, r_ei=2, q_st=1, q_ei=9), 1.5, + [CigarHit('1M4I', r_st=1, r_ei=1, q_st=1, q_ei=5), + CigarHit('3I1M', r_st=2, r_ei=2, q_st=6, q_ei=9)]), + + (CigarHit('1M6I1M', r_st=1, r_ei=2, q_st=1, q_ei=8), 1.5, + [CigarHit('1M3I', r_st=1, r_ei=1, q_st=1, q_ei=4), + CigarHit('3I1M', r_st=2, r_ei=2, q_st=5, q_ei=8)]), + + (CigarHit('1M7I1M', r_st=1, r_ei=2, q_st=1, q_ei=9), 1.999, + [CigarHit('1M7I', r_st=1, r_ei=1, q_st=1, q_ei=8), + CigarHit('1M', r_st=2, r_ei=2, q_st=9, q_ei=9)]), + + (CigarHit('1M7I1M', r_st=1, r_ei=2, q_st=1, q_ei=9), 1.001, + [CigarHit('1M', r_st=1, r_ei=1, q_st=1, q_ei=1), + CigarHit('7I1M', r_st=2, r_ei=2, q_st=2, q_ei=9)]), + (CigarHit('2=1X2N1N2=1H2S', r_st=1, r_ei=8, q_st=1, q_ei=7), 3.5, [CigarHit('2=1X', r_st=1, r_ei=3, q_st=1, q_ei=3), CigarHit('3N2=1H2S', r_st=4, r_ei=8, q_st=4, q_ei=7)]), diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 4d23d7d3a..8d2eed731 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -59,14 +59,12 @@ def closest_key(self, index) -> 
int: return min(self.keys(), key=lambda x: abs(x - index)) - def left_supremum(self, value) -> Optional[int]: - left_neihbourhood = (k for k in self.keys() if self[k] <= value) - return max(left_neihbourhood, default=None) + def left_max(self, index) -> Optional[int]: + return max((v for (k, v) in self.items() if k <= index), default=None) - def right_infimum(self, value) -> Optional[int]: - right_neihbourhood = (k for k in self.keys() if value <= self[k]) - return min(right_neihbourhood, default=None) + def right_min(self, index) -> Optional[int]: + return min((v for (k, v) in self.items() if k >= index), default=None) def translate(self, domain_delta: int, codomain_delta: int) -> 'PartialDict': @@ -116,15 +114,6 @@ def query_to_closest_ref(self, index) -> int: return CoordinateMapping._find_closest_key(self.ref_to_op, self.query_to_op[index]) - def ref_or_query_to_op(self, ref_index: int, query_index: int, conflict): - r = self.ref_to_op.get(ref_index, None) - q = self.query_to_op.get(query_index, None) - if r is not None and q is not None: - return conflict(r, q) - - return r if q is None else q - - def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMapping': ret = CoordinateMapping() @@ -246,6 +235,11 @@ def iterate_operations_with_pointers(self) -> Iterable[Tuple[CigarActions, Optio yield (operation, None, None) + @cached_property + def op_length(self): + return sum(1 for x in self.iterate_operations()) + + @cached_property def query_length(self): return max((query_pointer + 1 if query_pointer is not None else 0 for (_, _, query_pointer) @@ -429,45 +423,39 @@ def epsilon(self): return Fraction(1, self.query_length + self.ref_length + 100) - def _slice(self, r_st, r_ei, q_st, q_ei) -> 'CigarHit': + def _ref_cut_to_op_cut(self, cut_point: float): mapping = self.coordinate_mapping - o_st = mapping.ref_or_query_to_op(r_st, q_st, min) - o_ei = mapping.ref_or_query_to_op(r_ei, q_ei, max) - if o_st is None or o_ei is None: - cigar = Cigar([]) - 
else: - cigar = self.cigar.slice_operations(o_st, o_ei + 1) - - return CigarHit(cigar=cigar, - r_st = r_st, - r_ei = r_ei, - q_st = q_st, - q_ei = q_ei, - ) + left_op_cut_point = mapping.ref_to_op.left_max(floor(cut_point)) + right_op_cut_point = mapping.ref_to_op.right_min(ceil(cut_point)) + if left_op_cut_point is None: + left_op_cut_point = -1 + if right_op_cut_point is None: + right_op_cut_point = self.cigar.op_length - def _ref_cut_to_query_cut(self, cut_point: float): - mapping = self.coordinate_mapping + lerp = lambda start, end, t: (1 - t) * start + t * end + op_cut_point = lerp(left_op_cut_point, right_op_cut_point, + cut_point - floor(cut_point)) - # TODO(vitalik): fix this to use ref_to_query - left_query_cut_point = mapping.query_to_ref.left_supremum(floor(cut_point)) - right_query_cut_point = mapping.query_to_ref.right_infimum(ceil(cut_point)) + if float(op_cut_point).is_integer(): + # Disambiguate to the right. + op_cut_point += self.epsilon - if left_query_cut_point is None: - return self.q_st - 0.5 - if right_query_cut_point is None: - return self.q_ei + 0.5 + return op_cut_point - lerp = lambda start, end, t: (1 - t) * start + t * end - query_cut_point = lerp(left_query_cut_point, right_query_cut_point, - cut_point - floor(cut_point)) - if float(query_cut_point).is_integer(): - # Disambiguate to the right. 
- query_cut_point += self.epsilon + def _slice(self, r_st, q_st, o_st, o_ei): + cigar = self.cigar.slice_operations(o_st, o_ei + 1) + r_ei = r_st + cigar.ref_length - 1 + q_ei = q_st + cigar.query_length - 1 - return query_cut_point + return CigarHit(cigar=cigar, + r_st = r_st, + r_ei = r_ei, + q_st = q_st, + q_ei = q_ei, + ) def cut_reference(self, cut_point: float) -> 'CigarHit': @@ -485,13 +473,9 @@ def cut_reference(self, cut_point: float) -> 'CigarHit': not (self.r_st - 1 < cut_point < self.r_ei + 1): raise IndexError("Cut point out of reference bounds") - query_cut_point = self._ref_cut_to_query_cut(cut_point) - assert (self.q_st - 1 <= query_cut_point <= self.q_ei + 1) - - left = self._slice(self.r_st, floor(cut_point), - self.q_st, floor(query_cut_point)) - right = self._slice(ceil(cut_point), self.r_ei, - ceil(query_cut_point), self.q_ei) + op_cut_point = self._ref_cut_to_op_cut(cut_point) + left = self._slice(self.r_st, self.q_st, 0, floor(op_cut_point)) + right = self._slice(left.r_ei + 1, left.q_ei + 1, ceil(op_cut_point), self.cigar.op_length) return left, right From 712ef3b4c2fc20fbcdf54f0663bdccad968a6e17 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 12:04:46 -0800 Subject: [PATCH 067/485] Cigar tools: fix gaps tests --- micall/tests/test_cigar_tools.py | 20 ++++++++++++++------ micall/utils/cigar_tools.py | 9 +++++++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index c4a7b58c6..6ab40b859 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -7,12 +7,12 @@ from micall.utils.cigar_tools import Cigar, CigarHit, parse_cigar_operation, CIGAR_OP_MAPPING -cigar_mapping_cases: List[Tuple[Cigar, 'mapping', 'closest_mapping']] = [ +cigar_mapping_cases = [ # Simple cases - ('3M', {0: 0, 1: 1, 2: 2}, - {0: 0, 1: 1, 2: 2}), - ('1M1D1M', {0: 0, 2: 1}, - {0: 0, 1: 0, 2: 1}), + ('3M', {0: 0, 1: 1, 2: 2}, # exact mapping + 
{0: 0, 1: 1, 2: 2}), # closest mapping + ('1M1D1M', {0: 0, 2: 1}, # exact mapping + {0: 0, 1: 0, 2: 1}), # closest mapping ('1M1I1M', {0: 0, 1: 2}, {0: 0, 1: 2}), ('2M2D2M', {0: 0, 1: 1, 4: 2, 5: 3}, @@ -221,6 +221,14 @@ def test_invalid_cigar_string(): [CigarHit('9M1I', r_st=1, r_ei=9, q_st=1, q_ei=10), CigarHit('8I9M', r_st=10, r_ei=18, q_st=11, q_ei=27)]), + (CigarHit('9M9D9I9M', r_st=1, r_ei=27, q_st=1, q_ei=27), 13.5 or 27/2, + [CigarHit('9M4D', r_st=1, r_ei=13, q_st=1, q_ei=9), + CigarHit('5D9I9M', r_st=14, r_ei=27, q_st=10, q_ei=27)]), + + (CigarHit('9M9I9D9M', r_st=1, r_ei=27, q_st=1, q_ei=27), 13.5 or 27/2, + [CigarHit('9M9I4D', r_st=1, r_ei=13, q_st=1, q_ei=18), + CigarHit('5D9M', r_st=14, r_ei=27, q_st=19, q_ei=27)]), + # Edge cases (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 9.5, # no middlepoint [CigarHit('9M5I', r_st=1, r_ei=9, q_st=1, q_ei=14), @@ -331,7 +339,7 @@ def test_cigar_hit_strip_combines_with_add(hit, cut_point): left = left.rstrip_query() right = right.lstrip_query() - assert left + right == hit + assert (left + right).coordinate_mapping == hit.coordinate_mapping @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 8d2eed731..b0a082fe7 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -82,6 +82,7 @@ def translate(self, domain_delta: int, codomain_delta: int) -> 'PartialDict': return ret +@dataclass class CoordinateMapping: def __init__(self): self.query_to_ref = PartialDict() @@ -125,6 +126,10 @@ def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMappin return ret + def __repr__(self): + return f'CoordinateMapping({self.ref_to_op},{self.query_to_op})' + + class Cigar(list): """ A CIGAR string represents a read alignment against a reference sequence. 
@@ -379,8 +384,8 @@ def gaps(self) -> Iterable['CigarHit']: def make_gap(r_st, r_en): r_ei = r_en - 1 - left, midright = self.cut_reference(r_st - 0.5) - middle, right = midright.cut_reference(r_ei + 0.5) + left, midright = self.cut_reference(r_st - self.epsilon) + middle, right = midright.cut_reference(r_ei + self.epsilon) return middle gap_start = None From 66fcafd30b3f4c71cce5746673602b0e157db810 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 12:28:22 -0800 Subject: [PATCH 068/485] Cigar tools: divide __add__ operation into connect and basic __add__ --- micall/tests/test_cigar_tools.py | 16 ++++- micall/utils/cigar_tools.py | 109 ++++++++++++++++++------------- 2 files changed, 79 insertions(+), 46 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 6ab40b859..40a1fbc1a 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -4,7 +4,7 @@ import itertools from micall.utils.consensus_aligner import CigarActions -from micall.utils.cigar_tools import Cigar, CigarHit, parse_cigar_operation, CIGAR_OP_MAPPING +from micall.utils.cigar_tools import Cigar, CigarHit cigar_mapping_cases = [ @@ -330,6 +330,17 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): assert left + right == hit +@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception)]) +def test_cigar_hit_strip_combines_with_connect(hit, cut_point): + left, right = hit.cut_reference(cut_point) + + left = left.rstrip_query() + right = right.lstrip_query() + + assert left.connect(right).coordinate_mapping == hit.coordinate_mapping + + @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception) and not 'N' in str(x[0].cigar)]) @@ -339,7 +350,8 @@ def test_cigar_hit_strip_combines_with_add(hit, cut_point): left = left.rstrip_query() right = right.lstrip_query() - assert 
(left + right).coordinate_mapping == hit.coordinate_mapping + if left.touches(right): + assert left + right == hit @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index b0a082fe7..78a462595 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -6,38 +6,14 @@ import re from typing import Container, Tuple, Iterable, Optional, Set, Dict from dataclasses import dataclass -from functools import cached_property +from functools import cached_property, reduce from itertools import chain, dropwhile from fractions import Fraction from micall.utils.consensus_aligner import CigarActions -CIGAR_OP_MAPPING = { - 'M': CigarActions.MATCH, - 'I': CigarActions.INSERT, - 'D': CigarActions.DELETE, - 'N': CigarActions.SKIPPED, - 'S': CigarActions.SOFT_CLIPPED, - 'H': CigarActions.HARD_CLIPPED, - 'P': CigarActions.PADDING, - '=': CigarActions.SEQ_MATCH, - 'X': CigarActions.MISMATCH, -} - - -def parse_cigar_operation(operation: str) -> CigarActions: - if operation in CIGAR_OP_MAPPING: - return CIGAR_OP_MAPPING[operation] - else: - raise ValueError(f"Unexpected CIGAR action: {operation}.") - - -def cigar_operation_to_str(op: CigarActions) -> str: - return [k for (k, v) in CIGAR_OP_MAPPING.items() if v == op][0] - - -class PartialDict(dict): +class IntDict(dict): def __init__(self): super().__init__() self.domain = set() # superset of self.keys() @@ -67,8 +43,8 @@ def right_min(self, index) -> Optional[int]: return min((v for (k, v) in self.items() if k >= index), default=None) - def translate(self, domain_delta: int, codomain_delta: int) -> 'PartialDict': - ret = PartialDict() + def translate(self, domain_delta: int, codomain_delta: int) -> 'IntDict': + ret = IntDict() for k, v in self.items(): ret.extend(k + domain_delta, v + codomain_delta) @@ -85,10 +61,10 @@ def translate(self, domain_delta: int, codomain_delta: int) -> 'PartialDict': @dataclass 
class CoordinateMapping: def __init__(self): - self.query_to_ref = PartialDict() - self.ref_to_query = PartialDict() - self.ref_to_op = PartialDict() - self.query_to_op = PartialDict() + self.query_to_ref = IntDict() + self.ref_to_query = IntDict() + self.ref_to_op = IntDict() + self.query_to_op = IntDict() def extend(self, @@ -169,6 +145,32 @@ def coerce(obj): raise TypeError(f"Cannot coerce {obj!r} to CIGAR string.") + OP_MAPPING = { + 'M': CigarActions.MATCH, + 'I': CigarActions.INSERT, + 'D': CigarActions.DELETE, + 'N': CigarActions.SKIPPED, + 'S': CigarActions.SOFT_CLIPPED, + 'H': CigarActions.HARD_CLIPPED, + 'P': CigarActions.PADDING, + '=': CigarActions.SEQ_MATCH, + 'X': CigarActions.MISMATCH, + } + + + @staticmethod + def parse_operation(operation: str) -> CigarActions: + if operation in Cigar.OP_MAPPING: + return Cigar.OP_MAPPING[operation] + else: + raise ValueError(f"Unexpected CIGAR action: {operation}.") + + + @staticmethod + def operation_to_str(op: CigarActions) -> str: + return [k for (k, v) in Cigar.OP_MAPPING.items() if v == op][0] + + @staticmethod def parse(string): data = [] @@ -176,7 +178,7 @@ def parse(string): match = re.match(r'([0-9]+)([^0-9])', string) if match: num, operation = match.groups() - data.append([int(num), parse_cigar_operation(operation)]) + data.append([int(num), Cigar.parse_operation(operation)]) string = string[match.end():] else: raise ValueError(f"Invalid CIGAR string. Invalid part: {string[:20]}") @@ -317,7 +319,7 @@ def __repr__(self): def __str__(self): """ Inverse of Cigar.parse """ - return ''.join('{}{}'.format(num, cigar_operation_to_str(op)) for num, op in self) + return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self) @dataclass @@ -376,6 +378,17 @@ def intervals_overlap(x, y): or intervals_overlap((self.q_st, self.q_ei), (other.q_st, other.q_ei)) + def touches(self, other) -> bool: + """ + Checks if this CIGAR hit touches the other CIGAR hit, + in both reference and query space. 
+ NOTE: only applicable if these hits come from the same reference and query. + """ + + return self.r_ei + 1 == other.r_st \ + and self.q_ei + 1 == other.q_st + + def gaps(self) -> Iterable['CigarHit']: # TODO(vitalik): memoize whatever possible. @@ -404,24 +417,32 @@ def make_gap(r_st, r_en): def __add__(self, other): """ - Inserts deletions/insertions between self and other, - then ajusts boundaries appropriately. + Only adds CigarHits that are touching. + The addition is simply a concatenation of two Cigar strings, and adjustment of hit coordinates. """ - if self.overlaps(other): - raise ValueError("Cannot combine overlapping CIGAR hits") - - cigar = self.cigar \ - + CigarHit.from_default_alignment(self.r_ei + 1, other.r_st - 1, self.q_ei + 1, other.q_st - 1).cigar \ - + other.cigar + if not self.touches(other): + raise ValueError("Cannot combine CIGAR hits that do not touch in both reference and query coordinates") - return CigarHit(cigar=cigar, + return CigarHit(cigar=self.cigar + other.cigar, r_st=self.r_st, r_ei=other.r_ei, q_st=self.q_st, q_ei=other.q_ei, ) + def connect(self, other): + """ + Inserts deletions/insertions between self and other, + then ajusts boundaries appropriately. + """ + + if self.overlaps(other): + raise ValueError("Cannot combine overlapping CIGAR hits") + + filler = CigarHit.from_default_alignment(self.r_ei + 1, other.r_st - 1, self.q_ei + 1, other.q_st - 1) + return self + filler + other + @property def epsilon(self): @@ -556,4 +577,4 @@ def connect_cigar_hits(cigar_hits: Iterable[CigarHit]) -> CigarHit: sorted_parts = sorted(accumulator, key=lambda p: p.r_st) # Collect all intervals back together, connecting them with CigarActions.DELETE. 
- return sum(sorted_parts[1:], start=sorted_parts[0]) + return reduce(CigarHit.connect, sorted_parts) From 224fbd0f4a9f2555e30a865c12c02987c727cc8a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 17:44:46 -0800 Subject: [PATCH 069/485] Contig stitcher: reimplement complete coverage check * add merge_intervals helper function * check if contig is covered by multiple other contigs --- micall/core/contig_stitcher.py | 69 +++++++++-- micall/tests/test_contig_stitcher.py | 170 ++++++++++++++++++++++++++- 2 files changed, 226 insertions(+), 13 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 6e3a5c7be..8450cc6c7 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -273,21 +273,68 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: contigs.insert(0, new_contig) -def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig]: - """ Filter out all contigs that are contained within other contigs. """ +def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]: + """ + Merge overlapping and adjacent intervals. + Note that intervals are inclusive. + + :param intervals: A list of intervals [start, end] where 'start' and 'end' are integers. + :eturn: A list of merged intervals. + """ + + if not intervals: + return [] - # TODO: filter out if covered by multiple contigs - # TODO: split contigs that have big gaps in them first, otherwise they will cover too much. 
+ # Sort intervals by their starting values + sorted_intervals = sorted(intervals, key=lambda x: x[0]) - def find_most_covered(contigs) -> Optional[AlignedContig]: - for current in contigs: - if any(x for x in contigs if x != current and x.contains(current)): - return current + merged_intervals = [sorted_intervals[0]] + for current in sorted_intervals[1:]: + current_start, current_end = current + last_start, last_end = merged_intervals[-1] + if current_start <= last_end + 1: + # Extend the last interval if there is an overlap or if they are adjacent + merged_intervals[-1] = (min(last_start, current_start), max(last_end, current_end)) + else: + # Add the current interval if there is no overlap + merged_intervals.append(current) + + return merged_intervals + + +def find_covered_contig(contigs: List[AlignedContig]) -> Optional[AlignedContig]: + """ + Find and return the first contig that is completely covered by other contigs. + + :param contigs: List of all aligned contigs to be considered. + :return: An AlignedContig if there is one completely covered by others, None otherwise. 
+ """ + + def calculate_cumulative_coverage(contigs) -> List[Tuple[int, int]]: + intervals = [(contig.alignment.r_st, contig.alignment.r_ei) for contig in contigs] + merged_intervals = merge_intervals(intervals) + return merged_intervals + + for current in contigs: + current_interval = (current.alignment.r_st, current.alignment.r_ei) + + # Create a map of cumulative coverage for contigs + other_contigs = [x for x in contigs if x != current and x.ref_name == current.ref_name] + cumulative_coverage = calculate_cumulative_coverage(other_contigs) + + # Check if the current contig is covered by the cumulative coverage intervals + if any((cover_interval[0] <= current_interval[0] and cover_interval[1] >= current_interval[1]) + for cover_interval in cumulative_coverage): + return current + + +def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig]: + """ Filter out all contigs that are contained within other contigs. """ while contigs: - most_covered = find_most_covered(contigs) - if most_covered: - contigs.remove(most_covered) + covered = find_covered_contig(contigs) + if covered: + contigs.remove(covered) else: break diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 0494bc541..48836de36 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,6 +1,6 @@ import pytest -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig from micall.tests.utils import MockAligner @@ -255,7 +255,6 @@ def test_correct_processing_complex_nogaps(exact_aligner): # Scenario: There are two reference organisms. # Each with 4 contigs. # For each, three overlapping contigs are stitched together, the non-overlapping is kept separate. 
- # This seems like the most general scenario if no gaps or complete goverage is involved. ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 @@ -438,3 +437,170 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): assert sorted(map(lambda x: x.seq, contigs)) \ == sorted(map(lambda x: x.seq, result)) + + + +# _ _ _ _ _ _ +# | | | |_ __ (_) |_ | |_ ___ ___| |_ ___ +# | | | | '_ \| | __| | __/ _ \/ __| __/ __| +# | |_| | | | | | |_ | || __/\__ \ |_\__ \ +# \___/|_| |_|_|\__| \__\___||___/\__|___/ +# + +@pytest.mark.parametrize("intervals, expected", [ + ([], []), + ([(1, 3)], [(1, 3)]), + + # Non-overlapping intervals + ([(1, 3), (5, 6)], [(1, 3), (5, 6)]), + + # Directly overlapping intervals + ([(1, 3), (2, 5)], [(1, 5)]), + + # Adjacent intervals that exactly touch each other + ([(1, 2), (3, 4)], [(1, 4)]), + + # Nested intervals + ([(1, 10), (2, 5)], [(1, 10)]), + + # Multiple merged intervals + ([(1, 3), (2, 4), (6, 8), (10, 11), (11, 12)], + [(1, 4), (6, 8), (10, 12)]), + + # Intervals out of initial order + ([(4, 6), (1, 2)], + [(1, 2), (4, 6)]), + + # Overlapping intervals with out of order inputs + ([(1, 4), (3, 5), (2, 3), (7, 10), (9, 12)], + [(1, 5), (7, 12)]), + + # Large set of intervals with various overlaps + ([(1, 4), (2, 6), (5, 8), (7, 8), (10, 15), (11, 12), (13, 14), (17, 18)], + [(1, 8), (10, 15), (17, 18)]), + + # Intervals where end is less than start should return as is or be handled explicitly depending on implementation + ([(5, 3), (1, 2)], + [(1, 2), (5, 3)]), + + # Intervals that are exactly one after the other in sequence / Intervals that are completely disjoint + ([(1, 2), (4, 5), (7, 8)], + [(1, 2), (4, 5), (7, 8)]), + + # Overlapping intervals that merge into one large interval + ([(2, 6), (4, 10), (5, 15), (14, 20)], + [(2, 20)]), + + # Same interval repeated multiple times + ([(1, 5), (1, 5), (1, 5)], + [(1, 5)]), + + # Single point intervals + ([(1, 1), (5, 5), (3, 3)], + [(1, 1), (3, 3), (5, 5)]), + + ([(1, 1), 
(5, 5), (3, 3), (1, 1), (1, 1)], + [(1, 1), (3, 3), (5, 5)]), + + ([(1, 1), (2, 3)], + [(1, 3)]), + + # Intervals that start with negative numbers + ([(-5, 0), (-2, 3), (1, 7), (9, 12)], + [(-5, 7), (9, 12)]), +]) +def test_merge_intervals(intervals, expected): + assert merge_intervals(intervals) == expected + + +class MockAlignedContig: + def __init__(self, ref_name, r_st, r_ei, name="contig"): + self.ref_name = ref_name + self.alignment = MockAlignment(r_st, r_ei) + self.name = name + + +class MockAlignment: + def __init__(self, r_st, r_ei): + self.r_st = r_st + self.r_ei = r_ei + + +# Simple function to create mock AlignedContig objects for testing, including ref_name. +def create_mock_aligned_contig(ref_name, r_st, r_ei, name="contig"): + return MockAlignedContig(ref_name, r_st, r_ei, name) + + +@pytest.mark.parametrize("contigs, expected_covered_name", [ + # No contigs are completely covered. + ([('ref1', 0, 100), ('ref1', 101, 200)], None), + ([('ref1', 0, 50), ('ref1', 51, 100)], None), + + # A single contig is completely covered by one other contig. + ([('ref1', 0, 100), ('ref1', 0, 200)], 'contig1'), + ([('ref1', 50, 150), ('ref1', 0, 200)], 'contig1'), + + # A single contig completely covers another, but with different reference names. + ([('ref1', 0, 50), ('ref2', 0, 100)], None), + + # Single coverage with exact match. + ([('ref1', 0, 100), ('ref1', 0, 100)], 'contig1'), + + # A single contig is completely covered at the beginning by one and at the end by another contig. + ([('ref1', 0, 50), ('ref1', 50, 100), ('ref1', 25, 75)], 'contig3'), + + # Contigs overlap but none are completely covered. + ([('ref1', 0, 50), ('ref1', 40, 90), ('ref1', 80, 120)], None), + + # Multiple contigs with some covered completely by a single other contig. + ([('ref1', 0, 200), ('ref1', 10, 30), ('ref1', 170, 190)], 'contig2'), + + # Multiple contigs with complex overlaps and one completely covered. 
+ ([('ref1', 30, 60), ('ref1', 0, 50), ('ref1', 20, 70), ('ref1', 60, 90)], 'contig1'), + + # Edge case where a contig starts where another ends. + ([('ref1', 0, 50), ('ref1', 50, 100)], None), + + # Contigs are completely covered in a nested fashion. + ([('ref1', 0, 200), ('ref1', 50, 150), ('ref1', 100, 125)], 'contig2'), + + # Contigs are adjacent and cover each other completely. + ([('ref1', 0, 100), ('ref1', 101, 200), ('ref1', 0, 200)], 'contig1'), + + # Single large contig covers several smaller non-adjacent contigs. + ([('ref1', 0, 500), ('ref1', 50, 100), ('ref1', 200, 250), ('ref1', 300, 350)], 'contig2'), + + # Single large contig covers several smaller adjacent contigs. + ([('ref1', 50, 100), ('ref1', 70, 300), ('ref1', 101, 199), ('ref1', 200, 350)], 'contig2'), + + # Single small contig is covered by several larger contigs. + ([('ref1', 0, 250), ('ref1', 200, 300), ('ref1', 600, 800), ('ref1', 250, 700)], 'contig2'), + + # Complex case with multiple contigs and complete coverage by combinations. + ([('ref1', 0, 100), ('ref1', 30, 130), ('ref1', 60, 160), ('ref1', 90, 190), ('ref1', 120, 220)], 'contig2'), + + # Contigs with same start but different end, where one is covered. + ([('ref1', 0, 100), ('ref1', 0, 50)], 'contig2'), + + # Contigs with same end but different start, where one is covered. + ([('ref1', 50, 100), ('ref1', 0, 100)], 'contig1'), + + # Contig covered by two overlapping contigs that don't individually cover the whole range. + ([('ref1', 0, 75), ('ref1', 25, 100), ('ref1', 0, 100)], 'contig1'), + + # Two contigs are covered completely by one large contig. + ([('ref1', 0, 300), ('ref1', 50, 100), ('ref1', 200, 250)], 'contig2'), + + # No contigs at all. 
+ ([], None), +]) +def test_find_covered(contigs, expected_covered_name): + mock_contigs = [create_mock_aligned_contig(ref_name, r_st, r_ei, f'contig{i+1}') + for i, (ref_name, r_st, r_ei) in enumerate(contigs)] + covered = find_covered_contig(mock_contigs) + if expected_covered_name is None: + assert covered is None + else: + assert covered is not None + assert covered.name == expected_covered_name + From adb7b536ebe4f5785c2d9e3e4ab1645282dab52c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 18:18:54 -0800 Subject: [PATCH 070/485] Contig stitcher: throw away parts of query after cuts --- micall/core/contig_stitcher.py | 37 ++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 8450cc6c7..226378baa 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,6 +1,7 @@ from typing import Iterable, Optional, Tuple, List from collections import deque from dataclasses import dataclass +from math import ceil from mappy import Aligner from functools import cached_property, reduce from itertools import accumulate, takewhile @@ -35,6 +36,28 @@ def align_to_reference(self): return AlignedContig(query=self, alignment=single_cigar_hit) + def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: + """ + Cuts this alignment in two parts with cut_point between them. + Reference sequence is kept untouched. 
+ """ + + cut_point = max(0, cut_point) + left = GenotypedContig(name=self.name, # TODO(vitalik): make it f'left({self.name})' + seq=self.seq[:ceil(cut_point)], + ref_seq=self.ref_seq, + ref_name=self.ref_name, + matched_fraction=None) + right = GenotypedContig(name=self.name, # TODO(vitalik): make it f'right({self.name})' + seq=self.seq[ceil(cut_point):], + ref_seq=self.ref_seq, + ref_name=self.ref_name, + matched_fraction=None) + + return (left, right) + + +@dataclass class AlignedContig(GenotypedContig): def __init__(self, query: GenotypedContig, alignment: CigarHit): @@ -56,18 +79,24 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont """ Cuts this alignment in two parts with cut_point between them. """ alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - return (AlignedContig(self.query, alignment_left), - AlignedContig(self.query, alignment_right)) + query_left, query_right = self.query.cut_query(alignment_left.q_ei + 0.5) + alignment_right = alignment_right.translate(0, -1 * alignment_right.q_st) + + return (AlignedContig(query_left, alignment_left), + AlignedContig(query_right, alignment_right)) def lstrip_query(self) -> 'AlignedContig': alignment = self.alignment.lstrip_query() - return AlignedContig(self.query, alignment) + q_remainder, query = self.query.cut_query(alignment.q_st - 0.5) + alignment = alignment.translate(0, -1 * alignment.q_st) + return AlignedContig(query, alignment) def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() - return AlignedContig(self.query, alignment) + query, q_remainder = self.query.cut_query(alignment.q_ei + 0.5) + return AlignedContig(query, alignment) def overlaps(self, other) -> bool: From bee4b8c70682123aedcd4050b831ff280973cdae Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 21:15:32 -0800 Subject: [PATCH 071/485] Improve documentation of CIGAR tools --- micall/utils/cigar_tools.py | 178 
+++++++++++++++++++++++++++--------- 1 file changed, 136 insertions(+), 42 deletions(-) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 78a462595..b501aa9e1 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -14,10 +14,19 @@ class IntDict(dict): + """ + An extension of the basic Python dictionary designed for integer-to-integer mappings. + + The IntDict maintains not just key-value pairs (as in a normal dictionary) but also + tracks additional sets called `domain` and `codomain`. These sets act as supersets + to the keys and values respectively, even including integers that might not be used + directly in mappings but are within the range of interest for the domain and codomain. + """ + def __init__(self): super().__init__() - self.domain = set() # superset of self.keys() - self.codomain = set() # superset of self.values() + self.domain: Set[int] = set() # superset of self.keys() + self.codomain: Set[int] = set() # superset of self.values() def extend(self, key: Optional[int], value: Optional[int]): @@ -44,6 +53,13 @@ def right_min(self, index) -> Optional[int]: def translate(self, domain_delta: int, codomain_delta: int) -> 'IntDict': + """ + Generates a new IntDict by shifting the entire mapping -- keys and values + are incremented by domain_delta and codomain_delta, respectively. + This shift operation preserves the inherent ordering and relative spacing within the mapping, + effectively repositioning the dataset within the integer space. + """ + ret = IntDict() for k, v in self.items(): @@ -60,6 +76,17 @@ def translate(self, domain_delta: int, codomain_delta: int) -> 'IntDict': @dataclass class CoordinateMapping: + """ + Manages bidirectional mappings between reference and query coordinates, as well as operation indices. 
+ + A CoordinateMapping object contains mappings which represent the relationships and positions between + elements of a reference sequence, a query sequence, and the corresponding operations as defined in a + CIGAR string. + + The mapping enables conversion from reference to query coordinates and vice versa. It also manages the + association of these coordinates with their respective operations in the alignment process. + """ + def __init__(self): self.query_to_ref = IntDict() self.ref_to_query = IntDict() @@ -92,6 +119,15 @@ def query_to_closest_ref(self, index) -> int: def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMapping': + """ + Generate a new CoordinateMapping with shifted coordinate spaces. + + This method creates a new mapping where each original coordinate in + the reference and query sequences is shifted. This allows for adapting + the CoordinateMapping to account for changes or offsets in sequence positions, + such as when sequences are trimmed or extended. + """ + ret = CoordinateMapping() ret.ref_to_query = self.ref_to_query.translate(reference_delta, query_delta) @@ -108,23 +144,22 @@ def __repr__(self): class Cigar(list): """ - A CIGAR string represents a read alignment against a reference sequence. - It is a run-length encoded sequence of alignment operations listed below: - - M: Alignment match (can be a sequence match or mismatch) - D: Deletion from the reference - I: Insertion to the reference - S: Soft clip on the read (ignored region, not aligned but present in the read) - H: Hard clip on the read (ignored region, not present in the read) - N: Skipped region from the reference - P: Padding (silent deletion from padded reference, not applicable for our case) - =: Sequence match - X: Sequence mismatch - - CIGAR strings are defined in the SAM specification - (https://samtools.github.io/hts-specs/SAMv1.pdf). 
- """ + Represents an alignment between a query sequence and a reference sequence using the + Compact Idiosyncratic Gapped Alignment Report (CIGAR) string format. + + A CIGAR string is a sequence of operation codes ('M', 'I', 'D', etc.) each preceded by + the number of bases or residues to which the operation applies. The primary use of a + CIGAR string is to detail areas of alignment and gaps (insertions or deletions) between + the two sequences. + Instances of this class should be created by calling the Cigar.coerce method. + Examples: + Cigar.coerce("10M1I5M1D") + Cigar.coerce([(10, CigarActions.MATCH), (1, CigarActions.INSERT), ...]) + Cigar.coerce(existing_cigar_object) + + CIGAR strings are defined in the SAM specification (https://samtools.github.io/hts-specs/SAMv1.pdf). + """ def __init__(self, cigar_lst: Iterable[Tuple[int, CigarActions]]): super().__init__([]) @@ -146,15 +181,15 @@ def coerce(obj): OP_MAPPING = { - 'M': CigarActions.MATCH, - 'I': CigarActions.INSERT, - 'D': CigarActions.DELETE, - 'N': CigarActions.SKIPPED, - 'S': CigarActions.SOFT_CLIPPED, - 'H': CigarActions.HARD_CLIPPED, - 'P': CigarActions.PADDING, - '=': CigarActions.SEQ_MATCH, - 'X': CigarActions.MISMATCH, + 'M': CigarActions.MATCH, # Alignment match (can be a sequence match or mismatch) + 'I': CigarActions.INSERT, # Insertion to the reference + 'D': CigarActions.DELETE, # Deletion from the reference + 'N': CigarActions.SKIPPED, # Skipped region from the reference + 'S': CigarActions.SOFT_CLIPPED, # Soft clip on the read (ignored region, not aligned but present in the read) + 'H': CigarActions.HARD_CLIPPED, # Hard clip on the read (ignored region, not present in the read) + 'P': CigarActions.PADDING, # Padding (silent deletion from padded reference, not applicable for our case) + '=': CigarActions.SEQ_MATCH, # Sequence match + 'X': CigarActions.MISMATCH, # Sequence mismatch } @@ -172,7 +207,14 @@ def operation_to_str(op: CigarActions) -> str: @staticmethod - def parse(string): + def 
parse(string) -> 'Cigar': + """ + Parses a CIGAR string into a Cigar object. + + :param string: A CIGAR string with the format '(\\d+[MIDNSHPX=])+', where each operation code + is preceded by a number indicating how many times the operation should be applied. + """ + data = [] while string: match = re.match(r'([0-9]+)([^0-9])', string) @@ -187,6 +229,11 @@ def parse(string): def append(self, item: Tuple[int, CigarActions]): + """ + Appends an operation to the CIGAR sequence, checking for type correctness + and performing normalization by merging consecutive identical operations. + """ + # Type checking if not isinstance(item, list) and not isinstance(item, tuple): raise ValueError(f"Invalid CIGAR list: {item!r} is not a tuple.") @@ -215,12 +262,30 @@ def append(self, item: Tuple[int, CigarActions]): def iterate_operations(self) -> Iterable[CigarActions]: + """ + Yields each operation in the CIGAR sequence as a `CigarActions` enum. + The resulting sequence is a decoded version of the initial run-length encoded sequence. + """ + for num, operation in self: for _ in range(num): yield operation def iterate_operations_with_pointers(self) -> Iterable[Tuple[CigarActions, Optional[int], Optional[int]]]: + """ + Iterates over the operations while tracking the reference and + query sequence positions affected by each operation. + + Example: + For a Cigar instance representing "1M1I1M", this method would yield: + (CigarActions.MATCH, 0, 0), (CigarActions.INSERT, None, 1), (CigarActions.MATCH, 1, 2) + + :return: Tuple of type (CigarActions, reference_pointer, query_pointer) for each operation in the + CIGAR sequence. Pointers can be None if the operation does not map to a sequence + position (e.g., insertions, deletions). 
+ """ + ref_pointer = 0 query_pointer = 0 @@ -262,6 +327,16 @@ def ref_length(self): def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': + """ + Creates a new Cigar object by slicing the current one from start_inclusive to + end_noninclusive. Note that slicing is done at the level of individual operations, + not at the level of counts within operations. + + Example: + Given a Cigar instance representing "10M5D5M", slicing from 2 to 11 would result in a new + Cigar object representing "8M1D". + """ + return Cigar([(1, op) for op in self.iterate_operations()] [start_inclusive:end_noninclusive]) @@ -269,8 +344,7 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': @cached_property def coordinate_mapping(self) -> CoordinateMapping: """ - Convert a CIGAR string to coordinate mapping representing a reference-to-query and query-to-reference coordinate mappings. - TODO(vitalik): describe the domains and holes. + Convert this CIGAR string to coordinate mapping representing a reference-to-query and query-to-reference coordinate mappings. :param cigar: a CIGAR string. @@ -289,6 +363,12 @@ def coordinate_mapping(self) -> CoordinateMapping: def to_msa(self, reference_seq, query_seq) -> Tuple[str, str]: + """ + Constructs a multiple sequence alignment (MSA) representation for this Cigar, using the original reference + and query sequences. It aligns the sequences according to the CIGAR operations, introducing gaps ('-') + as necessary to reflect insertions or deletions. + """ + reference_msa = '' query_msa = '' @@ -355,6 +435,11 @@ def query_length(self): @staticmethod def from_default_alignment(r_st, r_ei, q_st, q_ei): + """ + A convenience method that creates a CigarHit instance representing a default alignment, + where there are only deletions in the reference sequence and only insertions in the query. 
+ """ + ref_length = r_ei - r_st + 1 query_length = q_ei - q_st + 1 cigar = Cigar.coerce([[ref_length, CigarActions.DELETE], @@ -365,9 +450,9 @@ def from_default_alignment(r_st, r_ei, q_st, q_ei): def overlaps(self, other) -> bool: """ - Checks if this CIGAR hit overlaps with the other CIGAR hit, - in either reference or query space. - NOTE: only applicable if these hits come from the same reference and query. + Determines whether this CigarHit overlaps with another in terms of reference or query coordinates. + Two hits are considered overlapping if their alignment ranges on the reference or query sequence overlap. + Note: Assumes that both CigarHit instances pertain to the same pair of reference and query sequences. """ def intervals_overlap(x, y): @@ -380,9 +465,8 @@ def intervals_overlap(x, y): def touches(self, other) -> bool: """ - Checks if this CIGAR hit touches the other CIGAR hit, - in both reference and query space. - NOTE: only applicable if these hits come from the same reference and query. + Checks if the end of this CigarHit is immediately adjacent to the start of another one. + Note: Assumes that both CigarHit instances pertain to the same pair of reference and query sequences. """ return self.r_ei + 1 == other.r_st \ @@ -428,8 +512,8 @@ def __add__(self, other): r_st=self.r_st, r_ei=other.r_ei, q_st=self.q_st, - q_ei=other.q_ei, - ) + q_ei=other.q_ei) + def connect(self, other): """ @@ -484,11 +568,11 @@ def _slice(self, r_st, q_st, o_st, o_ei): ) - def cut_reference(self, cut_point: float) -> 'CigarHit': + def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: """ - Splits alignment in two parts such that cut_point is in between. - Guarantees that the two parts do not share any elements, - and that no element is lost. + Splits this CigarHit into two non-overlapping parts using a fractional cut point in the reference space. + Resulting parts of CigarHits are touching at cut point. 
+ The two parts do not share any elements, and no element is "lost". """ cut_point = Fraction(cut_point) @@ -530,10 +614,20 @@ def rstrip_query(self) -> 'CigarHit': @cached_property def coordinate_mapping(self) -> CoordinateMapping: + """ + Convert this alignment to coordinate mapping representing a reference-to-query and query-to-reference coordinate mappings. + """ + return self.cigar.coordinate_mapping.translate(self.r_st, self.q_st) def to_msa(self, reference_seq: str, query_seq: str) -> Tuple[str, str]: + """ + Constructs a multiple sequence alignment (MSA) representation for this CigarHit, using the original reference + and query sequences. It aligns the sequences according to the CIGAR operations, introducing gaps ('-') + as necessary to reflect insertions or deletions. + """ + return self.cigar.to_msa(reference_seq[self.r_st:], query_seq[self.q_st:]) From c460e9fd6b5ce92b9b60bcb8381586576de9831e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 21:16:31 -0800 Subject: [PATCH 072/485] Small improvements to and cleanup of contig stitcher --- micall/core/contig_stitcher.py | 34 ++++++++------------ micall/tests/test_contig_stitcher.py | 46 +++++++++++++++------------- 2 files changed, 37 insertions(+), 43 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 226378baa..985ef1dfa 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -43,12 +43,12 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont """ cut_point = max(0, cut_point) - left = GenotypedContig(name=self.name, # TODO(vitalik): make it f'left({self.name})' + left = GenotypedContig(name=f'left({self.name})', seq=self.seq[:ceil(cut_point)], ref_seq=self.ref_seq, ref_name=self.ref_name, matched_fraction=None) - right = GenotypedContig(name=self.name, # TODO(vitalik): make it f'right({self.name})' + right = GenotypedContig(name=f'right({self.name})', seq=self.seq[ceil(cut_point):], 
ref_seq=self.ref_seq, ref_name=self.ref_name, @@ -110,17 +110,6 @@ def intervals_overlap(x, y): (other.alignment.r_st, other.alignment.r_ei)) - def contains(self, other) -> bool: - def interval_contains(x, y): - return x[0] <= y[0] and x[1] >= y[1] - - if self.ref_name != other.ref_name: - return False - - return interval_contains((self.alignment.r_st, self.alignment.r_ei), - (other.alignment.r_st, other.alignment.r_ei)) - - def gaps(self) -> Iterable[CigarHit]: return self.alignment.gaps() @@ -195,7 +184,7 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: return AlignedContig(query, alignment) -def align_equal(seq1: str, seq2: str) -> Tuple[str, str]: +def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: gap_open_penalty = 15 gap_extend_penalty = 3 use_terminal_gap_penalty = 1 @@ -217,7 +206,7 @@ def find_all_overlapping_contigs(self, aligned_contigs): def find_overlapping_contig(self, aligned_contigs): every = find_all_overlapping_contigs(self, aligned_contigs) - return max(every, key=lambda other: other.alignment.r_ei - other.alignment.r_st if other else 0, + return max(every, key=lambda other: other.alignment.ref_length if other else 0, default=None) @@ -236,12 +225,13 @@ def calculate_concordance(left: str, right: str) -> List[float]: :return: list representing concordance ratio for each position """ - result = [0] * len(left) + if len(left) != len(right): + raise ValueError("Can only calculate concordance for same sized sequences") - assert len(left) == len(right), "Can only calculate concordance for same sized sequences" + result = [0] * len(left) def slide(left, right): - window_size = 10 + window_size = 30 scores = deque([0] * window_size, maxlen=window_size) scores_sum = 0 @@ -265,7 +255,7 @@ def stitch_2_contigs(left, right): right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) # Align overlapping parts, then recombine based on concordance. 
- aligned_left, aligned_right = align_equal(left_overlap.seq, right_overlap.seq) + aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) concordance = calculate_concordance(aligned_left, aligned_right) max_concordance_index = max(range(len(concordance)), key=lambda i: concordance[i]) @@ -416,7 +406,7 @@ def try_split(contig): return contigs -def stitch_contigs(contigs: Iterable[GenotypedContig]): +def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig]: maybe_aligned = list(map(GenotypedContig.align_to_reference, contigs)) # Contigs that did not align do not need any more processing @@ -425,5 +415,5 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]): aligned = split_contigs_with_gaps(aligned) aligned = drop_completely_covered(aligned) - - yield from combine_overlaps(aligned) + aligned = combine_overlaps(aligned) + yield from aligned diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 48836de36..ea178f604 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -22,8 +22,8 @@ def test_identical_stitching_of_one_contig(exact_aligner): ] result = list(stitch_contigs(contigs)) - assert sorted(map(lambda x: x.seq, contigs)) \ - == sorted(map(lambda x: x.seq, result)) + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, result)) def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): @@ -47,8 +47,10 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): ] result = list(stitch_contigs(contigs)) - assert sorted(map(lambda x: x.seq, contigs)) \ - == sorted(map(lambda x: x.seq, result)) + + # No claims about the output order, so wrap into set() + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, result)) contigs = [ GenotypedContig(name='b', @@ -66,8 +68,10 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): ] result = 
list(stitch_contigs(contigs)) - assert sorted(map(lambda x: x.seq, contigs)) \ - == sorted(map(lambda x: x.seq, result)) + + # No claims about the output order, so wrap into set() + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, result)) def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): @@ -97,7 +101,7 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): assert 100 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 50 - assert result.query.name == 'a+overlap(a,b)+b' + assert result.query.name == 'left(a)+overlap(a,b)+right(b)' def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_aligner): @@ -131,7 +135,7 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali assert 100 == len(result[0].seq) assert result[0].seq == 'A' * 50 + 'C' * 50 - assert result[0].query.name == 'a+overlap(a,b)+b' + assert result[0].query.name == 'left(a)+overlap(a,b)+right(b)' assert result[1].query == contigs[2] @@ -169,7 +173,7 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): assert 200 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 100 + 'T' * 50 - assert result.query.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + assert result.query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' def test_stitching_with_empty_contigs(exact_aligner): @@ -193,8 +197,8 @@ def test_stitching_with_empty_contigs(exact_aligner): ] result = list(stitch_contigs(contigs)) - assert sorted(map(lambda x: x.seq, contigs)) \ - == sorted(map(lambda x: x.seq, result)) + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, result)) def test_stitching_of_identical_contigs(exact_aligner): @@ -247,8 +251,8 @@ def test_correct_stitching_of_two_partially_overlapping_different_organism_conti result = list(stitch_contigs(contigs)) assert len(result) == 2 - assert 
sorted(map(lambda x: x.seq, contigs)) \ - == sorted(map(lambda x: x.seq, result)) + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, result)) def test_correct_processing_complex_nogaps(exact_aligner): @@ -292,12 +296,12 @@ def test_correct_processing_complex_nogaps(exact_aligner): assert 170 == len(result[0].seq) assert result[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert result[0].query.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + assert result[0].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' assert result[0].query.ref_name == 'testref-1' assert 170 == len(result[1].seq) assert result[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert result[1].query.name == 'a+overlap(a,b)+b+overlap(a+overlap(a,b)+b,c)+c' + assert result[1].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' assert result[1].query.ref_name == 'testref-2' assert result[2].query == contigs[3] @@ -350,8 +354,8 @@ def test_stitching_contig_with_big_noncovered_gap(exact_aligner): result = list(stitch_contigs(contigs)) - assert sorted(map(lambda x: x.seq, contigs)) \ - == sorted(map(lambda x: x.seq, result)) + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, result)) def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): @@ -376,8 +380,8 @@ def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): result = list(stitch_contigs(contigs)) - assert sorted(map(lambda x: x.seq, contigs)) \ - == sorted(map(lambda x: x.seq, result)) + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, result)) def test_stitching_contig_with_big_covered_gap(exact_aligner): @@ -435,8 +439,8 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): result = list(split_contigs_with_gaps(contigs)) - assert sorted(map(lambda x: x.seq, contigs)) \ - == sorted(map(lambda x: x.seq, result)) + assert 
set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, result)) From fb718bede397acd33208003d43d3ee4098785e5d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 21:32:03 -0800 Subject: [PATCH 073/485] Cigar tools: remove the unused "closest_*" methods --- micall/tests/test_cigar_tools.py | 48 ++++++++++++++++---------------- micall/utils/cigar_tools.py | 13 --------- 2 files changed, 24 insertions(+), 37 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 40a1fbc1a..c6657c266 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -16,7 +16,7 @@ ('1M1I1M', {0: 0, 1: 2}, {0: 0, 1: 2}), ('2M2D2M', {0: 0, 1: 1, 4: 2, 5: 3}, - {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3}), + {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3}), ('2M2I2M', {0: 0, 1: 1, 2: 4, 3: 5}, {0: 0, 1: 1, 2: 4, 3: 5}), ('3M1D3M', {0: 0, 1: 1, 2: 2, 4: 3, 5: 4, 6: 5}, @@ -26,7 +26,7 @@ ('7M1I3M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}, {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}), ('5M2D4M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 7: 5, 8: 6, 9: 7, 10: 8}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 5, 7: 5, 8: 6, 9: 7, 10: 8}), + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4, 7: 5, 8: 6, 9: 7, 10: 8}), ('5M3I4M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}, {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}), ('1M1D', {0: 0}, @@ -36,28 +36,28 @@ ('1I1M', {0: 1}, {0: 1}), ('1D1M', {1: 0}, - {1: 0, 0: 0}), + {1: 0, 0: None}), # Multiple deletions and insertions ('2M2D2M2I2M', {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, - {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 6, 7: 7}), + {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3, 6: 6, 7: 7}), ('2M2I2M2D2M', {0: 0, 1: 1, 2: 4, 3: 5, 6: 6, 7: 7}, - {0: 0, 1: 1, 2: 4, 3: 5, 4: 5, 5: 6, 6: 6, 7: 7}), + {0: 0, 1: 1, 2: 4, 3: 5, 4: 5, 5: 5, 6: 6, 7: 7}), ('2=1X2N1N2=1H2S', {0: 0, 1: 1, 2: 2, 6: 3, 7: 4}, - {0: 0, 1: 1, 2: 2, 3: 2, 4: 2, 5: 3, 6: 3, 7: 
4}), + {0: 0, 1: 1, 2: 2, 3: 2, 4: 2, 5: 2, 6: 3, 7: 4}), ('2M2D2M2I2M', {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, - {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 6, 7: 7}), + {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3, 6: 6, 7: 7}), ('3=1X2N1N2=1H2S', {0: 0, 1: 1, 2: 2, 3: 3, 7: 4, 8: 5}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 4, 7: 4, 8: 5}), + {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 3, 7: 4, 8: 5}), # Edge cases ('', {}, {}), ('3I3D', {}, - {0: 2, 1: 2, 2: 2}), + {0: None, 1: None, 2: None}), ('3D3I', {}, - {0: 0, 1: 0, 2: 0}), + {0: None, 1: None, 2: None}), ('12I', {}, {}), - ('12D', {}, ValueError()), + ('12D', {}, {k: None for k in range(12)}), ] @@ -82,17 +82,17 @@ def test_cigar_to_coordinate_bijection_property(cigar_str): assert mapping.query_to_ref == inverse(inverse(mapping.query_to_ref)) -@pytest.mark.parametrize("cigar_str, expected_closest_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) -def test_cigar_to_closest_coordinate_mapping(cigar_str, expected_closest_mapping): +@pytest.mark.parametrize("cigar_str, expected_leftmax_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) +def test_cigar_to_coordinate_mapping_leftmax(cigar_str, expected_leftmax_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping def test(): - fullrange = {i: mapping.ref_to_closest_query(i) + fullrange = {i: mapping.ref_to_query.left_max(i) for i in mapping.ref_to_query.domain} - assert expected_closest_mapping == fullrange + assert expected_leftmax_mapping == fullrange - if isinstance(expected_closest_mapping, Exception): - with pytest.raises(type(expected_closest_mapping)): + if isinstance(expected_leftmax_mapping, Exception): + with pytest.raises(type(expected_leftmax_mapping)): test() else: test() @@ -113,23 +113,23 @@ def test_cigar_hit_to_coordinate_mapping(cigar_str, expected_mapping): for i in mapping.ref_to_query.keys()} -@pytest.mark.parametrize("cigar_str, expected_closest_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) -def 
test_cigar_hit_to_coordinate_closest_mapping(cigar_str, expected_closest_mapping): +@pytest.mark.parametrize("cigar_str, expected_leftmax_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) +def test_cigar_hit_to_coordinate_mapping_leftmax(cigar_str, expected_leftmax_mapping): cigar = Cigar.coerce(cigar_str) hit = CigarHit(cigar, r_st=5, r_ei=(5 + cigar.ref_length - 1), q_st=7, q_ei=(7 + cigar.query_length - 1)) mapping = hit.coordinate_mapping def test(expected): # Coordinates are translated by q_st and r_st. - fullrange = {i: mapping.ref_to_closest_query(i) + fullrange = {i: mapping.ref_to_query.left_max(i) for i in mapping.ref_to_query.domain} assert expected == fullrange - if isinstance(expected_closest_mapping, Exception): - with pytest.raises(type(expected_closest_mapping)): - test(expected_closest_mapping) + if isinstance(expected_leftmax_mapping, Exception): + with pytest.raises(type(expected_leftmax_mapping)): + test(expected_leftmax_mapping) else: - test({k + hit.r_st: v + hit.q_st for (k, v) in expected_closest_mapping.items()}) + test({k + hit.r_st: v + hit.q_st if v is not None else v for (k, v) in expected_leftmax_mapping.items()}) def test_invalid_operation_in_cigar_string(): diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index b501aa9e1..9b44058aa 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -105,19 +105,6 @@ def extend(self, self.query_to_op.extend(query_index, op_index) - @staticmethod - def _find_closest_key(mapping: dict, index: int) -> int: - return min(mapping, key=lambda k: abs(mapping[k] - index)) - - - def ref_to_closest_query(self, index) -> int: - return CoordinateMapping._find_closest_key(self.query_to_op, self.ref_to_op[index]) - - - def query_to_closest_ref(self, index) -> int: - return CoordinateMapping._find_closest_key(self.ref_to_op, self.query_to_op[index]) - - def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMapping': """ Generate a new 
CoordinateMapping with shifted coordinate spaces. From 751d8f2040dc16832706c66d013c28276e8cb1ad Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 13 Nov 2023 22:04:06 -0800 Subject: [PATCH 074/485] Add tests for connect_cigar_hits --- micall/tests/test_cigar_tools.py | 81 +++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index c6657c266..62185d71d 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -4,7 +4,7 @@ import itertools from micall.utils.consensus_aligner import CigarActions -from micall.utils.cigar_tools import Cigar, CigarHit +from micall.utils.cigar_tools import Cigar, CigarHit, connect_cigar_hits cigar_mapping_cases = [ @@ -406,3 +406,82 @@ def test_cigar_to_msa(reference_seq, query_seq, cigar, expected_reference, expec def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): with pytest.raises(ValueError): Cigar.coerce(cigar).to_msa(reference_seq, query_seq) + + +connect_cigar_hits_cases = [ + # Non-overlapping hits should be connected with deletions/insertions + ( + [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), + CigarHit('4M', r_st=10, r_ei=13, q_st=8, q_ei=11)], + CigarHit('4M5D3I4M', r_st=1, r_ei=13, q_st=1, q_ei=11) + ), + # Overlapping hits should ignore later ones + ( + [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), + CigarHit('5M', r_st=3, r_ei=7, q_st=3, q_ei=7)], + CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4) + ), + # Touching hits should be simply concatenated + ( + [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), + CigarHit('4M', r_st=5, r_ei=8, q_st=5, q_ei=8)], + CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8) + ), + # Hits that touch at only one boundary should combine just fine + ( + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('6M', r_st=4, r_ei=9, q_st=6, q_ei=11)], + CigarHit('3M2I6M', r_st=1, r_ei=9, q_st=1, q_ei=11) + ), + # Hits that are subsets of 
earlier hits should be ignored + ( + [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8), + CigarHit('3M', r_st=3, r_ei=5, q_st=3, q_ei=5)], + CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8) + ), + # Hits that are out of order should be connected if no overlap + ( + [CigarHit('3M', r_st=10, r_ei=12, q_st=6, q_ei=8), + CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3)], + CigarHit('3M6D2I3M', r_st=1, r_ei=12, q_st=1, q_ei=8) + ), + # Hits that overlap by a single base should prioritize the first hit and not combine + ( + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('3M', r_st=3, r_ei=5, q_st=3, q_ei=5)], + CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3) + ), + # Non-overlapping hits in the query space but overlapping in reference space + ( + [CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5), + CigarHit('1M', r_st=3, r_ei=3, q_st=10, q_ei=10)], + CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5) + ), + # Combining more than two hits + ( + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), + CigarHit('3M', r_st=7, r_ei=9, q_st=7, q_ei=9), + CigarHit('3M', r_st=12, r_ei=14, q_st=16, q_ei=18)], + CigarHit('3M3D3I3M2D6I3M', r_st=1, r_ei=14, q_st=1, q_ei=18) + ), + # Combining hits including hard-clipping, which should be ignored in alignments + ( + [CigarHit('2H5M1H', r_st=1, r_ei=5, q_st=3, q_ei=7), + CigarHit('2H5M1H', r_st=11, r_ei=15, q_st=13, q_ei=17)], + CigarHit('2H5M1H5D5I2H5M1H', r_st=1, r_ei=15, q_st=3, q_ei=17) + ), + # An empty list of hits should raise a ValueError + ( + [], + ValueError("Expected a non-empty list of cigar hits") + ), +] + +@pytest.mark.parametrize('hits, expected_result', connect_cigar_hits_cases) +def test_connect_cigar_hits(hits, expected_result): + if isinstance(expected_result, Exception): + with pytest.raises(type(expected_result)): + connect_cigar_hits(hits) + else: + result = connect_cigar_hits(hits) + assert expected_result == result From 57805fa2b66c73f6559044c21fcd009b72dd5062 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 
14 Nov 2023 08:12:58 -0800 Subject: [PATCH 075/485] Contig stitcher: add the stitch_consensus function --- micall/core/contig_stitcher.py | 37 +++-- micall/tests/test_contig_stitcher.py | 206 ++++++++++++++++++++------- 2 files changed, 180 insertions(+), 63 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 985ef1dfa..0da1eb032 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,5 +1,5 @@ from typing import Iterable, Optional, Tuple, List -from collections import deque +from collections import deque, defaultdict from dataclasses import dataclass from math import ceil from mappy import Aligner @@ -61,20 +61,22 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont class AlignedContig(GenotypedContig): def __init__(self, query: GenotypedContig, alignment: CigarHit): - self.alignment = alignment self.query = query - - ref_msa, query_msa = self.alignment.to_msa(self.query.ref_seq, self.query.seq) - seq = ''.join((c for c in query_msa if c != '-')) - + self.alignment = alignment super().__init__( - seq = seq, + seq = query.seq, name = query.name, ref_name = query.ref_name, ref_seq = query.ref_seq, matched_fraction = query.matched_fraction) + @cached_property + def aligned_seq(self): + ref_msa, query_msa = self.alignment.to_msa(self.query.ref_seq, self.query.seq) + return ''.join((c for c in query_msa if c != '-')) + + def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: """ Cuts this alignment in two parts with cut_point between them. 
""" @@ -165,8 +167,8 @@ def cut_reference(self, cut_point: float) -> 'FrankensteinContig': @staticmethod def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: - left_query_seq = left.query.seq[0:left.alignment.q_ei + 1] - right_query_seq = right.query.seq[right.alignment.q_st:] + left_query_seq = left.seq[0:left.alignment.q_ei + 1] + right_query_seq = right.seq[right.alignment.q_st:] query_seq = left_query_seq + right_query_seq left_alignment = left.alignment @@ -417,3 +419,20 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig aligned = drop_completely_covered(aligned) aligned = combine_overlaps(aligned) yield from aligned + + +def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: + contigs = list(stitch_contigs(contigs)) + consensus_parts = defaultdict(list) # ref_name -> List[AlignedContig] + + for contig in contigs: + if isinstance(contig, AlignedContig): + consensus_parts[contig.ref_name].append(contig) + else: + yield contig + + def combine(contigs): + contigs = sorted(contigs, key=lambda x: x.alignment.r_st) + return FrankensteinContig(contigs) + + yield from map(combine, consensus_parts.values()) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index ea178f604..d7feb2689 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,6 +1,6 @@ import pytest -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus from micall.tests.utils import MockAligner @@ -21,9 +21,9 @@ def test_identical_stitching_of_one_contig(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) + results = list(stitch_contigs(contigs)) assert set(map(lambda x: x.seq, contigs)) \ - == 
set(map(lambda x: x.seq, result)) + == set(map(lambda x: x.seq, results)) def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): @@ -46,11 +46,11 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) + results = list(stitch_contigs(contigs)) # No claims about the output order, so wrap into set() assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, result)) + == set(map(lambda x: x.seq, results)) contigs = [ GenotypedContig(name='b', @@ -67,11 +67,11 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) + results = list(stitch_contigs(contigs)) # No claims about the output order, so wrap into set() assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, result)) + == set(map(lambda x: x.seq, results)) def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): @@ -94,10 +94,10 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) - assert len(result) == 1 + results = list(stitch_contigs(contigs)) + assert len(results) == 1 - result = result[0] + result = results[0] assert 100 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 50 @@ -130,14 +130,14 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali ), ] - result = list(stitch_contigs(contigs)) - assert len(result) == 2 + results = list(stitch_contigs(contigs)) + assert len(results) == 2 - assert 100 == len(result[0].seq) - assert result[0].seq == 'A' * 50 + 'C' * 50 - assert result[0].query.name == 'left(a)+overlap(a,b)+right(b)' + assert 100 == len(results[0].seq) + assert results[0].seq == 'A' * 50 + 'C' * 50 + assert results[0].query.name == 'left(a)+overlap(a,b)+right(b)' - assert result[1].query == contigs[2] + assert results[1].query == contigs[2] def 
test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): @@ -166,10 +166,10 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) - assert len(result) == 1 + results = list(stitch_contigs(contigs)) + assert len(results) == 1 - result = result[0] + result = results[0] assert 200 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 100 + 'T' * 50 @@ -196,9 +196,9 @@ def test_stitching_with_empty_contigs(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) + results = list(stitch_contigs(contigs)) assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, result)) + == set(map(lambda x: x.seq, results)) def test_stitching_of_identical_contigs(exact_aligner): @@ -214,17 +214,17 @@ def test_stitching_of_identical_contigs(exact_aligner): ) for name in ["a", "b", "c"]] - result = list(stitch_contigs(contigs)) - assert len(result) == 1 - assert result[0].query == contigs[2] + results = list(stitch_contigs(contigs)) + assert len(results) == 1 + assert results[0].query == contigs[2] def test_stitching_of_zero_contigs(exact_aligner): # Scenario: The function does not crash if no contigs given. 
contigs = [] - result = list(stitch_contigs(contigs)) - assert result == contigs + results = list(stitch_contigs(contigs)) + assert results == contigs def test_correct_stitching_of_two_partially_overlapping_different_organism_contigs(exact_aligner): @@ -248,11 +248,11 @@ def test_correct_stitching_of_two_partially_overlapping_different_organism_conti ), ] - result = list(stitch_contigs(contigs)) - assert len(result) == 2 + results = list(stitch_contigs(contigs)) + assert len(results) == 2 assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, result)) + == set(map(lambda x: x.seq, results)) def test_correct_processing_complex_nogaps(exact_aligner): @@ -291,21 +291,21 @@ def test_correct_processing_complex_nogaps(exact_aligner): contigs = sum(contigs, start=[]) - result = list(stitch_contigs(contigs)) - assert len(result) == 4 + results = list(stitch_contigs(contigs)) + assert len(results) == 4 - assert 170 == len(result[0].seq) - assert result[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert result[0].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' - assert result[0].query.ref_name == 'testref-1' + assert 170 == len(results[0].seq) + assert results[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 + assert results[0].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' + assert results[0].query.ref_name == 'testref-1' - assert 170 == len(result[1].seq) - assert result[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert result[1].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' - assert result[1].query.ref_name == 'testref-2' + assert 170 == len(results[1].seq) + assert results[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 + assert results[1].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' + assert results[1].query.ref_name == 'testref-2' - assert 
result[2].query == contigs[3] - assert result[3].query == contigs[7] + assert results[2].query == contigs[3] + assert results[3].query == contigs[7] def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): @@ -329,13 +329,13 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) - assert len(result) == 1 + results = list(stitch_contigs(contigs)) + assert len(results) == 1 # Test to ensure that the final result contains the contig 'b' and # does not contain the completely covered contig 'a'. - assert result[0].query.name == 'b' - assert result[0].query == contigs[1] + assert results[0].query.name == 'b' + assert results[0].query == contigs[1] def test_stitching_contig_with_big_noncovered_gap(exact_aligner): @@ -352,10 +352,10 @@ def test_stitching_contig_with_big_noncovered_gap(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) + results = list(stitch_contigs(contigs)) assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, result)) + == set(map(lambda x: x.seq, results)) def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): @@ -378,10 +378,10 @@ def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): ), ] - result = list(stitch_contigs(contigs)) + results = list(stitch_contigs(contigs)) assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, result)) + == set(map(lambda x: x.seq, results)) def test_stitching_contig_with_big_covered_gap(exact_aligner): @@ -408,9 +408,9 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): assert len(list(contigs[0].gaps())) == 1 assert len(list(contigs[1].gaps())) == 0 - result = list(split_contigs_with_gaps(contigs)) - assert len(result) == 3 - assert all(list(contig.gaps()) == [] for contig in result) + results = list(split_contigs_with_gaps(contigs)) + assert len(results) == 3 + assert all(list(contig.gaps()) == [] for contig in results) def 
test_stitching_contig_with_small_covered_gap(exact_aligner): @@ -437,11 +437,109 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): assert len(list(contigs[0].gaps())) == 1 assert len(list(contigs[1].gaps())) == 0 - result = list(split_contigs_with_gaps(contigs)) + results = list(split_contigs_with_gaps(contigs)) + + assert all(x.seq == x.aligned_seq for x in results) + + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, results)) + + +def test_stitching_partial_align(exact_aligner): + # Scenario: A single contig has a sequence that partially aligns to the reference sequence. + + contigs = [ + GenotypedContig(name='a', + seq='T' * 10 + 'C' * 20 + 'A' * 10, + ref_name='testref', + ref_seq='A' * 20 + 'C' * 20 + 'T' * 20, + matched_fraction=0.3, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == len(contigs) + for result in results: + assert any(result.seq in contig.seq for contig in contigs) + + assert all(x.seq != x.aligned_seq for x in results) + + assert set(map(lambda x: x.seq, contigs)) \ + != set(map(lambda x: x.aligned_seq, results)) + + +def test_partial_align_consensus(exact_aligner): + # Scenario: A single contig partially aligns to the reference sequence, and a consensus sequence is being stitched. + + contigs = [ + GenotypedContig(name='a', + seq='T' * 10 + 'C' * 20 + 'A' * 10, + ref_name='testref', + ref_seq='A' * 20 + 'C' * 20 + 'T' * 20, + matched_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == len(contigs) + assert set(map(lambda x: x.seq, contigs)) \ + == set(map(lambda x: x.seq, results)) + + +def test_stitching_partial_align_multiple_sequences(exact_aligner): + # Scenario: Multiple contigs have sequences that partially align to the same reference sequence. 
+ + ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 + + contigs = [ + GenotypedContig(name='a', + seq='T' * 10 + 'C' * 20 + 'A' * 10, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.3, + ), + GenotypedContig(name='b', + seq='C' * 20 + 'A' * 10 + 'G' * 10, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.3, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 1 + for result in results: + assert any(result.seq in contig.seq for contig in contigs) assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, result)) + != set(map(lambda x: x.aligned_seq, results)) + + +def test_partial_align_consensus_multiple_sequences(exact_aligner): + # Scenario: Multiple contigs partially align to the same reference sequence, and a consensus sequence is being stitched from them. + + ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 + + contigs = [ + GenotypedContig(name='a', + seq='T' * 10 + 'A' * 5 + 'C' * 20 + 'A' * 10, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.3, + ), + GenotypedContig(name='b', + seq='C' * 20 + 'T' * 5 + 'A' * 10 + 'G' * 10, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.3, + ), + ] + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == 'T' * 10 + 'A' * 5 + 'C' * 20 + 'A' * 9 + 'T' * 5 + 'A' * 10 + 'G' * 10 + assert results[0].seq == contigs[0].seq[:-1] + contigs[1].seq[20:] + assert results[0].name == 'left(a)+overlap(a,b)+right(b)' # _ _ _ _ _ _ From 7f3cdc0a24d8a486d1f803dd1c9db6e5847de338 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 10:26:10 -0800 Subject: [PATCH 076/485] Contig stitcher: fix munging of non-touching contigs --- micall/core/contig_stitcher.py | 2 +- micall/tests/test_contig_stitcher.py | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 0da1eb032..12ede51f2 100644 --- 
a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -176,7 +176,7 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: right.alignment.translate( query_delta=(-1 * right.alignment.q_st + len(left_query_seq)), reference_delta=0) - alignment = left_alignment + right_alignment + alignment = left_alignment.connect(right_alignment) query = GenotypedContig(seq=query_seq, name=f'{left.name}+{right.name}', diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index d7feb2689..da783efd2 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -520,6 +520,32 @@ def test_partial_align_consensus_multiple_sequences(exact_aligner): ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 + contigs = [ + GenotypedContig(name='a', + seq='A' * 20, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.3, + ), + GenotypedContig(name='b', + seq='T' * 20, + ref_name='testref', + ref_seq=ref_seq, + matched_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == contigs[0].seq + contigs[1].seq + assert results[0].name == 'a+b' + + +def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): + # Scenario: Multiple contigs partially align to the same reference sequence, and a consensus sequence is being stitched from them. 
+ + ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 + contigs = [ GenotypedContig(name='a', seq='T' * 10 + 'A' * 5 + 'C' * 20 + 'A' * 10, From eb712a335f88fd55319c1dc8454fa8e8c3872950 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 10:29:47 -0800 Subject: [PATCH 077/485] Add test for checking CigarHit.gaps() lengths --- micall/tests/test_cigar_tools.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 62185d71d..9378cb775 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -4,7 +4,7 @@ import itertools from micall.utils.consensus_aligner import CigarActions -from micall.utils.cigar_tools import Cigar, CigarHit, connect_cigar_hits +from micall.utils.cigar_tools import Cigar, CigarHit, connect_cigar_hits, CoordinateMapping cigar_mapping_cases = [ @@ -383,6 +383,17 @@ def test_cigar_hit_gaps_no_m_or_i(hit): assert 'I' not in str(gap.cigar) +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception)]) +def test_cigar_hit_gaps_lengths(hit): + gaps = list(hit.gaps()) + + for gap in gaps: + assert gap.query_length == 0 + assert gap.ref_length > 0 + assert gap.coordinate_mapping == CoordinateMapping() + + @pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ ('ACTG', 'ACTG', '4M', 'ACTG', 'ACTG'), ('ACTG', '', '4D', 'ACTG', '----'), From aaf2a28c8049516a71f7f41856d12a64c3d5776a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 10:38:09 -0800 Subject: [PATCH 078/485] Contig stitcher: fix overlap overcounting --- micall/core/contig_stitcher.py | 2 +- micall/tests/test_contig_stitcher.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 12ede51f2..0ecf3b2d3 100644 --- a/micall/core/contig_stitcher.py +++ 
b/micall/core/contig_stitcher.py @@ -257,7 +257,7 @@ def stitch_2_contigs(left, right): right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) # Align overlapping parts, then recombine based on concordance. - aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) + aligned_left, aligned_right = align_queries(left_overlap.aligned_seq, right_overlap.aligned_seq) concordance = calculate_concordance(aligned_left, aligned_right) max_concordance_index = max(range(len(concordance)), key=lambda i: concordance[i]) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index da783efd2..e1068ab5a 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -563,8 +563,8 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): results = list(stitch_consensus(contigs)) assert len(results) == 1 - assert results[0].seq == 'T' * 10 + 'A' * 5 + 'C' * 20 + 'A' * 9 + 'T' * 5 + 'A' * 10 + 'G' * 10 - assert results[0].seq == contigs[0].seq[:-1] + contigs[1].seq[20:] + assert results[0].seq == 'T' * 10 + 'A' * 5 + 'C' * 20 + 'T' * 5 + 'A' * 10 + 'G' * 10 + assert results[0].seq == contigs[0].seq[:-10] + contigs[1].seq[20:] assert results[0].name == 'left(a)+overlap(a,b)+right(b)' From 98e92403c3f03f8393a129f6b9c31963ab0c1eb2 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 11:09:39 -0800 Subject: [PATCH 079/485] Add tests for calculate_concordance --- micall/core/contig_stitcher.py | 2 +- micall/tests/test_contig_stitcher.py | 74 +++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 0ecf3b2d3..1f432a8e3 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -218,7 +218,7 @@ def calculate_concordance(left: str, right: str) -> List[float]: The function compares the two strings from both left 
to right and then right to left, calculating for each position the ratio of matching characters in a window around the - current position (10 characters to the left and right). + current position. It's required that the input strings are of the same length. diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index e1068ab5a..e2ce4cba2 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,7 +1,8 @@ import pytest -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus -from micall.tests.utils import MockAligner +import random +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance +from micall.tests.utils import MockAligner, fixed_random_seed @pytest.fixture() @@ -732,3 +733,72 @@ def test_find_covered(contigs, expected_covered_name): assert covered is not None assert covered.name == expected_covered_name + +def test_concordance_same_length_inputs(): + with pytest.raises(ValueError): + calculate_concordance('abc', 'ab') + +def test_concordance_completely_different_strings(): + result = calculate_concordance('a'*30, 'b'*30) + assert all(n == 0 for n in result) + +def generate_random_string_pair(length): + left = ''.join(random.choice('ACGT') for _ in range(length)) + right = ''.join(random.choice('ACGT') for _ in range(length)) + return left, right + +def generate_test_cases(num_cases): + with fixed_random_seed(42): + length = random.randint(1, 80) + return [generate_random_string_pair(length) for _ in range(num_cases)] + +concordance_cases = generate_test_cases(num_cases=100) + + +@pytest.mark.parametrize('left, right', concordance_cases) +def test_concordance_output_is_list_of_floats(left, right): + result = calculate_concordance(left, right) + assert isinstance(result, 
list), "Result should be a list" + assert all(isinstance(n, float) for n in result), "All items in result should be float" + + +@pytest.mark.parametrize('left, right', concordance_cases) +def test_concordance_output_range(left, right): + result = calculate_concordance(left, right) + assert all(0 <= n <= 1 for n in result), "All values in result should be between 0 and 1" + + +@pytest.mark.parametrize('left, right', concordance_cases) +def test_concordance_higher_if_more_matches_added(left, right): + # Insert exact matches in the middle + matching_sequence = 'A' * 30 + insert_position = len(left) // 2 + new_left = left[:insert_position] + matching_sequence + left[insert_position + len(matching_sequence):] + new_right = right[:insert_position] + matching_sequence + right[insert_position + len(matching_sequence):] + + old_conc = calculate_concordance(left, right) + new_conc = calculate_concordance(new_left, new_right) + old_average = sum(old_conc) / len(old_conc) + new_average = sum(new_conc) / len(new_conc) + assert old_average <= new_average + + +@pytest.mark.parametrize('left, right', concordance_cases) +def test_concordance_higher_in_matching_areas(left, right): + # Insert exact matches in the middle + matching_sequence = 'A' * 30 + insert_position = len(left) // 2 + new_left = left[:insert_position] + matching_sequence + left[insert_position + len(matching_sequence):] + new_right = right[:insert_position] + matching_sequence + right[insert_position + len(matching_sequence):] + + concordance_scores = calculate_concordance(new_left, new_right) + + # Check concordance in the matching area + matching_area_concordance = concordance_scores[insert_position:insert_position + len(matching_sequence)] + + # Calculate average concordance inside and outside the matching area + average_inside = sum(matching_area_concordance) / len(matching_sequence) + average_outside = (sum(concordance_scores) - sum(matching_area_concordance)) / (len(concordance_scores) - 
len(matching_sequence)) + + # Assert that the concordance is indeed higher in the matching area + assert average_inside > average_outside, "Concordance in matching areas should be higher than in non-matching areas" From 38f8833f049683177e895d884210c58722579270 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 11:33:19 -0800 Subject: [PATCH 080/485] Cigar tools: fix strips of empty queries --- micall/tests/test_cigar_tools.py | 29 +++++++++++++++++++++++++++++ micall/utils/cigar_tools.py | 4 ++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 9378cb775..fd817c41c 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -341,6 +341,35 @@ def test_cigar_hit_strip_combines_with_connect(hit, cut_point): assert left.connect(right).coordinate_mapping == hit.coordinate_mapping +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) +def test_cigar_hit_strip_never_crashes(hit): + hit.rstrip_query().lstrip_query() + hit.lstrip_query().rstrip_query() + hit.lstrip_query().lstrip_query() + hit.rstrip_query().rstrip_query() + + +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) +def test_cigar_hit_strip_is_idempotent(hit): + h1 = hit.rstrip_query() + assert h1 == h1.rstrip_query() == h1.rstrip_query().rstrip_query() + + h1 = hit.lstrip_query() + assert h1 == h1.lstrip_query() == h1.lstrip_query().lstrip_query() + + h1 = hit.lstrip_query().rstrip_query() + assert h1 == h1.lstrip_query() == h1.rstrip_query() + + h1 = hit.rstrip_query().lstrip_query() + assert h1 == h1.rstrip_query() == h1.lstrip_query() + + +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) +def test_cigar_hit_strips_are_commutative(hit): + assert hit.rstrip_query().lstrip_query() \ + == hit.lstrip_query().rstrip_query() + + @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases 
if not isinstance(x[2], Exception) and not 'N' in str(x[0].cigar)]) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 9b44058aa..10527a64e 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -580,7 +580,7 @@ def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: def lstrip_query(self) -> 'CigarHit': """ Return a copy of the CigarHit with leading (unmatched) query elements removed. """ - if self.query_length == 0: + if len(self.coordinate_mapping.ref_to_query) == 0: return self closest_ref = self.coordinate_mapping.ref_to_query.closest_key(self.r_st - 1) @@ -591,7 +591,7 @@ def lstrip_query(self) -> 'CigarHit': def rstrip_query(self) -> 'CigarHit': """ Return a copy of the CigarHit with trailing (unmatched) query elements removed. """ - if self.query_length == 0: + if len(self.coordinate_mapping.ref_to_query) == 0: return self closest_ref = self.coordinate_mapping.ref_to_query.closest_key(self.r_ei + 1) From 5ab93bda3a8725617febc6c1caf7298aea4fe99a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 12:22:16 -0800 Subject: [PATCH 081/485] Add example tests for CigarHit.lstrip --- micall/tests/test_cigar_tools.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index fd817c41c..549a8c7f1 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -330,6 +330,29 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): assert left + right == hit +@pytest.mark.parametrize('hit, expected', [ + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), + CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9)), + + (CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5), + CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5)), + + (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), + CigarHit('5M', r_st=7, r_ei=11, q_st=1, q_ei=5)), + + (CigarHit('4I6D5M', r_st=1, r_ei=11, q_st=1, 
q_ei=9), + CigarHit('5M', r_st=7, r_ei=11, q_st=5, q_ei=9)), + + (CigarHit('6D4I5M', r_st=1, r_ei=11, q_st=1, q_ei=9), + CigarHit('5M', r_st=7, r_ei=11, q_st=5, q_ei=9)), + + (CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), + CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0)), +]) +def test_cigar_hit_lstrip(hit, expected): + assert expected == hit.lstrip_query() + + @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) def test_cigar_hit_strip_combines_with_connect(hit, cut_point): From ea58060b47d625ea47fd571e0487c0d5bee6389b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 12:44:07 -0800 Subject: [PATCH 082/485] Cigar tools: fix strip() logic --- micall/tests/test_cigar_tools.py | 101 ++++++++++++++++++++++--------- micall/utils/cigar_tools.py | 49 +++++++++++---- 2 files changed, 109 insertions(+), 41 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 549a8c7f1..d4052c07a 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -330,7 +330,7 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): assert left + right == hit -@pytest.mark.parametrize('hit, expected', [ +lstrip_cases = [ (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9)), @@ -340,31 +340,87 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), CigarHit('5M', r_st=7, r_ei=11, q_st=1, q_ei=5)), - (CigarHit('4I6D5M', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('5M', r_st=7, r_ei=11, q_st=5, q_ei=9)), - (CigarHit('6D4I5M', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('5M', r_st=7, r_ei=11, q_st=5, q_ei=9)), + CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), + + (CigarHit('3D3D4I5M', r_st=1, r_ei=11, q_st=1, q_ei=9), + CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), + + (CigarHit('3D2I3D2I5M', r_st=1, r_ei=11, 
q_st=1, q_ei=9), + CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), + + (CigarHit('4I6D5M', r_st=1, r_ei=11, q_st=1, q_ei=9), + CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), (CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0)), -]) +] + +@pytest.mark.parametrize('hit, expected', lstrip_cases) def test_cigar_hit_lstrip(hit, expected): assert expected == hit.lstrip_query() -@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]) -def test_cigar_hit_strip_combines_with_connect(hit, cut_point): - left, right = hit.cut_reference(cut_point) +rstrip_cases = [ + (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), + CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9)), + + (CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5), + CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5)), + + (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), + CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5)), + + (CigarHit('5M4I6D', r_st=1, r_ei=11, q_st=1, q_ei=9), + CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), + + (CigarHit('5M4I3D3D', r_st=1, r_ei=11, q_st=1, q_ei=9), + CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), + + (CigarHit('5M2I3D2I3D', r_st=1, r_ei=11, q_st=1, q_ei=9), + CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), - left = left.rstrip_query() - right = right.lstrip_query() + (CigarHit('5M6D4I', r_st=1, r_ei=11, q_st=1, q_ei=9), + CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), + + (CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), + CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0)), +] + +@pytest.mark.parametrize('hit, expected', rstrip_cases) +def test_cigar_hit_rstrip(hit, expected): + assert expected == hit.rstrip_query() + + +strip_prop_cases_all = [x[0] for x in cigar_hit_ref_cut_cases] \ + + [x[0] for x in lstrip_cases] \ + + [x[0] for x in rstrip_cases] + + +@pytest.mark.parametrize('hit', strip_prop_cases_all) +def 
test_cigar_hit_strip_combines_with_connect(hit): + for cut_point in range(hit.r_st - 1, hit.r_ei): + left, right = hit.cut_reference(cut_point + hit.epsilon) + + left = left.rstrip_query() + right = right.lstrip_query() + + assert left.connect(right).coordinate_mapping == hit.coordinate_mapping - assert left.connect(right).coordinate_mapping == hit.coordinate_mapping +@pytest.mark.parametrize('hit', strip_prop_cases_all) +def test_cigar_hit_strip_combines_with_add(hit): + for cut_point in range(hit.r_st - 1, hit.r_ei): + left, right = hit.cut_reference(cut_point + hit.epsilon) -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) + left = left.rstrip_query() + right = right.lstrip_query() + + if left.touches(right): + assert left + right == hit + + +@pytest.mark.parametrize('hit', strip_prop_cases_all) def test_cigar_hit_strip_never_crashes(hit): hit.rstrip_query().lstrip_query() hit.lstrip_query().rstrip_query() @@ -372,7 +428,7 @@ def test_cigar_hit_strip_never_crashes(hit): hit.rstrip_query().rstrip_query() -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) +@pytest.mark.parametrize('hit', strip_prop_cases_all) def test_cigar_hit_strip_is_idempotent(hit): h1 = hit.rstrip_query() assert h1 == h1.rstrip_query() == h1.rstrip_query().rstrip_query() @@ -387,25 +443,12 @@ def test_cigar_hit_strip_is_idempotent(hit): assert h1 == h1.rstrip_query() == h1.lstrip_query() -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases]) +@pytest.mark.parametrize('hit', strip_prop_cases_all) def test_cigar_hit_strips_are_commutative(hit): assert hit.rstrip_query().lstrip_query() \ == hit.lstrip_query().rstrip_query() -@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception) - and not 'N' in str(x[0].cigar)]) -def test_cigar_hit_strip_combines_with_add(hit, cut_point): - left, right = hit.cut_reference(cut_point) - - left = left.rstrip_query() - right = 
right.lstrip_query() - - if left.touches(right): - assert left + right == hit - - @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 10527a64e..130f0b3ee 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -328,6 +328,37 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': [start_inclusive:end_noninclusive]) + def lstrip_query(self) -> 'Cigar': + """ Return a copy of the Cigar with leading (unmatched) query elements removed. """ + + if self.query_length == 0: + return self + + min_q = min(self.coordinate_mapping.query_to_ref.keys(), default=0) + min_op = self.coordinate_mapping.query_to_op[min_q] + + ops = [(1, op) for i, (op, ref_pointer, query_pointer) + in enumerate(self.iterate_operations_with_pointers()) + if ref_pointer is None or i >= min_op] + return Cigar.coerce(ops) + + + def rstrip_query(self) -> 'Cigar': + """ Return a copy of the Cigar with trailing (unmatched) query elements removed. """ + + if self.query_length == 0: + return self + + max_q = max(self.coordinate_mapping.query_to_ref.keys(), + default=len(self.coordinate_mapping.query_to_op) - 1) + max_op = self.coordinate_mapping.query_to_op[max_q] + + ops = [(1, op) for i, (op, ref_pointer, query_pointer) + in enumerate(self.iterate_operations_with_pointers()) + if ref_pointer is None or i <= max_op] + return Cigar.coerce(ops) + + @cached_property def coordinate_mapping(self) -> CoordinateMapping: """ @@ -580,23 +611,17 @@ def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: def lstrip_query(self) -> 'CigarHit': """ Return a copy of the CigarHit with leading (unmatched) query elements removed. 
""" - if len(self.coordinate_mapping.ref_to_query) == 0: - return self - - closest_ref = self.coordinate_mapping.ref_to_query.closest_key(self.r_st - 1) - remainder, stripped = self.cut_reference(closest_ref - self.epsilon) - return stripped + cigar = self.cigar.lstrip_query() + return CigarHit(cigar, r_st=self.r_ei - cigar.ref_length + 1, r_ei=self.r_ei, + q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) def rstrip_query(self) -> 'CigarHit': """ Return a copy of the CigarHit with trailing (unmatched) query elements removed. """ - if len(self.coordinate_mapping.ref_to_query) == 0: - return self - - closest_ref = self.coordinate_mapping.ref_to_query.closest_key(self.r_ei + 1) - stripped, remainder = self.cut_reference(closest_ref + self.epsilon) - return stripped + cigar = self.cigar.rstrip_query() + return CigarHit(cigar, r_st=self.r_st, r_ei=self.r_st + cigar.ref_length - 1, + q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) @cached_property From 858c9e6270ef40cd0b844a8d35a55e3129c52af3 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 15:58:00 -0800 Subject: [PATCH 083/485] Mention devcontainers in CONTRIBUTING.md --- CONTRIBUTING.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8551084f5..b96a245f1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,6 +12,21 @@ you planning to submit your own fix in a pull request? [issues]: https://github.com/cfe-lab/MiCall/issues ## Development + +The easiest way to start developing MiCall is by using DevContainers. + +1. **Open Project**: Open the MiCall project folder in your editor, which should prompt to "Reopen in Container" or similar — click this to initialize the DevContainer environment. + +2. **Dependency Installation**: The DevContainer configuration will handle the installation of required dependencies within the Docker container, so you don't have to install them manually. + +3. 
**Interpreter Selection**: + - Watch out for dependency errors after setup. If they occur, manually switch to the correct Python interpreter version `3.8`, as the container includes two versions. + - This is an issue that we are tracking [here](https://github.com/cfe-lab/MiCall/issues/1033). + +4. **Verification**: Run `pytest` within the DevContainer to ensure all tests pass, confirming the environment is correctly configured. + +### Local install + To see how all the tools should be installed, follow the steps in `Dockerfile` and `dev.dockerfile`. If you prefer, you can run your development environment under docker, as described in `dev.dockerfile`. The same installation steps are @@ -286,4 +301,4 @@ similar steps to setting up a development workstation. Follow these steps: [parse_args]: https://github.com/cfe-lab/MiCall/blame/master/micall_watcher.py [Zenodo]: https://doi.org/10.5281/zenodo.2644171 [can't log in]: https://www.docker.com/blog/registry-v1-api-deprecation/ -[docker hub]: https://hub.docker.com/orgs/cfelab/members \ No newline at end of file +[docker hub]: https://hub.docker.com/orgs/cfelab/members From 4d61b41aad4f1f8bf8627f032171316f545441ff Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 16:09:11 -0800 Subject: [PATCH 084/485] Contig stitcher: simplify the munge operation Use strip_query operations to avoid coordinate arithmetic. 
--- micall/core/contig_stitcher.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 1f432a8e3..cb8b60d36 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -167,22 +167,20 @@ def cut_reference(self, cut_point: float) -> 'FrankensteinContig': @staticmethod def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: - left_query_seq = left.seq[0:left.alignment.q_ei + 1] - right_query_seq = right.seq[right.alignment.q_st:] - query_seq = left_query_seq + right_query_seq + query_seq = left.rstrip_query().seq + right.lstrip_query().seq + query = GenotypedContig(seq=query_seq, + name=f'{left.name}+{right.name}', + ref_name=left.ref_name, + ref_seq=left.ref_seq, + matched_fraction=None) left_alignment = left.alignment right_alignment = \ right.alignment.translate( - query_delta=(-1 * right.alignment.q_st + len(left_query_seq)), + query_delta=(-1 * right.alignment.q_st + left.alignment.q_ei + 1), reference_delta=0) alignment = left_alignment.connect(right_alignment) - query = GenotypedContig(seq=query_seq, - name=f'{left.name}+{right.name}', - ref_name=left.ref_name, - ref_seq=left.ref_seq, - matched_fraction=None) return AlignedContig(query, alignment) From 87beae344d7fd9642b6911925e9ba0b1bb4be47c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 17:02:06 -0800 Subject: [PATCH 085/485] Remove gaps() method from AlignedContig --- micall/core/contig_stitcher.py | 7 +------ micall/tests/test_cigar_tools.py | 12 ++++++++++++ micall/tests/test_contig_stitcher.py | 10 +++++----- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index cb8b60d36..bda128ba1 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -112,17 +112,12 @@ def intervals_overlap(x, y): (other.alignment.r_st, other.alignment.r_ei)) - def 
gaps(self) -> Iterable[CigarHit]: - return self.alignment.gaps() - - class SyntheticContig(AlignedContig): def __init__(self, query: GenotypedContig, r_st: int, r_ei: int): alignment = CigarHit.from_default_alignment(r_st=r_st, r_ei=r_ei, q_st=0, q_ei=len(query.seq)-1) super().__init__(query, alignment) - def cut_reference(self, cut_point: float): raise NotImplementedError("SyntheticContigs cannot be cut because they are not properly aligned") @@ -375,7 +370,7 @@ def significant(gap): return gap.ref_length > 5 def try_split(contig): - for gap in contig.gaps(): + for gap in contig.alignment.gaps(): if not significant(gap): # Really we do not want to split on every little deletion # because that would mean that we would need to stitch diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index d4052c07a..06337a90a 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -352,6 +352,12 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): (CigarHit('4I6D5M', r_st=1, r_ei=11, q_st=1, q_ei=9), CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), + (CigarHit('6D4I', r_st=1, r_ei=6, q_st=1, q_ei=4), + CigarHit('4I', r_st=7, r_ei=6, q_st=1, q_ei=4)), + + (CigarHit('4I6D', r_st=1, r_ei=6, q_st=1, q_ei=4), + CigarHit('4I6D', r_st=1, r_ei=6, q_st=1, q_ei=4)), + (CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0)), ] @@ -383,6 +389,12 @@ def test_cigar_hit_lstrip(hit, expected): (CigarHit('5M6D4I', r_st=1, r_ei=11, q_st=1, q_ei=9), CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), + (CigarHit('6D4I', r_st=1, r_ei=6, q_st=1, q_ei=4), + CigarHit('6D4I', r_st=1, r_ei=6, q_st=1, q_ei=4)), + + (CigarHit('4I6D', r_st=1, r_ei=6, q_st=1, q_ei=4), + CigarHit('4I', r_st=1, r_ei=0, q_st=1, q_ei=4)), + (CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0)), ] diff --git a/micall/tests/test_contig_stitcher.py 
b/micall/tests/test_contig_stitcher.py index e2ce4cba2..21af2d84f 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -406,12 +406,12 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): ] contigs = [x.align_to_reference() for x in contigs] - assert len(list(contigs[0].gaps())) == 1 - assert len(list(contigs[1].gaps())) == 0 + assert len(list(contigs[0].alignment.gaps())) == 1 + assert len(list(contigs[1].alignment.gaps())) == 0 results = list(split_contigs_with_gaps(contigs)) assert len(results) == 3 - assert all(list(contig.gaps()) == [] for contig in results) + assert all(list(contig.alignment.gaps()) == [] for contig in results) def test_stitching_contig_with_small_covered_gap(exact_aligner): @@ -435,8 +435,8 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): ] contigs = [x.align_to_reference() for x in contigs] - assert len(list(contigs[0].gaps())) == 1 - assert len(list(contigs[1].gaps())) == 0 + assert len(list(contigs[0].alignment.gaps())) == 1 + assert len(list(contigs[1].alignment.gaps())) == 0 results = list(split_contigs_with_gaps(contigs)) From 6d44808fdac3245364fb464d14e56a15cbe4ab51 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 Nov 2023 17:13:02 -0800 Subject: [PATCH 086/485] Cigar tools: fix CoordinateMapping equality operator --- micall/tests/test_cigar_tools.py | 13 +++++++++++-- micall/utils/cigar_tools.py | 8 ++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 06337a90a..7b0e8f8cd 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -229,6 +229,14 @@ def test_invalid_cigar_string(): [CigarHit('9M9I4D', r_st=1, r_ei=13, q_st=1, q_ei=18), CigarHit('5D9M', r_st=14, r_ei=27, q_st=19, q_ei=27)]), + (CigarHit('1M1I1D1M', r_st=1, r_ei=3, q_st=1, q_ei=3), 1.5, # same as previous 2 cases but smaller + [CigarHit('1M1I', r_st=1, r_ei=1, q_st=1, 
q_ei=2), + CigarHit('1D1M', r_st=2, r_ei=3, q_st=3, q_ei=3)]), + + (CigarHit('1M1D1I1M', r_st=1, r_ei=3, q_st=1, q_ei=3), 1.5, # same as previous 2 cases but smaller + [CigarHit('1M', r_st=1, r_ei=1, q_st=1, q_ei=1), + CigarHit('1D1I1M', r_st=2, r_ei=3, q_st=2, q_ei=3)]), + # Edge cases (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 9.5, # no middlepoint [CigarHit('9M5I', r_st=1, r_ei=9, q_st=1, q_ei=14), @@ -417,7 +425,8 @@ def test_cigar_hit_strip_combines_with_connect(hit): left = left.rstrip_query() right = right.lstrip_query() - assert left.connect(right).coordinate_mapping == hit.coordinate_mapping + assert left.connect(right).coordinate_mapping.ref_to_query \ + == hit.coordinate_mapping.ref_to_query @pytest.mark.parametrize('hit', strip_prop_cases_all) @@ -498,7 +507,7 @@ def test_cigar_hit_gaps_lengths(hit): for gap in gaps: assert gap.query_length == 0 assert gap.ref_length > 0 - assert gap.coordinate_mapping == CoordinateMapping() + assert gap.coordinate_mapping.ref_to_query == {} @pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 130f0b3ee..6acee2793 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -74,7 +74,6 @@ def translate(self, domain_delta: int, codomain_delta: int) -> 'IntDict': return ret -@dataclass class CoordinateMapping: """ Manages bidirectional mappings between reference and query coordinates, as well as operation indices. 
@@ -88,8 +87,8 @@ class CoordinateMapping: """ def __init__(self): - self.query_to_ref = IntDict() self.ref_to_query = IntDict() + self.query_to_ref = IntDict() self.ref_to_op = IntDict() self.query_to_op = IntDict() @@ -125,6 +124,11 @@ def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMappin return ret + def __eq__(self, other): + return (self.ref_to_op, self.query_to_op) \ + == (other.ref_to_op, other.query_to_op) + + def __repr__(self): return f'CoordinateMapping({self.ref_to_op},{self.query_to_op})' From 713df5de1b6f65a6fafbb0b16b398b45c42a4503 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 Nov 2023 09:19:34 -0800 Subject: [PATCH 087/485] Contig stitcher: simplify overlap seq calculation --- micall/core/contig_stitcher.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index bda128ba1..7e1bd4ccb 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -248,9 +248,11 @@ def stitch_2_contigs(left, right): # Cut in 4 parts. left_remainder, left_overlap = left.cut_reference(right.alignment.r_st - 0.5) right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) + left_overlap = left_overlap.rstrip_query() + right_overlap = right_overlap.lstrip_query() # Align overlapping parts, then recombine based on concordance. 
- aligned_left, aligned_right = align_queries(left_overlap.aligned_seq, right_overlap.aligned_seq) + aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) concordance = calculate_concordance(aligned_left, aligned_right) max_concordance_index = max(range(len(concordance)), key=lambda i: concordance[i]) From 87d2d4d6c9da9a38248e86f20ccb5fac0ad6024e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 Nov 2023 09:21:32 -0800 Subject: [PATCH 088/485] Contig stitcher: remove unused aligned_seq field --- micall/core/contig_stitcher.py | 6 ------ micall/tests/test_contig_stitcher.py | 8 ++++---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 7e1bd4ccb..6e75d3f79 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -71,12 +71,6 @@ def __init__(self, query: GenotypedContig, alignment: CigarHit): matched_fraction = query.matched_fraction) - @cached_property - def aligned_seq(self): - ref_msa, query_msa = self.alignment.to_msa(self.query.ref_seq, self.query.seq) - return ''.join((c for c in query_msa if c != '-')) - - def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: """ Cuts this alignment in two parts with cut_point between them. 
""" diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 21af2d84f..b5b03fd91 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -440,7 +440,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): results = list(split_contigs_with_gaps(contigs)) - assert all(x.seq == x.aligned_seq for x in results) + assert all(x.seq == x.lstrip_query().rstrip_query().seq for x in results) assert set(map(lambda x: x.seq, contigs)) \ == set(map(lambda x: x.seq, results)) @@ -463,10 +463,10 @@ def test_stitching_partial_align(exact_aligner): for result in results: assert any(result.seq in contig.seq for contig in contigs) - assert all(x.seq != x.aligned_seq for x in results) + assert all(x.seq != x.lstrip_query().rstrip_query().seq for x in results) assert set(map(lambda x: x.seq, contigs)) \ - != set(map(lambda x: x.aligned_seq, results)) + != set(map(lambda x: x.lstrip_query().rstrip_query().seq, results)) def test_partial_align_consensus(exact_aligner): @@ -513,7 +513,7 @@ def test_stitching_partial_align_multiple_sequences(exact_aligner): assert any(result.seq in contig.seq for contig in contigs) assert set(map(lambda x: x.seq, contigs)) \ - != set(map(lambda x: x.aligned_seq, results)) + != set(map(lambda x: x.lstrip_query().rstrip_query().seq, results)) def test_partial_align_consensus_multiple_sequences(exact_aligner): From 2fd20f53c6c2cbad42a98493c7661f2ddc57ebed Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 Nov 2023 10:37:57 -0800 Subject: [PATCH 089/485] Small improvements to Cigar tools and Contig stitcher * Extracted the `align_to_reference` method from `GenotypedContig` class and converted it into a standalone function, streamlining how contig alignment is invoked across the application. * Transformed the `Cigar` data structure from a mutable list to an immutable tuple to enforce the immutability of CIGAR sequences after creation for safety and reliability. 
* Revised the `epsilon` property calculation in `CigarHit` for better accuracy and consistency. * Revised in-code documentation, removing redundant comments and placing technical details at the end of the file for cleaner, more focused code. * Removed the unused `closest_key` method from `IntDict`, simplifying the interface. * Enhanced comments in `CigarHit` and `Cigar`, making the code easier to understand and clarifying the purpose and functionality of core classes. --- micall/core/contig_stitcher.py | 28 +-- micall/tests/test_contig_stitcher.py | 6 +- micall/utils/cigar_tools.py | 256 ++++++++++++++------------- 3 files changed, 155 insertions(+), 135 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 6e75d3f79..b3daf1222 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -25,17 +25,6 @@ class GenotypedContig(Contig): ref_seq: str matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. - def align_to_reference(self): - aligner = Aligner(seq=self.ref_seq, preset='map-ont') - alignments = list(aligner.map(self.seq)) - if not alignments: - return self - - hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] - single_cigar_hit = connect_cigar_hits(hits_array) - return AlignedContig(query=self, alignment=single_cigar_hit) - - def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: """ Cuts this alignment in two parts with cut_point between them. 
@@ -59,6 +48,8 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont @dataclass class AlignedContig(GenotypedContig): + query: GenotypedContig + alignment: CigarHit def __init__(self, query: GenotypedContig, alignment: CigarHit): self.query = query @@ -173,6 +164,17 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: return AlignedContig(query, alignment) +def align_to_reference(contig): + aligner = Aligner(seq=contig.ref_seq, preset='map-ont') + alignments = list(aligner.map(contig.seq)) + if not alignments: + return contig + + hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] + single_cigar_hit = connect_cigar_hits(hits_array) + return AlignedContig(query=contig, alignment=single_cigar_hit) + + def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: gap_open_penalty = 15 gap_extend_penalty = 3 @@ -376,7 +378,7 @@ def try_split(contig): continue if covered(contig, gap): - midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + contig.alignment.epsilon + midpoint = gap.r_st + (gap.r_ei - gap.r_st) // 2 + contig.alignment.epsilon left_part, right_part = contig.cut_reference(midpoint) left_part = left_part.rstrip_query() right_part = right_part.lstrip_query() @@ -398,7 +400,7 @@ def try_split(contig): def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig]: - maybe_aligned = list(map(GenotypedContig.align_to_reference, contigs)) + maybe_aligned = list(map(align_to_reference, contigs)) # Contigs that did not align do not need any more processing yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig)) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index b5b03fd91..27cffd248 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,7 +1,7 @@ import pytest import random -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, 
GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_to_reference from micall.tests.utils import MockAligner, fixed_random_seed @@ -405,7 +405,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): ), ] - contigs = [x.align_to_reference() for x in contigs] + contigs = list(map(align_to_reference, contigs)) assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 @@ -434,7 +434,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): ), ] - contigs = [x.align_to_reference() for x in contigs] + contigs = list(map(align_to_reference, contigs)) assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 6acee2793..4197a12dc 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -18,8 +18,8 @@ class IntDict(dict): An extension of the basic Python dictionary designed for integer-to-integer mappings. The IntDict maintains not just key-value pairs (as in a normal dictionary) but also - tracks additional sets called `domain` and `codomain`. These sets act as supersets - to the keys and values respectively, even including integers that might not be used + tracks additional sets called `domain` and `codomain`. These sets are supersets + of the keys and values respectively, as they include integers that might not be used directly in mappings but are within the range of interest for the domain and codomain. 
""" @@ -40,10 +40,6 @@ def extend(self, key: Optional[int], value: Optional[int]): self.codomain.add(value) - def closest_key(self, index) -> int: - return min(self.keys(), key=lambda x: abs(x - index)) - - def left_max(self, index) -> Optional[int]: return max((v for (k, v) in self.items() if k <= index), default=None) @@ -78,10 +74,6 @@ class CoordinateMapping: """ Manages bidirectional mappings between reference and query coordinates, as well as operation indices. - A CoordinateMapping object contains mappings which represent the relationships and positions between - elements of a reference sequence, a query sequence, and the corresponding operations as defined in a - CIGAR string. - The mapping enables conversion from reference to query coordinates and vice versa. It also manages the association of these coordinates with their respective operations in the alignment process. """ @@ -133,15 +125,17 @@ def __repr__(self): return f'CoordinateMapping({self.ref_to_op},{self.query_to_op})' -class Cigar(list): +class Cigar(tuple): """ Represents an alignment between a query sequence and a reference sequence using the Compact Idiosyncratic Gapped Alignment Report (CIGAR) string format. A CIGAR string is a sequence of operation codes ('M', 'I', 'D', etc.) each preceded by - the number of bases or residues to which the operation applies. The primary use of a - CIGAR string is to detail areas of alignment and gaps (insertions or deletions) between - the two sequences. + the number of bases or residues to which the operation applies. + + The class abstracts a CIGAR string as a sequence of discrete operations for convenient + manipulation (as seen in self.iterate_operations()), while retaining the compact + form for storage and return purposes (seen in self.__str__()). Instances of this class should be created by calling the Cigar.coerce method. 
Examples: @@ -152,9 +146,8 @@ class Cigar(list): CIGAR strings are defined in the SAM specification (https://samtools.github.io/hts-specs/SAMv1.pdf). """ - def __init__(self, cigar_lst: Iterable[Tuple[int, CigarActions]]): - super().__init__([]) - for x in cigar_lst: self.append(x) + def __new__(cls, cigar_lst: Iterable[Tuple[int, CigarActions]]): + return super(Cigar, cls).__new__(cls, Cigar.normalize(cigar_lst)) @staticmethod @@ -165,93 +158,12 @@ def coerce(obj): if isinstance(obj, str): return Cigar.parse(obj) - if isinstance(obj, list): + if isinstance(obj, list) or isinstance(obj, tuple): return Cigar(obj) raise TypeError(f"Cannot coerce {obj!r} to CIGAR string.") - OP_MAPPING = { - 'M': CigarActions.MATCH, # Alignment match (can be a sequence match or mismatch) - 'I': CigarActions.INSERT, # Insertion to the reference - 'D': CigarActions.DELETE, # Deletion from the reference - 'N': CigarActions.SKIPPED, # Skipped region from the reference - 'S': CigarActions.SOFT_CLIPPED, # Soft clip on the read (ignored region, not aligned but present in the read) - 'H': CigarActions.HARD_CLIPPED, # Hard clip on the read (ignored region, not present in the read) - 'P': CigarActions.PADDING, # Padding (silent deletion from padded reference, not applicable for our case) - '=': CigarActions.SEQ_MATCH, # Sequence match - 'X': CigarActions.MISMATCH, # Sequence mismatch - } - - - @staticmethod - def parse_operation(operation: str) -> CigarActions: - if operation in Cigar.OP_MAPPING: - return Cigar.OP_MAPPING[operation] - else: - raise ValueError(f"Unexpected CIGAR action: {operation}.") - - - @staticmethod - def operation_to_str(op: CigarActions) -> str: - return [k for (k, v) in Cigar.OP_MAPPING.items() if v == op][0] - - - @staticmethod - def parse(string) -> 'Cigar': - """ - Parses a CIGAR string into a Cigar object. 
- - :param string: A CIGAR string with the format '(\\d+[MIDNSHPX=])+', where each operation code - is preceded by a number indicating how many times the operation should be applied. - """ - - data = [] - while string: - match = re.match(r'([0-9]+)([^0-9])', string) - if match: - num, operation = match.groups() - data.append([int(num), Cigar.parse_operation(operation)]) - string = string[match.end():] - else: - raise ValueError(f"Invalid CIGAR string. Invalid part: {string[:20]}") - - return Cigar(data) - - - def append(self, item: Tuple[int, CigarActions]): - """ - Appends an operation to the CIGAR sequence, checking for type correctness - and performing normalization by merging consecutive identical operations. - """ - - # Type checking - if not isinstance(item, list) and not isinstance(item, tuple): - raise ValueError(f"Invalid CIGAR list: {item!r} is not a tuple.") - if len(item) != 2: - raise ValueError(f"Invalid CIGAR list: {item!r} is has a bad length.") - - num, operation = item - if isinstance(operation, int): - operation = CigarActions(operation) - if not isinstance(num, int) or not isinstance(operation, CigarActions): - raise ValueError(f"Invalid CIGAR list: {item!r} is not a number/operation tuple.") - if num < 0: - raise ValueError(f"Invalid CIGAR list: number of operations is negative.") - - # Normalization - if num == 0: - return - - if self: - last_num, last_operation = self[-1] - if operation == last_operation: - self[-1] = (last_num + num, operation) - return - - super().append((num, operation)) - - def iterate_operations(self) -> Iterable[CigarActions]: """ Yields each operation in the CIGAR sequence as a `CigarActions` enum. 
@@ -298,25 +210,6 @@ def iterate_operations_with_pointers(self) -> Iterable[Tuple[CigarActions, Optio yield (operation, None, None) - @cached_property - def op_length(self): - return sum(1 for x in self.iterate_operations()) - - - @cached_property - def query_length(self): - return max((query_pointer + 1 if query_pointer is not None else 0 for (_, _, query_pointer) - in self.iterate_operations_with_pointers()), - default=0) - - - @cached_property - def ref_length(self): - return max((ref_pointer + 1 if ref_pointer is not None else 0 for (_, ref_pointer, _) - in self.iterate_operations_with_pointers()), - default=0) - - def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': """ Creates a new Cigar object by slicing the current one from start_inclusive to @@ -415,6 +308,117 @@ def to_msa(self, reference_seq, query_seq) -> Tuple[str, str]: return reference_msa, query_msa + @cached_property + def op_length(self): + return sum(1 for x in self.iterate_operations()) + + + @cached_property + def query_length(self): + return max((query_pointer + 1 if query_pointer is not None else 0 for (_, _, query_pointer) + in self.iterate_operations_with_pointers()), + default=0) + + + @cached_property + def ref_length(self): + return max((ref_pointer + 1 if ref_pointer is not None else 0 for (_, ref_pointer, _) + in self.iterate_operations_with_pointers()), + default=0) + + # # + # Boring boilerplate code below # + # # + + OP_MAPPING = { + 'M': CigarActions.MATCH, # Alignment match (can be a sequence match or mismatch) + 'I': CigarActions.INSERT, # Insertion to the reference + 'D': CigarActions.DELETE, # Deletion from the reference + 'N': CigarActions.SKIPPED, # Skipped region from the reference + 'S': CigarActions.SOFT_CLIPPED, # Soft clip on the read (ignored region, not aligned but present in the read) + 'H': CigarActions.HARD_CLIPPED, # Hard clip on the read (ignored region, not present in the read) + 'P': CigarActions.PADDING, # Padding (silent deletion from 
padded reference, not applicable for our case) + '=': CigarActions.SEQ_MATCH, # Sequence match + 'X': CigarActions.MISMATCH, # Sequence mismatch + } + + + @staticmethod + def parse_operation(operation: str) -> CigarActions: + if operation in Cigar.OP_MAPPING: + return Cigar.OP_MAPPING[operation] + else: + raise ValueError(f"Unexpected CIGAR action: {operation}.") + + + @staticmethod + def operation_to_str(op: CigarActions) -> str: + return [k for (k, v) in Cigar.OP_MAPPING.items() if v == op][0] + + + @staticmethod + def parse(string) -> 'Cigar': + """ + Parses a CIGAR string into a Cigar object. + + :param string: A CIGAR string with the format '(\\d+[MIDNSHPX=])+', where each operation code + is preceded by a number indicating how many times the operation should be applied. + """ + + data = [] + while string: + match = re.match(r'([0-9]+)([^0-9])', string) + if match: + num, operation = match.groups() + data.append([int(num), Cigar.parse_operation(operation)]) + string = string[match.end():] + else: + raise ValueError(f"Invalid CIGAR string. Invalid part: {string[:20]}") + + return Cigar(data) + + + @staticmethod + def normalize(cigar_lst) -> Iterable[Tuple[int, CigarActions]]: + """ + Goes through the list appending operations to the CIGAR sequence, + checking for type correctness and performing normalization + by merging consecutive identical operations. 
+ """ + + last_item = None + + for item in cigar_lst: + # Type checking + if not isinstance(item, list) and not isinstance(item, tuple): + raise ValueError(f"Invalid CIGAR list: {item!r} is not a tuple.") + if len(item) != 2: + raise ValueError(f"Invalid CIGAR list: {item!r} has a bad length.") + + num, operation = item + if isinstance(operation, int): + operation = CigarActions(operation) + if not isinstance(num, int) or not isinstance(operation, CigarActions): + raise ValueError(f"Invalid CIGAR list: {item!r} is not a number/operation tuple.") + if num < 0: + raise ValueError(f"Invalid CIGAR list: number of operations is negative.") + + # Normalization + if num == 0: + continue + + if last_item: + last_num, last_operation = last_item + if operation == last_operation: + last_item = (last_num + num, operation) + continue + + if last_item: yield (last_item[0], last_item[1]) + last_item = item + + if last_item: yield (last_item[0], last_item[1]) + + def __repr__(self): return f'Cigar({str(self)!r})' @@ -426,6 +430,20 @@ def __str__(self): @dataclass class CigarHit: + """ + This class provides an abstraction over the complex details involved in working with sequence alignments + expressed as CIGAR strings. It implements operations for alignment handling that are conceptually + straightforward but challenging to implement ad-hoc. + + The main tasks handled by this class are: + - Precisely dividing an alignment into two contiguous segments + at any given reference position (`cut_reference()`), + - Removing portions of the query sequence that do not align with + the reference sequence from either end + while preserving the alignment context (`lstrip_query()` and `rstrip_query()`), + - Enumerating gaps in the alignment (`gaps()`). 
+ """ + cigar: Cigar r_st: int r_ei: int # inclusive @@ -552,7 +570,7 @@ def connect(self, other): @property def epsilon(self): - return Fraction(1, self.query_length + self.ref_length + 100) + return Fraction(1, self.cigar.op_length * 3 + 1) def _ref_cut_to_op_cut(self, cut_point: float): From c40ec0edec87a630883da38796b0229e0d555268 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 Nov 2023 14:44:29 -0800 Subject: [PATCH 090/485] Contig stitcher: fix return types of AlignedContig methods --- micall/core/contig_stitcher.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index b3daf1222..25bc80dc5 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -98,15 +98,30 @@ def intervals_overlap(x, y): class SyntheticContig(AlignedContig): + """ + Contig that is not really aligned, but its boundaries are known. + It is created as a result of overlaps between the real contigs. + """ + # TODO(vitalik): maybe it is worth to realign overlaps to get rid of this special-case class. + def __init__(self, query: GenotypedContig, r_st: int, r_ei: int): alignment = CigarHit.from_default_alignment(r_st=r_st, r_ei=r_ei, q_st=0, q_ei=len(query.seq)-1) super().__init__(query, alignment) + def cut_reference(self, cut_point: float): raise NotImplementedError("SyntheticContigs cannot be cut because they are not properly aligned") + def lstrip_query(self): + return self + + + def rstrip_query(self): + return self + + class FrankensteinContig(AlignedContig): """ Assembled of parts that were not even aligned together, @@ -128,16 +143,12 @@ def __init__(self, parts: List[AlignedContig]): def cut_reference(self, cut_point: float) -> 'FrankensteinContig': - # The cut_reference version of super() works here. - # But it loses information about parts, - # and does not check if the cut is legal - # i.e. whether it slices a SyntheticContig. 
- - # Search for the part that needs to be cut: + # Search for the part that needs to be cut left_parts = list(takewhile(lambda part: cut_point >= part.alignment.r_ei + 1, self.parts)) target_part = self.parts[len(left_parts)] right_parts = self.parts[len(left_parts) + 1:] + # Cut the target part and add its pieces to left and right. target_part_left, target_part_right = target_part.cut_reference(cut_point) left = FrankensteinContig(left_parts + [target_part_left]) right = FrankensteinContig([target_part_right] + right_parts) @@ -145,6 +156,14 @@ def cut_reference(self, cut_point: float) -> 'FrankensteinContig': return (left, right) + def lstrip_query(self): + return FrankensteinContig([self.parts[0].lstrip_query()] + self.parts[1:]) + + + def rstrip_query(self): + return FrankensteinContig(self.parts[:-1] + [self.parts[-1].rstrip_query()]) + + @staticmethod def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: query_seq = left.rstrip_query().seq + right.lstrip_query().seq From ff05ce34858f06c95ec223a505f7764f4bc8fb81 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 16 Nov 2023 09:04:04 -0800 Subject: [PATCH 091/485] Improve DevContainers section of CONTRIBUTING.md * Mention VS Code DevContainers extension * Mention the option to run MiCall in GitHub code spaces --- CONTRIBUTING.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b96a245f1..c3cf74388 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,15 +15,17 @@ you planning to submit your own fix in a pull request? The easiest way to start developing MiCall is by using DevContainers. -1. **Open Project**: Open the MiCall project folder in your editor, which should prompt to "Reopen in Container" or similar — click this to initialize the DevContainer environment. +1. 
**Open Project**: + - If you're using Visual Studio Code on your local machine, open the MiCall project folder and select the "Reopen in Container" prompt to initialize the DevContainer environment. Make sure you have the necessary DevContainer extension installed beforehand, available [here](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers). + - For a web-based development environment, you can develop directly on GitHub using GitHub Codespaces by navigating to the MiCall repository on GitHub and selecting "Code" > "Open with Codespaces" to launch a pre-configured environment. -2. **Dependency Installation**: The DevContainer configuration will handle the installation of required dependencies within the Docker container, so you don't have to install them manually. +2. **Dependency Installation**: All required dependencies will be automatically installed whether you are using a local DevContainer or GitHub Codespace. 3. **Interpreter Selection**: - - Watch out for dependency errors after setup. If they occur, manually switch to the correct Python interpreter version `3.8`, as the container includes two versions. - - This is an issue that we are tracking [here](https://github.com/cfe-lab/MiCall/issues/1033). + - Following the setup, if you encounter dependency errors, manually select the Python interpreter version `3.8`. This is necessary because the container includes multiple Python versions, and the default selection might not be appropriate. + - This issue is tracked and further details can be found [here](https://github.com/cfe-lab/MiCall/issues/1033). -4. **Verification**: Run `pytest` within the DevContainer to ensure all tests pass, confirming the environment is correctly configured. +4. **Verification**: To ensure that the environment is correctly configured, execute `pytest` within the DevContainer or Codespace. All tests should pass, indicating that the setup is successful. 
### Local install From 1b0c0ca053f6c1e8682977e96f8ed6abd268e552 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 17 Nov 2023 08:34:16 -0800 Subject: [PATCH 092/485] Integrate contig stitcher structures into denovo pipeline --- micall/core/contig_stitcher.py | 17 +++--- micall/core/denovo.py | 50 +++++++++++++----- micall/tests/test_contig_stitcher.py | 78 ++++++++++++++-------------- 3 files changed, 86 insertions(+), 59 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 25bc80dc5..51d64f575 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -22,8 +22,8 @@ class Contig: @dataclass class GenotypedContig(Contig): ref_name: str - ref_seq: str - matched_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. + ref_seq: Optional[str] # None in cases where the reference organism is unknown. + match_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. 
def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: """ @@ -36,12 +36,12 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont seq=self.seq[:ceil(cut_point)], ref_seq=self.ref_seq, ref_name=self.ref_name, - matched_fraction=None) + match_fraction=None) right = GenotypedContig(name=f'right({self.name})', seq=self.seq[ceil(cut_point):], ref_seq=self.ref_seq, ref_name=self.ref_name, - matched_fraction=None) + match_fraction=None) return (left, right) @@ -59,7 +59,7 @@ def __init__(self, query: GenotypedContig, alignment: CigarHit): name = query.name, ref_name = query.ref_name, ref_seq = query.ref_seq, - matched_fraction = query.matched_fraction) + match_fraction = query.match_fraction) def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: @@ -171,7 +171,7 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: name=f'{left.name}+{right.name}', ref_name=left.ref_name, ref_seq=left.ref_seq, - matched_fraction=None) + match_fraction=None) left_alignment = left.alignment right_alignment = \ @@ -184,6 +184,9 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: def align_to_reference(contig): + if contig.ref_seq is None: + return contig + aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) if not alignments: @@ -278,7 +281,7 @@ def stitch_2_contigs(left, right): # Return something that can be fed back into the loop. 
overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})', seq=overlap_seq, ref_name=left.ref_name, - ref_seq=left.ref_seq, matched_fraction=None) + ref_seq=left.ref_seq, match_fraction=None) overlap_contig = SyntheticContig(overlap_query, r_st=left_overlap.alignment.r_st, r_ei=right_overlap.alignment.r_ei) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index c3ce3c9c7..bed1f62a5 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -19,6 +19,7 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig +from micall.core.contig_stitcher import GenotypedContig IVA = "iva" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), @@ -28,6 +29,32 @@ logger = logging.getLogger(__name__) +def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> typing.Iterable[GenotypedContig]: + projects = ProjectConfig.loadDefault() + + for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")): + (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0)) + seq = record.seq + if match_fraction < 0: + seq = seq.reverse_complement() + match_fraction *= -1 + + group_ref = group_refs.get(ref_name) + try: + ref_seq = projects.getGenotypeReference(ref_name) + except KeyError: + try: + ref_seq = projects.getReference(ref_name) + except: + ref_seq = None + + yield GenotypedContig(name=record.name, + seq=seq, + ref_name=ref_name, + ref_seq=ref_seq, + match_fraction=match_fraction) + + def write_contig_refs(contigs_fasta_path, contigs_csv, merged_contigs_csv=None, @@ -55,19 +82,16 @@ def write_contig_refs(contigs_fasta_path, genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs) - genotype_count = 0 - for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")): - (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0)) - seq = record.seq - if match_fraction < 0: - seq = seq.reverse_complement() - match_fraction *= -1 - 
writer.writerow(dict(ref=ref_name, - match=match_fraction, - group_ref=group_refs.get(ref_name), - contig=seq)) - genotype_count += 1 - return genotype_count + + contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path)) + + for contig in contigs: + writer.writerow(dict(ref=contig.ref_name, + match=contig.match_fraction, + group_ref=group_refs.get(contig.ref_name), + contig=contig.seq)) + + return len(contigs) def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 27cffd248..5801039c6 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -18,7 +18,7 @@ def test_identical_stitching_of_one_contig(exact_aligner): seq='ACTGACTG' * 100, ref_name='testref', ref_seq='ACTGACTG' * 100, - matched_fraction=1.0, + match_fraction=1.0, ), ] @@ -37,13 +37,13 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): seq=ref_seq, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='C' * 100, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -58,13 +58,13 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): seq='C' * 100, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='a', seq=ref_seq, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -85,13 +85,13 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): seq='A' * 50 + 'C' * 20, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 50, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -115,19 +115,19 @@ def 
test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali seq='A' * 50 + 'C' * 20, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 50, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='c', seq='C' * 20 + 'T' * 50, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -151,19 +151,19 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): seq='A' * 50 + 'C' * 20, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 100 + 'T' * 20, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='c', seq='C' * 20 + 'T' * 50, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -187,13 +187,13 @@ def test_stitching_with_empty_contigs(exact_aligner): seq=ref_seq, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='', ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -211,7 +211,7 @@ def test_stitching_of_identical_contigs(exact_aligner): seq='ACTGACTG' * 100, ref_name='testref', ref_seq='ACTGACTG' * 100, - matched_fraction=1.0, + match_fraction=1.0, ) for name in ["a", "b", "c"]] @@ -239,13 +239,13 @@ def test_correct_stitching_of_two_partially_overlapping_different_organism_conti seq='A' * 50 + 'C' * 20, ref_name='testref-1', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 50, ref_name='testref-2', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -268,25 +268,25 @@ def test_correct_processing_complex_nogaps(exact_aligner): seq='A' * 50 + 'C' * 20, ref_name=ref_name, ref_seq=ref_seq, - 
matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 50, ref_name=ref_name, ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='c', seq='C' * 70 + 'T' * 20, ref_name=ref_name, ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='d', seq='T' * 20 + 'G' * 50, ref_name=ref_name, ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] for ref_name in ['testref-1', 'testref-2']] @@ -320,13 +320,13 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): seq='A' * 20 + 'C' * 20, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 50 + 'C' * 50, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -349,7 +349,7 @@ def test_stitching_contig_with_big_noncovered_gap(exact_aligner): seq= 'A' * 50 + 'T' * 50, # mind the C gap ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -369,13 +369,13 @@ def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): seq='A' * 50 + 'T' * 50, # mind the C gap ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='B', seq='G' * 50, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -395,13 +395,13 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): seq='G' * 50 + 'A' * 50 + 'T' * 100, # mind the gap ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 50, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -424,13 +424,13 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): seq='G' * 100 + 'A' * 0 + 'C' * 100, # mind the gap ref_name='testref', ref_seq=ref_seq, - 
matched_fraction=0.5, + match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 9 + 'C' * 100 + 'T' * 100, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.5, + match_fraction=0.5, ), ] @@ -454,7 +454,7 @@ def test_stitching_partial_align(exact_aligner): seq='T' * 10 + 'C' * 20 + 'A' * 10, ref_name='testref', ref_seq='A' * 20 + 'C' * 20 + 'T' * 20, - matched_fraction=0.3, + match_fraction=0.3, ), ] @@ -477,7 +477,7 @@ def test_partial_align_consensus(exact_aligner): seq='T' * 10 + 'C' * 20 + 'A' * 10, ref_name='testref', ref_seq='A' * 20 + 'C' * 20 + 'T' * 20, - matched_fraction=0.3, + match_fraction=0.3, ), ] @@ -497,13 +497,13 @@ def test_stitching_partial_align_multiple_sequences(exact_aligner): seq='T' * 10 + 'C' * 20 + 'A' * 10, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.3, + match_fraction=0.3, ), GenotypedContig(name='b', seq='C' * 20 + 'A' * 10 + 'G' * 10, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.3, + match_fraction=0.3, ), ] @@ -526,13 +526,13 @@ def test_partial_align_consensus_multiple_sequences(exact_aligner): seq='A' * 20, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.3, + match_fraction=0.3, ), GenotypedContig(name='b', seq='T' * 20, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.3, + match_fraction=0.3, ), ] @@ -552,13 +552,13 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): seq='T' * 10 + 'A' * 5 + 'C' * 20 + 'A' * 10, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.3, + match_fraction=0.3, ), GenotypedContig(name='b', seq='C' * 20 + 'T' * 5 + 'A' * 10 + 'G' * 10, ref_name='testref', ref_seq=ref_seq, - matched_fraction=0.3, + match_fraction=0.3, ), ] From 7ab3666904f9d40fc9e5d6417f3a9a0ece073e93 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 17 Nov 2023 08:33:07 -0800 Subject: [PATCH 093/485] Contig stitcher: ensure match_fraction value for every contig It is required by remap, so we have to provide something to it. 
However, it is not clear how to recalculate the value for subparts of initial contigs since it is initially set by BLAST. The easiest would be to rerun BLAST on final contigs, but that also seems wasteful. Current solution simply copies the match_fraction from parent of a subpart, or takes min of two "parents" for contigs that are munged together. --- micall/core/contig_stitcher.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 51d64f575..0ec8a8bfd 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -23,7 +23,7 @@ class Contig: class GenotypedContig(Contig): ref_name: str ref_seq: Optional[str] # None in cases where the reference organism is unknown. - match_fraction: Optional[float] # Approximated overall concordance between `seq` and `ref_seq`. + match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. 
def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: """ @@ -32,16 +32,17 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont """ cut_point = max(0, cut_point) + match_fraction = self.match_fraction left = GenotypedContig(name=f'left({self.name})', seq=self.seq[:ceil(cut_point)], ref_seq=self.ref_seq, ref_name=self.ref_name, - match_fraction=None) + match_fraction=match_fraction) right = GenotypedContig(name=f'right({self.name})', seq=self.seq[ceil(cut_point):], ref_seq=self.ref_seq, ref_name=self.ref_name, - match_fraction=None) + match_fraction=match_fraction) return (left, right) @@ -167,11 +168,12 @@ def rstrip_query(self): @staticmethod def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: query_seq = left.rstrip_query().seq + right.lstrip_query().seq + match_fraction = min(left.match_fraction, right.match_fraction) query = GenotypedContig(seq=query_seq, name=f'{left.name}+{right.name}', ref_name=left.ref_name, ref_seq=left.ref_seq, - match_fraction=None) + match_fraction=match_fraction) left_alignment = left.alignment right_alignment = \ @@ -279,9 +281,10 @@ def stitch_2_contigs(left, right): overlap_seq = ''.join(c for c in aligned_left_part + aligned_right_part if c != '-') # Return something that can be fed back into the loop. 
+ match_fraction = min(left.match_fraction, right.match_fraction) overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})', seq=overlap_seq, ref_name=left.ref_name, - ref_seq=left.ref_seq, match_fraction=None) + ref_seq=left.ref_seq, match_fraction=match_fraction) overlap_contig = SyntheticContig(overlap_query, r_st=left_overlap.alignment.r_st, r_ei=right_overlap.alignment.r_ei) From ce29f040aa23e0d12582a538dbf189e643e0bfd5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 17 Nov 2023 09:03:04 -0800 Subject: [PATCH 094/485] Cigar tools: fix handling of cross-alignments in connect_cigar_hits --- micall/core/contig_stitcher.py | 28 ++++++++++++++++++++++------ micall/tests/test_cigar_tools.py | 27 +++++++++++++++++---------- micall/tests/test_contig_stitcher.py | 6 +++--- micall/utils/cigar_tools.py | 20 +++++++++++++++++--- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 0ec8a8bfd..8060c2206 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -185,18 +185,34 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: return AlignedContig(query, alignment) -def align_to_reference(contig): +def align_to_reference(contig) -> Iterable[GenotypedContig]: if contig.ref_seq is None: - return contig + yield contig + return aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) if not alignments: - return contig + yield contig + return hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] - single_cigar_hit = connect_cigar_hits(hits_array) - return AlignedContig(query=contig, alignment=single_cigar_hit) + connected = connect_cigar_hits(hits_array) + if len(connected) == 1: + yield AlignedContig(query=contig, alignment=connected[0]) + return + + for single_hit in connected: + query = GenotypedContig(name=f'part({contig.name})', + 
seq=contig.seq, + ref_name=contig.ref_name, + ref_seq=contig.ref_seq, + match_fraction=contig.match_fraction) + yield AlignedContig(query=query, alignment=single_hit) + + +def align_all_to_reference(contigs): + return [contig for parts in map(align_to_reference, contigs) for contig in parts] def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: @@ -425,7 +441,7 @@ def try_split(contig): def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig]: - maybe_aligned = list(map(align_to_reference, contigs)) + maybe_aligned = align_all_to_reference(contigs) # Contigs that did not align do not need any more processing yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig)) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 7b0e8f8cd..b49296233 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -540,68 +540,75 @@ def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): ( [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), CigarHit('4M', r_st=10, r_ei=13, q_st=8, q_ei=11)], - CigarHit('4M5D3I4M', r_st=1, r_ei=13, q_st=1, q_ei=11) + [CigarHit('4M5D3I4M', r_st=1, r_ei=13, q_st=1, q_ei=11)] ), # Overlapping hits should ignore later ones ( [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), CigarHit('5M', r_st=3, r_ei=7, q_st=3, q_ei=7)], - CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4) + [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4)] ), # Touching hits should be simply concatenated ( [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), CigarHit('4M', r_st=5, r_ei=8, q_st=5, q_ei=8)], - CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8) + [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8)] ), # Hits that touch at only one boundary should combine just fine ( [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), CigarHit('6M', r_st=4, r_ei=9, q_st=6, q_ei=11)], - CigarHit('3M2I6M', r_st=1, r_ei=9, q_st=1, q_ei=11) + [CigarHit('3M2I6M', r_st=1, r_ei=9, q_st=1, q_ei=11)] ), # 
Hits that are subsets of earlier hits should be ignored ( [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8), CigarHit('3M', r_st=3, r_ei=5, q_st=3, q_ei=5)], - CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8) + [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8)] ), # Hits that are out of order should be connected if no overlap ( [CigarHit('3M', r_st=10, r_ei=12, q_st=6, q_ei=8), CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3)], - CigarHit('3M6D2I3M', r_st=1, r_ei=12, q_st=1, q_ei=8) + [CigarHit('3M6D2I3M', r_st=1, r_ei=12, q_st=1, q_ei=8)] ), # Hits that overlap by a single base should prioritize the first hit and not combine ( [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), CigarHit('3M', r_st=3, r_ei=5, q_st=3, q_ei=5)], - CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3) + [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3)] ), # Non-overlapping hits in the query space but overlapping in reference space ( [CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5), CigarHit('1M', r_st=3, r_ei=3, q_st=10, q_ei=10)], - CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5) + [CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5)] ), # Combining more than two hits ( [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), CigarHit('3M', r_st=7, r_ei=9, q_st=7, q_ei=9), CigarHit('3M', r_st=12, r_ei=14, q_st=16, q_ei=18)], - CigarHit('3M3D3I3M2D6I3M', r_st=1, r_ei=14, q_st=1, q_ei=18) + [CigarHit('3M3D3I3M2D6I3M', r_st=1, r_ei=14, q_st=1, q_ei=18)] ), # Combining hits including hard-clipping, which should be ignored in alignments ( [CigarHit('2H5M1H', r_st=1, r_ei=5, q_st=3, q_ei=7), CigarHit('2H5M1H', r_st=11, r_ei=15, q_st=13, q_ei=17)], - CigarHit('2H5M1H5D5I2H5M1H', r_st=1, r_ei=15, q_st=3, q_ei=17) + [CigarHit('2H5M1H5D5I2H5M1H', r_st=1, r_ei=15, q_st=3, q_ei=17)] ), # An empty list of hits should raise a ValueError ( [], ValueError("Expected a non-empty list of cigar hits") ), + # Before by reference, after by query + ( + [CigarHit('4M', r_st=1, r_ei=4, q_st=8, q_ei=11), + CigarHit('4M', r_st=10, r_ei=13, 
q_st=1, q_ei=4)], + [CigarHit('4M', r_st=1, r_ei=4, q_st=8, q_ei=11), + CigarHit('4M', r_st=10, r_ei=13, q_st=1, q_ei=4)] + ), ] @pytest.mark.parametrize('hits, expected_result', connect_cigar_hits_cases) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 5801039c6..0b4540097 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,7 +1,7 @@ import pytest import random -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_to_reference +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference from micall.tests.utils import MockAligner, fixed_random_seed @@ -405,7 +405,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): ), ] - contigs = list(map(align_to_reference, contigs)) + contigs = align_all_to_reference(contigs) assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 @@ -434,7 +434,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): ), ] - contigs = list(map(align_to_reference, contigs)) + contigs = align_all_to_reference(contigs) assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 4197a12dc..862c3e0b5 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -4,7 +4,7 @@ from math import ceil, floor import re -from typing import Container, Tuple, Iterable, Optional, Set, Dict +from typing import Container, Tuple, Iterable, Optional, Set, Dict, List from dataclasses import dataclass from functools import cached_property, reduce from itertools import chain, dropwhile @@ -677,7 +677,7 @@ 
def __repr__(self): return f'CigarHit({str(self.cigar)!r}, r_st={self.r_st!r}, r_ei={self.r_ei!r}, q_st={self.q_st!r}, q_ei={self.q_ei!r})' -def connect_cigar_hits(cigar_hits: Iterable[CigarHit]) -> CigarHit: +def connect_cigar_hits(cigar_hits: Iterable[CigarHit]) -> List[CigarHit]: """ This function exists to deal with the fact that mappy does not always connect big gaps, and returns surrounding parts as two separate alignment hits. @@ -704,5 +704,19 @@ def connect_cigar_hits(cigar_hits: Iterable[CigarHit]) -> CigarHit: # Sort by interval start positions. sorted_parts = sorted(accumulator, key=lambda p: p.r_st) + # Segregate independent matches. + sorted_groups = [] + + def find_group(hit): + for group in sorted_groups: + if hit.q_st > group[-1].q_st: + group.append(hit) + return + + sorted_groups.append([hit]) + + for hit in sorted_parts: + find_group(hit) + # Collect all intervals back together, connecting them with CigarActions.DELETE. - return reduce(CigarHit.connect, sorted_parts) + return [reduce(CigarHit.connect, group) for group in sorted_groups] From 877857a84f9f280b92ebfc2feb31356e0a5329b8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 17 Nov 2023 17:50:06 -0800 Subject: [PATCH 095/485] Perform the new stitching in the denovo pipeline One thing to make sure is that we are passing `str` instead of `Bio.Seq` to the stitcher. --- micall/core/contig_stitcher.py | 23 +++++++++---- micall/core/denovo.py | 14 ++++---- micall/tests/test_contig_stitcher.py | 48 +++++++++++++++++++++++++--- release_test_microtest.py | 7 ++-- 4 files changed, 71 insertions(+), 21 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 8060c2206..dc194a466 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -22,7 +22,8 @@ class Contig: @dataclass class GenotypedContig(Contig): ref_name: str - ref_seq: Optional[str] # None in cases where the reference organism is unknown. 
+ group_ref: str + ref_seq: Optional[str] # The sequence of self.group_ref. None in cases where the reference organism is unknown. match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: @@ -37,11 +38,13 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont seq=self.seq[:ceil(cut_point)], ref_seq=self.ref_seq, ref_name=self.ref_name, + group_ref=self.group_ref, match_fraction=match_fraction) right = GenotypedContig(name=f'right({self.name})', seq=self.seq[ceil(cut_point):], ref_seq=self.ref_seq, ref_name=self.ref_name, + group_ref=self.group_ref, match_fraction=match_fraction) return (left, right) @@ -59,6 +62,7 @@ def __init__(self, query: GenotypedContig, alignment: CigarHit): seq = query.seq, name = query.name, ref_name = query.ref_name, + group_ref = query.group_ref, ref_seq = query.ref_seq, match_fraction = query.match_fraction) @@ -91,7 +95,7 @@ def overlaps(self, other) -> bool: def intervals_overlap(x, y): return x[0] <= y[1] and x[1] >= y[0] - if self.ref_name != other.ref_name: + if self.group_ref != other.group_ref: return False return intervals_overlap((self.alignment.r_st, self.alignment.r_ei), @@ -169,9 +173,11 @@ def rstrip_query(self): def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: query_seq = left.rstrip_query().seq + right.lstrip_query().seq match_fraction = min(left.match_fraction, right.match_fraction) + ref_name = max([left, right], key=lambda x: x.alignment.ref_length).ref_name query = GenotypedContig(seq=query_seq, name=f'{left.name}+{right.name}', - ref_name=left.ref_name, + ref_name=ref_name, + group_ref=left.group_ref, ref_seq=left.ref_seq, match_fraction=match_fraction) @@ -206,6 +212,7 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: query = 
GenotypedContig(name=f'part({contig.name})', seq=contig.seq, ref_name=contig.ref_name, + group_ref=contig.group_ref, ref_seq=contig.ref_seq, match_fraction=contig.match_fraction) yield AlignedContig(query=query, alignment=single_hit) @@ -298,8 +305,10 @@ def stitch_2_contigs(left, right): # Return something that can be fed back into the loop. match_fraction = min(left.match_fraction, right.match_fraction) + ref_name = max([left, right], key=lambda x: x.alignment.ref_length).ref_name overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})', - seq=overlap_seq, ref_name=left.ref_name, + ref_name=ref_name, + seq=overlap_seq, group_ref=left.group_ref, ref_seq=left.ref_seq, match_fraction=match_fraction) overlap_contig = SyntheticContig(overlap_query, r_st=left_overlap.alignment.r_st, @@ -372,7 +381,7 @@ def calculate_cumulative_coverage(contigs) -> List[Tuple[int, int]]: current_interval = (current.alignment.r_st, current.alignment.r_ei) # Create a map of cumulative coverage for contigs - other_contigs = [x for x in contigs if x != current and x.ref_name == current.ref_name] + other_contigs = [x for x in contigs if x != current and x.group_ref == current.group_ref] cumulative_coverage = calculate_cumulative_coverage(other_contigs) # Check if the current contig is covered by the cumulative coverage intervals @@ -455,11 +464,11 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: contigs = list(stitch_contigs(contigs)) - consensus_parts = defaultdict(list) # ref_name -> List[AlignedContig] + consensus_parts = defaultdict(list) # group_ref -> List[AlignedContig] for contig in contigs: if isinstance(contig, AlignedContig): - consensus_parts[contig.ref_name].append(contig) + consensus_parts[contig.group_ref].append(contig) else: yield contig diff --git a/micall/core/denovo.py b/micall/core/denovo.py index bed1f62a5..e1d48c4b6 100644 --- 
a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -19,7 +19,7 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig -from micall.core.contig_stitcher import GenotypedContig +from micall.core.contig_stitcher import GenotypedContig, stitch_consensus IVA = "iva" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), @@ -41,17 +41,18 @@ def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> ty group_ref = group_refs.get(ref_name) try: - ref_seq = projects.getGenotypeReference(ref_name) + ref_seq = projects.getGenotypeReference(group_ref) except KeyError: try: - ref_seq = projects.getReference(ref_name) + ref_seq = projects.getReference(group_ref) except: ref_seq = None yield GenotypedContig(name=record.name, - seq=seq, + seq=str(seq), ref_name=ref_name, - ref_seq=ref_seq, + group_ref=group_ref, + ref_seq=str(ref_seq), match_fraction=match_fraction) @@ -84,11 +85,12 @@ def write_contig_refs(contigs_fasta_path, group_refs=group_refs) contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path)) + contigs = list(stitch_consensus(contigs)) for contig in contigs: writer.writerow(dict(ref=contig.ref_name, match=contig.match_fraction, - group_ref=group_refs.get(contig.ref_name), + group_ref=contig.group_ref, contig=contig.seq)) return len(contigs) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 0b4540097..f463b47b4 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -17,6 +17,7 @@ def test_identical_stitching_of_one_contig(exact_aligner): GenotypedContig(name='a', seq='ACTGACTG' * 100, ref_name='testref', + group_ref='testref', ref_seq='ACTGACTG' * 100, match_fraction=1.0, ), @@ -36,12 +37,14 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): GenotypedContig(name='a', seq=ref_seq, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), 
GenotypedContig(name='b', seq='C' * 100, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -57,12 +60,14 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): GenotypedContig(name='b', seq='C' * 100, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='a', seq=ref_seq, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -84,12 +89,14 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): GenotypedContig(name='a', seq='A' * 50 + 'C' * 20, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 50, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -114,18 +121,21 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali GenotypedContig(name='a', seq='A' * 50 + 'C' * 20, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 50, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='c', seq='C' * 20 + 'T' * 50, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -150,18 +160,21 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): GenotypedContig(name='a', seq='A' * 50 + 'C' * 20, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 100 + 'T' * 20, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='c', seq='C' * 20 + 'T' * 50, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -186,12 +199,14 @@ def test_stitching_with_empty_contigs(exact_aligner): GenotypedContig(name='a', seq=ref_seq, ref_name='testref', + 
group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='', ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -210,6 +225,7 @@ def test_stitching_of_identical_contigs(exact_aligner): GenotypedContig(name=name, seq='ACTGACTG' * 100, ref_name='testref', + group_ref='testref', ref_seq='ACTGACTG' * 100, match_fraction=1.0, ) @@ -238,12 +254,14 @@ def test_correct_stitching_of_two_partially_overlapping_different_organism_conti GenotypedContig(name='a', seq='A' * 50 + 'C' * 20, ref_name='testref-1', + group_ref='testref-1', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 50, ref_name='testref-2', + group_ref='testref-2', ref_seq=ref_seq, match_fraction=0.5, ), @@ -267,24 +285,28 @@ def test_correct_processing_complex_nogaps(exact_aligner): GenotypedContig(name='a', seq='A' * 50 + 'C' * 20, ref_name=ref_name, + group_ref=ref_name, ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 20 + 'C' * 50, ref_name=ref_name, + group_ref=ref_name, ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='c', seq='C' * 70 + 'T' * 20, ref_name=ref_name, + group_ref=ref_name, ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='d', seq='T' * 20 + 'G' * 50, ref_name=ref_name, + group_ref=ref_name, ref_seq=ref_seq, match_fraction=0.5, ), @@ -298,12 +320,12 @@ def test_correct_processing_complex_nogaps(exact_aligner): assert 170 == len(results[0].seq) assert results[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 assert results[0].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' - assert results[0].query.ref_name == 'testref-1' + assert results[0].query.group_ref == 'testref-1' assert 170 == len(results[1].seq) assert results[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 assert results[1].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' - assert 
results[1].query.ref_name == 'testref-2' + assert results[1].query.group_ref == 'testref-2' assert results[2].query == contigs[3] assert results[3].query == contigs[7] @@ -319,12 +341,14 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): GenotypedContig(name='a', seq='A' * 20 + 'C' * 20, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 50 + 'C' * 50, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -348,6 +372,7 @@ def test_stitching_contig_with_big_noncovered_gap(exact_aligner): GenotypedContig(name='a', seq= 'A' * 50 + 'T' * 50, # mind the C gap ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -368,12 +393,14 @@ def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): GenotypedContig(name='a', seq='A' * 50 + 'T' * 50, # mind the C gap ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='B', seq='G' * 50, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -394,12 +421,14 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): GenotypedContig(name='a', seq='G' * 50 + 'A' * 50 + 'T' * 100, # mind the gap ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 50, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -423,12 +452,14 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): GenotypedContig(name='a', seq='G' * 100 + 'A' * 0 + 'C' * 100, # mind the gap ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', seq='A' * 9 + 'C' * 100 + 'T' * 100, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), @@ -453,6 +484,7 @@ def 
test_stitching_partial_align(exact_aligner): GenotypedContig(name='a', seq='T' * 10 + 'C' * 20 + 'A' * 10, ref_name='testref', + group_ref='testref', ref_seq='A' * 20 + 'C' * 20 + 'T' * 20, match_fraction=0.3, ), @@ -476,6 +508,7 @@ def test_partial_align_consensus(exact_aligner): GenotypedContig(name='a', seq='T' * 10 + 'C' * 20 + 'A' * 10, ref_name='testref', + group_ref='testref', ref_seq='A' * 20 + 'C' * 20 + 'T' * 20, match_fraction=0.3, ), @@ -496,12 +529,14 @@ def test_stitching_partial_align_multiple_sequences(exact_aligner): GenotypedContig(name='a', seq='T' * 10 + 'C' * 20 + 'A' * 10, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.3, ), GenotypedContig(name='b', seq='C' * 20 + 'A' * 10 + 'G' * 10, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.3, ), @@ -525,12 +560,14 @@ def test_partial_align_consensus_multiple_sequences(exact_aligner): GenotypedContig(name='a', seq='A' * 20, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.3, ), GenotypedContig(name='b', seq='T' * 20, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.3, ), @@ -551,12 +588,14 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): GenotypedContig(name='a', seq='T' * 10 + 'A' * 5 + 'C' * 20 + 'A' * 10, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.3, ), GenotypedContig(name='b', seq='C' * 20 + 'T' * 5 + 'A' * 10 + 'G' * 10, ref_name='testref', + group_ref='testref', ref_seq=ref_seq, match_fraction=0.3, ), @@ -643,8 +682,9 @@ def test_merge_intervals(intervals, expected): class MockAlignedContig: - def __init__(self, ref_name, r_st, r_ei, name="contig"): + def __init__(self, ref_name, group_ref, r_st, r_ei, name="contig"): self.ref_name = ref_name + self.group_ref = group_ref self.alignment = MockAlignment(r_st, r_ei) self.name = name @@ -657,7 +697,7 @@ def __init__(self, r_st, r_ei): # Simple function to create mock 
AlignedContig objects for testing, including ref_name. def create_mock_aligned_contig(ref_name, r_st, r_ei, name="contig"): - return MockAlignedContig(ref_name, r_st, r_ei, name) + return MockAlignedContig(ref_name, ref_name, r_st, r_ei, name) @pytest.mark.parametrize("contigs, expected_covered_name", [ diff --git a/release_test_microtest.py b/release_test_microtest.py index 407a9ba0b..79d597688 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -179,8 +179,7 @@ def check_2100(self): if self.is_denovo: expected_regions = {'HIV1-CON-XX-Consensus-seed', '1-HIV1-B-FR-K03455-seed', - '2-HCV-1a', - '3-HIV1-B-FR-K03455-seed'} + '2-HCV-1a'} else: expected_regions = {'HIV1-CON-XX-Consensus-seed', 'HCV-1a', @@ -247,7 +246,7 @@ def check_2120(self): def check_2130(self): conseq_rows = list(self.read_file('2130A-HCV_S15', 'conseq.csv')) regions = set(map(itemgetter('region'), conseq_rows)) - expected_regions = ({'1-HCV-2a', '2-HCV-2a'} + expected_regions = ({'1-HCV-2a'} if self.is_denovo else {'HCV-2a'}) assert regions == expected_regions, regions @@ -255,7 +254,7 @@ def check_2130(self): def check_2130midi(self): conseq_rows = list(self.read_file('2130AMIDI-MidHCV_S16', 'conseq.csv')) regions = set(map(itemgetter('region'), conseq_rows)) - expected_regions = ({'1-HCV-2a', '2-HCV-2a'} + expected_regions = ({'1-HCV-2a'} if self.is_denovo else {'HCV-2a'}) assert regions == expected_regions, regions From e0670324a8413d9c650c0a00dd0ecd8e26de999e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 20 Nov 2023 10:02:18 -0800 Subject: [PATCH 096/485] Improvements to contig stitcher tests code --- micall/core/contig_stitcher.py | 3 +- micall/tests/test_contig_stitcher.py | 47 ++++++++++++++-------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index dc194a466..9a0de1ca7 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -297,8 +297,7 @@ 
def stitch_2_contigs(left, right): # Align overlapping parts, then recombine based on concordance. aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) concordance = calculate_concordance(aligned_left, aligned_right) - max_concordance_index = max(range(len(concordance)), - key=lambda i: concordance[i]) + max_concordance_index = max(range(len(concordance)), key=lambda i: concordance[i]) aligned_left_part = aligned_left[:max_concordance_index] aligned_right_part = aligned_right[max_concordance_index:] overlap_seq = ''.join(c for c in aligned_left_part + aligned_right_part if c != '-') diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index f463b47b4..efcec7344 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,6 +1,6 @@ -import pytest import random +import pytest from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference from micall.tests.utils import MockAligner, fixed_random_seed @@ -24,8 +24,8 @@ def test_identical_stitching_of_one_contig(exact_aligner): ] results = list(stitch_contigs(contigs)) - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert len(results) == 1 + assert results[0].seq == contigs[0].seq def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): @@ -53,8 +53,8 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): results = list(stitch_contigs(contigs)) # No claims about the output order, so wrap into set() - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert { contig.seq for contig in contigs } \ + == { contig.seq for contig in results } contigs = [ GenotypedContig(name='b', @@ -76,8 +76,8 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): results = 
list(stitch_contigs(contigs)) # No claims about the output order, so wrap into set() - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert { contig.seq for contig in contigs } \ + == { contig.seq for contig in results } def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): @@ -213,14 +213,13 @@ def test_stitching_with_empty_contigs(exact_aligner): ] results = list(stitch_contigs(contigs)) - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert { contig.seq for contig in contigs } \ + == { contig.seq for contig in results } def test_stitching_of_identical_contigs(exact_aligner): # Scenario: The function correctly handles and avoids duplication when identical contigs are stitched together. - ref_seq = 'A' * 100 contigs = [ GenotypedContig(name=name, seq='ACTGACTG' * 100, @@ -270,8 +269,8 @@ def test_correct_stitching_of_two_partially_overlapping_different_organism_conti results = list(stitch_contigs(contigs)) assert len(results) == 2 - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert { contig.seq for contig in contigs } \ + == { contig.seq for contig in results } def test_correct_processing_complex_nogaps(exact_aligner): @@ -380,8 +379,8 @@ def test_stitching_contig_with_big_noncovered_gap(exact_aligner): results = list(stitch_contigs(contigs)) - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert { contig.seq for contig in contigs } \ + == { contig.seq for contig in results } def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): @@ -408,8 +407,8 @@ def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): results = list(stitch_contigs(contigs)) - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert { contig.seq for contig in contigs } \ + == { contig.seq for contig in results } def 
test_stitching_contig_with_big_covered_gap(exact_aligner): @@ -473,8 +472,8 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): assert all(x.seq == x.lstrip_query().rstrip_query().seq for x in results) - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert { contig.seq for contig in contigs } \ + == { contig.seq for contig in results } def test_stitching_partial_align(exact_aligner): @@ -497,8 +496,8 @@ def test_stitching_partial_align(exact_aligner): assert all(x.seq != x.lstrip_query().rstrip_query().seq for x in results) - assert set(map(lambda x: x.seq, contigs)) \ - != set(map(lambda x: x.lstrip_query().rstrip_query().seq, results)) + assert { contig.seq for contig in contigs } \ + != { contig.lstrip_query().rstrip_query().seq for contig in results } def test_partial_align_consensus(exact_aligner): @@ -516,8 +515,8 @@ def test_partial_align_consensus(exact_aligner): results = list(stitch_consensus(contigs)) assert len(results) == len(contigs) - assert set(map(lambda x: x.seq, contigs)) \ - == set(map(lambda x: x.seq, results)) + assert { contig.seq for contig in contigs } \ + == { contig.seq for contig in results } def test_stitching_partial_align_multiple_sequences(exact_aligner): @@ -547,8 +546,8 @@ def test_stitching_partial_align_multiple_sequences(exact_aligner): for result in results: assert any(result.seq in contig.seq for contig in contigs) - assert set(map(lambda x: x.seq, contigs)) \ - != set(map(lambda x: x.lstrip_query().rstrip_query().seq, results)) + assert { contig.seq for contig in contigs } \ + != { contig.lstrip_query().rstrip_query().seq for contig in results } def test_partial_align_consensus_multiple_sequences(exact_aligner): From 7a153c05a4d1448798039f291b8065c9145a90e7 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 20 Nov 2023 10:45:09 -0800 Subject: [PATCH 097/485] Contig stitcher: make mypy-compliant To verify that the modules are correctly typed, run: ```shell mypy 
micall/core/contig_stitcher.py --check-untyped-defs \ | grep -e tools.py: -e stitcher.py: ``` --- micall/core/contig_stitcher.py | 21 ++++++++++++--------- micall/utils/cigar_tools.py | 26 +++++++++++++------------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9a0de1ca7..7bf3eb527 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -139,15 +139,16 @@ def __init__(self, parts: List[AlignedContig]): raise ValueError("Empty Frankenstei do not exist") # Flatten any possible Frankenstein parts - self.parts = [subpart for part in parts for subpart in - (part.parts if isinstance(part, FrankensteinContig) else [part])] + self.parts: List[AlignedContig] = \ + [subpart for part in parts for subpart in + (part.parts if isinstance(part, FrankensteinContig) else [part])] aligned = reduce(FrankensteinContig.munge, self.parts) super().__init__(aligned.query, aligned.alignment) - def cut_reference(self, cut_point: float) -> 'FrankensteinContig': + def cut_reference(self, cut_point: float) -> Tuple['FrankensteinContig', 'FrankensteinContig']: # Search for the part that needs to be cut left_parts = list(takewhile(lambda part: cut_point >= part.alignment.r_ei + 1, self.parts)) target_part = self.parts[len(left_parts)] @@ -266,7 +267,7 @@ def calculate_concordance(left: str, right: str) -> List[float]: if len(left) != len(right): raise ValueError("Can only calculate concordance for same sized sequences") - result = [0] * len(left) + result: List[float] = [0] * len(left) def slide(left, right): window_size = 30 @@ -388,6 +389,8 @@ def calculate_cumulative_coverage(contigs) -> List[Tuple[int, int]]: for cover_interval in cumulative_coverage): return current + return None + def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig]: """ Filter out all contigs that are contained within other contigs. 
""" @@ -402,7 +405,7 @@ def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig] return contigs -def split_contigs_with_gaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: +def split_contigs_with_gaps(contigs: List[AlignedContig]) -> List[AlignedContig]: def covered_by(gap, other): # Check if any 1 reference coordinate in gap is mapped in other. gap_coords = gap.coordinate_mapping.ref_to_query.domain @@ -438,7 +441,7 @@ def try_split(contig): process_queue.put(right_part) return - process_queue = LifoQueue() + process_queue: LifoQueue = LifoQueue() for contig in contigs: process_queue.put(contig) while not process_queue.empty(): @@ -453,12 +456,12 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig # Contigs that did not align do not need any more processing yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig)) - aligned = [x for x in maybe_aligned if isinstance(x, AlignedContig)] + aligned: List[AlignedContig] = \ + [x for x in maybe_aligned if isinstance(x, AlignedContig)] aligned = split_contigs_with_gaps(aligned) aligned = drop_completely_covered(aligned) - aligned = combine_overlaps(aligned) - yield from aligned + yield from combine_overlaps(aligned) def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 862c3e0b5..f0473a0d9 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -147,7 +147,7 @@ class Cigar(tuple): """ def __new__(cls, cigar_lst: Iterable[Tuple[int, CigarActions]]): - return super(Cigar, cls).__new__(cls, Cigar.normalize(cigar_lst)) + return super(Cigar, cls).__new__(cls, Cigar.normalize(cigar_lst)) # type: ignore @staticmethod @@ -370,7 +370,7 @@ def parse(string) -> 'Cigar': match = re.match(r'([0-9]+)([^0-9])', string) if match: num, operation = match.groups() - data.append([int(num), Cigar.parse_operation(operation)]) + 
data.append((int(num), Cigar.parse_operation(operation))) string = string[match.end():] else: raise ValueError(f"Invalid CIGAR string. Invalid part: {string[:20]}") @@ -573,7 +573,7 @@ def epsilon(self): return Fraction(1, self.cigar.op_length * 3 + 1) - def _ref_cut_to_op_cut(self, cut_point: float): + def _ref_cut_to_op_cut(self, cut_point): mapping = self.coordinate_mapping left_op_cut_point = mapping.ref_to_op.left_max(floor(cut_point)) @@ -615,17 +615,17 @@ def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: The two parts do not share any elements, and no element is "lost". """ - cut_point = Fraction(cut_point) - if cut_point.denominator == 1: + fcut_point: Fraction = Fraction(cut_point) + if fcut_point.denominator == 1: raise ValueError("Cut accepts fractions, not integers") if self.ref_length == 0 or \ - not (self.r_st - 1 < cut_point < self.r_ei + 1): + not (self.r_st - 1 < fcut_point < self.r_ei + 1): raise IndexError("Cut point out of reference bounds") - op_cut_point = self._ref_cut_to_op_cut(cut_point) - left = self._slice(self.r_st, self.q_st, 0, floor(op_cut_point)) - right = self._slice(left.r_ei + 1, left.q_ei + 1, ceil(op_cut_point), self.cigar.op_length) + op_fcut_point = self._ref_cut_to_op_cut(fcut_point) + left = self._slice(self.r_st, self.q_st, 0, floor(op_fcut_point)) + right = self._slice(left.r_ei + 1, left.q_ei + 1, ceil(op_fcut_point), self.cigar.op_length) return left, right @@ -677,7 +677,7 @@ def __repr__(self): return f'CigarHit({str(self.cigar)!r}, r_st={self.r_st!r}, r_ei={self.r_ei!r}, q_st={self.q_st!r}, q_ei={self.q_ei!r})' -def connect_cigar_hits(cigar_hits: Iterable[CigarHit]) -> List[CigarHit]: +def connect_cigar_hits(cigar_hits: List[CigarHit]) -> List[CigarHit]: """ This function exists to deal with the fact that mappy does not always connect big gaps, and returns surrounding parts as two separate alignment hits. 
@@ -688,10 +688,10 @@ def connect_cigar_hits(cigar_hits: Iterable[CigarHit]) -> List[CigarHit]: that overlap with previously found alignments. """ - if not len(cigar_hits) > 0: + if len(cigar_hits) == 0: raise ValueError("Expected a non-empty list of cigar hits") - accumulator = [] + accumulator: List[CigarHit] = [] # Collect non-overlaping parts. # Earlier matches have priority over ones that come after. @@ -705,7 +705,7 @@ def connect_cigar_hits(cigar_hits: Iterable[CigarHit]) -> List[CigarHit]: sorted_parts = sorted(accumulator, key=lambda p: p.r_st) # Segregate independent matches. - sorted_groups = [] + sorted_groups: List[List[CigarHit]] = [] def find_group(hit): for group in sorted_groups: From 535d03f3fc3e2f547be31081c1c7be07698bdfa9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 Nov 2023 17:00:40 -0800 Subject: [PATCH 098/485] Add a simple fuzz-test for contig stitcher --- micall/core/contig_stitcher.py | 1 + .../data/contig_stitcher_fuzz_nogaps.json | 264 ++++++++++++++++++ micall/tests/test_contig_stitcher_fuzz.py | 112 ++++++++ 3 files changed, 377 insertions(+) create mode 100644 micall/tests/data/contig_stitcher_fuzz_nogaps.json create mode 100644 micall/tests/test_contig_stitcher_fuzz.py diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 7bf3eb527..10472715d 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -395,6 +395,7 @@ def calculate_cumulative_coverage(contigs) -> List[Tuple[int, int]]: def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig]: """ Filter out all contigs that are contained within other contigs. 
""" + contigs = contigs[:] while contigs: covered = find_covered_contig(contigs) if covered: diff --git a/micall/tests/data/contig_stitcher_fuzz_nogaps.json b/micall/tests/data/contig_stitcher_fuzz_nogaps.json new file mode 100644 index 000000000..754796324 --- /dev/null +++ b/micall/tests/data/contig_stitcher_fuzz_nogaps.json @@ -0,0 +1,264 @@ +[{ "type": "contig", "fields": { "start": 502, "end": 540, "name": "a" } }, { "type": "contig", "fields": { "start": 196, "end": 890, "name": "b" } }, { "type": "contig", "fields": { "start": 202, "end": 664, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 196, "end": 890, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 502, "end": 540, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 227, "end": 338, "name": "a" } }, { "type": "contig", "fields": { "start": 109, "end": 110, "name": "b" } }, { "type": "contig", "fields": { "start": 912, "end": 972, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 109, "end": 110, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 227, "end": 338, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 550, "end": 637, "name": "a" } }, { "type": "contig", "fields": { "start": 541, "end": 548, "name": "b" } }, { "type": "contig", "fields": { "start": 415, "end": 897, "name": "c" } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 541, "end": 548, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 550, "end": 637, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 75, "end": 570, "name": "a" } }, { "type": "contig", "fields": { "start": 65, "end": 74, "name": "b" } }, { "type": "contig", "fields": { "start": 394, "end": 692, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 65, "end": 74, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 
75, "end": 570, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 892, "end": 988, "name": "a" } }, { "type": "contig", "fields": { "start": 420, "end": 554, "name": "b" } }, { "type": "contig", "fields": { "start": 77, "end": 900, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 77, "end": 900, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 892, "end": 988, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 565, "end": 767, "name": "a" } }, { "type": "contig", "fields": { "start": 765, "end": 825, "name": "b" } }, { "type": "contig", "fields": { "start": 136, "end": 295, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 136, "end": 295, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 565, "end": 767, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 960, "end": 997, "name": "a" } }, { "type": "contig", "fields": { "start": 886, "end": 903, "name": "b" } }, { "type": "contig", "fields": { "start": 1, "end": 347, "name": "c" } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 347, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 960, "end": 997, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 857, "end": 882, "name": "a" } }, { "type": "contig", "fields": { "start": 687, "end": 795, "name": "b" } }, { "type": "contig", "fields": { "start": 298, "end": 856, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 298, "end": 856, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 857, "end": 882, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 374, "end": 644, "name": "a" } }, { "type": "contig", "fields": { "start": 557, "end": 761, "name": "b" } }, { "type": "contig", "fields": { "start": 106, "end": 861, "name": "c" } }, { "type": "overlap", "fields": { "a": { 
"type": "contig", "fields": { "start": 106, "end": 861, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 557, "end": 761, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 513, "end": 923, "name": "a" } }, { "type": "contig", "fields": { "start": 314, "end": 937, "name": "b" } }, { "type": "contig", "fields": { "start": 227, "end": 260, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 227, "end": 260, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 314, "end": 937, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 647, "end": 657, "name": "a" } }, { "type": "contig", "fields": { "start": 694, "end": 937, "name": "b" } }, { "type": "contig", "fields": { "start": 522, "end": 530, "name": "c" } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 522, "end": 530, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 694, "end": 937, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 255, "end": 770, "name": "a" } }, { "type": "contig", "fields": { "start": 245, "end": 360, "name": "b" } }, { "type": "contig", "fields": { "start": 136, "end": 244, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 136, "end": 244, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 245, "end": 360, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 840, "end": 936, "name": "a" } }, { "type": "contig", "fields": { "start": 154, "end": 969, "name": "b" } }, { "type": "contig", "fields": { "start": 812, "end": 839, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 812, "end": 839, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 840, "end": 936, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 154, "end": 969, "name": "b" } }, "b": { "type": "contig", "fields": 
{ "start": 840, "end": 936, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 615, "end": 772, "name": "a" } }, { "type": "contig", "fields": { "start": 289, "end": 740, "name": "b" } }, { "type": "contig", "fields": { "start": 94, "end": 288, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 94, "end": 288, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 289, "end": 740, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 289, "end": 740, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 615, "end": 772, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 147, "end": 735, "name": "a" } }, { "type": "contig", "fields": { "start": 97, "end": 100, "name": "b" } }, { "type": "contig", "fields": { "start": 25, "end": 225, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 25, "end": 225, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 147, "end": 735, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 97, "end": 100, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 147, "end": 735, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 624, "end": 903, "name": "a" } }, { "type": "contig", "fields": { "start": 203, "end": 571, "name": "b" } }, { "type": "contig", "fields": { "start": 420, "end": 623, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 420, "end": 623, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 624, "end": 903, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 203, "end": 571, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 624, "end": 903, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 725, "end": 901, "name": "a" } }, { "type": 
"contig", "fields": { "start": 566, "end": 703, "name": "b" } }, { "type": "contig", "fields": { "start": 215, "end": 950, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 215, "end": 950, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 566, "end": 703, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 566, "end": 703, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 725, "end": 901, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 610, "end": 809, "name": "a" } }, { "type": "contig", "fields": { "start": 564, "end": 596, "name": "b" } }, { "type": "contig", "fields": { "start": 363, "end": 563, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 363, "end": 563, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 564, "end": 596, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 564, "end": 596, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 610, "end": 809, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 158, "end": 710, "name": "a" } }, { "type": "contig", "fields": { "start": 115, "end": 153, "name": "b" } }, { "type": "contig", "fields": { "start": 993, "end": 998, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 115, "end": 153, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 158, "end": 710, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 115, "end": 153, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 158, "end": 710, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 873, "end": 945, "name": "a" } }, { "type": "contig", "fields": { "start": 493, "end": 872, "name": "b" } }, { "type": "contig", "fields": { "start": 323, "end": 383, "name": 
"c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 493, "end": 872, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 873, "end": 945, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 493, "end": 872, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 873, "end": 945, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 92, "end": 880, "name": "a" } }, { "type": "contig", "fields": { "start": 14, "end": 18, "name": "b" } }, { "type": "contig", "fields": { "start": 54, "end": 752, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 54, "end": 752, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 92, "end": 880, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 14, "end": 18, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 92, "end": 880, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 894, "end": 993, "name": "a" } }, { "type": "contig", "fields": { "start": 418, "end": 430, "name": "b" } }, { "type": "contig", "fields": { "start": 759, "end": 860, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 759, "end": 860, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 894, "end": 993, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 418, "end": 430, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 894, "end": 993, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 264, "end": 597, "name": "a" } }, { "type": "contig", "fields": { "start": 20, "end": 207, "name": "b" } }, { "type": "contig", "fields": { "start": 21, "end": 263, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 21, "end": 263, "name": "c" } }, "b": { "type": "contig", 
"fields": { "start": 264, "end": 597, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 207, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 264, "end": 597, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 812, "end": 898, "name": "a" } }, { "type": "contig", "fields": { "start": 427, "end": 568, "name": "b" } }, { "type": "contig", "fields": { "start": 281, "end": 761, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 281, "end": 761, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 427, "end": 568, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 427, "end": 568, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 812, "end": 898, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 474, "end": 511, "name": "a" } }, { "type": "contig", "fields": { "start": 275, "end": 436, "name": "b" } }, { "type": "contig", "fields": { "start": 273, "end": 273, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 273, "end": 273, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 275, "end": 436, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 275, "end": 436, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 474, "end": 511, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 510, "end": 962, "name": "a" } }, { "type": "contig", "fields": { "start": 260, "end": 396, "name": "b" } }, { "type": "contig", "fields": { "start": 168, "end": 259, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 168, "end": 259, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 260, "end": 396, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 260, 
"end": 396, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 510, "end": 962, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 912, "end": 962, "name": "a" } }, { "type": "contig", "fields": { "start": 162, "end": 989, "name": "b" } }, { "type": "contig", "fields": { "start": 281, "end": 974, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 162, "end": 989, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 912, "end": 962, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 281, "end": 974, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 912, "end": 962, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 302, "end": 374, "name": "a" } }, { "type": "contig", "fields": { "start": 61, "end": 301, "name": "b" } }, { "type": "contig", "fields": { "start": 185, "end": 563, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 61, "end": 301, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 302, "end": 374, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 185, "end": 563, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 302, "end": 374, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 438, "end": 590, "name": "a" } }, { "type": "contig", "fields": { "start": 840, "end": 899, "name": "b" } }, { "type": "contig", "fields": { "start": 435, "end": 839, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 435, "end": 839, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 840, "end": 899, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 435, "end": 839, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 438, "end": 590, "name": "a" } } } }] +[{ "type": "contig", 
"fields": { "start": 398, "end": 723, "name": "a" } }, { "type": "contig", "fields": { "start": 161, "end": 947, "name": "b" } }, { "type": "contig", "fields": { "start": 179, "end": 191, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 161, "end": 947, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 398, "end": 723, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 179, "end": 191, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 398, "end": 723, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 114, "end": 842, "name": "a" } }, { "type": "contig", "fields": { "start": 50, "end": 112, "name": "b" } }, { "type": "contig", "fields": { "start": 79, "end": 82, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 50, "end": 112, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 114, "end": 842, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 79, "end": 82, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 114, "end": 842, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 961, "end": 963, "name": "a" } }, { "type": "contig", "fields": { "start": 827, "end": 960, "name": "b" } }, { "type": "contig", "fields": { "start": 364, "end": 767, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 827, "end": 960, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 961, "end": 963, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 364, "end": 767, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 961, "end": 963, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 627, "end": 654, "name": "a" } }, { "type": "contig", "fields": { "start": 412, "end": 733, "name": "b" } }, { 
"type": "contig", "fields": { "start": 53, "end": 428, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 53, "end": 428, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 412, "end": 733, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 53, "end": 428, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 627, "end": 654, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 92, "end": 428, "name": "a" } }, { "type": "contig", "fields": { "start": 74, "end": 74, "name": "b" } }, { "type": "contig", "fields": { "start": 21, "end": 73, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 21, "end": 73, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 74, "end": 74, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 21, "end": 73, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 92, "end": 428, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 429, "end": 706, "name": "a" } }, { "type": "contig", "fields": { "start": 369, "end": 594, "name": "b" } }, { "type": "contig", "fields": { "start": 170, "end": 380, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 369, "end": 594, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 429, "end": 706, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 170, "end": 380, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 429, "end": 706, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 947, "end": 975, "name": "a" } }, { "type": "contig", "fields": { "start": 689, "end": 732, "name": "b" } }, { "type": "contig", "fields": { "start": 219, "end": 535, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { 
"start": 689, "end": 732, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 947, "end": 975, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 219, "end": 535, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 947, "end": 975, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 9, "end": 498, "name": "a" } }, { "type": "contig", "fields": { "start": 7, "end": 9, "name": "b" } }, { "type": "contig", "fields": { "start": 9, "end": 9, "name": "c" } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 7, "end": 9, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 9, "end": 498, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 9, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 9, "end": 498, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 751, "end": 836, "name": "a" } }, { "type": "contig", "fields": { "start": 451, "end": 750, "name": "b" } }, { "type": "contig", "fields": { "start": 436, "end": 663, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 451, "end": 750, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 751, "end": 836, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 436, "end": 663, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 751, "end": 836, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 696, "end": 917, "name": "a" } }, { "type": "contig", "fields": { "start": 187, "end": 619, "name": "b" } }, { "type": "contig", "fields": { "start": 209, "end": 350, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 209, "end": 350, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 696, "end": 917, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { 
"type": "contig", "fields": { "start": 209, "end": 350, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 696, "end": 917, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 566, "end": 784, "name": "a" } }, { "type": "contig", "fields": { "start": 915, "end": 945, "name": "b" } }, { "type": "contig", "fields": { "start": 258, "end": 565, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 258, "end": 565, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 566, "end": 784, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 258, "end": 565, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 566, "end": 784, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 68, "end": 251, "name": "a" } }, { "type": "contig", "fields": { "start": 7, "end": 860, "name": "b" } }, { "type": "contig", "fields": { "start": 6, "end": 42, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 42, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 7, "end": 860, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 42, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 68, "end": 251, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 965, "end": 967, "name": "a" } }, { "type": "contig", "fields": { "start": 988, "end": 988, "name": "b" } }, { "type": "contig", "fields": { "start": 312, "end": 763, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 312, "end": 763, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 988, "end": 988, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 312, "end": 763, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 965, "end": 967, "name": "a" } } } }] 
+[{ "type": "contig", "fields": { "start": 512, "end": 738, "name": "a" } }, { "type": "contig", "fields": { "start": 120, "end": 387, "name": "b" } }, { "type": "contig", "fields": { "start": 62, "end": 119, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 62, "end": 119, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 120, "end": 387, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 62, "end": 119, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 512, "end": 738, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 214, "end": 530, "name": "a" } }, { "type": "contig", "fields": { "start": 107, "end": 213, "name": "b" } }, { "type": "contig", "fields": { "start": 9, "end": 213, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 107, "end": 213, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 214, "end": 530, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 213, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 214, "end": 530, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 38, "end": 360, "name": "a" } }, { "type": "contig", "fields": { "start": 32, "end": 652, "name": "b" } }, { "type": "contig", "fields": { "start": 0, "end": 84, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 32, "end": 652, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 38, "end": 360, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 84, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 32, "end": 652, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 485, "end": 835, "name": "a" } }, { "type": "contig", "fields": { "start": 409, "end": 484, "name": "b" } }, { 
"type": "contig", "fields": { "start": 352, "end": 541, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 409, "end": 484, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 485, "end": 835, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 352, "end": 541, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 409, "end": 484, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 668, "end": 974, "name": "a" } }, { "type": "contig", "fields": { "start": 945, "end": 963, "name": "b" } }, { "type": "contig", "fields": { "start": 116, "end": 964, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 116, "end": 964, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 668, "end": 974, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 116, "end": 964, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 945, "end": 963, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 63, "end": 974, "name": "a" } }, { "type": "contig", "fields": { "start": 5, "end": 294, "name": "b" } }, { "type": "contig", "fields": { "start": 2, "end": 62, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 62, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 63, "end": 974, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 62, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 5, "end": 294, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 250, "end": 287, "name": "a" } }, { "type": "contig", "fields": { "start": 14, "end": 779, "name": "b" } }, { "type": "contig", "fields": { "start": 11, "end": 11, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 
14, "end": 779, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 250, "end": 287, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 11, "end": 11, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 14, "end": 779, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 600, "end": 802, "name": "a" } }, { "type": "contig", "fields": { "start": 535, "end": 584, "name": "b" } }, { "type": "contig", "fields": { "start": 222, "end": 443, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 535, "end": 584, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 600, "end": 802, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 222, "end": 443, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 535, "end": 584, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 114, "end": 826, "name": "a" } }, { "type": "contig", "fields": { "start": 46, "end": 113, "name": "b" } }, { "type": "contig", "fields": { "start": 41, "end": 43, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 46, "end": 113, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 114, "end": 826, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 41, "end": 43, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 46, "end": 113, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 750, "end": 755, "name": "a" } }, { "type": "contig", "fields": { "start": 773, "end": 773, "name": "b" } }, { "type": "contig", "fields": { "start": 69, "end": 770, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 69, "end": 770, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 750, "end": 755, "name": "a" } } } }, { "type": "nooverlap", 
"fields": { "a": { "type": "contig", "fields": { "start": 69, "end": 770, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 773, "end": 773, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 765, "end": 899, "name": "a" } }, { "type": "contig", "fields": { "start": 970, "end": 989, "name": "b" } }, { "type": "contig", "fields": { "start": 337, "end": 622, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 337, "end": 622, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 765, "end": 899, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 337, "end": 622, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 970, "end": 989, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 521, "end": 673, "name": "a" } }, { "type": "contig", "fields": { "start": 826, "end": 974, "name": "b" } }, { "type": "contig", "fields": { "start": 353, "end": 520, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 353, "end": 520, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 521, "end": 673, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 353, "end": 520, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 826, "end": 974, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 663, "end": 756, "name": "a" } }, { "type": "contig", "fields": { "start": 24, "end": 727, "name": "b" } }, { "type": "contig", "fields": { "start": 20, "end": 24, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 727, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 663, "end": 756, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 24, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 
24, "end": 727, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 44, "end": 758, "name": "a" } }, { "type": "contig", "fields": { "start": 5, "end": 39, "name": "b" } }, { "type": "contig", "fields": { "start": 1, "end": 2, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 39, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 44, "end": 758, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 2, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 5, "end": 39, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 223, "end": 843, "name": "a" } }, { "type": "contig", "fields": { "start": 146, "end": 221, "name": "b" } }, { "type": "contig", "fields": { "start": 122, "end": 141, "name": "c" } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 146, "end": 221, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 223, "end": 843, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 122, "end": 141, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 146, "end": 221, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 116, "end": 524, "name": "a" } }, { "type": "contig", "fields": { "start": 76, "end": 115, "name": "b" } }, { "type": "contig", "fields": { "start": 27, "end": 31, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 76, "end": 115, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 116, "end": 524, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 27, "end": 31, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 76, "end": 115, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 499, "end": 728, "name": "a" } }, { "type": "contig", "fields": { "start": 757, "end": 927, 
"name": "b" } }, { "type": "contig", "fields": { "start": 196, "end": 663, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 196, "end": 663, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 499, "end": 728, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 196, "end": 663, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 757, "end": 927, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 892, "end": 963, "name": "a" } }, { "type": "contig", "fields": { "start": 162, "end": 293, "name": "b" } }, { "type": "contig", "fields": { "start": 67, "end": 76, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 67, "end": 76, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 892, "end": 963, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 67, "end": 76, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 162, "end": 293, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 33, "end": 113, "name": "a" } }, { "type": "contig", "fields": { "start": 508, "end": 950, "name": "b" } }, { "type": "contig", "fields": { "start": 3, "end": 32, "name": "c" } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 32, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 33, "end": 113, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 32, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 508, "end": 950, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 301, "end": 649, "name": "a" } }, { "type": "contig", "fields": { "start": 446, "end": 545, "name": "b" } }, { "type": "contig", "fields": { "start": 108, "end": 300, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { 
"start": 108, "end": 300, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 301, "end": 649, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 108, "end": 300, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 446, "end": 545, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 642, "end": 724, "name": "a" } }, { "type": "contig", "fields": { "start": 367, "end": 631, "name": "b" } }, { "type": "contig", "fields": { "start": 98, "end": 131, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 98, "end": 131, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 367, "end": 631, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 98, "end": 131, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 367, "end": 631, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 389, "end": 566, "name": "a" } }, { "type": "contig", "fields": { "start": 148, "end": 730, "name": "b" } }, { "type": "contig", "fields": { "start": 9, "end": 147, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 147, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 148, "end": 730, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 147, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 148, "end": 730, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 931, "end": 944, "name": "a" } }, { "type": "contig", "fields": { "start": 141, "end": 930, "name": "b" } }, { "type": "contig", "fields": { "start": 140, "end": 140, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 141, "end": 930, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 931, "end": 944, "name": "a" } } } }, { "type": "tangent", 
"fields": { "a": { "type": "contig", "fields": { "start": 140, "end": 140, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 141, "end": 930, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 476, "end": 573, "name": "a" } }, { "type": "contig", "fields": { "start": 476, "end": 652, "name": "b" } }, { "type": "contig", "fields": { "start": 464, "end": 475, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 464, "end": 475, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 476, "end": 573, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 464, "end": 475, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 476, "end": 652, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 183, "end": 500, "name": "a" } }, { "type": "contig", "fields": { "start": 23, "end": 25, "name": "b" } }, { "type": "contig", "fields": { "start": 17, "end": 404, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 17, "end": 404, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 183, "end": 500, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 17, "end": 404, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 23, "end": 25, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 23, "end": 25, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 183, "end": 500, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 444, "end": 949, "name": "a" } }, { "type": "contig", "fields": { "start": 170, "end": 219, "name": "b" } }, { "type": "contig", "fields": { "start": 167, "end": 443, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 167, "end": 443, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 444, 
"end": 949, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 167, "end": 443, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 170, "end": 219, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 170, "end": 219, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 444, "end": 949, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 276, "end": 591, "name": "a" } }, { "type": "contig", "fields": { "start": 85, "end": 185, "name": "b" } }, { "type": "contig", "fields": { "start": 2, "end": 385, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 385, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 276, "end": 591, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 85, "end": 185, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 276, "end": 591, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 85, "end": 185, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 276, "end": 591, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 145, "end": 146, "name": "a" } }, { "type": "contig", "fields": { "start": 75, "end": 107, "name": "b" } }, { "type": "contig", "fields": { "start": 20, "end": 144, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 144, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 145, "end": 146, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 75, "end": 107, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 145, "end": 146, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 75, "end": 107, "name": "b" } }, "b": { "type": "contig", 
"fields": { "start": 145, "end": 146, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 49, "end": 385, "name": "a" } }, { "type": "contig", "fields": { "start": 41, "end": 43, "name": "b" } }, { "type": "contig", "fields": { "start": 8, "end": 216, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 216, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 41, "end": 43, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 41, "end": 43, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 49, "end": 385, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 41, "end": 43, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 49, "end": 385, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 8, "end": 660, "name": "a" } }, { "type": "contig", "fields": { "start": 4, "end": 4, "name": "b" } }, { "type": "contig", "fields": { "start": 1, "end": 3, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 3, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 4, "end": 4, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 4, "end": 4, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 8, "end": 660, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 4, "end": 4, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 8, "end": 660, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 642, "end": 949, "name": "a" } }, { "type": "contig", "fields": { "start": 272, "end": 641, "name": "b" } }, { "type": "contig", "fields": { "start": 302, "end": 845, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 272, "end": 641, "name": "b" } }, "b": { 
"type": "contig", "fields": { "start": 642, "end": 949, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 302, "end": 845, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 642, "end": 949, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 272, "end": 641, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 642, "end": 949, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 366, "end": 398, "name": "a" } }, { "type": "contig", "fields": { "start": 323, "end": 342, "name": "b" } }, { "type": "contig", "fields": { "start": 272, "end": 351, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 323, "end": 342, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 366, "end": 398, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 272, "end": 351, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 366, "end": 398, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 323, "end": 342, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 366, "end": 398, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 806, "end": 894, "name": "a" } }, { "type": "contig", "fields": { "start": 391, "end": 805, "name": "b" } }, { "type": "contig", "fields": { "start": 508, "end": 763, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 391, "end": 805, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 806, "end": 894, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 508, "end": 763, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 806, "end": 894, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 391, "end": 
805, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 806, "end": 894, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 141, "end": 479, "name": "a" } }, { "type": "contig", "fields": { "start": 36, "end": 111, "name": "b" } }, { "type": "contig", "fields": { "start": 18, "end": 115, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 18, "end": 115, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 36, "end": 111, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 18, "end": 115, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 141, "end": 479, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 36, "end": 111, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 141, "end": 479, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 571, "end": 901, "name": "a" } }, { "type": "contig", "fields": { "start": 402, "end": 448, "name": "b" } }, { "type": "contig", "fields": { "start": 47, "end": 401, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 47, "end": 401, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 402, "end": 448, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 47, "end": 401, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 571, "end": 901, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 402, "end": 448, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 571, "end": 901, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 441, "end": 666, "name": "a" } }, { "type": "contig", "fields": { "start": 395, "end": 440, "name": "b" } }, { "type": "contig", "fields": { "start": 287, "end": 440, "name": "c" } }, { "type": "tangent", "fields": { "a": { 
"type": "contig", "fields": { "start": 395, "end": 440, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 441, "end": 666, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 287, "end": 440, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 441, "end": 666, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 395, "end": 440, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 441, "end": 666, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 142, "end": 212, "name": "a" } }, { "type": "contig", "fields": { "start": 51, "end": 141, "name": "b" } }, { "type": "contig", "fields": { "start": 20, "end": 756, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 51, "end": 141, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 142, "end": 212, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 756, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 51, "end": 141, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 51, "end": 141, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 142, "end": 212, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 738, "end": 896, "name": "a" } }, { "type": "contig", "fields": { "start": 642, "end": 681, "name": "b" } }, { "type": "contig", "fields": { "start": 572, "end": 903, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 572, "end": 903, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 738, "end": 896, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 572, "end": 903, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 642, "end": 681, "name": "b" } } } }, { "type": "kiss", 
"fields": { "a": { "type": "contig", "fields": { "start": 642, "end": 681, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 738, "end": 896, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 385, "end": 920, "name": "a" } }, { "type": "contig", "fields": { "start": 193, "end": 247, "name": "b" } }, { "type": "contig", "fields": { "start": 55, "end": 384, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 55, "end": 384, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 385, "end": 920, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 55, "end": 384, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 193, "end": 247, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 193, "end": 247, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 385, "end": 920, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 602, "end": 905, "name": "a" } }, { "type": "contig", "fields": { "start": 310, "end": 315, "name": "b" } }, { "type": "contig", "fields": { "start": 141, "end": 277, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 310, "end": 315, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 602, "end": 905, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 141, "end": 277, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 310, "end": 315, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 310, "end": 315, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 602, "end": 905, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 400, "end": 825, "name": "a" } }, { "type": "contig", "fields": { "start": 337, "end": 399, "name": "b" } }, { "type": "contig", "fields": { "start": 
319, "end": 327, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 337, "end": 399, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 400, "end": 825, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 319, "end": 327, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 337, "end": 399, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 337, "end": 399, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 400, "end": 825, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 217, "end": 462, "name": "a" } }, { "type": "contig", "fields": { "start": 201, "end": 205, "name": "b" } }, { "type": "contig", "fields": { "start": 112, "end": 147, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 112, "end": 147, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 217, "end": 462, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 112, "end": 147, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 201, "end": 205, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 201, "end": 205, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 217, "end": 462, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 513, "end": 701, "name": "a" } }, { "type": "contig", "fields": { "start": 355, "end": 512, "name": "b" } }, { "type": "contig", "fields": { "start": 64, "end": 354, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 355, "end": 512, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 513, "end": 701, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 64, "end": 354, "name": "c" } }, "b": { "type": 
"contig", "fields": { "start": 355, "end": 512, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 355, "end": 512, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 513, "end": 701, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 950, "end": 955, "name": "a" } }, { "type": "contig", "fields": { "start": 950, "end": 950, "name": "b" } }, { "type": "contig", "fields": { "start": 615, "end": 949, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 615, "end": 949, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 955, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 615, "end": 949, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 950, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 950, "end": 950, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 955, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 167, "end": 317, "name": "a" } }, { "type": "contig", "fields": { "start": 12, "end": 779, "name": "b" } }, { "type": "contig", "fields": { "start": 7, "end": 11, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 7, "end": 11, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 12, "end": 779, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 779, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 167, "end": 317, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 7, "end": 11, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 167, "end": 317, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 479, "end": 987, "name": "a" } }, { "type": "contig", "fields": { 
"start": 185, "end": 299, "name": "b" } }, { "type": "contig", "fields": { "start": 136, "end": 244, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 136, "end": 244, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 185, "end": 299, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 185, "end": 299, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 479, "end": 987, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 136, "end": 244, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 479, "end": 987, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 130, "end": 305, "name": "a" } }, { "type": "contig", "fields": { "start": 125, "end": 126, "name": "b" } }, { "type": "contig", "fields": { "start": 106, "end": 124, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 106, "end": 124, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 125, "end": 126, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 125, "end": 126, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 130, "end": 305, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 106, "end": 124, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 130, "end": 305, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 505, "end": 869, "name": "a" } }, { "type": "contig", "fields": { "start": 102, "end": 575, "name": "b" } }, { "type": "contig", "fields": { "start": 94, "end": 384, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 102, "end": 575, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 505, "end": 869, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { 
"type": "contig", "fields": { "start": 94, "end": 384, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 102, "end": 575, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 94, "end": 384, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 505, "end": 869, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 785, "end": 840, "name": "a" } }, { "type": "contig", "fields": { "start": 719, "end": 784, "name": "b" } }, { "type": "contig", "fields": { "start": 659, "end": 777, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 719, "end": 784, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 785, "end": 840, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 659, "end": 777, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 719, "end": 784, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 659, "end": 777, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 785, "end": 840, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 410, "end": 583, "name": "a" } }, { "type": "contig", "fields": { "start": 287, "end": 409, "name": "b" } }, { "type": "contig", "fields": { "start": 158, "end": 286, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 287, "end": 409, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 410, "end": 583, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 158, "end": 286, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 287, "end": 409, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 158, "end": 286, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 410, "end": 583, "name": "a" } } } }] +[{ 
"type": "contig", "fields": { "start": 802, "end": 896, "name": "a" } }, { "type": "contig", "fields": { "start": 29, "end": 981, "name": "b" } }, { "type": "contig", "fields": { "start": 318, "end": 801, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 318, "end": 801, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 802, "end": 896, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 29, "end": 981, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 802, "end": 896, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 318, "end": 801, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 802, "end": 896, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 584, "end": 641, "name": "a" } }, { "type": "contig", "fields": { "start": 258, "end": 771, "name": "b" } }, { "type": "contig", "fields": { "start": 110, "end": 257, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 110, "end": 257, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 258, "end": 771, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 258, "end": 771, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 584, "end": 641, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 110, "end": 257, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 584, "end": 641, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 763, "end": 775, "name": "a" } }, { "type": "contig", "fields": { "start": 407, "end": 752, "name": "b" } }, { "type": "contig", "fields": { "start": 688, "end": 762, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 688, "end": 762, "name": "c" } }, "b": { "type": "contig", "fields": 
{ "start": 763, "end": 775, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 407, "end": 752, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 763, "end": 775, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 688, "end": 762, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 763, "end": 775, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 44, "end": 168, "name": "a" } }, { "type": "contig", "fields": { "start": 20, "end": 33, "name": "b" } }, { "type": "contig", "fields": { "start": 3, "end": 24, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 24, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 20, "end": 33, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 33, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 44, "end": 168, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 24, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 44, "end": 168, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 230, "end": 327, "name": "a" } }, { "type": "contig", "fields": { "start": 208, "end": 215, "name": "b" } }, { "type": "contig", "fields": { "start": 114, "end": 207, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 114, "end": 207, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 208, "end": 215, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 208, "end": 215, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 230, "end": 327, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 114, "end": 207, "name": "c" } }, "b": { "type": "contig", 
"fields": { "start": 230, "end": 327, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 775, "end": 869, "name": "a" } }, { "type": "contig", "fields": { "start": 106, "end": 454, "name": "b" } }, { "type": "contig", "fields": { "start": 289, "end": 556, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 106, "end": 454, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 775, "end": 869, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 106, "end": 454, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 775, "end": 869, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 289, "end": 556, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 775, "end": 869, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 736, "end": 935, "name": "a" } }, { "type": "contig", "fields": { "start": 234, "end": 735, "name": "b" } }, { "type": "contig", "fields": { "start": 261, "end": 668, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 234, "end": 735, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 736, "end": 935, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 234, "end": 735, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 736, "end": 935, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 261, "end": 668, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 736, "end": 935, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 227, "end": 344, "name": "a" } }, { "type": "contig", "fields": { "start": 159, "end": 159, "name": "b" } }, { "type": "contig", "fields": { "start": 80, "end": 102, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 80, 
"end": 102, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 227, "end": 344, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 159, "end": 159, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 227, "end": 344, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 80, "end": 102, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 227, "end": 344, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 523, "end": 734, "name": "a" } }, { "type": "contig", "fields": { "start": 271, "end": 466, "name": "b" } }, { "type": "contig", "fields": { "start": 185, "end": 522, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 185, "end": 522, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 523, "end": 734, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 271, "end": 466, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 523, "end": 734, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 185, "end": 522, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 523, "end": 734, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 158, "end": 772, "name": "a" } }, { "type": "contig", "fields": { "start": 107, "end": 149, "name": "b" } }, { "type": "contig", "fields": { "start": 30, "end": 121, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 30, "end": 121, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 107, "end": 149, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 107, "end": 149, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 158, "end": 772, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { 
"start": 30, "end": 121, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 158, "end": 772, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 54, "end": 175, "name": "a" } }, { "type": "contig", "fields": { "start": 17, "end": 19, "name": "b" } }, { "type": "contig", "fields": { "start": 12, "end": 14, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 14, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 17, "end": 19, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 17, "end": 19, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 54, "end": 175, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 14, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 54, "end": 175, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 72, "end": 953, "name": "a" } }, { "type": "contig", "fields": { "start": 39, "end": 72, "name": "b" } }, { "type": "contig", "fields": { "start": 1, "end": 38, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 38, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 39, "end": 72, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 39, "end": 72, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 72, "end": 953, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 38, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 72, "end": 953, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 110, "end": 505, "name": "a" } }, { "type": "contig", "fields": { "start": 27, "end": 396, "name": "b" } }, { "type": "contig", "fields": { "start": 13, "end": 103, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": 
"contig", "fields": { "start": 27, "end": 396, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 110, "end": 505, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 13, "end": 103, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 110, "end": 505, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 13, "end": 103, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 110, "end": 505, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 930, "end": 943, "name": "a" } }, { "type": "contig", "fields": { "start": 280, "end": 818, "name": "b" } }, { "type": "contig", "fields": { "start": 351, "end": 645, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 280, "end": 818, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 930, "end": 943, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 351, "end": 645, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 930, "end": 943, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 351, "end": 645, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 930, "end": 943, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 745, "end": 786, "name": "a" } }, { "type": "contig", "fields": { "start": 530, "end": 744, "name": "b" } }, { "type": "contig", "fields": { "start": 568, "end": 703, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 530, "end": 744, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 745, "end": 786, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 568, "end": 703, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 745, "end": 786, "name": "a" } } } }, { "type": "kiss", 
"fields": { "a": { "type": "contig", "fields": { "start": 568, "end": 703, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 745, "end": 786, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 283, "end": 654, "name": "a" } }, { "type": "contig", "fields": { "start": 236, "end": 814, "name": "b" } }, { "type": "contig", "fields": { "start": 126, "end": 274, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 126, "end": 274, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 236, "end": 814, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 126, "end": 274, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 283, "end": 654, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 126, "end": 274, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 283, "end": 654, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 28, "end": 86, "name": "a" } }, { "type": "contig", "fields": { "start": 21, "end": 587, "name": "b" } }, { "type": "contig", "fields": { "start": 0, "end": 20, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 20, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 21, "end": 587, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 20, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 28, "end": 86, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 20, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 28, "end": 86, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 750, "end": 946, "name": "a" } }, { "type": "contig", "fields": { "start": 297, "end": 749, "name": "b" } }, { "type": "contig", "fields": { "start": 184, "end": 749, 
"name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 297, "end": 749, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 750, "end": 946, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 184, "end": 749, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 750, "end": 946, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 184, "end": 749, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 750, "end": 946, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 332, "end": 620, "name": "a" } }, { "type": "contig", "fields": { "start": 80, "end": 443, "name": "b" } }, { "type": "contig", "fields": { "start": 76, "end": 158, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 80, "end": 443, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 332, "end": 620, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 76, "end": 158, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 80, "end": 443, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 76, "end": 158, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 332, "end": 620, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 769, "end": 793, "name": "a" } }, { "type": "contig", "fields": { "start": 717, "end": 768, "name": "b" } }, { "type": "contig", "fields": { "start": 392, "end": 766, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 717, "end": 768, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 769, "end": 793, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 392, "end": 766, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 
717, "end": 768, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 392, "end": 766, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 769, "end": 793, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 365, "end": 618, "name": "a" } }, { "type": "contig", "fields": { "start": 59, "end": 871, "name": "b" } }, { "type": "contig", "fields": { "start": 47, "end": 364, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 47, "end": 364, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 365, "end": 618, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 47, "end": 364, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 59, "end": 871, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 47, "end": 364, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 365, "end": 618, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 75, "end": 769, "name": "a" } }, { "type": "contig", "fields": { "start": 62, "end": 931, "name": "b" } }, { "type": "contig", "fields": { "start": 14, "end": 59, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 62, "end": 931, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 75, "end": 769, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 14, "end": 59, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 62, "end": 931, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 14, "end": 59, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 75, "end": 769, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 816, "end": 844, "name": "a" } }, { "type": "contig", "fields": { "start": 698, "end": 748, "name": "b" } }, { 
"type": "contig", "fields": { "start": 264, "end": 644, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 698, "end": 748, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 816, "end": 844, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 264, "end": 644, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 698, "end": 748, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 264, "end": 644, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 816, "end": 844, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 855, "end": 922, "name": "a" } }, { "type": "contig", "fields": { "start": 156, "end": 854, "name": "b" } }, { "type": "contig", "fields": { "start": 84, "end": 86, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 156, "end": 854, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 855, "end": 922, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 84, "end": 86, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 156, "end": 854, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 84, "end": 86, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 855, "end": 922, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 22, "end": 771, "name": "a" } }, { "type": "contig", "fields": { "start": 827, "end": 843, "name": "b" } }, { "type": "contig", "fields": { "start": 17, "end": 18, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 17, "end": 18, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 22, "end": 771, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 17, "end": 18, "name": 
"c" } }, "b": { "type": "contig", "fields": { "start": 827, "end": 843, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 17, "end": 18, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 22, "end": 771, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 869, "end": 983, "name": "a" } }, { "type": "contig", "fields": { "start": 998, "end": 999, "name": "b" } }, { "type": "contig", "fields": { "start": 793, "end": 868, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 793, "end": 868, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 869, "end": 983, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 793, "end": 868, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 998, "end": 999, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 793, "end": 868, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 869, "end": 983, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 935, "end": 990, "name": "a" } }, { "type": "contig", "fields": { "start": 572, "end": 934, "name": "b" } }, { "type": "contig", "fields": { "start": 381, "end": 571, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 572, "end": 934, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 935, "end": 990, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 381, "end": 571, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 572, "end": 934, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 381, "end": 571, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 935, "end": 990, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 685, "end": 956, "name": "a" } }, { 
"type": "contig", "fields": { "start": 685, "end": 922, "name": "b" } }, { "type": "contig", "fields": { "start": 524, "end": 684, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 524, "end": 684, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 685, "end": 956, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 524, "end": 684, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 685, "end": 922, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 524, "end": 684, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 685, "end": 956, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 418, "end": 658, "name": "a" } }, { "type": "contig", "fields": { "start": 112, "end": 622, "name": "b" } }, { "type": "contig", "fields": { "start": 33, "end": 417, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 33, "end": 417, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 418, "end": 658, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 112, "end": 622, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 418, "end": 658, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 33, "end": 417, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 112, "end": 622, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 844, "end": 869, "name": "a" } }, { "type": "contig", "fields": { "start": 543, "end": 933, "name": "b" } }, { "type": "contig", "fields": { "start": 289, "end": 942, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 543, "end": 933, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 844, "end": 869, "name": "a" } } } }, { "type": "overlap", 
"fields": { "a": { "type": "contig", "fields": { "start": 289, "end": 942, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 844, "end": 869, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 289, "end": 942, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 543, "end": 933, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 735, "end": 936, "name": "a" } }, { "type": "contig", "fields": { "start": 404, "end": 734, "name": "b" } }, { "type": "contig", "fields": { "start": 36, "end": 915, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 404, "end": 734, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 735, "end": 936, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 36, "end": 915, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 735, "end": 936, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 36, "end": 915, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 404, "end": 734, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 283, "end": 721, "name": "a" } }, { "type": "contig", "fields": { "start": 266, "end": 282, "name": "b" } }, { "type": "contig", "fields": { "start": 129, "end": 282, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 266, "end": 282, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 283, "end": 721, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 129, "end": 282, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 283, "end": 721, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 129, "end": 282, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 266, "end": 282, "name": "b" } 
} } }] +[{ "type": "contig", "fields": { "start": 649, "end": 727, "name": "a" } }, { "type": "contig", "fields": { "start": 284, "end": 994, "name": "b" } }, { "type": "contig", "fields": { "start": 250, "end": 274, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 284, "end": 994, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 649, "end": 727, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 250, "end": 274, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 649, "end": 727, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 250, "end": 274, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 284, "end": 994, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 399, "end": 905, "name": "a" } }, { "type": "contig", "fields": { "start": 159, "end": 229, "name": "b" } }, { "type": "contig", "fields": { "start": 49, "end": 92, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 159, "end": 229, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 399, "end": 905, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 49, "end": 92, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 399, "end": 905, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 49, "end": 92, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 159, "end": 229, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 965, "end": 989, "name": "a" } }, { "type": "contig", "fields": { "start": 257, "end": 964, "name": "b" } }, { "type": "contig", "fields": { "start": 225, "end": 233, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 257, "end": 964, "name": "b" } }, "b": { 
"type": "contig", "fields": { "start": 965, "end": 989, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 225, "end": 233, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 965, "end": 989, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 225, "end": 233, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 257, "end": 964, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 821, "end": 861, "name": "a" } }, { "type": "contig", "fields": { "start": 820, "end": 931, "name": "b" } }, { "type": "contig", "fields": { "start": 566, "end": 820, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 566, "end": 820, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 821, "end": 861, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 820, "end": 931, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 821, "end": 861, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 566, "end": 820, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 820, "end": 931, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 894, "end": 922, "name": "a" } }, { "type": "contig", "fields": { "start": 886, "end": 996, "name": "b" } }, { "type": "contig", "fields": { "start": 66, "end": 885, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 66, "end": 885, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 886, "end": 996, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 886, "end": 996, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 894, "end": 922, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 66, "end": 
885, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 886, "end": 996, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 952, "end": 996, "name": "a" } }, { "type": "contig", "fields": { "start": 167, "end": 571, "name": "b" } }, { "type": "contig", "fields": { "start": 43, "end": 166, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 43, "end": 166, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 167, "end": 571, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 167, "end": 571, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 952, "end": 996, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 43, "end": 166, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 167, "end": 571, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 48, "end": 958, "name": "a" } }, { "type": "contig", "fields": { "start": 29, "end": 29, "name": "b" } }, { "type": "contig", "fields": { "start": 1, "end": 14, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 29, "end": 29, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 48, "end": 958, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 29, "end": 29, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 48, "end": 958, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 14, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 29, "end": 29, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 451, "end": 615, "name": "a" } }, { "type": "contig", "fields": { "start": 284, "end": 450, "name": "b" } }, { "type": "contig", "fields": { "start": 209, "end": 210, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": 
"contig", "fields": { "start": 284, "end": 450, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 451, "end": 615, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 284, "end": 450, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 451, "end": 615, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 209, "end": 210, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 284, "end": 450, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 42, "end": 687, "name": "a" } }, { "type": "contig", "fields": { "start": 10, "end": 38, "name": "b" } }, { "type": "contig", "fields": { "start": 9, "end": 10, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 10, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 42, "end": 687, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 10, "end": 38, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 42, "end": 687, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 10, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 10, "end": 38, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 92, "end": 840, "name": "a" } }, { "type": "contig", "fields": { "start": 91, "end": 91, "name": "b" } }, { "type": "contig", "fields": { "start": 1, "end": 91, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 91, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 92, "end": 840, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 91, "end": 91, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 92, "end": 840, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", 
"fields": { "start": 1, "end": 91, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 91, "end": 91, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 613, "end": 679, "name": "a" } }, { "type": "contig", "fields": { "start": 271, "end": 432, "name": "b" } }, { "type": "contig", "fields": { "start": 142, "end": 192, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 142, "end": 192, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 271, "end": 432, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 271, "end": 432, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 613, "end": 679, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 142, "end": 192, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 271, "end": 432, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 202, "end": 432, "name": "a" } }, { "type": "contig", "fields": { "start": 197, "end": 199, "name": "b" } }, { "type": "contig", "fields": { "start": 84, "end": 196, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 84, "end": 196, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 197, "end": 199, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 197, "end": 199, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 202, "end": 432, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 84, "end": 196, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 197, "end": 199, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 272, "end": 569, "name": "a" } }, { "type": "contig", "fields": { "start": 514, "end": 531, "name": "b" } }, { "type": "contig", "fields": { "start": 175, "end": 513, "name": "c" } }, { "type": 
"tangent", "fields": { "a": { "type": "contig", "fields": { "start": 175, "end": 513, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 514, "end": 531, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 175, "end": 513, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 272, "end": 569, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 175, "end": 513, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 514, "end": 531, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 466, "end": 763, "name": "a" } }, { "type": "contig", "fields": { "start": 444, "end": 621, "name": "b" } }, { "type": "contig", "fields": { "start": 101, "end": 172, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 444, "end": 621, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 466, "end": 763, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 101, "end": 172, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 466, "end": 763, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 101, "end": 172, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 444, "end": 621, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 918, "end": 934, "name": "a" } }, { "type": "contig", "fields": { "start": 352, "end": 569, "name": "b" } }, { "type": "contig", "fields": { "start": 181, "end": 306, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 352, "end": 569, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 918, "end": 934, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 181, "end": 306, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 918, "end": 934, 
"name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 181, "end": 306, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 352, "end": 569, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 633, "end": 980, "name": "a" } }, { "type": "contig", "fields": { "start": 370, "end": 632, "name": "b" } }, { "type": "contig", "fields": { "start": 44, "end": 116, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 370, "end": 632, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 633, "end": 980, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 44, "end": 116, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 633, "end": 980, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 44, "end": 116, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 370, "end": 632, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 119, "end": 708, "name": "a" } }, { "type": "contig", "fields": { "start": 78, "end": 433, "name": "b" } }, { "type": "contig", "fields": { "start": 53, "end": 77, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 53, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 78, "end": 433, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 53, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 119, "end": 708, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 53, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 78, "end": 433, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 524, "end": 671, "name": "a" } }, { "type": "contig", "fields": { "start": 351, "end": 859, "name": "b" } }, { "type": 
"contig", "fields": { "start": 347, "end": 350, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 351, "end": 859, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 524, "end": 671, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 347, "end": 350, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 524, "end": 671, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 347, "end": 350, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 351, "end": 859, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 510, "end": 804, "name": "a" } }, { "type": "contig", "fields": { "start": 17, "end": 356, "name": "b" } }, { "type": "contig", "fields": { "start": 12, "end": 14, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 17, "end": 356, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 510, "end": 804, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 14, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 510, "end": 804, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 14, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 17, "end": 356, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 738, "end": 757, "name": "a" } }, { "type": "contig", "fields": { "start": 63, "end": 294, "name": "b" } }, { "type": "contig", "fields": { "start": 11, "end": 12, "name": "c" } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 63, "end": 294, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 738, "end": 757, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 11, "end": 12, "name": "c" } }, "b": { "type": 
"contig", "fields": { "start": 738, "end": 757, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 11, "end": 12, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 63, "end": 294, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 606, "end": 664, "name": "a" } }, { "type": "contig", "fields": { "start": 26, "end": 605, "name": "b" } }, { "type": "contig", "fields": { "start": 20, "end": 20, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 26, "end": 605, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 606, "end": 664, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 20, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 606, "end": 664, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 20, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 26, "end": 605, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 873, "end": 950, "name": "a" } }, { "type": "contig", "fields": { "start": 640, "end": 870, "name": "b" } }, { "type": "contig", "fields": { "start": 140, "end": 554, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 140, "end": 554, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 873, "end": 950, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 140, "end": 554, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 873, "end": 950, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 140, "end": 554, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 640, "end": 870, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 649, "end": 893, "name": "a" } }, { "type": "contig", "fields": { "start": 741, 
"end": 963, "name": "b" } }, { "type": "contig", "fields": { "start": 623, "end": 648, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 623, "end": 648, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 649, "end": 893, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 623, "end": 648, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 649, "end": 893, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 623, "end": 648, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 741, "end": 963, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 126, "end": 131, "name": "a" } }, { "type": "contig", "fields": { "start": 178, "end": 763, "name": "b" } }, { "type": "contig", "fields": { "start": 22, "end": 77, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 22, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 178, "end": 763, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 22, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 126, "end": 131, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 22, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 178, "end": 763, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 265, "end": 905, "name": "a" } }, { "type": "contig", "fields": { "start": 14, "end": 926, "name": "b" } }, { "type": "contig", "fields": { "start": 12, "end": 13, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 13, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 14, "end": 926, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 
13, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 265, "end": 905, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 13, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 14, "end": 926, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 951, "end": 982, "name": "a" } }, { "type": "contig", "fields": { "start": 950, "end": 950, "name": "b" } }, { "type": "contig", "fields": { "start": 91, "end": 950, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 950, "end": 950, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 951, "end": 982, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 91, "end": 950, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 951, "end": 982, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 91, "end": 950, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 950, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 455, "end": 772, "name": "a" } }, { "type": "contig", "fields": { "start": 347, "end": 900, "name": "b" } }, { "type": "contig", "fields": { "start": 335, "end": 341, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 347, "end": 900, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 455, "end": 772, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 335, "end": 341, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 347, "end": 900, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 335, "end": 341, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 347, "end": 900, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 235, "end": 631, "name": "a" } 
}, { "type": "contig", "fields": { "start": 52, "end": 156, "name": "b" } }, { "type": "contig", "fields": { "start": 8, "end": 40, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 52, "end": 156, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 235, "end": 631, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 40, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 52, "end": 156, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 40, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 52, "end": 156, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 771, "end": 932, "name": "a" } }, { "type": "contig", "fields": { "start": 380, "end": 770, "name": "b" } }, { "type": "contig", "fields": { "start": 333, "end": 333, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 380, "end": 770, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 771, "end": 932, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 333, "end": 333, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 380, "end": 770, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 333, "end": 333, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 380, "end": 770, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 703, "end": 936, "name": "a" } }, { "type": "contig", "fields": { "start": 966, "end": 966, "name": "b" } }, { "type": "contig", "fields": { "start": 681, "end": 854, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 681, "end": 854, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 703, "end": 936, "name": "a" } } } }, { "type": "nooverlap", 
"fields": { "a": { "type": "contig", "fields": { "start": 681, "end": 854, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 966, "end": 966, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 681, "end": 854, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 966, "end": 966, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 512, "end": 939, "name": "a" } }, { "type": "contig", "fields": { "start": 472, "end": 496, "name": "b" } }, { "type": "contig", "fields": { "start": 388, "end": 413, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 388, "end": 413, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 512, "end": 939, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 388, "end": 413, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 472, "end": 496, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 388, "end": 413, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 472, "end": 496, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 534, "end": 837, "name": "a" } }, { "type": "contig", "fields": { "start": 616, "end": 949, "name": "b" } }, { "type": "contig", "fields": { "start": 357, "end": 533, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 357, "end": 533, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 534, "end": 837, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 357, "end": 533, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 616, "end": 949, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 357, "end": 533, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 616, "end": 949, "name": "b" } 
} } }] +[{ "type": "contig", "fields": { "start": 725, "end": 903, "name": "a" } }, { "type": "contig", "fields": { "start": 670, "end": 724, "name": "b" } }, { "type": "contig", "fields": { "start": 517, "end": 669, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 670, "end": 724, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 725, "end": 903, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 517, "end": 669, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 670, "end": 724, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 517, "end": 669, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 670, "end": 724, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 591, "end": 896, "name": "a" } }, { "type": "contig", "fields": { "start": 591, "end": 701, "name": "b" } }, { "type": "contig", "fields": { "start": 213, "end": 590, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 213, "end": 590, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 591, "end": 896, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 213, "end": 590, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 591, "end": 701, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 213, "end": 590, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 591, "end": 701, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 806, "end": 994, "name": "a" } }, { "type": "contig", "fields": { "start": 108, "end": 208, "name": "b" } }, { "type": "contig", "fields": { "start": 40, "end": 912, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 40, "end": 912, "name": "c" } }, "b": { "type": "contig", 
"fields": { "start": 806, "end": 994, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 40, "end": 912, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 108, "end": 208, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 108, "end": 208, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 806, "end": 994, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 108, "end": 208, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 806, "end": 994, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 785, "end": 990, "name": "a" } }, { "type": "contig", "fields": { "start": 654, "end": 686, "name": "b" } }, { "type": "contig", "fields": { "start": 351, "end": 784, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 351, "end": 784, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 785, "end": 990, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 351, "end": 784, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 654, "end": 686, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 654, "end": 686, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 785, "end": 990, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 654, "end": 686, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 785, "end": 990, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 904, "end": 983, "name": "a" } }, { "type": "contig", "fields": { "start": 397, "end": 713, "name": "b" } }, { "type": "contig", "fields": { "start": 222, "end": 813, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 222, "end": 813, "name": "c" } 
}, "b": { "type": "contig", "fields": { "start": 397, "end": 713, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 397, "end": 713, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 904, "end": 983, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 222, "end": 813, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 904, "end": 983, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 397, "end": 713, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 904, "end": 983, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 827, "end": 865, "name": "a" } }, { "type": "contig", "fields": { "start": 178, "end": 719, "name": "b" } }, { "type": "contig", "fields": { "start": 39, "end": 177, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 39, "end": 177, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 178, "end": 719, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 178, "end": 719, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 827, "end": 865, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 39, "end": 177, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 827, "end": 865, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 178, "end": 719, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 827, "end": 865, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 937, "end": 986, "name": "a" } }, { "type": "contig", "fields": { "start": 878, "end": 936, "name": "b" } }, { "type": "contig", "fields": { "start": 235, "end": 881, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 
878, "end": 936, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 937, "end": 986, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 235, "end": 881, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 878, "end": 936, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 235, "end": 881, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 937, "end": 986, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 878, "end": 936, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 937, "end": 986, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 309, "end": 686, "name": "a" } }, { "type": "contig", "fields": { "start": 82, "end": 308, "name": "b" } }, { "type": "contig", "fields": { "start": 24, "end": 81, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 82, "end": 308, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 309, "end": 686, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 81, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 82, "end": 308, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 81, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 309, "end": 686, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 82, "end": 308, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 309, "end": 686, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 821, "end": 923, "name": "a" } }, { "type": "contig", "fields": { "start": 37, "end": 820, "name": "b" } }, { "type": "contig", "fields": { "start": 24, "end": 921, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", 
"fields": { "start": 37, "end": 820, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 821, "end": 923, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 921, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 821, "end": 923, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 921, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 37, "end": 820, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 37, "end": 820, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 821, "end": 923, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 219, "end": 718, "name": "a" } }, { "type": "contig", "fields": { "start": 186, "end": 218, "name": "b" } }, { "type": "contig", "fields": { "start": 89, "end": 218, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 186, "end": 218, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 219, "end": 718, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 89, "end": 218, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 219, "end": 718, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 89, "end": 218, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 186, "end": 218, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 186, "end": 218, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 219, "end": 718, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 684, "end": 943, "name": "a" } }, { "type": "contig", "fields": { "start": 575, "end": 670, "name": "b" } }, { "type": "contig", "fields": { "start": 344, "end": 382, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { 
"type": "contig", "fields": { "start": 575, "end": 670, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 684, "end": 943, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 344, "end": 382, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 684, "end": 943, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 344, "end": 382, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 575, "end": 670, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 575, "end": 670, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 684, "end": 943, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 802, "end": 971, "name": "a" } }, { "type": "contig", "fields": { "start": 447, "end": 801, "name": "b" } }, { "type": "contig", "fields": { "start": 364, "end": 390, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 447, "end": 801, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 802, "end": 971, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 364, "end": 390, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 802, "end": 971, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 364, "end": 390, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 447, "end": 801, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 447, "end": 801, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 802, "end": 971, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 618, "end": 832, "name": "a" } }, { "type": "contig", "fields": { "start": 241, "end": 348, "name": "b" } }, { "type": "contig", "fields": { "start": 66, "end": 617, "name": "c" } }, { 
"type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 66, "end": 617, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 618, "end": 832, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 66, "end": 617, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 241, "end": 348, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 241, "end": 348, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 618, "end": 832, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 66, "end": 617, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 618, "end": 832, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 268, "end": 690, "name": "a" } }, { "type": "contig", "fields": { "start": 265, "end": 266, "name": "b" } }, { "type": "contig", "fields": { "start": 179, "end": 267, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 179, "end": 267, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 268, "end": 690, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 265, "end": 266, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 268, "end": 690, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 265, "end": 266, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 268, "end": 690, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 179, "end": 267, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 268, "end": 690, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 28, "end": 978, "name": "a" } }, { "type": "contig", "fields": { "start": 3, "end": 5, "name": "b" } }, { "type": "contig", "fields": { "start": 0, "end": 8, 
"name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 8, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 3, "end": 5, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 5, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 28, "end": 978, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 5, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 28, "end": 978, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 8, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 28, "end": 978, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 503, "end": 659, "name": "a" } }, { "type": "contig", "fields": { "start": 417, "end": 491, "name": "b" } }, { "type": "contig", "fields": { "start": 6, "end": 416, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 416, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 417, "end": 491, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 417, "end": 491, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 503, "end": 659, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 417, "end": 491, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 503, "end": 659, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 416, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 503, "end": 659, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 932, "end": 967, "name": "a" } }, { "type": "contig", "fields": { "start": 366, "end": 547, "name": "b" } }, { "type": "contig", "fields": { "start": 46, "end": 748, "name": 
"c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 366, "end": 547, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 932, "end": 967, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 46, "end": 748, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 932, "end": 967, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 366, "end": 547, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 932, "end": 967, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 46, "end": 748, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 932, "end": 967, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 296, "end": 809, "name": "a" } }, { "type": "contig", "fields": { "start": 223, "end": 295, "name": "b" } }, { "type": "contig", "fields": { "start": 61, "end": 277, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 223, "end": 295, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 296, "end": 809, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 61, "end": 277, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 296, "end": 809, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 223, "end": 295, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 296, "end": 809, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 61, "end": 277, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 296, "end": 809, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 584, "end": 776, "name": "a" } }, { "type": "contig", "fields": { "start": 32, "end": 371, "name": "b" } }, { "type": "contig", "fields": { "start": 8, 
"end": 414, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 414, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 32, "end": 371, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 414, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 584, "end": 776, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 32, "end": 371, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 584, "end": 776, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 414, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 584, "end": 776, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 793, "end": 988, "name": "a" } }, { "type": "contig", "fields": { "start": 751, "end": 766, "name": "b" } }, { "type": "contig", "fields": { "start": 640, "end": 750, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 640, "end": 750, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 751, "end": 766, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 640, "end": 750, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 793, "end": 988, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 751, "end": 766, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 793, "end": 988, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 640, "end": 750, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 793, "end": 988, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 264, "end": 348, "name": "a" } }, { "type": "contig", "fields": { "start": 139, "end": 263, "name": "b" } }, { "type": "contig", "fields": { 
"start": 24, "end": 263, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 139, "end": 263, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 264, "end": 348, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 263, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 264, "end": 348, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 139, "end": 263, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 264, "end": 348, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 263, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 264, "end": 348, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 46, "end": 161, "name": "a" } }, { "type": "contig", "fields": { "start": 45, "end": 45, "name": "b" } }, { "type": "contig", "fields": { "start": 23, "end": 46, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 45, "end": 45, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 46, "end": 161, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 23, "end": 46, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 45, "end": 45, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 45, "end": 45, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 46, "end": 161, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 23, "end": 46, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 46, "end": 161, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 68, "end": 511, "name": "a" } }, { "type": "contig", "fields": { "start": 43, "end": 58, "name": "b" } }, { "type": "contig", "fields": { 
"start": 0, "end": 67, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 67, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 68, "end": 511, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 67, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 43, "end": 58, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 43, "end": 58, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 68, "end": 511, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 67, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 68, "end": 511, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 312, "end": 971, "name": "a" } }, { "type": "contig", "fields": { "start": 52, "end": 88, "name": "b" } }, { "type": "contig", "fields": { "start": 9, "end": 40, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 52, "end": 88, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 312, "end": 971, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 40, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 52, "end": 88, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 52, "end": 88, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 312, "end": 971, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 40, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 312, "end": 971, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 575, "end": 761, "name": "a" } }, { "type": "contig", "fields": { "start": 236, "end": 574, "name": "b" } }, { "type": "contig", "fields": { "start": 165, 
"end": 203, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 236, "end": 574, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 575, "end": 761, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 165, "end": 203, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 236, "end": 574, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 236, "end": 574, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 575, "end": 761, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 165, "end": 203, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 575, "end": 761, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 406, "end": 942, "name": "a" } }, { "type": "contig", "fields": { "start": 234, "end": 306, "name": "b" } }, { "type": "contig", "fields": { "start": 214, "end": 215, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 214, "end": 215, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 406, "end": 942, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 214, "end": 215, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 234, "end": 306, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 234, "end": 306, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 406, "end": 942, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 214, "end": 215, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 406, "end": 942, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 161, "end": 596, "name": "a" } }, { "type": "contig", "fields": { "start": 96, "end": 160, "name": "b" } }, { "type": "contig", 
"fields": { "start": 77, "end": 95, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 96, "end": 160, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 161, "end": 596, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 77, "end": 95, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 96, "end": 160, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 96, "end": 160, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 161, "end": 596, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 77, "end": 95, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 161, "end": 596, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 927, "end": 966, "name": "a" } }, { "type": "contig", "fields": { "start": 927, "end": 927, "name": "b" } }, { "type": "contig", "fields": { "start": 814, "end": 926, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 814, "end": 926, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 927, "end": 966, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 814, "end": 926, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 927, "end": 927, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 927, "end": 927, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 927, "end": 966, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 814, "end": 926, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 927, "end": 966, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 909, "end": 909, "name": "a" } }, { "type": "contig", "fields": { "start": 78, "end": 945, "name": "b" } }, { "type": 
"contig", "fields": { "start": 54, "end": 77, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 54, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 78, "end": 945, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 78, "end": 945, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 909, "end": 909, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 54, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 909, "end": 909, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 54, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 909, "end": 909, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 734, "end": 777, "name": "a" } }, { "type": "contig", "fields": { "start": 287, "end": 377, "name": "b" } }, { "type": "contig", "fields": { "start": 23, "end": 422, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 23, "end": 422, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 287, "end": 377, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 287, "end": 377, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 734, "end": 777, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 23, "end": 422, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 734, "end": 777, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 23, "end": 422, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 734, "end": 777, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 54, "end": 688, "name": "a" } }, { "type": "contig", "fields": { "start": 32, "end": 33, "name": "b" } 
}, { "type": "contig", "fields": { "start": 14, "end": 31, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 14, "end": 31, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 32, "end": 33, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 32, "end": 33, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 54, "end": 688, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 14, "end": 31, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 54, "end": 688, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 14, "end": 31, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 54, "end": 688, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 301, "end": 551, "name": "a" } }, { "type": "contig", "fields": { "start": 106, "end": 632, "name": "b" } }, { "type": "contig", "fields": { "start": 26, "end": 258, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 106, "end": 632, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 301, "end": 551, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 26, "end": 258, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 106, "end": 632, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 26, "end": 258, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 301, "end": 551, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 26, "end": 258, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 301, "end": 551, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 739, "end": 753, "name": "a" } }, { "type": "contig", "fields": { "start": 294, "end": 738, 
"name": "b" } }, { "type": "contig", "fields": { "start": 15, "end": 677, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 294, "end": 738, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 739, "end": 753, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 15, "end": 677, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 294, "end": 738, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 15, "end": 677, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 739, "end": 753, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 15, "end": 677, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 739, "end": 753, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 25, "end": 788, "name": "a" } }, { "type": "contig", "fields": { "start": 12, "end": 24, "name": "b" } }, { "type": "contig", "fields": { "start": 0, "end": 11, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 24, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 25, "end": 788, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 11, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 12, "end": 24, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 11, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 25, "end": 788, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 11, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 25, "end": 788, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 583, "end": 649, "name": "a" } }, { "type": "contig", "fields": { "start": 406, "end": 
975, "name": "b" } }, { "type": "contig", "fields": { "start": 230, "end": 582, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 230, "end": 582, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 583, "end": 649, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 406, "end": 975, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 583, "end": 649, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 230, "end": 582, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 406, "end": 975, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 230, "end": 582, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 583, "end": 649, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 849, "end": 883, "name": "a" } }, { "type": "contig", "fields": { "start": 612, "end": 848, "name": "b" } }, { "type": "contig", "fields": { "start": 11, "end": 848, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 612, "end": 848, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 849, "end": 883, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 11, "end": 848, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 849, "end": 883, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 11, "end": 848, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 612, "end": 848, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 11, "end": 848, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 849, "end": 883, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 905, "end": 960, "name": "a" } }, { "type": "contig", "fields": { 
"start": 240, "end": 961, "name": "b" } }, { "type": "contig", "fields": { "start": 96, "end": 168, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 240, "end": 961, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 905, "end": 960, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 96, "end": 168, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 905, "end": 960, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 96, "end": 168, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 240, "end": 961, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 96, "end": 168, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 905, "end": 960, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 950, "end": 997, "name": "a" } }, { "type": "contig", "fields": { "start": 183, "end": 630, "name": "b" } }, { "type": "contig", "fields": { "start": 31, "end": 87, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 183, "end": 630, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 997, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 31, "end": 87, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 997, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 31, "end": 87, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 183, "end": 630, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 31, "end": 87, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 997, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 916, "end": 936, "name": "a" } }, { "type": 
"contig", "fields": { "start": 74, "end": 915, "name": "b" } }, { "type": "contig", "fields": { "start": 57, "end": 68, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 74, "end": 915, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 916, "end": 936, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 57, "end": 68, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 916, "end": 936, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 57, "end": 68, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 74, "end": 915, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 57, "end": 68, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 916, "end": 936, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 508, "end": 897, "name": "a" } }, { "type": "contig", "fields": { "start": 242, "end": 502, "name": "b" } }, { "type": "contig", "fields": { "start": 91, "end": 241, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 91, "end": 241, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 242, "end": 502, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 242, "end": 502, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 508, "end": 897, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 242, "end": 502, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 508, "end": 897, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 91, "end": 241, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 242, "end": 502, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 404, "end": 565, "name": "a" } 
}, { "type": "contig", "fields": { "start": 9, "end": 164, "name": "b" } }, { "type": "contig", "fields": { "start": 7, "end": 9, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 164, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 404, "end": 565, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 7, "end": 9, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 404, "end": 565, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 164, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 404, "end": 565, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 7, "end": 9, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 9, "end": 164, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 704, "end": 975, "name": "a" } }, { "type": "contig", "fields": { "start": 78, "end": 703, "name": "b" } }, { "type": "contig", "fields": { "start": 77, "end": 77, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 78, "end": 703, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 704, "end": 975, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 77, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 704, "end": 975, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 78, "end": 703, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 704, "end": 975, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 77, "end": 77, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 78, "end": 703, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 838, "end": 910, "name": "a" } }, { 
"type": "contig", "fields": { "start": 666, "end": 767, "name": "b" } }, { "type": "contig", "fields": { "start": 660, "end": 665, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 660, "end": 665, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 666, "end": 767, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 660, "end": 665, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 838, "end": 910, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 666, "end": 767, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 838, "end": 910, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 660, "end": 665, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 666, "end": 767, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 130, "end": 888, "name": "a" } }, { "type": "contig", "fields": { "start": 129, "end": 129, "name": "b" } }, { "type": "contig", "fields": { "start": 70, "end": 129, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 129, "end": 129, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 130, "end": 888, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 70, "end": 129, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 130, "end": 888, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 129, "end": 129, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 130, "end": 888, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 70, "end": 129, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 129, "end": 129, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 276, "end": 436, 
"name": "a" } }, { "type": "contig", "fields": { "start": 159, "end": 226, "name": "b" } }, { "type": "contig", "fields": { "start": 137, "end": 154, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 159, "end": 226, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 276, "end": 436, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 137, "end": 154, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 159, "end": 226, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 159, "end": 226, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 276, "end": 436, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 137, "end": 154, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 159, "end": 226, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 912, "end": 951, "name": "a" } }, { "type": "contig", "fields": { "start": 731, "end": 911, "name": "b" } }, { "type": "contig", "fields": { "start": 569, "end": 658, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 731, "end": 911, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 912, "end": 951, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 569, "end": 658, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 731, "end": 911, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 731, "end": 911, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 912, "end": 951, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 569, "end": 658, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 731, "end": 911, "name": "b" } } } }] +[{ "type": "contig", "fields": { 
"start": 522, "end": 593, "name": "a" } }, { "type": "contig", "fields": { "start": 378, "end": 414, "name": "b" } }, { "type": "contig", "fields": { "start": 315, "end": 325, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 315, "end": 325, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 522, "end": 593, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 315, "end": 325, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 378, "end": 414, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 378, "end": 414, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 522, "end": 593, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 315, "end": 325, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 378, "end": 414, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 420, "end": 450, "name": "a" } }, { "type": "contig", "fields": { "start": 3, "end": 419, "name": "b" } }, { "type": "contig", "fields": { "start": 0, "end": 2, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 419, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 420, "end": 450, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 2, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 3, "end": 419, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 419, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 420, "end": 450, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 2, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 3, "end": 419, "name": "b" } } } }] +[{ "type": "contig", "fields": { 
"start": 308, "end": 679, "name": "a" } }, { "type": "contig", "fields": { "start": 308, "end": 308, "name": "b" } }, { "type": "contig", "fields": { "start": 111, "end": 307, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 111, "end": 307, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 308, "end": 679, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 111, "end": 307, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 308, "end": 308, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 308, "end": 308, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 308, "end": 679, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 111, "end": 307, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 308, "end": 308, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 239, "end": 779, "name": "a" } }, { "type": "contig", "fields": { "start": 45, "end": 386, "name": "b" } }, { "type": "contig", "fields": { "start": 44, "end": 44, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 44, "end": 44, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 45, "end": 386, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 45, "end": 386, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 239, "end": 779, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 44, "end": 44, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 239, "end": 779, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 44, "end": 44, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 45, "end": 386, "name": "b" } } } }] +[{ "type": "contig", 
"fields": { "start": 25, "end": 407, "name": "a" } }, { "type": "contig", "fields": { "start": 9, "end": 13, "name": "b" } }, { "type": "contig", "fields": { "start": 5, "end": 8, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 8, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 9, "end": 13, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 9, "end": 13, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 25, "end": 407, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 8, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 25, "end": 407, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 8, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 9, "end": 13, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 174, "end": 175, "name": "a" } }, { "type": "contig", "fields": { "start": 103, "end": 173, "name": "b" } }, { "type": "contig", "fields": { "start": 34, "end": 102, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 103, "end": 173, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 174, "end": 175, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 34, "end": 102, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 103, "end": 173, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 34, "end": 102, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 174, "end": 175, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 34, "end": 102, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 103, "end": 173, "name": "b" } } } }] +[{ "type": "contig", 
"fields": { "start": 463, "end": 474, "name": "a" } }, { "type": "contig", "fields": { "start": 462, "end": 921, "name": "b" } }, { "type": "contig", "fields": { "start": 78, "end": 462, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 78, "end": 462, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 463, "end": 474, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 462, "end": 921, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 463, "end": 474, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 78, "end": 462, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 463, "end": 474, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 78, "end": 462, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 462, "end": 921, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 110, "end": 850, "name": "a" } }, { "type": "contig", "fields": { "start": 19, "end": 684, "name": "b" } }, { "type": "contig", "fields": { "start": 6, "end": 18, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 18, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 19, "end": 684, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 19, "end": 684, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 110, "end": 850, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 18, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 110, "end": 850, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 18, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 19, "end": 684, "name": "b" } } } }] +[{ "type": "contig", 
"fields": { "start": 385, "end": 576, "name": "a" } }, { "type": "contig", "fields": { "start": 307, "end": 346, "name": "b" } }, { "type": "contig", "fields": { "start": 216, "end": 306, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 216, "end": 306, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 307, "end": 346, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 307, "end": 346, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 385, "end": 576, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 216, "end": 306, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 385, "end": 576, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 216, "end": 306, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 307, "end": 346, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 593, "end": 980, "name": "a" } }, { "type": "contig", "fields": { "start": 26, "end": 272, "name": "b" } }, { "type": "contig", "fields": { "start": 1, "end": 24, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 26, "end": 272, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 593, "end": 980, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 26, "end": 272, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 593, "end": 980, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 24, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 593, "end": 980, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 1, "end": 24, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 26, "end": 272, "name": "b" } } } }] +[{ "type": 
"contig", "fields": { "start": 781, "end": 819, "name": "a" } }, { "type": "contig", "fields": { "start": 293, "end": 780, "name": "b" } }, { "type": "contig", "fields": { "start": 76, "end": 109, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 293, "end": 780, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 781, "end": 819, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 293, "end": 780, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 781, "end": 819, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 76, "end": 109, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 781, "end": 819, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 76, "end": 109, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 293, "end": 780, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 882, "end": 927, "name": "a" } }, { "type": "contig", "fields": { "start": 519, "end": 621, "name": "b" } }, { "type": "contig", "fields": { "start": 292, "end": 359, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 292, "end": 359, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 882, "end": 927, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 519, "end": 621, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 882, "end": 927, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 292, "end": 359, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 882, "end": 927, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 292, "end": 359, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 519, "end": 621, "name": "b" } } } }] 
+[{ "type": "contig", "fields": { "start": 128, "end": 665, "name": "a" } }, { "type": "contig", "fields": { "start": 127, "end": 127, "name": "b" } }, { "type": "contig", "fields": { "start": 86, "end": 127, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 86, "end": 127, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 128, "end": 665, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 127, "end": 127, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 128, "end": 665, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 86, "end": 127, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 128, "end": 665, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 86, "end": 127, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 127, "end": 127, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 361, "end": 945, "name": "a" } }, { "type": "contig", "fields": { "start": 117, "end": 352, "name": "b" } }, { "type": "contig", "fields": { "start": 91, "end": 110, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 91, "end": 110, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 117, "end": 352, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 117, "end": 352, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 361, "end": 945, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 91, "end": 110, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 361, "end": 945, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 91, "end": 110, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 117, "end": 352, "name": "b" } 
} } }] +[{ "type": "contig", "fields": { "start": 76, "end": 739, "name": "a" } }, { "type": "contig", "fields": { "start": 47, "end": 63, "name": "b" } }, { "type": "contig", "fields": { "start": 21, "end": 46, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 21, "end": 46, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 47, "end": 63, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 47, "end": 63, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 76, "end": 739, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 21, "end": 46, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 76, "end": 739, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 21, "end": 46, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 47, "end": 63, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 783, "end": 801, "name": "a" } }, { "type": "contig", "fields": { "start": 304, "end": 922, "name": "b" } }, { "type": "contig", "fields": { "start": 222, "end": 276, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 304, "end": 922, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 783, "end": 801, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 222, "end": 276, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 783, "end": 801, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 222, "end": 276, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 783, "end": 801, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 222, "end": 276, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 304, "end": 922, "name": "b" } } 
} }] +[{ "type": "contig", "fields": { "start": 727, "end": 973, "name": "a" } }, { "type": "contig", "fields": { "start": 77, "end": 668, "name": "b" } }, { "type": "contig", "fields": { "start": 2, "end": 73, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 77, "end": 668, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 727, "end": 973, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 73, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 727, "end": 973, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 73, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 727, "end": 973, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 73, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 77, "end": 668, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 328, "end": 740, "name": "a" } }, { "type": "contig", "fields": { "start": 50, "end": 327, "name": "b" } }, { "type": "contig", "fields": { "start": 3, "end": 41, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 50, "end": 327, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 328, "end": 740, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 41, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 328, "end": 740, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 41, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 328, "end": 740, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 3, "end": 41, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 50, "end": 327, "name": "b" } } } }] 
+[{ "type": "contig", "fields": { "start": 54, "end": 916, "name": "a" } }, { "type": "contig", "fields": { "start": 46, "end": 151, "name": "b" } }, { "type": "contig", "fields": { "start": 12, "end": 45, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 45, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 46, "end": 151, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 45, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 54, "end": 916, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 45, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 54, "end": 916, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 12, "end": 45, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 46, "end": 151, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 748, "end": 939, "name": "a" } }, { "type": "contig", "fields": { "start": 747, "end": 747, "name": "b" } }, { "type": "contig", "fields": { "start": 297, "end": 747, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 747, "end": 747, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 748, "end": 939, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 297, "end": 747, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 748, "end": 939, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 297, "end": 747, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 748, "end": 939, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 297, "end": 747, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 747, "end": 747, "name": "b" } } 
} }] +[{ "type": "contig", "fields": { "start": 458, "end": 945, "name": "a" } }, { "type": "contig", "fields": { "start": 262, "end": 466, "name": "b" } }, { "type": "contig", "fields": { "start": 34, "end": 186, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 262, "end": 466, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 458, "end": 945, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 34, "end": 186, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 262, "end": 466, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 34, "end": 186, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 458, "end": 945, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 34, "end": 186, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 262, "end": 466, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 731, "end": 757, "name": "a" } }, { "type": "contig", "fields": { "start": 654, "end": 675, "name": "b" } }, { "type": "contig", "fields": { "start": 355, "end": 472, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 654, "end": 675, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 731, "end": 757, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 355, "end": 472, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 654, "end": 675, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 355, "end": 472, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 731, "end": 757, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 355, "end": 472, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 654, "end": 
675, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 970, "end": 970, "name": "a" } }, { "type": "contig", "fields": { "start": 10, "end": 969, "name": "b" } }, { "type": "contig", "fields": { "start": 6, "end": 7, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 10, "end": 969, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 970, "end": 970, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 7, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 10, "end": 969, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 7, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 970, "end": 970, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 7, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 10, "end": 969, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 164, "end": 171, "name": "a" } }, { "type": "contig", "fields": { "start": 103, "end": 128, "name": "b" } }, { "type": "contig", "fields": { "start": 38, "end": 54, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 38, "end": 54, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 164, "end": 171, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 38, "end": 54, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 103, "end": 128, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 38, "end": 54, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 164, "end": 171, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 38, "end": 54, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 103, "end": 128, 
"name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 844, "end": 927, "name": "a" } }, { "type": "contig", "fields": { "start": 862, "end": 881, "name": "b" } }, { "type": "contig", "fields": { "start": 280, "end": 843, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 280, "end": 843, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 844, "end": 927, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 280, "end": 843, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 862, "end": 881, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 280, "end": 843, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 844, "end": 927, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 280, "end": 843, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 862, "end": 881, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 372, "end": 845, "name": "a" } }, { "type": "contig", "fields": { "start": 211, "end": 371, "name": "b" } }, { "type": "contig", "fields": { "start": 139, "end": 210, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 211, "end": 371, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 372, "end": 845, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 139, "end": 210, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 211, "end": 371, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 139, "end": 210, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 372, "end": 845, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 139, "end": 210, "name": "c" } }, "b": { "type": "contig", "fields": { 
"start": 211, "end": 371, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 144, "end": 728, "name": "a" } }, { "type": "contig", "fields": { "start": 144, "end": 877, "name": "b" } }, { "type": "contig", "fields": { "start": 132, "end": 143, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 132, "end": 143, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 144, "end": 728, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 132, "end": 143, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 144, "end": 877, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 132, "end": 143, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 144, "end": 728, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 132, "end": 143, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 144, "end": 877, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 709, "end": 751, "name": "a" } }, { "type": "contig", "fields": { "start": 496, "end": 809, "name": "b" } }, { "type": "contig", "fields": { "start": 291, "end": 356, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 496, "end": 809, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 709, "end": 751, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 291, "end": 356, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 709, "end": 751, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 291, "end": 356, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 496, "end": 809, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 291, "end": 356, "name": "c" } }, "b": { 
"type": "contig", "fields": { "start": 496, "end": 809, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 647, "end": 745, "name": "a" } }, { "type": "contig", "fields": { "start": 561, "end": 561, "name": "b" } }, { "type": "contig", "fields": { "start": 151, "end": 167, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 561, "end": 561, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 647, "end": 745, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 151, "end": 167, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 647, "end": 745, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 151, "end": 167, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 561, "end": 561, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 151, "end": 167, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 561, "end": 561, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 165, "end": 417, "name": "a" } }, { "type": "contig", "fields": { "start": 115, "end": 164, "name": "b" } }, { "type": "contig", "fields": { "start": 93, "end": 96, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 115, "end": 164, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 165, "end": 417, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 93, "end": 96, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 165, "end": 417, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 93, "end": 96, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 115, "end": 164, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 93, "end": 
96, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 115, "end": 164, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 957, "end": 981, "name": "a" } }, { "type": "contig", "fields": { "start": 519, "end": 596, "name": "b" } }, { "type": "contig", "fields": { "start": 436, "end": 956, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 436, "end": 956, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 957, "end": 981, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 436, "end": 956, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 519, "end": 596, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 519, "end": 596, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 957, "end": 981, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 519, "end": 596, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 957, "end": 981, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 436, "end": 956, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 957, "end": 981, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 526, "end": 895, "name": "a" } }, { "type": "contig", "fields": { "start": 66, "end": 408, "name": "b" } }, { "type": "contig", "fields": { "start": 37, "end": 278, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 37, "end": 278, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 66, "end": 408, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 66, "end": 408, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 526, "end": 895, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", 
"fields": { "start": 37, "end": 278, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 526, "end": 895, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 66, "end": 408, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 526, "end": 895, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 37, "end": 278, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 526, "end": 895, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 859, "end": 932, "name": "a" } }, { "type": "contig", "fields": { "start": 85, "end": 609, "name": "b" } }, { "type": "contig", "fields": { "start": 2, "end": 84, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 84, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 85, "end": 609, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 85, "end": 609, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 859, "end": 932, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 84, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 859, "end": 932, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 85, "end": 609, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 859, "end": 932, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 84, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 859, "end": 932, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 374, "end": 842, "name": "a" } }, { "type": "contig", "fields": { "start": 82, "end": 373, "name": "b" } }, { "type": "contig", "fields": { "start": 25, "end": 244, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": 
"contig", "fields": { "start": 82, "end": 373, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 374, "end": 842, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 25, "end": 244, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 82, "end": 373, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 25, "end": 244, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 374, "end": 842, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 82, "end": 373, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 374, "end": 842, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 25, "end": 244, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 374, "end": 842, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 695, "end": 930, "name": "a" } }, { "type": "contig", "fields": { "start": 174, "end": 694, "name": "b" } }, { "type": "contig", "fields": { "start": 162, "end": 173, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 174, "end": 694, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 695, "end": 930, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 162, "end": 173, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 174, "end": 694, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 162, "end": 173, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 695, "end": 930, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 174, "end": 694, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 695, "end": 930, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", 
"fields": { "start": 162, "end": 173, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 695, "end": 930, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 526, "end": 605, "name": "a" } }, { "type": "contig", "fields": { "start": 392, "end": 525, "name": "b" } }, { "type": "contig", "fields": { "start": 105, "end": 525, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 392, "end": 525, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 526, "end": 605, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 105, "end": 525, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 526, "end": 605, "name": "a" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 105, "end": 525, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 392, "end": 525, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 392, "end": 525, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 526, "end": 605, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 105, "end": 525, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 526, "end": 605, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 201, "end": 724, "name": "a" } }, { "type": "contig", "fields": { "start": 20, "end": 193, "name": "b" } }, { "type": "contig", "fields": { "start": 6, "end": 18, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 193, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 201, "end": 724, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 18, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 201, "end": 724, "name": "a" } } } }, { "type": "nooverlap", "fields": { 
"a": { "type": "contig", "fields": { "start": 6, "end": 18, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 20, "end": 193, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 193, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 201, "end": 724, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 18, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 201, "end": 724, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 352, "end": 433, "name": "a" } }, { "type": "contig", "fields": { "start": 215, "end": 351, "name": "b" } }, { "type": "contig", "fields": { "start": 170, "end": 208, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 215, "end": 351, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 352, "end": 433, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 170, "end": 208, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 352, "end": 433, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 170, "end": 208, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 215, "end": 351, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 215, "end": 351, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 352, "end": 433, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 170, "end": 208, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 352, "end": 433, "name": "a" } } } }] +[{ "type": "contig", "fields": { "start": 315, "end": 808, "name": "a" } }, { "type": "contig", "fields": { "start": 25, "end": 211, "name": "b" } }, { "type": "contig", "fields": { "start": 5, "end": 24, "name": "c" } }, { "type": "tangent", 
"fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 24, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 25, "end": 211, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 25, "end": 211, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 315, "end": 808, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 24, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 315, "end": 808, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 25, "end": 211, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 315, "end": 808, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 24, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 25, "end": 211, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 329, "end": 667, "name": "a" } }, { "type": "contig", "fields": { "start": 20, "end": 328, "name": "b" } }, { "type": "contig", "fields": { "start": 5, "end": 19, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 328, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 329, "end": 667, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 19, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 20, "end": 328, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 5, "end": 19, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 329, "end": 667, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 20, "end": 328, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 329, "end": 667, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": 
"contig", "fields": { "start": 5, "end": 19, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 20, "end": 328, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 821, "end": 986, "name": "a" } }, { "type": "contig", "fields": { "start": 514, "end": 661, "name": "b" } }, { "type": "contig", "fields": { "start": 86, "end": 425, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 514, "end": 661, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 821, "end": 986, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 86, "end": 425, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 821, "end": 986, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 86, "end": 425, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 514, "end": 661, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 514, "end": 661, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 821, "end": 986, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 86, "end": 425, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 514, "end": 661, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 604, "end": 714, "name": "a" } }, { "type": "contig", "fields": { "start": 2, "end": 603, "name": "b" } }, { "type": "contig", "fields": { "start": 0, "end": 0, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 603, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 604, "end": 714, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 0, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 604, "end": 714, "name": "a" } } } }, { "type": "nooverlap", "fields": { 
"a": { "type": "contig", "fields": { "start": 0, "end": 0, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 2, "end": 603, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 2, "end": 603, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 604, "end": 714, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 0, "end": 0, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 2, "end": 603, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 309, "end": 781, "name": "a" } }, { "type": "contig", "fields": { "start": 204, "end": 245, "name": "b" } }, { "type": "contig", "fields": { "start": 127, "end": 203, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 127, "end": 203, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 204, "end": 245, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 204, "end": 245, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 309, "end": 781, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 204, "end": 245, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 309, "end": 781, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 127, "end": 203, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 309, "end": 781, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 127, "end": 203, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 204, "end": 245, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 77, "end": 383, "name": "a" } }, { "type": "contig", "fields": { "start": 55, "end": 57, "name": "b" } }, { "type": "contig", "fields": { "start": 39, "end": 49, "name": "c" } }, { "type": "nooverlap", "fields": { 
"a": { "type": "contig", "fields": { "start": 55, "end": 57, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 77, "end": 383, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 39, "end": 49, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 77, "end": 383, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 55, "end": 57, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 77, "end": 383, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 39, "end": 49, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 77, "end": 383, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 39, "end": 49, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 55, "end": 57, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 45, "end": 474, "name": "a" } }, { "type": "contig", "fields": { "start": 27, "end": 44, "name": "b" } }, { "type": "contig", "fields": { "start": 6, "end": 21, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 27, "end": 44, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 45, "end": 474, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 21, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 45, "end": 474, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 27, "end": 44, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 45, "end": 474, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, "end": 21, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 45, "end": 474, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 6, 
"end": 21, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 27, "end": 44, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 34, "end": 319, "name": "a" } }, { "type": "contig", "fields": { "start": 24, "end": 34, "name": "b" } }, { "type": "contig", "fields": { "start": 19, "end": 23, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 19, "end": 23, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 24, "end": 34, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 19, "end": 23, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 34, "end": 319, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 34, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 34, "end": 319, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 19, "end": 23, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 34, "end": 319, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 19, "end": 23, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 24, "end": 34, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 953, "end": 995, "name": "a" } }, { "type": "contig", "fields": { "start": 952, "end": 952, "name": "b" } }, { "type": "contig", "fields": { "start": 386, "end": 952, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 952, "end": 952, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 953, "end": 995, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 386, "end": 952, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 953, "end": 995, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 952, 
"end": 952, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 953, "end": 995, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 386, "end": 952, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 953, "end": 995, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 386, "end": 952, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 952, "end": 952, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 864, "end": 940, "name": "a" } }, { "type": "contig", "fields": { "start": 554, "end": 770, "name": "b" } }, { "type": "contig", "fields": { "start": 273, "end": 313, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 554, "end": 770, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 864, "end": 940, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 273, "end": 313, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 554, "end": 770, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 554, "end": 770, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 864, "end": 940, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 273, "end": 313, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 864, "end": 940, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 273, "end": 313, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 554, "end": 770, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 692, "end": 807, "name": "a" } }, { "type": "contig", "fields": { "start": 656, "end": 691, "name": "b" } }, { "type": "contig", "fields": { "start": 234, "end": 460, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", 
"fields": { "start": 656, "end": 691, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 692, "end": 807, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 234, "end": 460, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 656, "end": 691, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 656, "end": 691, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 692, "end": 807, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 234, "end": 460, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 692, "end": 807, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 234, "end": 460, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 656, "end": 691, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 236, "end": 384, "name": "a" } }, { "type": "contig", "fields": { "start": 213, "end": 215, "name": "b" } }, { "type": "contig", "fields": { "start": 4, "end": 59, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 4, "end": 59, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 236, "end": 384, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 4, "end": 59, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 213, "end": 215, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 213, "end": 215, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 236, "end": 384, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 4, "end": 59, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 236, "end": 384, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 4, 
"end": 59, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 213, "end": 215, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 188, "end": 885, "name": "a" } }, { "type": "contig", "fields": { "start": 67, "end": 187, "name": "b" } }, { "type": "contig", "fields": { "start": 24, "end": 66, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 67, "end": 187, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 188, "end": 885, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 66, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 67, "end": 187, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 67, "end": 187, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 188, "end": 885, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 66, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 188, "end": 885, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 24, "end": 66, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 67, "end": 187, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 419, "end": 653, "name": "a" } }, { "type": "contig", "fields": { "start": 419, "end": 419, "name": "b" } }, { "type": "contig", "fields": { "start": 405, "end": 418, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 405, "end": 418, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 419, "end": 653, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 405, "end": 418, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 419, "end": 419, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 
419, "end": 419, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 419, "end": 653, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 405, "end": 418, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 419, "end": 653, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 405, "end": 418, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 419, "end": 419, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 345, "end": 700, "name": "a" } }, { "type": "contig", "fields": { "start": 256, "end": 823, "name": "b" } }, { "type": "contig", "fields": { "start": 188, "end": 255, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 188, "end": 255, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 256, "end": 823, "name": "b" } } } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 256, "end": 823, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 345, "end": 700, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 188, "end": 255, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 345, "end": 700, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 188, "end": 255, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 345, "end": 700, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 188, "end": 255, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 256, "end": 823, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 901, "end": 923, "name": "a" } }, { "type": "contig", "fields": { "start": 222, "end": 368, "name": "b" } }, { "type": "contig", "fields": { "start": 171, "end": 221, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", 
"fields": { "start": 171, "end": 221, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 222, "end": 368, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 222, "end": 368, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 901, "end": 923, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 171, "end": 221, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 901, "end": 923, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 171, "end": 221, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 901, "end": 923, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 171, "end": 221, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 222, "end": 368, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 36, "end": 927, "name": "a" } }, { "type": "contig", "fields": { "start": 27, "end": 35, "name": "b" } }, { "type": "contig", "fields": { "start": 8, "end": 26, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 27, "end": 35, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 36, "end": 927, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 26, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 27, "end": 35, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 26, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 36, "end": 927, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 8, "end": 26, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 36, "end": 927, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 8, 
"end": 26, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 27, "end": 35, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 169, "end": 916, "name": "a" } }, { "type": "contig", "fields": { "start": 108, "end": 316, "name": "b" } }, { "type": "contig", "fields": { "start": 100, "end": 105, "name": "c" } }, { "type": "overlap", "fields": { "a": { "type": "contig", "fields": { "start": 108, "end": 316, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 169, "end": 916, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 100, "end": 105, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 169, "end": 916, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 100, "end": 105, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 108, "end": 316, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 100, "end": 105, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 169, "end": 916, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 100, "end": 105, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 108, "end": 316, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 291, "end": 774, "name": "a" } }, { "type": "contig", "fields": { "start": 27, "end": 150, "name": "b" } }, { "type": "contig", "fields": { "start": 22, "end": 22, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 27, "end": 150, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 291, "end": 774, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 22, "end": 22, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 291, "end": 774, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", 
"fields": { "start": 22, "end": 22, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 27, "end": 150, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 22, "end": 22, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 291, "end": 774, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 22, "end": 22, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 27, "end": 150, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 455, "end": 789, "name": "a" } }, { "type": "contig", "fields": { "start": 440, "end": 454, "name": "b" } }, { "type": "contig", "fields": { "start": 408, "end": 414, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 440, "end": 454, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 455, "end": 789, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 408, "end": 414, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 455, "end": 789, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 408, "end": 414, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 440, "end": 454, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 408, "end": 414, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 455, "end": 789, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 408, "end": 414, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 440, "end": 454, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 555, "end": 727, "name": "a" } }, { "type": "contig", "fields": { "start": 121, "end": 199, "name": "b" } }, { "type": "contig", "fields": { "start": 81, "end": 120, "name": "c" } }, { "type": "tangent", "fields": { "a": { 
"type": "contig", "fields": { "start": 81, "end": 120, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 121, "end": 199, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 121, "end": 199, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 555, "end": 727, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 81, "end": 120, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 555, "end": 727, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 121, "end": 199, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 555, "end": 727, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 81, "end": 120, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 555, "end": 727, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 81, "end": 120, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 121, "end": 199, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 700, "end": 754, "name": "a" } }, { "type": "contig", "fields": { "start": 440, "end": 699, "name": "b" } }, { "type": "contig", "fields": { "start": 260, "end": 439, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 440, "end": 699, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 700, "end": 754, "name": "a" } } } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 260, "end": 439, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 440, "end": 699, "name": "b" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 260, "end": 439, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 700, "end": 754, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": 
"contig", "fields": { "start": 440, "end": 699, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 700, "end": 754, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 260, "end": 439, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 700, "end": 754, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 260, "end": 439, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 440, "end": 699, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 950, "end": 990, "name": "a" } }, { "type": "contig", "fields": { "start": 140, "end": 750, "name": "b" } }, { "type": "contig", "fields": { "start": 49, "end": 92, "name": "c" } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 140, "end": 750, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 990, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 49, "end": 92, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 990, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 49, "end": 92, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 140, "end": 750, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 140, "end": 750, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 990, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 49, "end": 92, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 950, "end": 990, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 49, "end": 92, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 140, "end": 750, "name": "b" } } } }] +[{ "type": "contig", "fields": { "start": 676, "end": 749, 
"name": "a" } }, { "type": "contig", "fields": { "start": 67, "end": 675, "name": "b" } }, { "type": "contig", "fields": { "start": 36, "end": 53, "name": "c" } }, { "type": "tangent", "fields": { "a": { "type": "contig", "fields": { "start": 67, "end": 675, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 676, "end": 749, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 36, "end": 53, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 676, "end": 749, "name": "a" } } } }, { "type": "nooverlap", "fields": { "a": { "type": "contig", "fields": { "start": 36, "end": 53, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 67, "end": 675, "name": "b" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 67, "end": 675, "name": "b" } }, "b": { "type": "contig", "fields": { "start": 676, "end": 749, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 36, "end": 53, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 676, "end": 749, "name": "a" } } } }, { "type": "kiss", "fields": { "a": { "type": "contig", "fields": { "start": 36, "end": 53, "name": "c" } }, "b": { "type": "contig", "fields": { "start": 67, "end": 675, "name": "b" } } } }] diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py new file mode 100644 index 000000000..c36881763 --- /dev/null +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -0,0 +1,112 @@ +import pytest +import json +import os +from micall.core.contig_stitcher import GenotypedContig, AlignedContig, stitch_consensus, stitch_contigs, split_contigs_with_gaps, drop_completely_covered, combine_overlaps +from micall.utils.cigar_tools import CigarHit +from micall.utils.consensus_aligner import CigarActions +from typing import Dict, List +from collections import defaultdict + + +@pytest.fixture +def no_aligner(monkeypatch): + 
monkeypatch.setattr('micall.core.contig_stitcher.align_to_reference', lambda x: [x]) + + +def read_contigs(line): + array = json.loads(line) + contig_descriptions = [obj['fields'] for obj in array if obj['type'] == 'contig'] + for description in contig_descriptions: + start = description['start'] + end = description['end'] + name = description['name'] + length = end - start + 1 + assert length > 0 + + ref_seq = 'A' * 1000 # it does not matter + seq='C' * 10 + 'A' * length + 'T' * 10 + query = GenotypedContig( + name=name, + seq=seq, + ref_name='commonref', + group_ref='commongroup', + ref_seq=ref_seq, + match_fraction=2/3, + ) + alignment = CigarHit([(length, CigarActions.MATCH)], q_st=20, q_ei=20+length-1, r_st=start, r_ei=end) + contig = AlignedContig(query=query, alignment=alignment) + aidee = f'{start:03d}-{end:03d}' + yield {'contig': contig, 'id': aidee} + + +def get_case_descriptions(): + pwd = os.path.dirname(__file__) + jsonfile = os.path.join(pwd, "data", "contig_stitcher_fuzz_nogaps.json") + with open(jsonfile, "r", encoding="utf8") as reader: + for line in reader: + read = list(read_contigs(line)) + contigs = [x['contig'] for x in read] + ids = [x['id'] for x in read] + aidee = ','.join(ids) + yield {'contigs': contigs, 'id': aidee} + + +all_case_descriptions = list(get_case_descriptions()) +all_case_ids = [x['id'] for x in all_case_descriptions] + + +@pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) +def test_contig_number_prop(no_aligner, description): + contigs = description['contigs'] + stitched = list(stitch_consensus(contigs)) + assert len(stitched) <= len(contigs) + + +@pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) +def test_contig_number_prop2(no_aligner, description): + contigs = description['contigs'] + consensus = list(stitch_consensus(contigs)) + stitched = list(stitch_contigs(contigs)) + uncovered = list(drop_completely_covered(contigs)) + assert len(consensus) <= 
len(stitched) <= len(uncovered) <= len(contigs) + + +def test_contig_number_prop2_existential(): + # This test is just to confirm that our cases cover all sub-actions. + + contig_sets = [x['contigs'] for x in all_case_descriptions] + + assert any(len(list(stitch_contigs(contigs))) > + len(list(stitch_consensus(contigs))) + for contigs in contig_sets) + + assert any(len(list(drop_completely_covered(contigs))) > + len(list(stitch_contigs(contigs))) + for contigs in contig_sets) + + assert any(len(list(contigs)) > + len(list(drop_completely_covered(contigs))) + for contigs in contig_sets) + + +def get_all_reference_positions(contigs: List[AlignedContig]): + ret: Dict[int, int] = defaultdict(lambda: 0) + for contig in contigs: + for i in contig.alignment.coordinate_mapping.ref_to_query.domain: + ret[i] += 1 + + return ret + + +@pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) +def test_stitching_intervals_prop(no_aligner, description): + contigs = description['contigs'] + stitched = list(stitch_contigs(contigs)) + initial_positions = get_all_reference_positions(contigs) + stitched_positions = get_all_reference_positions(stitched) + + # Checks that no reference position has been lost, and no new positions "created" + assert set(initial_positions.keys()) == set(stitched_positions.keys()) + + # Checks that there are no overlaps between contigs + assert all(v == 1 for (k, v) in stitched_positions.items()) From a30ffe8a71474ff5f548799ae9fcf7935fa87e2c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 27 Nov 2023 14:00:54 -0800 Subject: [PATCH 099/485] Fix typo in merge_intervals docstring --- micall/core/contig_stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 10472715d..b69bc444c 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -341,7 +341,7 @@ def merge_intervals(intervals: List[Tuple[int, int]]) -> 
List[Tuple[int, int]]: Note that intervals are inclusive. :param intervals: A list of intervals [start, end] where 'start' and 'end' are integers. - :eturn: A list of merged intervals. + :return: A list of merged intervals. """ if not intervals: From 912eff8070a0eb5f56a9359976cee8699f6530d1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 30 Nov 2023 15:28:21 -0800 Subject: [PATCH 100/485] Add structured_logger utility module --- micall/utils/structured_logger.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 micall/utils/structured_logger.py diff --git a/micall/utils/structured_logger.py b/micall/utils/structured_logger.py new file mode 100644 index 000000000..098f4d866 --- /dev/null +++ b/micall/utils/structured_logger.py @@ -0,0 +1,21 @@ + +import logging +from typing import List, Tuple + + +LoggerName = str +structured_logs: List[Tuple[LoggerName, logging.LogRecord]] = [] + + +class InMemoryLogHandler(logging.Handler): + def __init__(self, name: str, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.name: str = name + + def emit(self, record: logging.LogRecord): + structured_logs.append((self.name, record)) + + +def register_structured_logger(logger: logging.Logger): + memory_handler = InMemoryLogHandler(logger.name) + logger.addHandler(memory_handler) From 039dfd3f524f9d9a0431307e0124436d082cb2e5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 30 Nov 2023 15:27:41 -0800 Subject: [PATCH 101/485] Add logging to contig stitcher --- micall/core/contig_stitcher.py | 53 +++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index b69bc444c..d93afc775 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -8,9 +8,15 @@ from gotoh import align_it from queue import LifoQueue from math import floor +import logging from micall.utils.cigar_tools import Cigar, 
connect_cigar_hits, CigarHit from micall.utils.consensus_aligner import CigarActions +from micall.utils.structured_logger import register_structured_logger + + +logger = logging.getLogger(__name__) +register_structured_logger(logger) @dataclass @@ -194,22 +200,39 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: def align_to_reference(contig) -> Iterable[GenotypedContig]: if contig.ref_seq is None: + logger.info("Contig %r not aligned - no reference.", contig.name, + extra={"action": "alignment", "type": "noref", "contig": contig}) yield contig return aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) if not alignments: + logger.info("Contig %r not aligned - backend choice.", contig.name, + extra={"action": "alignment", "type": "zerohits", "contig": contig}) yield contig return hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] connected = connect_cigar_hits(hits_array) + + logger.info("Contig %r aligned in %s parts.", contig.name, len(connected), + extra={"action": "alignment", "type": "hitnumber", + "contig": contig, "n": len(connected)}) + + def logpart(i, part): + logger.info("Part %r of contig %s aligned as [%s, %s]->[%s, %s].", + i, contig.name, part.q_st, part.q_ei, part.r_st, part.r_ei, + extra={"action": "alignment", "type": "hit", + "contig": contig, "part": part, "i": i}) + if len(connected) == 1: + logpart(0, connected[0]) yield AlignedContig(query=contig, alignment=connected[0]) return - for single_hit in connected: + for i, single_hit in enumerate(connected): + logpart(i, single_hit) query = GenotypedContig(name=f'part({contig.name})', seq=contig.seq, ref_name=contig.ref_name, @@ -326,6 +349,8 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: # Find overlap. If there isn't one - we are done with the current contig. 
overlapping_contig = find_overlapping_contig(current, contigs) if not overlapping_contig: + logger.info("Nothing overlaps with %r.", + current.name, extra={"action": "nooverlap", "contig": current}) yield current continue @@ -334,6 +359,13 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: contigs.remove(overlapping_contig) contigs.insert(0, new_contig) + logger.info("Stitching %r with %r results in %r at [%s,%s]->[%s,%s].", + current.name, overlapping_contig.name, + new_contig.name, new_contig.alignment.q_st, new_contig.alignment.q_ei, + new_contig.alignment.r_st, new_contig.alignment.r_ei, + extra={"action": "stitch", "result": new_contig, + "left": current, "right": overlapping_contig}) + def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]: """ @@ -400,6 +432,8 @@ def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig] covered = find_covered_contig(contigs) if covered: contigs.remove(covered) + logger.info("Droped contig %r as it is completely covered by other contigs.", + covered.name, extra={"action": "drop", "contig": covered}) else: break @@ -440,6 +474,17 @@ def try_split(contig): contigs.append(left_part) contigs.append(right_part) process_queue.put(right_part) + + logger.info("Split contig %r around its gap at [%s, %s]->[%s, %s]. 
" + "Left part: %r at [%s, %s]->[%s, %s], " + "right part: %r at [%s, %s]->[%s, %s].", + contig.name, gap.q_st, gap.q_ei, gap.r_st, gap.r_ei, + left_part.name, left_part.alignment.q_st, left_part.alignment.q_ei, + left_part.alignment.r_st, left_part.alignment.r_ei, + right_part.name, right_part.alignment.q_st, right_part.alignment.q_ei, + right_part.alignment.r_st, right_part.alignment.r_ei, + extra={"action": "splitgap", "contig": contig, + "gap": gap, "left": left_part, "right": right_part}) return process_queue: LifoQueue = LifoQueue() @@ -453,6 +498,12 @@ def try_split(contig): def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig]: + contigs = list(contigs) + for contig in contigs: + logger.info("Introduced contig %r of ref %r, group_ref %r, and length %s.", + contig.name, contig.ref_name, contig.group_ref, len(contig.seq), + extra={"action": "intro", "contig": contig}) + maybe_aligned = align_all_to_reference(contigs) # Contigs that did not align do not need any more processing From cb09d25b831ac34f8bea3c0dd3d4a65494706264 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 30 Nov 2023 15:37:05 -0800 Subject: [PATCH 102/485] Number separately aligned parts in contig stitcher --- micall/core/contig_stitcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index d93afc775..cfffffd36 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -221,7 +221,7 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: "contig": contig, "n": len(connected)}) def logpart(i, part): - logger.info("Part %r of contig %s aligned as [%s, %s]->[%s, %s].", + logger.info("Part %r of contig %r aligned as [%s, %s]->[%s, %s].", i, contig.name, part.q_st, part.q_ei, part.r_st, part.r_ei, extra={"action": "alignment", "type": "hit", "contig": contig, "part": part, "i": i}) @@ -233,7 +233,7 @@ def logpart(i, part): for i, single_hit in 
enumerate(connected): logpart(i, single_hit) - query = GenotypedContig(name=f'part({contig.name})', + query = GenotypedContig(name=f'part({i}, {contig.name})', seq=contig.seq, ref_name=contig.ref_name, group_ref=contig.group_ref, From f1d88dbb71e1a371fffef8e1a5541aae7e835194 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 30 Nov 2023 15:37:49 -0800 Subject: [PATCH 103/485] Add more detailed logging for contig stitcher --- micall/core/contig_stitcher.py | 12 ++++++++++++ micall/utils/cigar_tools.py | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index cfffffd36..bd6b32eb4 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -225,6 +225,7 @@ def logpart(i, part): i, contig.name, part.q_st, part.q_ei, part.r_st, part.r_ei, extra={"action": "alignment", "type": "hit", "contig": contig, "part": part, "i": i}) + logger.debug("Part %r of contig %r aligned as %s.", i, contig.name, part) if len(connected) == 1: logpart(0, connected[0]) @@ -326,6 +327,12 @@ def stitch_2_contigs(left, right): aligned_right_part = aligned_right[max_concordance_index:] overlap_seq = ''.join(c for c in aligned_left_part + aligned_right_part if c != '-') + average_concordance = sum(concordance) / (len(concordance) or 1) + logger.debug("Average concordance between overlapping parts of %r and %r is %s (full is %s).", + left.name, right.name, average_concordance, concordance, + extra={"action": "concordance", "left": left, "right": right, + "value": concordance, "avg": average_concordance}) + # Return something that can be fed back into the loop. 
match_fraction = min(left.match_fraction, right.match_fraction) ref_name = max([left, right], key=lambda x: x.alignment.ref_length).ref_name @@ -365,6 +372,9 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: new_contig.alignment.r_st, new_contig.alignment.r_ei, extra={"action": "stitch", "result": new_contig, "left": current, "right": overlapping_contig}) + logger.debug("Stitching %r with %r results in %r at %s.", + current.name, overlapping_contig.name, + new_contig.name, new_contig.alignment) def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]: @@ -462,6 +472,8 @@ def try_split(contig): # overlaps around them. # And we are likely to lose quality with every stitching operation. # By skipping we assert that this gap is aligner's fault. + logger.debug("Ignored insignificant gap of %r, %s.", contig.name, gap, + extra={"action": "ignoregap", "contig": contig, "gap": gap}) continue if covered(contig, gap): diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index f0473a0d9..8e98acac5 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -677,6 +677,10 @@ def __repr__(self): return f'CigarHit({str(self.cigar)!r}, r_st={self.r_st!r}, r_ei={self.r_ei!r}, q_st={self.q_st!r}, q_ei={self.q_ei!r})' + def __str__(self): + return f'{str(self.cigar)}@[{self.q_st},{self.q_ei}]->[{self.r_st},{self.r_ei}]' + + def connect_cigar_hits(cigar_hits: List[CigarHit]) -> List[CigarHit]: """ This function exists to deal with the fact that mappy does not always From 060e13d78296200c790f99a63b50a0411b539f8a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 30 Nov 2023 16:04:43 -0800 Subject: [PATCH 104/485] Small code style improvements for contig stitcher --- micall/core/contig_stitcher.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index bd6b32eb4..95c7a55a4 100644 --- 
a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,4 +1,4 @@ -from typing import Iterable, Optional, Tuple, List +from typing import Iterable, Optional, Tuple, List, Dict from collections import deque, defaultdict from dataclasses import dataclass from math import ceil @@ -528,9 +528,11 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig yield from combine_overlaps(aligned) +GroupRef = str + def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: contigs = list(stitch_contigs(contigs)) - consensus_parts = defaultdict(list) # group_ref -> List[AlignedContig] + consensus_parts: Dict[GroupRef, List[AlignedContig]] = defaultdict(list) for contig in contigs: if isinstance(contig, AlignedContig): From 3720af14b0aafdfcda3447e5c624990dda3c2c41 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 30 Nov 2023 16:42:50 -0800 Subject: [PATCH 105/485] Test logging of contig_stitcher --- micall/core/contig_stitcher.py | 1 - micall/tests/test_contig_stitcher.py | 68 ++++++++++++++++++++++++++++ micall/utils/structured_logger.py | 10 ++-- 3 files changed, 75 insertions(+), 4 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 95c7a55a4..bcbfa81ea 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -18,7 +18,6 @@ logger = logging.getLogger(__name__) register_structured_logger(logger) - @dataclass class Contig: name: str diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index efcec7344..8a94b882f 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,9 +1,12 @@ import random import pytest +import logging from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference from micall.tests.utils import 
MockAligner, fixed_random_seed +from micall.utils.structured_logger import iterate_messages +logging.basicConfig(level=logging.DEBUG) @pytest.fixture() def exact_aligner(monkeypatch): @@ -607,6 +610,71 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): assert results[0].name == 'left(a)+overlap(a,b)+right(b)' +def test_correct_processing_complex_logs(exact_aligner): + # Scenario: There are two reference organisms. + # Each with 4 contigs. + # For each, three overlapping contigs are stitched together, the non-overlapping is kept separate. + # Tested before, but this time we check the logs + + ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 + + contigs = [[ + GenotypedContig(name='a', + seq='A' * 50 + 'C' * 20, + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 20 + 'C' * 50, + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='c', + seq='C' * 70 + 'T' * 20, + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='d', + seq='T' * 20 + 'G' * 50, + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] for ref_name in ['testref-1', 'testref-2']] + + contigs = sum(contigs, start=[]) + + logger = logging.getLogger("micall.core.contig_stitcher") + logger.setLevel(logging.DEBUG) + + messages = list(iterate_messages()) + assert len(messages) == 0 + + list(stitch_contigs(contigs)) + + messages = list(iterate_messages()) + assert len(messages) == 48 + assert all(name == "micall.core.contig_stitcher" for name, m in messages) + + info_messages = [m for name, m in messages if m.levelname == 'INFO'] + debug_messages = [m for name, m in messages if m.levelname == 'DEBUG'] + assert len(info_messages) == 32 + assert len(debug_messages) == len(messages) - len(info_messages) + + info_actions = [(m.action + ':' + (m.type 
if hasattr(m, 'type') else '')) for m in info_messages] + assert info_actions == \ + ['intro:'] * 8 + \ + ['alignment:hitnumber', 'alignment:hit'] * 8 + \ + ['stitch:'] * 2 + ['nooverlap:'] + ['stitch:'] * 2 + ['nooverlap:'] * 3 + + # _ _ _ _ _ _ # | | | |_ __ (_) |_ | |_ ___ ___| |_ ___ # | | | | '_ \| | __| | __/ _ \/ __| __/ __| diff --git a/micall/utils/structured_logger.py b/micall/utils/structured_logger.py index 098f4d866..249ce232f 100644 --- a/micall/utils/structured_logger.py +++ b/micall/utils/structured_logger.py @@ -1,10 +1,10 @@ import logging -from typing import List, Tuple +from typing import List, Tuple, Iterable LoggerName = str -structured_logs: List[Tuple[LoggerName, logging.LogRecord]] = [] +_structured_logs: List[Tuple[LoggerName, logging.LogRecord]] = [] class InMemoryLogHandler(logging.Handler): @@ -13,9 +13,13 @@ def __init__(self, name: str, *args, **kwargs) -> None: self.name: str = name def emit(self, record: logging.LogRecord): - structured_logs.append((self.name, record)) + _structured_logs.append((self.name, record)) def register_structured_logger(logger: logging.Logger): memory_handler = InMemoryLogHandler(logger.name) logger.addHandler(memory_handler) + + +def iterate_messages() -> Iterable[Tuple[LoggerName, logging.LogRecord]]: + yield from _structured_logs From a3d878e850e0e2cdefd0c453081074e55ada2570 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 30 Nov 2023 17:34:26 -0800 Subject: [PATCH 106/485] Cigar tools: make CigarHit a dataclass --- micall/utils/cigar_tools.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 8e98acac5..c155f5508 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -125,7 +125,8 @@ def __repr__(self): return f'CoordinateMapping({self.ref_to_op},{self.query_to_op})' -class Cigar(tuple): +@dataclass(frozen=True) +class Cigar: """ Represents an alignment between a query sequence 
and a reference sequence using the Compact Idiosyncratic Gapped Alignment Report (CIGAR) string format. @@ -146,8 +147,10 @@ class Cigar(tuple): CIGAR strings are defined in the SAM specification (https://samtools.github.io/hts-specs/SAMv1.pdf). """ - def __new__(cls, cigar_lst: Iterable[Tuple[int, CigarActions]]): - return super(Cigar, cls).__new__(cls, Cigar.normalize(cigar_lst)) # type: ignore + data: List[Tuple[int, CigarActions]] + + def __post_init__(self): + self.__dict__['data'] = list(Cigar.normalize(self.data)) @staticmethod @@ -170,7 +173,7 @@ def iterate_operations(self) -> Iterable[CigarActions]: The resulting sequence is a decoded version of the initial run-length encoded sequence. """ - for num, operation in self: + for num, operation in self.data: for _ in range(num): yield operation @@ -419,13 +422,17 @@ def normalize(cigar_lst) -> Iterable[Tuple[int, CigarActions]]: if last_item: yield (last_item[0], last_item[1]) + def __add__(self, other: 'Cigar'): + return Cigar(self.data + other.data) + + def __repr__(self): return f'Cigar({str(self)!r})' def __str__(self): """ Inverse of Cigar.parse """ - return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self) + return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self.data) @dataclass From e4f221b572b9fdd539dc42ea197e852c3b098a9a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 4 Dec 2023 13:12:24 -0800 Subject: [PATCH 107/485] Make Cigar class not a dataclass --- micall/utils/cigar_tools.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index c155f5508..8c532258a 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -125,7 +125,6 @@ def __repr__(self): return f'CoordinateMapping({self.ref_to_op},{self.query_to_op})' -@dataclass(frozen=True) class Cigar: """ Represents an alignment between a query sequence and a reference sequence 
using the @@ -147,10 +146,9 @@ class Cigar: CIGAR strings are defined in the SAM specification (https://samtools.github.io/hts-specs/SAMv1.pdf). """ - data: List[Tuple[int, CigarActions]] - def __post_init__(self): - self.__dict__['data'] = list(Cigar.normalize(self.data)) + def __init__(self, data) -> None: + self._data: List[Tuple[int, CigarActions]] = list(Cigar.normalize(data)) @staticmethod @@ -173,7 +171,7 @@ def iterate_operations(self) -> Iterable[CigarActions]: The resulting sequence is a decoded version of the initial run-length encoded sequence. """ - for num, operation in self.data: + for num, operation in self._data: for _ in range(num): yield operation @@ -422,8 +420,12 @@ def normalize(cigar_lst) -> Iterable[Tuple[int, CigarActions]]: if last_item: yield (last_item[0], last_item[1]) + def __eq__(self, other) -> bool: + return isinstance(other, Cigar) and self._data == other._data + + def __add__(self, other: 'Cigar'): - return Cigar(self.data + other.data) + return Cigar(self._data + other._data) def __repr__(self): @@ -432,7 +434,7 @@ def __repr__(self): def __str__(self): """ Inverse of Cigar.parse """ - return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self.data) + return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self._data) @dataclass @@ -489,8 +491,8 @@ def from_default_alignment(r_st, r_ei, q_st, q_ei): ref_length = r_ei - r_st + 1 query_length = q_ei - q_st + 1 - cigar = Cigar.coerce([[ref_length, CigarActions.DELETE], - [query_length, CigarActions.INSERT]]) + cigar = Cigar.coerce([(ref_length, CigarActions.DELETE), + (query_length, CigarActions.INSERT)]) return CigarHit(cigar, r_st=r_st, r_ei=r_ei, q_st=q_st, q_ei=q_ei) From d031a5e3f0191b467175a1fdb53c3d002ed9aee0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 4 Dec 2023 13:40:39 -0800 Subject: [PATCH 108/485] Tests: pass CigarHits as unparsed strings --- micall/tests/test_cigar_tools.py | 360 ++++++++++++++----------------- 1 
file changed, 156 insertions(+), 204 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index b49296233..d2c05206a 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -2,6 +2,7 @@ from typing import List, Tuple from math import floor import itertools +import re from micall.utils.consensus_aligner import CigarActions from micall.utils.cigar_tools import Cigar, CigarHit, connect_cigar_hits, CoordinateMapping @@ -149,172 +150,153 @@ def test_invalid_cigar_string(): Cigar.coerce('3') # Not enough Ms +CIGAR_REGEX = re.compile(r"(.*)@([0-9]+),([0-9]+)") +def parsed_hit(string): + match = CIGAR_REGEX.match(string) + assert match, f"Cannot parse {string}" + cigar_str, r_st, q_st = match.groups() + cigar = Cigar.coerce(cigar_str) + r_ei = int(r_st) + cigar.ref_length - 1 + q_ei = int(q_st) + cigar.query_length - 1 + return CigarHit(cigar, int(r_st), int(r_ei), int(q_st), int(q_ei)) + + cigar_hit_ref_cut_cases = [ # Trivial cases - (CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), 2.5, - [CigarHit('2M', r_st=1, r_ei=2, q_st=1, q_ei=2), - CigarHit('2M', r_st=3, r_ei=4, q_st=3, q_ei=4)]), + ('4M@1,1', 2.5, + ['2M@1,1', '2M@3,3']), - (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 3.5, - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('6M', r_st=4, r_ei=9, q_st=4, q_ei=9)]), + ('9M@1,1', 3.5, + ['3M@1,1', '6M@4,4']), - (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 4.5, - [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), - CigarHit('5M', r_st=5, r_ei=9, q_st=5, q_ei=9)]), + ('9M@1,1', 4.5, + ['4M@1,1', '5M@5,5']), - (CigarHit('9M', r_st=0, r_ei=8, q_st=0, q_ei=8), 3.5, - [CigarHit('4M', r_st=0, r_ei=3, q_st=0, q_ei=3), - CigarHit('5M', r_st=4, r_ei=8, q_st=4, q_ei=8)]), + ('9M@0,0', 3.5, + ['4M@0,0', '5M@4,4']), # Simple cases - (CigarHit('9M9D9M', r_st=1, r_ei=27, q_st=1, q_ei=18), 3.5, - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('6M9D9M', r_st=4, r_ei=27, q_st=4, 
q_ei=18)]), + ('9M9D9M@1,1', 3.5, + ['3M@1,1', '6M9D9M@4,4']), - (CigarHit('9M9D9M', r_st=1, r_ei=27, q_st=1, q_ei=18), 20.5, - [CigarHit('9M9D2M', r_st=1, r_ei=20, q_st=1, q_ei=11), - CigarHit('7M', r_st=21, r_ei=27, q_st=12, q_ei=18)]), + ('9M9D9M@1,1', 20.5, + ['9M9D2M@1,1', '7M@21,12']), - (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 3.5, - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('6M9I9M', r_st=4, r_ei=18, q_st=4, q_ei=27)]), + ('9M9I9M@1,1', 3.5, + ['3M@1,1', '6M9I9M@4,4']), - (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 13.5 or 27/2, - [CigarHit('9M9I4M', r_st=1, r_ei=13, q_st=1, q_ei=22), - CigarHit('5M', r_st=14, r_ei=18, q_st=23, q_ei=27)]), + ('9M9I9M@1,1', 13.5 or 27/2, + ['9M9I4M@1,1', '5M@14,23']), - (CigarHit('5M6I', r_st=1, r_ei=5, q_st=1, q_ei=11), 3.5, - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('2M6I', r_st=4, r_ei=5, q_st=4, q_ei=11)]), + ('5M6I@1,1', 3.5, + ['3M@1,1', '2M6I@4,4']), - (CigarHit('6I5M', r_st=1, r_ei=5, q_st=1, q_ei=11), 3.5, - [CigarHit('6I3M', r_st=1, r_ei=3, q_st=1, q_ei=9), - CigarHit('2M', r_st=4, r_ei=5, q_st=10, q_ei=11)]), + ('6I5M@1,1', 3.5, + ['6I3M@1,1', '2M@4,10']), - (CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5), 3.5, - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('2M6D', r_st=4, r_ei=11, q_st=4, q_ei=5)]), + ('5M6D@1,1', 3.5, + ['3M@1,1', '2M6D@4,4']), - (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), 3.5, - [CigarHit('3D', r_st=1, r_ei=3, q_st=1, q_ei=0), - CigarHit('3D5M', r_st=4, r_ei=11, q_st=1, q_ei=5)]), + ('6D5M@1,1', 3.5, + ['3D@1,1', '3D5M@4,1']), - (CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5), 7.5, - [CigarHit('5M2D', r_st=1, r_ei=7, q_st=1, q_ei=5), - CigarHit('4D', r_st=8, r_ei=11, q_st=6, q_ei=5)]), + ('5M6D@1,1', 7.5, + ['5M2D@1,1', '4D@8,6']), - (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), 7.5, - [CigarHit('6D1M', r_st=1, r_ei=7, q_st=1, q_ei=1), - CigarHit('4M', r_st=8, r_ei=11, q_st=2, q_ei=5)]), + 
('6D5M@1,1', 7.5, + ['6D1M@1,1', '4M@8,2']), - (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), 6.5, - [CigarHit('6D', r_st=1, r_ei=6, q_st=1, q_ei=0), - CigarHit('5M', r_st=7, r_ei=11, q_st=1, q_ei=5)]), + ('6D5M@1,1', 6.5, + ['6D@1,1', '5M@7,1']), # Ambigous cases - (CigarHit('9M9D9M', r_st=1, r_ei=27, q_st=1, q_ei=18), 13.5 or 27/2, - [CigarHit('9M4D', r_st=1, r_ei=13, q_st=1, q_ei=9), - CigarHit('5D9M', r_st=14, r_ei=27, q_st=10, q_ei=18)]), + ('9M9D9M@1,1', 13.5 or 27/2, + ['9M4D@1,1', '5D9M@14,10']), - (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 9.2, - [CigarHit('9M1I', r_st=1, r_ei=9, q_st=1, q_ei=10), - CigarHit('8I9M', r_st=10, r_ei=18, q_st=11, q_ei=27)]), + ('9M9I9M@1,1', 9.2, + ['9M1I@1,1', '8I9M@10,11']), - (CigarHit('9M9D9I9M', r_st=1, r_ei=27, q_st=1, q_ei=27), 13.5 or 27/2, - [CigarHit('9M4D', r_st=1, r_ei=13, q_st=1, q_ei=9), - CigarHit('5D9I9M', r_st=14, r_ei=27, q_st=10, q_ei=27)]), + ('9M9D9I9M@1,1', 13.5 or 27/2, + ['9M4D@1,1', '5D9I9M@14,10']), - (CigarHit('9M9I9D9M', r_st=1, r_ei=27, q_st=1, q_ei=27), 13.5 or 27/2, - [CigarHit('9M9I4D', r_st=1, r_ei=13, q_st=1, q_ei=18), - CigarHit('5D9M', r_st=14, r_ei=27, q_st=19, q_ei=27)]), + ('9M9I9D9M@1,1', 13.5 or 27/2, + ['9M9I4D@1,1', '5D9M@14,19']), - (CigarHit('1M1I1D1M', r_st=1, r_ei=3, q_st=1, q_ei=3), 1.5, # same as previous 2 cases but smaller - [CigarHit('1M1I', r_st=1, r_ei=1, q_st=1, q_ei=2), - CigarHit('1D1M', r_st=2, r_ei=3, q_st=3, q_ei=3)]), + ('1M1I1D1M@1,1', 1.5, # same as previous 2 cases but smaller + ['1M1I@1,1', '1D1M@2,3']), - (CigarHit('1M1D1I1M', r_st=1, r_ei=3, q_st=1, q_ei=3), 1.5, # same as previous 2 cases but smaller - [CigarHit('1M', r_st=1, r_ei=1, q_st=1, q_ei=1), - CigarHit('1D1I1M', r_st=2, r_ei=3, q_st=2, q_ei=3)]), + ('1M1D1I1M@1,1', 1.5, # same as previous 2 cases but smaller + ['1M@1,1', '1D1I1M@2,2']), # Edge cases - (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 9.5, # no middlepoint - [CigarHit('9M5I', r_st=1, r_ei=9, q_st=1, q_ei=14), - 
CigarHit('4I9M', r_st=10, r_ei=18, q_st=15, q_ei=27)]), + ('9M9I9M@1,1', 9.5, # no middlepoint + ['9M5I@1,1', '4I9M@10,15']), - (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 8.5, # one is singleton - [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8), - CigarHit('1M', r_st=9, r_ei=9, q_st=9, q_ei=9)]), + ('9M@1,1', 8.5, + ['8M@1,1', '1M@9,9']), - (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 9.5, # one is empty - [CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), - CigarHit('', r_st=10, r_ei=9, q_st=10, q_ei=9)]), + ('9M@1,1', 9.5, + ['9M@1,1', '@10,10']), - (CigarHit('7M', r_st=3, r_ei=9, q_st=3, q_ei=9), 2.5, # one is empty - [CigarHit('', r_st=3, r_ei=2, q_st=3, q_ei=2), - CigarHit('7M', r_st=3, r_ei=9, q_st=3, q_ei=9)]), + ('7M@3,3', 2.5, + ['@3,3', '7M@3,3']), - (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 0.5, # one is empty around 0 - [CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), - CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9)]), + ('9M@1,1', 0.5, + ['@1,1', '9M@1,1']), - (CigarHit('9M', r_st=0, r_ei=8, q_st=0, q_ei=8), -0.5, # another one is empty and negative - [CigarHit('', r_st=0, r_ei=-1, q_st=0, q_ei=-1), - CigarHit('9M', r_st=0, r_ei=8, q_st=0, q_ei=8)]), + ('9M@0,0', -0.5, + ['@0,0', '9M@0,0']), - (CigarHit('9D', r_st=1, r_ei=9, q_st=1, q_ei=0), 3.5, - [CigarHit('3D', r_st=1, r_ei=3, q_st=1, q_ei=0), - CigarHit('6D', r_st=4, r_ei=9, q_st=1, q_ei=0)]), + ('9D@1,1', 3.5, + ['3D@1,1', '6D@4,1']), - (CigarHit('9D', r_st=0, r_ei=8, q_st=0, q_ei=-1), -0.5, - [CigarHit('', r_st=0, r_ei=-1, q_st=0, q_ei=-1), - CigarHit('9D', r_st=0, r_ei=8, q_st=0, q_ei=-1)]), + ('9D@0,0', -0.5, + ['@0,0', '9D@0,0']), - (CigarHit('1M7I1M', r_st=1, r_ei=2, q_st=1, q_ei=9), 1.5, - [CigarHit('1M4I', r_st=1, r_ei=1, q_st=1, q_ei=5), - CigarHit('3I1M', r_st=2, r_ei=2, q_st=6, q_ei=9)]), + ('1M7I1M@1,1', 1.5, + ['1M4I@1,1', '3I1M@2,6']), - (CigarHit('1M6I1M', r_st=1, r_ei=2, q_st=1, q_ei=8), 1.5, - [CigarHit('1M3I', r_st=1, r_ei=1, q_st=1, q_ei=4), - CigarHit('3I1M', 
r_st=2, r_ei=2, q_st=5, q_ei=8)]), + ('1M6I1M@1,1', 1.5, + ['1M3I@1,1', '3I1M@2,5']), - (CigarHit('1M7I1M', r_st=1, r_ei=2, q_st=1, q_ei=9), 1.999, - [CigarHit('1M7I', r_st=1, r_ei=1, q_st=1, q_ei=8), - CigarHit('1M', r_st=2, r_ei=2, q_st=9, q_ei=9)]), + ('1M7I1M@1,1', 1.999, + ['1M7I@1,1', '1M@2,9']), - (CigarHit('1M7I1M', r_st=1, r_ei=2, q_st=1, q_ei=9), 1.001, - [CigarHit('1M', r_st=1, r_ei=1, q_st=1, q_ei=1), - CigarHit('7I1M', r_st=2, r_ei=2, q_st=2, q_ei=9)]), + ('1M7I1M@1,1', 1.001, + ['1M@1,1', '7I1M@2,2']), - (CigarHit('2=1X2N1N2=1H2S', r_st=1, r_ei=8, q_st=1, q_ei=7), 3.5, - [CigarHit('2=1X', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('3N2=1H2S', r_st=4, r_ei=8, q_st=4, q_ei=7)]), + ('2=1X2N1N2=1H2S@1,1', 3.5, + ['2=1X@1,1', '3N2=1H2S@4,4']), # Negative cases - (CigarHit('9M9I9M', r_st=1, r_ei=18, q_st=1, q_ei=27), 20.5, + ('9M9I9M@1,1', 20.5, IndexError("20.5 is bigger than reference (18)")), - (CigarHit('', r_st=2, r_ei=1, q_st=2, q_ei=1), 2.5, + ('@2,2', 2.5, IndexError("Empty string cannot be cut")), - (CigarHit('', r_st=2, r_ei=1, q_st=2, q_ei=1), 1.5, + ('@2,2', 1.5, IndexError("Empty string cannot be cut")), - (CigarHit('9I', r_st=1, r_ei=0, q_st=1, q_ei=9), 3.5, + ('9I@1,1', 3.5, IndexError("Out of reference bounds")), - (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), 4, + ('9M@1,1', 4, ValueError("Cut point must not be an integer")), ] + @pytest.mark.parametrize('hit, cut_point, expected_result', cigar_hit_ref_cut_cases) def test_cigar_hit_ref_cut(hit, cut_point, expected_result): + hit = parsed_hit(hit) + if isinstance(expected_result, Exception): with pytest.raises(type(expected_result)): hit.cut_reference(cut_point) else: + expected_result = list(map(parsed_hit, expected_result)) expected_left, expected_right = expected_result left, right = hit.cut_reference(cut_point) assert expected_left == left @@ -324,91 +306,59 @@ def test_cigar_hit_ref_cut(hit, cut_point, expected_result): @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x 
in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) def test_cigar_hit_ref_cut_add_prop(hit, cut_point): + hit = parsed_hit(hit) left, right = hit.cut_reference(cut_point) assert left + right == hit -@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]) +@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in + [x for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception)]]) def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): + hit = parsed_hit(hit) percentage = cut_point - floor(cut_point) for cut_point in range(hit.r_st, hit.r_ei + 2): left, right = hit.cut_reference(cut_point - percentage) assert left + right == hit - lstrip_cases = [ - (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), - CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9)), - - (CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5), - CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5)), - - (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), - CigarHit('5M', r_st=7, r_ei=11, q_st=1, q_ei=5)), - - (CigarHit('6D4I5M', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), - - (CigarHit('3D3D4I5M', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), - - (CigarHit('3D2I3D2I5M', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), - - (CigarHit('4I6D5M', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('4I5M', r_st=7, r_ei=11, q_st=1, q_ei=9)), - - (CigarHit('6D4I', r_st=1, r_ei=6, q_st=1, q_ei=4), - CigarHit('4I', r_st=7, r_ei=6, q_st=1, q_ei=4)), - - (CigarHit('4I6D', r_st=1, r_ei=6, q_st=1, q_ei=4), - CigarHit('4I6D', r_st=1, r_ei=6, q_st=1, q_ei=4)), - - (CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), - CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0)), + ('9M@1,1', '9M@1,1'), + ('5M6D@1,1', '5M6D@1,1'), + ('6D5M@1,1', '5M@7,1'), + ('6D4I5M@1,1', '4I5M@7,1'), + ('3D3D4I5M@1,1', '4I5M@7,1'), + 
('3D2I3D2I5M@1,1', '4I5M@7,1'), + ('4I6D5M@1,1', '4I5M@7,1'), + ('6D4I@1,1', '4I@7,1'), + ('4I6D@1,1', '4I6D@1,1'), + ('@1,1', '@1,1'), ] @pytest.mark.parametrize('hit, expected', lstrip_cases) def test_cigar_hit_lstrip(hit, expected): + hit = parsed_hit(hit) + expected = parsed_hit(expected) assert expected == hit.lstrip_query() rstrip_cases = [ - (CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9), - CigarHit('9M', r_st=1, r_ei=9, q_st=1, q_ei=9)), - - (CigarHit('5M6D', r_st=1, r_ei=11, q_st=1, q_ei=5), - CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5)), - - (CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5), - CigarHit('6D5M', r_st=1, r_ei=11, q_st=1, q_ei=5)), - - (CigarHit('5M4I6D', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), - - (CigarHit('5M4I3D3D', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), - - (CigarHit('5M2I3D2I3D', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), - - (CigarHit('5M6D4I', r_st=1, r_ei=11, q_st=1, q_ei=9), - CigarHit('5M4I', r_st=1, r_ei=5, q_st=1, q_ei=9)), - - (CigarHit('6D4I', r_st=1, r_ei=6, q_st=1, q_ei=4), - CigarHit('6D4I', r_st=1, r_ei=6, q_st=1, q_ei=4)), - - (CigarHit('4I6D', r_st=1, r_ei=6, q_st=1, q_ei=4), - CigarHit('4I', r_st=1, r_ei=0, q_st=1, q_ei=4)), - - (CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0), - CigarHit('', r_st=1, r_ei=0, q_st=1, q_ei=0)), + ('9M@1,1', '9M@1,1'), + ('5M6D@1,1', '5M@1,1'), + ('6D5M@1,1', '6D5M@1,1'), + ('5M4I6D@1,1', '5M4I@1,1'), + ('5M4I3D3D@1,1', '5M4I@1,1'), + ('5M2I3D2I3D@1,1', '5M4I@1,1'), + ('5M6D4I@1,1', '5M4I@1,1'), + ('6D4I@1,1', '6D4I@1,1'), + ('4I6D@1,1', '4I@1,1'), + ('@1,1', '@1,1'), ] @pytest.mark.parametrize('hit, expected', rstrip_cases) def test_cigar_hit_rstrip(hit, expected): + hit = parsed_hit(hit) + expected = parsed_hit(expected) assert expected == hit.rstrip_query() @@ -419,6 +369,8 @@ def test_cigar_hit_rstrip(hit, expected): @pytest.mark.parametrize('hit', 
strip_prop_cases_all) def test_cigar_hit_strip_combines_with_connect(hit): + hit = parsed_hit(hit) + for cut_point in range(hit.r_st - 1, hit.r_ei): left, right = hit.cut_reference(cut_point + hit.epsilon) @@ -431,6 +383,8 @@ def test_cigar_hit_strip_combines_with_connect(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) def test_cigar_hit_strip_combines_with_add(hit): + hit = parsed_hit(hit) + for cut_point in range(hit.r_st - 1, hit.r_ei): left, right = hit.cut_reference(cut_point + hit.epsilon) @@ -443,6 +397,8 @@ def test_cigar_hit_strip_combines_with_add(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) def test_cigar_hit_strip_never_crashes(hit): + hit = parsed_hit(hit) + hit.rstrip_query().lstrip_query() hit.lstrip_query().rstrip_query() hit.lstrip_query().lstrip_query() @@ -451,6 +407,8 @@ def test_cigar_hit_strip_never_crashes(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) def test_cigar_hit_strip_is_idempotent(hit): + hit = parsed_hit(hit) + h1 = hit.rstrip_query() assert h1 == h1.rstrip_query() == h1.rstrip_query().rstrip_query() @@ -466,6 +424,8 @@ def test_cigar_hit_strip_is_idempotent(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) def test_cigar_hit_strips_are_commutative(hit): + hit = parsed_hit(hit) + assert hit.rstrip_query().lstrip_query() \ == hit.lstrip_query().rstrip_query() @@ -473,6 +433,7 @@ def test_cigar_hit_strips_are_commutative(hit): @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): + hit = parsed_hit(hit) percentage = cut_point - floor(cut_point) for ax_cut in range(hit.r_st, hit.r_ei + 2): @@ -489,6 +450,7 @@ def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): @pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) def test_cigar_hit_gaps_no_m_or_i(hit): + hit = parsed_hit(hit) gaps = 
list(hit.gaps()) if 'D' in str(hit.cigar): @@ -502,6 +464,7 @@ def test_cigar_hit_gaps_no_m_or_i(hit): @pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) def test_cigar_hit_gaps_lengths(hit): + hit = parsed_hit(hit) gaps = list(hit.gaps()) for gap in gaps: @@ -538,64 +501,53 @@ def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): connect_cigar_hits_cases = [ # Non-overlapping hits should be connected with deletions/insertions ( - [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), - CigarHit('4M', r_st=10, r_ei=13, q_st=8, q_ei=11)], - [CigarHit('4M5D3I4M', r_st=1, r_ei=13, q_st=1, q_ei=11)] + ['4M@1,1', '4M@10,8'], + ['4M5D3I4M@1,1'] ), # Overlapping hits should ignore later ones ( - [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), - CigarHit('5M', r_st=3, r_ei=7, q_st=3, q_ei=7)], - [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4)] + ['4M@1,1', '5M@3,3'], + ['4M@1,1'] ), # Touching hits should be simply concatenated ( - [CigarHit('4M', r_st=1, r_ei=4, q_st=1, q_ei=4), - CigarHit('4M', r_st=5, r_ei=8, q_st=5, q_ei=8)], - [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8)] + ['4M@1,1', '4M@5,5'], + ['8M@1,1'] ), # Hits that touch at only one boundary should combine just fine ( - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('6M', r_st=4, r_ei=9, q_st=6, q_ei=11)], - [CigarHit('3M2I6M', r_st=1, r_ei=9, q_st=1, q_ei=11)] + ['3M@1,1', '6M@4,6'], + ['3M2I6M@1,1'] ), # Hits that are subsets of earlier hits should be ignored ( - [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8), - CigarHit('3M', r_st=3, r_ei=5, q_st=3, q_ei=5)], - [CigarHit('8M', r_st=1, r_ei=8, q_st=1, q_ei=8)] + ['8M@1,1', '3M@3,3'], + ['8M@1,1'] ), # Hits that are out of order should be connected if no overlap ( - [CigarHit('3M', r_st=10, r_ei=12, q_st=6, q_ei=8), - CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3)], - [CigarHit('3M6D2I3M', r_st=1, r_ei=12, q_st=1, q_ei=8)] + ['3M@10,6', '3M@1,1'], + ['3M6D2I3M@1,1'] ), # Hits that 
overlap by a single base should prioritize the first hit and not combine ( - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('3M', r_st=3, r_ei=5, q_st=3, q_ei=5)], - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3)] + ['3M@1,1', '3M@3,3'], + ['3M@1,1'] ), # Non-overlapping hits in the query space but overlapping in reference space ( - [CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5), - CigarHit('1M', r_st=3, r_ei=3, q_st=10, q_ei=10)], - [CigarHit('5M', r_st=1, r_ei=5, q_st=1, q_ei=5)] + ['5M@1,1', '1M@3,10'], + ['5M@1,1'] ), # Combining more than two hits ( - [CigarHit('3M', r_st=1, r_ei=3, q_st=1, q_ei=3), - CigarHit('3M', r_st=7, r_ei=9, q_st=7, q_ei=9), - CigarHit('3M', r_st=12, r_ei=14, q_st=16, q_ei=18)], - [CigarHit('3M3D3I3M2D6I3M', r_st=1, r_ei=14, q_st=1, q_ei=18)] + ['3M@1,1', '3M@7,7', '3M@12,16'], + ['3M3D3I3M2D6I3M@1,1'] ), # Combining hits including hard-clipping, which should be ignored in alignments ( - [CigarHit('2H5M1H', r_st=1, r_ei=5, q_st=3, q_ei=7), - CigarHit('2H5M1H', r_st=11, r_ei=15, q_st=13, q_ei=17)], - [CigarHit('2H5M1H5D5I2H5M1H', r_st=1, r_ei=15, q_st=3, q_ei=17)] + ['2H5M1H@1,3', '2H5M1H@11,13'], + ['2H5M1H5D5I2H5M1H@1,3'] ), # An empty list of hits should raise a ValueError ( @@ -604,18 +556,18 @@ def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): ), # Before by reference, after by query ( - [CigarHit('4M', r_st=1, r_ei=4, q_st=8, q_ei=11), - CigarHit('4M', r_st=10, r_ei=13, q_st=1, q_ei=4)], - [CigarHit('4M', r_st=1, r_ei=4, q_st=8, q_ei=11), - CigarHit('4M', r_st=10, r_ei=13, q_st=1, q_ei=4)] + ['4M@1,8', '4M@10,1'], + ['4M@1,8', '4M@10,1'] ), ] - @pytest.mark.parametrize('hits, expected_result', connect_cigar_hits_cases) def test_connect_cigar_hits(hits, expected_result): + hits = list(map(parsed_hit, hits)) + if isinstance(expected_result, Exception): with pytest.raises(type(expected_result)): connect_cigar_hits(hits) else: + expected_result = list(map(parsed_hit, expected_result)) result = 
connect_cigar_hits(hits) assert expected_result == result From 09171e2b02f5a23c436efe5f3e434ee878ee1532 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 4 Dec 2023 13:44:04 -0800 Subject: [PATCH 109/485] Cigar tools: make CigarHit immutable --- micall/core/contig_stitcher.py | 2 +- micall/tests/test_contig_stitcher_fuzz.py | 4 ++-- micall/utils/cigar_tools.py | 8 +++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index bcbfa81ea..f7d87529a 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -212,7 +212,7 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: yield contig return - hits_array = [CigarHit(x.cigar, x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] + hits_array = [CigarHit(Cigar.coerce(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] connected = connect_cigar_hits(hits_array) logger.info("Contig %r aligned in %s parts.", contig.name, len(connected), diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index c36881763..ae761e220 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -2,7 +2,7 @@ import json import os from micall.core.contig_stitcher import GenotypedContig, AlignedContig, stitch_consensus, stitch_contigs, split_contigs_with_gaps, drop_completely_covered, combine_overlaps -from micall.utils.cigar_tools import CigarHit +from micall.utils.cigar_tools import CigarHit, Cigar from micall.utils.consensus_aligner import CigarActions from typing import Dict, List from collections import defaultdict @@ -33,7 +33,7 @@ def read_contigs(line): ref_seq=ref_seq, match_fraction=2/3, ) - alignment = CigarHit([(length, CigarActions.MATCH)], q_st=20, q_ei=20+length-1, r_st=start, r_ei=end) + alignment = CigarHit(Cigar([(length, CigarActions.MATCH)]), q_st=20, q_ei=20+length-1, r_st=start, r_ei=end) 
contig = AlignedContig(query=query, alignment=alignment) aidee = f'{start:03d}-{end:03d}' yield {'contig': contig, 'id': aidee} diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 8c532258a..9fd8d7514 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -4,7 +4,7 @@ from math import ceil, floor import re -from typing import Container, Tuple, Iterable, Optional, Set, Dict, List +from typing import Container, Tuple, Iterable, Optional, Set, Dict, List, Union from dataclasses import dataclass from functools import cached_property, reduce from itertools import chain, dropwhile @@ -152,7 +152,7 @@ def __init__(self, data) -> None: @staticmethod - def coerce(obj): + def coerce(obj: Union['Cigar', str, Iterable[Tuple[int, CigarActions]]]): if isinstance(obj, Cigar): return obj @@ -437,7 +437,7 @@ def __str__(self): return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self._data) -@dataclass +@dataclass(frozen=True) class CigarHit: """ This class provides an abstraction over the complex details involved in working with sequence alignments @@ -461,8 +461,6 @@ class CigarHit: def __post_init__(self): - self.cigar = Cigar.coerce(self.cigar) - if self.ref_length != self.cigar.ref_length: raise ValueError(f"CIGAR string maps {self.cigar.ref_length}" f" reference positions, but CIGAR hit range is {self.ref_length}") From a08b68f2bb4e443b46c29aac91e030bcb4dd3766 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 4 Dec 2023 15:03:34 -0800 Subject: [PATCH 110/485] Contig stitcher: add main() entry point --- micall/core/contig_stitcher.py | 32 +++++++++++++++++++ micall/tests/data/exact_parts_contigs.csv | 4 +++ .../data/exact_parts_contigs_stitched.csv | 2 ++ micall/tests/test_contig_stitcher.py | 28 ++++++++++++++-- 4 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 micall/tests/data/exact_parts_contigs.csv create mode 100644 micall/tests/data/exact_parts_contigs_stitched.csv diff 
--git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index f7d87529a..4cbccc2a3 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -544,3 +544,35 @@ def combine(contigs): return FrankensteinContig(contigs) yield from map(combine, consensus_parts.values()) + + +def main(args): + import argparse + from micall.core.denovo import write_contig_refs + + parser = argparse.ArgumentParser() + parser.add_argument('contigs', type=argparse.FileType('r')) + parser.add_argument('stitched_contigs', type=argparse.FileType('w')) + verbosity_group = parser.add_mutually_exclusive_group() + verbosity_group.add_argument('--verbose', action='store_true', help='Increase output verbosity') + verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity', default=True) + verbosity_group.add_argument('--debug', action='store_true', help='Maximum output verbosity') + verbosity_group.add_argument('--quiet', action='store_true', help='Minimize output verbosity') + args = parser.parse_args(args) + + if args.quiet: + logging.basicConfig(level=logging.ERROR) + elif args.verbose: + logging.basicConfig(level=logging.INFO) + elif args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.WARN) + + write_contig_refs(args.contigs.name, args.stitched_contigs) + args.stitched_contigs.close() + + +if __name__ == '__main__': + import sys + main(sys.argv[1:]) diff --git a/micall/tests/data/exact_parts_contigs.csv b/micall/tests/data/exact_parts_contigs.csv new file mode 100644 index 000000000..963f29c3f --- /dev/null +++ b/micall/tests/data/exact_parts_contigs.csv @@ -0,0 +1,4 @@ +>gag1 
+GGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACAT +>gag2 +ACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAG diff --git a/micall/tests/data/exact_parts_contigs_stitched.csv b/micall/tests/data/exact_parts_contigs_stitched.csv new file mode 100644 index 000000000..c586ada9c --- /dev/null +++ b/micall/tests/data/exact_parts_contigs_stitched.csv @@ -0,0 +1,2 @@ +ref,match,group_ref,contig +HIV1-B-FR-K03455-seed,1.0,HIV1-B-FR-K03455-seed,GGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAG diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py 
index 8a94b882f..29bf16d38 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,12 +1,14 @@ import random -import pytest import logging -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference +import os +import pytest + +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main from micall.tests.utils import MockAligner, fixed_random_seed from micall.utils.structured_logger import iterate_messages +from micall.tests.test_denovo import check_hcv_db -logging.basicConfig(level=logging.DEBUG) @pytest.fixture() def exact_aligner(monkeypatch): @@ -675,6 +677,26 @@ def test_correct_processing_complex_logs(exact_aligner): ['stitch:'] * 2 + ['nooverlap:'] + ['stitch:'] * 2 + ['nooverlap:'] * 3 +def test_main_invocation(exact_aligner, tmp_path, hcv_db): + pwd = os.path.dirname(__file__) + contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") + stitched_contigs = os.path.join(tmp_path, "stitched.csv") + main([contigs, stitched_contigs]) + + assert os.path.exists(contigs) + assert os.path.exists(stitched_contigs) + + # Check the contents of stitched_contigs + with open(stitched_contigs, 'r') as stitched_file: + stitched_data = stitched_file.read() + + expected_file_path = os.path.join(pwd, "data", "exact_parts_contigs_stitched.csv") + with open(expected_file_path, 'r') as expected_file: + expected_data = expected_file.read() + + assert stitched_data == expected_data, "The contents of the stitched contigs file do not match the expected contents." 
+ + # _ _ _ _ _ _ # | | | |_ __ (_) |_ | |_ ___ ___| |_ ___ # | | | | '_ \| | __| | __/ _ \/ __| __/ __| From 93feed4e368b6da282772f118060f9f5e13072fb Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 5 Dec 2023 13:50:42 -0800 Subject: [PATCH 111/485] Contig stitcher: ignore contigs that align in-reverse Also: * Small improvements to logging in contig stitcher * Fix MockAlignment is_rev field --- micall/core/contig_stitcher.py | 65 +++++++++++++++-------- micall/tests/test_contig_stitcher.py | 10 ++-- micall/tests/test_contig_stitcher_fuzz.py | 2 +- micall/tests/test_tests_utils.py | 2 +- micall/tests/utils.py | 8 +-- 5 files changed, 53 insertions(+), 34 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 4cbccc2a3..05c5b6b12 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -197,49 +197,61 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: return AlignedContig(query, alignment) -def align_to_reference(contig) -> Iterable[GenotypedContig]: +def align_to_reference(contig) -> Iterable[Tuple[GenotypedContig, bool]]: if contig.ref_seq is None: logger.info("Contig %r not aligned - no reference.", contig.name, extra={"action": "alignment", "type": "noref", "contig": contig}) - yield contig + yield (contig, False) return aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) - if not alignments: - logger.info("Contig %r not aligned - backend choice.", contig.name, - extra={"action": "alignment", "type": "zerohits", "contig": contig}) - yield contig - return + reversed_alignments = [alignment for alignment in alignments if alignment.strand == -1] + alignments = [alignment for alignment in alignments if alignment.strand == 1] + + logger.info("Contig %r produced %s reverse-complement alignments.", contig.name, len(reversed_alignments), + extra={"action": "alignment", "type": "reversenumber", + "contig": contig, "n": 
len(reversed_alignments)}) hits_array = [CigarHit(Cigar.coerce(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] - connected = connect_cigar_hits(hits_array) + connected = connect_cigar_hits(hits_array) if hits_array else [] - logger.info("Contig %r aligned in %s parts.", contig.name, len(connected), + logger.info("Contig %r produced %s forward alignments.", contig.name, len(connected), extra={"action": "alignment", "type": "hitnumber", "contig": contig, "n": len(connected)}) - def logpart(i, part): - logger.info("Part %r of contig %r aligned as [%s, %s]->[%s, %s].", + def logpart(i, part, is_rev): + logger.info("Part %r of contig %r aligned as [%s, %s]->[%s, %s]%s.", i, contig.name, part.q_st, part.q_ei, part.r_st, part.r_ei, + " (rev)" if is_rev else "", extra={"action": "alignment", "type": "hit", "contig": contig, "part": part, "i": i}) - logger.debug("Part %r of contig %r aligned as %s.", i, contig.name, part) + logger.debug("Part %r of contig %r aligned as %s.%s", i, contig.name, part, + " (rev)" if is_rev else "") - if len(connected) == 1: - logpart(0, connected[0]) - yield AlignedContig(query=contig, alignment=connected[0]) + to_return = connected + reversed_alignments + if len(to_return) == 0: + logger.info("Contig %r not aligned - backend choice.", contig.name, + extra={"action": "alignment", "type": "zerohits", "contig": contig}) + yield (contig, False) + return + + if len(to_return) == 1: + is_rev = to_return[0] in reversed_alignments + logpart(0, to_return[0], is_rev) + yield (AlignedContig(query=contig, alignment=connected[0]), is_rev) return - for i, single_hit in enumerate(connected): - logpart(i, single_hit) + for i, single_hit in enumerate(connected + reversed_alignments): query = GenotypedContig(name=f'part({i}, {contig.name})', seq=contig.seq, ref_name=contig.ref_name, group_ref=contig.group_ref, ref_seq=contig.ref_seq, match_fraction=contig.match_fraction) - yield AlignedContig(query=query, alignment=single_hit) + is_rev = 
single_hit in reversed_alignments + logpart(i, single_hit, is_rev) + yield (AlignedContig(query=query, alignment=single_hit), is_rev) def align_all_to_reference(contigs): @@ -514,13 +526,20 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig logger.info("Introduced contig %r of ref %r, group_ref %r, and length %s.", contig.name, contig.ref_name, contig.group_ref, len(contig.seq), extra={"action": "intro", "contig": contig}) + logger.debug("Introduced contig %r (seq = %s) of ref %r, group_ref %r (seq = %s), and length %s.", + contig.name, contig.seq, contig.ref_name, + contig.group_ref, contig.ref_seq, len(contig.seq), + extra={"action": "intro", "contig": contig}) + + aligned = align_all_to_reference(contigs) - maybe_aligned = align_all_to_reference(contigs) + # Contigs aligned in reverse do not need any more processing + yield from (x for (x, is_rev) in aligned if is_rev) + aligned = [x for (x, is_rev) in aligned if not is_rev] # Contigs that did not align do not need any more processing - yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig)) - aligned: List[AlignedContig] = \ - [x for x in maybe_aligned if isinstance(x, AlignedContig)] + yield from (x for x in aligned if not isinstance(x, AlignedContig)) + aligned = [x for x in aligned if isinstance(x, AlignedContig)] aligned = split_contigs_with_gaps(aligned) aligned = drop_completely_covered(aligned) @@ -548,7 +567,7 @@ def combine(contigs): def main(args): import argparse - from micall.core.denovo import write_contig_refs + from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. 
parser = argparse.ArgumentParser() parser.add_argument('contigs', type=argparse.FileType('r')) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 29bf16d38..5636edbe8 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -438,7 +438,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): ), ] - contigs = align_all_to_reference(contigs) + contigs = [c for c, is_rev in align_all_to_reference(contigs)] assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 @@ -469,7 +469,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): ), ] - contigs = align_all_to_reference(contigs) + contigs = [c for c, is_rev in align_all_to_reference(contigs)] assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 @@ -662,18 +662,18 @@ def test_correct_processing_complex_logs(exact_aligner): list(stitch_contigs(contigs)) messages = list(iterate_messages()) - assert len(messages) == 48 + assert len(messages) == 64 assert all(name == "micall.core.contig_stitcher" for name, m in messages) info_messages = [m for name, m in messages if m.levelname == 'INFO'] debug_messages = [m for name, m in messages if m.levelname == 'DEBUG'] - assert len(info_messages) == 32 + assert len(info_messages) == 40 assert len(debug_messages) == len(messages) - len(info_messages) info_actions = [(m.action + ':' + (m.type if hasattr(m, 'type') else '')) for m in info_messages] assert info_actions == \ ['intro:'] * 8 + \ - ['alignment:hitnumber', 'alignment:hit'] * 8 + \ + ['alignment:reversenumber', 'alignment:hitnumber', 'alignment:hit'] * 8 + \ ['stitch:'] * 2 + ['nooverlap:'] + ['stitch:'] * 2 + ['nooverlap:'] * 3 diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index ae761e220..1f0039716 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ 
b/micall/tests/test_contig_stitcher_fuzz.py @@ -10,7 +10,7 @@ @pytest.fixture def no_aligner(monkeypatch): - monkeypatch.setattr('micall.core.contig_stitcher.align_to_reference', lambda x: [x]) + monkeypatch.setattr('micall.core.contig_stitcher.align_to_reference', lambda x: [(x, False)]) def read_contigs(line): diff --git a/micall/tests/test_tests_utils.py b/micall/tests/test_tests_utils.py index 047868475..f1e70c845 100644 --- a/micall/tests/test_tests_utils.py +++ b/micall/tests/test_tests_utils.py @@ -14,7 +14,7 @@ def test_basic_mapping(): assert isinstance(alignment, MockAlignment) assert alignment.mapq == 60 - assert alignment.is_rev == False + assert alignment.strand == 1 assert alignment.r_st == 4 assert alignment.r_en == 14 assert alignment.q_st == 0 diff --git a/micall/tests/utils.py b/micall/tests/utils.py index 8d823f20c..0d8d980e4 100644 --- a/micall/tests/utils.py +++ b/micall/tests/utils.py @@ -8,7 +8,7 @@ @dataclass class MockAlignment: - is_rev: bool + strand: int # +1 if on the forward strand; -1 if on the reverse strand mapq: int cigar: list cigar_str: str @@ -41,17 +41,17 @@ def map(self, seq): continue mapq = 60 - is_rev = False # Doesn't handle reverse complements in this mock. + strand = 1 # Doesn't handle reverse complements in this mock. 
r_st = self.seq.index(substring) r_en = r_st + len(substring) q_st = start q_en = end cigar = [[q_en - q_st, CigarActions.MATCH]] cigar_str = f'{(q_en - q_st)}M' - al = MockAlignment(is_rev, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) + al = MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) if (q_st, q_en, r_st, r_en) not in returned: returned.add((q_st, q_en, r_st, r_en)) - yield MockAlignment(is_rev, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) + yield MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) max_matches -= 1 if max_matches < 1: From bb3fcaeb81e32726a3527d4512a146ad033a280d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 5 Dec 2023 15:43:20 -0800 Subject: [PATCH 112/485] Contig stitcher: fix type error in alignments --- micall/core/contig_stitcher.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 05c5b6b12..3e9fb8295 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -206,15 +206,17 @@ def align_to_reference(contig) -> Iterable[Tuple[GenotypedContig, bool]]: aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) - reversed_alignments = [alignment for alignment in alignments if alignment.strand == -1] - alignments = [alignment for alignment in alignments if alignment.strand == 1] + hits_array = [(CigarHit(Cigar(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1), x.strand == -1) + for x in alignments] + reversed_alignments = [alignment for alignment, is_rev in hits_array if is_rev] + alignments = [alignment for alignment, is_rev in hits_array if not is_rev] - logger.info("Contig %r produced %s reverse-complement alignments.", contig.name, len(reversed_alignments), + logger.info("Contig %r produced %s reverse-complement alignments.", + contig.name, len(reversed_alignments), extra={"action": "alignment", "type": "reversenumber", 
"contig": contig, "n": len(reversed_alignments)}) - hits_array = [CigarHit(Cigar.coerce(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1) for x in alignments] - connected = connect_cigar_hits(hits_array) if hits_array else [] + connected = connect_cigar_hits(alignments) if alignments else [] logger.info("Contig %r produced %s forward alignments.", contig.name, len(connected), extra={"action": "alignment", "type": "hitnumber", From 0194cff9d13d9506254ad9e45b3d2c6304aa9af5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 5 Dec 2023 16:37:41 -0800 Subject: [PATCH 113/485] More small improvements to Contig Stitcher logging --- micall/core/contig_stitcher.py | 39 +++++++++++++++++++--------- micall/tests/test_contig_stitcher.py | 19 +++++++++----- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 3e9fb8295..e011e6dbc 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -222,13 +222,13 @@ def align_to_reference(contig) -> Iterable[Tuple[GenotypedContig, bool]]: extra={"action": "alignment", "type": "hitnumber", "contig": contig, "n": len(connected)}) - def logpart(i, part, is_rev): - logger.info("Part %r of contig %r aligned as [%s, %s]->[%s, %s]%s.", - i, contig.name, part.q_st, part.q_ei, part.r_st, part.r_ei, + def logpart(i, part_name, part, is_rev): + logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", + i, contig.name, part_name, part.q_st, part.q_ei, part.r_st, part.r_ei, " (rev)" if is_rev else "", extra={"action": "alignment", "type": "hit", "contig": contig, "part": part, "i": i}) - logger.debug("Part %r of contig %r aligned as %s.%s", i, contig.name, part, + logger.debug("Part %r of contig %r aligned as %s%s.", i, contig.name, part, " (rev)" if is_rev else "") to_return = connected + reversed_alignments @@ -240,7 +240,7 @@ def logpart(i, part, is_rev): if len(to_return) == 1: is_rev = to_return[0] in 
reversed_alignments - logpart(0, to_return[0], is_rev) + logpart(0, contig.name, to_return[0], is_rev) yield (AlignedContig(query=contig, alignment=connected[0]), is_rev) return @@ -252,7 +252,7 @@ def logpart(i, part, is_rev): ref_seq=contig.ref_seq, match_fraction=contig.match_fraction) is_rev = single_hit in reversed_alignments - logpart(i, single_hit, is_rev) + logpart(i, query.name, single_hit, is_rev) yield (AlignedContig(query=query, alignment=single_hit), is_rev) @@ -332,6 +332,16 @@ def stitch_2_contigs(left, right): left_overlap = left_overlap.rstrip_query() right_overlap = right_overlap.lstrip_query() + logger.debug("Stitching %r at %s (len %s) with %r at %s (len %s)." + " The left_overlap %r is at %s (len %s)" + " and the right_overlap %r is at %s (len %s).", + left.name, left.alignment, len(left.seq), + right.name, right.alignment, len(right.seq), + left_overlap.name, left_overlap.alignment, len(left_overlap.seq), + right_overlap.name, right_overlap.alignment, len(right_overlap.seq), + extra={"action": "stitchcut", "left": left, "right": right, + "left_overlap": left_overlap, "right_overlap": right_overlap}) + # Align overlapping parts, then recombine based on concordance. 
aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) concordance = calculate_concordance(aligned_left, aligned_right) @@ -385,9 +395,9 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: new_contig.alignment.r_st, new_contig.alignment.r_ei, extra={"action": "stitch", "result": new_contig, "left": current, "right": overlapping_contig}) - logger.debug("Stitching %r with %r results in %r at %s.", + logger.debug("Stitching %r with %r results in %r at %s (len %s).", current.name, overlapping_contig.name, - new_contig.name, new_contig.alignment) + new_contig.name, new_contig.alignment, len(new_contig.seq)) def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]: @@ -560,11 +570,16 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo else: yield contig - def combine(contigs): - contigs = sorted(contigs, key=lambda x: x.alignment.r_st) - return FrankensteinContig(contigs) + def combine(group_ref): + contigs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) + logger.debug("Combining these contigs for final output for %r: %s.", + group_ref, + [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in contigs], + extra={"action": "finalcombine", "contigs": contigs}) + ret = FrankensteinContig(contigs) + return ret - yield from map(combine, consensus_parts.values()) + yield from map(combine, consensus_parts) def main(args): diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 5636edbe8..9dc8f7c02 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -659,10 +659,10 @@ def test_correct_processing_complex_logs(exact_aligner): messages = list(iterate_messages()) assert len(messages) == 0 - list(stitch_contigs(contigs)) + list(stitch_consensus(contigs)) messages = list(iterate_messages()) - assert len(messages) == 64 + assert len(messages) == 70 assert all(name == 
"micall.core.contig_stitcher" for name, m in messages) info_messages = [m for name, m in messages if m.levelname == 'INFO'] @@ -670,11 +670,18 @@ def test_correct_processing_complex_logs(exact_aligner): assert len(info_messages) == 40 assert len(debug_messages) == len(messages) - len(info_messages) - info_actions = [(m.action + ':' + (m.type if hasattr(m, 'type') else '')) for m in info_messages] - assert info_actions == \ - ['intro:'] * 8 + \ + actions = [m.__dict__.get('action', '') + ':' + m.__dict__.get('type', '') + for name, m in messages] + actions = [action for action in actions if action != ':'] + + assert actions == \ + ['intro:'] * 16 + \ ['alignment:reversenumber', 'alignment:hitnumber', 'alignment:hit'] * 8 + \ - ['stitch:'] * 2 + ['nooverlap:'] + ['stitch:'] * 2 + ['nooverlap:'] * 3 + ['stitchcut:', 'concordance:', 'stitch:'] * 2 + \ + ['nooverlap:'] + \ + ['stitchcut:', 'concordance:', 'stitch:'] * 2 + \ + ['nooverlap:'] * 3 + \ + ['finalcombine:'] * 2 def test_main_invocation(exact_aligner, tmp_path, hcv_db): From eaf82f1089b7add495f98feb390bfcee14bbc3d4 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 5 Dec 2023 16:48:11 -0800 Subject: [PATCH 114/485] Move reverse flag to AlignedContig object in Contig Stitcher --- micall/core/contig_stitcher.py | 37 ++++++++++++----------- micall/tests/test_contig_stitcher.py | 4 +-- micall/tests/test_contig_stitcher_fuzz.py | 4 +-- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index e011e6dbc..e685bfee0 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -59,10 +59,12 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont class AlignedContig(GenotypedContig): query: GenotypedContig alignment: CigarHit + reverse: bool - def __init__(self, query: GenotypedContig, alignment: CigarHit): + def __init__(self, query: GenotypedContig, alignment: CigarHit, reverse: bool): 
self.query = query self.alignment = alignment + self.reverse = reverse super().__init__( seq = query.seq, name = query.name, @@ -79,21 +81,21 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont query_left, query_right = self.query.cut_query(alignment_left.q_ei + 0.5) alignment_right = alignment_right.translate(0, -1 * alignment_right.q_st) - return (AlignedContig(query_left, alignment_left), - AlignedContig(query_right, alignment_right)) + return (AlignedContig(query_left, alignment_left, self.reverse), + AlignedContig(query_right, alignment_right, self.reverse)) def lstrip_query(self) -> 'AlignedContig': alignment = self.alignment.lstrip_query() q_remainder, query = self.query.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) - return AlignedContig(query, alignment) + return AlignedContig(query, alignment, self.reverse) def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() query, q_remainder = self.query.cut_query(alignment.q_ei + 0.5) - return AlignedContig(query, alignment) + return AlignedContig(query, alignment, self.reverse) def overlaps(self, other) -> bool: @@ -117,7 +119,7 @@ class SyntheticContig(AlignedContig): def __init__(self, query: GenotypedContig, r_st: int, r_ei: int): alignment = CigarHit.from_default_alignment(r_st=r_st, r_ei=r_ei, q_st=0, q_ei=len(query.seq)-1) - super().__init__(query, alignment) + super().__init__(query, alignment, reverse=False) def cut_reference(self, cut_point: float): @@ -150,7 +152,7 @@ def __init__(self, parts: List[AlignedContig]): aligned = reduce(FrankensteinContig.munge, self.parts) - super().__init__(aligned.query, aligned.alignment) + super().__init__(aligned.query, aligned.alignment, reverse=aligned.reverse) def cut_reference(self, cut_point: float) -> Tuple['FrankensteinContig', 'FrankensteinContig']: @@ -194,14 +196,15 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: reference_delta=0) 
alignment = left_alignment.connect(right_alignment) - return AlignedContig(query, alignment) + assert left.reverse == right.reverse + return AlignedContig(query, alignment, left.reverse) -def align_to_reference(contig) -> Iterable[Tuple[GenotypedContig, bool]]: +def align_to_reference(contig) -> Iterable[GenotypedContig]: if contig.ref_seq is None: logger.info("Contig %r not aligned - no reference.", contig.name, extra={"action": "alignment", "type": "noref", "contig": contig}) - yield (contig, False) + yield contig return aligner = Aligner(seq=contig.ref_seq, preset='map-ont') @@ -235,13 +238,13 @@ def logpart(i, part_name, part, is_rev): if len(to_return) == 0: logger.info("Contig %r not aligned - backend choice.", contig.name, extra={"action": "alignment", "type": "zerohits", "contig": contig}) - yield (contig, False) + yield contig return if len(to_return) == 1: is_rev = to_return[0] in reversed_alignments logpart(0, contig.name, to_return[0], is_rev) - yield (AlignedContig(query=contig, alignment=connected[0]), is_rev) + yield AlignedContig(query=contig, alignment=connected[0], reverse=is_rev) return for i, single_hit in enumerate(connected + reversed_alignments): @@ -253,7 +256,7 @@ def logpart(i, part_name, part, is_rev): match_fraction=contig.match_fraction) is_rev = single_hit in reversed_alignments logpart(i, query.name, single_hit, is_rev) - yield (AlignedContig(query=query, alignment=single_hit), is_rev) + yield AlignedContig(query=query, alignment=single_hit, reverse=is_rev) def align_all_to_reference(contigs): @@ -545,14 +548,14 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig aligned = align_all_to_reference(contigs) - # Contigs aligned in reverse do not need any more processing - yield from (x for (x, is_rev) in aligned if is_rev) - aligned = [x for (x, is_rev) in aligned if not is_rev] - # Contigs that did not align do not need any more processing yield from (x for x in aligned if not isinstance(x, AlignedContig)) 
aligned = [x for x in aligned if isinstance(x, AlignedContig)] + # Contigs aligned in reverse do not need any more processing + yield from (x for x in aligned if x.reverse) + aligned = [x for x in aligned if not x.reverse] + aligned = split_contigs_with_gaps(aligned) aligned = drop_completely_covered(aligned) yield from combine_overlaps(aligned) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 9dc8f7c02..cd7b7b5b9 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -438,7 +438,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): ), ] - contigs = [c for c, is_rev in align_all_to_reference(contigs)] + contigs = list(align_all_to_reference(contigs)) assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 @@ -469,7 +469,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): ), ] - contigs = [c for c, is_rev in align_all_to_reference(contigs)] + contigs = list(align_all_to_reference(contigs)) assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index 1f0039716..9846826d6 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -10,7 +10,7 @@ @pytest.fixture def no_aligner(monkeypatch): - monkeypatch.setattr('micall.core.contig_stitcher.align_to_reference', lambda x: [(x, False)]) + monkeypatch.setattr('micall.core.contig_stitcher.align_to_reference', lambda x: [x]) def read_contigs(line): @@ -34,7 +34,7 @@ def read_contigs(line): match_fraction=2/3, ) alignment = CigarHit(Cigar([(length, CigarActions.MATCH)]), q_st=20, q_ei=20+length-1, r_st=start, r_ei=end) - contig = AlignedContig(query=query, alignment=alignment) + contig = AlignedContig(query=query, alignment=alignment, reverse=False) aidee = 
f'{start:03d}-{end:03d}' yield {'contig': contig, 'id': aidee} From 520870bab86284c91b98a9ba4c5da77d9327974b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 5 Dec 2023 16:48:39 -0800 Subject: [PATCH 115/485] Fix issue with reversed alignments in Contig Stitcher --- micall/core/contig_stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index e685bfee0..2c4f55f0f 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -568,7 +568,7 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo consensus_parts: Dict[GroupRef, List[AlignedContig]] = defaultdict(list) for contig in contigs: - if isinstance(contig, AlignedContig): + if isinstance(contig, AlignedContig) and not contig.reverse: consensus_parts[contig.group_ref].append(contig) else: yield contig From 691a8f70324f6eee500d6ffcea30aa7476f059f5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 5 Dec 2023 17:18:16 -0800 Subject: [PATCH 116/485] Contig stitcher: log individual munge operations --- micall/core/contig_stitcher.py | 22 ++++++++++++++++------ micall/tests/test_contig_stitcher.py | 10 +++++----- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 2c4f55f0f..d74c1253c 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -151,7 +151,6 @@ def __init__(self, parts: List[AlignedContig]): (part.parts if isinstance(part, FrankensteinContig) else [part])] aligned = reduce(FrankensteinContig.munge, self.parts) - super().__init__(aligned.query, aligned.alignment, reverse=aligned.reverse) @@ -200,6 +199,15 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: return AlignedContig(query, alignment, left.reverse) +def combine_contigs(parts: List[AlignedContig]) -> FrankensteinContig: + ret = FrankensteinContig(parts) + 
logger.debug("Munge of contigs %s results in %r at %s (len %s).", + [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in ret.parts], + ret.name, ret.alignment, len(ret.seq), + extra={"action": "munge", "contigs": ret.parts, "result": ret}) + return ret + + def align_to_reference(contig) -> Iterable[GenotypedContig]: if contig.ref_seq is None: logger.info("Contig %r not aligned - no reference.", contig.name, @@ -370,7 +378,7 @@ def stitch_2_contigs(left, right): r_st=left_overlap.alignment.r_st, r_ei=right_overlap.alignment.r_ei) - return FrankensteinContig([left_remainder, overlap_contig, right_remainder]) + return combine_contigs([left_remainder, overlap_contig, right_remainder]) def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: @@ -575,11 +583,13 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo def combine(group_ref): contigs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) - logger.debug("Combining these contigs for final output for %r: %s.", - group_ref, - [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in contigs], + ret = combine_contigs(contigs) + logger.info("Combined these contigs for final output for %r: %s," + " resulting in %r at [%s, %s]->[%s, %s].", group_ref, + [repr(x.name) for x in contigs], + ret.name, ret.alignment.q_st, ret.alignment.q_ei, + ret.alignment.r_st, ret.alignment.r_ei, extra={"action": "finalcombine", "contigs": contigs}) - ret = FrankensteinContig(contigs) return ret yield from map(combine, consensus_parts) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index cd7b7b5b9..057c16ee7 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -662,12 +662,12 @@ def test_correct_processing_complex_logs(exact_aligner): list(stitch_consensus(contigs)) messages = list(iterate_messages()) - assert len(messages) == 70 + assert len(messages) == 76 assert all(name == 
"micall.core.contig_stitcher" for name, m in messages) info_messages = [m for name, m in messages if m.levelname == 'INFO'] debug_messages = [m for name, m in messages if m.levelname == 'DEBUG'] - assert len(info_messages) == 40 + assert len(info_messages) == 42 assert len(debug_messages) == len(messages) - len(info_messages) actions = [m.__dict__.get('action', '') + ':' + m.__dict__.get('type', '') @@ -677,11 +677,11 @@ def test_correct_processing_complex_logs(exact_aligner): assert actions == \ ['intro:'] * 16 + \ ['alignment:reversenumber', 'alignment:hitnumber', 'alignment:hit'] * 8 + \ - ['stitchcut:', 'concordance:', 'stitch:'] * 2 + \ + ['stitchcut:', 'concordance:', 'munge:', 'stitch:'] * 2 + \ ['nooverlap:'] + \ - ['stitchcut:', 'concordance:', 'stitch:'] * 2 + \ + ['stitchcut:', 'concordance:', 'munge:', 'stitch:'] * 2 + \ ['nooverlap:'] * 3 + \ - ['finalcombine:'] * 2 + ['munge:', 'finalcombine:'] * 2 def test_main_invocation(exact_aligner, tmp_path, hcv_db): From 9f05e31e92c278c25aceb0ed74fc16f5718842ac Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 7 Dec 2023 15:42:20 -0800 Subject: [PATCH 117/485] Simplify structured logger utility --- micall/core/contig_stitcher.py | 16 +++++++--------- micall/tests/test_contig_stitcher.py | 22 +++++++++------------- micall/utils/structured_logger.py | 24 +++++++++++++----------- 3 files changed, 29 insertions(+), 33 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index d74c1253c..026305fba 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -12,11 +12,10 @@ from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.consensus_aligner import CigarActions -from micall.utils.structured_logger import register_structured_logger logger = logging.getLogger(__name__) -register_structured_logger(logger) + @dataclass class Contig: @@ -239,8 +238,8 @@ def logpart(i, part_name, part, is_rev): " (rev)" if is_rev else 
"", extra={"action": "alignment", "type": "hit", "contig": contig, "part": part, "i": i}) - logger.debug("Part %r of contig %r aligned as %s%s.", i, contig.name, part, - " (rev)" if is_rev else "") + logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name, + part_name, part, " (rev)" if is_rev else "") to_return = connected + reversed_alignments if len(to_return) == 0: @@ -390,8 +389,8 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: # Find overlap. If there isn't one - we are done with the current contig. overlapping_contig = find_overlapping_contig(current, contigs) if not overlapping_contig: - logger.info("Nothing overlaps with %r.", - current.name, extra={"action": "nooverlap", "contig": current}) + logger.info("Nothing overlaps with %r.", current.name, + extra={"action": "nooverlap", "contig": current}) yield current continue @@ -551,8 +550,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig extra={"action": "intro", "contig": contig}) logger.debug("Introduced contig %r (seq = %s) of ref %r, group_ref %r (seq = %s), and length %s.", contig.name, contig.seq, contig.ref_name, - contig.group_ref, contig.ref_seq, len(contig.seq), - extra={"action": "intro", "contig": contig}) + contig.group_ref, contig.ref_seq, len(contig.seq)) aligned = align_all_to_reference(contigs) @@ -589,7 +587,7 @@ def combine(group_ref): [repr(x.name) for x in contigs], ret.name, ret.alignment.q_st, ret.alignment.q_ei, ret.alignment.r_st, ret.alignment.r_ei, - extra={"action": "finalcombine", "contigs": contigs}) + extra={"action": "finalcombine", "contigs": contigs, "result": ret}) return ret yield from map(combine, consensus_parts) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 057c16ee7..091073079 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -6,7 +6,7 @@ from micall.core.contig_stitcher import 
split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main from micall.tests.utils import MockAligner, fixed_random_seed -from micall.utils.structured_logger import iterate_messages +from micall.utils.structured_logger import add_structured_handler from micall.tests.test_denovo import check_hcv_db @@ -655,27 +655,23 @@ def test_correct_processing_complex_logs(exact_aligner): logger = logging.getLogger("micall.core.contig_stitcher") logger.setLevel(logging.DEBUG) + handler = add_structured_handler(logger) - messages = list(iterate_messages()) - assert len(messages) == 0 - + assert len(handler.logs) == 0 list(stitch_consensus(contigs)) + assert len(handler.logs) == 76 - messages = list(iterate_messages()) - assert len(messages) == 76 - assert all(name == "micall.core.contig_stitcher" for name, m in messages) - - info_messages = [m for name, m in messages if m.levelname == 'INFO'] - debug_messages = [m for name, m in messages if m.levelname == 'DEBUG'] + info_messages = [m for m in handler.logs if m.levelname == 'INFO'] + debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG'] assert len(info_messages) == 42 - assert len(debug_messages) == len(messages) - len(info_messages) + assert len(debug_messages) == len(handler.logs) - len(info_messages) actions = [m.__dict__.get('action', '') + ':' + m.__dict__.get('type', '') - for name, m in messages] + for m in handler.logs] actions = [action for action in actions if action != ':'] assert actions == \ - ['intro:'] * 16 + \ + ['intro:'] * 8 + \ ['alignment:reversenumber', 'alignment:hitnumber', 'alignment:hit'] * 8 + \ ['stitchcut:', 'concordance:', 'munge:', 'stitch:'] * 2 + \ ['nooverlap:'] + \ diff --git a/micall/utils/structured_logger.py b/micall/utils/structured_logger.py index 249ce232f..c0f5ae687 100644 --- a/micall/utils/structured_logger.py +++ b/micall/utils/structured_logger.py @@ -1,25 +1,27 @@ 
import logging -from typing import List, Tuple, Iterable - - -LoggerName = str -_structured_logs: List[Tuple[LoggerName, logging.LogRecord]] = [] +from typing import List, Tuple, Iterable, Callable class InMemoryLogHandler(logging.Handler): def __init__(self, name: str, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.name: str = name + self.logs: List[logging.LogRecord] = [] + self.callbacks = [] + def emit(self, record: logging.LogRecord): - _structured_logs.append((self.name, record)) + self.logs.append(record) + for callback in self.callbacks: + callback(record) -def register_structured_logger(logger: logging.Logger): - memory_handler = InMemoryLogHandler(logger.name) - logger.addHandler(memory_handler) + def addCallback(self, callback: Callable[[logging.LogRecord], object]): + self.callbacks.append(callback) -def iterate_messages() -> Iterable[Tuple[LoggerName, logging.LogRecord]]: - yield from _structured_logs +def add_structured_handler(logger: logging.Logger): + memory_handler = InMemoryLogHandler(logger.name) + logger.addHandler(memory_handler) + return memory_handler From d4a30b10441f3cb535db426c38c61a4dbaacaa73 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 11 Dec 2023 15:26:49 -0800 Subject: [PATCH 118/485] Contig stitcher: do not throw away parts of queries --- micall/core/contig_stitcher.py | 58 ++++++++++++---------------- micall/tests/test_contig_stitcher.py | 4 +- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 026305fba..9aa8dbf90 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -30,28 +30,14 @@ class GenotypedContig(Contig): ref_seq: Optional[str] # The sequence of self.group_ref. None in cases where the reference organism is unknown. match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. 
- def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: - """ - Cuts this alignment in two parts with cut_point between them. - Reference sequence is kept untouched. - """ - - cut_point = max(0, cut_point) - match_fraction = self.match_fraction - left = GenotypedContig(name=f'left({self.name})', - seq=self.seq[:ceil(cut_point)], - ref_seq=self.ref_seq, - ref_name=self.ref_name, - group_ref=self.group_ref, - match_fraction=match_fraction) - right = GenotypedContig(name=f'right({self.name})', - seq=self.seq[ceil(cut_point):], - ref_seq=self.ref_seq, - ref_name=self.ref_name, - group_ref=self.group_ref, - match_fraction=match_fraction) - - return (left, right) + def rename(self, new_name: str) -> 'GenotypedContig': + return GenotypedContig( + name = new_name, + seq = self.seq, + ref_name = self.ref_name, + group_ref = self.group_ref, + ref_seq = self.ref_seq, + match_fraction = self.match_fraction) @dataclass @@ -77,24 +63,20 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont """ Cuts this alignment in two parts with cut_point between them. 
""" alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - query_left, query_right = self.query.cut_query(alignment_left.q_ei + 0.5) - alignment_right = alignment_right.translate(0, -1 * alignment_right.q_st) - - return (AlignedContig(query_left, alignment_left, self.reverse), - AlignedContig(query_right, alignment_right, self.reverse)) + left_query = self.query.rename(f"left({self.query.name})") + right_query = self.query.rename(f"right({self.query.name})") + return (AlignedContig(left_query, alignment_left, self.reverse), + AlignedContig(right_query, alignment_right, self.reverse)) def lstrip_query(self) -> 'AlignedContig': alignment = self.alignment.lstrip_query() - q_remainder, query = self.query.cut_query(alignment.q_st - 0.5) - alignment = alignment.translate(0, -1 * alignment.q_st) - return AlignedContig(query, alignment, self.reverse) + return AlignedContig(self.query, alignment, self.reverse) def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() - query, q_remainder = self.query.cut_query(alignment.q_ei + 0.5) - return AlignedContig(query, alignment, self.reverse) + return AlignedContig(self.query, alignment, self.reverse) def overlaps(self, other) -> bool: @@ -177,7 +159,13 @@ def rstrip_query(self): @staticmethod def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: - query_seq = left.rstrip_query().seq + right.lstrip_query().seq + + # query_seq = left.rstrip_query().seq + right.lstrip_query().seq + left = left.rstrip_query() + right = right.lstrip_query() + query_seq = left.seq[:left.alignment.q_ei + 1] \ + + right.seq[right.alignment.q_st:] + match_fraction = min(left.match_fraction, right.match_fraction) ref_name = max([left, right], key=lambda x: x.alignment.ref_length).ref_name query = GenotypedContig(seq=query_seq, @@ -353,7 +341,9 @@ def stitch_2_contigs(left, right): "left_overlap": left_overlap, "right_overlap": right_overlap}) # Align overlapping parts, then recombine based on 
concordance. - aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) + left_seq = left_overlap.seq[left_overlap.alignment.q_st:left_overlap.alignment.q_ei + 1] + right_seq = right_overlap.seq[right_overlap.alignment.q_st:right_overlap.alignment.q_ei + 1] + aligned_left, aligned_right = align_queries(left_seq, right_seq) concordance = calculate_concordance(aligned_left, aligned_right) max_concordance_index = max(range(len(concordance)), key=lambda i: concordance[i]) aligned_left_part = aligned_left[:max_concordance_index] diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 091073079..10853b449 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -499,10 +499,10 @@ def test_stitching_partial_align(exact_aligner): for result in results: assert any(result.seq in contig.seq for contig in contigs) - assert all(x.seq != x.lstrip_query().rstrip_query().seq for x in results) + assert all(x.seq == x.lstrip_query().rstrip_query().seq for x in results) assert { contig.seq for contig in contigs } \ - != { contig.lstrip_query().rstrip_query().seq for contig in results } + == { contig.lstrip_query().rstrip_query().seq for contig in results } def test_partial_align_consensus(exact_aligner): From 21fb8f7650e979c8e2beaa64cc151f178da9f50f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 11 Dec 2023 15:57:00 -0800 Subject: [PATCH 119/485] Contig stitcher: only throw out query on strip() operations --- micall/core/contig_stitcher.py | 47 +++++++++++++++++++--------- micall/tests/test_contig_stitcher.py | 4 +-- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9aa8dbf90..425b95248 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -30,6 +30,30 @@ class GenotypedContig(Contig): ref_seq: Optional[str] # The sequence of self.group_ref. 
None in cases where the reference organism is unknown. match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. + def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: + """ + Cuts this alignment in two parts with cut_point between them. + Reference sequence is kept untouched. + """ + + cut_point = max(0, cut_point) + match_fraction = self.match_fraction + left = GenotypedContig(name=f'left({self.name})', + seq=self.seq[:ceil(cut_point)], + ref_seq=self.ref_seq, + ref_name=self.ref_name, + group_ref=self.group_ref, + match_fraction=match_fraction) + right = GenotypedContig(name=f'right({self.name})', + seq=self.seq[ceil(cut_point):], + ref_seq=self.ref_seq, + ref_name=self.ref_name, + group_ref=self.group_ref, + match_fraction=match_fraction) + + return (left, right) + + def rename(self, new_name: str) -> 'GenotypedContig': return GenotypedContig( name = new_name, @@ -71,12 +95,15 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont def lstrip_query(self) -> 'AlignedContig': alignment = self.alignment.lstrip_query() - return AlignedContig(self.query, alignment, self.reverse) + q_remainder, query = self.query.cut_query(alignment.q_st - 0.5) + alignment = alignment.translate(0, -1 * alignment.q_st) + return AlignedContig(query, alignment, self.reverse) def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() - return AlignedContig(self.query, alignment, self.reverse) + query, q_remainder = self.query.cut_query(alignment.q_ei + 0.5) + return AlignedContig(query, alignment, self.reverse) def overlaps(self, other) -> bool: @@ -159,13 +186,7 @@ def rstrip_query(self): @staticmethod def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: - - # query_seq = left.rstrip_query().seq + right.lstrip_query().seq - left = left.rstrip_query() - right = 
right.lstrip_query() - query_seq = left.seq[:left.alignment.q_ei + 1] \ - + right.seq[right.alignment.q_st:] - + query_seq = left.rstrip_query().seq + right.lstrip_query().seq match_fraction = min(left.match_fraction, right.match_fraction) ref_name = max([left, right], key=lambda x: x.alignment.ref_length).ref_name query = GenotypedContig(seq=query_seq, @@ -327,8 +348,8 @@ def stitch_2_contigs(left, right): # Cut in 4 parts. left_remainder, left_overlap = left.cut_reference(right.alignment.r_st - 0.5) right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) - left_overlap = left_overlap.rstrip_query() - right_overlap = right_overlap.lstrip_query() + left_overlap = left_overlap.rstrip_query().lstrip_query() + right_overlap = right_overlap.lstrip_query().rstrip_query() logger.debug("Stitching %r at %s (len %s) with %r at %s (len %s)." " The left_overlap %r is at %s (len %s)" @@ -341,9 +362,7 @@ def stitch_2_contigs(left, right): "left_overlap": left_overlap, "right_overlap": right_overlap}) # Align overlapping parts, then recombine based on concordance. 
- left_seq = left_overlap.seq[left_overlap.alignment.q_st:left_overlap.alignment.q_ei + 1] - right_seq = right_overlap.seq[right_overlap.alignment.q_st:right_overlap.alignment.q_ei + 1] - aligned_left, aligned_right = align_queries(left_seq, right_seq) + aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) concordance = calculate_concordance(aligned_left, aligned_right) max_concordance_index = max(range(len(concordance)), key=lambda i: concordance[i]) aligned_left_part = aligned_left[:max_concordance_index] diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 10853b449..091073079 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -499,10 +499,10 @@ def test_stitching_partial_align(exact_aligner): for result in results: assert any(result.seq in contig.seq for contig in contigs) - assert all(x.seq == x.lstrip_query().rstrip_query().seq for x in results) + assert all(x.seq != x.lstrip_query().rstrip_query().seq for x in results) assert { contig.seq for contig in contigs } \ - == { contig.lstrip_query().rstrip_query().seq for contig in results } + != { contig.lstrip_query().rstrip_query().seq for contig in results } def test_partial_align_consensus(exact_aligner): From 91ab1939df75f03ea75da79921d021d754c38997 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 12 Jan 2024 15:12:41 -0800 Subject: [PATCH 120/485] Contig stitcher: handle missed None case --- micall/core/denovo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index e1d48c4b6..6fbd71adb 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -52,7 +52,7 @@ def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> ty seq=str(seq), ref_name=ref_name, group_ref=group_ref, - ref_seq=str(ref_seq), + ref_seq=str(ref_seq) if ref_seq is not None else None, match_fraction=match_fraction) From 
522e281705807888593c4f84b21ca3f0c23afc98 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 12 Jan 2024 15:47:51 -0800 Subject: [PATCH 121/485] Contig stitcher: output simplication and small fixes * Remove SyntheticContig concept * Fix setting logging level * Generate short names --- micall/core/contig_stitcher.py | 233 ++++++++++++++++----------- micall/tests/test_contig_stitcher.py | 22 +-- 2 files changed, 139 insertions(+), 116 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 425b95248..4188f674f 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,13 +1,12 @@ -from typing import Iterable, Optional, Tuple, List, Dict +from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal from collections import deque, defaultdict from dataclasses import dataclass -from math import ceil +from math import ceil, floor from mappy import Aligner from functools import cached_property, reduce from itertools import accumulate, takewhile from gotoh import align_it from queue import LifoQueue -from math import floor import logging from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit @@ -17,6 +16,13 @@ logger = logging.getLogger(__name__) +name_generator_state = 0 +def generate_new_name(): + global name_generator_state + name_generator_state += 1 + return f"c{name_generator_state}" + + @dataclass class Contig: name: str @@ -38,13 +44,15 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont cut_point = max(0, cut_point) match_fraction = self.match_fraction - left = GenotypedContig(name=f'left({self.name})', + left_name = generate_new_name() + left = GenotypedContig(name=left_name, seq=self.seq[:ceil(cut_point)], ref_seq=self.ref_seq, ref_name=self.ref_name, group_ref=self.group_ref, match_fraction=match_fraction) - right = GenotypedContig(name=f'right({self.name})', + right_name = generate_new_name() + right = 
GenotypedContig(name=right_name, seq=self.seq[ceil(cut_point):], ref_seq=self.ref_seq, ref_name=self.ref_name, @@ -56,12 +64,12 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont def rename(self, new_name: str) -> 'GenotypedContig': return GenotypedContig( - name = new_name, - seq = self.seq, - ref_name = self.ref_name, - group_ref = self.group_ref, - ref_seq = self.ref_seq, - match_fraction = self.match_fraction) + name=new_name, + seq=self.seq, + ref_name=self.ref_name, + group_ref=self.group_ref, + ref_seq=self.ref_seq, + match_fraction=self.match_fraction) @dataclass @@ -70,40 +78,67 @@ class AlignedContig(GenotypedContig): alignment: CigarHit reverse: bool - def __init__(self, query: GenotypedContig, alignment: CigarHit, reverse: bool): + def __init__(self, + query: GenotypedContig, + alignment: CigarHit, + reverse: bool): self.query = query self.alignment = alignment self.reverse = reverse super().__init__( - seq = query.seq, - name = query.name, - ref_name = query.ref_name, - group_ref = query.group_ref, - ref_seq = query.ref_seq, - match_fraction = query.match_fraction) + seq=query.seq, + name=query.name, + ref_name=query.ref_name, + group_ref=query.group_ref, + ref_seq=query.ref_seq, + match_fraction=query.match_fraction) + + + def modify(self, query: GenotypedContig, alignment: CigarHit) -> 'AlignedContig': + if query.seq == self.query.seq and alignment == self.alignment: + return self + return AlignedContig( + reverse=self.reverse, + query=query, + alignment=alignment) def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: """ Cuts this alignment in two parts with cut_point between them. 
""" alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - left_query = self.query.rename(f"left({self.query.name})") - right_query = self.query.rename(f"right({self.query.name})") - return (AlignedContig(left_query, alignment_left, self.reverse), - AlignedContig(right_query, alignment_right, self.reverse)) + left_query = self.query.rename(generate_new_name()) + right_query = self.query.rename(generate_new_name()) + left = self.modify(left_query, alignment_left) + right = self.modify(right_query, alignment_right) + + logger.debug("Created contigs %r at %s and %r at %s by cutting %r.", + left.name, left.alignment, right.name, right.alignment, self.name, + extra={"action": "cut", "original": self, + "left": left, "right": right}) + + return (left, right) def lstrip_query(self) -> 'AlignedContig': alignment = self.alignment.lstrip_query() q_remainder, query = self.query.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) - return AlignedContig(query, alignment, self.reverse) + result = self.modify(query, alignment) + logger.debug("Contig %r morfed into contig %r, so %s became %s", + self.name, result.name, self.alignment, result.alignment, + extra={"action": "modify", "original": self, "result": result}) + return result def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() query, q_remainder = self.query.cut_query(alignment.q_ei + 0.5) - return AlignedContig(query, alignment, self.reverse) + result = self.modify(query, alignment) + logger.debug("Contig %r morfed into contig %r, so %s became %s", + self.name, result.name, self.alignment, result.alignment, + extra={"action": "modify", "original": self, "result": result}) + return result def overlaps(self, other) -> bool: @@ -117,31 +152,6 @@ def intervals_overlap(x, y): (other.alignment.r_st, other.alignment.r_ei)) -class SyntheticContig(AlignedContig): - """ - Contig that is not really aligned, but its boundaries are known. 
- It is created as a result of overlaps between the real contigs. - """ - # TODO(vitalik): maybe it is worth to realign overlaps to get rid of this special-case class. - - def __init__(self, query: GenotypedContig, r_st: int, r_ei: int): - alignment = CigarHit.from_default_alignment(r_st=r_st, r_ei=r_ei, - q_st=0, q_ei=len(query.seq)-1) - super().__init__(query, alignment, reverse=False) - - - def cut_reference(self, cut_point: float): - raise NotImplementedError("SyntheticContigs cannot be cut because they are not properly aligned") - - - def lstrip_query(self): - return self - - - def rstrip_query(self): - return self - - class FrankensteinContig(AlignedContig): """ Assembled of parts that were not even aligned together, @@ -159,7 +169,9 @@ def __init__(self, parts: List[AlignedContig]): (part.parts if isinstance(part, FrankensteinContig) else [part])] aligned = reduce(FrankensteinContig.munge, self.parts) - super().__init__(aligned.query, aligned.alignment, reverse=aligned.reverse) + super().__init__(query=aligned.query, + alignment=aligned.alignment, + reverse=aligned.reverse) def cut_reference(self, cut_point: float) -> Tuple['FrankensteinContig', 'FrankensteinContig']: @@ -190,7 +202,7 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: match_fraction = min(left.match_fraction, right.match_fraction) ref_name = max([left, right], key=lambda x: x.alignment.ref_length).ref_name query = GenotypedContig(seq=query_seq, - name=f'{left.name}+{right.name}', + name=generate_new_name(), ref_name=ref_name, group_ref=left.group_ref, ref_seq=left.ref_seq, @@ -204,15 +216,20 @@ def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: alignment = left_alignment.connect(right_alignment) assert left.reverse == right.reverse - return AlignedContig(query, alignment, left.reverse) + ret = AlignedContig(reverse=left.reverse, query=query, alignment=alignment) + logger.debug("Munged contigs %r at %s with %r at %s resulting in %r at %s.", + 
left.name, left.alignment, right.name, right.alignment, + ret.name, ret.alignment, extra={"action": "munge", "left": left, + "right": right, "result": ret}) + return ret def combine_contigs(parts: List[AlignedContig]) -> FrankensteinContig: ret = FrankensteinContig(parts) - logger.debug("Munge of contigs %s results in %r at %s (len %s).", - [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in ret.parts], + logger.debug("Created a frankenstein %r at %s (len %s) from %s.", ret.name, ret.alignment, len(ret.seq), - extra={"action": "munge", "contigs": ret.parts, "result": ret}) + [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in ret.parts], + extra={"action": "frankenstein", "contigs": ret.parts, "result": ret}) return ret @@ -241,14 +258,21 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: extra={"action": "alignment", "type": "hitnumber", "contig": contig, "n": len(connected)}) - def logpart(i, part_name, part, is_rev): + def logpart(i, part, is_rev): logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", - i, contig.name, part_name, part.q_st, part.q_ei, part.r_st, part.r_ei, + i, contig.name, part.name, part.alignment.q_st, + part.alignment.q_ei, part.alignment.r_st, part.alignment.r_ei, " (rev)" if is_rev else "", extra={"action": "alignment", "type": "hit", "contig": contig, "part": part, "i": i}) logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name, - part_name, part, " (rev)" if is_rev else "") + part.name, part.alignment, " (rev)" if is_rev else "") + + def make_aligned(query, alignment, is_rev): + return AlignedContig( + query=query, + alignment=alignment, + reverse=is_rev) to_return = connected + reversed_alignments if len(to_return) == 0: @@ -259,20 +283,22 @@ def logpart(i, part_name, part, is_rev): if len(to_return) == 1: is_rev = to_return[0] in reversed_alignments - logpart(0, contig.name, to_return[0], is_rev) - yield AlignedContig(query=contig, alignment=connected[0], 
reverse=is_rev) + part = make_aligned(contig, to_return[0], is_rev) + logpart(0, part, is_rev) + yield part return - for i, single_hit in enumerate(connected + reversed_alignments): - query = GenotypedContig(name=f'part({i}, {contig.name})', + for i, single_hit in enumerate(to_return): + query = GenotypedContig(name=generate_new_name(), seq=contig.seq, ref_name=contig.ref_name, group_ref=contig.group_ref, ref_seq=contig.ref_seq, match_fraction=contig.match_fraction) is_rev = single_hit in reversed_alignments - logpart(i, query.name, single_hit, is_rev) - yield AlignedContig(query=query, alignment=single_hit, reverse=is_rev) + part = make_aligned(query, single_hit, is_rev) + logpart(i, part, is_rev) + yield part def align_all_to_reference(contigs): @@ -311,7 +337,7 @@ def calculate_concordance(left: str, right: str) -> List[float]: The function compares the two strings from both left to right and then right to left, calculating for each position the ratio of matching characters in a window around the - current position. + current position. So position holds a moving avarage score. It's required that the input strings are of the same length. @@ -325,12 +351,15 @@ def calculate_concordance(left: str, right: str) -> List[float]: result: List[float] = [0] * len(left) - def slide(left, right): + def slide(start, end): window_size = 30 scores = deque([0] * window_size, maxlen=window_size) scores_sum = 0 + inputs = list(zip(left, right)) + increment = 1 if start <= end else -1 - for i, (a, b) in enumerate(zip(left, right)): + for i in range(start, end, increment): + (a, b) = inputs[i] current = a == b scores_sum -= scores.popleft() scores_sum += current @@ -338,8 +367,8 @@ def slide(left, right): result[i] += (scores_sum / window_size) / 2 # Slide forward, then in reverse, adding the scores at each position. 
- slide(left, right) - slide(reversed(left), reversed(right)) + slide(0, len(left)) + slide(len(left) - 1, -1) return result @@ -350,6 +379,8 @@ def stitch_2_contigs(left, right): right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) left_overlap = left_overlap.rstrip_query().lstrip_query() right_overlap = right_overlap.lstrip_query().rstrip_query() + left_remainder = left_remainder.rstrip_query() + right_remainder = right_remainder.lstrip_query() logger.debug("Stitching %r at %s (len %s) with %r at %s (len %s)." " The left_overlap %r is at %s (len %s)" @@ -359,34 +390,45 @@ def stitch_2_contigs(left, right): left_overlap.name, left_overlap.alignment, len(left_overlap.seq), right_overlap.name, right_overlap.alignment, len(right_overlap.seq), extra={"action": "stitchcut", "left": left, "right": right, - "left_overlap": left_overlap, "right_overlap": right_overlap}) + "left_overlap": left_overlap, "right_overlap": right_overlap, + "left_remainder": left_remainder, "right_remainder": right_remainder}) # Align overlapping parts, then recombine based on concordance. 
aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) concordance = calculate_concordance(aligned_left, aligned_right) - max_concordance_index = max(range(len(concordance)), key=lambda i: concordance[i]) - aligned_left_part = aligned_left[:max_concordance_index] - aligned_right_part = aligned_right[max_concordance_index:] - overlap_seq = ''.join(c for c in aligned_left_part + aligned_right_part if c != '-') - - average_concordance = sum(concordance) / (len(concordance) or 1) - logger.debug("Average concordance between overlapping parts of %r and %r is %s (full is %s).", - left.name, right.name, average_concordance, concordance, - extra={"action": "concordance", "left": left, "right": right, - "value": concordance, "avg": average_concordance}) + valuator = lambda i: (concordance[i], i if i < len(concordance) / 2 else len(concordance) - i - 1) + max_concordance_index = max(range(len(concordance)), key=valuator) # Return something that can be fed back into the loop. 
- match_fraction = min(left.match_fraction, right.match_fraction) - ref_name = max([left, right], key=lambda x: x.alignment.ref_length).ref_name - overlap_query = GenotypedContig(name=f'overlap({left.name},{right.name})', - ref_name=ref_name, - seq=overlap_seq, group_ref=left.group_ref, - ref_seq=left.ref_seq, match_fraction=match_fraction) - overlap_contig = SyntheticContig(overlap_query, - r_st=left_overlap.alignment.r_st, - r_ei=right_overlap.alignment.r_ei) - - return combine_contigs([left_remainder, overlap_contig, right_remainder]) + without_dashes = lambda s: ''.join(c for c in s if c != '-') + aligned_left_q_index = len(without_dashes(aligned_left[:max_concordance_index])) + aligned_right_q_index = right_overlap.alignment.query_length - len(without_dashes(aligned_right[max_concordance_index:])) + 1 + aligned_left_r_index = left_overlap.alignment.coordinate_mapping.query_to_ref.left_max(aligned_left_q_index) + if aligned_left_r_index is None: + aligned_left_r_index = left_overlap.alignment.r_st - 1 + aligned_right_r_index = right_overlap.alignment.coordinate_mapping.query_to_ref.right_min(aligned_right_q_index) + if aligned_right_r_index is None: + aligned_right_r_index = right_overlap.alignment.r_ei + 1 + left_overlap_take, left_overlap_drop = left_overlap.cut_reference(aligned_left_r_index + 0.5) + right_overlap_drop, right_overlap_take = right_overlap.cut_reference(aligned_right_r_index - 0.5) + + # Log it. + average_concordance = sum(concordance) / (len(concordance) or 1) + concordance_str = ', '.join(map(lambda x: str(round(x, 2)), concordance)), + cut_point_location_scaled = max_concordance_index / (((len(concordance) or 1) - 1) or 1) + logger.debug("Created overlap contigs %r at %s and %r at %s based on parts of %r and %r, with avg. 
concordance %s%%, cut point at %s%%, and full concordance [%s].", + left_overlap_take.name, left_overlap.alignment, right_overlap_take.name, right_overlap_take.alignment, + left.name, right.name, round(average_concordance * 100), + round(cut_point_location_scaled * 100), concordance_str, + extra={"action": "overlap", "left": left, "right": right, + "left_remainder": left_remainder, "right_remainder": right_remainder, + "left_overlap": left_overlap, "right_original": right_overlap, + "left_take": left_overlap_take, "right_take": right_overlap_take, + "concordance": concordance, "avg": average_concordance, + "cut_point": max_concordance_index, + "cut_point_scaled": cut_point_location_scaled}) + + return combine_contigs([left_remainder, left_overlap_take, right_overlap_take, right_remainder]) def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: @@ -617,15 +659,16 @@ def main(args): args = parser.parse_args(args) if args.quiet: - logging.basicConfig(level=logging.ERROR) + logger.setLevel(logging.ERROR) elif args.verbose: - logging.basicConfig(level=logging.INFO) + logger.setLevel(logging.INFO) elif args.debug: - logging.basicConfig(level=logging.DEBUG) + logger.setLevel(logging.DEBUG) else: - logging.basicConfig(level=logging.WARN) + logger.setLevel(logging.WARN) write_contig_refs(args.contigs.name, args.stitched_contigs) + args.contigs.close() args.stitched_contigs.close() diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 091073079..81893d519 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -114,7 +114,6 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): assert 100 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 50 - assert result.query.name == 'left(a)+overlap(a,b)+right(b)' def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_aligner): @@ -151,7 +150,6 @@ def 
test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali assert 100 == len(results[0].seq) assert results[0].seq == 'A' * 50 + 'C' * 50 - assert results[0].query.name == 'left(a)+overlap(a,b)+right(b)' assert results[1].query == contigs[2] @@ -192,7 +190,6 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): assert 200 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 100 + 'T' * 50 - assert result.query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' def test_stitching_with_empty_contigs(exact_aligner): @@ -323,12 +320,10 @@ def test_correct_processing_complex_nogaps(exact_aligner): assert 170 == len(results[0].seq) assert results[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert results[0].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' assert results[0].query.group_ref == 'testref-1' assert 170 == len(results[1].seq) assert results[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert results[1].query.name == 'left(a)+overlap(a,b)+left(right(b))+overlap(left(a)+overlap(a,b)+right(b),c)+right(c)' assert results[1].query.group_ref == 'testref-2' assert results[2].query == contigs[3] @@ -580,7 +575,6 @@ def test_partial_align_consensus_multiple_sequences(exact_aligner): results = list(stitch_consensus(contigs)) assert len(results) == 1 assert results[0].seq == contigs[0].seq + contigs[1].seq - assert results[0].name == 'a+b' def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): @@ -609,7 +603,6 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): assert len(results) == 1 assert results[0].seq == 'T' * 10 + 'A' * 5 + 'C' * 20 + 'T' * 5 + 'A' * 10 + 'G' * 10 assert results[0].seq == contigs[0].seq[:-10] + contigs[1].seq[20:] - assert results[0].name == 'left(a)+overlap(a,b)+right(b)' def test_correct_processing_complex_logs(exact_aligner): @@ -659,26 +652,13 
@@ def test_correct_processing_complex_logs(exact_aligner): assert len(handler.logs) == 0 list(stitch_consensus(contigs)) - assert len(handler.logs) == 76 + assert len(handler.logs) == 248 info_messages = [m for m in handler.logs if m.levelname == 'INFO'] debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG'] assert len(info_messages) == 42 assert len(debug_messages) == len(handler.logs) - len(info_messages) - actions = [m.__dict__.get('action', '') + ':' + m.__dict__.get('type', '') - for m in handler.logs] - actions = [action for action in actions if action != ':'] - - assert actions == \ - ['intro:'] * 8 + \ - ['alignment:reversenumber', 'alignment:hitnumber', 'alignment:hit'] * 8 + \ - ['stitchcut:', 'concordance:', 'munge:', 'stitch:'] * 2 + \ - ['nooverlap:'] + \ - ['stitchcut:', 'concordance:', 'munge:', 'stitch:'] * 2 + \ - ['nooverlap:'] * 3 + \ - ['munge:', 'finalcombine:'] * 2 - def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) From 7b358e2978da5a6e8feec7051e87efd6d3ba1ad1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 12 Jan 2024 16:00:34 -0800 Subject: [PATCH 122/485] Contig stitcher: remove the concept of a FrankensteinContig --- micall/core/contig_stitcher.py | 85 +++++++--------------------- micall/tests/test_contig_stitcher.py | 2 +- 2 files changed, 21 insertions(+), 66 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 4188f674f..e098b6aa6 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -152,84 +152,39 @@ def intervals_overlap(x, y): (other.alignment.r_st, other.alignment.r_ei)) -class FrankensteinContig(AlignedContig): - """ - Assembled of parts that were not even aligned together, - and of some parts that were not aligned at all. - Yet its self.seq string looks like a real contig. 
- """ - - def __init__(self, parts: List[AlignedContig]): - if len(parts) == 0: - raise ValueError("Empty Frankenstei do not exist") - - # Flatten any possible Frankenstein parts - self.parts: List[AlignedContig] = \ - [subpart for part in parts for subpart in - (part.parts if isinstance(part, FrankensteinContig) else [part])] - - aligned = reduce(FrankensteinContig.munge, self.parts) - super().__init__(query=aligned.query, - alignment=aligned.alignment, - reverse=aligned.reverse) - - - def cut_reference(self, cut_point: float) -> Tuple['FrankensteinContig', 'FrankensteinContig']: - # Search for the part that needs to be cut - left_parts = list(takewhile(lambda part: cut_point >= part.alignment.r_ei + 1, self.parts)) - target_part = self.parts[len(left_parts)] - right_parts = self.parts[len(left_parts) + 1:] - - # Cut the target part and add its pieces to left and right. - target_part_left, target_part_right = target_part.cut_reference(cut_point) - left = FrankensteinContig(left_parts + [target_part_left]) - right = FrankensteinContig([target_part_right] + right_parts) - - return (left, right) - - - def lstrip_query(self): - return FrankensteinContig([self.parts[0].lstrip_query()] + self.parts[1:]) - - - def rstrip_query(self): - return FrankensteinContig(self.parts[:-1] + [self.parts[-1].rstrip_query()]) - - - @staticmethod - def munge(left: AlignedContig, right: AlignedContig) -> AlignedContig: - query_seq = left.rstrip_query().seq + right.lstrip_query().seq - match_fraction = min(left.match_fraction, right.match_fraction) - ref_name = max([left, right], key=lambda x: x.alignment.ref_length).ref_name + def munge(self, other: 'AlignedContig') -> 'AlignedContig': + query_seq = self.rstrip_query().seq + other.lstrip_query().seq + match_fraction = min(self.match_fraction, other.match_fraction) + ref_name = max([self, other], key=lambda x: x.alignment.ref_length).ref_name query = GenotypedContig(seq=query_seq, name=generate_new_name(), ref_name=ref_name, - 
group_ref=left.group_ref, - ref_seq=left.ref_seq, + group_ref=self.group_ref, + ref_seq=self.ref_seq, match_fraction=match_fraction) - left_alignment = left.alignment - right_alignment = \ - right.alignment.translate( - query_delta=(-1 * right.alignment.q_st + left.alignment.q_ei + 1), + self_alignment = self.alignment + other_alignment = \ + other.alignment.translate( + query_delta=(-1 * other.alignment.q_st + self.alignment.q_ei + 1), reference_delta=0) - alignment = left_alignment.connect(right_alignment) + alignment = self_alignment.connect(other_alignment) - assert left.reverse == right.reverse - ret = AlignedContig(reverse=left.reverse, query=query, alignment=alignment) + assert self.reverse == other.reverse + ret = AlignedContig(reverse=self.reverse, query=query, alignment=alignment) logger.debug("Munged contigs %r at %s with %r at %s resulting in %r at %s.", - left.name, left.alignment, right.name, right.alignment, - ret.name, ret.alignment, extra={"action": "munge", "left": left, - "right": right, "result": ret}) + self.name, self.alignment, other.name, other.alignment, + ret.name, ret.alignment, extra={"action": "munge", "left": self, + "right": other, "result": ret}) return ret -def combine_contigs(parts: List[AlignedContig]) -> FrankensteinContig: - ret = FrankensteinContig(parts) +def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: + ret = reduce(AlignedContig.munge, parts) logger.debug("Created a frankenstein %r at %s (len %s) from %s.", ret.name, ret.alignment, len(ret.seq), - [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in ret.parts], - extra={"action": "frankenstein", "contigs": ret.parts, "result": ret}) + [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in parts], + extra={"action": "frankenstein", "contigs": parts, "result": ret}) return ret diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 81893d519..da996e991 100644 --- a/micall/tests/test_contig_stitcher.py +++ 
b/micall/tests/test_contig_stitcher.py @@ -652,7 +652,7 @@ def test_correct_processing_complex_logs(exact_aligner): assert len(handler.logs) == 0 list(stitch_consensus(contigs)) - assert len(handler.logs) == 248 + assert len(handler.logs) == 158 info_messages = [m for m in handler.logs if m.levelname == 'INFO'] debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG'] From 4c9829a156e6d271614eae3426c03ae997945f3e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 12 Jan 2024 16:11:58 -0800 Subject: [PATCH 123/485] Contig stitcher: do not munge the final contigs --- micall/core/contig_stitcher.py | 19 ++++++++++++++----- micall/tests/test_contig_stitcher.py | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index e098b6aa6..6168f00b4 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -587,13 +587,22 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo def combine(group_ref): contigs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) - ret = combine_contigs(contigs) + if len(contigs) == 1: + return contigs[0] + + seq = ''.join([contig.seq for contig in contigs]) + match_fraction = min([contig.match_fraction for contig in contigs]) + ret = GenotypedContig(name=generate_new_name(), + seq=seq, ref_name=contigs[0].ref_name, + group_ref=contigs[0].group_ref, + ref_seq=contigs[0].ref_seq, + match_fraction=match_fraction) + logger.info("Combined these contigs for final output for %r: %s," - " resulting in %r at [%s, %s]->[%s, %s].", group_ref, - [repr(x.name) for x in contigs], - ret.name, ret.alignment.q_st, ret.alignment.q_ei, - ret.alignment.r_st, ret.alignment.r_ei, + " resulting in %r of length %s.", group_ref, + [repr(x.name) for x in contigs], ret.name, len(ret.seq), extra={"action": "finalcombine", "contigs": contigs, "result": ret}) + return ret yield from map(combine, consensus_parts) 
diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index da996e991..e3dc83df2 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -652,7 +652,7 @@ def test_correct_processing_complex_logs(exact_aligner): assert len(handler.logs) == 0 list(stitch_consensus(contigs)) - assert len(handler.logs) == 158 + assert len(handler.logs) == 150 info_messages = [m for m in handler.logs if m.levelname == 'INFO'] debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG'] From 14ea3bb2e20e87fb69f0a068549b967099aded7d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 12 Jan 2024 16:38:02 -0800 Subject: [PATCH 124/485] Contig stitcher: ensure no conflicting mappings in overlap --- micall/core/contig_stitcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 6168f00b4..a8dd7ce81 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -364,6 +364,10 @@ def stitch_2_contigs(left, right): aligned_right_r_index = right_overlap.alignment.coordinate_mapping.query_to_ref.right_min(aligned_right_q_index) if aligned_right_r_index is None: aligned_right_r_index = right_overlap.alignment.r_ei + 1 + assert aligned_right_r_index > aligned_left_r_index + if aligned_right_r_index <= aligned_left_r_index: + # This should never happen due to how aligners work, but just to be sure... 
+ aligned_right_r_index = aligned_left_r_index + 1 left_overlap_take, left_overlap_drop = left_overlap.cut_reference(aligned_left_r_index + 0.5) right_overlap_drop, right_overlap_take = right_overlap.cut_reference(aligned_right_r_index - 0.5) From 852742895101f4e08348e9204a37a3b2f42570b5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 15 Jan 2024 10:13:16 -0800 Subject: [PATCH 125/485] Contig stitcher: improve concordance handling --- micall/core/contig_stitcher.py | 43 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index a8dd7ce81..ae81f6c00 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -328,6 +328,26 @@ def slide(start, end): return result +def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned_right, concordance): + valuator = lambda i: (concordance[i], i if i < len(concordance) / 2 else len(concordance) - i - 1) + sorted_concordance_indexes = sorted(range(len(concordance)), key=valuator) + remove_dashes = lambda s: ''.join(c for c in s if c != '-') + + for max_concordance_index in reversed(sorted_concordance_indexes): + aligned_left_q_index = len(remove_dashes(aligned_left[:max_concordance_index])) + aligned_right_q_index = right_overlap.alignment.query_length - len(remove_dashes(aligned_right[max_concordance_index:])) + 1 + aligned_left_r_index = left_overlap.alignment.coordinate_mapping.query_to_ref.left_max(aligned_left_q_index) + if aligned_left_r_index is None: + aligned_left_r_index = left_overlap.alignment.r_st - 1 + aligned_right_r_index = right_overlap.alignment.coordinate_mapping.query_to_ref.right_min(aligned_right_q_index) + if aligned_right_r_index is None: + aligned_right_r_index = right_overlap.alignment.r_ei + 1 + if aligned_right_r_index > aligned_left_r_index: + return (aligned_left_r_index + 0.5, aligned_right_r_index - 0.5, max_concordance_index) + + return 
(left_overlap.alignment.r_st - 1 + 0.5, right_overlap.alignment.r_ei + 1 - 0.5, 0) + + def stitch_2_contigs(left, right): # Cut in 4 parts. left_remainder, left_overlap = left.cut_reference(right.alignment.r_st - 0.5) @@ -351,25 +371,10 @@ def stitch_2_contigs(left, right): # Align overlapping parts, then recombine based on concordance. aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) concordance = calculate_concordance(aligned_left, aligned_right) - valuator = lambda i: (concordance[i], i if i < len(concordance) / 2 else len(concordance) - i - 1) - max_concordance_index = max(range(len(concordance)), key=valuator) - - # Return something that can be fed back into the loop. - without_dashes = lambda s: ''.join(c for c in s if c != '-') - aligned_left_q_index = len(without_dashes(aligned_left[:max_concordance_index])) - aligned_right_q_index = right_overlap.alignment.query_length - len(without_dashes(aligned_right[max_concordance_index:])) + 1 - aligned_left_r_index = left_overlap.alignment.coordinate_mapping.query_to_ref.left_max(aligned_left_q_index) - if aligned_left_r_index is None: - aligned_left_r_index = left_overlap.alignment.r_st - 1 - aligned_right_r_index = right_overlap.alignment.coordinate_mapping.query_to_ref.right_min(aligned_right_q_index) - if aligned_right_r_index is None: - aligned_right_r_index = right_overlap.alignment.r_ei + 1 - assert aligned_right_r_index > aligned_left_r_index - if aligned_right_r_index <= aligned_left_r_index: - # This should never happen due to how aligners work, but just to be sure... 
- aligned_right_r_index = aligned_left_r_index + 1 - left_overlap_take, left_overlap_drop = left_overlap.cut_reference(aligned_left_r_index + 0.5) - right_overlap_drop, right_overlap_take = right_overlap.cut_reference(aligned_right_r_index - 0.5) + aligned_left_cutpoint, aligned_right_cutpoint, max_concordance_index = \ + concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned_right, concordance) + left_overlap_take, left_overlap_drop = left_overlap.cut_reference(aligned_left_cutpoint) + right_overlap_drop, right_overlap_take = right_overlap.cut_reference(aligned_right_cutpoint) # Log it. average_concordance = sum(concordance) / (len(concordance) or 1) From 7e84f61c185a66bcbd37bc091a159f922a3d6fd6 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 16 Jan 2024 17:55:39 -0800 Subject: [PATCH 126/485] Implement visualizer for contig stitcher --- micall/core/contig_stitcher.py | 53 +- micall/core/denovo.py | 24 +- micall/core/plot_contigs.py | 552 +++++++++++++++++- .../tests/data/exact_parts_contigs.plot.svg | 170 ++++++ micall/tests/test_contig_stitcher.py | 28 + micall/tests/test_contig_stitcher_fuzz.py | 22 + 6 files changed, 823 insertions(+), 26 deletions(-) create mode 100644 micall/tests/data/exact_parts_contigs.plot.svg diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index ae81f6c00..1dfed93c6 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -95,8 +95,6 @@ def __init__(self, def modify(self, query: GenotypedContig, alignment: CigarHit) -> 'AlignedContig': - if query.seq == self.query.seq and alignment == self.alignment: - return self return AlignedContig( reverse=self.reverse, query=query, @@ -125,9 +123,11 @@ def lstrip_query(self) -> 'AlignedContig': q_remainder, query = self.query.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) result = self.modify(query, alignment) - logger.debug("Contig %r morfed into contig %r, so %s became 
%s", - self.name, result.name, self.alignment, result.alignment, - extra={"action": "modify", "original": self, "result": result}) + logger.debug("Doing lstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", + self.name, result.name, self.alignment, + len(self.seq), result.alignment, len(result.seq), + extra={"action": "modify", "type": "lstrip", + "original": self, "result": result}) return result @@ -135,9 +135,11 @@ def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() query, q_remainder = self.query.cut_query(alignment.q_ei + 0.5) result = self.modify(query, alignment) - logger.debug("Contig %r morfed into contig %r, so %s became %s", - self.name, result.name, self.alignment, result.alignment, - extra={"action": "modify", "original": self, "result": result}) + logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", + self.name, result.name, self.alignment, + len(self.seq), result.alignment, len(result.seq), + extra={"action": "modify", "type": "rstrip", + "original": self, "result": result}) return result @@ -153,10 +155,9 @@ def intervals_overlap(x, y): def munge(self, other: 'AlignedContig') -> 'AlignedContig': - query_seq = self.rstrip_query().seq + other.lstrip_query().seq match_fraction = min(self.match_fraction, other.match_fraction) ref_name = max([self, other], key=lambda x: x.alignment.ref_length).ref_name - query = GenotypedContig(seq=query_seq, + query = GenotypedContig(seq=self.seq + other.seq, name=generate_new_name(), ref_name=ref_name, group_ref=self.group_ref, @@ -179,12 +180,33 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': return ret +def sliding_window(sequence): + if not sequence: + return + + yield (None, sequence[0], sequence[1] if len(sequence) > 1 else None) + + for i in range(1, len(sequence) - 1): + yield (sequence[i - 1], sequence[i], sequence[i + 1]) + + if len(sequence) > 1: + yield (sequence[-2], sequence[-1], None) + + def combine_contigs(parts: 
List[AlignedContig]) -> AlignedContig: - ret = reduce(AlignedContig.munge, parts) + stripped_parts = [] + for prev_part, part, next_part in sliding_window(parts): + if prev_part is not None: + part = part.lstrip_query() + if next_part is not None: + part = part.rstrip_query() + stripped_parts.append(part) + + ret = reduce(AlignedContig.munge, stripped_parts) logger.debug("Created a frankenstein %r at %s (len %s) from %s.", ret.name, ret.alignment, len(ret.seq), - [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in parts], - extra={"action": "frankenstein", "contigs": parts, "result": ret}) + [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in stripped_parts], + extra={"action": "combine", "contigs": stripped_parts, "result": ret}) return ret @@ -597,6 +619,8 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo def combine(group_ref): contigs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) if len(contigs) == 1: + logger.info("Returning contig %r as is.", contigs[0].name, + extra={"action": "finalreturn", "contig": contigs[0]}) return contigs[0] seq = ''.join([contig.seq for contig in contigs]) @@ -624,6 +648,7 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument('contigs', type=argparse.FileType('r')) parser.add_argument('stitched_contigs', type=argparse.FileType('w')) + parser.add_argument('--plot') verbosity_group = parser.add_mutually_exclusive_group() verbosity_group.add_argument('--verbose', action='store_true', help='Increase output verbosity') verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity', default=True) @@ -640,7 +665,7 @@ def main(args): else: logger.setLevel(logging.WARN) - write_contig_refs(args.contigs.name, args.stitched_contigs) + write_contig_refs(args.contigs.name, args.stitched_contigs, stitcher_plot_path=args.plot) args.contigs.close() args.stitched_contigs.close() diff --git a/micall/core/denovo.py 
b/micall/core/denovo.py index 6fbd71adb..712995aee 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -19,7 +19,9 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig -from micall.core.contig_stitcher import GenotypedContig, stitch_consensus +from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, logger as stitcher_logger +from micall.core.plot_contigs import plot_stitcher_coverage +from micall.utils.structured_logger import add_structured_handler IVA = "iva" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), @@ -59,7 +61,8 @@ def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> ty def write_contig_refs(contigs_fasta_path, contigs_csv, merged_contigs_csv=None, - blast_csv=None): + blast_csv=None, + stitcher_plot_path=None): """ Run BLAST search to identify contig sequences. :param str contigs_fasta_path: path to file to read contig sequences from @@ -80,6 +83,10 @@ def write_contig_refs(contigs_fasta_path, contig_name = f'merged-contig-{i}' contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") group_refs = {} + + logger = logging.getLogger("micall.core.contig_stitcher") + handler = add_structured_handler(logger) + genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs) @@ -93,6 +100,9 @@ def write_contig_refs(contigs_fasta_path, group_ref=contig.group_ref, contig=contig.seq)) + if stitcher_logger.level <= logging.DEBUG and stitcher_plot_path is not None: + plot_stitcher_coverage(handler.logs, stitcher_plot_path) + return len(contigs) @@ -202,7 +212,9 @@ def denovo(fastq1_path: str, contigs_csv: typing.TextIO, work_dir: str = '.', merged_contigs_csv: typing.TextIO = None, - blast_csv: typing.TextIO = None): + blast_csv: typing.TextIO = None, + stitcher_plot_path: typing.Union[str, None] = None, + ): """ Use de novo assembly to build contigs from reads. 
:param fastq1_path: FASTQ file name for read 1 reads @@ -255,7 +267,8 @@ def denovo(fastq1_path: str, duration = datetime.now() - start_time contig_count = write_contig_refs(contigs_fasta_path, contigs_csv, - blast_csv=blast_csv) + blast_csv=blast_csv, + stitcher_plot_path=stitcher_plot_path) logger.info('Assembled %d contigs in %s (%ds) on %s.', contig_count, duration, @@ -269,6 +282,7 @@ def denovo(fastq1_path: str, parser.add_argument('fastq1') parser.add_argument('fastq2') parser.add_argument('contigs', type=argparse.FileType('w')) + parser.add_argument('--stitcher_plot') args = parser.parse_args() - denovo(args.fastq1, args.fastq2, args.contigs) + denovo(args.fastq1, args.fastq2, args.contigs, args.stitcher_plot_path) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 35fee6ace..b99111541 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -12,7 +12,7 @@ from genetracks import Figure, Track, Multitrack, Coverage # noinspection PyPep8Naming import drawsvg as draw -from genetracks.elements import Element +from genetracks.elements import Element, Label from matplotlib import cm, colors from matplotlib.colors import Normalize @@ -20,6 +20,18 @@ from micall.utils.alignment_wrapper import align_nucs +class LeftLabel(Label): + """Like Label, but anchored to the left, instead of the middle. 
+ """ + def draw(self, *args, **kwargs): + d = super().draw(*args, **kwargs) + assert len(d.children) == 1 + text = d.children[0] + text.args['text-anchor'] = 'left' + # text.args['fill'] = 'red' # works by the way + return d + + class SmoothCoverage(Coverage): def __init__(self, a, b, ys, height=10, color='blue', opacity='1.0'): groups = [] @@ -153,17 +165,19 @@ def draw(self, x=0, y=0, xscale=1.0): group.append(draw.Line(line_start, arrow_y, arrow_start, arrow_y, stroke='black')) - group.append(draw.Circle(centre, h/2, r, fill='ivory', stroke='black')) + if self.label is not None: + group.append(draw.Circle(centre, h/2, r, fill='ivory', stroke='black')) group.append(draw.Lines(arrow_end, arrow_y, arrow_start, arrow_y + arrow_size/2, arrow_start, arrow_y - arrow_size/2, arrow_end, arrow_y, fill='black')) - group.append(draw.Text(self.label, - font_size, - centre, h / 2, - text_anchor='middle', - dy="0.35em")) + if self.label is not None: + group.append(draw.Text(self.label, + font_size, + centre, h / 2, + text_anchor='middle', + dy="0.35em")) return group @@ -376,6 +390,530 @@ def build_coverage_figure(genome_coverage_csv, blast_csv=None, use_concordance=F return f +def plot_stitcher_coverage(logs, genome_coverage_svg_path): + f = build_stitcher_figure(logs) + f.show(w=970).save_svg(genome_coverage_svg_path, context=draw.Context(invert_y=True)) + + +from types import SimpleNamespace +from typing import Union, Dict, Tuple, List, Optional, Set +from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig +import logging +import random + +def build_stitcher_figure(logs) -> None: + contig_map: Dict[str, Contig] = {} + name_mappings: Dict[str, str] = {} + parent_graph: Dict[str, List[str]] = defaultdict(list) + morphism_graph: Dict[str, List[str]] = {} + reduced_parent_graph: Dict[str, List[str]] = {} + transitive_parent_graph: Dict[str, List[str]] = {} + discarded: List[str] = [] + unknown: List[str] = [] + anomaly: List[str] = [] + 
overlap_leftparent_map: Dict[str, str] = {} + overlap_rightparent_map: Dict[str, str] = {} + overlap_lefttake_map: Dict[str, str] = {} + overlap_righttake_map: Dict[str, str] = {} + overlap_sibling_map: Dict[str, str] = {} + combine_left_edge: Dict[str, str] = {} + combine_right_edge: Dict[str, str] = {} + synthetic: Set[str] = set() + sinks: Dict[str, bool] = {} + + def get_oldest_ancestors(recur, graph, ancestor_name): + if ancestor_name in recur: + return + else: + recur = recur.copy() + recur.add(ancestor_name) + + if ancestor_name in graph: + existing_ancestors = graph[ancestor_name] + for existing in existing_ancestors: + yield from get_oldest_ancestors(recur, graph, existing) + else: + yield ancestor_name + return + + def reduced_closure(graph): + ret = {} + for parent, children in graph.items(): + lst = [] + for child in children: + for anc in get_oldest_ancestors(set(), graph, child): + if anc not in lst: + lst.append(anc) + ret[parent] = lst + return ret + + def transitive_closure(graph): + def dfs(current_node, start_node): + if current_node not in visited: + visited.add(current_node) + closure[start_node].add(current_node) + for neighbor in graph.get(current_node, []): + dfs(neighbor, start_node) + + closure = {node: set() for node in graph} + for node in graph: + visited = set() + dfs(node, node) + + return {node: list(descendants) for node, descendants in closure.items()} + + def reflexive_closure(graph): + ret = graph.copy() + for parent, children in graph.items(): + if parent not in children: + children.append(parent) + for child in children[:]: + if child not in ret: + ret[child] = [] + lst = ret[child] + if child not in lst: + ret[child].append(child) + return ret + + def inverse_graph(graph): + ret = {} + for parent, children in graph.items(): + for child in children: + if child not in ret: + ret[child] = [] + ret[child].append(parent) + return ret + + def graph_sum(graph_a, graph_b): + ret = graph_a.copy() + for key, values in graph_b.items(): + 
if key not in ret: + ret[key] = [] + for value in values: + lst = ret[key] + if value not in lst: + lst.append(value) + return ret + + def symmetric_closure(graph): + return graph_sum(graph, inverse_graph(graph)) + + def record_contig(contig: Contig, parents: List[Contig]): + contig_map[contig.name] = contig + if [contig.name] != [parent.name for parent in parents]: + for parent in parents: + contig_map[parent.name] = parent + parent_graph[contig.name].append(parent.name) + + def record_morphism(contig: Contig, original: Contig): + if original.name not in morphism_graph: + morphism_graph[original.name] = [] + lst = morphism_graph[original.name] + if contig.name not in lst: + lst.append(contig.name) + + def unwrap_final(contig): + yield contig + + for event in logs: + if not hasattr(event, "action"): + pass + elif event.action == "finalcombine": + record_contig(event.result, event.contigs) + elif event.action == "splitgap": + record_contig(event.left, [event.contig]) + record_contig(event.right, [event.contig]) + elif event.action == "intro": + record_contig(event.contig, []) + elif event.action == "alignment": + if event.type == "hit": + record_contig(event.part, [event.contig]) + if event.part.reverse: + anomaly.append(event.part.name) + elif event.type == "noref": + unknown.append(event.contig.name) + elif event.type == "zerohits": + anomaly.append(event.contig.name) + elif event.type in ("hitnumber", "reversenumber"): + pass + else: + raise RuntimeError(f"Unrecognized event of type {event.type!r}: {event}") + elif event.action == "munge": + record_contig(event.result, [event.left, event.right]) + elif event.action == "modify": + record_contig(event.result, [event.original]) + record_morphism(event.result, event.original) + elif event.action == "overlap": + synthetic.add(event.left_take.name) + synthetic.add(event.right_take.name) + overlap_leftparent_map[event.left_remainder.name] = event.left.name + overlap_rightparent_map[event.right_remainder.name] = 
event.right.name + overlap_lefttake_map[event.left_remainder.name] = event.left_take.name + overlap_righttake_map[event.right_remainder.name] = event.right_take.name + overlap_sibling_map[event.left_remainder.name] = event.right_remainder.name + overlap_sibling_map[event.right_remainder.name] = event.left_remainder.name + elif event.action == "drop": + discarded.append(event.contig.name) + elif event.action == "stitchcut": + record_contig(event.left_overlap, [event.left]) + record_contig(event.left_remainder, [event.left]) + record_contig(event.right_overlap, [event.right]) + record_contig(event.right_remainder, [event.right]) + elif event.action == "stitch": + record_contig(event.result, [event.left, event.right]) + elif event.action == "cut": + record_contig(event.left, [event.original]) + record_contig(event.right, [event.original]) + elif event.action == "combine": + record_contig(event.result, event.contigs) + combine_left_edge[event.result.name] = event.contigs[0].name + combine_right_edge[event.result.name] = event.contigs[-1].name + elif event.action in ("ignoregap", "nooverlap", "finalreturn"): + pass + else: + raise RuntimeError(f"Unrecognized action: {event.action}") + + group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq} + children_graph = inverse_graph(parent_graph) + reduced_parent_graph = reduced_closure(parent_graph) + reduced_children_graph = reduced_closure(children_graph) + transitive_parent_graph = transitive_closure(parent_graph) + sorted_roots = list(sorted(parent_name for + parent_name in contig_map + if parent_name not in parent_graph)) + + eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_closure(morphism_graph))) + reduced_morphism_graph = reduced_closure(morphism_graph) + + # Closing `synthetic' + for contig in contig_map: + if contig in synthetic: + for clone in eqv_morphism_graph.get(contig, []): + synthetic.add(clone) + + def copy_takes_one_side(edge_table, 
overlap_xtake_map, overlap_xparent_map): + for parent in edge_table: + child_remainder = edge_table[parent] + for child_remainder_morph in eqv_morphism_graph.get(child_remainder, [child_remainder]): + if child_remainder_morph in overlap_xtake_map: + continue + + for parent_morph in eqv_morphism_graph.get(parent, [parent]): + for parent_remainder in overlap_xparent_map: + if overlap_xparent_map[parent_remainder] == parent_morph: + overlap_xtake_map[child_remainder_morph] = overlap_xtake_map[parent_remainder] + yield True + + # Closing `takes` by parents + while list(copy_takes_one_side(combine_right_edge, overlap_lefttake_map, overlap_leftparent_map)): pass + while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map)): pass + + final_parts: Dict[str, bool] = {} + for contig in contig_map: + if contig in synthetic: + continue + + if contig in overlap_sibling_map: + finals = reduced_morphism_graph.get(contig, [contig]) + if len(finals) == 1: + [final] = finals + parents = reduced_parent_graph.get(final, []) + if len(parents) == 1: + final_parts[final] = True + + elif contig in discarded or contig in anomaly or contig in unknown: + final_parts[contig] = True + + final_parent_mapping: Dict[str, List[str]] = {} + for parent_name in sorted_roots: + children = [] + for final_contig in final_parts: + if final_contig == parent_name or \ + parent_name in reduced_parent_graph.get(final_contig, []): + children.append(final_contig) + + final_parent_mapping[parent_name] = children + + min_position, max_position = 1, 1 + position_offset = 100 + for contig in contig_map.values(): + if isinstance(contig, GenotypedContig) and contig.ref_seq is not None: + max_position = max(max_position, len(contig.ref_seq) + 3 * position_offset) + else: + max_position = max(max_position, len(contig.seq) + 3 * position_offset) + + name_mappings = {} + + for i, (parent, children) in enumerate(sorted(final_parent_mapping.items(), key=lambda p: p[0])): + 
name_mappings[parent] = f"{i + 1}" + mapped_children = [child for child in children] + for k, child in enumerate(mapped_children): + if len(mapped_children) > 1: + name_mappings[child] = f"{i + 1}.{k + 1}" + else: + name_mappings[child] = f"{i + 1}" + + for child in discarded + anomaly + unknown: + if child not in children: + if child in transitive_parent_graph \ + and parent in transitive_parent_graph[child]: + k += 1 + name_mappings[child] = f"{i + 1}.{k + 1}" + + def get_neighbours(part, lookup): + for clone in eqv_morphism_graph.get(part.name, [part.name]): + maybe_name = lookup.get(clone, None) + if maybe_name is not None: + yield contig_map[maybe_name] + + def get_final_version(contig): + name = reduced_morphism_graph.get(contig.name, [contig.name])[0] # FIXME: why 0??? + return contig_map[name] + + def get_neighbour(part, lookup): + if not part: return None + lst = list(get_neighbours(part, lookup)) + ret = max(map(get_final_version, lst), key=lambda contig: contig.alignment.ref_length, default=None) + return ret + + aligned_size_map: Dict[str, Tuple[int, int]] = {} + full_size_map: Dict[str, Tuple[int, int]] = {} + + for parent_name in sorted_roots: + parts = final_parent_mapping[parent_name] + parts = [contig_map[part] for part in parts] + + for part in parts: + if not isinstance(part, AlignedContig): + continue + + prev_part = get_neighbour(part, overlap_righttake_map) + next_part = get_neighbour(part, overlap_lefttake_map) + + if prev_part is not None: + r_st = prev_part.alignment.r_st + position_offset + else: + start_delta = -1 * part.alignment.q_st + r_st = part.alignment.r_st + start_delta + position_offset + + if next_part is not None: + r_ei = next_part.alignment.r_ei + position_offset + else: + end_delta = len(part.seq) - part.alignment.q_ei + r_ei = part.alignment.r_ei + end_delta + position_offset + + aligned_size_map[part.name] = (r_st, r_ei) + + sibling = ([overlap_sibling_map[name] for name in eqv_morphism_graph.get(part.name, [part.name]) if 
name in overlap_sibling_map] or [None])[0] + sibling = sibling and contig_map[sibling] + prev_part = get_neighbour(sibling, overlap_lefttake_map) + next_part = get_neighbour(sibling, overlap_righttake_map) + + if prev_part is not None and prev_part.alignment.r_ei < part.alignment.r_st and prev_part: + r_st = prev_part.alignment.r_st + position_offset + else: + start_delta = -1 * part.alignment.q_st + r_st = part.alignment.r_st + start_delta + position_offset + + if next_part is not None and next_part.alignment.r_st > part.alignment.r_ei and next_part: + r_ei = next_part.alignment.r_ei + position_offset + else: + end_delta = len(part.seq) - part.alignment.q_ei + r_ei = part.alignment.r_ei + end_delta + position_offset + + full_size_map[part.name] = (r_st, r_ei) + + def get_contig_coordinates(contig): + if isinstance(contig, AlignedContig): + r_st = position_offset + contig.alignment.r_st + r_ei = position_offset + contig.alignment.r_ei + if contig.name in aligned_size_map: + a_r_st, a_r_ei = aligned_size_map[contig.name] + else: + a_r_st = r_st + a_r_ei = r_ei + if contig.name in full_size_map: + f_r_st, f_r_ei = full_size_map[contig.name] + else: + f_r_st = r_st - contig.alignment.q_st + f_r_ei = r_ei + (len(contig.seq) - contig.alignment.q_ei) + else: + f_r_st = position_offset + f_r_ei = position_offset + len(contig.seq) + a_r_st = f_r_st + a_r_ei = f_r_ei + return (a_r_st, a_r_ei, f_r_st, f_r_ei) + + def get_tracks(group_ref, contig_name): + parts = final_parent_mapping[contig_name] + for part_name in parts: + part = contig_map[part_name] + + if not isinstance(part, AlignedContig): + continue + + if part.group_ref != group_ref: + continue + + indexes = name_mappings[part.name] + (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) + yield Track(f_r_st, f_r_ei, label=f"{indexes}") + + def get_arrows(group_ref, contig_name, labels): + parts = final_parent_mapping[contig_name] + for part_name in parts: + part = contig_map[part_name] + + if not 
isinstance(part, AlignedContig): + continue + + if part.group_ref != group_ref: + continue + + indexes = name_mappings[part.name] if labels else None + height = 20 if labels else 1 + elevation = 1 if labels else -20 + (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) + yield Arrow(a_r_st, a_r_ei, + elevation=elevation, + h=height, + label=indexes) + + def get_all_arrows(group_ref, labels): + for parent_name in sorted_roots: + yield from get_arrows(group_ref, parent_name, labels) + + ################ + # Drawing part # + ################ + + landmarks_path = (Path(__file__).parent.parent / "data" / + "landmark_references.yaml") + landmark_groups = yaml.safe_load(landmarks_path.read_text()) + projects = ProjectConfig.loadDefault() + figure = Figure() + for group_ref in group_refs: + matching_groups = [group for group in landmark_groups if group['coordinates'] == group_ref] + if matching_groups: + reference_set = matching_groups[0] + elif "HIV1" in group_ref: + matching_groups = [group for group in landmark_groups if group['coordinates'] == "HIV1-B-FR-K03455-seed"] + reference_set = matching_groups[0] + else: + reference_set = None + add_partial_banner(figure, position_offset, max_position) + continue + + ############# + # Landmarks # + ############# + + if reference_set: + prev_landmark = None + for i, landmark in enumerate(sorted(reference_set['landmarks'], + key=itemgetter('start'))): + landmark.setdefault('frame', 0) + if prev_landmark and 'end' not in prev_landmark: + prev_landmark['end'] = landmark['start'] - 1 + prev_landmark = landmark + for frame, frame_landmarks in groupby(reference_set['landmarks'], + itemgetter('frame')): + subtracks = [] + for landmark in frame_landmarks: + landmark_colour = landmark.get('colour') + if landmark_colour is None: + continue + subtracks.append(Track(landmark['start'] + position_offset, + landmark['end'] + position_offset, + label=landmark['name'], + color=landmark_colour)) + max_position = max(max_position, + 
landmark['end'] + position_offset) + figure.add(Multitrack(subtracks)) + + r_st = position_offset + r_ei = position_offset + group_refs[group_ref] + figure.add(Track(r_st, r_ei, label=f"{group_ref}")) + + ########## + # Arrows # + ########## + + ref_arrows = list(get_all_arrows(group_ref, labels=True)) + if ref_arrows: + figure.add(ArrowGroup(ref_arrows)) + + ########### + # Contigs # + ########### + + for parent_name in sorted_roots: + arrows = list(get_arrows(group_ref, parent_name, labels=False)) + if arrows: + figure.add(ArrowGroup(arrows)) + parts = list(get_tracks(group_ref, parent_name)) + if parts: + figure.add(Multitrack(parts)) + + ############# + # Discarded # + ############# + + if discarded: + label = LeftLabel(text=f"discards:", x=0, font_size=12) + pos = position_offset / 2 + figure.add(Track(pos, pos, h=40, label=label)) + for contig_name in discarded: + contig = contig_map[contig_name] + (r_st, r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) + name = name_mappings.get(contig.name, contig.name) + figure.add(Arrow(r_st, r_ei, elevation=-20, h=1)) + figure.add(Track(f_r_st, f_r_ei, label=name)) + + ############# + # Anomalies # + ############# + + if anomaly: + label = LeftLabel(text=f"anomaly:", x=0, font_size=12) + pos = position_offset / 2 + figure.add(Track(pos, pos, h=40, label=label)) + for contig_name in anomaly: + contig = contig_map[contig_name] + (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) + if isinstance(contig, AlignedContig): + colour = "lightgray" + if contig.reverse: + figure.add(Arrow(a_r_ei, a_r_st, elevation=-20, h=1)) + else: + figure.add(Arrow(a_r_st, a_r_ei, elevation=-20, h=1)) + else: + colour = "red" + + name = name_mappings.get(contig.name, contig.name) + figure.add(Track(a_r_st, a_r_ei, color=colour, label=name)) + + ########### + # Unknown # + ########### + + if unknown: + label = LeftLabel(text=f"unknown:", x=0, font_size=12) + pos = position_offset / 2 + figure.add(Track(pos, pos, h=40, 
label=label)) + for contig_name in unknown: + contig = contig_map[contig_name] + r_st = position_offset + r_ei = position_offset + len(contig.seq) + colour = "red" + name = name_mappings.get(contig.name, contig.name) + figure.add(Track(r_st, r_ei, color=colour, label=name)) + + if not figure.elements: + figure.add(Track(1, max_position, label='No contigs found.', color='none')) + return figure + + def map_references(contig_ref_name: str, coordinates_name: str, projects: ProjectConfig) -> typing.Mapping[int, int]: diff --git a/micall/tests/data/exact_parts_contigs.plot.svg b/micall/tests/data/exact_parts_contigs.plot.svg new file mode 100644 index 000000000..23dae0cc1 --- /dev/null +++ b/micall/tests/data/exact_parts_contigs.plot.svg @@ -0,0 +1,170 @@ + + + + + + + + +5' LTR + + + + + +gag + + + + + +vif + + + + + +tat + + + + + +nef + + + + + + + +tat + + + + + +vpu + + + + + +rev + + + + + +3' LTR + + + + + + + +pol + + + + + +vpr + + + + + +rev + + + + + +env + + + + + + + +PR + + + + + +RT + + + + + +INT + + + + + +V3 + + + + + +GP41 + + + + + + +HIV1-B-FR-K03455-seed + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index e3dc83df2..a6ca2528e 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -680,6 +680,34 @@ def test_main_invocation(exact_aligner, tmp_path, hcv_db): assert stitched_data == expected_data, "The contents of the stitched contigs file do not match the expected contents." 
+def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): + pwd = os.path.dirname(__file__) + contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") + stitched_contigs = os.path.join(tmp_path, "stitched.csv") + plot = os.path.join(tmp_path, "exact_parts_contigs.plot.svg") + main([contigs, stitched_contigs, "--debug", "--plot", plot]) + + assert os.path.exists(contigs) + assert os.path.exists(stitched_contigs) + + # Check the contents of stitched_contigs + with open(stitched_contigs, 'r') as stitched_file: + stitched_data = stitched_file.read() + + expected_file_path = os.path.join(pwd, "data", "exact_parts_contigs_stitched.csv") + with open(expected_file_path, 'r') as expected_file: + expected_data = expected_file.read() + assert stitched_data == expected_data, "The contents of the stitched contigs file do not match the expected contents." + + # Check the contents of stitched_contigs + expected_plot = os.path.join(pwd, "data", "exact_parts_contigs.plot.svg") + with open(plot, 'r') as stitched_file, \ + open(expected_plot, 'r') as expected_file: + stitched_data = stitched_file.read() + expected_data = expected_file.read() + assert stitched_data == expected_data, "The contents of the stitched plot file do not match the expected contents." 
+ + # _ _ _ _ _ _ # | | | |_ __ (_) |_ | |_ ___ ___| |_ ___ # | | | | '_ \| | __| | __/ _ \/ __| __/ __| diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index 9846826d6..e80bcb51a 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -2,10 +2,13 @@ import json import os from micall.core.contig_stitcher import GenotypedContig, AlignedContig, stitch_consensus, stitch_contigs, split_contigs_with_gaps, drop_completely_covered, combine_overlaps +from micall.core.plot_contigs import build_stitcher_figure from micall.utils.cigar_tools import CigarHit, Cigar from micall.utils.consensus_aligner import CigarActions +from micall.utils.structured_logger import add_structured_handler from typing import Dict, List from collections import defaultdict +import logging @pytest.fixture @@ -110,3 +113,22 @@ def test_stitching_intervals_prop(no_aligner, description): # Checks that there are no overlaps between contigs assert all(v == 1 for (k, v) in stitched_positions.items()) + + +@pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) +def test_visualizer_simple(no_aligner, description): + contigs = description['contigs'] + for contig in contigs: + contig.group_ref = "HIV1-B-FR-K03455-seed" + + logger = logging.getLogger("micall.core.contig_stitcher") + logger.setLevel(logging.DEBUG) + handler = add_structured_handler(logger) + + stitched = list(stitch_consensus(contigs)) + + assert logger.level <= logging.DEBUG + assert len(handler.logs) >= len(contigs) + + figure = build_stitcher_figure(handler.logs) + assert len(figure.elements) > len(contigs) + 1 From 82904a9f69ee7f9eb16a2710fc557d27d23db4bf Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 16 Jan 2024 17:43:49 -0800 Subject: [PATCH 127/485] Contig stitcher: improve boundaries of cut parts --- micall/core/contig_stitcher.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 
deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 1dfed93c6..ddd965f24 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -105,15 +105,31 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont """ Cuts this alignment in two parts with cut_point between them. """ alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - left_query = self.query.rename(generate_new_name()) - right_query = self.query.rename(generate_new_name()) + + left_query = GenotypedContig( + name=generate_new_name(), + seq=self.seq[:alignment_right.q_st], + ref_name=self.ref_name, + group_ref=self.group_ref, + ref_seq=self.ref_seq, + match_fraction=self.match_fraction) + right_query = GenotypedContig( + name=generate_new_name(), + seq=self.seq[alignment_left.q_ei + 1:], + ref_name=self.ref_name, + group_ref=self.group_ref, + ref_seq=self.ref_seq, + match_fraction=self.match_fraction) + + alignment_right = alignment_right.translate(0, -1 * (alignment_left.q_ei + 1)) + left = self.modify(left_query, alignment_left) right = self.modify(right_query, alignment_right) - logger.debug("Created contigs %r at %s and %r at %s by cutting %r.", - left.name, left.alignment, right.name, right.alignment, self.name, - extra={"action": "cut", "original": self, - "left": left, "right": right}) + logger.debug("Created contigs %r at %s (len %s) and %r at %s (len %s) by cutting %r.", + left.name, left.alignment, len(left.seq), right.name, right.alignment, + len(right.seq), self.name, extra={"action": "cut", "original": self, + "left": left, "right": right}) return (left, right) From 53ad6c1b9d5ae702862303278212d7558b311d80 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 16 Jan 2024 18:09:38 -0800 Subject: [PATCH 128/485] Contig stitcher: make all structures frozen --- micall/core/contig_stitcher.py | 26 +++++++++++------------ micall/tests/test_contig_stitcher_fuzz.py | 4 ++-- 2 files 
changed, 14 insertions(+), 16 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index ddd965f24..f6119374a 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -23,13 +23,13 @@ def generate_new_name(): return f"c{name_generator_state}" -@dataclass +@dataclass(frozen=True) class Contig: name: str seq: str -@dataclass +@dataclass(frozen=True) class GenotypedContig(Contig): ref_name: str group_ref: str @@ -72,20 +72,18 @@ def rename(self, new_name: str) -> 'GenotypedContig': match_fraction=self.match_fraction) -@dataclass +@dataclass(frozen=True) class AlignedContig(GenotypedContig): query: GenotypedContig alignment: CigarHit reverse: bool - def __init__(self, - query: GenotypedContig, - alignment: CigarHit, - reverse: bool): - self.query = query - self.alignment = alignment - self.reverse = reverse - super().__init__( + @staticmethod + def make(query: GenotypedContig, alignment: CigarHit, reverse: bool): + return AlignedContig( + query=query, + alignment=alignment, + reverse=reverse, seq=query.seq, name=query.name, ref_name=query.ref_name, @@ -95,7 +93,7 @@ def __init__(self, def modify(self, query: GenotypedContig, alignment: CigarHit) -> 'AlignedContig': - return AlignedContig( + return AlignedContig.make( reverse=self.reverse, query=query, alignment=alignment) @@ -188,7 +186,7 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': alignment = self_alignment.connect(other_alignment) assert self.reverse == other.reverse - ret = AlignedContig(reverse=self.reverse, query=query, alignment=alignment) + ret = AlignedContig.make(reverse=self.reverse, query=query, alignment=alignment) logger.debug("Munged contigs %r at %s with %r at %s resulting in %r at %s.", self.name, self.alignment, other.name, other.alignment, ret.name, ret.alignment, extra={"action": "munge", "left": self, @@ -262,7 +260,7 @@ def logpart(i, part, is_rev): part.name, part.alignment, " (rev)" if is_rev else "") def 
make_aligned(query, alignment, is_rev): - return AlignedContig( + return AlignedContig.make( query=query, alignment=alignment, reverse=is_rev) diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index e80bcb51a..83e643321 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -37,7 +37,7 @@ def read_contigs(line): match_fraction=2/3, ) alignment = CigarHit(Cigar([(length, CigarActions.MATCH)]), q_st=20, q_ei=20+length-1, r_st=start, r_ei=end) - contig = AlignedContig(query=query, alignment=alignment, reverse=False) + contig = AlignedContig.make(query=query, alignment=alignment, reverse=False) aidee = f'{start:03d}-{end:03d}' yield {'contig': contig, 'id': aidee} @@ -119,7 +119,7 @@ def test_stitching_intervals_prop(no_aligner, description): def test_visualizer_simple(no_aligner, description): contigs = description['contigs'] for contig in contigs: - contig.group_ref = "HIV1-B-FR-K03455-seed" + contig.__dict__["group_ref"] = "HIV1-B-FR-K03455-seed" logger = logging.getLogger("micall.core.contig_stitcher") logger.setLevel(logging.DEBUG) From 0a65fb39eb20b812b2a0319ad16a3d9c66baf634 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 16 Jan 2024 18:13:40 -0800 Subject: [PATCH 129/485] Contig stitcher: remove unused code --- micall/core/contig_stitcher.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index f6119374a..30d066e01 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -62,16 +62,6 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont return (left, right) - def rename(self, new_name: str) -> 'GenotypedContig': - return GenotypedContig( - name=new_name, - seq=self.seq, - ref_name=self.ref_name, - group_ref=self.group_ref, - ref_seq=self.ref_seq, - match_fraction=self.match_fraction) - - 
@dataclass(frozen=True) class AlignedContig(GenotypedContig): query: GenotypedContig @@ -92,13 +82,6 @@ def make(query: GenotypedContig, alignment: CigarHit, reverse: bool): match_fraction=query.match_fraction) - def modify(self, query: GenotypedContig, alignment: CigarHit) -> 'AlignedContig': - return AlignedContig.make( - reverse=self.reverse, - query=query, - alignment=alignment) - - def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: """ Cuts this alignment in two parts with cut_point between them. """ @@ -121,8 +104,8 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont alignment_right = alignment_right.translate(0, -1 * (alignment_left.q_ei + 1)) - left = self.modify(left_query, alignment_left) - right = self.modify(right_query, alignment_right) + left = AlignedContig.make(query=left_query, alignment=alignment_left, reverse=self.reverse) + right = AlignedContig.make(query=right_query, alignment=alignment_right, reverse=self.reverse) logger.debug("Created contigs %r at %s (len %s) and %r at %s (len %s) by cutting %r.", left.name, left.alignment, len(left.seq), right.name, right.alignment, @@ -136,7 +119,7 @@ def lstrip_query(self) -> 'AlignedContig': alignment = self.alignment.lstrip_query() q_remainder, query = self.query.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) - result = self.modify(query, alignment) + result = AlignedContig.make(query, alignment, self.reverse) logger.debug("Doing lstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", self.name, result.name, self.alignment, len(self.seq), result.alignment, len(result.seq), @@ -148,7 +131,7 @@ def lstrip_query(self) -> 'AlignedContig': def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() query, q_remainder = self.query.cut_query(alignment.q_ei + 0.5) - result = self.modify(query, alignment) + result = AlignedContig.make(query, alignment, self.reverse) 
logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", self.name, result.name, self.alignment, len(self.seq), result.alignment, len(result.seq), From 99dd324a64d7b1fc208c657036214490c45691ad Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 10:43:12 -0800 Subject: [PATCH 130/485] Contig stitcher: throw away non-prime-end unaligned parts --- micall/core/contig_stitcher.py | 24 +++++------------------- micall/core/plot_contigs.py | 2 +- micall/tests/test_contig_stitcher.py | 4 ++-- 3 files changed, 8 insertions(+), 22 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 30d066e01..5d3efe3dd 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -615,25 +615,11 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo def combine(group_ref): contigs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) - if len(contigs) == 1: - logger.info("Returning contig %r as is.", contigs[0].name, - extra={"action": "finalreturn", "contig": contigs[0]}) - return contigs[0] - - seq = ''.join([contig.seq for contig in contigs]) - match_fraction = min([contig.match_fraction for contig in contigs]) - ret = GenotypedContig(name=generate_new_name(), - seq=seq, ref_name=contigs[0].ref_name, - group_ref=contigs[0].group_ref, - ref_seq=contigs[0].ref_seq, - match_fraction=match_fraction) - - logger.info("Combined these contigs for final output for %r: %s," - " resulting in %r of length %s.", group_ref, - [repr(x.name) for x in contigs], ret.name, len(ret.seq), - extra={"action": "finalcombine", "contigs": contigs, "result": ret}) - - return ret + result = combine_contigs(contigs) + logger.debug("Combining these contigs for final output for %r: %s.", + group_ref, [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in contigs], + extra={"action": "finalcombine", "contigs": contigs, "result": result}) + return result yield from 
map(combine, consensus_parts) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index b99111541..757c20fe7 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -568,7 +568,7 @@ def unwrap_final(contig): record_contig(event.result, event.contigs) combine_left_edge[event.result.name] = event.contigs[0].name combine_right_edge[event.result.name] = event.contigs[-1].name - elif event.action in ("ignoregap", "nooverlap", "finalreturn"): + elif event.action in ("ignoregap", "nooverlap"): pass else: raise RuntimeError(f"Unrecognized action: {event.action}") diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index a6ca2528e..4e314db92 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -652,11 +652,11 @@ def test_correct_processing_complex_logs(exact_aligner): assert len(handler.logs) == 0 list(stitch_consensus(contigs)) - assert len(handler.logs) == 150 + assert len(handler.logs) == 158 info_messages = [m for m in handler.logs if m.levelname == 'INFO'] debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG'] - assert len(info_messages) == 42 + assert len(info_messages) == 40 assert len(debug_messages) == len(handler.logs) - len(info_messages) From fc3cd69e9ee58fd7d0e0fe1ddda93692a55010d0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 13:37:27 -0800 Subject: [PATCH 131/485] Contig stitcher: revert to the simpler version of cut_reference --- micall/core/contig_stitcher.py | 36 ++++++++++------------------------ 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 5d3efe3dd..fed913f32 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,6 +1,6 @@ from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal from collections import deque, defaultdict -from dataclasses import dataclass +from 
dataclasses import dataclass, replace from math import ceil, floor from mappy import Aligner from functools import cached_property, reduce @@ -86,31 +86,15 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont """ Cuts this alignment in two parts with cut_point between them. """ alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - - left_query = GenotypedContig( - name=generate_new_name(), - seq=self.seq[:alignment_right.q_st], - ref_name=self.ref_name, - group_ref=self.group_ref, - ref_seq=self.ref_seq, - match_fraction=self.match_fraction) - right_query = GenotypedContig( - name=generate_new_name(), - seq=self.seq[alignment_left.q_ei + 1:], - ref_name=self.ref_name, - group_ref=self.group_ref, - ref_seq=self.ref_seq, - match_fraction=self.match_fraction) - - alignment_right = alignment_right.translate(0, -1 * (alignment_left.q_ei + 1)) - - left = AlignedContig.make(query=left_query, alignment=alignment_left, reverse=self.reverse) - right = AlignedContig.make(query=right_query, alignment=alignment_right, reverse=self.reverse) - - logger.debug("Created contigs %r at %s (len %s) and %r at %s (len %s) by cutting %r.", - left.name, left.alignment, len(left.seq), right.name, right.alignment, - len(right.seq), self.name, extra={"action": "cut", "original": self, - "left": left, "right": right}) + left_query = replace(self.query, name=generate_new_name()) + right_query = replace(self.query, name=generate_new_name()) + left = AlignedContig.make(left_query, alignment_left, reverse=self.reverse) + right = AlignedContig.make(right_query, alignment_right, reverse=self.reverse) + + logger.debug("Created contigs %r at %s and %r at %s by cutting %r.", + left.name, left.alignment, right.name, right.alignment, self.name, + extra={"action": "cut", "original": self, + "left": left, "right": right}) return (left, right) From 6bb959dacd0f0aa6c01b222c45b4dda09b46445f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 
14:05:52 -0800 Subject: [PATCH 132/485] Contig stitcher: do not duplicate query in AlignedContig --- micall/core/contig_stitcher.py | 10 ++++------ micall/tests/test_contig_stitcher.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index fed913f32..7a14cc3db 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -64,14 +64,12 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont @dataclass(frozen=True) class AlignedContig(GenotypedContig): - query: GenotypedContig alignment: CigarHit reverse: bool @staticmethod def make(query: GenotypedContig, alignment: CigarHit, reverse: bool): return AlignedContig( - query=query, alignment=alignment, reverse=reverse, seq=query.seq, @@ -86,8 +84,8 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont """ Cuts this alignment in two parts with cut_point between them. 
""" alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - left_query = replace(self.query, name=generate_new_name()) - right_query = replace(self.query, name=generate_new_name()) + left_query = replace(self, name=generate_new_name()) + right_query = replace(self, name=generate_new_name()) left = AlignedContig.make(left_query, alignment_left, reverse=self.reverse) right = AlignedContig.make(right_query, alignment_right, reverse=self.reverse) @@ -101,7 +99,7 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont def lstrip_query(self) -> 'AlignedContig': alignment = self.alignment.lstrip_query() - q_remainder, query = self.query.cut_query(alignment.q_st - 0.5) + q_remainder, query = self.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) result = AlignedContig.make(query, alignment, self.reverse) logger.debug("Doing lstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", @@ -114,7 +112,7 @@ def lstrip_query(self) -> 'AlignedContig': def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() - query, q_remainder = self.query.cut_query(alignment.q_ei + 0.5) + query, q_remainder = self.cut_query(alignment.q_ei + 0.5) result = AlignedContig.make(query, alignment, self.reverse) logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", self.name, result.name, self.alignment, diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 4e314db92..e8ac45110 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -151,7 +151,7 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali assert 100 == len(results[0].seq) assert results[0].seq == 'A' * 50 + 'C' * 50 - assert results[1].query == contigs[2] + assert results[1].seq == contigs[2].seq def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): @@ -234,7 
+234,8 @@ def test_stitching_of_identical_contigs(exact_aligner): results = list(stitch_contigs(contigs)) assert len(results) == 1 - assert results[0].query == contigs[2] + assert results[0].name == contigs[2].name + assert results[0].seq == contigs[2].seq def test_stitching_of_zero_contigs(exact_aligner): @@ -320,14 +321,14 @@ def test_correct_processing_complex_nogaps(exact_aligner): assert 170 == len(results[0].seq) assert results[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert results[0].query.group_ref == 'testref-1' + assert results[0].group_ref == 'testref-1' assert 170 == len(results[1].seq) assert results[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert results[1].query.group_ref == 'testref-2' + assert results[1].group_ref == 'testref-2' - assert results[2].query == contigs[3] - assert results[3].query == contigs[7] + assert results[2].seq == contigs[3].seq + assert results[3].seq == contigs[7].seq def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): @@ -358,8 +359,8 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): # Test to ensure that the final result contains the contig 'b' and # does not contain the completely covered contig 'a'. 
- assert results[0].query.name == 'b' - assert results[0].query == contigs[1] + assert results[0].name == 'b' + assert results[0].seq == contigs[1].seq def test_stitching_contig_with_big_noncovered_gap(exact_aligner): From 3a260ce62096d008b72ef54eaf2c5449c845307b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 14:35:04 -0800 Subject: [PATCH 133/485] Contig stitcher: more documentation --- micall/core/contig_stitcher.py | 94 ++++++++++++++++++++++++++++++---- 1 file changed, 84 insertions(+), 10 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 7a14cc3db..b68cee0a8 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,4 +1,4 @@ -from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal +from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal, TypeVar from collections import deque, defaultdict from dataclasses import dataclass, replace from math import ceil, floor @@ -37,10 +37,7 @@ class GenotypedContig(Contig): match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: - """ - Cuts this alignment in two parts with cut_point between them. - Reference sequence is kept untouched. - """ + """ Cuts query sequence in two parts with cut_point between them. """ cut_point = max(0, cut_point) match_fraction = self.match_fraction @@ -98,6 +95,11 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont def lstrip_query(self) -> 'AlignedContig': + """ + Trims the query sequence of the contig from its beginning up to the start of the + alignment. The CIGAR alignment is also updated to reflect the trimming. 
+ """ + alignment = self.alignment.lstrip_query() q_remainder, query = self.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) @@ -111,6 +113,11 @@ def lstrip_query(self) -> 'AlignedContig': def rstrip_query(self) -> 'AlignedContig': + """ + Trims the query sequence of the contig from its end based on the end of the + alignment. The CIGAR alignment is also updated to reflect the trimming. + """ + alignment = self.alignment.rstrip_query() query, q_remainder = self.cut_query(alignment.q_ei + 0.5) result = AlignedContig.make(query, alignment, self.reverse) @@ -134,6 +141,11 @@ def intervals_overlap(x, y): def munge(self, other: 'AlignedContig') -> 'AlignedContig': + """ + Combines two adjacent contigs into a single contig by joining their + query sequences and alignments. + """ + match_fraction = min(self.match_fraction, other.match_fraction) ref_name = max([self, other], key=lambda x: x.alignment.ref_length).ref_name query = GenotypedContig(seq=self.seq + other.seq, @@ -159,7 +171,16 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': return ret -def sliding_window(sequence): +T = TypeVar("T") + +def sliding_window(sequence: Iterable[T]) -> Iterable[Tuple[Optional[T], T, Optional[T]]]: + """ + Generate a three-element sliding window of a sequence. + + Each element generated contains a tuple with the previous item (None if the first item), + the current item, and the next item (None if the last item) in the sequence. + """ + if not sequence: return @@ -173,6 +194,13 @@ def sliding_window(sequence): def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: + """ + Combine a list of contigs into a single AlignedContig by trimming and merging overlapping parts. + + Left-trimming and right-trimming occur at any shared overlapping points + between adjacent parts. AlignedContig.munge() is used to combine contiguous parts without overlap. 
+ """ + stripped_parts = [] for prev_part, part, next_part in sliding_window(parts): if prev_part is not None: @@ -190,6 +218,13 @@ def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: def align_to_reference(contig) -> Iterable[GenotypedContig]: + """ + Align a single Contig to its reference sequence, producing potentially multiple aligned contigs. + + If the reference sequence (ref_seq) is unavailable, the contig is returned unaltered. + Otherwise, alignments are performed and contigs corresponding to each alignment are yielded. + """ + if contig.ref_seq is None: logger.info("Contig %r not aligned - no reference.", contig.name, extra={"action": "alignment", "type": "noref", "contig": contig}) @@ -258,10 +293,19 @@ def make_aligned(query, alignment, is_rev): def align_all_to_reference(contigs): + """ + Align multiple contigs to their respective reference sequences. + + Applies align_to_reference to each contig in the given collection, + flattening the result into a single list. + """ + return [contig for parts in map(align_to_reference, contigs) for contig in parts] def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: + """ Globally align two query sequences against each other and return the resulting aligned sequences in MSA format. """ + gap_open_penalty = 15 gap_extend_penalty = 3 use_terminal_gap_penalty = 1 @@ -276,15 +320,24 @@ def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: def find_all_overlapping_contigs(self, aligned_contigs): + """" + Yield all contigs from a collection that overlap with a given contig. + Contigs are considered overlapping if they have overlapping intervals on the same reference genome. + """ + for other in aligned_contigs: if self.overlaps(other): yield other def find_overlapping_contig(self, aligned_contigs): + """ + Find the single contig in a collection that overlaps the most with a given contig. + It returns the contig with the maximum overlapped reference length with the given contig (self). 
+ """ + every = find_all_overlapping_contigs(self, aligned_contigs) - return max(every, key=lambda other: other.alignment.ref_length if other else 0, - default=None) + return max(every, key=lambda other: other.alignment.ref_length if other else 0, default=None) def calculate_concordance(left: str, right: str) -> List[float]: @@ -330,6 +383,8 @@ def slide(start, end): def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned_right, concordance): + """ Determine optimal cut points for stitching based on sequence concordance in the overlap region. """ + valuator = lambda i: (concordance[i], i if i < len(concordance) / 2 else len(concordance) - i - 1) sorted_concordance_indexes = sorted(range(len(concordance)), key=valuator) remove_dashes = lambda s: ''.join(c for c in s if c != '-') @@ -350,6 +405,14 @@ def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned def stitch_2_contigs(left, right): + """ + Stitch two contigs together into a single coherent contig. + + The function handles the overlap by cutting both contigs into segments, aligning the + overlapping segments, and then choosing the optimal stitching points based on sequence + concordance. Non-overlapping segments are retained as is. + """ + # Cut in 4 parts. left_remainder, left_overlap = left.cut_reference(right.alignment.r_st - 0.5) right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) @@ -397,6 +460,11 @@ def stitch_2_contigs(left, right): def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: + """" + Repeatedly combine all overlapping aligned contigs into an iterable collection of contiguous AlignedContigs. + It proceeds by iterating through sorted contigs and stitching any overlapping ones until none are left. + """ + # Going left-to-right through aligned contigs. 
contigs = list(sorted(contigs, key=lambda x: x.alignment.r_st)) while contigs: @@ -500,6 +568,13 @@ def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig] def split_contigs_with_gaps(contigs: List[AlignedContig]) -> List[AlignedContig]: + """ + Split contigs at large gaps if those gaps are covered by other contigs in the list. + + A gap within a contig is considered large based on a pre-defined threshold. If another contig aligns + within that gap's range, the contig is split into two around the midpoint of the gap. + """ + def covered_by(gap, other): # Check if any 1 reference coordinate in gap is mapped in other. gap_coords = gap.coordinate_mapping.ref_to_query.domain @@ -507,8 +582,7 @@ def covered_by(gap, other): return not gap_coords.isdisjoint(cover_coords) def covered(contig, gap): - return any(covered_by(gap, other) for other in contigs - if other != contig) + return any(covered_by(gap, other) for other in contigs if other != contig) def significant(gap): return gap.ref_length > 5 From 722e04b90e611aa2c52834db301a08300ea7c271 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 14:55:11 -0800 Subject: [PATCH 134/485] Contig stitcher: simplify cut_query implementation --- micall/core/contig_stitcher.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index b68cee0a8..335530a58 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -40,22 +40,8 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont """ Cuts query sequence in two parts with cut_point between them. 
""" cut_point = max(0, cut_point) - match_fraction = self.match_fraction - left_name = generate_new_name() - left = GenotypedContig(name=left_name, - seq=self.seq[:ceil(cut_point)], - ref_seq=self.ref_seq, - ref_name=self.ref_name, - group_ref=self.group_ref, - match_fraction=match_fraction) - right_name = generate_new_name() - right = GenotypedContig(name=right_name, - seq=self.seq[ceil(cut_point):], - ref_seq=self.ref_seq, - ref_name=self.ref_name, - group_ref=self.group_ref, - match_fraction=match_fraction) - + left = replace(self, name=generate_new_name(), seq=self.seq[:ceil(cut_point)]) + right = replace(self, name=generate_new_name(), seq=self.seq[ceil(cut_point):]) return (left, right) From 88b2fdc7e6ab936084e8876b6c154431e038f191 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 15:43:11 -0800 Subject: [PATCH 135/485] Contig stitcher: simplify sliding_window implementation --- micall/core/contig_stitcher.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 335530a58..b1c709357 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -4,7 +4,7 @@ from math import ceil, floor from mappy import Aligner from functools import cached_property, reduce -from itertools import accumulate, takewhile +from itertools import accumulate, takewhile, tee, islice, chain from gotoh import align_it from queue import LifoQueue import logging @@ -167,16 +167,10 @@ def sliding_window(sequence: Iterable[T]) -> Iterable[Tuple[Optional[T], T, Opti the current item, and the next item (None if the last item) in the sequence. 
""" - if not sequence: - return - - yield (None, sequence[0], sequence[1] if len(sequence) > 1 else None) - - for i in range(1, len(sequence) - 1): - yield (sequence[i - 1], sequence[i], sequence[i + 1]) - - if len(sequence) > 1: - yield (sequence[-2], sequence[-1], None) + a, b, c = tee(sequence, 3) + prevs = chain([None], a) + nexts = chain(islice(c, 1, None), [None]) + return zip(prevs, b, nexts) def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: From e203e9a359a4443d83005fca442b9f7131326718 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 15:26:41 -0800 Subject: [PATCH 136/485] Contig stitcher: replace field "reverse":bool by "strand":enum --- micall/core/contig_stitcher.py | 40 +++++++++++------------ micall/core/plot_contigs.py | 4 +-- micall/tests/test_contig_stitcher_fuzz.py | 2 +- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index b1c709357..08de60f8b 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -48,13 +48,13 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont @dataclass(frozen=True) class AlignedContig(GenotypedContig): alignment: CigarHit - reverse: bool + strand: Literal["forward", "reverse"] @staticmethod - def make(query: GenotypedContig, alignment: CigarHit, reverse: bool): + def make(query: GenotypedContig, alignment: CigarHit, strand: Literal["forward", "reverse"]): return AlignedContig( alignment=alignment, - reverse=reverse, + strand=strand, seq=query.seq, name=query.name, ref_name=query.ref_name, @@ -67,10 +67,8 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont """ Cuts this alignment in two parts with cut_point between them. 
""" alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - left_query = replace(self, name=generate_new_name()) - right_query = replace(self, name=generate_new_name()) - left = AlignedContig.make(left_query, alignment_left, reverse=self.reverse) - right = AlignedContig.make(right_query, alignment_right, reverse=self.reverse) + left = replace(self, name=generate_new_name(), alignment=alignment_left) + right = replace(self, name=generate_new_name(), alignment=alignment_right) logger.debug("Created contigs %r at %s and %r at %s by cutting %r.", left.name, left.alignment, right.name, right.alignment, self.name, @@ -89,7 +87,7 @@ def lstrip_query(self) -> 'AlignedContig': alignment = self.alignment.lstrip_query() q_remainder, query = self.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) - result = AlignedContig.make(query, alignment, self.reverse) + result = AlignedContig.make(query, alignment, self.strand) logger.debug("Doing lstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", self.name, result.name, self.alignment, len(self.seq), result.alignment, len(result.seq), @@ -106,7 +104,7 @@ def rstrip_query(self) -> 'AlignedContig': alignment = self.alignment.rstrip_query() query, q_remainder = self.cut_query(alignment.q_ei + 0.5) - result = AlignedContig.make(query, alignment, self.reverse) + result = AlignedContig.make(query, alignment, self.strand) logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", self.name, result.name, self.alignment, len(self.seq), result.alignment, len(result.seq), @@ -148,8 +146,8 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': reference_delta=0) alignment = self_alignment.connect(other_alignment) - assert self.reverse == other.reverse - ret = AlignedContig.make(reverse=self.reverse, query=query, alignment=alignment) + assert self.strand == other.strand + ret = AlignedContig.make(query=query, alignment=alignment, 
strand=self.strand) logger.debug("Munged contigs %r at %s with %r at %s resulting in %r at %s.", self.name, self.alignment, other.name, other.alignment, ret.name, ret.alignment, extra={"action": "munge", "left": self, @@ -213,10 +211,10 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) - hits_array = [(CigarHit(Cigar(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1), x.strand == -1) - for x in alignments] - reversed_alignments = [alignment for alignment, is_rev in hits_array if is_rev] - alignments = [alignment for alignment, is_rev in hits_array if not is_rev] + hits_array = [(CigarHit(Cigar(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1), + "forward" if x.strand == 1 else "reverse") for x in alignments] + reversed_alignments = [alignment for alignment, strand in hits_array if strand == "reverse"] + alignments = [alignment for alignment, strand in hits_array if strand == "forward"] logger.info("Contig %r produced %s reverse-complement alignments.", contig.name, len(reversed_alignments), @@ -243,7 +241,7 @@ def make_aligned(query, alignment, is_rev): return AlignedContig.make( query=query, alignment=alignment, - reverse=is_rev) + strand=is_rev) to_return = connected + reversed_alignments if len(to_return) == 0: @@ -253,7 +251,7 @@ def make_aligned(query, alignment, is_rev): return if len(to_return) == 1: - is_rev = to_return[0] in reversed_alignments + is_rev = "forward" if to_return[0] in alignments else "reverse" part = make_aligned(contig, to_return[0], is_rev) logpart(0, part, is_rev) yield part @@ -266,7 +264,7 @@ def make_aligned(query, alignment, is_rev): group_ref=contig.group_ref, ref_seq=contig.ref_seq, match_fraction=contig.match_fraction) - is_rev = single_hit in reversed_alignments + is_rev = "forward" if single_hit in alignments else "reverse" part = make_aligned(query, single_hit, is_rev) logpart(i, part, is_rev) yield part @@ -629,8 
+627,8 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig aligned = [x for x in aligned if isinstance(x, AlignedContig)] # Contigs aligned in reverse do not need any more processing - yield from (x for x in aligned if x.reverse) - aligned = [x for x in aligned if not x.reverse] + yield from (x for x in aligned if x.strand == "reverse") + aligned = [x for x in aligned if x.strand == "forward"] aligned = split_contigs_with_gaps(aligned) aligned = drop_completely_covered(aligned) @@ -644,7 +642,7 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo consensus_parts: Dict[GroupRef, List[AlignedContig]] = defaultdict(list) for contig in contigs: - if isinstance(contig, AlignedContig) and not contig.reverse: + if isinstance(contig, AlignedContig) and contig.strand == "forward": consensus_parts[contig.group_ref].append(contig) else: yield contig diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 757c20fe7..cca8444fb 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -528,7 +528,7 @@ def unwrap_final(contig): elif event.action == "alignment": if event.type == "hit": record_contig(event.part, [event.contig]) - if event.part.reverse: + if event.part.strand != "forward": anomaly.append(event.part.name) elif event.type == "noref": unknown.append(event.contig.name) @@ -883,7 +883,7 @@ def get_all_arrows(group_ref, labels): (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) if isinstance(contig, AlignedContig): colour = "lightgray" - if contig.reverse: + if contig.strand == "reverse": figure.add(Arrow(a_r_ei, a_r_st, elevation=-20, h=1)) else: figure.add(Arrow(a_r_st, a_r_ei, elevation=-20, h=1)) diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index 83e643321..f47033450 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -37,7 +37,7 @@ def 
read_contigs(line): match_fraction=2/3, ) alignment = CigarHit(Cigar([(length, CigarActions.MATCH)]), q_st=20, q_ei=20+length-1, r_st=start, r_ei=end) - contig = AlignedContig.make(query=query, alignment=alignment, reverse=False) + contig = AlignedContig.make(query=query, alignment=alignment, strand="forward") aidee = f'{start:03d}-{end:03d}' yield {'contig': contig, 'id': aidee} From 199d44f182673e4ebe0ea48caa1d0b2a11592f17 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 16:23:21 -0800 Subject: [PATCH 137/485] Contig stitcher: improve handling of reverse complement alignments --- micall/core/contig_stitcher.py | 85 +++++++++++++--------------- micall/core/plot_contigs.py | 4 +- micall/tests/test_contig_stitcher.py | 4 +- 3 files changed, 44 insertions(+), 49 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 08de60f8b..c99b8239f 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -7,6 +7,7 @@ from itertools import accumulate, takewhile, tee, islice, chain from gotoh import align_it from queue import LifoQueue +from Bio import Seq import logging from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit @@ -213,60 +214,52 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: alignments = list(aligner.map(contig.seq)) hits_array = [(CigarHit(Cigar(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1), "forward" if x.strand == 1 else "reverse") for x in alignments] - reversed_alignments = [alignment for alignment, strand in hits_array if strand == "reverse"] - alignments = [alignment for alignment, strand in hits_array if strand == "forward"] - logger.info("Contig %r produced %s reverse-complement alignments.", - contig.name, len(reversed_alignments), - extra={"action": "alignment", "type": "reversenumber", - "contig": contig, "n": len(reversed_alignments)}) + connected = connect_cigar_hits(list(map(lambda p: p[0], hits_array))) if hits_array else [] - 
connected = connect_cigar_hits(alignments) if alignments else [] - - logger.info("Contig %r produced %s forward alignments.", contig.name, len(connected), - extra={"action": "alignment", "type": "hitnumber", - "contig": contig, "n": len(connected)}) - - def logpart(i, part, is_rev): - logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", - i, contig.name, part.name, part.alignment.q_st, - part.alignment.q_ei, part.alignment.r_st, part.alignment.r_ei, - " (rev)" if is_rev else "", - extra={"action": "alignment", "type": "hit", - "contig": contig, "part": part, "i": i}) - logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name, - part.name, part.alignment, " (rev)" if is_rev else "") - - def make_aligned(query, alignment, is_rev): - return AlignedContig.make( - query=query, - alignment=alignment, - strand=is_rev) - - to_return = connected + reversed_alignments - if len(to_return) == 0: - logger.info("Contig %r not aligned - backend choice.", contig.name, + if not connected: + logger.info("Contig %r not aligned - backend's choice.", contig.name, extra={"action": "alignment", "type": "zerohits", "contig": contig}) yield contig return - if len(to_return) == 1: - is_rev = "forward" if to_return[0] in alignments else "reverse" - part = make_aligned(contig, to_return[0], is_rev) - logpart(0, part, is_rev) - yield part + if len(set(map(lambda p: p[1], hits_array))) > 1: + logger.info("Discarding contig %r because it aligned both in forward and reverse sense.", contig.name, + extra={"action": "alignment", "type": "strandconflict", "contig": contig}) + yield contig return - for i, single_hit in enumerate(to_return): - query = GenotypedContig(name=generate_new_name(), - seq=contig.seq, - ref_name=contig.ref_name, - group_ref=contig.group_ref, - ref_seq=contig.ref_seq, - match_fraction=contig.match_fraction) - is_rev = "forward" if single_hit in alignments else "reverse" - part = make_aligned(query, single_hit, is_rev) - logpart(i, part, 
is_rev) + logger.info("Contig %r produced %s aligner hits. After connecting them, the number became %s.", + contig.name, len(hits_array), len(connected), + extra={"action": "alignment", "type": "hitnumber", "contig": contig, + "initial": hits_array, "connected": connected}) + + strand = hits_array[0][1] + if strand == "reverse": + rc = str(Seq(contig.seq).reverse_complement()) + new_contig = replace(contig, seq=rc) + logger.info("Reverse complemented contig %r.", contig.name, + extra={"action": "alignment", "type": "reversecomplement", + "contig": contig, "result": new_contig}) + contig = new_contig + + for i, single_hit in enumerate(connected): + if len(connected) == 1: + query = contig + else: + query = replace(contig, name=generate_new_name()) + + part = AlignedContig.make(query, single_hit, strand) + + logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", + i, contig.name,part.name,part.alignment.q_st, + part.alignment.q_ei,part.alignment.r_st,part.alignment.r_ei, + " (rev)" if strand == "reverse" else "", + extra={"action": "alignment", "type": "hit", + "contig": contig, "part":part, "i": i}) + logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name, + part.name,part.alignment, " (rev)" if strand == "reverse" else "") + yield part diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index cca8444fb..6f86730ab 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -532,8 +532,10 @@ def unwrap_final(contig): anomaly.append(event.part.name) elif event.type == "noref": unknown.append(event.contig.name) - elif event.type == "zerohits": + elif event.type == "zerohits" or event.type == "strandconflict": anomaly.append(event.contig.name) + elif event.type == "reversecomplement": + record_contig(event.new_contig, [event.contig]) elif event.type in ("hitnumber", "reversenumber"): pass else: diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 
e8ac45110..49f19b4a5 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -653,11 +653,11 @@ def test_correct_processing_complex_logs(exact_aligner): assert len(handler.logs) == 0 list(stitch_consensus(contigs)) - assert len(handler.logs) == 158 + assert len(handler.logs) == 150 info_messages = [m for m in handler.logs if m.levelname == 'INFO'] debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG'] - assert len(info_messages) == 40 + assert len(info_messages) == 32 assert len(debug_messages) == len(handler.logs) - len(info_messages) From c4537ed5dd2d1af5409147bb970c926da774bc27 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 17 Jan 2024 16:25:43 -0800 Subject: [PATCH 138/485] Contig stitcher: make sure that mappy coordinates are not reversed --- micall/core/contig_stitcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index c99b8239f..4d17b2794 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -212,7 +212,11 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) - hits_array = [(CigarHit(Cigar(x.cigar), x.r_st, x.r_en - 1, x.q_st, x.q_en - 1), + hits_array = [(CigarHit(Cigar(x.cigar), + min(x.r_st, x.r_en - 1), + max(x.r_st, x.r_en - 1), + min(x.q_st, x.q_en - 1), + max(x.q_st, x.q_en - 1)), "forward" if x.strand == 1 else "reverse") for x in alignments] connected = connect_cigar_hits(list(map(lambda p: p[0], hits_array))) if hits_array else [] From f068e2f87850189adf572e3ff3d1ab3852b29ef7 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 18 Jan 2024 17:00:07 -0800 Subject: [PATCH 139/485] Contig stitcher: fix visualisation of non-overlapping contigs --- micall/core/plot_contigs.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 
deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 6f86730ab..016c08482 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -404,7 +404,7 @@ def plot_stitcher_coverage(logs, genome_coverage_svg_path): def build_stitcher_figure(logs) -> None: contig_map: Dict[str, Contig] = {} name_mappings: Dict[str, str] = {} - parent_graph: Dict[str, List[str]] = defaultdict(list) + parent_graph: Dict[str, List[str]] = {} morphism_graph: Dict[str, List[str]] = {} reduced_parent_graph: Dict[str, List[str]] = {} transitive_parent_graph: Dict[str, List[str]] = {} @@ -420,6 +420,7 @@ def build_stitcher_figure(logs) -> None: combine_right_edge: Dict[str, str] = {} synthetic: Set[str] = set() sinks: Dict[str, bool] = {} + returned: List[str] = [] def get_oldest_ancestors(recur, graph, ancestor_name): if ancestor_name in recur: @@ -503,6 +504,9 @@ def record_contig(contig: Contig, parents: List[Contig]): if [contig.name] != [parent.name for parent in parents]: for parent in parents: contig_map[parent.name] = parent + if contig.name not in parent_graph: + parent_graph[contig.name] = [] + parent_graph[contig.name].append(parent.name) def record_morphism(contig: Contig, original: Contig): @@ -512,13 +516,11 @@ def record_morphism(contig: Contig, original: Contig): if contig.name not in lst: lst.append(contig.name) - def unwrap_final(contig): - yield contig - for event in logs: if not hasattr(event, "action"): pass elif event.action == "finalcombine": + for part in event.contigs: returned.append(part.name) record_contig(event.result, event.contigs) elif event.action == "splitgap": record_contig(event.left, [event.contig]) @@ -626,6 +628,16 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): elif contig in discarded or contig in anomaly or contig in unknown: final_parts[contig] = True + for contig in returned: + [contig] = reduced_morphism_graph.get(contig, [contig]) + + transitive_parent = 
transitive_parent_graph.get(contig, []) + if any(parent in transitive_parent for parent in final_parts): + continue + + final_parts[contig] = True + + final_parent_mapping: Dict[str, List[str]] = {} for parent_name in sorted_roots: children = [] @@ -645,12 +657,18 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): max_position = max(max_position, len(contig.seq) + 3 * position_offset) name_mappings = {} + def part_relative_position(name): + part = contig_map[name] + if isinstance(part, AlignedContig): + return part.alignment.q_st + else: + return -1 for i, (parent, children) in enumerate(sorted(final_parent_mapping.items(), key=lambda p: p[0])): name_mappings[parent] = f"{i + 1}" - mapped_children = [child for child in children] - for k, child in enumerate(mapped_children): - if len(mapped_children) > 1: + children = list(sorted(children, key=part_relative_position)) + for k, child in enumerate(children): + if len(children) > 1: name_mappings[child] = f"{i + 1}.{k + 1}" else: name_mappings[child] = f"{i + 1}" From a82247266d4a1b957ad4aa0d569acdad59ef3527 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 18 Jan 2024 17:17:18 -0800 Subject: [PATCH 140/485] Contig stitcher: strip unaligned parts earlier In particular, strip those unaligned parts that surround out-of-order alignment hits. 
--- micall/core/contig_stitcher.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 4d17b2794..9c345a58b 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -267,6 +267,26 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: yield part +def strip_conflicting_mappings(contigs): + contigs = list(contigs) + names = {contig.name: contig for contig in contigs} + reference_indexes = list(sorted(names.keys(), key=lambda name: names[name].alignment.r_st if isinstance(names[name], AlignedContig) else -1)) + query_indexes = list(sorted(names.keys(), key=lambda name: names[name].alignment.q_st if isinstance(names[name], AlignedContig) else -1)) + + def is_out_of_order(name): + return reference_indexes.index(name) != query_indexes.index(name) + + sorted_by_query = list(sorted(contigs, key=lambda contig: contig.alignment.q_st if isinstance(contig, AlignedContig) else -1)) + + for prev_contig, contig, next_contig in sliding_window(sorted_by_query): + name = contig.name + if prev_contig is not None or is_out_of_order(name): + contig = contig.lstrip_query() + if next_contig is not None or is_out_of_order(name): + contig = contig.rstrip_query() + yield contig + + def align_all_to_reference(contigs): """ Align multiple contigs to their respective reference sequences. @@ -275,7 +295,7 @@ def align_all_to_reference(contigs): flattening the result into a single list. 
""" - return [contig for parts in map(align_to_reference, contigs) for contig in parts] + return [contig for parts in map(strip_conflicting_mappings, map(align_to_reference, contigs)) for contig in parts] def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: From 5710038c46125250e7616c45f40d8641e8528bba Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 18 Jan 2024 17:29:50 -0800 Subject: [PATCH 141/485] Contig stitcher: improve numbering of alignments in the visualizer --- micall/core/plot_contigs.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 016c08482..ec48d284b 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -421,6 +421,7 @@ def build_stitcher_figure(logs) -> None: synthetic: Set[str] = set() sinks: Dict[str, bool] = {} returned: List[str] = [] + query_position_map: Dict[str, int] = {} def get_oldest_ancestors(recur, graph, ancestor_name): if ancestor_name in recur: @@ -589,6 +590,31 @@ def record_morphism(contig: Contig, original: Contig): eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_closure(morphism_graph))) reduced_morphism_graph = reduced_closure(morphism_graph) + + def set_query_position(contig: Contig): + if contig.name in query_position_map: + return + + if not isinstance(contig, AlignedContig): + return + + parent_names = parent_graph.get(contig.name, []) + parents = [contig_map[name] for name in parent_names] + parents = [contig for contig in parents if isinstance(contig, AlignedContig)] + + if parents: + for parent in parents: + if parent.name not in query_position_map: + set_query_position(parent) + + average = sum(query_position_map[parent_name] for parent_name in parent_names) / len(parent_names) + query_position_map[contig.name] = average + else: + query_position_map[contig.name] = (contig.alignment.q_st + contig.alignment.q_ei) // 2 + + for contig in 
contig_map.values(): + set_query_position(contig) + # Closing `synthetic' for contig in contig_map: if contig in synthetic: @@ -657,16 +683,9 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): max_position = max(max_position, len(contig.seq) + 3 * position_offset) name_mappings = {} - def part_relative_position(name): - part = contig_map[name] - if isinstance(part, AlignedContig): - return part.alignment.q_st - else: - return -1 - for i, (parent, children) in enumerate(sorted(final_parent_mapping.items(), key=lambda p: p[0])): name_mappings[parent] = f"{i + 1}" - children = list(sorted(children, key=part_relative_position)) + children = list(sorted(children, key=lambda name: query_position_map.get(name, -1))) for k, child in enumerate(children): if len(children) > 1: name_mappings[child] = f"{i + 1}.{k + 1}" From d0928c0ce14b6fec545fab9192c393219da77140 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 18 Jan 2024 18:06:02 -0800 Subject: [PATCH 142/485] Contig stitcher: fix landmarks positioning in the visualizer --- micall/core/plot_contigs.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index ec48d284b..bf691a0e6 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -848,28 +848,37 @@ def get_all_arrows(group_ref, labels): ############# if reference_set: + + # Filling out missing ends. prev_landmark = None - for i, landmark in enumerate(sorted(reference_set['landmarks'], - key=itemgetter('start'))): + for landmark in sorted(reference_set['landmarks'], key=itemgetter('start')): landmark.setdefault('frame', 0) if prev_landmark and 'end' not in prev_landmark: prev_landmark['end'] = landmark['start'] - 1 prev_landmark = landmark + + # Computing the stretching factor. 
+ landmark_max = 0 + for landmark in reference_set['landmarks']: + landmark_max = max(landmark_max, landmark['end']) + + stretch_c = group_refs[group_ref] / landmark_max + + # Drawing the landmarks. for frame, frame_landmarks in groupby(reference_set['landmarks'], - itemgetter('frame')): + itemgetter('frame')): subtracks = [] for landmark in frame_landmarks: landmark_colour = landmark.get('colour') if landmark_colour is None: continue - subtracks.append(Track(landmark['start'] + position_offset, - landmark['end'] + position_offset, - label=landmark['name'], - color=landmark_colour)) - max_position = max(max_position, - landmark['end'] + position_offset) + subtracks.append(Track(landmark['start'] * stretch_c + position_offset, + landmark['end'] * stretch_c + position_offset, + label=landmark['name'], + color=landmark_colour)) figure.add(Multitrack(subtracks)) + # Drawing the reference sequence. r_st = position_offset r_ei = position_offset + group_refs[group_ref] figure.add(Track(r_st, r_ei, label=f"{group_ref}")) From 83d678133327e4d9ada47b6f7cc7b77236a31880 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 19 Jan 2024 12:47:51 -0800 Subject: [PATCH 143/485] Contig stitcher: plot contigs even if reference is not a standard one --- micall/core/plot_contigs.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index bf691a0e6..7790c19ed 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -840,8 +840,6 @@ def get_all_arrows(group_ref, labels): reference_set = matching_groups[0] else: reference_set = None - add_partial_banner(figure, position_offset, max_position) - continue ############# # Landmarks # @@ -878,10 +876,10 @@ def get_all_arrows(group_ref, labels): color=landmark_colour)) figure.add(Multitrack(subtracks)) - # Drawing the reference sequence. 
- r_st = position_offset - r_ei = position_offset + group_refs[group_ref] - figure.add(Track(r_st, r_ei, label=f"{group_ref}")) + # Drawing the reference sequence. + r_st = position_offset + r_ei = position_offset + group_refs[group_ref] + figure.add(Track(r_st, r_ei, label=f"{group_ref}")) ########## # Arrows # From a540f932afcec36cba0c7584a79811ddea6ec498 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 19 Jan 2024 13:26:08 -0800 Subject: [PATCH 144/485] Contig stitcher: produce visualizer plot every time --debug is used --- micall/core/denovo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 712995aee..6cbdf956b 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -213,7 +213,7 @@ def denovo(fastq1_path: str, work_dir: str = '.', merged_contigs_csv: typing.TextIO = None, blast_csv: typing.TextIO = None, - stitcher_plot_path: typing.Union[str, None] = None, + stitcher_plot_path: typing.Optional[str] = None, ): """ Use de novo assembly to build contigs from reads. 
@@ -233,6 +233,8 @@ def denovo(fastq1_path: str, start_time = datetime.now() start_dir = os.getcwd() joined_path = os.path.join(tmp_dir, 'joined.fastq') + if stitcher_plot_path is None: + stitcher_plot_path = os.path.join(tmp_dir, "stitcher_plot.svg") run(['merge-mates', fastq1_path, fastq2_path, From 90f0177119d30e5ba1048a6bfdf13e128a87431b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 19 Jan 2024 15:21:15 -0800 Subject: [PATCH 145/485] Contig stitcher: mention why contigs are dropped in the logs --- micall/core/contig_stitcher.py | 15 ++++++++------- micall/tests/test_contig_stitcher.py | 7 +++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9c345a58b..a0fbf45fa 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -535,15 +535,15 @@ def calculate_cumulative_coverage(contigs) -> List[Tuple[int, int]]: current_interval = (current.alignment.r_st, current.alignment.r_ei) # Create a map of cumulative coverage for contigs - other_contigs = [x for x in contigs if x != current and x.group_ref == current.group_ref] - cumulative_coverage = calculate_cumulative_coverage(other_contigs) + overlaping_contigs = [x for x in contigs if x != current and x.overlaps(current)] + cumulative_coverage = calculate_cumulative_coverage(overlaping_contigs) # Check if the current contig is covered by the cumulative coverage intervals if any((cover_interval[0] <= current_interval[0] and cover_interval[1] >= current_interval[1]) for cover_interval in cumulative_coverage): - return current + return current, overlaping_contigs - return None + return None, None def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig]: @@ -551,11 +551,12 @@ def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig] contigs = contigs[:] while contigs: - covered = find_covered_contig(contigs) + covered, covering = find_covered_contig(contigs) 
if covered: contigs.remove(covered) - logger.info("Droped contig %r as it is completely covered by other contigs.", - covered.name, extra={"action": "drop", "contig": covered}) + logger.info("Droped contig %r as it is completely covered by these contigs: %s.", + covered.name, ", ".join(repr(x.name) for x in covering), + extra={"action": "drop", "contig": covered, "covering": covering}) else: break diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 49f19b4a5..b5fa17aa1 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -4,7 +4,7 @@ import os import pytest -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig from micall.tests.utils import MockAligner, fixed_random_seed from micall.utils.structured_logger import add_structured_handler from micall.tests.test_denovo import check_hcv_db @@ -789,6 +789,9 @@ def __init__(self, ref_name, group_ref, r_st, r_ei, name="contig"): self.alignment = MockAlignment(r_st, r_ei) self.name = name + def overlaps(self, other): + return AlignedContig.overlaps(self, other) + class MockAlignment: def __init__(self, r_st, r_ei): @@ -867,7 +870,7 @@ def create_mock_aligned_contig(ref_name, r_st, r_ei, name="contig"): def test_find_covered(contigs, expected_covered_name): mock_contigs = [create_mock_aligned_contig(ref_name, r_st, r_ei, f'contig{i+1}') for i, (ref_name, r_st, r_ei) in enumerate(contigs)] - covered = find_covered_contig(mock_contigs) + covered, covering = find_covered_contig(mock_contigs) if expected_covered_name is None: assert covered is None else: From 
b475e9f515493912499ba5c27d8e845a2c6433c8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 19 Jan 2024 16:11:13 -0800 Subject: [PATCH 146/485] Contig stitcher: visualize non-final contigs --- micall/core/contig_stitcher.py | 2 +- micall/core/plot_contigs.py | 98 +++++++++++++++++++++++----------- 2 files changed, 69 insertions(+), 31 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index a0fbf45fa..1799d532c 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -445,7 +445,7 @@ def stitch_2_contigs(left, right): round(cut_point_location_scaled * 100), concordance_str, extra={"action": "overlap", "left": left, "right": right, "left_remainder": left_remainder, "right_remainder": right_remainder, - "left_overlap": left_overlap, "right_original": right_overlap, + "left_overlap": left_overlap, "right_overlap": right_overlap, "left_take": left_overlap_take, "right_take": right_overlap_take, "concordance": concordance, "avg": average_concordance, "cut_point": max_concordance_index, diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 7790c19ed..894b72322 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -7,6 +7,7 @@ from math import log10, copysign from operator import itemgetter, attrgetter from pathlib import Path +import logging import yaml from genetracks import Figure, Track, Multitrack, Coverage @@ -20,6 +21,9 @@ from micall.utils.alignment_wrapper import align_nucs +logger = logging.getLogger(__name__) + + class LeftLabel(Label): """Like Label, but anchored to the left, instead of the middle. 
""" @@ -393,12 +397,12 @@ def build_coverage_figure(genome_coverage_csv, blast_csv=None, use_concordance=F def plot_stitcher_coverage(logs, genome_coverage_svg_path): f = build_stitcher_figure(logs) f.show(w=970).save_svg(genome_coverage_svg_path, context=draw.Context(invert_y=True)) + return f from types import SimpleNamespace from typing import Union, Dict, Tuple, List, Optional, Set from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig -import logging import random def build_stitcher_figure(logs) -> None: @@ -418,14 +422,15 @@ def build_stitcher_figure(logs) -> None: overlap_sibling_map: Dict[str, str] = {} combine_left_edge: Dict[str, str] = {} combine_right_edge: Dict[str, str] = {} - synthetic: Set[str] = set() - sinks: Dict[str, bool] = {} - returned: List[str] = [] + temporary: Set[str] = set() + children_join_points: List[str] = [] + children_meet_points: List[str] = [] + last_active: List[str] = [] query_position_map: Dict[str, int] = {} def get_oldest_ancestors(recur, graph, ancestor_name): if ancestor_name in recur: - return + assert RuntimeError(f"Recursion in graph {graph!r}") else: recur = recur.copy() recur.add(ancestor_name) @@ -449,20 +454,28 @@ def reduced_closure(graph): ret[parent] = lst return ret - def transitive_closure(graph): - def dfs(current_node, start_node): - if current_node not in visited: - visited.add(current_node) - closure[start_node].add(current_node) - for neighbor in graph.get(current_node, []): - dfs(neighbor, start_node) + def get_all_ancestors(recur, lst, graph, ancestor_name): + if ancestor_name in recur: + assert RuntimeError(f"Recursion in graph {graph!r}") + else: + recur = recur.copy() + recur.add(ancestor_name) - closure = {node: set() for node in graph} - for node in graph: - visited = set() - dfs(node, node) + if ancestor_name not in lst: + lst.append(ancestor_name) - return {node: list(descendants) for node, descendants in closure.items()} + existing_ancestors = graph.get(ancestor_name, []) 
+ for existing in existing_ancestors: + get_all_ancestors(recur, lst, graph, existing) + + def transitive_closure(graph): + ret = {} + for parent, children in graph.items(): + lst = [] + for child in children: + get_all_ancestors(set(), lst, graph, child) + ret[parent] = lst + return ret def reflexive_closure(graph): ret = graph.copy() @@ -521,7 +534,6 @@ def record_morphism(contig: Contig, original: Contig): if not hasattr(event, "action"): pass elif event.action == "finalcombine": - for part in event.contigs: returned.append(part.name) record_contig(event.result, event.contigs) elif event.action == "splitgap": record_contig(event.left, [event.contig]) @@ -549,8 +561,12 @@ def record_morphism(contig: Contig, original: Contig): record_contig(event.result, [event.original]) record_morphism(event.result, event.original) elif event.action == "overlap": - synthetic.add(event.left_take.name) - synthetic.add(event.right_take.name) + temporary.add(event.left_take.name) + temporary.add(event.right_take.name) + temporary.add(event.left_overlap.name) + temporary.add(event.right_overlap.name) + temporary.add(event.left.name) + temporary.add(event.right.name) overlap_leftparent_map[event.left_remainder.name] = event.left.name overlap_rightparent_map[event.right_remainder.name] = event.right.name overlap_lefttake_map[event.left_remainder.name] = event.left_take.name @@ -583,13 +599,29 @@ def record_morphism(contig: Contig, original: Contig): reduced_parent_graph = reduced_closure(parent_graph) reduced_children_graph = reduced_closure(children_graph) transitive_parent_graph = transitive_closure(parent_graph) + transitive_children_graph = transitive_closure(children_graph) sorted_roots = list(sorted(parent_name for parent_name in contig_map if parent_name not in parent_graph)) + sorted_sinks = list(sorted(child_name for + child_name in contig_map + if child_name not in children_graph)) eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_closure(morphism_graph))) 
reduced_morphism_graph = reduced_closure(morphism_graph) + for contig, parents in parent_graph.items(): + if len(parents) > 2: + children_join_points.append(contig) + for contig, children in children_graph.items(): + if len(children) > 2: + children_meet_points.append(contig) + + last_join_points_parent = {contig for join in children_join_points for contig in transitive_parent_graph.get(join, [])} + last_join_points = [] + for contig in children_join_points: + if contig not in last_join_points_parent: + last_join_points.append(contig) def set_query_position(contig: Contig): if contig.name in query_position_map: @@ -615,11 +647,11 @@ def set_query_position(contig: Contig): for contig in contig_map.values(): set_query_position(contig) - # Closing `synthetic' + # Closing `temporary' for contig in contig_map: - if contig in synthetic: + if contig in temporary: for clone in eqv_morphism_graph.get(contig, []): - synthetic.add(clone) + temporary.add(clone) def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): for parent in edge_table: @@ -640,7 +672,7 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): final_parts: Dict[str, bool] = {} for contig in contig_map: - if contig in synthetic: + if contig in temporary: continue if contig in overlap_sibling_map: @@ -654,15 +686,18 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): elif contig in discarded or contig in anomaly or contig in unknown: final_parts[contig] = True - for contig in returned: - [contig] = reduced_morphism_graph.get(contig, [contig]) + for join in last_join_points: + for contig in parent_graph.get(join, []): + [contig] = reduced_morphism_graph.get(contig, [contig]) - transitive_parent = transitive_parent_graph.get(contig, []) - if any(parent in transitive_parent for parent in final_parts): - continue + if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig, [contig])): + continue - final_parts[contig] = True + 
transitive_parent = transitive_parent_graph.get(contig, []) + if any(parent in transitive_parent for parent in final_parts): + continue + final_parts[contig] = True final_parent_mapping: Dict[str, List[str]] = {} for parent_name in sorted_roots: @@ -699,6 +734,9 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): k += 1 name_mappings[child] = f"{i + 1}.{k + 1}" + for contig, name in name_mappings.items(): + logger.debug(f"Contig name {contig!r} is displayed as {name!r}.") + def get_neighbours(part, lookup): for clone in eqv_morphism_graph.get(part.name, [part.name]): maybe_name = lookup.get(clone, None) From a0ec6e4553b1ed26c8bbf2048d38da571833077c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 19 Jan 2024 16:27:59 -0800 Subject: [PATCH 147/485] Contig stitcher: fix duplicate visualization of bad contigs --- micall/core/plot_contigs.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 894b72322..44094c4ea 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -826,6 +826,9 @@ def get_tracks(group_ref, contig_name): for part_name in parts: part = contig_map[part_name] + if part_name in discarded or part_name in anomaly or part_name in unknown: + continue + if not isinstance(part, AlignedContig): continue @@ -841,6 +844,9 @@ def get_arrows(group_ref, contig_name, labels): for part_name in parts: part = contig_map[part_name] + if part_name in discarded or part_name in anomaly or part_name in unknown: + continue + if not isinstance(part, AlignedContig): continue From 7a842dd7264a358ffe2403da3bad7ca3e8e6fb7d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 19 Jan 2024 16:54:39 -0800 Subject: [PATCH 148/485] Contig stitcher: fix handling of sinks in the visualizer --- micall/core/plot_contigs.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py 
index 44094c4ea..be9b78a16 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -606,6 +606,7 @@ def record_morphism(contig: Contig, original: Contig): sorted_sinks = list(sorted(child_name for child_name in contig_map if child_name not in children_graph)) + bad_contigs = anomaly + discarded + unknown eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_closure(morphism_graph))) reduced_morphism_graph = reduced_closure(morphism_graph) @@ -683,13 +684,17 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): if len(parents) == 1: final_parts[final] = True - elif contig in discarded or contig in anomaly or contig in unknown: + elif contig in bad_contigs: final_parts[contig] = True - for join in last_join_points: - for contig in parent_graph.get(join, []): + # for join in last_join_points + sorted_sinks: + for join in last_join_points + sorted_sinks: + for contig in parent_graph.get(join, [join]): [contig] = reduced_morphism_graph.get(contig, [contig]) + if any(contig in transitive_parent_graph.get(bad, []) for bad in bad_contigs): + continue + if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig, [contig])): continue @@ -727,7 +732,7 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): else: name_mappings[child] = f"{i + 1}" - for child in discarded + anomaly + unknown: + for child in bad_contigs: if child not in children: if child in transitive_parent_graph \ and parent in transitive_parent_graph[child]: @@ -826,7 +831,7 @@ def get_tracks(group_ref, contig_name): for part_name in parts: part = contig_map[part_name] - if part_name in discarded or part_name in anomaly or part_name in unknown: + if part_name in bad_contigs: continue if not isinstance(part, AlignedContig): @@ -844,7 +849,7 @@ def get_arrows(group_ref, contig_name, labels): for part_name in parts: part = contig_map[part_name] - if part_name in discarded or part_name in anomaly or part_name in unknown: 
+ if part_name in bad_contigs: continue if not isinstance(part, AlignedContig): From 7c90109ddca7a25d1755a42bcdacd8368d5d59e5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 19 Jan 2024 17:12:33 -0800 Subject: [PATCH 149/485] Contig stitcher: do not assume that reduced_morphism_graph is fan-out=1 --- micall/core/plot_contigs.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index be9b78a16..3d15a9391 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -690,19 +690,18 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): # for join in last_join_points + sorted_sinks: for join in last_join_points + sorted_sinks: for contig in parent_graph.get(join, [join]): - [contig] = reduced_morphism_graph.get(contig, [contig]) - - if any(contig in transitive_parent_graph.get(bad, []) for bad in bad_contigs): - continue + for contig in reduced_morphism_graph.get(contig, [contig]): + if any(contig in transitive_parent_graph.get(bad, []) for bad in bad_contigs): + continue - if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig, [contig])): - continue + if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig, [contig])): + continue - transitive_parent = transitive_parent_graph.get(contig, []) - if any(parent in transitive_parent for parent in final_parts): - continue + transitive_parent = transitive_parent_graph.get(contig, []) + if any(parent in transitive_parent for parent in final_parts): + continue - final_parts[contig] = True + final_parts[contig] = True final_parent_mapping: Dict[str, List[str]] = {} for parent_name in sorted_roots: From d775b1e0442e8097170e90abb8b0dc4280a4591d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 19 Jan 2024 17:45:42 -0800 Subject: [PATCH 150/485] Contig stitcher: always rename children --- micall/core/contig_stitcher.py | 6 +----- micall/core/plot_contigs.py | 20 
+++++++++++++------- micall/tests/test_contig_stitcher.py | 2 -- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 1799d532c..f498f910a 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -248,11 +248,7 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: contig = new_contig for i, single_hit in enumerate(connected): - if len(connected) == 1: - query = contig - else: - query = replace(contig, name=generate_new_name()) - + query = replace(contig, name=generate_new_name()) part = AlignedContig.make(query, single_hit, strand) logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 3d15a9391..45e06cd2e 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -415,6 +415,7 @@ def build_stitcher_figure(logs) -> None: discarded: List[str] = [] unknown: List[str] = [] anomaly: List[str] = [] + overlaps_list: List[str] = [] overlap_leftparent_map: Dict[str, str] = {} overlap_rightparent_map: Dict[str, str] = {} overlap_lefttake_map: Dict[str, str] = {} @@ -561,12 +562,8 @@ def record_morphism(contig: Contig, original: Contig): record_contig(event.result, [event.original]) record_morphism(event.result, event.original) elif event.action == "overlap": - temporary.add(event.left_take.name) - temporary.add(event.right_take.name) - temporary.add(event.left_overlap.name) - temporary.add(event.right_overlap.name) - temporary.add(event.left.name) - temporary.add(event.right.name) + overlaps_list.append(event.left_overlap.name) + overlaps_list.append(event.right_overlap.name) overlap_leftparent_map[event.left_remainder.name] = event.left.name overlap_rightparent_map[event.right_remainder.name] = event.right.name overlap_lefttake_map[event.left_remainder.name] = event.left_take.name @@ -611,6 +608,11 @@ def record_morphism(contig: Contig, 
original: Contig): eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_closure(morphism_graph))) reduced_morphism_graph = reduced_closure(morphism_graph) + for contig in overlaps_list: + temporary.add(contig) + for child in transitive_children_graph.get(contig, []): + temporary.add(child) + for contig, parents in parent_graph.items(): if len(parents) > 2: children_join_points.append(contig) @@ -689,7 +691,11 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): # for join in last_join_points + sorted_sinks: for join in last_join_points + sorted_sinks: - for contig in parent_graph.get(join, [join]): + parents = parent_graph.get(join, [join]) + if not any(isinstance(parent, AlignedContig) for parent in parents): + parents = [join] + + for contig in parents: for contig in reduced_morphism_graph.get(contig, [contig]): if any(contig in transitive_parent_graph.get(bad, []) for bad in bad_contigs): continue diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index b5fa17aa1..bab02eb75 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -234,7 +234,6 @@ def test_stitching_of_identical_contigs(exact_aligner): results = list(stitch_contigs(contigs)) assert len(results) == 1 - assert results[0].name == contigs[2].name assert results[0].seq == contigs[2].seq @@ -359,7 +358,6 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): # Test to ensure that the final result contains the contig 'b' and # does not contain the completely covered contig 'a'. 
- assert results[0].name == 'b' assert results[0].seq == contigs[1].seq From c352af4c8d6fea9592bc6016fa54a79d32f97a5a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 10:04:40 -0800 Subject: [PATCH 151/485] Contig stitcher: make sure every contig is mapped in the visualizer --- micall/core/plot_contigs.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 45e06cd2e..dce271bf2 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -531,6 +531,10 @@ def record_morphism(contig: Contig, original: Contig): if contig.name not in lst: lst.append(contig.name) + def record_bad_contig(contig: Contig, lst: List[Contig]): + contig_map[contig.name] = contig + lst.append(contig.name) + for event in logs: if not hasattr(event, "action"): pass @@ -544,12 +548,10 @@ def record_morphism(contig: Contig, original: Contig): elif event.action == "alignment": if event.type == "hit": record_contig(event.part, [event.contig]) - if event.part.strand != "forward": - anomaly.append(event.part.name) elif event.type == "noref": - unknown.append(event.contig.name) + record_bad_contig(event.contig, unknown) elif event.type == "zerohits" or event.type == "strandconflict": - anomaly.append(event.contig.name) + record_bad_contig(event.contig, anomaly) elif event.type == "reversecomplement": record_contig(event.new_contig, [event.contig]) elif event.type in ("hitnumber", "reversenumber"): @@ -571,7 +573,7 @@ def record_morphism(contig: Contig, original: Contig): overlap_sibling_map[event.left_remainder.name] = event.right_remainder.name overlap_sibling_map[event.right_remainder.name] = event.left_remainder.name elif event.action == "drop": - discarded.append(event.contig.name) + record_bad_contig(event.contig, discarded) elif event.action == "stitchcut": record_contig(event.left_overlap, [event.left]) record_contig(event.left_remainder, [event.left]) From 
9e28810f4fce3b62da3e376d6b0518be04464cd3 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 11:45:33 -0800 Subject: [PATCH 152/485] Contig stitcher: fix logging level handling --- micall/core/contig_stitcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index f498f910a..30edf04da 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -696,6 +696,7 @@ def main(args): else: logger.setLevel(logging.WARN) + logging.basicConfig(level=logger.level) write_contig_refs(args.contigs.name, args.stitched_contigs, stitcher_plot_path=args.plot) args.contigs.close() args.stitched_contigs.close() From 5875c392f121f525eeb81c93b2d6d8378c827c68 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 11:53:21 -0800 Subject: [PATCH 153/485] Contig stitcher: check that --debug is enabled for --plot --- micall/core/contig_stitcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 30edf04da..3158f35ef 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -697,6 +697,10 @@ def main(args): logger.setLevel(logging.WARN) logging.basicConfig(level=logger.level) + if args.plot and logger.level > logging.DEBUG: + logging.error("Option --plot requires --debug.") + exit(1) + write_contig_refs(args.contigs.name, args.stitched_contigs, stitcher_plot_path=args.plot) args.contigs.close() args.stitched_contigs.close() From 39a73f13db42b387ce743c33271099f2915c0bb9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 14:30:48 -0800 Subject: [PATCH 154/485] Contig stitcher: fix type checking errors --- micall/core/contig_stitcher.py | 67 +++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 3158f35ef..947bc713e 100644 --- 
a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -196,7 +196,7 @@ def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: return ret -def align_to_reference(contig) -> Iterable[GenotypedContig]: +def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: """ Align a single Contig to its reference sequence, producing potentially multiple aligned contigs. @@ -212,12 +212,11 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) - hits_array = [(CigarHit(Cigar(x.cigar), - min(x.r_st, x.r_en - 1), - max(x.r_st, x.r_en - 1), - min(x.q_st, x.q_en - 1), - max(x.q_st, x.q_en - 1)), - "forward" if x.strand == 1 else "reverse") for x in alignments] + hits_array: List[Tuple[CigarHit, Literal["forward", "reverse"]]] = \ + [(CigarHit(Cigar(x.cigar), + min(x.r_st, x.r_en - 1), max(x.r_st, x.r_en - 1), + min(x.q_st, x.q_en - 1), max(x.q_st, x.q_en - 1)), + "forward" if x.strand == 1 else "reverse") for x in alignments] connected = connect_cigar_hits(list(map(lambda p: p[0], hits_array))) if hits_array else [] @@ -263,27 +262,36 @@ def align_to_reference(contig) -> Iterable[GenotypedContig]: yield part -def strip_conflicting_mappings(contigs): +def strip_conflicting_mappings(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: contigs = list(contigs) names = {contig.name: contig for contig in contigs} - reference_indexes = list(sorted(names.keys(), key=lambda name: names[name].alignment.r_st if isinstance(names[name], AlignedContig) else -1)) - query_indexes = list(sorted(names.keys(), key=lambda name: names[name].alignment.q_st if isinstance(names[name], AlignedContig) else -1)) - def is_out_of_order(name): - return reference_indexes.index(name) != query_indexes.index(name) + def get_indexes(name: str) -> Tuple[int, int]: + contig = names[name] + if isinstance(contig, AlignedContig): + return 
(contig.alignment.q_st, contig.alignment.r_st) + else: + return (-1, -1) - sorted_by_query = list(sorted(contigs, key=lambda contig: contig.alignment.q_st if isinstance(contig, AlignedContig) else -1)) + reference_sorted = list(sorted(names.keys(), key=lambda name: get_indexes(name)[1])) + query_sorted = list(sorted(names.keys(), key=lambda name: get_indexes(name)[0])) + + def is_out_of_order(name: str) -> bool: + return reference_sorted.index(name) != query_sorted.index(name) + sorted_by_query = list(sorted(contigs, key=lambda contig: contig.alignment.q_st if isinstance(contig, AlignedContig) else -1)) for prev_contig, contig, next_contig in sliding_window(sorted_by_query): - name = contig.name - if prev_contig is not None or is_out_of_order(name): - contig = contig.lstrip_query() - if next_contig is not None or is_out_of_order(name): - contig = contig.rstrip_query() + if isinstance(contig, AlignedContig): + name = contig.name + if prev_contig is not None or is_out_of_order(name): + contig = contig.lstrip_query() + if next_contig is not None or is_out_of_order(name): + contig = contig.rstrip_query() + yield contig -def align_all_to_reference(contigs): +def align_all_to_reference(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: """ Align multiple contigs to their respective reference sequences. @@ -291,7 +299,10 @@ def align_all_to_reference(contigs): flattening the result into a single list. 
""" - return [contig for parts in map(strip_conflicting_mappings, map(align_to_reference, contigs)) for contig in parts] + groups = map(align_to_reference, contigs) + groups = map(strip_conflicting_mappings, groups) + for group in groups: + yield from group def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: @@ -514,7 +525,7 @@ def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]: return merged_intervals -def find_covered_contig(contigs: List[AlignedContig]) -> Optional[AlignedContig]: +def find_covered_contig(contigs: List[AlignedContig]) -> Tuple[Optional[AlignedContig], List[AlignedContig]]: """ Find and return the first contig that is completely covered by other contigs. @@ -539,7 +550,7 @@ def calculate_cumulative_coverage(contigs) -> List[Tuple[int, int]]: for cover_interval in cumulative_coverage): return current, overlaping_contigs - return None, None + return None, [] def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig]: @@ -624,7 +635,7 @@ def try_split(contig): return contigs -def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig]: +def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: contigs = list(contigs) for contig in contigs: logger.info("Introduced contig %r of ref %r, group_ref %r, and length %s.", @@ -634,15 +645,11 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[AlignedContig contig.name, contig.seq, contig.ref_name, contig.group_ref, contig.ref_seq, len(contig.seq)) - aligned = align_all_to_reference(contigs) + maybe_aligned = list(align_all_to_reference(contigs)) # Contigs that did not align do not need any more processing - yield from (x for x in aligned if not isinstance(x, AlignedContig)) - aligned = [x for x in aligned if isinstance(x, AlignedContig)] - - # Contigs aligned in reverse do not need any more processing - yield from (x for x in aligned if x.strand == "reverse") - aligned = [x for x in 
aligned if x.strand == "forward"] + yield from (x for x in maybe_aligned if not isinstance(x, AlignedContig)) + aligned = [x for x in maybe_aligned if isinstance(x, AlignedContig)] aligned = split_contigs_with_gaps(aligned) aligned = drop_completely_covered(aligned) From 16c5e12cbb50fe62e6cdf60ace7f7127822a1218 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 14:53:36 -0800 Subject: [PATCH 155/485] Contig stitcher: only extend visualizer alignments in non-bad contigs --- micall/core/plot_contigs.py | 12 +++++++++--- micall/tests/data/exact_parts_contigs.plot.svg | 16 ++++++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index dce271bf2..3624e441c 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -782,13 +782,19 @@ def get_neighbour(part, lookup): if prev_part is not None: r_st = prev_part.alignment.r_st + position_offset else: - start_delta = -1 * part.alignment.q_st + if part.name in bad_contigs: + start_delta = 0 + else: + start_delta = -1 * part.alignment.q_st r_st = part.alignment.r_st + start_delta + position_offset if next_part is not None: r_ei = next_part.alignment.r_ei + position_offset else: - end_delta = len(part.seq) - part.alignment.q_ei + if part.name in bad_contigs: + end_delta = 0 + else: + end_delta = len(part.seq) - 1 - part.alignment.q_ei r_ei = part.alignment.r_ei + end_delta + position_offset aligned_size_map[part.name] = (r_st, r_ei) @@ -807,7 +813,7 @@ def get_neighbour(part, lookup): if next_part is not None and next_part.alignment.r_st > part.alignment.r_ei and next_part: r_ei = next_part.alignment.r_ei + position_offset else: - end_delta = len(part.seq) - part.alignment.q_ei + end_delta = len(part.seq) - 1 - part.alignment.q_ei r_ei = part.alignment.r_ei + end_delta + position_offset full_size_map[part.name] = (r_st, r_ei) diff --git a/micall/tests/data/exact_parts_contigs.plot.svg 
b/micall/tests/data/exact_parts_contigs.plot.svg index 23dae0cc1..54edb7fd6 100644 --- a/micall/tests/data/exact_parts_contigs.plot.svg +++ b/micall/tests/data/exact_parts_contigs.plot.svg @@ -133,10 +133,10 @@ 1 - - - -2 + + + +2 @@ -155,14 +155,14 @@ - - + + - - + + 2 From 49c624cab85207979bd40dd7adb83248d4e51b38 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 14:52:18 -0800 Subject: [PATCH 156/485] Contig stitcher: add visualize every test case --- .../test_correct_processing_complex_logs.svg | 142 ++++++++++++++ ...test_correct_processing_complex_nogaps.svg | 182 ++++++++++++++++++ ...wo_overlapping_and_one_separate_contig.svg | 74 +++++++ ...g_of_two_partially_overlapping_contigs.svg | 54 ++++++ ...overlapping_different_organism_contigs.svg | 62 ++++++ ...test_identical_stitching_of_one_contig.svg | 34 ++++ .../test_partial_align_consensus.svg | 34 ++++ ...onsensus_multiple_overlaping_sequences.svg | 54 ++++++ ...ial_align_consensus_multiple_sequences.svg | 54 ++++++ ...stitching_of_non_overlapping_contigs_1.svg | 54 ++++++ ...stitching_of_non_overlapping_contigs_2.svg | 54 ++++++ ..._stitching_contig_with_big_covered_gap.svg | 70 +++++++ ...itching_contig_with_big_noncovered_gap.svg | 34 ++++ ...ching_contig_with_big_noncovered_gap_2.svg | 54 ++++++ ...titching_contig_with_small_covered_gap.svg | 70 +++++++ ..._overlapping_contigs_into_one_sequence.svg | 74 +++++++ .../test_stitching_of_identical_contigs.svg | 60 ++++++ .../test_stitching_of_zero_contigs.svg | 12 ++ .../test_stitching_partial_align.svg | 34 ++++ ...ching_partial_align_multiple_sequences.svg | 50 +++++ ...e_contig_completely_covered_by_another.svg | 50 +++++ .../test_stitching_with_empty_contigs.svg | 46 +++++ micall/tests/test_contig_stitcher.py | 151 +++++++++++---- 23 files changed, 1470 insertions(+), 33 deletions(-) create mode 100644 micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg create mode 100644 
micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg create mode 100644 micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg create mode 100644 micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg create mode 100644 micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg create mode 100644 micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg create mode 100644 micall/tests/data/stitcher_plots/test_partial_align_consensus.svg create mode 100644 micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg create mode 100644 micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg create mode 100644 micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg create mode 100644 micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_of_zero_contigs.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_partial_align.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg create mode 100644 
micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg new file mode 100644 index 000000000..8ba52e2bf --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg @@ -0,0 +1,142 @@ + + + + + + + +testref-2 + + + + + + + +1.2 + + + + + +2.2 + + + + + +3.2 + + + + + + + + + + + + +1.2 + + + + + + + + + + + + + +2.2 + + + + + + + + + + + + + +3.2 + + + + + + +testref-1 + + + + + + + +1.1 + + + + + +2.1 + + + + + +3.1 + + + + + + + + + + + + +1.1 + + + + + + + + + + + + + +2.1 + + + + + + + + + + + + + +3.1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg new file mode 100644 index 000000000..d8de84bba --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg @@ -0,0 +1,182 @@ + + + + + + + +testref-2 + + + + + + + +1.2 + + + + + +2.2 + + + + + +3.2 + + + + + +4.2 + + + + + + + + + + + + +1.2 + + + + + + + + + + + + + +2.2 + + + + + + + + + + + + + +3.2 + + + + + + + + + + + + + +4.2 + + + + + + +testref-1 + + + + + + + +1.1 + + + + + +2.1 + + + + + +3.1 + + + + + +4.1 + + + + + + + + + + + + +1.1 + + + + + + + + + + + + + +2.1 + + + + + + + + + + + + + +3.1 + + + + + + + + + + + + + +4.1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg new file mode 100644 index 000000000..593bba7cc --- /dev/null +++ 
b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg @@ -0,0 +1,74 @@ + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + +3 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + + + + + + + + + + +3 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg new file mode 100644 index 000000000..49f2ab8f1 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg @@ -0,0 +1,54 @@ + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg new file mode 100644 index 000000000..3647ee590 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg @@ -0,0 +1,62 @@ + + + + + + + +testref-1 + + + + + + + +1 + + + + + + + + + + + + +1 + + + + + + +testref-2 + + + + + + + +2 + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg b/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg new file mode 100644 index 000000000..88eb96ca6 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg @@ -0,0 +1,34 @@ + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg 
b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg new file mode 100644 index 000000000..74b5bdb9a --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg @@ -0,0 +1,34 @@ + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg new file mode 100644 index 000000000..595289080 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg @@ -0,0 +1,54 @@ + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg new file mode 100644 index 000000000..e849ade80 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg @@ -0,0 +1,54 @@ + + + + + + + +testref + + + + + + + +2 + + + + + +2 + + + + + + + + + + + + +2 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg new file mode 100644 index 000000000..504060aac --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg @@ -0,0 +1,54 @@ + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg 
b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg new file mode 100644 index 000000000..504060aac --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg @@ -0,0 +1,54 @@ + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg new file mode 100644 index 000000000..db7c205fc --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg @@ -0,0 +1,70 @@ + + + + + + + +testref + + + + + + + +1.1 + + + + + +2 + + + + + +1.2 + + + + + + + + + + + + + + + + +1.1 + + + + + +1.2 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg new file mode 100644 index 000000000..f0bef2372 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg @@ -0,0 +1,34 @@ + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg new file mode 100644 index 000000000..5e9be4ba7 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg @@ -0,0 +1,54 @@ + + + + + + + +testref + + + + + + + +2 + + + + + +1 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg 
b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg new file mode 100644 index 000000000..2db694b50 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg @@ -0,0 +1,70 @@ + + + + + + + +testref + + + + + + + +1.1 + + + + + +2 + + + + + +1.2 + + + + + + + + + + + + + + + + +1.1 + + + + + +1.2 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg new file mode 100644 index 000000000..d8b0a4dcb --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg @@ -0,0 +1,74 @@ + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + +3 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + + + + + + + + + + +3 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg new file mode 100644 index 000000000..de7ef094c --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg @@ -0,0 +1,60 @@ + + + + + + + +testref + + + + + + + +3 + + + + + + + + + + + + +3 + + + + + + +discards: + + + + + + + + + +1 + + + + + + + + + +2 + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_zero_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_of_zero_contigs.svg new file mode 100644 index 000000000..edb3d2eaf --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_of_zero_contigs.svg @@ -0,0 +1,12 @@ + + + + + + + +No contigs found. 
+ + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg new file mode 100644 index 000000000..74b5bdb9a --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg @@ -0,0 +1,34 @@ + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg new file mode 100644 index 000000000..b2a880eae --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg @@ -0,0 +1,50 @@ + + + + + + + +testref + + + + + + + +2 + + + + + + + + + + + + +2 + + + + + + +discards: + + + + + + + + + +1 + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg new file mode 100644 index 000000000..9035a2abd --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg @@ -0,0 +1,50 @@ + + + + + + + +testref + + + + + + + +2 + + + + + + + + + + + + +2 + + + + + + +discards: + + + + + + + + + +1 + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg new file mode 100644 index 000000000..9f7c26085 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg @@ -0,0 +1,46 @@ + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + + + +anomaly: + + + + + +2 + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py 
b/micall/tests/test_contig_stitcher.py index bab02eb75..f6193688f 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -5,9 +5,10 @@ import pytest from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig +from micall.core.plot_contigs import plot_stitcher_coverage from micall.tests.utils import MockAligner, fixed_random_seed from micall.utils.structured_logger import add_structured_handler -from micall.tests.test_denovo import check_hcv_db +from micall.tests.test_denovo import check_hcv_db # activates the fixture @pytest.fixture() @@ -15,7 +16,40 @@ def exact_aligner(monkeypatch): monkeypatch.setattr('micall.core.contig_stitcher.Aligner', MockAligner) -def test_identical_stitching_of_one_contig(exact_aligner): +@pytest.fixture +def visualizer(request, tmp_path): + # Set up the logger and structured handler + logger = logging.getLogger("micall.core.contig_stitcher") + logger.setLevel(logging.DEBUG) + handler = add_structured_handler(logger) + logging.getLogger("micall.core.plot_contigs").setLevel(logging.DEBUG) + + test_name = request.node.name + plot_name = test_name + ".svg" + pwd = os.path.dirname(__file__) + plots_dir = os.path.join(pwd, "data", "stitcher_plots") + os.makedirs(plots_dir, exist_ok=True) + path_to_expected = os.path.join(plots_dir, plot_name) + path_to_produced = os.path.join(tmp_path, plot_name) + + def check(): + figure = plot_stitcher_coverage(handler.logs, path_to_produced) + + with open(path_to_produced, 'r') as produced_file: + produced_data = produced_file.read() + with open(path_to_expected, 'r') as expected_file: + expected_data = expected_file.read() + + assert produced_data == expected_data, \ + "The contents of the stitched contigs plot" \ + " does not match the expected contents." 
+ + return figure + + return check + + +def test_identical_stitching_of_one_contig(exact_aligner, visualizer): # Scenario: When stitching one contig, it remains the same. contigs = [ @@ -23,7 +57,7 @@ def test_identical_stitching_of_one_contig(exact_aligner): seq='ACTGACTG' * 100, ref_name='testref', group_ref='testref', - ref_seq='ACTGACTG' * 100, + ref_seq='T' * 20 + 'ACTGACTG' * 110 + 'T' * 20, match_fraction=1.0, ), ] @@ -32,22 +66,24 @@ def test_identical_stitching_of_one_contig(exact_aligner): assert len(results) == 1 assert results[0].seq == contigs[0].seq + assert len(visualizer().elements) > len(contigs) + -def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): +def test_separate_stitching_of_non_overlapping_contigs_1(exact_aligner, visualizer): # Scenario: When stitching multiple non-overlapping contigs, the order doesn't matter. - ref_seq = 'A' * 100 + ref_seq = 'A' * 100 + 'C' * 100 contigs = [ GenotypedContig(name='a', - seq=ref_seq, + seq='A' * 70, ref_name='testref', group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', - seq='C' * 100, + seq='C' * 70, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -59,18 +95,26 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): # No claims about the output order, so wrap into set() assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + == { contig.seq for contig in results } + + assert len(visualizer().elements) > len(contigs) + + +def test_separate_stitching_of_non_overlapping_contigs_2(exact_aligner, visualizer): + # Scenario: When stitching multiple non-overlapping contigs, the order doesn't matter. 
+ + ref_seq = 'A' * 100 + 'C' * 100 contigs = [ GenotypedContig(name='b', - seq='C' * 100, + seq='C' * 70, ref_name='testref', group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='a', - seq=ref_seq, + seq='A' * 70, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -84,8 +128,10 @@ def test_separate_stitching_of_non_overlapping_contigs(exact_aligner): assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } + assert len(visualizer().elements) > len(contigs) + -def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): +def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner, visualizer): # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. ref_seq = 'A' * 100 + 'C' * 100 @@ -115,8 +161,10 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner): assert 100 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 50 + assert len(visualizer().elements) > len(contigs) + -def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_aligner): +def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_aligner, visualizer): # Scenario: Two overlapping contigs are stitched together, the non-overlapping is kept separate. ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 @@ -153,8 +201,10 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali assert results[1].seq == contigs[2].seq + assert len(visualizer().elements) > len(contigs) -def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): + +def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner, visualizer): # Scenario: All contigs have some overlapping parts, resulting in one continuous sequence after stitching. 
ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 @@ -191,8 +241,10 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner): assert 200 == len(result.seq) assert result.seq == 'A' * 50 + 'C' * 100 + 'T' * 50 + assert len(visualizer().elements) > len(contigs) + -def test_stitching_with_empty_contigs(exact_aligner): +def test_stitching_with_empty_contigs(exact_aligner, visualizer): # Scenario: The function is able to handle and ignore empty contigs. ref_seq = 'A' * 100 @@ -218,8 +270,10 @@ def test_stitching_with_empty_contigs(exact_aligner): assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } + assert len(visualizer().elements) > len(contigs) + -def test_stitching_of_identical_contigs(exact_aligner): +def test_stitching_of_identical_contigs(exact_aligner, visualizer): # Scenario: The function correctly handles and avoids duplication when identical contigs are stitched together. contigs = [ @@ -236,16 +290,20 @@ def test_stitching_of_identical_contigs(exact_aligner): assert len(results) == 1 assert results[0].seq == contigs[2].seq + assert len(visualizer().elements) > len(contigs) -def test_stitching_of_zero_contigs(exact_aligner): + +def test_stitching_of_zero_contigs(exact_aligner, visualizer): # Scenario: The function does not crash if no contigs given. contigs = [] results = list(stitch_contigs(contigs)) assert results == contigs + assert len(visualizer().elements) > 0 + -def test_correct_stitching_of_two_partially_overlapping_different_organism_contigs(exact_aligner): +def test_correct_stitching_of_two_partially_overlapping_different_organism_contigs(exact_aligner, visualizer): # Scenario: Two partially overlapping contigs, but which come from different organism, # are not stitched into a single sequence. 
@@ -274,8 +332,10 @@ def test_correct_stitching_of_two_partially_overlapping_different_organism_conti assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } + assert len(visualizer().elements) > len(contigs) + -def test_correct_processing_complex_nogaps(exact_aligner): +def test_correct_processing_complex_nogaps(exact_aligner, visualizer): # Scenario: There are two reference organisms. # Each with 4 contigs. # For each, three overlapping contigs are stitched together, the non-overlapping is kept separate. @@ -329,8 +389,10 @@ def test_correct_processing_complex_nogaps(exact_aligner): assert results[2].seq == contigs[3].seq assert results[3].seq == contigs[7].seq + assert len(visualizer().elements) > len(contigs) -def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): + +def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner, visualizer): # Scenario: If one contig is completely covered by another contig, # the completely covered contig must be dropped. @@ -360,8 +422,10 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner): # does not contain the completely covered contig 'a'. assert results[0].seq == contigs[1].seq + assert len(visualizer().elements) > len(contigs) + -def test_stitching_contig_with_big_noncovered_gap(exact_aligner): +def test_stitching_contig_with_big_noncovered_gap(exact_aligner, visualizer): # Scenario: One contig has a big gap, which is however not covered by anything else. 
ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 @@ -381,8 +445,10 @@ def test_stitching_contig_with_big_noncovered_gap(exact_aligner): assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } + assert len(visualizer().elements) > len(contigs) + -def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): +def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner, visualizer): # Scenario: One contig has a big gap, which is however not covered by anything else. ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 @@ -409,8 +475,10 @@ def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner): assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } + assert len(visualizer().elements) > len(contigs) -def test_stitching_contig_with_big_covered_gap(exact_aligner): + +def test_stitching_contig_with_big_covered_gap(exact_aligner, visualizer): # Scenario: If one contig has a big gap covered by another contig. ref_seq = 'G' * 100 + 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 @@ -424,7 +492,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): match_fraction=0.5, ), GenotypedContig(name='b', - seq='A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 50, + seq='A' * 100 + 'C' * 100 + 'T' * 50, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -433,6 +501,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): ] contigs = list(align_all_to_reference(contigs)) + assert len(contigs) == 2 assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 @@ -440,8 +509,10 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner): assert len(results) == 3 assert all(list(contig.alignment.gaps()) == [] for contig in results) + assert len(visualizer().elements) > len(contigs) + -def test_stitching_contig_with_small_covered_gap(exact_aligner): +def test_stitching_contig_with_small_covered_gap(exact_aligner, visualizer): # Scenario: If 
one contig has a small gap covered by another contig. ref_seq = 'G' * 100 + 'A' * 9 + 'C' * 100 + 'T' * 100 @@ -455,7 +526,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): match_fraction=0.5, ), GenotypedContig(name='b', - seq='A' * 9 + 'C' * 100 + 'T' * 100, + seq='A' * 9 + 'C' * 50, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -464,18 +535,20 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner): ] contigs = list(align_all_to_reference(contigs)) + assert len(contigs) == 2 assert len(list(contigs[0].alignment.gaps())) == 1 assert len(list(contigs[1].alignment.gaps())) == 0 - results = list(split_contigs_with_gaps(contigs)) + assert len(results) == 3 - assert all(x.seq == x.lstrip_query().rstrip_query().seq for x in results) + assert len(visualizer().elements) > len(contigs) + assert all(x.seq == x.lstrip_query().rstrip_query().seq for x in results) assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } -def test_stitching_partial_align(exact_aligner): +def test_stitching_partial_align(exact_aligner, visualizer): # Scenario: A single contig has a sequence that partially aligns to the reference sequence. contigs = [ @@ -493,13 +566,15 @@ def test_stitching_partial_align(exact_aligner): for result in results: assert any(result.seq in contig.seq for contig in contigs) + assert len(visualizer().elements) > len(contigs) + assert all(x.seq != x.lstrip_query().rstrip_query().seq for x in results) assert { contig.seq for contig in contigs } \ != { contig.lstrip_query().rstrip_query().seq for contig in results } -def test_partial_align_consensus(exact_aligner): +def test_partial_align_consensus(exact_aligner, visualizer): # Scenario: A single contig partially aligns to the reference sequence, and a consensus sequence is being stitched. 
contigs = [ @@ -517,8 +592,10 @@ def test_partial_align_consensus(exact_aligner): assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } + assert len(visualizer().elements) > len(contigs) + -def test_stitching_partial_align_multiple_sequences(exact_aligner): +def test_stitching_partial_align_multiple_sequences(exact_aligner, visualizer): # Scenario: Multiple contigs have sequences that partially align to the same reference sequence. ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 @@ -545,11 +622,13 @@ def test_stitching_partial_align_multiple_sequences(exact_aligner): for result in results: assert any(result.seq in contig.seq for contig in contigs) + assert len(visualizer().elements) > len(contigs) + assert { contig.seq for contig in contigs } \ != { contig.lstrip_query().rstrip_query().seq for contig in results } -def test_partial_align_consensus_multiple_sequences(exact_aligner): +def test_partial_align_consensus_multiple_sequences(exact_aligner, visualizer): # Scenario: Multiple contigs partially align to the same reference sequence, and a consensus sequence is being stitched from them. ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 @@ -575,8 +654,10 @@ def test_partial_align_consensus_multiple_sequences(exact_aligner): assert len(results) == 1 assert results[0].seq == contigs[0].seq + contigs[1].seq + assert len(visualizer().elements) > len(contigs) -def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): + +def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner, visualizer): # Scenario: Multiple contigs partially align to the same reference sequence, and a consensus sequence is being stitched from them. 
ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 @@ -603,8 +684,10 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner): assert results[0].seq == 'T' * 10 + 'A' * 5 + 'C' * 20 + 'T' * 5 + 'A' * 10 + 'G' * 10 assert results[0].seq == contigs[0].seq[:-10] + contigs[1].seq[20:] + assert len(visualizer().elements) > len(contigs) + -def test_correct_processing_complex_logs(exact_aligner): +def test_correct_processing_complex_logs(exact_aligner, visualizer): # Scenario: There are two reference organisms. # Each with 4 contigs. # For each, three overlapping contigs are stitched together, the non-overlapping is kept separate. @@ -658,6 +741,8 @@ def test_correct_processing_complex_logs(exact_aligner): assert len(info_messages) == 32 assert len(debug_messages) == len(handler.logs) - len(info_messages) + assert len(visualizer().elements) > len(contigs) + def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) From 135ef59a3f66dc65d256bf22145b404771d262ea Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 15:17:40 -0800 Subject: [PATCH 157/485] Contig stitcher: sort bad contigs in the visualizer --- micall/core/plot_contigs.py | 65 +++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 3624e441c..9c7bfcf58 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -971,12 +971,17 @@ def get_all_arrows(group_ref, labels): label = LeftLabel(text=f"discards:", x=0, font_size=12) pos = position_offset / 2 figure.add(Track(pos, pos, h=40, label=label)) - for contig_name in discarded: - contig = contig_map[contig_name] - (r_st, r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) - name = name_mappings.get(contig.name, contig.name) - figure.add(Arrow(r_st, r_ei, elevation=-20, h=1)) - figure.add(Track(f_r_st, f_r_ei, label=name)) + for parent_name in sorted_roots: + contigs = 
final_parent_mapping.get(parent_name, []) + for contig_name in contigs: + if contig_name not in discarded: + continue + + contig = contig_map[contig_name] + (r_st, r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) + name = name_mappings.get(contig.name, contig.name) + figure.add(Arrow(r_st, r_ei, elevation=-20, h=1)) + figure.add(Track(f_r_st, f_r_ei, label=name)) ############# # Anomalies # @@ -986,20 +991,25 @@ def get_all_arrows(group_ref, labels): label = LeftLabel(text=f"anomaly:", x=0, font_size=12) pos = position_offset / 2 figure.add(Track(pos, pos, h=40, label=label)) - for contig_name in anomaly: - contig = contig_map[contig_name] - (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) - if isinstance(contig, AlignedContig): - colour = "lightgray" - if contig.strand == "reverse": - figure.add(Arrow(a_r_ei, a_r_st, elevation=-20, h=1)) + for parent_name in sorted_roots: + contigs = final_parent_mapping.get(parent_name, []) + for contig_name in contigs: + if contig_name not in anomaly: + continue + + contig = contig_map[contig_name] + (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) + if isinstance(contig, AlignedContig): + colour = "lightgray" + if contig.strand == "reverse": + figure.add(Arrow(a_r_ei, a_r_st, elevation=-20, h=1)) + else: + figure.add(Arrow(a_r_st, a_r_ei, elevation=-20, h=1)) else: - figure.add(Arrow(a_r_st, a_r_ei, elevation=-20, h=1)) - else: - colour = "red" + colour = "red" - name = name_mappings.get(contig.name, contig.name) - figure.add(Track(a_r_st, a_r_ei, color=colour, label=name)) + name = name_mappings.get(contig.name, contig.name) + figure.add(Track(a_r_st, a_r_ei, color=colour, label=name)) ########### # Unknown # @@ -1009,13 +1019,18 @@ def get_all_arrows(group_ref, labels): label = LeftLabel(text=f"unknown:", x=0, font_size=12) pos = position_offset / 2 figure.add(Track(pos, pos, h=40, label=label)) - for contig_name in unknown: - contig = contig_map[contig_name] - r_st = position_offset - 
r_ei = position_offset + len(contig.seq) - colour = "red" - name = name_mappings.get(contig.name, contig.name) - figure.add(Track(r_st, r_ei, color=colour, label=name)) + for parent_name in sorted_roots: + contigs = final_parent_mapping.get(parent_name, []) + for contig_name in contigs: + if contig_name not in unknown: + continue + + contig = contig_map[contig_name] + r_st = position_offset + r_ei = position_offset + len(contig.seq) + colour = "red" + name = name_mappings.get(contig.name, contig.name) + figure.add(Track(r_st, r_ei, color=colour, label=name)) if not figure.elements: figure.add(Track(1, max_position, label='No contigs found.', color='none')) From 77d7d7c7a6319494ce848cd844761f9e311e96ad Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 15:53:52 -0800 Subject: [PATCH 158/485] Contig stitcher: do no re-draw same contigs Make sure that every contig, including once composed of root-level contigs, is displayed at most once. --- micall/core/plot_contigs.py | 25 +++++++++++++----- ...ial_align_consensus_multiple_sequences.svg | 26 +++---------------- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 9c7bfcf58..378275f67 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -839,12 +839,15 @@ def get_contig_coordinates(contig): a_r_ei = f_r_ei return (a_r_st, a_r_ei, f_r_st, f_r_ei) - def get_tracks(group_ref, contig_name): + def get_tracks(repeatset, group_ref, contig_name): parts = final_parent_mapping[contig_name] for part_name in parts: part = contig_map[part_name] - if part_name in bad_contigs: + if part.name in repeatset: + continue + + if part.name in bad_contigs: continue if not isinstance(part, AlignedContig): @@ -853,16 +856,20 @@ def get_tracks(group_ref, contig_name): if part.group_ref != group_ref: continue + repeatset.add(part.name) indexes = name_mappings[part.name] (a_r_st, a_r_ei, f_r_st, f_r_ei) = 
get_contig_coordinates(part) yield Track(f_r_st, f_r_ei, label=f"{indexes}") - def get_arrows(group_ref, contig_name, labels): + def get_arrows(repeatset, group_ref, contig_name, labels): parts = final_parent_mapping[contig_name] for part_name in parts: part = contig_map[part_name] - if part_name in bad_contigs: + if part.name in repeatset: + continue + + if part.name in bad_contigs: continue if not isinstance(part, AlignedContig): @@ -871,6 +878,7 @@ def get_arrows(group_ref, contig_name, labels): if part.group_ref != group_ref: continue + repeatset.add(part.name) indexes = name_mappings[part.name] if labels else None height = 20 if labels else 1 elevation = 1 if labels else -20 @@ -881,8 +889,9 @@ def get_arrows(group_ref, contig_name, labels): label=indexes) def get_all_arrows(group_ref, labels): + repeatset = set() for parent_name in sorted_roots: - yield from get_arrows(group_ref, parent_name, labels) + yield from get_arrows(repeatset, group_ref, parent_name, labels) ################ # Drawing part # @@ -955,11 +964,13 @@ def get_all_arrows(group_ref, labels): # Contigs # ########### + repeatset1 = set() + repeatset2 = set() for parent_name in sorted_roots: - arrows = list(get_arrows(group_ref, parent_name, labels=False)) + arrows = list(get_arrows(repeatset1, group_ref, parent_name, labels=False)) if arrows: figure.add(ArrowGroup(arrows)) - parts = list(get_tracks(group_ref, parent_name)) + parts = list(get_tracks(repeatset2, group_ref, parent_name)) if parts: figure.add(Multitrack(parts)) diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg index e849ade80..854b38944 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg @@ -1,21 +1,15 @@ + width="970.0" height="86.0" viewBox="0.0 -86.0 970.0 
86.0"> - + testref - - - - - -2 - + @@ -23,20 +17,6 @@ 2 - - - - - - - - - - -2 - - - From 8e904cab5c10b60f5ff44fcf4c4d470395bfe69b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 22 Jan 2024 16:52:07 -0800 Subject: [PATCH 159/485] Contig stitcher: improve visualizer finals calculation * fix type error * improve parent-child check --- micall/core/plot_contigs.py | 11 +- .../test_correct_processing_complex_logs.svg | 172 +++++++++++------- ...ial_align_consensus_multiple_sequences.svg | 38 +++- 3 files changed, 142 insertions(+), 79 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 378275f67..03d6080e8 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -599,6 +599,7 @@ def record_bad_contig(contig: Contig, lst: List[Contig]): reduced_children_graph = reduced_closure(children_graph) transitive_parent_graph = transitive_closure(parent_graph) transitive_children_graph = transitive_closure(children_graph) + eqv_parent_graph = reflexive_closure(symmetric_closure(transitive_parent_graph)) sorted_roots = list(sorted(parent_name for parent_name in contig_map if parent_name not in parent_graph)) @@ -655,7 +656,7 @@ def set_query_position(contig: Contig): # Closing `temporary' for contig in contig_map: if contig in temporary: - for clone in eqv_morphism_graph.get(contig, []): + for clone in eqv_morphism_graph.get(contig, [contig]): temporary.add(clone) def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): @@ -691,21 +692,23 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): elif contig in bad_contigs: final_parts[contig] = True - # for join in last_join_points + sorted_sinks: for join in last_join_points + sorted_sinks: parents = parent_graph.get(join, [join]) - if not any(isinstance(parent, AlignedContig) for parent in parents): + if not any(isinstance(contig_map[parent], AlignedContig) for parent in parents): parents = [join] for contig in parents: 
for contig in reduced_morphism_graph.get(contig, [contig]): + if contig in bad_contigs: + continue + if any(contig in transitive_parent_graph.get(bad, []) for bad in bad_contigs): continue if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig, [contig])): continue - transitive_parent = transitive_parent_graph.get(contig, []) + transitive_parent = eqv_parent_graph.get(contig, [contig]) if any(parent in transitive_parent for parent in final_parts): continue diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg index 8ba52e2bf..d8de84bba 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg @@ -1,141 +1,181 @@ + width="969.9999999999999" height="353.0" viewBox="0.0 -353.0 969.9999999999999 353.0"> - - - + + + testref-2 - + - - - -1.2 + + + +1.2 - - - -2.2 + + + +2.2 - - - -3.2 + + + +3.2 + + + + +4.2 - + + - - + + - + - - + + 1.2 - + - - + + - + - - + + 2.2 - + - - + + - + - - + + 3.2 - - - + + + + + + + + + + +4.2 + + + + + + testref-1 - + - - - -1.1 + + + +1.1 + + + + + +2.1 + + + + + +3.1 + + + + + +4.1 + + - - - -2.1 + + + + - - - -3.1 + + +1.1 + - - + + - - -1.1 + + +2.1 - - + + - - -2.1 + + +3.1 - - + + - - -3.1 + + +4.1 diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg index 854b38944..4d53d9e33 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg @@ -1,32 +1,52 @@ + width="970.0" height="117.0" viewBox="0.0 -117.0 970.0 117.0"> - + testref - + + + + + +1 + - - + + -2 +2 + + + + + + + + + + + + +1 + - + - - + + 2 From bf1390fe21f9337afb4dfbd4803bbc005a45d9d6 Mon Sep 
17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 23 Jan 2024 10:22:27 -0800 Subject: [PATCH 160/485] Contig stitcher: remove hanging comma in the code --- micall/core/contig_stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 947bc713e..9d15720a9 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -444,7 +444,7 @@ def stitch_2_contigs(left, right): # Log it. average_concordance = sum(concordance) / (len(concordance) or 1) - concordance_str = ', '.join(map(lambda x: str(round(x, 2)), concordance)), + concordance_str = ', '.join(map(lambda x: str(round(x, 2)), concordance)) cut_point_location_scaled = max_concordance_index / (((len(concordance) or 1) - 1) or 1) logger.debug("Created overlap contigs %r at %s and %r at %s based on parts of %r and %r, with avg. concordance %s%%, cut point at %s%%, and full concordance [%s].", left_overlap_take.name, left_overlap.alignment, right_overlap_take.name, right_overlap_take.alignment, From 6caf9eead337821e0fc1c51210108b40485de34c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 23 Jan 2024 15:27:37 -0800 Subject: [PATCH 161/485] Contig stitcher: improve concordance calculations Also add more tests for it. 
--- micall/core/contig_stitcher.py | 22 ++++++++++++++-- micall/tests/test_contig_stitcher.py | 38 +++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9d15720a9..61e0c5742 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -384,11 +384,29 @@ def slide(start, end): return result +def disambiguate_concordance(concordance: List[float]) -> Iterable[Tuple[float, int, int]]: + def slide(concordance): + count = 0 + for i, (prev, current, next) in enumerate(sliding_window(concordance)): + if current == prev: + count += 1 + yield count + else: + yield 0 + + forward = list(slide(concordance)) + reverse = list(reversed(list(slide(reversed(concordance))))) + for i, (x, f, r) in enumerate(zip(concordance, forward, reverse)): + local_rank = f * r + global_rank = i if i < len(concordance) / 2 else len(concordance) - i - 1 + yield (x, local_rank, global_rank) + + def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned_right, concordance): """ Determine optimal cut points for stitching based on sequence concordance in the overlap region. 
""" - valuator = lambda i: (concordance[i], i if i < len(concordance) / 2 else len(concordance) - i - 1) - sorted_concordance_indexes = sorted(range(len(concordance)), key=valuator) + concordance_d = list(disambiguate_concordance(concordance)) + sorted_concordance_indexes = sorted(range(len(concordance)), key=lambda i: concordance_d[i]) remove_dashes = lambda s: ''.join(c for c in s if c != '-') for max_concordance_index in reversed(sorted_concordance_indexes): diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index f6193688f..3fff183fb 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -4,7 +4,7 @@ import os import pytest -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig, disambiguate_concordance from micall.core.plot_contigs import plot_stitcher_coverage from micall.tests.utils import MockAligner, fixed_random_seed from micall.utils.structured_logger import add_structured_handler @@ -974,6 +974,42 @@ def generate_random_string_pair(length): right = ''.join(random.choice('ACGT') for _ in range(length)) return left, right + +@pytest.mark.parametrize( + 'left, right, expected', + [("aaaaa", "aaaaa", [0.1] * 5), + ("abcdd", "abcdd", [0.1] * 5), + ("aaaaaaaa", "baaaaaab", [0.1, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.1]), + ("aaaaaaaa", "aaaaaaab", [0.13, 0.13, 0.13, 0.13, 0.13, 0.13, 0.13, 0.12]), + ("aaaaaaaa", "aaaaaaab", [0.13, 0.13, 0.13, 0.13, 0.13, 0.13, 0.13, 0.12]), + ("aaaaaaaa", "aaaaabbb", [0.1, 0.1, 0.1, 0.1, 0.1, 0.08, 0.08, 0.08]), + ("aaaaaaaa", "aaabbaaa", [0.12, 0.12, 0.12, 0.1, 0.1, 
0.12, 0.12, 0.12]), + ("aaaaa", "bbbbb", [0] * 5), + ] +) +def test_concordance_simple(left, right, expected): + result = [round(x, 2) for x in calculate_concordance(left, right)] + assert result == expected + + +@pytest.mark.parametrize( + 'left, right, expected', + [("a" * 128, "a" * 128, 64), + ("a" * 128, "a" * 64 + "b" * 64, 32), + ("a" * 128, "a" * 64 + "ba" * 32, 32), + ("a" * 128, "a" * 54 + "b" * 20 + "a" * 54, 28), # two peaks + ("a" * 128, "a" * 63 + "b" * 2 + "a" * 63, 32), # two peaks + ("a" * 1280, "b" * 640 + "a" * 640, 640 + 30), # the window is too small to account for all of the context + ] +) +def test_concordance_simple_index(left, right, expected): + concordance = calculate_concordance(left, right) + concordance_d = list(disambiguate_concordance(concordance)) + index = max(range(len(concordance)), key=lambda i: concordance_d[i]) + if abs(index - expected) > 3: + assert index == expected + + def generate_test_cases(num_cases): with fixed_random_seed(42): length = random.randint(1, 80) From 68a5b82739f330153c9d1a60a722c43429544097 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 23 Jan 2024 15:28:33 -0800 Subject: [PATCH 162/485] Contig stitcher: introduce a proper context for the name generator --- micall/core/contig_stitcher.py | 38 ++++++++++++++++++++-------------- micall/core/denovo.py | 35 +++++++++++++++++-------------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 61e0c5742..70ca50e37 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,4 +1,4 @@ -from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal, TypeVar +from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal, TypeVar, Callable from collections import deque, defaultdict from dataclasses import dataclass, replace from math import ceil, floor @@ -9,19 +9,29 @@ from queue import LifoQueue from Bio import Seq import logging 
+from contextvars import ContextVar, Context from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.consensus_aligner import CigarActions - +T = TypeVar("T") logger = logging.getLogger(__name__) -name_generator_state = 0 -def generate_new_name(): - global name_generator_state - name_generator_state += 1 - return f"c{name_generator_state}" +class StitcherContext: + def __init__(self): + self.name_generator_state: int = 0 + + def generate_new_name(self): + self.name_generator_state += 1 + return f"c{self.name_generator_state}" + + +context: ContextVar[StitcherContext] = ContextVar("StitcherContext", default=StitcherContext()) + + +def with_fresh_context(body: Callable[[StitcherContext], T]) -> T: + return Context().run(lambda: body(context.get())) @dataclass(frozen=True) @@ -41,8 +51,8 @@ def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedCont """ Cuts query sequence in two parts with cut_point between them. """ cut_point = max(0, cut_point) - left = replace(self, name=generate_new_name(), seq=self.seq[:ceil(cut_point)]) - right = replace(self, name=generate_new_name(), seq=self.seq[ceil(cut_point):]) + left = replace(self, name=context.get().generate_new_name(), seq=self.seq[:ceil(cut_point)]) + right = replace(self, name=context.get().generate_new_name(), seq=self.seq[ceil(cut_point):]) return (left, right) @@ -68,8 +78,8 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont """ Cuts this alignment in two parts with cut_point between them. 
""" alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - left = replace(self, name=generate_new_name(), alignment=alignment_left) - right = replace(self, name=generate_new_name(), alignment=alignment_right) + left = replace(self, name=context.get().generate_new_name(), alignment=alignment_left) + right = replace(self, name=context.get().generate_new_name(), alignment=alignment_right) logger.debug("Created contigs %r at %s and %r at %s by cutting %r.", left.name, left.alignment, right.name, right.alignment, self.name, @@ -134,7 +144,7 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': match_fraction = min(self.match_fraction, other.match_fraction) ref_name = max([self, other], key=lambda x: x.alignment.ref_length).ref_name query = GenotypedContig(seq=self.seq + other.seq, - name=generate_new_name(), + name=context.get().generate_new_name(), ref_name=ref_name, group_ref=self.group_ref, ref_seq=self.ref_seq, @@ -156,8 +166,6 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': return ret -T = TypeVar("T") - def sliding_window(sequence: Iterable[T]) -> Iterable[Tuple[Optional[T], T, Optional[T]]]: """ Generate a three-element sliding window of a sequence. 
@@ -247,7 +255,7 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: contig = new_contig for i, single_hit in enumerate(connected): - query = replace(contig, name=generate_new_name()) + query = replace(contig, name=context.get().generate_new_name()) part = AlignedContig.make(query, single_hit, strand) logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 6cbdf956b..fbdb654b3 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -19,7 +19,7 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig -from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, logger as stitcher_logger +from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, logger as stitcher_logger, with_fresh_context from micall.core.plot_contigs import plot_stitcher_coverage from micall.utils.structured_logger import add_structured_handler @@ -84,26 +84,29 @@ def write_contig_refs(contigs_fasta_path, contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") group_refs = {} - logger = logging.getLogger("micall.core.contig_stitcher") - handler = add_structured_handler(logger) + def run_stitcher(ctx): + logger = logging.getLogger("micall.core.contig_stitcher") + handler = add_structured_handler(logger) - genotypes = genotype(contigs_fasta_path, - blast_csv=blast_csv, - group_refs=group_refs) + genotypes = genotype(contigs_fasta_path, + blast_csv=blast_csv, + group_refs=group_refs) - contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path)) - contigs = list(stitch_consensus(contigs)) + contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path)) + contigs = list(stitch_consensus(contigs)) - for contig in contigs: - writer.writerow(dict(ref=contig.ref_name, - match=contig.match_fraction, - group_ref=contig.group_ref, - contig=contig.seq)) + for contig in contigs: + 
writer.writerow(dict(ref=contig.ref_name, + match=contig.match_fraction, + group_ref=contig.group_ref, + contig=contig.seq)) - if stitcher_logger.level <= logging.DEBUG and stitcher_plot_path is not None: - plot_stitcher_coverage(handler.logs, stitcher_plot_path) + if stitcher_logger.level <= logging.DEBUG and stitcher_plot_path is not None: + plot_stitcher_coverage(handler.logs, stitcher_plot_path) - return len(contigs) + return len(contigs) + + return with_fresh_context(run_stitcher) def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): From 5d6a98a8d0e7ee5e338b1a424919bc150058334b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 23 Jan 2024 17:10:54 -0800 Subject: [PATCH 163/485] Contig stitcher: use context for logs handling --- micall/core/contig_stitcher.py | 113 ++++++++--------- micall/core/denovo.py | 5 +- micall/core/plot_contigs.py | 55 ++++----- micall/tests/test_contig_stitcher.py | 19 +-- micall/tests/test_contig_stitcher_fuzz.py | 20 ++- micall/utils/contig_stitcher_events.py | 143 ++++++++++++++++++++++ 6 files changed, 241 insertions(+), 114 deletions(-) create mode 100644 micall/utils/contig_stitcher_events.py diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 70ca50e37..0c97f4dcc 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -13,25 +13,33 @@ from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.consensus_aligner import CigarActions +import micall.utils.contig_stitcher_events as events T = TypeVar("T") logger = logging.getLogger(__name__) - class StitcherContext: def __init__(self): self.name_generator_state: int = 0 + self.events: List[events.EventType] = [] def generate_new_name(self): self.name_generator_state += 1 return f"c{self.name_generator_state}" + def emit(self, event: events.EventType): + self.events.append(event) + context: ContextVar[StitcherContext] = ContextVar("StitcherContext", 
default=StitcherContext()) def with_fresh_context(body: Callable[[StitcherContext], T]) -> T: - return Context().run(lambda: body(context.get())) + def wrapper(): + ctx = StitcherContext() + context.set(ctx) + return body(ctx) + return Context().run(wrapper) @dataclass(frozen=True) @@ -82,9 +90,8 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont right = replace(self, name=context.get().generate_new_name(), alignment=alignment_right) logger.debug("Created contigs %r at %s and %r at %s by cutting %r.", - left.name, left.alignment, right.name, right.alignment, self.name, - extra={"action": "cut", "original": self, - "left": left, "right": right}) + left.name, left.alignment, right.name, right.alignment, self.name) + context.get().emit(events.Cut(self, left, right)) return (left, right) @@ -101,9 +108,8 @@ def lstrip_query(self) -> 'AlignedContig': result = AlignedContig.make(query, alignment, self.strand) logger.debug("Doing lstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", self.name, result.name, self.alignment, - len(self.seq), result.alignment, len(result.seq), - extra={"action": "modify", "type": "lstrip", - "original": self, "result": result}) + len(self.seq), result.alignment, len(result.seq)) + context.get().emit(events.LStrip(self, result)) return result @@ -118,9 +124,8 @@ def rstrip_query(self) -> 'AlignedContig': result = AlignedContig.make(query, alignment, self.strand) logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", self.name, result.name, self.alignment, - len(self.seq), result.alignment, len(result.seq), - extra={"action": "modify", "type": "rstrip", - "original": self, "result": result}) + len(self.seq), result.alignment, len(result.seq)) + context.get().emit(events.RStrip(self, result)) return result @@ -160,9 +165,8 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': assert self.strand == other.strand ret = AlignedContig.make(query=query, 
alignment=alignment, strand=self.strand) logger.debug("Munged contigs %r at %s with %r at %s resulting in %r at %s.", - self.name, self.alignment, other.name, other.alignment, - ret.name, ret.alignment, extra={"action": "munge", "left": self, - "right": other, "result": ret}) + self.name, self.alignment, other.name, other.alignment, ret.name, ret.alignment) + context.get().emit(events.Munge(self, other, ret)) return ret @@ -199,8 +203,8 @@ def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: ret = reduce(AlignedContig.munge, stripped_parts) logger.debug("Created a frankenstein %r at %s (len %s) from %s.", ret.name, ret.alignment, len(ret.seq), - [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in stripped_parts], - extra={"action": "combine", "contigs": stripped_parts, "result": ret}) + [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in stripped_parts]) + context.get().emit(events.Combine(stripped_parts, ret)) return ret @@ -213,8 +217,8 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: """ if contig.ref_seq is None: - logger.info("Contig %r not aligned - no reference.", contig.name, - extra={"action": "alignment", "type": "noref", "contig": contig}) + logger.info("Contig %r not aligned - no reference.", contig.name) + context.get().emit(events.NoRef(contig)) yield contig return @@ -229,29 +233,27 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: connected = connect_cigar_hits(list(map(lambda p: p[0], hits_array))) if hits_array else [] if not connected: - logger.info("Contig %r not aligned - backend's choice.", contig.name, - extra={"action": "alignment", "type": "zerohits", "contig": contig}) + logger.info("Contig %r not aligned - backend's choice.", contig.name) + context.get().emit(events.ZeroHits(contig)) yield contig return if len(set(map(lambda p: p[1], hits_array))) > 1: - logger.info("Discarding contig %r because it aligned both in forward and reverse sense.", 
contig.name, - extra={"action": "alignment", "type": "strandconflict", "contig": contig}) + logger.info("Discarding contig %r because it aligned both in forward and reverse sense.", contig.name) + context.get().emit(events.StrandConflict(contig)) yield contig return logger.info("Contig %r produced %s aligner hits. After connecting them, the number became %s.", - contig.name, len(hits_array), len(connected), - extra={"action": "alignment", "type": "hitnumber", "contig": contig, - "initial": hits_array, "connected": connected}) + contig.name, len(hits_array), len(connected)) + context.get().emit(events.HitNumber(contig, hits_array, connected)) strand = hits_array[0][1] if strand == "reverse": rc = str(Seq(contig.seq).reverse_complement()) new_contig = replace(contig, seq=rc) - logger.info("Reverse complemented contig %r.", contig.name, - extra={"action": "alignment", "type": "reversecomplement", - "contig": contig, "result": new_contig}) + logger.info("Reverse complemented contig %r.", contig.name) + context.get().emit(events.ReverseComplement(contig, new_contig)) contig = new_contig for i, single_hit in enumerate(connected): @@ -261,12 +263,10 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", i, contig.name,part.name,part.alignment.q_st, part.alignment.q_ei,part.alignment.r_st,part.alignment.r_ei, - " (rev)" if strand == "reverse" else "", - extra={"action": "alignment", "type": "hit", - "contig": contig, "part":part, "i": i}) + " (rev)" if strand == "reverse" else "") logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name, part.name,part.alignment, " (rev)" if strand == "reverse" else "") - + context.get().emit(events.Hit(contig, part, i)) yield part @@ -455,10 +455,8 @@ def stitch_2_contigs(left, right): left.name, left.alignment, len(left.seq), right.name, right.alignment, len(right.seq), left_overlap.name, left_overlap.alignment, 
len(left_overlap.seq), - right_overlap.name, right_overlap.alignment, len(right_overlap.seq), - extra={"action": "stitchcut", "left": left, "right": right, - "left_overlap": left_overlap, "right_overlap": right_overlap, - "left_remainder": left_remainder, "right_remainder": right_remainder}) + right_overlap.name, right_overlap.alignment, len(right_overlap.seq)) + context.get().emit(events.StitchCut(left, right, left_overlap, right_overlap, left_remainder, right_remainder)) # Align overlapping parts, then recombine based on concordance. aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) @@ -475,14 +473,11 @@ def stitch_2_contigs(left, right): logger.debug("Created overlap contigs %r at %s and %r at %s based on parts of %r and %r, with avg. concordance %s%%, cut point at %s%%, and full concordance [%s].", left_overlap_take.name, left_overlap.alignment, right_overlap_take.name, right_overlap_take.alignment, left.name, right.name, round(average_concordance * 100), - round(cut_point_location_scaled * 100), concordance_str, - extra={"action": "overlap", "left": left, "right": right, - "left_remainder": left_remainder, "right_remainder": right_remainder, - "left_overlap": left_overlap, "right_overlap": right_overlap, - "left_take": left_overlap_take, "right_take": right_overlap_take, - "concordance": concordance, "avg": average_concordance, - "cut_point": max_concordance_index, - "cut_point_scaled": cut_point_location_scaled}) + round(cut_point_location_scaled * 100), concordance_str) + context.get().emit(events.Overlap(left, right, left_overlap, right_overlap, + left_remainder, right_remainder, left_overlap_take, + right_overlap_take, concordance, average_concordance, + max_concordance_index, cut_point_location_scaled)) return combine_contigs([left_remainder, left_overlap_take, right_overlap_take, right_remainder]) @@ -501,8 +496,8 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: # Find overlap. 
If there isn't one - we are done with the current contig. overlapping_contig = find_overlapping_contig(current, contigs) if not overlapping_contig: - logger.info("Nothing overlaps with %r.", current.name, - extra={"action": "nooverlap", "contig": current}) + logger.info("Nothing overlaps with %r.", current.name) + context.get().emit(events.NoOverlap(current)) yield current continue @@ -514,12 +509,11 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: logger.info("Stitching %r with %r results in %r at [%s,%s]->[%s,%s].", current.name, overlapping_contig.name, new_contig.name, new_contig.alignment.q_st, new_contig.alignment.q_ei, - new_contig.alignment.r_st, new_contig.alignment.r_ei, - extra={"action": "stitch", "result": new_contig, - "left": current, "right": overlapping_contig}) + new_contig.alignment.r_st, new_contig.alignment.r_ei) logger.debug("Stitching %r with %r results in %r at %s (len %s).", current.name, overlapping_contig.name, new_contig.name, new_contig.alignment, len(new_contig.seq)) + context.get().emit(events.Stitch(current, overlapping_contig, new_contig)) def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]: @@ -588,8 +582,8 @@ def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig] if covered: contigs.remove(covered) logger.info("Droped contig %r as it is completely covered by these contigs: %s.", - covered.name, ", ".join(repr(x.name) for x in covering), - extra={"action": "drop", "contig": covered, "covering": covering}) + covered.name, ", ".join(repr(x.name) for x in covering)) + context.get().emit(events.Drop(covered, covering)) else: break @@ -624,8 +618,8 @@ def try_split(contig): # overlaps around them. # And we are likely to lose quality with every stitching operation. # By skipping we assert that this gap is aligner's fault. 
- logger.debug("Ignored insignificant gap of %r, %s.", contig.name, gap, - extra={"action": "ignoregap", "contig": contig, "gap": gap}) + logger.debug("Ignored insignificant gap of %r, %s.", contig.name, gap) + context.get().emit(events.IgnoreGap(contig, gap)) continue if covered(contig, gap): @@ -646,9 +640,8 @@ def try_split(contig): left_part.name, left_part.alignment.q_st, left_part.alignment.q_ei, left_part.alignment.r_st, left_part.alignment.r_ei, right_part.name, right_part.alignment.q_st, right_part.alignment.q_ei, - right_part.alignment.r_st, right_part.alignment.r_ei, - extra={"action": "splitgap", "contig": contig, - "gap": gap, "left": left_part, "right": right_part}) + right_part.alignment.r_st, right_part.alignment.r_ei) + context.get().emit(events.SplitGap(contig, gap, left_part, right_part)) return process_queue: LifoQueue = LifoQueue() @@ -665,11 +658,11 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCont contigs = list(contigs) for contig in contigs: logger.info("Introduced contig %r of ref %r, group_ref %r, and length %s.", - contig.name, contig.ref_name, contig.group_ref, len(contig.seq), - extra={"action": "intro", "contig": contig}) + contig.name, contig.ref_name, contig.group_ref, len(contig.seq)) logger.debug("Introduced contig %r (seq = %s) of ref %r, group_ref %r (seq = %s), and length %s.", contig.name, contig.seq, contig.ref_name, contig.group_ref, contig.ref_seq, len(contig.seq)) + context.get().emit(events.Intro(contig)) maybe_aligned = list(align_all_to_reference(contigs)) @@ -698,8 +691,8 @@ def combine(group_ref): contigs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) result = combine_contigs(contigs) logger.debug("Combining these contigs for final output for %r: %s.", - group_ref, [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in contigs], - extra={"action": "finalcombine", "contigs": contigs, "result": result}) + group_ref, [f"{x.name!r} at {x.alignment} (len 
{len(x.seq)})" for x in contigs]) + context.get().emit(events.FinalCombine(contigs, result)) return result yield from map(combine, consensus_parts) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index fbdb654b3..b36bcceac 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -85,9 +85,6 @@ def write_contig_refs(contigs_fasta_path, group_refs = {} def run_stitcher(ctx): - logger = logging.getLogger("micall.core.contig_stitcher") - handler = add_structured_handler(logger) - genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs) @@ -102,7 +99,7 @@ def run_stitcher(ctx): contig=contig.seq)) if stitcher_logger.level <= logging.DEBUG and stitcher_plot_path is not None: - plot_stitcher_coverage(handler.logs, stitcher_plot_path) + plot_stitcher_coverage(ctx.events, stitcher_plot_path) return len(contigs) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 03d6080e8..5d8a705ba 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -19,6 +19,7 @@ from micall.core.project_config import ProjectConfig from micall.utils.alignment_wrapper import align_nucs +import micall.utils.contig_stitcher_events as events logger = logging.getLogger(__name__) @@ -535,35 +536,33 @@ def record_bad_contig(contig: Contig, lst: List[Contig]): contig_map[contig.name] = contig lst.append(contig.name) + for event in logs: - if not hasattr(event, "action"): - pass - elif event.action == "finalcombine": + if isinstance(event, events.FinalCombine): record_contig(event.result, event.contigs) - elif event.action == "splitgap": + elif isinstance(event, events.SplitGap): record_contig(event.left, [event.contig]) record_contig(event.right, [event.contig]) - elif event.action == "intro": + elif isinstance(event, events.Intro): record_contig(event.contig, []) - elif event.action == "alignment": - if event.type == "hit": - record_contig(event.part, [event.contig]) - elif event.type == "noref": - 
record_bad_contig(event.contig, unknown) - elif event.type == "zerohits" or event.type == "strandconflict": - record_bad_contig(event.contig, anomaly) - elif event.type == "reversecomplement": - record_contig(event.new_contig, [event.contig]) - elif event.type in ("hitnumber", "reversenumber"): - pass - else: - raise RuntimeError(f"Unrecognized event of type {event.type!r}: {event}") - elif event.action == "munge": + elif isinstance(event, events.Hit): + record_contig(event.part, [event.contig]) + elif isinstance(event, events.NoRef): + record_bad_contig(event.contig, unknown) + elif isinstance(event, events.ZeroHits): + record_bad_contig(event.contig, anomaly) + elif isinstance(event, events.StrandConflict): + record_bad_contig(event.contig, anomaly) + elif isinstance(event, events.ReverseComplement): + record_contig(event.result, [event.contig]) + elif isinstance(event, events.HitNumber): + pass + elif isinstance(event, events.Munge): record_contig(event.result, [event.left, event.right]) - elif event.action == "modify": + elif isinstance(event, (events.LStrip, events.RStrip)): record_contig(event.result, [event.original]) record_morphism(event.result, event.original) - elif event.action == "overlap": + elif isinstance(event, events.Overlap): overlaps_list.append(event.left_overlap.name) overlaps_list.append(event.right_overlap.name) overlap_leftparent_map[event.left_remainder.name] = event.left.name @@ -572,26 +571,26 @@ def record_bad_contig(contig: Contig, lst: List[Contig]): overlap_righttake_map[event.right_remainder.name] = event.right_take.name overlap_sibling_map[event.left_remainder.name] = event.right_remainder.name overlap_sibling_map[event.right_remainder.name] = event.left_remainder.name - elif event.action == "drop": + elif isinstance(event, events.Drop): record_bad_contig(event.contig, discarded) - elif event.action == "stitchcut": + elif isinstance(event, events.StitchCut): record_contig(event.left_overlap, [event.left]) 
record_contig(event.left_remainder, [event.left]) record_contig(event.right_overlap, [event.right]) record_contig(event.right_remainder, [event.right]) - elif event.action == "stitch": + elif isinstance(event, events.Stitch): record_contig(event.result, [event.left, event.right]) - elif event.action == "cut": + elif isinstance(event, events.Cut): record_contig(event.left, [event.original]) record_contig(event.right, [event.original]) - elif event.action == "combine": + elif isinstance(event, events.Combine): record_contig(event.result, event.contigs) combine_left_edge[event.result.name] = event.contigs[0].name combine_right_edge[event.result.name] = event.contigs[-1].name - elif event.action in ("ignoregap", "nooverlap"): + elif isinstance(event, (events.IgnoreGap, events.NoOverlap)): pass else: - raise RuntimeError(f"Unrecognized action: {event.action}") + raise RuntimeError(f"Unrecognized action or event: {event}") group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq} children_graph = inverse_graph(parent_graph) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 3fff183fb..80722d21f 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -4,6 +4,7 @@ import os import pytest +import micall.core.contig_stitcher as stitcher from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig, disambiguate_concordance from micall.core.plot_contigs import plot_stitcher_coverage from micall.tests.utils import MockAligner, fixed_random_seed @@ -11,6 +12,10 @@ from micall.tests.test_denovo import check_hcv_db # activates the fixture +logging.getLogger("micall.core.contig_stitcher").setLevel(logging.DEBUG) +logging.getLogger("micall.core.plot_contigs").setLevel(logging.DEBUG) + + 
@pytest.fixture() def exact_aligner(monkeypatch): monkeypatch.setattr('micall.core.contig_stitcher.Aligner', MockAligner) @@ -18,12 +23,7 @@ def exact_aligner(monkeypatch): @pytest.fixture def visualizer(request, tmp_path): - # Set up the logger and structured handler - logger = logging.getLogger("micall.core.contig_stitcher") - logger.setLevel(logging.DEBUG) - handler = add_structured_handler(logger) - logging.getLogger("micall.core.plot_contigs").setLevel(logging.DEBUG) - + logs = stitcher.context.set(stitcher.StitcherContext()) test_name = request.node.name plot_name = test_name + ".svg" pwd = os.path.dirname(__file__) @@ -33,7 +33,8 @@ def visualizer(request, tmp_path): path_to_produced = os.path.join(tmp_path, plot_name) def check(): - figure = plot_stitcher_coverage(handler.logs, path_to_produced) + logs = stitcher.context.get().events + figure = plot_stitcher_coverage(logs, path_to_produced) with open(path_to_produced, 'r') as produced_file: produced_data = produced_file.read() @@ -748,7 +749,7 @@ def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") stitched_contigs = os.path.join(tmp_path, "stitched.csv") - main([contigs, stitched_contigs]) + stitcher.main([contigs, stitched_contigs]) assert os.path.exists(contigs) assert os.path.exists(stitched_contigs) @@ -769,7 +770,7 @@ def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") stitched_contigs = os.path.join(tmp_path, "stitched.csv") plot = os.path.join(tmp_path, "exact_parts_contigs.plot.svg") - main([contigs, stitched_contigs, "--debug", "--plot", plot]) + stitcher.main([contigs, stitched_contigs, "--debug", "--plot", plot]) assert os.path.exists(contigs) assert os.path.exists(stitched_contigs) diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index f47033450..28984d87d 100644 --- 
a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -1,14 +1,12 @@ import pytest import json import os -from micall.core.contig_stitcher import GenotypedContig, AlignedContig, stitch_consensus, stitch_contigs, split_contigs_with_gaps, drop_completely_covered, combine_overlaps +from micall.core.contig_stitcher import GenotypedContig, AlignedContig, stitch_consensus, stitch_contigs, split_contigs_with_gaps, drop_completely_covered, combine_overlaps, with_fresh_context, StitcherContext from micall.core.plot_contigs import build_stitcher_figure from micall.utils.cigar_tools import CigarHit, Cigar from micall.utils.consensus_aligner import CigarActions -from micall.utils.structured_logger import add_structured_handler from typing import Dict, List from collections import defaultdict -import logging @pytest.fixture @@ -121,14 +119,10 @@ def test_visualizer_simple(no_aligner, description): for contig in contigs: contig.__dict__["group_ref"] = "HIV1-B-FR-K03455-seed" - logger = logging.getLogger("micall.core.contig_stitcher") - logger.setLevel(logging.DEBUG) - handler = add_structured_handler(logger) + def test(ctx: StitcherContext): + stitched = list(stitch_consensus(contigs)) + assert len(ctx.events) >= len(contigs) + figure = build_stitcher_figure(ctx.events) + assert len(figure.elements) > len(contigs) + 1 - stitched = list(stitch_consensus(contigs)) - - assert logger.level <= logging.DEBUG - assert len(handler.logs) >= len(contigs) - - figure = build_stitcher_figure(handler.logs) - assert len(figure.elements) > len(contigs) + 1 + with_fresh_context(test) diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py new file mode 100644 index 000000000..360dd7936 --- /dev/null +++ b/micall/utils/contig_stitcher_events.py @@ -0,0 +1,143 @@ +from typing import Union, List +from dataclasses import dataclass + + +@dataclass +class Cut: + original: 'Contig' + left: 'Contig' + right: 'Contig' + + 
+@dataclass +class LStrip: + original: 'AlignedContig' + result: 'AlignedContig' + + +@dataclass +class RStrip: + original: 'AlignedContig' + result: 'AlignedContig' + + +@dataclass +class Munge: + left: 'AlignedContig' + right: 'AlignedContig' + result: 'AlignedContig' + + +@dataclass +class Combine: + contigs: List['AlignedContig'] + result: 'AlignedContig' + + +@dataclass +class NoRef: + contig: 'GenotypedContig' + + +@dataclass +class ZeroHits: + contig: 'GenotypedContig' + + +@dataclass +class StrandConflict: + contig: 'GenotypedContig' + + +@dataclass +class HitNumber: + contig: 'GenotypedContig' + initial: object + connected: object + + +@dataclass +class ReverseComplement: + contig: 'GenotypedContig' + result: 'GenotypedContig' + + +@dataclass +class Hit: + contig: 'GenotypedContig' + part: 'AlignedContig' + index: int + + +@dataclass +class StitchCut: + left: 'AlignedContig' + right: 'AlignedContig' + left_overlap: 'AlignedContig' + right_overlap: 'AlignedContig' + left_remainder: 'AlignedContig' + right_remainder: 'AlignedContig' + + +@dataclass +class Overlap: + left: 'AlignedContig' + right: 'AlignedContig' + left_overlap: 'AlignedContig' + right_overlap: 'AlignedContig' + left_remainder: 'AlignedContig' + right_remainder: 'AlignedContig' + left_take: 'AlignedContig' + right_take: 'AlignedContig' + concordance: List[float] + average: float + cut_point: int + cut_point_scaled: float + + +@dataclass +class NoOverlap: + contig: 'AlignedContig' + + +@dataclass +class Stitch: + left: 'AlignedContig' + right: 'AlignedContig' + result: 'AlignedContig' + + +@dataclass +class Drop: + contig: 'AlignedContig' + covering: List['AlignedContig'] + + +@dataclass +class IgnoreGap: + contig: 'AlignedContig' + gap: 'CigarHit' + + +@dataclass +class SplitGap: + contig: 'AlignedContig' + gap: 'CigarHit' + left: 'AlignedContig' + right: 'AlignedContig' + + +@dataclass +class Intro: + contig: 'GenotypedContig' + + +@dataclass +class FinalCombine: + contigs: 
List['AlignedContig'] + result: 'AlignedContig' + + +AlignmentEvent = Union[NoRef, ZeroHits, StrandConflict, HitNumber, ReverseComplement, Hit] +ModifyEvent = Union[LStrip, RStrip] +EventType = Union[Cut, ModifyEvent, Munge, AlignmentEvent, StitchCut, Overlap, NoOverlap, Stitch, Drop, IgnoreGap, SplitGap, Intro] From 834c89b0bacbdf3a1f6af40d626dcab4c2e2e573 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 23 Jan 2024 17:14:21 -0800 Subject: [PATCH 164/485] Contig stitcher: do not require logging=debug for the visualizer --- micall/core/contig_stitcher.py | 4 ---- micall/core/denovo.py | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 0c97f4dcc..9407e83d5 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -723,10 +723,6 @@ def main(args): logger.setLevel(logging.WARN) logging.basicConfig(level=logger.level) - if args.plot and logger.level > logging.DEBUG: - logging.error("Option --plot requires --debug.") - exit(1) - write_contig_refs(args.contigs.name, args.stitched_contigs, stitcher_plot_path=args.plot) args.contigs.close() args.stitched_contigs.close() diff --git a/micall/core/denovo.py b/micall/core/denovo.py index b36bcceac..8b93cf505 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -19,7 +19,7 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig -from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, logger as stitcher_logger, with_fresh_context +from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, with_fresh_context from micall.core.plot_contigs import plot_stitcher_coverage from micall.utils.structured_logger import add_structured_handler @@ -98,7 +98,7 @@ def run_stitcher(ctx): group_ref=contig.group_ref, contig=contig.seq)) - if stitcher_logger.level <= logging.DEBUG and stitcher_plot_path is not None: + if stitcher_plot_path is 
not None: plot_stitcher_coverage(ctx.events, stitcher_plot_path) return len(contigs) From ea50a6d4f4d91dcf081751d0a44047c0513499f9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Jan 2024 10:36:05 -0800 Subject: [PATCH 165/485] Contig stitcher: add missing type signatures --- micall/core/contig_stitcher.py | 6 +- micall/core/plot_contigs.py | 113 ++++++++++++------------- micall/utils/contig_stitcher_events.py | 2 +- 3 files changed, 60 insertions(+), 61 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9407e83d5..05b129551 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -19,15 +19,15 @@ logger = logging.getLogger(__name__) class StitcherContext: - def __init__(self): + def __init__(self) -> None: self.name_generator_state: int = 0 self.events: List[events.EventType] = [] - def generate_new_name(self): + def generate_new_name(self) -> str: self.name_generator_state += 1 return f"c{self.name_generator_state}" - def emit(self, event: events.EventType): + def emit(self, event: events.EventType) -> None: self.events.append(event) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 5d8a705ba..0d9781b19 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1,4 +1,5 @@ import typing +from typing import Dict, Tuple, List, Set, Iterable, NoReturn from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType from collections import Counter, defaultdict from csv import DictReader @@ -19,6 +20,7 @@ from micall.core.project_config import ProjectConfig from micall.utils.alignment_wrapper import align_nucs +from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig import micall.utils.contig_stitcher_events as events @@ -395,19 +397,14 @@ def build_coverage_figure(genome_coverage_csv, blast_csv=None, use_concordance=F return f -def plot_stitcher_coverage(logs, genome_coverage_svg_path): +def 
plot_stitcher_coverage(logs: Iterable[events.EventType], genome_coverage_svg_path: str): f = build_stitcher_figure(logs) f.show(w=970).save_svg(genome_coverage_svg_path, context=draw.Context(invert_y=True)) return f -from types import SimpleNamespace -from typing import Union, Dict, Tuple, List, Optional, Set -from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig -import random - -def build_stitcher_figure(logs) -> None: - contig_map: Dict[str, Contig] = {} +def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: + contig_map: Dict[str, GenotypedContig] = {} name_mappings: Dict[str, str] = {} parent_graph: Dict[str, List[str]] = {} morphism_graph: Dict[str, List[str]] = {} @@ -515,7 +512,7 @@ def graph_sum(graph_a, graph_b): def symmetric_closure(graph): return graph_sum(graph, inverse_graph(graph)) - def record_contig(contig: Contig, parents: List[Contig]): + def record_contig(contig: GenotypedContig, parents: List[GenotypedContig]): contig_map[contig.name] = contig if [contig.name] != [parent.name for parent in parents]: for parent in parents: @@ -532,7 +529,7 @@ def record_morphism(contig: Contig, original: Contig): if contig.name not in lst: lst.append(contig.name) - def record_bad_contig(contig: Contig, lst: List[Contig]): + def record_bad_contig(contig: GenotypedContig, lst: List[str]): contig_map[contig.name] = contig lst.append(contig.name) @@ -585,11 +582,13 @@ def record_bad_contig(contig: Contig, lst: List[Contig]): record_contig(event.right, [event.original]) elif isinstance(event, events.Combine): record_contig(event.result, event.contigs) - combine_left_edge[event.result.name] = event.contigs[0].name - combine_right_edge[event.result.name] = event.contigs[-1].name + if event.contigs: + combine_left_edge[event.result.name] = event.contigs[0].name + combine_right_edge[event.result.name] = event.contigs[-1].name elif isinstance(event, (events.IgnoreGap, events.NoOverlap)): pass else: + x: NoReturn = event raise 
RuntimeError(f"Unrecognized action or event: {event}") group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq} @@ -610,23 +609,23 @@ def record_bad_contig(contig: Contig, lst: List[Contig]): eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_closure(morphism_graph))) reduced_morphism_graph = reduced_closure(morphism_graph) - for contig in overlaps_list: - temporary.add(contig) - for child in transitive_children_graph.get(contig, []): + for contig_name in overlaps_list: + temporary.add(contig_name) + for child in transitive_children_graph.get(contig_name, []): temporary.add(child) - for contig, parents in parent_graph.items(): + for contig_name, parents in parent_graph.items(): if len(parents) > 2: - children_join_points.append(contig) - for contig, children in children_graph.items(): + children_join_points.append(contig_name) + for contig_name, children in children_graph.items(): if len(children) > 2: - children_meet_points.append(contig) + children_meet_points.append(contig_name) - last_join_points_parent = {contig for join in children_join_points for contig in transitive_parent_graph.get(join, [])} + last_join_points_parent = {contig_name for join in children_join_points for contig_name in transitive_parent_graph.get(join, [])} last_join_points = [] - for contig in children_join_points: - if contig not in last_join_points_parent: - last_join_points.append(contig) + for contig_name in children_join_points: + if contig_name not in last_join_points_parent: + last_join_points.append(contig_name) def set_query_position(contig: Contig): if contig.name in query_position_map: @@ -644,7 +643,7 @@ def set_query_position(contig: Contig): if parent.name not in query_position_map: set_query_position(parent) - average = sum(query_position_map[parent_name] for parent_name in parent_names) / len(parent_names) + average = round(sum(query_position_map[parent_name] for parent_name in parent_names) / 
len(parent_names)) query_position_map[contig.name] = average else: query_position_map[contig.name] = (contig.alignment.q_st + contig.alignment.q_ei) // 2 @@ -653,9 +652,9 @@ def set_query_position(contig: Contig): set_query_position(contig) # Closing `temporary' - for contig in contig_map: - if contig in temporary: - for clone in eqv_morphism_graph.get(contig, [contig]): + for contig_name in contig_map: + if contig_name in temporary: + for clone in eqv_morphism_graph.get(contig_name, [contig_name]): temporary.add(clone) def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): @@ -676,42 +675,42 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map)): pass final_parts: Dict[str, bool] = {} - for contig in contig_map: - if contig in temporary: + for contig_name in contig_map: + if contig_name in temporary: continue - if contig in overlap_sibling_map: - finals = reduced_morphism_graph.get(contig, [contig]) + if contig_name in overlap_sibling_map: + finals = reduced_morphism_graph.get(contig_name, [contig_name]) if len(finals) == 1: [final] = finals parents = reduced_parent_graph.get(final, []) if len(parents) == 1: final_parts[final] = True - elif contig in bad_contigs: - final_parts[contig] = True + elif contig_name in bad_contigs: + final_parts[contig_name] = True for join in last_join_points + sorted_sinks: parents = parent_graph.get(join, [join]) if not any(isinstance(contig_map[parent], AlignedContig) for parent in parents): parents = [join] - for contig in parents: - for contig in reduced_morphism_graph.get(contig, [contig]): - if contig in bad_contigs: + for contig_name in parents: + for contig_name in reduced_morphism_graph.get(contig_name, [contig_name]): + if contig_name in bad_contigs: continue - if any(contig in transitive_parent_graph.get(bad, []) for bad in bad_contigs): + if any(contig_name in 
transitive_parent_graph.get(bad, []) for bad in bad_contigs): continue - if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig, [contig])): + if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig_name, [contig_name])): continue - transitive_parent = eqv_parent_graph.get(contig, [contig]) + transitive_parent = eqv_parent_graph.get(contig_name, [contig_name]) if any(parent in transitive_parent for parent in final_parts): continue - final_parts[contig] = True + final_parts[contig_name] = True final_parent_mapping: Dict[str, List[str]] = {} for parent_name in sorted_roots: @@ -725,7 +724,7 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): min_position, max_position = 1, 1 position_offset = 100 - for contig in contig_map.values(): + for _, contig in contig_map.items(): if isinstance(contig, GenotypedContig) and contig.ref_seq is not None: max_position = max(max_position, len(contig.ref_seq) + 3 * position_offset) else: @@ -748,8 +747,8 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): k += 1 name_mappings[child] = f"{i + 1}.{k + 1}" - for contig, name in name_mappings.items(): - logger.debug(f"Contig name {contig!r} is displayed as {name!r}.") + for contig_name, name in name_mappings.items(): + logger.debug(f"Contig name {contig_name!r} is displayed as {name!r}.") def get_neighbours(part, lookup): for clone in eqv_morphism_graph.get(part.name, [part.name]): @@ -771,8 +770,8 @@ def get_neighbour(part, lookup): full_size_map: Dict[str, Tuple[int, int]] = {} for parent_name in sorted_roots: - parts = final_parent_mapping[parent_name] - parts = [contig_map[part] for part in parts] + parts_names = final_parent_mapping[parent_name] + parts = [contig_map[part] for part in parts_names] for part in parts: if not isinstance(part, AlignedContig): @@ -801,8 +800,8 @@ def get_neighbour(part, lookup): aligned_size_map[part.name] = (r_st, r_ei) - sibling = ([overlap_sibling_map[name] for name in 
eqv_morphism_graph.get(part.name, [part.name]) if name in overlap_sibling_map] or [None])[0] - sibling = sibling and contig_map[sibling] + sibling_name = ([overlap_sibling_map[name] for name in eqv_morphism_graph.get(part.name, [part.name]) if name in overlap_sibling_map] or [""])[0] + sibling = sibling_name and contig_map[sibling_name] prev_part = get_neighbour(sibling, overlap_lefttake_map) next_part = get_neighbour(sibling, overlap_righttake_map) @@ -820,7 +819,7 @@ def get_neighbour(part, lookup): full_size_map[part.name] = (r_st, r_ei) - def get_contig_coordinates(contig): + def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int]: if isinstance(contig, AlignedContig): r_st = position_offset + contig.alignment.r_st r_ei = position_offset + contig.alignment.r_ei @@ -841,7 +840,7 @@ def get_contig_coordinates(contig): a_r_ei = f_r_ei return (a_r_st, a_r_ei, f_r_st, f_r_ei) - def get_tracks(repeatset, group_ref, contig_name): + def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterable[Track]: parts = final_parent_mapping[contig_name] for part_name in parts: part = contig_map[part_name] @@ -863,7 +862,7 @@ def get_tracks(repeatset, group_ref, contig_name): (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) yield Track(f_r_st, f_r_ei, label=f"{indexes}") - def get_arrows(repeatset, group_ref, contig_name, labels): + def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bool) -> Iterable[Arrow]: parts = final_parent_mapping[contig_name] for part_name in parts: part = contig_map[part_name] @@ -890,8 +889,8 @@ def get_arrows(repeatset, group_ref, contig_name, labels): h=height, label=indexes) - def get_all_arrows(group_ref, labels): - repeatset = set() + def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: + repeatset: Set[str] = set() for parent_name in sorted_roots: yield from get_arrows(repeatset, group_ref, parent_name, labels) @@ -966,8 +965,8 @@ def 
get_all_arrows(group_ref, labels): # Contigs # ########### - repeatset1 = set() - repeatset2 = set() + repeatset1: Set[str] = set() + repeatset2: Set[str] = set() for parent_name in sorted_roots: arrows = list(get_arrows(repeatset1, group_ref, parent_name, labels=False)) if arrows: @@ -992,7 +991,7 @@ def get_all_arrows(group_ref, labels): contig = contig_map[contig_name] (r_st, r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) - name = name_mappings.get(contig.name, contig.name) + name = name_mappings.get(contig_name, contig_name) figure.add(Arrow(r_st, r_ei, elevation=-20, h=1)) figure.add(Track(f_r_st, f_r_ei, label=name)) @@ -1021,7 +1020,7 @@ def get_all_arrows(group_ref, labels): else: colour = "red" - name = name_mappings.get(contig.name, contig.name) + name = name_mappings.get(contig_name, contig_name) figure.add(Track(a_r_st, a_r_ei, color=colour, label=name)) ########### @@ -1042,7 +1041,7 @@ def get_all_arrows(group_ref, labels): r_st = position_offset r_ei = position_offset + len(contig.seq) colour = "red" - name = name_mappings.get(contig.name, contig.name) + name = name_mappings.get(contig_name, contig_name) figure.add(Track(r_st, r_ei, color=colour, label=name)) if not figure.elements: diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index 360dd7936..9c7a60a49 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -140,4 +140,4 @@ class FinalCombine: AlignmentEvent = Union[NoRef, ZeroHits, StrandConflict, HitNumber, ReverseComplement, Hit] ModifyEvent = Union[LStrip, RStrip] -EventType = Union[Cut, ModifyEvent, Munge, AlignmentEvent, StitchCut, Overlap, NoOverlap, Stitch, Drop, IgnoreGap, SplitGap, Intro] +EventType = Union[Cut, ModifyEvent, Munge, Combine, AlignmentEvent, StitchCut, Overlap, NoOverlap, Stitch, Drop, IgnoreGap, SplitGap, Intro, FinalCombine] From 0238f0472fc612cf1a958be7fb3adb3f7d79e85f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 
24 Jan 2024 11:29:12 -0800 Subject: [PATCH 166/485] Contig stitcher: simplify the concordance algorithm --- micall/core/contig_stitcher.py | 53 ++++++++++---------------- micall/tests/test_contig_stitcher.py | 25 +++++------- micall/utils/contig_stitcher_events.py | 7 ++-- 3 files changed, 33 insertions(+), 52 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 05b129551..04365be73 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -10,6 +10,7 @@ from Bio import Seq import logging from contextvars import ContextVar, Context +from fractions import Fraction from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.consensus_aligner import CigarActions @@ -350,15 +351,17 @@ def find_overlapping_contig(self, aligned_contigs): return max(every, key=lambda other: other.alignment.ref_length if other else 0, default=None) -def calculate_concordance(left: str, right: str) -> List[float]: +def calculate_concordance(left: str, right: str) -> List[Fraction]: """ - Calculate concordance for two given sequences using a sliding window method. + Calculate concordance for two given sequences using a sliding average. - The function compares the two strings from both left to right and then right to left, - calculating for each position the ratio of matching characters in a window around the - current position. So position holds a moving avarage score. - - It's required that the input strings are of the same length. + The function compares the two strings character by character, simultaneously from + both left to right and right to left, calculating a score that represents a moving + average of matches at each position. If characters match at a given position, + a score of 1 is added; otherwise, a score of 0 is added. 
The score is then + averaged with the previous scores using a weighted sliding average where the + current score has a weight of 1/3 and the accumulated score has a weight of 2/3. + This sliding average score is halved and then processed again, but in reverse direction. :param left: string representing first sequence :param right: string representing second sequence @@ -368,22 +371,18 @@ def calculate_concordance(left: str, right: str) -> List[float]: if len(left) != len(right): raise ValueError("Can only calculate concordance for same sized sequences") - result: List[float] = [0] * len(left) + result: List[Fraction] = [Fraction(0)] * len(left) def slide(start, end): - window_size = 30 - scores = deque([0] * window_size, maxlen=window_size) - scores_sum = 0 + scores_sum = Fraction(0) inputs = list(zip(left, right)) increment = 1 if start <= end else -1 for i in range(start, end, increment): (a, b) = inputs[i] - current = a == b - scores_sum -= scores.popleft() - scores_sum += current - scores.append(current) - result[i] += (scores_sum / window_size) / 2 + current = Fraction(1) if a == b else Fraction(0) + scores_sum = (scores_sum * 2 / 3 + current * 1 / 3) + result[i] += scores_sum / 2 # Slide forward, then in reverse, adding the scores at each position. 
slide(0, len(left)) @@ -392,22 +391,10 @@ def slide(start, end): return result -def disambiguate_concordance(concordance: List[float]) -> Iterable[Tuple[float, int, int]]: - def slide(concordance): - count = 0 - for i, (prev, current, next) in enumerate(sliding_window(concordance)): - if current == prev: - count += 1 - yield count - else: - yield 0 - - forward = list(slide(concordance)) - reverse = list(reversed(list(slide(reversed(concordance))))) - for i, (x, f, r) in enumerate(zip(concordance, forward, reverse)): - local_rank = f * r +def disambiguate_concordance(concordance: List[float]) -> Iterable[Tuple[float, int]]: + for i, x in enumerate(concordance): global_rank = i if i < len(concordance) / 2 else len(concordance) - i - 1 - yield (x, local_rank, global_rank) + yield (x, global_rank) def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned_right, concordance): @@ -467,8 +454,8 @@ def stitch_2_contigs(left, right): right_overlap_drop, right_overlap_take = right_overlap.cut_reference(aligned_right_cutpoint) # Log it. - average_concordance = sum(concordance) / (len(concordance) or 1) - concordance_str = ', '.join(map(lambda x: str(round(x, 2)), concordance)) + average_concordance = Fraction(sum(concordance) / (len(concordance) or 1)) + concordance_str = ', '.join(map(lambda x: str(int(round(x * 100)) / 100), concordance)) cut_point_location_scaled = max_concordance_index / (((len(concordance) or 1) - 1) or 1) logger.debug("Created overlap contigs %r at %s and %r at %s based on parts of %r and %r, with avg. 
concordance %s%%, cut point at %s%%, and full concordance [%s].", left_overlap_take.name, left_overlap.alignment, right_overlap_take.name, right_overlap_take.alignment, diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 80722d21f..8131c9c58 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -978,18 +978,18 @@ def generate_random_string_pair(length): @pytest.mark.parametrize( 'left, right, expected', - [("aaaaa", "aaaaa", [0.1] * 5), - ("abcdd", "abcdd", [0.1] * 5), - ("aaaaaaaa", "baaaaaab", [0.1, 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.1]), - ("aaaaaaaa", "aaaaaaab", [0.13, 0.13, 0.13, 0.13, 0.13, 0.13, 0.13, 0.12]), - ("aaaaaaaa", "aaaaaaab", [0.13, 0.13, 0.13, 0.13, 0.13, 0.13, 0.13, 0.12]), - ("aaaaaaaa", "aaaaabbb", [0.1, 0.1, 0.1, 0.1, 0.1, 0.08, 0.08, 0.08]), - ("aaaaaaaa", "aaabbaaa", [0.12, 0.12, 0.12, 0.1, 0.1, 0.12, 0.12, 0.12]), + [("aaaaa", "aaaaa", [0.6, 0.68, 0.7, 0.68, 0.6]), + ("abcdd", "abcdd", [0.6, 0.68, 0.7, 0.68, 0.6]), + ("aaaaaaaa", "baaaaaab", [0.3, 0.62, 0.71, 0.75, 0.75, 0.71, 0.62, 0.3]), + ("aaaaaaaa", "aaaaaaab", [0.64, 0.73, 0.79, 0.8, 0.79, 0.73, 0.64, 0.31]), + ("aaaaaaaa", "aaaaaaab", [0.64, 0.73, 0.79, 0.8, 0.79, 0.73, 0.64, 0.31]), + ("aaaaaaaa", "aaaaabbb", [0.6, 0.68, 0.7, 0.68, 0.6, 0.29, 0.19, 0.13]), + ("aaaaaaaa", "aaabbaaa", [0.56, 0.63, 0.62, 0.39, 0.39, 0.62, 0.63, 0.56]), ("aaaaa", "bbbbb", [0] * 5), ] ) def test_concordance_simple(left, right, expected): - result = [round(x, 2) for x in calculate_concordance(left, right)] + result = [round(float(x), 2) for x in calculate_concordance(left, right)] assert result == expected @@ -1000,7 +1000,7 @@ def test_concordance_simple(left, right, expected): ("a" * 128, "a" * 64 + "ba" * 32, 32), ("a" * 128, "a" * 54 + "b" * 20 + "a" * 54, 28), # two peaks ("a" * 128, "a" * 63 + "b" * 2 + "a" * 63, 32), # two peaks - ("a" * 1280, "b" * 640 + "a" * 640, 640 + 30), # the window is too small to 
account for all of the context + ("a" * 1280, "b" * 640 + "a" * 640, round(1280 * 3 / 4)), ] ) def test_concordance_simple_index(left, right, expected): @@ -1019,13 +1019,6 @@ def generate_test_cases(num_cases): concordance_cases = generate_test_cases(num_cases=100) -@pytest.mark.parametrize('left, right', concordance_cases) -def test_concordance_output_is_list_of_floats(left, right): - result = calculate_concordance(left, right) - assert isinstance(result, list), "Result should be a list" - assert all(isinstance(n, float) for n in result), "All items in result should be float" - - @pytest.mark.parametrize('left, right', concordance_cases) def test_concordance_output_range(left, right): result = calculate_concordance(left, right) diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index 9c7a60a49..652069704 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -1,5 +1,6 @@ from typing import Union, List from dataclasses import dataclass +from fractions import Fraction @dataclass @@ -89,10 +90,10 @@ class Overlap: right_remainder: 'AlignedContig' left_take: 'AlignedContig' right_take: 'AlignedContig' - concordance: List[float] - average: float + concordance: List[Fraction] + average: Fraction cut_point: int - cut_point_scaled: float + cut_point_scaled: Fraction @dataclass From 6bd0bfeaa800c1cfa42d00737b79ffa967dffae9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Jan 2024 12:49:09 -0800 Subject: [PATCH 167/485] Contig stitcher: simplify some visualizer code --- micall/core/plot_contigs.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 0d9781b19..4575c5627 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -424,7 +424,6 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: temporary: Set[str] = set() 
children_join_points: List[str] = [] children_meet_points: List[str] = [] - last_active: List[str] = [] query_position_map: Dict[str, int] = {} def get_oldest_ancestors(recur, graph, ancestor_name): @@ -442,30 +441,27 @@ def get_oldest_ancestors(recur, graph, ancestor_name): yield ancestor_name return - def reduced_closure(graph): + def remove_intermediate_edges(graph): ret = {} for parent, children in graph.items(): lst = [] for child in children: - for anc in get_oldest_ancestors(set(), graph, child): - if anc not in lst: - lst.append(anc) + if all(other not in graph.get(child, []) for other in children): + lst.append(child) ret[parent] = lst return ret def get_all_ancestors(recur, lst, graph, ancestor_name): - if ancestor_name in recur: - assert RuntimeError(f"Recursion in graph {graph!r}") - else: + if ancestor_name not in recur: recur = recur.copy() recur.add(ancestor_name) - if ancestor_name not in lst: - lst.append(ancestor_name) + if ancestor_name not in lst: + lst.append(ancestor_name) - existing_ancestors = graph.get(ancestor_name, []) - for existing in existing_ancestors: - get_all_ancestors(recur, lst, graph, existing) + existing_ancestors = graph.get(ancestor_name, []) + for existing in existing_ancestors: + get_all_ancestors(recur, lst, graph, existing) def transitive_closure(graph): ret = {} @@ -593,10 +589,9 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq} children_graph = inverse_graph(parent_graph) - reduced_parent_graph = reduced_closure(parent_graph) - reduced_children_graph = reduced_closure(children_graph) transitive_parent_graph = transitive_closure(parent_graph) transitive_children_graph = transitive_closure(children_graph) + reduced_parent_graph = remove_intermediate_edges(transitive_parent_graph) eqv_parent_graph = reflexive_closure(symmetric_closure(transitive_parent_graph)) sorted_roots = list(sorted(parent_name for 
parent_name in contig_map @@ -606,8 +601,9 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): if child_name not in children_graph)) bad_contigs = anomaly + discarded + unknown - eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_closure(morphism_graph))) - reduced_morphism_graph = reduced_closure(morphism_graph) + transitive_morphism_graph = transitive_closure(morphism_graph) + reduced_morphism_graph = remove_intermediate_edges(transitive_morphism_graph) + eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_morphism_graph)) for contig_name in overlaps_list: temporary.add(contig_name) From 3419b9b892d424fb5e042287eab7a5299c0a68ee Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Jan 2024 12:53:46 -0800 Subject: [PATCH 168/485] Contig stitcher: remove all logging.info calls --- micall/core/contig_stitcher.py | 44 +++++++++------------ micall/tests/test_contig_stitcher.py | 57 ---------------------------- 2 files changed, 17 insertions(+), 84 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 04365be73..021154b58 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -218,7 +218,7 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: """ if contig.ref_seq is None: - logger.info("Contig %r not aligned - no reference.", contig.name) + logger.debug("Contig %r not aligned - no reference.", contig.name) context.get().emit(events.NoRef(contig)) yield contig return @@ -234,26 +234,26 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: connected = connect_cigar_hits(list(map(lambda p: p[0], hits_array))) if hits_array else [] if not connected: - logger.info("Contig %r not aligned - backend's choice.", contig.name) + logger.debug("Contig %r not aligned - backend's choice.", contig.name) context.get().emit(events.ZeroHits(contig)) yield contig return if len(set(map(lambda p: p[1], 
hits_array))) > 1: - logger.info("Discarding contig %r because it aligned both in forward and reverse sense.", contig.name) + logger.debug("Discarding contig %r because it aligned both in forward and reverse sense.", contig.name) context.get().emit(events.StrandConflict(contig)) yield contig return - logger.info("Contig %r produced %s aligner hits. After connecting them, the number became %s.", - contig.name, len(hits_array), len(connected)) + logger.debug("Contig %r produced %s aligner hits. After connecting them, the number became %s.", + contig.name, len(hits_array), len(connected)) context.get().emit(events.HitNumber(contig, hits_array, connected)) strand = hits_array[0][1] if strand == "reverse": rc = str(Seq(contig.seq).reverse_complement()) new_contig = replace(contig, seq=rc) - logger.info("Reverse complemented contig %r.", contig.name) + logger.debug("Reverse complemented contig %r.", contig.name) context.get().emit(events.ReverseComplement(contig, new_contig)) contig = new_contig @@ -261,10 +261,6 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: query = replace(contig, name=context.get().generate_new_name()) part = AlignedContig.make(query, single_hit, strand) - logger.info("Part %r of contig %r aligned as %r at [%s, %s]->[%s, %s]%s.", - i, contig.name,part.name,part.alignment.q_st, - part.alignment.q_ei,part.alignment.r_st,part.alignment.r_ei, - " (rev)" if strand == "reverse" else "") logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name, part.name,part.alignment, " (rev)" if strand == "reverse" else "") context.get().emit(events.Hit(contig, part, i)) @@ -483,7 +479,7 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: # Find overlap. If there isn't one - we are done with the current contig. 
overlapping_contig = find_overlapping_contig(current, contigs) if not overlapping_contig: - logger.info("Nothing overlaps with %r.", current.name) + logger.debug("Nothing overlaps with %r.", current.name) context.get().emit(events.NoOverlap(current)) yield current continue @@ -493,10 +489,6 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: contigs.remove(overlapping_contig) contigs.insert(0, new_contig) - logger.info("Stitching %r with %r results in %r at [%s,%s]->[%s,%s].", - current.name, overlapping_contig.name, - new_contig.name, new_contig.alignment.q_st, new_contig.alignment.q_ei, - new_contig.alignment.r_st, new_contig.alignment.r_ei) logger.debug("Stitching %r with %r results in %r at %s (len %s).", current.name, overlapping_contig.name, new_contig.name, new_contig.alignment, len(new_contig.seq)) @@ -568,8 +560,8 @@ def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig] covered, covering = find_covered_contig(contigs) if covered: contigs.remove(covered) - logger.info("Droped contig %r as it is completely covered by these contigs: %s.", - covered.name, ", ".join(repr(x.name) for x in covering)) + logger.debug("Droped contig %r as it is completely covered by these contigs: %s.", + covered.name, ", ".join(repr(x.name) for x in covering)) context.get().emit(events.Drop(covered, covering)) else: break @@ -620,14 +612,14 @@ def try_split(contig): contigs.append(right_part) process_queue.put(right_part) - logger.info("Split contig %r around its gap at [%s, %s]->[%s, %s]. 
" - "Left part: %r at [%s, %s]->[%s, %s], " - "right part: %r at [%s, %s]->[%s, %s].", - contig.name, gap.q_st, gap.q_ei, gap.r_st, gap.r_ei, - left_part.name, left_part.alignment.q_st, left_part.alignment.q_ei, - left_part.alignment.r_st, left_part.alignment.r_ei, - right_part.name, right_part.alignment.q_st, right_part.alignment.q_ei, - right_part.alignment.r_st, right_part.alignment.r_ei) + logger.debug("Split contig %r around its gap at [%s, %s]->[%s, %s]. " + "Left part: %r at [%s, %s]->[%s, %s], " + "right part: %r at [%s, %s]->[%s, %s].", + contig.name, gap.q_st, gap.q_ei, gap.r_st, gap.r_ei, + left_part.name, left_part.alignment.q_st, left_part.alignment.q_ei, + left_part.alignment.r_st, left_part.alignment.r_ei, + right_part.name, right_part.alignment.q_st, right_part.alignment.q_ei, + right_part.alignment.r_st, right_part.alignment.r_ei) context.get().emit(events.SplitGap(contig, gap, left_part, right_part)) return @@ -644,8 +636,6 @@ def try_split(contig): def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: contigs = list(contigs) for contig in contigs: - logger.info("Introduced contig %r of ref %r, group_ref %r, and length %s.", - contig.name, contig.ref_name, contig.group_ref, len(contig.seq)) logger.debug("Introduced contig %r (seq = %s) of ref %r, group_ref %r (seq = %s), and length %s.", contig.name, contig.seq, contig.ref_name, contig.group_ref, contig.ref_seq, len(contig.seq)) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 8131c9c58..2979a2fe3 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -688,63 +688,6 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner, vi assert len(visualizer().elements) > len(contigs) -def test_correct_processing_complex_logs(exact_aligner, visualizer): - # Scenario: There are two reference organisms. - # Each with 4 contigs. 
- # For each, three overlapping contigs are stitched together, the non-overlapping is kept separate. - # Tested before, but this time we check the logs - - ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 - - contigs = [[ - GenotypedContig(name='a', - seq='A' * 50 + 'C' * 20, - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='A' * 20 + 'C' * 50, - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='c', - seq='C' * 70 + 'T' * 20, - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='d', - seq='T' * 20 + 'G' * 50, - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] for ref_name in ['testref-1', 'testref-2']] - - contigs = sum(contigs, start=[]) - - logger = logging.getLogger("micall.core.contig_stitcher") - logger.setLevel(logging.DEBUG) - handler = add_structured_handler(logger) - - assert len(handler.logs) == 0 - list(stitch_consensus(contigs)) - assert len(handler.logs) == 150 - - info_messages = [m for m in handler.logs if m.levelname == 'INFO'] - debug_messages = [m for m in handler.logs if m.levelname == 'DEBUG'] - assert len(info_messages) == 32 - assert len(debug_messages) == len(handler.logs) - len(info_messages) - - assert len(visualizer().elements) > len(contigs) - - def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") From a07611db74b937969f5d9e3090ae36c8a5ba3529 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Jan 2024 12:54:39 -0800 Subject: [PATCH 169/485] Remove structured logger module --- micall/core/denovo.py | 1 - micall/tests/test_contig_stitcher.py | 1 - micall/utils/structured_logger.py | 27 --------------------------- 3 files changed, 29 deletions(-) delete mode 100644 
micall/utils/structured_logger.py diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 8b93cf505..86e5c5434 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -21,7 +21,6 @@ from micall.core.project_config import ProjectConfig from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, with_fresh_context from micall.core.plot_contigs import plot_stitcher_coverage -from micall.utils.structured_logger import add_structured_handler IVA = "iva" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 2979a2fe3..a4ec0c5c6 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -8,7 +8,6 @@ from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig, disambiguate_concordance from micall.core.plot_contigs import plot_stitcher_coverage from micall.tests.utils import MockAligner, fixed_random_seed -from micall.utils.structured_logger import add_structured_handler from micall.tests.test_denovo import check_hcv_db # activates the fixture diff --git a/micall/utils/structured_logger.py b/micall/utils/structured_logger.py deleted file mode 100644 index c0f5ae687..000000000 --- a/micall/utils/structured_logger.py +++ /dev/null @@ -1,27 +0,0 @@ - -import logging -from typing import List, Tuple, Iterable, Callable - - -class InMemoryLogHandler(logging.Handler): - def __init__(self, name: str, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.name: str = name - self.logs: List[logging.LogRecord] = [] - self.callbacks = [] - - - def emit(self, record: logging.LogRecord): - self.logs.append(record) - for callback in self.callbacks: - callback(record) - - - def addCallback(self, callback: Callable[[logging.LogRecord], object]): - 
self.callbacks.append(callback) - - -def add_structured_handler(logger: logging.Logger): - memory_handler = InMemoryLogHandler(logger.name) - logger.addHandler(memory_handler) - return memory_handler From 3f0376a48317fd7084a35ee0177918244b950fc9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Jan 2024 15:26:07 -0800 Subject: [PATCH 170/485] Contig stitcher: fix midpoint calculation during gap split --- micall/core/contig_stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 021154b58..67ec2281d 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -602,7 +602,7 @@ def try_split(contig): continue if covered(contig, gap): - midpoint = gap.r_st + (gap.r_ei - gap.r_st) // 2 + contig.alignment.epsilon + midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + contig.alignment.epsilon left_part, right_part = contig.cut_reference(midpoint) left_part = left_part.rstrip_query() right_part = right_part.lstrip_query() From ce034428332ff0aa77981432a84ed7a683957d05 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Jan 2024 16:14:19 -0800 Subject: [PATCH 171/485] Contig stitcher: change unaligned colour --- micall/core/plot_contigs.py | 4 ++-- .../data/stitcher_plots/test_stitching_with_empty_contigs.svg | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 4575c5627..cedee05e7 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1014,7 +1014,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: else: figure.add(Arrow(a_r_st, a_r_ei, elevation=-20, h=1)) else: - colour = "red" + colour = "yellow" name = name_mappings.get(contig_name, contig_name) figure.add(Track(a_r_st, a_r_ei, color=colour, label=name)) @@ -1036,7 +1036,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: contig = 
contig_map[contig_name] r_st = position_offset r_ei = position_offset + len(contig.seq) - colour = "red" + colour = "yellow" name = name_mappings.get(contig_name, contig_name) figure.add(Track(r_st, r_ei, color=colour, label=name)) diff --git a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg index 9f7c26085..3cd329d42 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg @@ -38,7 +38,7 @@ - + 2 From 208316c48e75e6af76535965804542c3adecfbe2 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Jan 2024 17:14:01 -0800 Subject: [PATCH 172/485] Cigar tools: rename "gaps()" to "deletions()" --- micall/core/contig_stitcher.py | 2 +- micall/tests/test_cigar_tools.py | 4 ++-- micall/tests/test_contig_stitcher.py | 10 +++++----- micall/utils/cigar_tools.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 67ec2281d..ae1a33a66 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -590,7 +590,7 @@ def significant(gap): return gap.ref_length > 5 def try_split(contig): - for gap in contig.alignment.gaps(): + for gap in contig.alignment.deletions(): if not significant(gap): # Really we do not want to split on every little deletion # because that would mean that we would need to stitch diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index d2c05206a..330cbf2ad 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -451,7 +451,7 @@ def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): if not isinstance(x[2], Exception)]) def test_cigar_hit_gaps_no_m_or_i(hit): hit = parsed_hit(hit) - gaps = list(hit.gaps()) + gaps = list(hit.deletions()) if 'D' in str(hit.cigar): assert len(gaps) > 0 @@ 
-465,7 +465,7 @@ def test_cigar_hit_gaps_no_m_or_i(hit): if not isinstance(x[2], Exception)]) def test_cigar_hit_gaps_lengths(hit): hit = parsed_hit(hit) - gaps = list(hit.gaps()) + gaps = list(hit.deletions()) for gap in gaps: assert gap.query_length == 0 diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index a4ec0c5c6..b717fa9a9 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -502,12 +502,12 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner, visualizer): contigs = list(align_all_to_reference(contigs)) assert len(contigs) == 2 - assert len(list(contigs[0].alignment.gaps())) == 1 - assert len(list(contigs[1].alignment.gaps())) == 0 + assert len(list(contigs[0].alignment.deletions())) == 1 + assert len(list(contigs[1].alignment.deletions())) == 0 results = list(split_contigs_with_gaps(contigs)) assert len(results) == 3 - assert all(list(contig.alignment.gaps()) == [] for contig in results) + assert all(list(contig.alignment.deletions()) == [] for contig in results) assert len(visualizer().elements) > len(contigs) @@ -536,8 +536,8 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner, visualizer): contigs = list(align_all_to_reference(contigs)) assert len(contigs) == 2 - assert len(list(contigs[0].alignment.gaps())) == 1 - assert len(list(contigs[1].alignment.gaps())) == 0 + assert len(list(contigs[0].alignment.deletions())) == 1 + assert len(list(contigs[1].alignment.deletions())) == 0 results = list(split_contigs_with_gaps(contigs)) assert len(results) == 3 diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 9fd8d7514..aed662b28 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -520,7 +520,7 @@ def touches(self, other) -> bool: and self.q_ei + 1 == other.q_st - def gaps(self) -> Iterable['CigarHit']: + def deletions(self) -> Iterable['CigarHit']: # TODO(vitalik): memoize whatever possible. 
covered_coordinates = self.coordinate_mapping.ref_to_query.keys() From e9617cb9cc267dad1c276fcb707daf563459b850 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 25 Jan 2024 15:30:48 -0800 Subject: [PATCH 173/485] Cigar tools: add insertions() method to CigarHit Also fix the issue with deletions() where it would cluster multiple insertions+deletions in a single gap. --- micall/tests/test_cigar_tools.py | 57 ++++++++++++++++++++++++-------- micall/utils/cigar_tools.py | 56 +++++++++++++++++++------------ 2 files changed, 79 insertions(+), 34 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 330cbf2ad..ea38a2c52 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -222,6 +222,12 @@ def parsed_hit(string): ('9M9I9D9M@1,1', 13.5 or 27/2, ['9M9I4D@1,1', '5D9M@14,19']), + ('9M9D9I9D9I9D9M@1,1', 13.5 or 27/2, + ['9M4D@1,1', '5D9I9D9I9D9M@14,10']), + + ('9M9I9D9I9D9I9M@1,1', 13.5 or 27/2, + ['9M9I4D@1,1', '5D9I9D9I9M@14,19']), + ('1M1I1D1M@1,1', 1.5, # same as previous 2 cases but smaller ['1M1I@1,1', '1D1M@2,3']), @@ -449,28 +455,53 @@ def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): @pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) -def test_cigar_hit_gaps_no_m_or_i(hit): +def test_cigar_hit_deletions_no_m_or_i(hit): + hit = parsed_hit(hit) + deletions = list(hit.deletions()) + + assert len(deletions) == len([op for op in str(hit.cigar) if op in 'DN']) + + for deletion in deletions: + assert 'M' not in str(deletion.cigar) + assert 'I' not in str(deletion.cigar) + + +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception)]) +def test_cigar_hit_deletions_lengths(hit): + hit = parsed_hit(hit) + deletions = list(hit.deletions()) + + for deletion in deletions: + assert deletion.query_length == 0 + assert deletion.ref_length > 0 + assert 
deletion.coordinate_mapping.ref_to_query == {} + + +@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases + if not isinstance(x[2], Exception)]) +def test_cigar_hit_insertions_no_m_or_i(hit): hit = parsed_hit(hit) - gaps = list(hit.deletions()) + insertions = list(hit.insertions()) - if 'D' in str(hit.cigar): - assert len(gaps) > 0 + if 'I' in str(hit.cigar): + assert len(insertions) > 0 - for gap in gaps: - assert 'M' not in str(gap.cigar) - assert 'I' not in str(gap.cigar) + for insertion in insertions: + assert 'M' not in str(insertion.cigar) + assert 'D' not in str(insertion.cigar) @pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)]) -def test_cigar_hit_gaps_lengths(hit): +def test_cigar_hit_insertions_lengths(hit): hit = parsed_hit(hit) - gaps = list(hit.deletions()) + insertions = list(hit.insertions()) - for gap in gaps: - assert gap.query_length == 0 - assert gap.ref_length > 0 - assert gap.coordinate_mapping.ref_to_query == {} + for insertion in insertions: + assert insertion.ref_length == 0 + assert insertion.query_length > 0 + assert insertion.coordinate_mapping.ref_to_query == {} @pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index aed662b28..0a11ddb6c 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -520,31 +520,45 @@ def touches(self, other) -> bool: and self.q_ei + 1 == other.q_st - def deletions(self) -> Iterable['CigarHit']: - # TODO(vitalik): memoize whatever possible. 
- - covered_coordinates = self.coordinate_mapping.ref_to_query.keys() - all_coordinates = self.coordinate_mapping.ref_to_query.domain - - def make_gap(r_st, r_en): - r_ei = r_en - 1 - left, midright = self.cut_reference(r_st - self.epsilon) - middle, right = midright.cut_reference(r_ei + self.epsilon) - return middle - - gap_start = None - for coord in all_coordinates: - if coord in covered_coordinates: + def _gaps(self, is_deletions: bool) -> Iterable['CigarHit']: + last_query_index = self.q_st + last_ref_index = self.r_st + gap_start: Optional[int] = None + op_to_ref = {v: k for k, v in self.coordinate_mapping.ref_to_op.items()} + op_to_query = {v: k for k, v in self.coordinate_mapping.query_to_op.items()} + present = op_to_ref if is_deletions else op_to_query + missing = op_to_query if is_deletions else op_to_ref + + for op_index in sorted(self.coordinate_mapping.query_to_op.codomain) + [None]: + if op_index in present and \ + op_index not in missing: + if gap_start is None: + gap_start = op_index + else: if gap_start is not None: - yield make_gap(gap_start, coord) + cigar = self.cigar.slice_operations(gap_start, op_index) + if is_deletions: + q_st = last_query_index + r_st = present[gap_start] + else: + q_st = present[gap_start] + r_st = last_ref_index + q_ei = q_st + cigar.query_length - 1 + r_ei = r_st + cigar.ref_length - 1 + yield CigarHit(cigar, q_st=q_st, q_ei=q_ei, r_st=r_st, r_ei=r_ei) gap_start = None - else: - if gap_start is None: - gap_start = coord - if gap_start is not None: - yield make_gap(gap_start, coord) + if op_index in op_to_query: + last_query_index = op_to_query[op_index] + if op_index in op_to_ref: + last_ref_index = op_to_ref[op_index] + + + def deletions(self) -> Iterable['CigarHit']: + return self._gaps(is_deletions=True) + def insertions(self) -> Iterable['CigarHit']: + return self._gaps(is_deletions=False) def __add__(self, other): """ From f2072a023e0ea1a4b013c398d7e41347cc4bf13f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: 
Fri, 26 Jan 2024 09:24:31 -0800 Subject: [PATCH 174/485] Contig stitcher: fix a visualization of root combinations --- micall/core/contig_stitcher.py | 5 +- micall/core/plot_contigs.py | 40 ++++++----- .../test_big_insertion_in_a_single_contig.svg | 34 +++++++++ ...est_big_insertion_in_a_single_contig_2.svg | 34 +++++++++ .../test_gap_around_big_insertion.svg | 70 ++++++++++++++++++ micall/tests/test_contig_stitcher.py | 72 +++++++++++++++++++ micall/tests/test_tests_utils.py | 4 +- micall/tests/utils.py | 41 ++++++----- micall/utils/contig_stitcher_events.py | 7 +- 9 files changed, 265 insertions(+), 42 deletions(-) create mode 100644 micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg create mode 100644 micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg create mode 100644 micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index ae1a33a66..1884259f7 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -231,8 +231,7 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: min(x.q_st, x.q_en - 1), max(x.q_st, x.q_en - 1)), "forward" if x.strand == 1 else "reverse") for x in alignments] - connected = connect_cigar_hits(list(map(lambda p: p[0], hits_array))) if hits_array else [] - + connected = connect_cigar_hits([hit for hit, strand in hits_array]) if hits_array else [] if not connected: logger.debug("Contig %r not aligned - backend's choice.", contig.name) context.get().emit(events.ZeroHits(contig)) @@ -284,7 +283,7 @@ def get_indexes(name: str) -> Tuple[int, int]: def is_out_of_order(name: str) -> bool: return reference_sorted.index(name) != query_sorted.index(name) - sorted_by_query = list(sorted(contigs, key=lambda contig: contig.alignment.q_st if isinstance(contig, AlignedContig) else -1)) + sorted_by_query = sorted(contigs, key=lambda contig: 
get_indexes(contig.name)) for prev_contig, contig, next_contig in sliding_window(sorted_by_query): if isinstance(contig, AlignedContig): name = contig.name diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index cedee05e7..650113b9d 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1,5 +1,5 @@ import typing -from typing import Dict, Tuple, List, Set, Iterable, NoReturn +from typing import Dict, Tuple, List, Set, Iterable, NoReturn, Literal from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType from collections import Counter, defaultdict from csv import DictReader @@ -20,7 +20,8 @@ from micall.core.project_config import ProjectConfig from micall.utils.alignment_wrapper import align_nucs -from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig +from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig, sliding_window +from micall.utils.cigar_tools import CigarHit import micall.utils.contig_stitcher_events as events @@ -419,6 +420,7 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: overlap_lefttake_map: Dict[str, str] = {} overlap_righttake_map: Dict[str, str] = {} overlap_sibling_map: Dict[str, str] = {} + combine_list: List[str] = [] combine_left_edge: Dict[str, str] = {} combine_right_edge: Dict[str, str] = {} temporary: Set[str] = set() @@ -577,6 +579,9 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): record_contig(event.left, [event.original]) record_contig(event.right, [event.original]) elif isinstance(event, events.Combine): + for contig in event.contigs: + combine_list.append(contig.name) + record_contig(event.result, event.contigs) if event.contigs: combine_left_edge[event.result.name] = event.contigs[0].name @@ -617,6 +622,12 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): if len(children) > 2: children_meet_points.append(contig_name) + def hits_to_insertions(hits: List[CigarHit]): + 
for hit in hits: + yield CigarHit.from_default_alignment(q_st=0, q_ei=hit.q_st - 1, r_st=hit.q_st - 1, r_ei=hit.q_st - 2) + yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, r_st=hit.q_ei + 1, r_ei=hit.q_ei) + yield from hit.insertions() + last_join_points_parent = {contig_name for join in children_join_points for contig_name in transitive_parent_graph.get(join, [])} last_join_points = [] for contig_name in children_join_points: @@ -675,13 +686,10 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): if contig_name in temporary: continue - if contig_name in overlap_sibling_map: + if contig_name in combine_list: finals = reduced_morphism_graph.get(contig_name, [contig_name]) if len(finals) == 1: - [final] = finals - parents = reduced_parent_graph.get(final, []) - if len(parents) == 1: - final_parts[final] = True + final_parts[finals[0]] = True elif contig_name in bad_contigs: final_parts[contig_name] = True @@ -708,7 +716,7 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): final_parts[contig_name] = True - final_parent_mapping: Dict[str, List[str]] = {} + final_children_mapping: Dict[str, List[str]] = {} for parent_name in sorted_roots: children = [] for final_contig in final_parts: @@ -716,7 +724,7 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): parent_name in reduced_parent_graph.get(final_contig, []): children.append(final_contig) - final_parent_mapping[parent_name] = children + final_children_mapping[parent_name] = children min_position, max_position = 1, 1 position_offset = 100 @@ -727,7 +735,7 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): max_position = max(max_position, len(contig.seq) + 3 * position_offset) name_mappings = {} - for i, (parent, children) in enumerate(sorted(final_parent_mapping.items(), key=lambda p: p[0])): + for i, (parent, children) in enumerate(sorted(final_children_mapping.items(), 
key=lambda p: p[0])): name_mappings[parent] = f"{i + 1}" children = list(sorted(children, key=lambda name: query_position_map.get(name, -1))) for k, child in enumerate(children): @@ -766,7 +774,7 @@ def get_neighbour(part, lookup): full_size_map: Dict[str, Tuple[int, int]] = {} for parent_name in sorted_roots: - parts_names = final_parent_mapping[parent_name] + parts_names = final_children_mapping[parent_name] parts = [contig_map[part] for part in parts_names] for part in parts: @@ -837,7 +845,7 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] return (a_r_st, a_r_ei, f_r_st, f_r_ei) def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterable[Track]: - parts = final_parent_mapping[contig_name] + parts = final_children_mapping[contig_name] for part_name in parts: part = contig_map[part_name] @@ -859,7 +867,7 @@ def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterabl yield Track(f_r_st, f_r_ei, label=f"{indexes}") def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bool) -> Iterable[Arrow]: - parts = final_parent_mapping[contig_name] + parts = final_children_mapping[contig_name] for part_name in parts: part = contig_map[part_name] @@ -980,7 +988,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: pos = position_offset / 2 figure.add(Track(pos, pos, h=40, label=label)) for parent_name in sorted_roots: - contigs = final_parent_mapping.get(parent_name, []) + contigs = final_children_mapping.get(parent_name, []) for contig_name in contigs: if contig_name not in discarded: continue @@ -1000,7 +1008,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: pos = position_offset / 2 figure.add(Track(pos, pos, h=40, label=label)) for parent_name in sorted_roots: - contigs = final_parent_mapping.get(parent_name, []) + contigs = final_children_mapping.get(parent_name, []) for contig_name in contigs: if contig_name not in anomaly: 
continue @@ -1028,7 +1036,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: pos = position_offset / 2 figure.add(Track(pos, pos, h=40, label=label)) for parent_name in sorted_roots: - contigs = final_parent_mapping.get(parent_name, []) + contigs = final_children_mapping.get(parent_name, []) for contig_name in contigs: if contig_name not in unknown: continue diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg new file mode 100644 index 000000000..5bebd90cc --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg @@ -0,0 +1,34 @@ + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg new file mode 100644 index 000000000..a529c9c8e --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg @@ -0,0 +1,34 @@ + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg new file mode 100644 index 000000000..27ba8bba9 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -0,0 +1,70 @@ + + + + + + + +testref + + + + + + + +1.1 + + + + + +2 + + + + + +1.2 + + + + + + + + + + + + + + + + +1.1 + + + + + +1.2 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index b717fa9a9..8b9bead83 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -30,6 +30,7 @@ def visualizer(request, tmp_path): 
os.makedirs(plots_dir, exist_ok=True) path_to_expected = os.path.join(plots_dir, plot_name) path_to_produced = os.path.join(tmp_path, plot_name) + # path_to_produced = path_to_expected def check(): logs = stitcher.context.get().events @@ -687,6 +688,77 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner, vi assert len(visualizer().elements) > len(contigs) +def test_big_insertion_in_a_single_contig(exact_aligner, visualizer): + # Scenario: Single contig produces many alignments. + + ref_seq='A' * 10 + 'B' * 20 + 'C' * 10 + + contigs = [ + GenotypedContig(name='a', + seq='B' * 10 + 'D' * 100 + 'B' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == contigs[0].seq + + assert len(visualizer().elements) > len(contigs) + + +def test_big_insertion_in_a_single_contig_2(exact_aligner, visualizer): + # Scenario: Single contig produces many alignments. + + ref_seq='A' * 10 + 'B' * 20 + 'C' * 10 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 10 + 'D' * 100 + 'C' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == contigs[0].seq + + assert len(visualizer().elements) > len(contigs) + + +def test_gap_around_big_insertion(exact_aligner, visualizer): + # Scenario: Contig is split around its gap, then stripped. 
+ + ref_seq='A' * 10 + 'B' * 20 + 'C' * 10 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 10 + 'D' * 100 + 'C' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig(name='b', + seq='B' * 20, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert len(visualizer().elements) > len(contigs) + + def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") diff --git a/micall/tests/test_tests_utils.py b/micall/tests/test_tests_utils.py index f1e70c845..8e7a61edd 100644 --- a/micall/tests/test_tests_utils.py +++ b/micall/tests/test_tests_utils.py @@ -49,8 +49,8 @@ def test_multiple_matches(): assert len(alignments) == 5 assert alignments[0].r_st == 0 assert alignments[0].r_en == 20 - assert alignments[1].r_st == 0 - assert alignments[1].r_en == 19 + assert alignments[1].r_st == 20 + assert alignments[1].r_en == 40 def test_multiple_matches_bigger_query(): diff --git a/micall/tests/utils.py b/micall/tests/utils.py index 0d8d980e4..8c7b8b3cf 100644 --- a/micall/tests/utils.py +++ b/micall/tests/utils.py @@ -5,6 +5,14 @@ from micall.utils.consensus_aligner import CigarActions +def find_all_occurrences(s, substring): + start = 0 + while True: + start = s.find(substring, start) + if start == -1: # no more occurrences found + return + yield start + start += len(substring) @dataclass class MockAlignment: @@ -37,25 +45,22 @@ def map(self, seq): for start in range(len(seq) - length + 1): end = start + length substring = seq[start:end] - if substring not in self.seq: - continue + for r_st in find_all_occurrences(self.seq, substring): + mapq = 60 + strand = 1 # Doesn't handle reverse complements in this mock. 
+ r_en = r_st + len(substring) + q_st = start + q_en = end + cigar = [[q_en - q_st, CigarActions.MATCH]] + cigar_str = f'{(q_en - q_st)}M' + al = MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) + if (q_st, q_en, r_st, r_en) not in returned: + returned.add((q_st, q_en, r_st, r_en)) + yield MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) - mapq = 60 - strand = 1 # Doesn't handle reverse complements in this mock. - r_st = self.seq.index(substring) - r_en = r_st + len(substring) - q_st = start - q_en = end - cigar = [[q_en - q_st, CigarActions.MATCH]] - cigar_str = f'{(q_en - q_st)}M' - al = MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) - if (q_st, q_en, r_st, r_en) not in returned: - returned.add((q_st, q_en, r_st, r_en)) - yield MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) - - max_matches -= 1 - if max_matches < 1: - return + max_matches -= 1 + if max_matches < 1: + return @contextmanager diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index 652069704..a18bf11e8 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -1,6 +1,7 @@ -from typing import Union, List +from typing import Union, List, Tuple, Literal from dataclasses import dataclass from fractions import Fraction +from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit @dataclass @@ -53,8 +54,8 @@ class StrandConflict: @dataclass class HitNumber: contig: 'GenotypedContig' - initial: object - connected: object + initial: List[Tuple[CigarHit, Literal["reverse", "forward"]]] + connected: List[CigarHit] @dataclass From 8b654884ad856da59b733292127e9a05a0a0fa2e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Jan 2024 13:45:24 -0800 Subject: [PATCH 175/485] Contig stitcher: improve visualizer positions handling --- micall/core/plot_contigs.py | 83 +++++++++---------- .../test_gap_around_big_insertion.svg | 54 
++++++------ .../test_gap_around_small_insertion.svg | 70 ++++++++++++++++ micall/tests/test_contig_stitcher.py | 30 ++++++- 4 files changed, 164 insertions(+), 73 deletions(-) create mode 100644 micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 650113b9d..c73cad924 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -427,21 +427,7 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: children_join_points: List[str] = [] children_meet_points: List[str] = [] query_position_map: Dict[str, int] = {} - - def get_oldest_ancestors(recur, graph, ancestor_name): - if ancestor_name in recur: - assert RuntimeError(f"Recursion in graph {graph!r}") - else: - recur = recur.copy() - recur.add(ancestor_name) - - if ancestor_name in graph: - existing_ancestors = graph[ancestor_name] - for existing in existing_ancestors: - yield from get_oldest_ancestors(recur, graph, existing) - else: - yield ancestor_name - return + initial_alignments: Dict[str, List[CigarHit]] = {} def remove_intermediate_edges(graph): ret = {} @@ -551,7 +537,7 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): elif isinstance(event, events.ReverseComplement): record_contig(event.result, [event.contig]) elif isinstance(event, events.HitNumber): - pass + initial_alignments[event.contig.name] = event.connected elif isinstance(event, events.Munge): record_contig(event.result, [event.left, event.right]) elif isinstance(event, (events.LStrip, events.RStrip)): @@ -624,9 +610,16 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): def hits_to_insertions(hits: List[CigarHit]): for hit in hits: - yield CigarHit.from_default_alignment(q_st=0, q_ei=hit.q_st - 1, r_st=hit.q_st - 1, r_ei=hit.q_st - 2) - yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, r_st=hit.q_ei + 1, r_ei=hit.q_ei) + yield 
CigarHit.from_default_alignment(q_st=0, q_ei=hit.q_st - 1, r_st=hit.r_st, r_ei=hit.r_st - 1) yield from hit.insertions() + yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, r_st=hit.r_ei + 1, r_ei=hit.r_ei) + + unaligned_map: Dict[str, List[CigarHit]] = {} + for contig_name, hits in initial_alignments.items(): + contig = contig_map[contig_name] + all_insertions = list(hits_to_insertions(hits)) + nonempty_insertions = [gap for gap in all_insertions if gap.query_length > 0] + unaligned_map[contig_name] = nonempty_insertions last_join_points_parent = {contig_name for join in children_join_points for contig_name in transitive_parent_graph.get(join, [])} last_join_points = [] @@ -785,22 +778,22 @@ def get_neighbour(part, lookup): next_part = get_neighbour(part, overlap_lefttake_map) if prev_part is not None: - r_st = prev_part.alignment.r_st + position_offset + r_st = prev_part.alignment.r_st else: if part.name in bad_contigs: start_delta = 0 else: start_delta = -1 * part.alignment.q_st - r_st = part.alignment.r_st + start_delta + position_offset + r_st = part.alignment.r_st + start_delta if next_part is not None: - r_ei = next_part.alignment.r_ei + position_offset + r_ei = next_part.alignment.r_ei else: if part.name in bad_contigs: end_delta = 0 else: end_delta = len(part.seq) - 1 - part.alignment.q_ei - r_ei = part.alignment.r_ei + end_delta + position_offset + r_ei = part.alignment.r_ei + end_delta aligned_size_map[part.name] = (r_st, r_ei) @@ -810,23 +803,23 @@ def get_neighbour(part, lookup): next_part = get_neighbour(sibling, overlap_righttake_map) if prev_part is not None and prev_part.alignment.r_ei < part.alignment.r_st and prev_part: - r_st = prev_part.alignment.r_st + position_offset + r_st = prev_part.alignment.r_st else: start_delta = -1 * part.alignment.q_st - r_st = part.alignment.r_st + start_delta + position_offset + r_st = part.alignment.r_st + start_delta if next_part is not None and next_part.alignment.r_st > 
part.alignment.r_ei and next_part: - r_ei = next_part.alignment.r_ei + position_offset + r_ei = next_part.alignment.r_ei else: end_delta = len(part.seq) - 1 - part.alignment.q_ei - r_ei = part.alignment.r_ei + end_delta + position_offset + r_ei = part.alignment.r_ei + end_delta full_size_map[part.name] = (r_st, r_ei) def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int]: if isinstance(contig, AlignedContig): - r_st = position_offset + contig.alignment.r_st - r_ei = position_offset + contig.alignment.r_ei + r_st = contig.alignment.r_st + r_ei = contig.alignment.r_ei if contig.name in aligned_size_map: a_r_st, a_r_ei = aligned_size_map[contig.name] else: @@ -838,15 +831,16 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] f_r_st = r_st - contig.alignment.q_st f_r_ei = r_ei + (len(contig.seq) - contig.alignment.q_ei) else: - f_r_st = position_offset - f_r_ei = position_offset + len(contig.seq) + f_r_st = 0 + f_r_ei = len(contig.seq) a_r_st = f_r_st a_r_ei = f_r_ei return (a_r_st, a_r_ei, f_r_st, f_r_ei) def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterable[Track]: parts = final_children_mapping[contig_name] - for part_name in parts: + parts = list(sorted(parts, key=lambda part: part.alignment.r_st if isinstance(part, AlignedContig) else -1)) + for prev_name, part_name, next_naem in sliding_window(parts): part = contig_map[part_name] if part.name in repeatset: @@ -864,7 +858,8 @@ def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterabl repeatset.add(part.name) indexes = name_mappings[part.name] (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) - yield Track(f_r_st, f_r_ei, label=f"{indexes}") + + yield Track(f_r_st + position_offset, f_r_ei + position_offset, label=f"{indexes}") def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bool) -> Iterable[Arrow]: parts = final_children_mapping[contig_name] @@ -888,7 +883,7 @@ 
def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bo height = 20 if labels else 1 elevation = 1 if labels else -20 (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) - yield Arrow(a_r_st, a_r_ei, + yield Arrow(a_r_st + position_offset, a_r_ei + position_offset, elevation=elevation, h=height, label=indexes) @@ -953,9 +948,9 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: figure.add(Multitrack(subtracks)) # Drawing the reference sequence. - r_st = position_offset - r_ei = position_offset + group_refs[group_ref] - figure.add(Track(r_st, r_ei, label=f"{group_ref}")) + r_st = 0 + r_ei = group_refs[group_ref] + figure.add(Track(r_st + position_offset, r_ei + position_offset, label=f"{group_ref}")) ########## # Arrows # @@ -996,8 +991,8 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: contig = contig_map[contig_name] (r_st, r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) name = name_mappings.get(contig_name, contig_name) - figure.add(Arrow(r_st, r_ei, elevation=-20, h=1)) - figure.add(Track(f_r_st, f_r_ei, label=name)) + figure.add(Arrow(r_st + position_offset, r_ei + position_offset, elevation=-20, h=1)) + figure.add(Track(f_r_st + position_offset, f_r_ei + position_offset, label=name)) ############# # Anomalies # @@ -1018,14 +1013,14 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: if isinstance(contig, AlignedContig): colour = "lightgray" if contig.strand == "reverse": - figure.add(Arrow(a_r_ei, a_r_st, elevation=-20, h=1)) + figure.add(Arrow(a_r_ei + position_offset, a_r_st + position_offset, elevation=-20, h=1)) else: - figure.add(Arrow(a_r_st, a_r_ei, elevation=-20, h=1)) + figure.add(Arrow(a_r_st + position_offset, a_r_ei + position_offset, elevation=-20, h=1)) else: colour = "yellow" name = name_mappings.get(contig_name, contig_name) - figure.add(Track(a_r_st, a_r_ei, color=colour, label=name)) + figure.add(Track(a_r_st + position_offset, a_r_ei + 
position_offset, color=colour, label=name)) ########### # Unknown # @@ -1042,11 +1037,11 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: continue contig = contig_map[contig_name] - r_st = position_offset - r_ei = position_offset + len(contig.seq) + r_st = 0 + r_ei = len(contig.seq) colour = "yellow" name = name_mappings.get(contig_name, contig_name) - figure.add(Track(r_st, r_ei, color=colour, label=name)) + figure.add(Track(r_st + position_offset, r_ei + position_offset, color=colour, label=name)) if not figure.elements: figure.add(Track(1, max_position, label='No contigs found.', color='none')) diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index 27ba8bba9..c2cea6e3f 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -1,68 +1,68 @@ + width="970.0" height="117.0" viewBox="0.0 -117.0 970.0 117.0"> - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.2 + + + +1.2 - - + + - - + + - - + + 1.1 - - + + 1.2 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg new file mode 100644 index 000000000..c2cea6e3f --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -0,0 +1,70 @@ + + + + + + + +testref + + + + + + + +1.1 + + + + + +2 + + + + + +1.2 + + + + + + + + + + + + + + + + +1.1 + + + + + +1.2 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 8b9bead83..5f709e6f0 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -30,7 +30,6 @@ def visualizer(request, tmp_path): os.makedirs(plots_dir, exist_ok=True) path_to_expected = 
os.path.join(plots_dir, plot_name) path_to_produced = os.path.join(tmp_path, plot_name) - # path_to_produced = path_to_expected def check(): logs = stitcher.context.get().events @@ -732,6 +731,33 @@ def test_big_insertion_in_a_single_contig_2(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) +def test_gap_around_small_insertion(exact_aligner, visualizer): + # Scenario: Contig is split around its gap, then stripped. + + ref_seq='A' * 10 + 'B' * 20 + 'C' * 10 + + contigs = [ + GenotypedContig(name='a', + seq='P' * 5 + 'A' * 10 + 'D' * 6 + 'C' * 10 + 'Z' * 5, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig(name='b', + seq='B' * 20, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert len(visualizer().elements) > len(contigs) + + def test_gap_around_big_insertion(exact_aligner, visualizer): # Scenario: Contig is split around its gap, then stripped. 
@@ -739,7 +765,7 @@ def test_gap_around_big_insertion(exact_aligner, visualizer): contigs = [ GenotypedContig(name='a', - seq='A' * 10 + 'D' * 100 + 'C' * 10, + seq='P' * 5 + 'A' * 10 + 'D' * 100 + 'C' * 10 + 'Z' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, From d2a088611dd9cdb03abea7c04673720eee3db69d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Jan 2024 13:53:36 -0800 Subject: [PATCH 176/485] Cigar tools: add *strip_reference methods --- micall/tests/test_cigar_tools.py | 180 +++++++++++++++++++++++++++---- micall/utils/cigar_tools.py | 53 +++++++-- 2 files changed, 203 insertions(+), 30 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index ea38a2c52..352670012 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -328,53 +328,191 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): left, right = hit.cut_reference(cut_point - percentage) assert left + right == hit -lstrip_cases = [ + +lstrip_reference_cases = [ + ('9M@1,1', '9M@1,1'), + ('5M6D@1,1', '5M6D@1,1'), + ('6D5M@1,1', '6D5M@1,1'), + ('6I5M@1,1', '5M@1,7'), + ('6I4D5M@1,1', '4D5M@1,7'), + ('6D4I5M@1,1', '6D5M@1,5'), + ('3D3D4I5M@1,1', '6D5M@1,5'), + ('3I3I4D5M@1,1', '4D5M@1,7'), + ('3D2I3D2I5M@1,1', '6D5M@1,5'), + ('3I2D3I2D5M@1,1', '4D5M@1,7'), + ('4D6I5M@1,1', '4D5M@1,7'), + ('4I6D5M@1,1', '6D5M@1,5'), + ('6I4D@1,1', '4D@1,7'), + ('6D4I@1,1', '6D4I@1,1'), + ('4D6I@1,1', '4D6I@1,1'), + ('4I6D@1,1', '6D@1,5'), + ('@1,1', '@1,1'), +] + + +rstrip_reference_cases = [ + ('9M@1,1', '9M@1,1'), + ('5M6D@1,1', '5M6D@1,1'), + ('5M6I@1,1', '5M@1,1'), + ('6D5M@1,1', '6D5M@1,1'), + ('5M4I6D@1,1', '5M6D@1,1'), + ('5M4D6I@1,1', '5M4D@1,1'), + ('5M4I3D3D@1,1', '5M6D@1,1'), + ('5M4D3I3I@1,1', '5M4D@1,1'), + ('5M2I3D2I3D@1,1', '5M6D@1,1'), + ('5M2D3I2D3I@1,1', '5M4D@1,1'), + ('5M6D4I@1,1', '5M6D@1,1'), + ('5M6I4D@1,1', '5M4D@1,1'), + ('6D4I@1,1', '6D@1,1'), + ('6I4D@1,1', '6I4D@1,1'), + 
('4I6D@1,1', '4I6D@1,1'), + ('4D6I@1,1', '4D@1,1'), + ('@1,1', '@1,1'), +] + + +lstrip_query_cases = [ ('9M@1,1', '9M@1,1'), ('5M6D@1,1', '5M6D@1,1'), ('6D5M@1,1', '5M@7,1'), + ('6I5M@1,1', '6I5M@1,1'), + ('6I4D5M@1,1', '6I5M@5,1'), + ('6D4I5M@1,1', '4I5M@7,1'), ('6D4I5M@1,1', '4I5M@7,1'), - ('3D3D4I5M@1,1', '4I5M@7,1'), + ('6I4D5M@1,1', '6I5M@5,1'), ('3D2I3D2I5M@1,1', '4I5M@7,1'), + ('3I2D3I2D5M@1,1', '6I5M@5,1'), + ('4D6I5M@1,1', '6I5M@5,1'), ('4I6D5M@1,1', '4I5M@7,1'), + ('6I4D@1,1', '6I4D@1,1'), ('6D4I@1,1', '4I@7,1'), + ('4D6I@1,1', '6I@5,1'), ('4I6D@1,1', '4I6D@1,1'), ('@1,1', '@1,1'), ] -@pytest.mark.parametrize('hit, expected', lstrip_cases) -def test_cigar_hit_lstrip(hit, expected): - hit = parsed_hit(hit) - expected = parsed_hit(expected) - assert expected == hit.lstrip_query() - -rstrip_cases = [ +rstrip_query_cases = [ ('9M@1,1', '9M@1,1'), ('5M6D@1,1', '5M@1,1'), + ('5M6I@1,1', '5M6I@1,1'), ('6D5M@1,1', '6D5M@1,1'), ('5M4I6D@1,1', '5M4I@1,1'), - ('5M4I3D3D@1,1', '5M4I@1,1'), + ('5M4D6I@1,1', '5M6I@1,1'), + ('5M4I6D@1,1', '5M4I@1,1'), + ('5M4D6I@1,1', '5M6I@1,1'), ('5M2I3D2I3D@1,1', '5M4I@1,1'), + ('5M2D3I2D3I@1,1', '5M6I@1,1'), ('5M6D4I@1,1', '5M4I@1,1'), + ('5M6I4D@1,1', '5M6I@1,1'), ('6D4I@1,1', '6D4I@1,1'), + ('6I4D@1,1', '6I@1,1'), ('4I6D@1,1', '4I@1,1'), + ('4D6I@1,1', '4D6I@1,1'), ('@1,1', '@1,1'), ] -@pytest.mark.parametrize('hit, expected', rstrip_cases) -def test_cigar_hit_rstrip(hit, expected): + +strip_prop_cases_all = \ + [x[0] for x in cigar_hit_ref_cut_cases] + \ + [x[0] for x in lstrip_reference_cases] + \ + [x[0] for x in rstrip_reference_cases] + + +@pytest.mark.parametrize('hit, expected', lstrip_reference_cases) +def test_cigar_hit_lstrip_reference(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) - assert expected == hit.rstrip_query() + assert expected == hit.lstrip_reference() + + +@pytest.mark.parametrize('hit, expected', rstrip_reference_cases) +def test_cigar_hit_rstrip_reference(hit, expected): + hit = 
parsed_hit(hit) + expected = parsed_hit(expected) + assert expected == hit.rstrip_reference() + + + +@pytest.mark.parametrize('hit', strip_prop_cases_all) +def test_cigar_hit_reference_strip_combines_with_connect(hit): + hit = parsed_hit(hit) + + for cut_point in range(hit.r_st - 1, hit.r_ei): + left, right = hit.cut_reference(cut_point + hit.epsilon) + + left = left.rstrip_reference() + right = right.lstrip_reference() + assert left.connect(right).coordinate_mapping.ref_to_query \ + == hit.coordinate_mapping.ref_to_query + + +@pytest.mark.parametrize('hit', strip_prop_cases_all) +def test_cigar_hit_reference_strip_combines_with_add(hit): + hit = parsed_hit(hit) + + for cut_point in range(hit.r_st - 1, hit.r_ei): + left, right = hit.cut_reference(cut_point + hit.epsilon) + + left = left.rstrip_reference() + right = right.lstrip_reference() + + if left.touches(right): + assert left + right == hit + + +@pytest.mark.parametrize('hit', strip_prop_cases_all) +def test_cigar_hit_reference_strip_never_crashes(hit): + hit = parsed_hit(hit) + + hit.rstrip_reference().lstrip_reference() + hit.lstrip_reference().rstrip_reference() + hit.lstrip_reference().lstrip_reference() + hit.rstrip_reference().rstrip_reference() + + +@pytest.mark.parametrize('hit', strip_prop_cases_all) +def test_cigar_hit_reference_strip_is_idempotent(hit): + hit = parsed_hit(hit) + + h1 = hit.rstrip_reference() + assert h1 == h1.rstrip_reference() == h1.rstrip_reference().rstrip_reference() + + h1 = hit.lstrip_reference() + assert h1 == h1.lstrip_reference() == h1.lstrip_reference().lstrip_reference() + + h1 = hit.lstrip_reference().rstrip_reference() + assert h1 == h1.lstrip_reference() == h1.rstrip_reference() + + h1 = hit.rstrip_reference().lstrip_reference() + assert h1 == h1.rstrip_reference() == h1.lstrip_reference() + + +@pytest.mark.parametrize('hit', strip_prop_cases_all) +def test_cigar_hit_reference_strips_are_commutative(hit): + hit = parsed_hit(hit) -strip_prop_cases_all = [x[0] for x in 
cigar_hit_ref_cut_cases] \ - + [x[0] for x in lstrip_cases] \ - + [x[0] for x in rstrip_cases] + assert hit.rstrip_reference().lstrip_reference() \ + == hit.lstrip_reference().rstrip_reference() + + +@pytest.mark.parametrize('hit, expected', lstrip_query_cases) +def test_cigar_hit_lstrip_query(hit, expected): + hit = parsed_hit(hit) + expected = parsed_hit(expected) + assert expected == hit.lstrip_query() + + +@pytest.mark.parametrize('hit, expected', rstrip_query_cases) +def test_cigar_hit_rstrip_query(hit, expected): + hit = parsed_hit(hit) + expected = parsed_hit(expected) + assert expected == hit.rstrip_query() @pytest.mark.parametrize('hit', strip_prop_cases_all) -def test_cigar_hit_strip_combines_with_connect(hit): +def test_cigar_hit_query_strip_combines_with_connect(hit): hit = parsed_hit(hit) for cut_point in range(hit.r_st - 1, hit.r_ei): @@ -388,7 +526,7 @@ def test_cigar_hit_strip_combines_with_connect(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) -def test_cigar_hit_strip_combines_with_add(hit): +def test_cigar_hit_query_strip_combines_with_add(hit): hit = parsed_hit(hit) for cut_point in range(hit.r_st - 1, hit.r_ei): @@ -402,7 +540,7 @@ def test_cigar_hit_strip_combines_with_add(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) -def test_cigar_hit_strip_never_crashes(hit): +def test_cigar_hit_query_strip_never_crashes(hit): hit = parsed_hit(hit) hit.rstrip_query().lstrip_query() @@ -412,7 +550,7 @@ def test_cigar_hit_strip_never_crashes(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) -def test_cigar_hit_strip_is_idempotent(hit): +def test_cigar_hit_query_strip_is_idempotent(hit): hit = parsed_hit(hit) h1 = hit.rstrip_query() @@ -429,7 +567,7 @@ def test_cigar_hit_strip_is_idempotent(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) -def test_cigar_hit_strips_are_commutative(hit): +def test_cigar_hit_query_strips_are_commutative(hit): hit = parsed_hit(hit) assert hit.rstrip_query().lstrip_query() \ diff 
--git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 0a11ddb6c..8ca03c0eb 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -226,14 +226,36 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': [start_inclusive:end_noninclusive]) + def lstrip_reference(self) -> 'Cigar': + """ Return a copy of the Cigar with leading (unmatched) reference elements removed. """ + + min_r = min(self.coordinate_mapping.ref_to_query.keys(), default=0) + min_op = self.coordinate_mapping.ref_to_op.get(min_r, float("inf")) + + ops = [(1, op) for i, (op, ref_pointer, query_pointer) + in enumerate(self.iterate_operations_with_pointers()) + if query_pointer is None or i >= min_op] + return Cigar.coerce(ops) + + + def rstrip_reference(self) -> 'Cigar': + """ Return a copy of the Cigar with trailing (unmatched) reference elements removed. """ + + max_r = max(self.coordinate_mapping.ref_to_query.keys(), + default=len(self.coordinate_mapping.ref_to_op) - 1) + max_op = self.coordinate_mapping.ref_to_op.get(max_r, float("inf")) + + ops = [(1, op) for i, (op, ref_pointer, query_pointer) + in enumerate(self.iterate_operations_with_pointers()) + if query_pointer is None or i <= max_op] + return Cigar.coerce(ops) + + def lstrip_query(self) -> 'Cigar': """ Return a copy of the Cigar with leading (unmatched) query elements removed. """ - if self.query_length == 0: - return self - min_q = min(self.coordinate_mapping.query_to_ref.keys(), default=0) - min_op = self.coordinate_mapping.query_to_op[min_q] + min_op = self.coordinate_mapping.query_to_op.get(min_q, float("inf")) ops = [(1, op) for i, (op, ref_pointer, query_pointer) in enumerate(self.iterate_operations_with_pointers()) @@ -244,12 +266,9 @@ def lstrip_query(self) -> 'Cigar': def rstrip_query(self) -> 'Cigar': """ Return a copy of the Cigar with trailing (unmatched) query elements removed. 
""" - if self.query_length == 0: - return self - max_q = max(self.coordinate_mapping.query_to_ref.keys(), default=len(self.coordinate_mapping.query_to_op) - 1) - max_op = self.coordinate_mapping.query_to_op[max_q] + max_op = self.coordinate_mapping.query_to_op.get(max_q, float("inf")) ops = [(1, op) for i, (op, ref_pointer, query_pointer) in enumerate(self.iterate_operations_with_pointers()) @@ -449,7 +468,7 @@ class CigarHit: at any given reference position (`cut_reference()`), - Removing portions of the query sequence that do not align with the reference sequence from either end - while preserving the alignment context (`lstrip_query()` and `rstrip_query()`), + while preserving the alignment context (`lstrip*()` and `rstrip*()`), - Enumerating gaps in the alignment (`gaps()`). """ @@ -651,6 +670,22 @@ def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: return left, right + def lstrip_reference(self) -> 'CigarHit': + """ Return a copy of the CigarHit with leading (unmatched) reference elements removed. """ + + cigar = self.cigar.lstrip_reference() + return CigarHit(cigar, r_st=self.r_ei - cigar.ref_length + 1, r_ei=self.r_ei, + q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) + + + def rstrip_reference(self) -> 'CigarHit': + """ Return a copy of the CigarHit with trailing (unmatched) reference elements removed. """ + + cigar = self.cigar.rstrip_reference() + return CigarHit(cigar, r_st=self.r_st, r_ei=self.r_st + cigar.ref_length - 1, + q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) + + def lstrip_query(self) -> 'CigarHit': """ Return a copy of the CigarHit with leading (unmatched) query elements removed. 
""" From 947b44a4b2d70faaea989e3ce0c8a0482b54864d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Jan 2024 15:03:33 -0800 Subject: [PATCH 177/485] Cigar tools: improve parsing of cigar hits --- micall/tests/test_cigar_tools.py | 338 +++++++++++++++---------------- 1 file changed, 169 insertions(+), 169 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 352670012..ba46174be 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -150,11 +150,11 @@ def test_invalid_cigar_string(): Cigar.coerce('3') # Not enough Ms -CIGAR_REGEX = re.compile(r"(.*)@([0-9]+),([0-9]+)") +CIGAR_REGEX = re.compile(r"(.*)@([0-9]+)->([0-9]+)") def parsed_hit(string): match = CIGAR_REGEX.match(string) assert match, f"Cannot parse {string}" - cigar_str, r_st, q_st = match.groups() + cigar_str, q_st, r_st = match.groups() cigar = Cigar.coerce(cigar_str) r_ei = int(r_st) + cigar.ref_length - 1 q_ei = int(q_st) + cigar.query_length - 1 @@ -163,131 +163,131 @@ def parsed_hit(string): cigar_hit_ref_cut_cases = [ # Trivial cases - ('4M@1,1', 2.5, - ['2M@1,1', '2M@3,3']), + ('4M@1->1', 2.5, + ['2M@1->1', '2M@3->3']), - ('9M@1,1', 3.5, - ['3M@1,1', '6M@4,4']), + ('9M@1->1', 3.5, + ['3M@1->1', '6M@4->4']), - ('9M@1,1', 4.5, - ['4M@1,1', '5M@5,5']), + ('9M@1->1', 4.5, + ['4M@1->1', '5M@5->5']), - ('9M@0,0', 3.5, - ['4M@0,0', '5M@4,4']), + ('9M@0->0', 3.5, + ['4M@0->0', '5M@4->4']), # Simple cases - ('9M9D9M@1,1', 3.5, - ['3M@1,1', '6M9D9M@4,4']), + ('9M9D9M@1->1', 3.5, + ['3M@1->1', '6M9D9M@4->4']), - ('9M9D9M@1,1', 20.5, - ['9M9D2M@1,1', '7M@21,12']), + ('9M9D9M@1->1', 20.5, + ['9M9D2M@1->1', '7M@12->21']), - ('9M9I9M@1,1', 3.5, - ['3M@1,1', '6M9I9M@4,4']), + ('9M9I9M@1->1', 3.5, + ['3M@1->1', '6M9I9M@4->4']), - ('9M9I9M@1,1', 13.5 or 27/2, - ['9M9I4M@1,1', '5M@14,23']), + ('9M9I9M@1->1', 13.5 or 27/2, + ['9M9I4M@1->1', '5M@23->14']), - ('5M6I@1,1', 3.5, - ['3M@1,1', '2M6I@4,4']), + ('5M6I@1->1', 3.5, + 
['3M@1->1', '2M6I@4->4']), - ('6I5M@1,1', 3.5, - ['6I3M@1,1', '2M@4,10']), + ('6I5M@1->1', 3.5, + ['6I3M@1->1', '2M@10->4']), - ('5M6D@1,1', 3.5, - ['3M@1,1', '2M6D@4,4']), + ('5M6D@1->1', 3.5, + ['3M@1->1', '2M6D@4->4']), - ('6D5M@1,1', 3.5, - ['3D@1,1', '3D5M@4,1']), + ('6D5M@1->1', 3.5, + ['3D@1->1', '3D5M@1->4']), - ('5M6D@1,1', 7.5, - ['5M2D@1,1', '4D@8,6']), + ('5M6D@1->1', 7.5, + ['5M2D@1->1', '4D@6->8']), - ('6D5M@1,1', 7.5, - ['6D1M@1,1', '4M@8,2']), + ('6D5M@1->1', 7.5, + ['6D1M@1->1', '4M@2->8']), - ('6D5M@1,1', 6.5, - ['6D@1,1', '5M@7,1']), + ('6D5M@1->1', 6.5, + ['6D@1->1', '5M@1->7']), # Ambigous cases - ('9M9D9M@1,1', 13.5 or 27/2, - ['9M4D@1,1', '5D9M@14,10']), + ('9M9D9M@1->1', 13.5 or 27/2, + ['9M4D@1->1', '5D9M@10->14']), - ('9M9I9M@1,1', 9.2, - ['9M1I@1,1', '8I9M@10,11']), + ('9M9I9M@1->1', 9.2, + ['9M1I@1->1', '8I9M@11->10']), - ('9M9D9I9M@1,1', 13.5 or 27/2, - ['9M4D@1,1', '5D9I9M@14,10']), + ('9M9D9I9M@1->1', 13.5 or 27/2, + ['9M4D@1->1', '5D9I9M@10->14']), - ('9M9I9D9M@1,1', 13.5 or 27/2, - ['9M9I4D@1,1', '5D9M@14,19']), + ('9M9I9D9M@1->1', 13.5 or 27/2, + ['9M9I4D@1->1', '5D9M@19->14']), - ('9M9D9I9D9I9D9M@1,1', 13.5 or 27/2, - ['9M4D@1,1', '5D9I9D9I9D9M@14,10']), + ('9M9D9I9D9I9D9M@1->1', 13.5 or 27/2, + ['9M4D@1->1', '5D9I9D9I9D9M@10->14']), - ('9M9I9D9I9D9I9M@1,1', 13.5 or 27/2, - ['9M9I4D@1,1', '5D9I9D9I9M@14,19']), + ('9M9I9D9I9D9I9M@1->1', 13.5 or 27/2, + ['9M9I4D@1->1', '5D9I9D9I9M@19->14']), - ('1M1I1D1M@1,1', 1.5, # same as previous 2 cases but smaller - ['1M1I@1,1', '1D1M@2,3']), + ('1M1I1D1M@1->1', 1.5, # same as previous 2 cases but smaller + ['1M1I@1->1', '1D1M@3->2']), - ('1M1D1I1M@1,1', 1.5, # same as previous 2 cases but smaller - ['1M@1,1', '1D1I1M@2,2']), + ('1M1D1I1M@1->1', 1.5, # same as previous 2 cases but smaller + ['1M@1->1', '1D1I1M@2->2']), # Edge cases - ('9M9I9M@1,1', 9.5, # no middlepoint - ['9M5I@1,1', '4I9M@10,15']), + ('9M9I9M@1->1', 9.5, # no middlepoint + ['9M5I@1->1', '4I9M@15->10']), - ('9M@1,1', 8.5, - 
['8M@1,1', '1M@9,9']), + ('9M@1->1', 8.5, + ['8M@1->1', '1M@9->9']), - ('9M@1,1', 9.5, - ['9M@1,1', '@10,10']), + ('9M@1->1', 9.5, + ['9M@1->1', '@10->10']), - ('7M@3,3', 2.5, - ['@3,3', '7M@3,3']), + ('7M@3->3', 2.5, + ['@3->3', '7M@3->3']), - ('9M@1,1', 0.5, - ['@1,1', '9M@1,1']), + ('9M@1->1', 0.5, + ['@1->1', '9M@1->1']), - ('9M@0,0', -0.5, - ['@0,0', '9M@0,0']), + ('9M@0->0', -0.5, + ['@0->0', '9M@0->0']), - ('9D@1,1', 3.5, - ['3D@1,1', '6D@4,1']), + ('9D@1->1', 3.5, + ['3D@1->1', '6D@1->4']), - ('9D@0,0', -0.5, - ['@0,0', '9D@0,0']), + ('9D@0->0', -0.5, + ['@0->0', '9D@0->0']), - ('1M7I1M@1,1', 1.5, - ['1M4I@1,1', '3I1M@2,6']), + ('1M7I1M@1->1', 1.5, + ['1M4I@1->1', '3I1M@6->2']), - ('1M6I1M@1,1', 1.5, - ['1M3I@1,1', '3I1M@2,5']), + ('1M6I1M@1->1', 1.5, + ['1M3I@1->1', '3I1M@5->2']), - ('1M7I1M@1,1', 1.999, - ['1M7I@1,1', '1M@2,9']), + ('1M7I1M@1->1', 1.999, + ['1M7I@1->1', '1M@9->2']), - ('1M7I1M@1,1', 1.001, - ['1M@1,1', '7I1M@2,2']), + ('1M7I1M@1->1', 1.001, + ['1M@1->1', '7I1M@2->2']), - ('2=1X2N1N2=1H2S@1,1', 3.5, - ['2=1X@1,1', '3N2=1H2S@4,4']), + ('2=1X2N1N2=1H2S@1->1', 3.5, + ['2=1X@1->1', '3N2=1H2S@4->4']), # Negative cases - ('9M9I9M@1,1', 20.5, + ('9M9I9M@1->1', 20.5, IndexError("20.5 is bigger than reference (18)")), - ('@2,2', 2.5, + ('@2->2', 2.5, IndexError("Empty string cannot be cut")), - ('@2,2', 1.5, + ('@2->2', 1.5, IndexError("Empty string cannot be cut")), - ('9I@1,1', 3.5, + ('9I@1->1', 3.5, IndexError("Out of reference bounds")), - ('9M@1,1', 4, + ('9M@1->1', 4, ValueError("Cut point must not be an integer")), ] @@ -330,86 +330,86 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): lstrip_reference_cases = [ - ('9M@1,1', '9M@1,1'), - ('5M6D@1,1', '5M6D@1,1'), - ('6D5M@1,1', '6D5M@1,1'), - ('6I5M@1,1', '5M@1,7'), - ('6I4D5M@1,1', '4D5M@1,7'), - ('6D4I5M@1,1', '6D5M@1,5'), - ('3D3D4I5M@1,1', '6D5M@1,5'), - ('3I3I4D5M@1,1', '4D5M@1,7'), - ('3D2I3D2I5M@1,1', '6D5M@1,5'), - ('3I2D3I2D5M@1,1', '4D5M@1,7'), - ('4D6I5M@1,1', 
'4D5M@1,7'), - ('4I6D5M@1,1', '6D5M@1,5'), - ('6I4D@1,1', '4D@1,7'), - ('6D4I@1,1', '6D4I@1,1'), - ('4D6I@1,1', '4D6I@1,1'), - ('4I6D@1,1', '6D@1,5'), - ('@1,1', '@1,1'), + ('9M@1->1', '9M@1->1'), + ('5M6D@1->1', '5M6D@1->1'), + ('6D5M@1->1', '6D5M@1->1'), + ('6I5M@1->1', '5M@7->1'), + ('6I4D5M@1->1', '4D5M@7->1'), + ('6D4I5M@1->1', '6D5M@5->1'), + ('3D3D4I5M@1->1', '6D5M@5->1'), + ('3I3I4D5M@1->1', '4D5M@7->1'), + ('3D2I3D2I5M@1->1', '6D5M@5->1'), + ('3I2D3I2D5M@1->1', '4D5M@7->1'), + ('4D6I5M@1->1', '4D5M@7->1'), + ('4I6D5M@1->1', '6D5M@5->1'), + ('6I4D@1->1', '4D@7->1'), + ('6D4I@1->1', '6D4I@1->1'), + ('4D6I@1->1', '4D6I@1->1'), + ('4I6D@1->1', '6D@5->1'), + ('@1->1', '@1->1'), ] rstrip_reference_cases = [ - ('9M@1,1', '9M@1,1'), - ('5M6D@1,1', '5M6D@1,1'), - ('5M6I@1,1', '5M@1,1'), - ('6D5M@1,1', '6D5M@1,1'), - ('5M4I6D@1,1', '5M6D@1,1'), - ('5M4D6I@1,1', '5M4D@1,1'), - ('5M4I3D3D@1,1', '5M6D@1,1'), - ('5M4D3I3I@1,1', '5M4D@1,1'), - ('5M2I3D2I3D@1,1', '5M6D@1,1'), - ('5M2D3I2D3I@1,1', '5M4D@1,1'), - ('5M6D4I@1,1', '5M6D@1,1'), - ('5M6I4D@1,1', '5M4D@1,1'), - ('6D4I@1,1', '6D@1,1'), - ('6I4D@1,1', '6I4D@1,1'), - ('4I6D@1,1', '4I6D@1,1'), - ('4D6I@1,1', '4D@1,1'), - ('@1,1', '@1,1'), + ('9M@1->1', '9M@1->1'), + ('5M6D@1->1', '5M6D@1->1'), + ('5M6I@1->1', '5M@1->1'), + ('6D5M@1->1', '6D5M@1->1'), + ('5M4I6D@1->1', '5M6D@1->1'), + ('5M4D6I@1->1', '5M4D@1->1'), + ('5M4I3D3D@1->1', '5M6D@1->1'), + ('5M4D3I3I@1->1', '5M4D@1->1'), + ('5M2I3D2I3D@1->1', '5M6D@1->1'), + ('5M2D3I2D3I@1->1', '5M4D@1->1'), + ('5M6D4I@1->1', '5M6D@1->1'), + ('5M6I4D@1->1', '5M4D@1->1'), + ('6D4I@1->1', '6D@1->1'), + ('6I4D@1->1', '6I4D@1->1'), + ('4I6D@1->1', '4I6D@1->1'), + ('4D6I@1->1', '4D@1->1'), + ('@1->1', '@1->1'), ] lstrip_query_cases = [ - ('9M@1,1', '9M@1,1'), - ('5M6D@1,1', '5M6D@1,1'), - ('6D5M@1,1', '5M@7,1'), - ('6I5M@1,1', '6I5M@1,1'), - ('6I4D5M@1,1', '6I5M@5,1'), - ('6D4I5M@1,1', '4I5M@7,1'), - ('6D4I5M@1,1', '4I5M@7,1'), - ('6I4D5M@1,1', '6I5M@5,1'), - ('3D2I3D2I5M@1,1', 
'4I5M@7,1'), - ('3I2D3I2D5M@1,1', '6I5M@5,1'), - ('4D6I5M@1,1', '6I5M@5,1'), - ('4I6D5M@1,1', '4I5M@7,1'), - ('6I4D@1,1', '6I4D@1,1'), - ('6D4I@1,1', '4I@7,1'), - ('4D6I@1,1', '6I@5,1'), - ('4I6D@1,1', '4I6D@1,1'), - ('@1,1', '@1,1'), + ('9M@1->1', '9M@1->1'), + ('5M6D@1->1', '5M6D@1->1'), + ('6D5M@1->1', '5M@1->7'), + ('6I5M@1->1', '6I5M@1->1'), + ('6I4D5M@1->1', '6I5M@1->5'), + ('6D4I5M@1->1', '4I5M@1->7'), + ('6D4I5M@1->1', '4I5M@1->7'), + ('6I4D5M@1->1', '6I5M@1->5'), + ('3D2I3D2I5M@1->1', '4I5M@1->7'), + ('3I2D3I2D5M@1->1', '6I5M@1->5'), + ('4D6I5M@1->1', '6I5M@1->5'), + ('4I6D5M@1->1', '4I5M@1->7'), + ('6I4D@1->1', '6I4D@1->1'), + ('6D4I@1->1', '4I@1->7'), + ('4D6I@1->1', '6I@1->5'), + ('4I6D@1->1', '4I6D@1->1'), + ('@1->1', '@1->1'), ] rstrip_query_cases = [ - ('9M@1,1', '9M@1,1'), - ('5M6D@1,1', '5M@1,1'), - ('5M6I@1,1', '5M6I@1,1'), - ('6D5M@1,1', '6D5M@1,1'), - ('5M4I6D@1,1', '5M4I@1,1'), - ('5M4D6I@1,1', '5M6I@1,1'), - ('5M4I6D@1,1', '5M4I@1,1'), - ('5M4D6I@1,1', '5M6I@1,1'), - ('5M2I3D2I3D@1,1', '5M4I@1,1'), - ('5M2D3I2D3I@1,1', '5M6I@1,1'), - ('5M6D4I@1,1', '5M4I@1,1'), - ('5M6I4D@1,1', '5M6I@1,1'), - ('6D4I@1,1', '6D4I@1,1'), - ('6I4D@1,1', '6I@1,1'), - ('4I6D@1,1', '4I@1,1'), - ('4D6I@1,1', '4D6I@1,1'), - ('@1,1', '@1,1'), + ('9M@1->1', '9M@1->1'), + ('5M6D@1->1', '5M@1->1'), + ('5M6I@1->1', '5M6I@1->1'), + ('6D5M@1->1', '6D5M@1->1'), + ('5M4I6D@1->1', '5M4I@1->1'), + ('5M4D6I@1->1', '5M6I@1->1'), + ('5M4I6D@1->1', '5M4I@1->1'), + ('5M4D6I@1->1', '5M6I@1->1'), + ('5M2I3D2I3D@1->1', '5M4I@1->1'), + ('5M2D3I2D3I@1->1', '5M6I@1->1'), + ('5M6D4I@1->1', '5M4I@1->1'), + ('5M6I4D@1->1', '5M6I@1->1'), + ('6D4I@1->1', '6D4I@1->1'), + ('6I4D@1->1', '6I@1->1'), + ('4I6D@1->1', '4I@1->1'), + ('4D6I@1->1', '4D6I@1->1'), + ('@1->1', '@1->1'), ] @@ -670,53 +670,53 @@ def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): connect_cigar_hits_cases = [ # Non-overlapping hits should be connected with deletions/insertions ( - ['4M@1,1', '4M@10,8'], - 
['4M5D3I4M@1,1'] + ['4M@1->1', '4M@8->10'], + ['4M5D3I4M@1->1'] ), # Overlapping hits should ignore later ones ( - ['4M@1,1', '5M@3,3'], - ['4M@1,1'] + ['4M@1->1', '5M@3->3'], + ['4M@1->1'] ), # Touching hits should be simply concatenated ( - ['4M@1,1', '4M@5,5'], - ['8M@1,1'] + ['4M@1->1', '4M@5->5'], + ['8M@1->1'] ), # Hits that touch at only one boundary should combine just fine ( - ['3M@1,1', '6M@4,6'], - ['3M2I6M@1,1'] + ['3M@1->1', '6M@6->4'], + ['3M2I6M@1->1'] ), # Hits that are subsets of earlier hits should be ignored ( - ['8M@1,1', '3M@3,3'], - ['8M@1,1'] + ['8M@1->1', '3M@3->3'], + ['8M@1->1'] ), # Hits that are out of order should be connected if no overlap ( - ['3M@10,6', '3M@1,1'], - ['3M6D2I3M@1,1'] + ['3M@6->10', '3M@1->1'], + ['3M6D2I3M@1->1'] ), # Hits that overlap by a single base should prioritize the first hit and not combine ( - ['3M@1,1', '3M@3,3'], - ['3M@1,1'] + ['3M@1->1', '3M@3->3'], + ['3M@1->1'] ), # Non-overlapping hits in the query space but overlapping in reference space ( - ['5M@1,1', '1M@3,10'], - ['5M@1,1'] + ['5M@1->1', '1M@10->3'], + ['5M@1->1'] ), # Combining more than two hits ( - ['3M@1,1', '3M@7,7', '3M@12,16'], - ['3M3D3I3M2D6I3M@1,1'] + ['3M@1->1', '3M@7->7', '3M@16->12'], + ['3M3D3I3M2D6I3M@1->1'] ), # Combining hits including hard-clipping, which should be ignored in alignments ( - ['2H5M1H@1,3', '2H5M1H@11,13'], - ['2H5M1H5D5I2H5M1H@1,3'] + ['2H5M1H@3->1', '2H5M1H@13->11'], + ['2H5M1H5D5I2H5M1H@3->1'] ), # An empty list of hits should raise a ValueError ( @@ -725,8 +725,8 @@ def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): ), # Before by reference, after by query ( - ['4M@1,8', '4M@10,1'], - ['4M@1,8', '4M@10,1'] + ['4M@8->1', '4M@1->10'], + ['4M@8->1', '4M@1->10'] ), ] @pytest.mark.parametrize('hits, expected_result', connect_cigar_hits_cases) From a86530f8ce8eb257c385a213bfeb02397b0f62c1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Jan 2024 15:43:01 -0800 Subject: [PATCH 178/485] Cigar 
tools: fix edge cases of strip --- micall/core/contig_stitcher.py | 4 +-- micall/tests/test_cigar_tools.py | 50 +++++++++++++++++++++----------- micall/utils/cigar_tools.py | 14 ++++----- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 1884259f7..ad8ca0c40 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -103,7 +103,7 @@ def lstrip_query(self) -> 'AlignedContig': alignment. The CIGAR alignment is also updated to reflect the trimming. """ - alignment = self.alignment.lstrip_query() + alignment = self.alignment.lstrip_query().lstrip_reference() q_remainder, query = self.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) result = AlignedContig.make(query, alignment, self.strand) @@ -120,7 +120,7 @@ def rstrip_query(self) -> 'AlignedContig': alignment. The CIGAR alignment is also updated to reflect the trimming. """ - alignment = self.alignment.rstrip_query() + alignment = self.alignment.rstrip_query().rstrip_reference() query, q_remainder = self.cut_query(alignment.q_ei + 0.5) result = AlignedContig.make(query, alignment, self.strand) logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index ba46174be..ce20abb92 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -343,9 +343,11 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): ('4D6I5M@1->1', '4D5M@7->1'), ('4I6D5M@1->1', '6D5M@5->1'), ('6I4D@1->1', '4D@7->1'), - ('6D4I@1->1', '6D4I@1->1'), - ('4D6I@1->1', '4D6I@1->1'), + ('6D4I@1->1', '6D@5->1'), + ('4D6I@1->1', '4D@7->1'), ('4I6D@1->1', '6D@5->1'), + ('4I@1->1', '@5->1'), + ('4D@1->1', '4D@1->1'), ('@1->1', '@1->1'), ] @@ -364,9 +366,11 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): ('5M6D4I@1->1', '5M6D@1->1'), ('5M6I4D@1->1', 
'5M4D@1->1'), ('6D4I@1->1', '6D@1->1'), - ('6I4D@1->1', '6I4D@1->1'), - ('4I6D@1->1', '4I6D@1->1'), + ('6I4D@1->1', '4D@1->1'), + ('4I6D@1->1', '6D@1->1'), ('4D6I@1->1', '4D@1->1'), + ('4I@1->1', '@1->1'), + ('4D@1->1', '4D@1->1'), ('@1->1', '@1->1'), ] @@ -384,10 +388,12 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): ('3I2D3I2D5M@1->1', '6I5M@1->5'), ('4D6I5M@1->1', '6I5M@1->5'), ('4I6D5M@1->1', '4I5M@1->7'), - ('6I4D@1->1', '6I4D@1->1'), + ('6I4D@1->1', '6I@1->5'), ('6D4I@1->1', '4I@1->7'), ('4D6I@1->1', '6I@1->5'), - ('4I6D@1->1', '4I6D@1->1'), + ('4I6D@1->1', '4I@1->7'), + ('4I@1->1', '4I@1->1'), + ('4D@1->1', '@1->5'), ('@1->1', '@1->1'), ] @@ -405,10 +411,12 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): ('5M2D3I2D3I@1->1', '5M6I@1->1'), ('5M6D4I@1->1', '5M4I@1->1'), ('5M6I4D@1->1', '5M6I@1->1'), - ('6D4I@1->1', '6D4I@1->1'), + ('6D4I@1->1', '4I@1->1'), ('6I4D@1->1', '6I@1->1'), ('4I6D@1->1', '4I@1->1'), - ('4D6I@1->1', '4D6I@1->1'), + ('4D6I@1->1', '6I@1->1'), + ('4I@1->1', '4I@1->1'), + ('4D@1->1', '@1->1'), ('@1->1', '@1->1'), ] @@ -493,8 +501,12 @@ def test_cigar_hit_reference_strip_is_idempotent(hit): def test_cigar_hit_reference_strips_are_commutative(hit): hit = parsed_hit(hit) - assert hit.rstrip_reference().lstrip_reference() \ - == hit.lstrip_reference().rstrip_reference() + if len(hit.cigar.coordinate_mapping.ref_to_query) > 0: + assert hit.rstrip_reference().lstrip_reference() \ + == hit.lstrip_reference().rstrip_reference() + else: + assert hit.rstrip_reference().lstrip_reference().cigar \ + == hit.lstrip_reference().rstrip_reference().cigar @pytest.mark.parametrize('hit, expected', lstrip_query_cases) @@ -540,13 +552,13 @@ def test_cigar_hit_query_strip_combines_with_add(hit): @pytest.mark.parametrize('hit', strip_prop_cases_all) -def test_cigar_hit_query_strip_never_crashes(hit): +def test_cigar_hit_strips_work_together(hit): hit = parsed_hit(hit) - hit.rstrip_query().lstrip_query() - 
hit.lstrip_query().rstrip_query() - hit.lstrip_query().lstrip_query() - hit.rstrip_query().rstrip_query() + rstrip = str(hit.rstrip_query().rstrip_reference().cigar) + assert not rstrip.endswith("I") and not rstrip.endswith("D") + lstrip = str(hit.lstrip_query().lstrip_reference().cigar) + assert not lstrip.startswith("I") and not lstrip.startswith("D") @pytest.mark.parametrize('hit', strip_prop_cases_all) @@ -570,8 +582,12 @@ def test_cigar_hit_query_strip_is_idempotent(hit): def test_cigar_hit_query_strips_are_commutative(hit): hit = parsed_hit(hit) - assert hit.rstrip_query().lstrip_query() \ - == hit.lstrip_query().rstrip_query() + if len(hit.cigar.coordinate_mapping.ref_to_query) > 0: + assert hit.rstrip_query().lstrip_query() \ + == hit.lstrip_query().rstrip_query() + else: + assert hit.rstrip_query().lstrip_query().cigar \ + == hit.lstrip_query().rstrip_query().cigar @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 8ca03c0eb..ddfd475b8 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -229,7 +229,7 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': def lstrip_reference(self) -> 'Cigar': """ Return a copy of the Cigar with leading (unmatched) reference elements removed. """ - min_r = min(self.coordinate_mapping.ref_to_query.keys(), default=0) + min_r = min(self.coordinate_mapping.ref_to_query.keys(), default=None) min_op = self.coordinate_mapping.ref_to_op.get(min_r, float("inf")) ops = [(1, op) for i, (op, ref_pointer, query_pointer) @@ -241,9 +241,8 @@ def lstrip_reference(self) -> 'Cigar': def rstrip_reference(self) -> 'Cigar': """ Return a copy of the Cigar with trailing (unmatched) reference elements removed. 
""" - max_r = max(self.coordinate_mapping.ref_to_query.keys(), - default=len(self.coordinate_mapping.ref_to_op) - 1) - max_op = self.coordinate_mapping.ref_to_op.get(max_r, float("inf")) + max_r = max(self.coordinate_mapping.ref_to_query.keys(), default=None) + max_op = self.coordinate_mapping.ref_to_op.get(max_r, float("-inf")) ops = [(1, op) for i, (op, ref_pointer, query_pointer) in enumerate(self.iterate_operations_with_pointers()) @@ -254,7 +253,7 @@ def rstrip_reference(self) -> 'Cigar': def lstrip_query(self) -> 'Cigar': """ Return a copy of the Cigar with leading (unmatched) query elements removed. """ - min_q = min(self.coordinate_mapping.query_to_ref.keys(), default=0) + min_q = min(self.coordinate_mapping.query_to_ref.keys(), default=None) min_op = self.coordinate_mapping.query_to_op.get(min_q, float("inf")) ops = [(1, op) for i, (op, ref_pointer, query_pointer) @@ -266,9 +265,8 @@ def lstrip_query(self) -> 'Cigar': def rstrip_query(self) -> 'Cigar': """ Return a copy of the Cigar with trailing (unmatched) query elements removed. """ - max_q = max(self.coordinate_mapping.query_to_ref.keys(), - default=len(self.coordinate_mapping.query_to_op) - 1) - max_op = self.coordinate_mapping.query_to_op.get(max_q, float("inf")) + max_q = max(self.coordinate_mapping.query_to_ref.keys(), default=None) + max_op = self.coordinate_mapping.query_to_op.get(max_q, float("-inf")) ops = [(1, op) for i, (op, ref_pointer, query_pointer) in enumerate(self.iterate_operations_with_pointers()) From ebe1e9e39c0c47a74e6f36e9d0c9cae5b5718a64 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Jan 2024 15:51:41 -0800 Subject: [PATCH 179/485] Cigar tools: swap names of query and reference strips Also adjust the contig stitcher to the change. 
--- micall/core/contig_stitcher.py | 28 ++++---- micall/tests/test_cigar_tools.py | 104 +++++++++++++-------------- micall/tests/test_contig_stitcher.py | 8 +-- micall/utils/cigar_tools.py | 40 +++++------ 4 files changed, 90 insertions(+), 90 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index ad8ca0c40..ff2e5ffff 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -97,13 +97,13 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont return (left, right) - def lstrip_query(self) -> 'AlignedContig': + def lstrip(self) -> 'AlignedContig': """ Trims the query sequence of the contig from its beginning up to the start of the alignment. The CIGAR alignment is also updated to reflect the trimming. """ - alignment = self.alignment.lstrip_query().lstrip_reference() + alignment = self.alignment.lstrip_reference().lstrip_query() q_remainder, query = self.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) result = AlignedContig.make(query, alignment, self.strand) @@ -114,13 +114,13 @@ def lstrip_query(self) -> 'AlignedContig': return result - def rstrip_query(self) -> 'AlignedContig': + def rstrip(self) -> 'AlignedContig': """ Trims the query sequence of the contig from its end based on the end of the alignment. The CIGAR alignment is also updated to reflect the trimming. 
""" - alignment = self.alignment.rstrip_query().rstrip_reference() + alignment = self.alignment.rstrip_reference().rstrip_query() query, q_remainder = self.cut_query(alignment.q_ei + 0.5) result = AlignedContig.make(query, alignment, self.strand) logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", @@ -196,9 +196,9 @@ def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: stripped_parts = [] for prev_part, part, next_part in sliding_window(parts): if prev_part is not None: - part = part.lstrip_query() + part = part.lstrip() if next_part is not None: - part = part.rstrip_query() + part = part.rstrip() stripped_parts.append(part) ret = reduce(AlignedContig.munge, stripped_parts) @@ -288,9 +288,9 @@ def is_out_of_order(name: str) -> bool: if isinstance(contig, AlignedContig): name = contig.name if prev_contig is not None or is_out_of_order(name): - contig = contig.lstrip_query() + contig = contig.lstrip() if next_contig is not None or is_out_of_order(name): - contig = contig.rstrip_query() + contig = contig.rstrip() yield contig @@ -426,10 +426,10 @@ def stitch_2_contigs(left, right): # Cut in 4 parts. left_remainder, left_overlap = left.cut_reference(right.alignment.r_st - 0.5) right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) - left_overlap = left_overlap.rstrip_query().lstrip_query() - right_overlap = right_overlap.lstrip_query().rstrip_query() - left_remainder = left_remainder.rstrip_query() - right_remainder = right_remainder.lstrip_query() + left_overlap = left_overlap.rstrip().lstrip() + right_overlap = right_overlap.lstrip().rstrip() + left_remainder = left_remainder.rstrip() + right_remainder = right_remainder.lstrip() logger.debug("Stitching %r at %s (len %s) with %r at %s (len %s)." 
" The left_overlap %r is at %s (len %s)" @@ -603,8 +603,8 @@ def try_split(contig): if covered(contig, gap): midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + contig.alignment.epsilon left_part, right_part = contig.cut_reference(midpoint) - left_part = left_part.rstrip_query() - right_part = right_part.lstrip_query() + left_part = left_part.rstrip() + right_part = right_part.lstrip() contigs.remove(contig) contigs.append(left_part) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index ce20abb92..551142ef6 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -329,7 +329,7 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): assert left + right == hit -lstrip_reference_cases = [ +lstrip_query_cases = [ ('9M@1->1', '9M@1->1'), ('5M6D@1->1', '5M6D@1->1'), ('6D5M@1->1', '6D5M@1->1'), @@ -352,7 +352,7 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): ] -rstrip_reference_cases = [ +rstrip_query_cases = [ ('9M@1->1', '9M@1->1'), ('5M6D@1->1', '5M6D@1->1'), ('5M6I@1->1', '5M@1->1'), @@ -375,7 +375,7 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): ] -lstrip_query_cases = [ +lstrip_reference_cases = [ ('9M@1->1', '9M@1->1'), ('5M6D@1->1', '5M6D@1->1'), ('6D5M@1->1', '5M@1->7'), @@ -398,7 +398,7 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): ] -rstrip_query_cases = [ +rstrip_reference_cases = [ ('9M@1->1', '9M@1->1'), ('5M6D@1->1', '5M@1->1'), ('5M6I@1->1', '5M6I@1->1'), @@ -423,22 +423,22 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): strip_prop_cases_all = \ [x[0] for x in cigar_hit_ref_cut_cases] + \ - [x[0] for x in lstrip_reference_cases] + \ - [x[0] for x in rstrip_reference_cases] + [x[0] for x in lstrip_query_cases] + \ + [x[0] for x in rstrip_query_cases] -@pytest.mark.parametrize('hit, expected', lstrip_reference_cases) +@pytest.mark.parametrize('hit, expected', lstrip_query_cases) def 
test_cigar_hit_lstrip_reference(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) - assert expected == hit.lstrip_reference() + assert expected == hit.lstrip_query() -@pytest.mark.parametrize('hit, expected', rstrip_reference_cases) +@pytest.mark.parametrize('hit, expected', rstrip_query_cases) def test_cigar_hit_rstrip_reference(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) - assert expected == hit.rstrip_reference() + assert expected == hit.rstrip_query() @@ -449,8 +449,8 @@ def test_cigar_hit_reference_strip_combines_with_connect(hit): for cut_point in range(hit.r_st - 1, hit.r_ei): left, right = hit.cut_reference(cut_point + hit.epsilon) - left = left.rstrip_reference() - right = right.lstrip_reference() + left = left.rstrip_query() + right = right.lstrip_query() assert left.connect(right).coordinate_mapping.ref_to_query \ == hit.coordinate_mapping.ref_to_query @@ -463,8 +463,8 @@ def test_cigar_hit_reference_strip_combines_with_add(hit): for cut_point in range(hit.r_st - 1, hit.r_ei): left, right = hit.cut_reference(cut_point + hit.epsilon) - left = left.rstrip_reference() - right = right.lstrip_reference() + left = left.rstrip_query() + right = right.lstrip_query() if left.touches(right): assert left + right == hit @@ -474,27 +474,27 @@ def test_cigar_hit_reference_strip_combines_with_add(hit): def test_cigar_hit_reference_strip_never_crashes(hit): hit = parsed_hit(hit) - hit.rstrip_reference().lstrip_reference() - hit.lstrip_reference().rstrip_reference() - hit.lstrip_reference().lstrip_reference() - hit.rstrip_reference().rstrip_reference() + hit.rstrip_query().lstrip_query() + hit.lstrip_query().rstrip_query() + hit.lstrip_query().lstrip_query() + hit.rstrip_query().rstrip_query() @pytest.mark.parametrize('hit', strip_prop_cases_all) def test_cigar_hit_reference_strip_is_idempotent(hit): hit = parsed_hit(hit) - h1 = hit.rstrip_reference() - assert h1 == h1.rstrip_reference() == 
h1.rstrip_reference().rstrip_reference() + h1 = hit.rstrip_query() + assert h1 == h1.rstrip_query() == h1.rstrip_query().rstrip_query() - h1 = hit.lstrip_reference() - assert h1 == h1.lstrip_reference() == h1.lstrip_reference().lstrip_reference() + h1 = hit.lstrip_query() + assert h1 == h1.lstrip_query() == h1.lstrip_query().lstrip_query() - h1 = hit.lstrip_reference().rstrip_reference() - assert h1 == h1.lstrip_reference() == h1.rstrip_reference() + h1 = hit.lstrip_query().rstrip_query() + assert h1 == h1.lstrip_query() == h1.rstrip_query() - h1 = hit.rstrip_reference().lstrip_reference() - assert h1 == h1.rstrip_reference() == h1.lstrip_reference() + h1 = hit.rstrip_query().lstrip_query() + assert h1 == h1.rstrip_query() == h1.lstrip_query() @pytest.mark.parametrize('hit', strip_prop_cases_all) @@ -502,25 +502,25 @@ def test_cigar_hit_reference_strips_are_commutative(hit): hit = parsed_hit(hit) if len(hit.cigar.coordinate_mapping.ref_to_query) > 0: - assert hit.rstrip_reference().lstrip_reference() \ - == hit.lstrip_reference().rstrip_reference() + assert hit.rstrip_query().lstrip_query() \ + == hit.lstrip_query().rstrip_query() else: - assert hit.rstrip_reference().lstrip_reference().cigar \ - == hit.lstrip_reference().rstrip_reference().cigar + assert hit.rstrip_query().lstrip_query().cigar \ + == hit.lstrip_query().rstrip_query().cigar -@pytest.mark.parametrize('hit, expected', lstrip_query_cases) +@pytest.mark.parametrize('hit, expected', lstrip_reference_cases) def test_cigar_hit_lstrip_query(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) - assert expected == hit.lstrip_query() + assert expected == hit.lstrip_reference() -@pytest.mark.parametrize('hit, expected', rstrip_query_cases) +@pytest.mark.parametrize('hit, expected', rstrip_reference_cases) def test_cigar_hit_rstrip_query(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) - assert expected == hit.rstrip_query() + assert expected == hit.rstrip_reference() 
@pytest.mark.parametrize('hit', strip_prop_cases_all) @@ -530,8 +530,8 @@ def test_cigar_hit_query_strip_combines_with_connect(hit): for cut_point in range(hit.r_st - 1, hit.r_ei): left, right = hit.cut_reference(cut_point + hit.epsilon) - left = left.rstrip_query() - right = right.lstrip_query() + left = left.rstrip_reference() + right = right.lstrip_reference() assert left.connect(right).coordinate_mapping.ref_to_query \ == hit.coordinate_mapping.ref_to_query @@ -544,8 +544,8 @@ def test_cigar_hit_query_strip_combines_with_add(hit): for cut_point in range(hit.r_st - 1, hit.r_ei): left, right = hit.cut_reference(cut_point + hit.epsilon) - left = left.rstrip_query() - right = right.lstrip_query() + left = left.rstrip_reference() + right = right.lstrip_reference() if left.touches(right): assert left + right == hit @@ -555,9 +555,9 @@ def test_cigar_hit_query_strip_combines_with_add(hit): def test_cigar_hit_strips_work_together(hit): hit = parsed_hit(hit) - rstrip = str(hit.rstrip_query().rstrip_reference().cigar) + rstrip = str(hit.rstrip_reference().rstrip_query().cigar) assert not rstrip.endswith("I") and not rstrip.endswith("D") - lstrip = str(hit.lstrip_query().lstrip_reference().cigar) + lstrip = str(hit.lstrip_reference().lstrip_query().cigar) assert not lstrip.startswith("I") and not lstrip.startswith("D") @@ -565,17 +565,17 @@ def test_cigar_hit_strips_work_together(hit): def test_cigar_hit_query_strip_is_idempotent(hit): hit = parsed_hit(hit) - h1 = hit.rstrip_query() - assert h1 == h1.rstrip_query() == h1.rstrip_query().rstrip_query() + h1 = hit.rstrip_reference() + assert h1 == h1.rstrip_reference() == h1.rstrip_reference().rstrip_reference() - h1 = hit.lstrip_query() - assert h1 == h1.lstrip_query() == h1.lstrip_query().lstrip_query() + h1 = hit.lstrip_reference() + assert h1 == h1.lstrip_reference() == h1.lstrip_reference().lstrip_reference() - h1 = hit.lstrip_query().rstrip_query() - assert h1 == h1.lstrip_query() == h1.rstrip_query() + h1 = 
hit.lstrip_reference().rstrip_reference() + assert h1 == h1.lstrip_reference() == h1.rstrip_reference() - h1 = hit.rstrip_query().lstrip_query() - assert h1 == h1.rstrip_query() == h1.lstrip_query() + h1 = hit.rstrip_reference().lstrip_reference() + assert h1 == h1.rstrip_reference() == h1.lstrip_reference() @pytest.mark.parametrize('hit', strip_prop_cases_all) @@ -583,11 +583,11 @@ def test_cigar_hit_query_strips_are_commutative(hit): hit = parsed_hit(hit) if len(hit.cigar.coordinate_mapping.ref_to_query) > 0: - assert hit.rstrip_query().lstrip_query() \ - == hit.lstrip_query().rstrip_query() + assert hit.rstrip_reference().lstrip_reference() \ + == hit.lstrip_reference().rstrip_reference() else: - assert hit.rstrip_query().lstrip_query().cigar \ - == hit.lstrip_query().rstrip_query().cigar + assert hit.rstrip_reference().lstrip_reference().cigar \ + == hit.lstrip_reference().rstrip_reference().cigar @pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 5f709e6f0..0faba0bc8 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -543,7 +543,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) - assert all(x.seq == x.lstrip_query().rstrip_query().seq for x in results) + assert all(x.seq == x.lstrip().rstrip().seq for x in results) assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } @@ -568,10 +568,10 @@ def test_stitching_partial_align(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) - assert all(x.seq != x.lstrip_query().rstrip_query().seq for x in results) + assert all(x.seq != x.lstrip().rstrip().seq for x in results) assert { contig.seq for contig in contigs } \ - != { contig.lstrip_query().rstrip_query().seq for contig in results } + != { 
contig.lstrip().rstrip().seq for contig in results } def test_partial_align_consensus(exact_aligner, visualizer): @@ -625,7 +625,7 @@ def test_stitching_partial_align_multiple_sequences(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) assert { contig.seq for contig in contigs } \ - != { contig.lstrip_query().rstrip_query().seq for contig in results } + != { contig.lstrip().rstrip().seq for contig in results } def test_partial_align_consensus_multiple_sequences(exact_aligner, visualizer): diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index ddfd475b8..d129039d5 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -226,8 +226,8 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': [start_inclusive:end_noninclusive]) - def lstrip_reference(self) -> 'Cigar': - """ Return a copy of the Cigar with leading (unmatched) reference elements removed. """ + def lstrip_query(self) -> 'Cigar': + """ Return a copy of the Cigar with leading (unmatched) query elements removed. """ min_r = min(self.coordinate_mapping.ref_to_query.keys(), default=None) min_op = self.coordinate_mapping.ref_to_op.get(min_r, float("inf")) @@ -238,8 +238,8 @@ def lstrip_reference(self) -> 'Cigar': return Cigar.coerce(ops) - def rstrip_reference(self) -> 'Cigar': - """ Return a copy of the Cigar with trailing (unmatched) reference elements removed. """ + def rstrip_query(self) -> 'Cigar': + """ Return a copy of the Cigar with trailing (unmatched) query elements removed. """ max_r = max(self.coordinate_mapping.ref_to_query.keys(), default=None) max_op = self.coordinate_mapping.ref_to_op.get(max_r, float("-inf")) @@ -250,8 +250,8 @@ def rstrip_reference(self) -> 'Cigar': return Cigar.coerce(ops) - def lstrip_query(self) -> 'Cigar': - """ Return a copy of the Cigar with leading (unmatched) query elements removed. 
""" + def lstrip_reference(self) -> 'Cigar': + """ Return a copy of the Cigar with leading (unmatched) reference elements removed. """ min_q = min(self.coordinate_mapping.query_to_ref.keys(), default=None) min_op = self.coordinate_mapping.query_to_op.get(min_q, float("inf")) @@ -262,8 +262,8 @@ def lstrip_query(self) -> 'Cigar': return Cigar.coerce(ops) - def rstrip_query(self) -> 'Cigar': - """ Return a copy of the Cigar with trailing (unmatched) query elements removed. """ + def rstrip_reference(self) -> 'Cigar': + """ Return a copy of the Cigar with trailing (unmatched) reference elements removed. """ max_q = max(self.coordinate_mapping.query_to_ref.keys(), default=None) max_op = self.coordinate_mapping.query_to_op.get(max_q, float("-inf")) @@ -668,34 +668,34 @@ def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: return left, right - def lstrip_reference(self) -> 'CigarHit': - """ Return a copy of the CigarHit with leading (unmatched) reference elements removed. """ + def lstrip_query(self) -> 'CigarHit': + """ Return a copy of the CigarHit with leading (unmatched) query elements removed. """ - cigar = self.cigar.lstrip_reference() + cigar = self.cigar.lstrip_query() return CigarHit(cigar, r_st=self.r_ei - cigar.ref_length + 1, r_ei=self.r_ei, q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) - def rstrip_reference(self) -> 'CigarHit': - """ Return a copy of the CigarHit with trailing (unmatched) reference elements removed. """ + def rstrip_query(self) -> 'CigarHit': + """ Return a copy of the CigarHit with trailing (unmatched) query elements removed. """ - cigar = self.cigar.rstrip_reference() + cigar = self.cigar.rstrip_query() return CigarHit(cigar, r_st=self.r_st, r_ei=self.r_st + cigar.ref_length - 1, q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) - def lstrip_query(self) -> 'CigarHit': - """ Return a copy of the CigarHit with leading (unmatched) query elements removed. 
""" + def lstrip_reference(self) -> 'CigarHit': + """ Return a copy of the CigarHit with leading (unmatched) reference elements removed. """ - cigar = self.cigar.lstrip_query() + cigar = self.cigar.lstrip_reference() return CigarHit(cigar, r_st=self.r_ei - cigar.ref_length + 1, r_ei=self.r_ei, q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) - def rstrip_query(self) -> 'CigarHit': - """ Return a copy of the CigarHit with trailing (unmatched) query elements removed. """ + def rstrip_reference(self) -> 'CigarHit': + """ Return a copy of the CigarHit with trailing (unmatched) reference elements removed. """ - cigar = self.cigar.rstrip_query() + cigar = self.cigar.rstrip_reference() return CigarHit(cigar, r_st=self.r_st, r_ei=self.r_st + cigar.ref_length - 1, q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) From 0b8bac565a47f636eef5ed6800484efaeafec7d0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Jan 2024 17:04:19 -0800 Subject: [PATCH 180/485] Contig stitcher: improve log messages --- micall/core/contig_stitcher.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index ff2e5ffff..0be685494 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -90,8 +90,9 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont left = replace(self, name=context.get().generate_new_name(), alignment=alignment_left) right = replace(self, name=context.get().generate_new_name(), alignment=alignment_right) - logger.debug("Created contigs %r at %s and %r at %s by cutting %r.", - left.name, left.alignment, right.name, right.alignment, self.name) + logger.debug("Created contigs %r at %s and %r at %s by cutting %r at %s at cut point = %s.", + left.name, left.alignment, right.name, right.alignment, + self.name, self.alignment, round(cut_point, 1)) context.get().emit(events.Cut(self, left, right)) 
return (left, right) @@ -107,9 +108,9 @@ def lstrip(self) -> 'AlignedContig': q_remainder, query = self.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) result = AlignedContig.make(query, alignment, self.strand) - logger.debug("Doing lstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", - self.name, result.name, self.alignment, - len(self.seq), result.alignment, len(result.seq)) + logger.debug("Doing lstrip of %r at %s (len %s) resulted in %r at %s (len %s).", + self.name, self.alignment, len(self.seq), + result.name, result.alignment, len(result.seq)) context.get().emit(events.LStrip(self, result)) return result @@ -123,9 +124,9 @@ def rstrip(self) -> 'AlignedContig': alignment = self.alignment.rstrip_reference().rstrip_query() query, q_remainder = self.cut_query(alignment.q_ei + 0.5) result = AlignedContig.make(query, alignment, self.strand) - logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)", - self.name, result.name, self.alignment, - len(self.seq), result.alignment, len(result.seq)) + logger.debug("Doing rstrip of %r at %s (len %s) resulted in %r at %s (len %s).", + self.name, self.alignment, len(self.seq), + result.name, result.alignment, len(result.seq)) context.get().emit(events.RStrip(self, result)) return result @@ -611,14 +612,13 @@ def try_split(contig): contigs.append(right_part) process_queue.put(right_part) - logger.debug("Split contig %r around its gap at [%s, %s]->[%s, %s]. " - "Left part: %r at [%s, %s]->[%s, %s], " - "right part: %r at [%s, %s]->[%s, %s].", - contig.name, gap.q_st, gap.q_ei, gap.r_st, gap.r_ei, - left_part.name, left_part.alignment.q_st, left_part.alignment.q_ei, - left_part.alignment.r_st, left_part.alignment.r_ei, - right_part.name, right_part.alignment.q_st, right_part.alignment.q_ei, - right_part.alignment.r_st, right_part.alignment.r_ei) + logger.debug("Split contig %r at %s around its gap at [%s, %s]->[%s, %s]. 
" + "Left part: %r at %s, " + "right part: %r at %s.", + contig.name, contig.alignment, + gap.q_st, gap.q_ei, gap.r_st, gap.r_ei, + left_part.name, left_part.alignment, + right_part.name, right_part.alignment) context.get().emit(events.SplitGap(contig, gap, left_part, right_part)) return From 19bddbf69016766ba147d49bf31bbd477836ca10 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 29 Jan 2024 13:19:34 -0800 Subject: [PATCH 181/485] Contig stitcher: base drawing only on the parent-child relationship --- micall/core/plot_contigs.py | 177 ++++++++++++++++++++---------------- 1 file changed, 101 insertions(+), 76 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index c73cad924..1d16fa657 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -405,9 +405,10 @@ def plot_stitcher_coverage(logs: Iterable[events.EventType], genome_coverage_svg def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: - contig_map: Dict[str, GenotypedContig] = {} + complete_contig_map: Dict[str, GenotypedContig] = {} name_mappings: Dict[str, str] = {} - parent_graph: Dict[str, List[str]] = {} + complete_parent_graph: Dict[str, List[str]] = {} + alive_set: Set[str] = set() morphism_graph: Dict[str, List[str]] = {} reduced_parent_graph: Dict[str, List[str]] = {} transitive_parent_graph: Dict[str, List[str]] = {} @@ -420,12 +421,9 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: overlap_lefttake_map: Dict[str, str] = {} overlap_righttake_map: Dict[str, str] = {} overlap_sibling_map: Dict[str, str] = {} - combine_list: List[str] = [] combine_left_edge: Dict[str, str] = {} combine_right_edge: Dict[str, str] = {} - temporary: Set[str] = set() children_join_points: List[str] = [] - children_meet_points: List[str] = [] query_position_map: Dict[str, int] = {} initial_alignments: Dict[str, List[CigarHit]] = {} @@ -439,6 +437,18 @@ def remove_intermediate_edges(graph): ret[parent] = lst return ret + 
def remove_transitive_edges(graph): + tr_cl = transitive_closure(graph) + ret = {} + for parent, children in graph.items(): + lst = [] + for child in children: + is_transitive = any(child in tr_cl.get(other_node, []) for other_node in children if other_node != child) + if not is_transitive: + lst.append(child) + ret[parent] = lst + return ret + def get_all_ancestors(recur, lst, graph, ancestor_name): if ancestor_name not in recur: recur = recur.copy() @@ -460,9 +470,15 @@ def transitive_closure(graph): ret[parent] = lst return ret - def reflexive_closure(graph): - ret = graph.copy() + def copy_graph(graph): + ret = {} for parent, children in graph.items(): + ret[parent] = children[:] + return ret + + def reflexive_closure(graph): + ret = copy_graph(graph) + for parent, children in ret.items(): if parent not in children: children.append(parent) for child in children[:]: @@ -483,7 +499,7 @@ def inverse_graph(graph): return ret def graph_sum(graph_a, graph_b): - ret = graph_a.copy() + ret = copy_graph(graph_a) for key, values in graph_b.items(): if key not in ret: ret[key] = [] @@ -497,52 +513,56 @@ def symmetric_closure(graph): return graph_sum(graph, inverse_graph(graph)) def record_contig(contig: GenotypedContig, parents: List[GenotypedContig]): - contig_map[contig.name] = contig + complete_contig_map[contig.name] = contig if [contig.name] != [parent.name for parent in parents]: for parent in parents: - contig_map[parent.name] = parent - if contig.name not in parent_graph: - parent_graph[contig.name] = [] + complete_contig_map[parent.name] = parent + if contig.name not in complete_parent_graph: + complete_parent_graph[contig.name] = [] - parent_graph[contig.name].append(parent.name) + complete_parent_graph[contig.name].append(parent.name) - def record_morphism(contig: Contig, original: Contig): - if original.name not in morphism_graph: - morphism_graph[original.name] = [] - lst = morphism_graph[original.name] - if contig.name not in lst: - lst.append(contig.name) + 
def record_alive(contig: Contig): + alive_set.add(contig.name) def record_bad_contig(contig: GenotypedContig, lst: List[str]): - contig_map[contig.name] = contig + complete_contig_map[contig.name] = contig lst.append(contig.name) - for event in logs: if isinstance(event, events.FinalCombine): record_contig(event.result, event.contigs) + record_alive(event.result) elif isinstance(event, events.SplitGap): record_contig(event.left, [event.contig]) record_contig(event.right, [event.contig]) + record_alive(event.left) + record_alive(event.right) elif isinstance(event, events.Intro): record_contig(event.contig, []) + record_alive(event.contig) elif isinstance(event, events.Hit): record_contig(event.part, [event.contig]) + record_alive(event.part) elif isinstance(event, events.NoRef): record_bad_contig(event.contig, unknown) + record_alive(event.contig) elif isinstance(event, events.ZeroHits): record_bad_contig(event.contig, anomaly) + record_alive(event.contig) elif isinstance(event, events.StrandConflict): record_bad_contig(event.contig, anomaly) + record_alive(event.contig) elif isinstance(event, events.ReverseComplement): record_contig(event.result, [event.contig]) + record_alive(event.result) elif isinstance(event, events.HitNumber): initial_alignments[event.contig.name] = event.connected + record_alive(event.contig) elif isinstance(event, events.Munge): record_contig(event.result, [event.left, event.right]) elif isinstance(event, (events.LStrip, events.RStrip)): record_contig(event.result, [event.original]) - record_morphism(event.result, event.original) elif isinstance(event, events.Overlap): overlaps_list.append(event.left_overlap.name) overlaps_list.append(event.right_overlap.name) @@ -554,6 +574,7 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): overlap_sibling_map[event.right_remainder.name] = event.left_remainder.name elif isinstance(event, events.Drop): record_bad_contig(event.contig, discarded) + record_alive(event.contig) elif 
isinstance(event, events.StitchCut): record_contig(event.left_overlap, [event.left]) record_contig(event.left_remainder, [event.left]) @@ -561,13 +582,12 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): record_contig(event.right_remainder, [event.right]) elif isinstance(event, events.Stitch): record_contig(event.result, [event.left, event.right]) + record_alive(event.result) elif isinstance(event, events.Cut): record_contig(event.left, [event.original]) record_contig(event.right, [event.original]) elif isinstance(event, events.Combine): - for contig in event.contigs: - combine_list.append(contig.name) - + record_alive(event.result) record_contig(event.result, event.contigs) if event.contigs: combine_left_edge[event.result.name] = event.contigs[0].name @@ -578,6 +598,26 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): x: NoReturn = event raise RuntimeError(f"Unrecognized action or event: {event}") + nodup_parent_graph = remove_transitive_edges(complete_parent_graph) + + # Close alive set by parents + def extend_alive(contig_name): + if contig_name not in alive_set: + alive_set.add(contig_name) + + for parent_name in nodup_parent_graph.get(contig_name, []): + extend_alive(parent_name) + + for contig_name in alive_set.copy(): + extend_alive(contig_name) + + parent_graph: Dict[str, List[str]] = {} + for contig_name in nodup_parent_graph: + if contig_name in alive_set: + parent_graph[contig_name] = nodup_parent_graph[contig_name] + + contig_map: Dict[str, GenotypedContig] = {k: v for k, v in complete_contig_map.items() if k in alive_set} + bad_contigs = anomaly + discarded + unknown group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq} children_graph = inverse_graph(parent_graph) transitive_parent_graph = transitive_closure(parent_graph) @@ -590,23 +630,18 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): sorted_sinks = list(sorted(child_name for child_name in 
contig_map if child_name not in children_graph)) - bad_contigs = anomaly + discarded + unknown + + for contig_name, parents in parent_graph.items(): + if len(parents) == 1: + morphism_graph[parents[0]] = [contig_name] transitive_morphism_graph = transitive_closure(morphism_graph) reduced_morphism_graph = remove_intermediate_edges(transitive_morphism_graph) eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_morphism_graph)) - for contig_name in overlaps_list: - temporary.add(contig_name) - for child in transitive_children_graph.get(contig_name, []): - temporary.add(child) - for contig_name, parents in parent_graph.items(): - if len(parents) > 2: + if len(parents) > 1: children_join_points.append(contig_name) - for contig_name, children in children_graph.items(): - if len(children) > 2: - children_meet_points.append(contig_name) def hits_to_insertions(hits: List[CigarHit]): for hit in hits: @@ -621,12 +656,6 @@ def hits_to_insertions(hits: List[CigarHit]): nonempty_insertions = [gap for gap in all_insertions if gap.query_length > 0] unaligned_map[contig_name] = nonempty_insertions - last_join_points_parent = {contig_name for join in children_join_points for contig_name in transitive_parent_graph.get(join, [])} - last_join_points = [] - for contig_name in children_join_points: - if contig_name not in last_join_points_parent: - last_join_points.append(contig_name) - def set_query_position(contig: Contig): if contig.name in query_position_map: return @@ -651,12 +680,6 @@ def set_query_position(contig: Contig): for contig in contig_map.values(): set_query_position(contig) - # Closing `temporary' - for contig_name in contig_map: - if contig_name in temporary: - for clone in eqv_morphism_graph.get(contig_name, [contig_name]): - temporary.add(clone) - def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): for parent in edge_table: child_remainder = edge_table[parent] @@ -675,39 +698,42 @@ def copy_takes_one_side(edge_table, 
overlap_xtake_map, overlap_xparent_map): while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map)): pass final_parts: Dict[str, bool] = {} - for contig_name in contig_map: - if contig_name in temporary: - continue + pre_join_points = [] - if contig_name in combine_list: - finals = reduced_morphism_graph.get(contig_name, [contig_name]) - if len(finals) == 1: - final_parts[finals[0]] = True + def add_join_parents(join_name): + if join_name in children_join_points: + for contig_name in parent_graph.get(join_name, [join_name]): + add_join_parents(contig_name) + else: + pre_join_points.append(join_name) - elif contig_name in bad_contigs: - final_parts[contig_name] = True + for join_name in children_join_points + sorted_sinks: + add_join_parents(join_name) - for join in last_join_points + sorted_sinks: - parents = parent_graph.get(join, [join]) - if not any(isinstance(contig_map[parent], AlignedContig) for parent in parents): - parents = [join] + def is_ancestor(contig_name, other_names): + for other in other_names: + if other == contig_name: + continue - for contig_name in parents: - for contig_name in reduced_morphism_graph.get(contig_name, [contig_name]): - if contig_name in bad_contigs: - continue + if contig_name in transitive_children_graph.get(other, []): + return True + return False - if any(contig_name in transitive_parent_graph.get(bad, []) for bad in bad_contigs): - continue + for contig_name in pre_join_points[:]: + if is_ancestor(contig_name, pre_join_points): + pre_join_points.remove(contig_name) - if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig_name, [contig_name])): - continue + for contig_name in pre_join_points: + if any(contig_name in transitive_parent_graph.get(bad, []) for bad in bad_contigs): + continue - transitive_parent = eqv_parent_graph.get(contig_name, [contig_name]) - if any(parent in transitive_parent for parent in final_parts): - continue + if any(contig_name in 
eqv_morphism_graph.get(temp_name, [temp_name]) for temp_name in overlaps_list): + continue - final_parts[contig_name] = True + final_parts[contig_name] = True + + for contig_name in bad_contigs: + final_parts[contig_name] = True final_children_mapping: Dict[str, List[str]] = {} for parent_name in sorted_roots: @@ -838,11 +864,10 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] return (a_r_st, a_r_ei, f_r_st, f_r_ei) def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterable[Track]: - parts = final_children_mapping[contig_name] + parts_names = final_children_mapping[contig_name] + parts = [contig_map[name] for name in parts_names] parts = list(sorted(parts, key=lambda part: part.alignment.r_st if isinstance(part, AlignedContig) else -1)) - for prev_name, part_name, next_naem in sliding_window(parts): - part = contig_map[part_name] - + for part in parts: if part.name in repeatset: continue From 68aa30c8a8b7072e569fe520cc634e84ae27a9c0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 29 Jan 2024 17:17:07 -0800 Subject: [PATCH 182/485] Contig stitcher: draw unaligned parts in yellow in the visualizer --- micall/core/plot_contigs.py | 202 +++++++++++------- .../test_gap_around_big_insertion.svg | 48 +++-- .../test_gap_around_small_insertion.svg | 48 +++-- .../test_partial_align_consensus.svg | 8 +- ...onsensus_multiple_overlaping_sequences.svg | 14 +- .../test_stitching_partial_align.svg | 8 +- ...ching_partial_align_multiple_sequences.svg | 9 +- 7 files changed, 222 insertions(+), 115 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 1d16fa657..03f9d18eb 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -8,6 +8,7 @@ from math import log10, copysign from operator import itemgetter, attrgetter from pathlib import Path +import dataclasses import logging import yaml @@ -406,7 +407,7 @@ def plot_stitcher_coverage(logs: 
Iterable[events.EventType], genome_coverage_svg def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: complete_contig_map: Dict[str, GenotypedContig] = {} - name_mappings: Dict[str, str] = {} + name_map: Dict[str, str] = {} complete_parent_graph: Dict[str, List[str]] = {} alive_set: Set[str] = set() morphism_graph: Dict[str, List[str]] = {} @@ -415,6 +416,7 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: discarded: List[str] = [] unknown: List[str] = [] anomaly: List[str] = [] + unaligned: List[str] = [] overlaps_list: List[str] = [] overlap_leftparent_map: Dict[str, str] = {} overlap_rightparent_map: Dict[str, str] = {} @@ -424,8 +426,10 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: combine_left_edge: Dict[str, str] = {} combine_right_edge: Dict[str, str] = {} children_join_points: List[str] = [] - query_position_map: Dict[str, int] = {} + query_position_map: Dict[str, Tuple[int, int]] = {} initial_alignments: Dict[str, List[CigarHit]] = {} + lstrip_map: Dict[str, str] = {} + rstrip_map: Dict[str, str] = {} def remove_intermediate_edges(graph): ret = {} @@ -512,6 +516,22 @@ def graph_sum(graph_a, graph_b): def symmetric_closure(graph): return graph_sum(graph, inverse_graph(graph)) + def hits_to_insertions(contig: GenotypedContig, hits: List[CigarHit]): + for hit in hits: + # yield CigarHit.from_default_alignment(q_st=0, q_ei=hit.q_st - 1, r_st=hit.r_st, r_ei=hit.r_st - 1) + yield from hit.insertions() + # yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, r_st=hit.r_ei + 1, r_ei=hit.r_ei) + + def record_unaligned_parts(contig: GenotypedContig, connected: List[CigarHit]): + all_insertions = list(hits_to_insertions(contig, connected)) + nonempty_insertions = [gap for gap in all_insertions if gap.query_length > 0] + for insertion in nonempty_insertions: + query = dataclasses.replace(contig, name=f"u{len(complete_contig_map)}", seq='A' * insertion.query_length) + 
fake_aligned = AlignedContig.make(query=query, alignment=insertion, strand="forward") + record_contig(fake_aligned, [contig]) + record_bad_contig(fake_aligned, unaligned) + record_alive(fake_aligned) + def record_contig(contig: GenotypedContig, parents: List[GenotypedContig]): complete_contig_map[contig.name] = contig if [contig.name] != [parent.name for parent in parents]: @@ -529,6 +549,12 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): complete_contig_map[contig.name] = contig lst.append(contig.name) + def record_lstrip(result: GenotypedContig, original: GenotypedContig): + lstrip_map[result.name] = original.name + + def record_rstrip(result: GenotypedContig, original: GenotypedContig): + rstrip_map[result.name] = original.name + for event in logs: if isinstance(event, events.FinalCombine): record_contig(event.result, event.contigs) @@ -557,12 +583,16 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): record_contig(event.result, [event.contig]) record_alive(event.result) elif isinstance(event, events.HitNumber): - initial_alignments[event.contig.name] = event.connected + record_unaligned_parts(event.contig, event.connected) record_alive(event.contig) elif isinstance(event, events.Munge): record_contig(event.result, [event.left, event.right]) - elif isinstance(event, (events.LStrip, events.RStrip)): + elif isinstance(event, events.LStrip): + record_contig(event.result, [event.original]) + record_lstrip(event.result, event.original) + elif isinstance(event, events.RStrip): record_contig(event.result, [event.original]) + record_rstrip(event.result, event.original) elif isinstance(event, events.Overlap): overlaps_list.append(event.left_overlap.name) overlaps_list.append(event.right_overlap.name) @@ -602,9 +632,7 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): # Close alive set by parents def extend_alive(contig_name): - if contig_name not in alive_set: - alive_set.add(contig_name) - + alive_set.add(contig_name) 
for parent_name in nodup_parent_graph.get(contig_name, []): extend_alive(parent_name) @@ -617,7 +645,7 @@ def extend_alive(contig_name): parent_graph[contig_name] = nodup_parent_graph[contig_name] contig_map: Dict[str, GenotypedContig] = {k: v for k, v in complete_contig_map.items() if k in alive_set} - bad_contigs = anomaly + discarded + unknown + bad_contigs = anomaly + discarded + unknown + unaligned group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq} children_graph = inverse_graph(parent_graph) transitive_parent_graph = transitive_closure(parent_graph) @@ -643,42 +671,35 @@ def extend_alive(contig_name): if len(parents) > 1: children_join_points.append(contig_name) - def hits_to_insertions(hits: List[CigarHit]): - for hit in hits: - yield CigarHit.from_default_alignment(q_st=0, q_ei=hit.q_st - 1, r_st=hit.r_st, r_ei=hit.r_st - 1) - yield from hit.insertions() - yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, r_st=hit.r_ei + 1, r_ei=hit.r_ei) - - unaligned_map: Dict[str, List[CigarHit]] = {} - for contig_name, hits in initial_alignments.items(): + def set_query_position(contig_name: str) -> None: contig = contig_map[contig_name] - all_insertions = list(hits_to_insertions(hits)) - nonempty_insertions = [gap for gap in all_insertions if gap.query_length > 0] - unaligned_map[contig_name] = nonempty_insertions - - def set_query_position(contig: Contig): - if contig.name in query_position_map: - return - - if not isinstance(contig, AlignedContig): - return - - parent_names = parent_graph.get(contig.name, []) - parents = [contig_map[name] for name in parent_names] - parents = [contig for contig in parents if isinstance(contig, AlignedContig)] - - if parents: - for parent in parents: - if parent.name not in query_position_map: - set_query_position(parent) - - average = round(sum(query_position_map[parent_name] for parent_name in parent_names) / len(parent_names)) - 
query_position_map[contig.name] = average - else: - query_position_map[contig.name] = (contig.alignment.q_st + contig.alignment.q_ei) // 2 - - for contig in contig_map.values(): - set_query_position(contig) + children_names = children_graph.get(contig.name, []) + + if contig_name not in query_position_map: + if isinstance(contig, AlignedContig): + query_position_map[contig_name] = (contig.alignment.q_st, contig.alignment.q_ei) + + children = [contig_map[name] for name in children_names] + for child in children: + strip_parent = lstrip_map.get(child.name, None) + if strip_parent is None: + if contig_name in query_position_map: + query_position_map[child.name] = query_position_map[contig_name] + elif isinstance(child, AlignedContig): + (original_q_st, original_q_ei) = query_position_map[strip_parent] + (current_q_st, current_q_ei) = (child.alignment.q_st, child.alignment.q_ei) + original_query_len = len(contig_map[strip_parent].seq) + current_query_len = len(child.seq) + offset = abs(original_query_len - current_query_len) + new_q_st = offset + current_q_st + new_q_ei = offset + current_q_ei + query_position_map[child.name] = (new_q_st, new_q_ei) + + for child_name in children_names: + set_query_position(child_name) + + for contig_name in sorted_roots: + set_query_position(contig_name) def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): for parent in edge_table: @@ -697,15 +718,16 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): while list(copy_takes_one_side(combine_right_edge, overlap_lefttake_map, overlap_leftparent_map)): pass while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map)): pass + final_nodes: List[str] = [] final_parts: Dict[str, bool] = {} - pre_join_points = [] + final_children_mapping: Dict[str, List[str]] = {} def add_join_parents(join_name): if join_name in children_join_points: for contig_name in parent_graph.get(join_name, [join_name]): 
add_join_parents(contig_name) else: - pre_join_points.append(join_name) + final_nodes.append(join_name) for join_name in children_join_points + sorted_sinks: add_join_parents(join_name) @@ -719,11 +741,11 @@ def is_ancestor(contig_name, other_names): return True return False - for contig_name in pre_join_points[:]: - if is_ancestor(contig_name, pre_join_points): - pre_join_points.remove(contig_name) + for contig_name in final_nodes[:]: + if is_ancestor(contig_name, final_nodes): + final_nodes.remove(contig_name) - for contig_name in pre_join_points: + for contig_name in final_nodes: if any(contig_name in transitive_parent_graph.get(bad, []) for bad in bad_contigs): continue @@ -735,7 +757,6 @@ def is_ancestor(contig_name, other_names): for contig_name in bad_contigs: final_parts[contig_name] = True - final_children_mapping: Dict[str, List[str]] = {} for parent_name in sorted_roots: children = [] for final_contig in final_parts: @@ -753,24 +774,45 @@ def is_ancestor(contig_name, other_names): else: max_position = max(max_position, len(contig.seq) + 3 * position_offset) - name_mappings = {} + def overlaps(self, other) -> bool: + def intervals_overlap(x, y): + return x[0] <= y[1] and x[1] >= y[0] + + return intervals_overlap((self.alignment.q_st, self.alignment.q_ei), + (other.alignment.q_st, other.alignment.q_ei)) + + name_map = {} for i, (parent, children) in enumerate(sorted(final_children_mapping.items(), key=lambda p: p[0])): - name_mappings[parent] = f"{i + 1}" - children = list(sorted(children, key=lambda name: query_position_map.get(name, -1))) - for k, child in enumerate(children): - if len(children) > 1: - name_mappings[child] = f"{i + 1}.{k + 1}" + name_map[parent] = f"{i + 1}" + + unaligned_names = [name for name in children if name in unaligned] + aligned_names = [name for name in children if name not in unaligned] + + todo_names = aligned_names + for contig_name in unaligned_names: + all_other = [contig_map[name] for name in aligned_names] + 
aligned_other = [contig for contig in all_other if isinstance(contig, AlignedContig)] + current = contig_map[contig_name] + if isinstance(current, AlignedContig) and \ + not any(overlaps(current, other) for other in aligned_other): + todo_names.append(contig_name) + discarded.append(contig_name) + + todo_names = list(sorted(todo_names, key=lambda name: query_position_map.get(name, (-1, -1)))) + for k, child_name in enumerate(todo_names): + if len(todo_names) > 1: + name_map[child_name] = f"{i + 1}.{k + 1}" else: - name_mappings[child] = f"{i + 1}" + name_map[child_name] = f"{i + 1}" - for child in bad_contigs: - if child not in children: - if child in transitive_parent_graph \ - and parent in transitive_parent_graph[child]: + for bad_name in bad_contigs: + if bad_name not in children: + if bad_name in transitive_parent_graph \ + and parent in transitive_parent_graph[bad_name]: k += 1 - name_mappings[child] = f"{i + 1}.{k + 1}" + name_map[bad_name] = f"{i + 1}.{k + 1}" - for contig_name, name in name_mappings.items(): + for contig_name, name in name_map.items(): logger.debug(f"Contig name {contig_name!r} is displayed as {name!r}.") def get_neighbours(part, lookup): @@ -831,19 +873,17 @@ def get_neighbour(part, lookup): if prev_part is not None and prev_part.alignment.r_ei < part.alignment.r_st and prev_part: r_st = prev_part.alignment.r_st else: - start_delta = -1 * part.alignment.q_st - r_st = part.alignment.r_st + start_delta + r_st = part.alignment.r_st if next_part is not None and next_part.alignment.r_st > part.alignment.r_ei and next_part: r_ei = next_part.alignment.r_ei else: - end_delta = len(part.seq) - 1 - part.alignment.q_ei - r_ei = part.alignment.r_ei + end_delta + r_ei = part.alignment.r_ei full_size_map[part.name] = (r_st, r_ei) def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int]: - if isinstance(contig, AlignedContig): + if isinstance(contig, AlignedContig) and contig.alignment.ref_length > 0: r_st = contig.alignment.r_st 
r_ei = contig.alignment.r_ei if contig.name in aligned_size_map: @@ -867,7 +907,7 @@ def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterabl parts_names = final_children_mapping[contig_name] parts = [contig_map[name] for name in parts_names] parts = list(sorted(parts, key=lambda part: part.alignment.r_st if isinstance(part, AlignedContig) else -1)) - for part in parts: + for prev_part, part, next_part in sliding_window(parts): if part.name in repeatset: continue @@ -881,9 +921,15 @@ def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterabl continue repeatset.add(part.name) - indexes = name_mappings[part.name] + indexes = name_map[part.name] (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) + if a_r_st < f_r_st: + yield Track(min(a_r_st, f_r_st) + position_offset, max(a_r_st, f_r_st) + position_offset, color="yellow") + + if a_r_ei > f_r_ei: + yield Track(min(a_r_ei, f_r_ei) + position_offset, max(a_r_ei, f_r_ei) + position_offset, color="yellow") + yield Track(f_r_st + position_offset, f_r_ei + position_offset, label=f"{indexes}") def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bool) -> Iterable[Arrow]: @@ -904,7 +950,7 @@ def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bo continue repeatset.add(part.name) - indexes = name_mappings[part.name] if labels else None + indexes = name_map[part.name] if labels else None height = 20 if labels else 1 elevation = 1 if labels else -20 (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) @@ -1015,9 +1061,13 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: contig = contig_map[contig_name] (r_st, r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) - name = name_mappings.get(contig_name, contig_name) - figure.add(Arrow(r_st + position_offset, r_ei + position_offset, elevation=-20, h=1)) - figure.add(Track(f_r_st + position_offset, f_r_ei + position_offset, label=name)) + 
name = name_map.get(contig_name, contig_name) + if isinstance(contig, AlignedContig) and contig.name not in unaligned: + colour = 'lightgrey' + figure.add(Arrow(r_st + position_offset, r_ei + position_offset, elevation=-20, h=1)) + else: + colour = "yellow" + figure.add(Track(f_r_st + position_offset, f_r_ei + position_offset, label=name, color=colour)) ############# # Anomalies # @@ -1044,7 +1094,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: else: colour = "yellow" - name = name_mappings.get(contig_name, contig_name) + name = name_map.get(contig_name, contig_name) figure.add(Track(a_r_st + position_offset, a_r_ei + position_offset, color=colour, label=name)) ########### @@ -1065,7 +1115,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: r_st = 0 r_ei = len(contig.seq) colour = "yellow" - name = name_mappings.get(contig_name, contig_name) + name = name_map.get(contig_name, contig_name) figure.add(Track(r_st + position_offset, r_ei + position_offset, color=colour, label=name)) if not figure.elements: diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index c2cea6e3f..59354dfea 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -1,20 +1,20 @@ + width="970.0" height="187.0" viewBox="0.0 -187.0 970.0 187.0"> - + testref - + -1.1 +1.2 @@ -26,10 +26,10 @@ -1.2 +1.1 - + @@ -39,27 +39,33 @@ - + - - -1.1 - + - - + + 1.2 + + + + + + +1.1 + + - + - + @@ -67,4 +73,16 @@ + + + +discards: + + + + + +1.3 + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg index c2cea6e3f..191255656 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ 
b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -1,20 +1,20 @@ + width="970.0" height="187.0" viewBox="0.0 -187.0 970.0 187.0"> - + testref - + -1.1 +1.2 @@ -26,10 +26,10 @@ -1.2 +1.1 - + @@ -39,27 +39,33 @@ - + - - -1.1 - + - - + + 1.2 + + + + + + +1.1 + + - + - + @@ -67,4 +73,16 @@ + + + +discards: + + + + + +1.3 + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg index 74b5bdb9a..aba964bcb 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg @@ -25,7 +25,13 @@ - + + + + + + + 1 diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg index 595289080..4f27e61e0 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg @@ -31,8 +31,11 @@ - - + + + + + 1 @@ -45,8 +48,11 @@ - - + + + + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg index 74b5bdb9a..aba964bcb 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg @@ -25,7 +25,13 @@ - + + + + + + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg index b2a880eae..94243a708 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg +++ 
b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg @@ -25,8 +25,11 @@ - - + + + + + 2 @@ -42,7 +45,7 @@ - + 1 From cffb35284a1fcb712499a5913837a7a16d2e313f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 09:19:09 -0800 Subject: [PATCH 183/485] Contig stitcher: improve visualization of unaligned parts --- micall/core/plot_contigs.py | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 03f9d18eb..1a31a6fa3 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -516,19 +516,15 @@ def graph_sum(graph_a, graph_b): def symmetric_closure(graph): return graph_sum(graph, inverse_graph(graph)) - def hits_to_insertions(contig: GenotypedContig, hits: List[CigarHit]): - for hit in hits: - # yield CigarHit.from_default_alignment(q_st=0, q_ei=hit.q_st - 1, r_st=hit.r_st, r_ei=hit.r_st - 1) - yield from hit.insertions() - # yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, r_st=hit.r_ei + 1, r_ei=hit.r_ei) - - def record_unaligned_parts(contig: GenotypedContig, connected: List[CigarHit]): - all_insertions = list(hits_to_insertions(contig, connected)) - nonempty_insertions = [gap for gap in all_insertions if gap.query_length > 0] - for insertion in nonempty_insertions: - query = dataclasses.replace(contig, name=f"u{len(complete_contig_map)}", seq='A' * insertion.query_length) + def record_unaligned_parts(result: AlignedContig, original: AlignedContig): + length = abs(result.alignment.query_length - original.alignment.query_length) + if length > 0: + q_st = max(result.alignment.q_st, original.alignment.q_st) + r_st = min(result.alignment.r_st, original.alignment.r_st) + insertion = CigarHit.from_default_alignment(q_st=q_st, q_ei=q_st + length - 1, r_st=r_st, r_ei=r_st-1) + query = dataclasses.replace(original, name=f"u{len(complete_contig_map)}", seq='A' * 
insertion.query_length) fake_aligned = AlignedContig.make(query=query, alignment=insertion, strand="forward") - record_contig(fake_aligned, [contig]) + record_contig(fake_aligned, [original]) record_bad_contig(fake_aligned, unaligned) record_alive(fake_aligned) @@ -583,16 +579,17 @@ def record_rstrip(result: GenotypedContig, original: GenotypedContig): record_contig(event.result, [event.contig]) record_alive(event.result) elif isinstance(event, events.HitNumber): - record_unaligned_parts(event.contig, event.connected) record_alive(event.contig) elif isinstance(event, events.Munge): record_contig(event.result, [event.left, event.right]) elif isinstance(event, events.LStrip): record_contig(event.result, [event.original]) record_lstrip(event.result, event.original) + record_unaligned_parts(event.result, event.original) elif isinstance(event, events.RStrip): record_contig(event.result, [event.original]) record_rstrip(event.result, event.original) + record_unaligned_parts(event.result, event.original) elif isinstance(event, events.Overlap): overlaps_list.append(event.left_overlap.name) overlaps_list.append(event.right_overlap.name) @@ -790,12 +787,8 @@ def intervals_overlap(x, y): todo_names = aligned_names for contig_name in unaligned_names: - all_other = [contig_map[name] for name in aligned_names] - aligned_other = [contig for contig in all_other if isinstance(contig, AlignedContig)] - current = contig_map[contig_name] - if isinstance(current, AlignedContig) and \ - not any(overlaps(current, other) for other in aligned_other): - todo_names.append(contig_name) + todo_names.append(contig_name) + if contig_name not in discarded: discarded.append(contig_name) todo_names = list(sorted(todo_names, key=lambda name: query_position_map.get(name, (-1, -1)))) From 2443278575281ba9897e575fdb70117bd9e4f6d9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 10:31:15 -0800 Subject: [PATCH 184/485] Contig stitcher: add few more tests --- 
..._overlapping_and_one_separate_contig_2.svg | 83 ++++++++++++ ...cross_alignment_around_small_insertion.svg | 70 ++++++++++ .../test_stitch_cross_alignment.svg | 50 +++++++ .../test_stitch_with_insertion.svg | 40 ++++++ ...ching_partial_align_multiple_sequences.svg | 57 ++++---- micall/tests/test_contig_stitcher.py | 123 ++++++++++++++++-- 6 files changed, 388 insertions(+), 35 deletions(-) create mode 100644 micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg create mode 100644 micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg new file mode 100644 index 000000000..2868289f7 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg @@ -0,0 +1,83 @@ + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + +3 + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + +2 + + + + + + + + + + + + + + + + +3 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg new file mode 100644 index 000000000..054c55b78 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg @@ -0,0 +1,70 @@ + + + + + + + +testref + + + + + + + +1.2 + + + + + +2 + + + + + +1.1 + + + + + + + + + + + + + + + + +1.2 + + + + + +1.1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git 
a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg new file mode 100644 index 000000000..6e1a9ecb1 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg @@ -0,0 +1,50 @@ + + + + + + + +testref + + + + + + + +1.2 + + + + + +1.1 + + + + + + + + + + + + + + + + +1.2 + + + + + +1.1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg b/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg new file mode 100644 index 000000000..d2fa07921 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg @@ -0,0 +1,40 @@ + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg index 94243a708..910d8e6d6 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg @@ -1,53 +1,60 @@ + width="970.0" height="117.0" viewBox="0.0 -117.0 970.0 117.0"> - + testref - + - - + + + +1 + + + + -2 +2 - + - - + + - + - + - - -2 + + +1 + + + + + - - - -discards: + + + - - - + + + +2 - - - -1 \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 0faba0bc8..24009e3ae 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -204,6 +204,41 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali assert len(visualizer().elements) > len(contigs) +def test_correct_processing_of_two_overlapping_and_one_separate_contig_2(exact_aligner, visualizer): + # Scenario: Two overlapping contigs are stitched 
together, the non-overlapping is kept separate. + + ref_seq = 'Z' * 5 + 'A' * 100 + 'C' * 100 + 'T' * 100 + 'Y' * 5 + + contigs = [ + GenotypedContig(name='a', + seq='N' * 5 + 'A' * 50 + 'C' * 20 + 'H' * 5, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='b', + seq='M' * 5 + 'C' * 50 + 'T' * 20 + 'J' * 5, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='c', + seq='Q' * 5 + 'C' * 20 + 'T' * 50 + 'I' * 5, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == contigs[0].seq.rstrip('H') + 'C' * 30 + contigs[2].seq.lstrip('Q') + assert len(visualizer().elements) > len(contigs) + + def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner, visualizer): # Scenario: All contigs have some overlapping parts, resulting in one continuous sequence after stitching. 
@@ -602,14 +637,14 @@ def test_stitching_partial_align_multiple_sequences(exact_aligner, visualizer): contigs = [ GenotypedContig(name='a', - seq='T' * 10 + 'C' * 20 + 'A' * 10, + seq='Z' * 5 + 'C' * 20 + 'T' * 5 + 'U' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, match_fraction=0.3, ), GenotypedContig(name='b', - seq='C' * 20 + 'A' * 10 + 'G' * 10, + seq='M' * 5 + 'C' * 5 + 'T' * 10 + 'G' * 10, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -619,14 +654,9 @@ def test_stitching_partial_align_multiple_sequences(exact_aligner, visualizer): results = list(stitch_contigs(contigs)) assert len(results) == 1 - for result in results: - assert any(result.seq in contig.seq for contig in contigs) - + assert results[0].seq == 'Z' * 5 + 'C' * 20 + 'T' * 10 + 'G' * 10 assert len(visualizer().elements) > len(contigs) - assert { contig.seq for contig in contigs } \ - != { contig.lstrip().rstrip().seq for contig in results } - def test_partial_align_consensus_multiple_sequences(exact_aligner, visualizer): # Scenario: Multiple contigs partially align to the same reference sequence, and a consensus sequence is being stitched from them. 
@@ -745,7 +775,7 @@ def test_gap_around_small_insertion(exact_aligner, visualizer): match_fraction=0.3, ), GenotypedContig(name='b', - seq='B' * 20, + seq='Q' * 5 + 'B' * 20 + 'J' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -755,6 +785,7 @@ def test_gap_around_small_insertion(exact_aligner, visualizer): results = list(stitch_consensus(contigs)) assert len(results) == 1 + assert results[0].seq == "P" * 5 + "A" * 10 + "B" * 20 + "C" * 10 + "Z" * 5 assert len(visualizer().elements) > len(contigs) @@ -772,7 +803,78 @@ def test_gap_around_big_insertion(exact_aligner, visualizer): match_fraction=0.3, ), GenotypedContig(name='b', - seq='B' * 20, + seq='Q' * 5 + 'B' * 20 + 'J' * 5, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == "P" * 5 + "A" * 10 + "B" * 20 + "C" * 10 + "Z" * 5 + assert len(visualizer().elements) > len(contigs) + + +def test_stitch_with_insertion(exact_aligner, visualizer): + # Scenario: Contig is aligned with multiple hits, and the borders are correctly handled. + + ref_seq='X' * 5 + 'A' * 10 + 'B' * 20 + 'C' * 10 + 'M' * 5 + + contigs = [ + GenotypedContig(name='a', + seq='P' * 5 + 'A' * 10 + 'D' * 6 + 'C' * 10 + 'Z' * 5, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == "PPPPPAAAAAAAAAADDDDDDCCCCCCCCCCZZZZZ" + assert len(visualizer().elements) > len(contigs) + + + +def test_stitch_cross_alignment(exact_aligner, visualizer): + # Scenario: Single contig is cross-aligned. 
+ + ref_seq='X' * 5 + 'A' * 10 + 'B' * 20 + 'C' * 10 + 'M' * 5 + + contigs = [ + GenotypedContig(name='a', + seq='P' * 5 + 'C' * 10 + 'D' * 6 + 'A' * 10 + 'Z' * 5, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == "AAAAAAAAAACCCCCCCCCC" + assert len(visualizer().elements) > len(contigs) + + +def test_cross_alignment_around_small_insertion(exact_aligner, visualizer): + # Scenario: Single contig is cross-aligned, then combined with another contig that is between its aligned parts. + + ref_seq='X' * 5 + 'A' * 10 + 'B' * 20 + 'C' * 10 + 'M' * 5 + + contigs = [ + GenotypedContig(name='a', + seq='P' * 5 + 'C' * 10 + 'D' * 6 + 'A' * 10 + 'Z' * 5, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig(name='b', + seq='Q' * 5 + 'B' * 20 + 'J' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -782,6 +884,7 @@ def test_gap_around_big_insertion(exact_aligner, visualizer): results = list(stitch_consensus(contigs)) assert len(results) == 1 + assert results[0].seq == "A" * 10 + "B" * 20 + "C" * 10 assert len(visualizer().elements) > len(contigs) From 88c900e80e30229c797b0d4ecdd88c13be34a9c5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 12:50:37 -0800 Subject: [PATCH 185/485] Contig stitcher: fix incorrect numbering case in the visualizer The issue has to do with incorrect calculation for query_position_map values. 
--- micall/core/plot_contigs.py | 64 ++++++++++++------- .../test_gap_around_big_insertion.svg | 10 +-- .../test_gap_around_small_insertion.svg | 10 +-- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 1a31a6fa3..b758f2901 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -519,14 +519,15 @@ def symmetric_closure(graph): def record_unaligned_parts(result: AlignedContig, original: AlignedContig): length = abs(result.alignment.query_length - original.alignment.query_length) if length > 0: - q_st = max(result.alignment.q_st, original.alignment.q_st) - r_st = min(result.alignment.r_st, original.alignment.r_st) + q_st = original.alignment.q_st + r_st = original.alignment.r_st insertion = CigarHit.from_default_alignment(q_st=q_st, q_ei=q_st + length - 1, r_st=r_st, r_ei=r_st-1) query = dataclasses.replace(original, name=f"u{len(complete_contig_map)}", seq='A' * insertion.query_length) - fake_aligned = AlignedContig.make(query=query, alignment=insertion, strand="forward") + fake_aligned = AlignedContig.make(query=query, alignment=insertion, strand=original.strand) record_contig(fake_aligned, [original]) record_bad_contig(fake_aligned, unaligned) record_alive(fake_aligned) + return fake_aligned def record_contig(contig: GenotypedContig, parents: List[GenotypedContig]): complete_contig_map[contig.name] = contig @@ -547,9 +548,17 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): def record_lstrip(result: GenotypedContig, original: GenotypedContig): lstrip_map[result.name] = original.name + unaligned = record_unaligned_parts(result, original) + assert original.name != result.name + if unaligned: + lstrip_map[unaligned.name] = result.name def record_rstrip(result: GenotypedContig, original: GenotypedContig): rstrip_map[result.name] = original.name + unaligned = record_unaligned_parts(result, original) + assert original.name != result.name + if unaligned: + 
rstrip_map[unaligned.name] = result.name for event in logs: if isinstance(event, events.FinalCombine): @@ -585,11 +594,9 @@ def record_rstrip(result: GenotypedContig, original: GenotypedContig): elif isinstance(event, events.LStrip): record_contig(event.result, [event.original]) record_lstrip(event.result, event.original) - record_unaligned_parts(event.result, event.original) elif isinstance(event, events.RStrip): record_contig(event.result, [event.original]) record_rstrip(event.result, event.original) - record_unaligned_parts(event.result, event.original) elif isinstance(event, events.Overlap): overlaps_list.append(event.left_overlap.name) overlaps_list.append(event.right_overlap.name) @@ -672,25 +679,38 @@ def set_query_position(contig_name: str) -> None: contig = contig_map[contig_name] children_names = children_graph.get(contig.name, []) + def copy_from_parent(contig: AlignedContig, parent_name: str) -> None: + parent = contig_map[parent_name] + if parent_name in query_position_map: + (original_q_st, original_q_ei) = query_position_map[parent_name] + (current_q_st, current_q_ei) = (contig.alignment.q_st, contig.alignment.q_ei) + original_query_len = abs(original_q_st - original_q_ei) + current_query_len = abs(current_q_st - current_q_ei) + + if contig_name in lstrip_map: + if contig_name in unaligned: + query_position_map[contig.name] = (original_q_st - current_query_len - 1, original_q_st - 1) + else: + query_position_map[contig.name] = (original_q_ei - current_query_len, original_q_ei) + elif contig_name in rstrip_map: + if contig_name in unaligned: + query_position_map[contig.name] = (original_q_ei + 1, original_q_ei + 1 + current_query_len) + else: + query_position_map[contig.name] = (original_q_st, original_q_st + current_query_len) + else: + query_position_map[contig_name] = query_position_map[parent_name] + if contig_name not in query_position_map: if isinstance(contig, AlignedContig): - query_position_map[contig_name] = (contig.alignment.q_st, 
contig.alignment.q_ei) - - children = [contig_map[name] for name in children_names] - for child in children: - strip_parent = lstrip_map.get(child.name, None) - if strip_parent is None: - if contig_name in query_position_map: - query_position_map[child.name] = query_position_map[contig_name] - elif isinstance(child, AlignedContig): - (original_q_st, original_q_ei) = query_position_map[strip_parent] - (current_q_st, current_q_ei) = (child.alignment.q_st, child.alignment.q_ei) - original_query_len = len(contig_map[strip_parent].seq) - current_query_len = len(child.seq) - offset = abs(original_query_len - current_query_len) - new_q_st = offset + current_q_st - new_q_ei = offset + current_q_ei - query_position_map[child.name] = (new_q_st, new_q_ei) + regular_parents_names = parent_graph.get(contig_name, []) + regular_parents_names = [name for name in regular_parents_names if name in query_position_map] + strip_parents_names = lstrip_map.get(contig_name, None) or rstrip_map.get(contig_name, None) + parents_names = (strip_parents_names and [strip_parents_names]) or regular_parents_names + if parents_names: + for parent_name in parents_names: + copy_from_parent(contig, parent_name) + else: + query_position_map[contig_name] = (contig.alignment.q_st, contig.alignment.q_ei) for child_name in children_names: set_query_position(child_name) diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index 59354dfea..6d54f27db 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -14,7 +14,7 @@ -1.2 +1.1 @@ -26,7 +26,7 @@ -1.1 +1.3 @@ -46,7 +46,7 @@ -1.2 +1.1 @@ -55,7 +55,7 @@ -1.1 +1.3 @@ -82,7 +82,7 @@ -1.3 +1.2 \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg 
index 191255656..74111b4e8 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -14,7 +14,7 @@ -1.2 +1.1 @@ -26,7 +26,7 @@ -1.1 +1.3 @@ -46,7 +46,7 @@ -1.2 +1.1 @@ -55,7 +55,7 @@ -1.1 +1.3 @@ -82,7 +82,7 @@ -1.3 +1.2 \ No newline at end of file From 2c203d02cf9db1d69d9d4254a24894592488df42 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 14:40:36 -0800 Subject: [PATCH 186/485] Contig stitcher: fix unaligned display in cross alignment case --- micall/core/contig_stitcher.py | 11 +++-- micall/core/plot_contigs.py | 45 ++++++++++------- ...cross_alignment_around_small_insertion.svg | 46 ++++++++++++----- .../test_stitch_cross_alignment.svg | 40 ++++++++++++--- micall/utils/contig_stitcher_events.py | 49 +++++++++++-------- 5 files changed, 131 insertions(+), 60 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 0be685494..a826d5635 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -287,11 +287,16 @@ def is_out_of_order(name: str) -> bool: sorted_by_query = sorted(contigs, key=lambda contig: get_indexes(contig.name)) for prev_contig, contig, next_contig in sliding_window(sorted_by_query): if isinstance(contig, AlignedContig): - name = contig.name - if prev_contig is not None or is_out_of_order(name): + original = contig + start = prev_contig.alignment.q_ei + 1 if isinstance(prev_contig, AlignedContig) else 0 + end = next_contig.alignment.q_st - 1 if isinstance(next_contig, AlignedContig) else len(contig.seq) - 1 + + if prev_contig is not None or is_out_of_order(original.name): contig = contig.lstrip() - if next_contig is not None or is_out_of_order(name): + context.get().emit(events.InitialStrip(original, start, original.alignment.q_st - 1)) + if next_contig is not None or is_out_of_order(original.name): contig = contig.rstrip() + 
context.get().emit(events.InitialStrip(original, original.alignment.q_ei + 1, end)) yield contig diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index b758f2901..ed4f6649f 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -427,9 +427,9 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: combine_right_edge: Dict[str, str] = {} children_join_points: List[str] = [] query_position_map: Dict[str, Tuple[int, int]] = {} - initial_alignments: Dict[str, List[CigarHit]] = {} lstrip_map: Dict[str, str] = {} rstrip_map: Dict[str, str] = {} + strip_set: Set[Tuple[str, int, int]] = set() def remove_intermediate_edges(graph): ret = {} @@ -516,18 +516,31 @@ def graph_sum(graph_a, graph_b): def symmetric_closure(graph): return graph_sum(graph, inverse_graph(graph)) - def record_unaligned_parts(result: AlignedContig, original: AlignedContig): - length = abs(result.alignment.query_length - original.alignment.query_length) - if length > 0: - q_st = original.alignment.q_st - r_st = original.alignment.r_st - insertion = CigarHit.from_default_alignment(q_st=q_st, q_ei=q_st + length - 1, r_st=r_st, r_ei=r_st-1) - query = dataclasses.replace(original, name=f"u{len(complete_contig_map)}", seq='A' * insertion.query_length) - fake_aligned = AlignedContig.make(query=query, alignment=insertion, strand=original.strand) + def record_unaligned_parts(original: AlignedContig, q_st: int, r_st: int, length: int): + key = (original.seq, q_st, q_st + length) + if length > 0 and key not in strip_set: + strip_set.add(key) + alignment = CigarHit.from_default_alignment(q_st=q_st, q_ei=q_st + length - 1, r_st=r_st, r_ei=r_st-1) + seq = 'A' * alignment.query_length + query = dataclasses.replace(original, name=f"u{len(complete_contig_map)}", seq=seq) + fake_aligned = AlignedContig.make(query, alignment, strand=original.strand) record_contig(fake_aligned, [original]) record_bad_contig(fake_aligned, unaligned) record_alive(fake_aligned) 
return fake_aligned + return None + + def record_regular_strip(result: AlignedContig, original: AlignedContig): + length = abs(result.alignment.query_length - original.alignment.query_length) + q_st = original.alignment.q_st + r_st = original.alignment.r_st + return record_unaligned_parts(original, q_st=q_st, r_st=r_st, length=length) + + def record_initial_strip(original: AlignedContig, q_st: int, q_ei: int): + length = q_ei - q_st + 1 + contig = record_unaligned_parts(original, q_st, original.alignment.r_st, length) + if contig: + query_position_map[contig.name] = (q_st, q_ei) def record_contig(contig: GenotypedContig, parents: List[GenotypedContig]): complete_contig_map[contig.name] = contig @@ -546,17 +559,15 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): complete_contig_map[contig.name] = contig lst.append(contig.name) - def record_lstrip(result: GenotypedContig, original: GenotypedContig): + def record_lstrip(result: AlignedContig, original: AlignedContig): lstrip_map[result.name] = original.name - unaligned = record_unaligned_parts(result, original) - assert original.name != result.name + unaligned = record_regular_strip(result, original) if unaligned: lstrip_map[unaligned.name] = result.name - def record_rstrip(result: GenotypedContig, original: GenotypedContig): + def record_rstrip(result: AlignedContig, original: AlignedContig): rstrip_map[result.name] = original.name - unaligned = record_unaligned_parts(result, original) - assert original.name != result.name + unaligned = record_regular_strip(result, original) if unaligned: rstrip_map[unaligned.name] = result.name @@ -597,6 +608,8 @@ def record_rstrip(result: GenotypedContig, original: GenotypedContig): elif isinstance(event, events.RStrip): record_contig(event.result, [event.original]) record_rstrip(event.result, event.original) + elif isinstance(event, events.InitialStrip): + record_initial_strip(event.contig, event.q_st, event.q_ei) elif isinstance(event, events.Overlap): 
overlaps_list.append(event.left_overlap.name) overlaps_list.append(event.right_overlap.name) @@ -680,11 +693,9 @@ def set_query_position(contig_name: str) -> None: children_names = children_graph.get(contig.name, []) def copy_from_parent(contig: AlignedContig, parent_name: str) -> None: - parent = contig_map[parent_name] if parent_name in query_position_map: (original_q_st, original_q_ei) = query_position_map[parent_name] (current_q_st, current_q_ei) = (contig.alignment.q_st, contig.alignment.q_ei) - original_query_len = abs(original_q_st - original_q_ei) current_query_len = abs(current_q_st - current_q_ei) if contig_name in lstrip_map: diff --git a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg index 054c55b78..d44530a93 100644 --- a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg @@ -1,20 +1,20 @@ + width="970.0" height="227.0" viewBox="0.0 -227.0 970.0 227.0"> - + testref - + -1.2 +1.4 @@ -26,10 +26,10 @@ -1.1 +1.2 - + @@ -39,27 +39,27 @@ - + -1.2 +1.4 -1.1 +1.2 - + - + @@ -67,4 +67,28 @@ + + + +discards: + + + + + +1.5 + + + + + +1.1 + + + + + +1.3 + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg index 6e1a9ecb1..f2aa421ef 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg +++ b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg @@ -1,29 +1,29 @@ + width="970.0" height="196.0" viewBox="0.0 -196.0 970.0 196.0"> - + testref - + -1.2 +1.4 -1.1 +1.2 - + @@ -33,18 +33,42 @@ - + -1.2 +1.4 +1.2 + + + + + + +discards: + + + + + 1.1 + + + +1.3 + + + + + +1.5 + \ No newline at end of file diff --git a/micall/utils/contig_stitcher_events.py 
b/micall/utils/contig_stitcher_events.py index a18bf11e8..fccfba40d 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -4,74 +4,81 @@ from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit -@dataclass +@dataclass(frozen=True) class Cut: original: 'Contig' left: 'Contig' right: 'Contig' -@dataclass +@dataclass(frozen=True) class LStrip: original: 'AlignedContig' result: 'AlignedContig' -@dataclass +@dataclass(frozen=True) class RStrip: original: 'AlignedContig' result: 'AlignedContig' -@dataclass +@dataclass(frozen=True) class Munge: left: 'AlignedContig' right: 'AlignedContig' result: 'AlignedContig' -@dataclass +@dataclass(frozen=True) class Combine: contigs: List['AlignedContig'] result: 'AlignedContig' -@dataclass +@dataclass(frozen=True) class NoRef: contig: 'GenotypedContig' -@dataclass +@dataclass(frozen=True) class ZeroHits: contig: 'GenotypedContig' -@dataclass +@dataclass(frozen=True) class StrandConflict: contig: 'GenotypedContig' -@dataclass +@dataclass(frozen=True) class HitNumber: contig: 'GenotypedContig' initial: List[Tuple[CigarHit, Literal["reverse", "forward"]]] connected: List[CigarHit] -@dataclass +@dataclass(frozen=True) class ReverseComplement: contig: 'GenotypedContig' result: 'GenotypedContig' -@dataclass +@dataclass(frozen=True) class Hit: contig: 'GenotypedContig' part: 'AlignedContig' index: int -@dataclass +@dataclass(frozen=True) +class InitialStrip: + contig: 'AlignedContig' + q_st: int + q_ei: int + + +@dataclass(frozen=True) class StitchCut: left: 'AlignedContig' right: 'AlignedContig' @@ -81,7 +88,7 @@ class StitchCut: right_remainder: 'AlignedContig' -@dataclass +@dataclass(frozen=True) class Overlap: left: 'AlignedContig' right: 'AlignedContig' @@ -97,31 +104,31 @@ class Overlap: cut_point_scaled: Fraction -@dataclass +@dataclass(frozen=True) class NoOverlap: contig: 'AlignedContig' -@dataclass +@dataclass(frozen=True) class Stitch: left: 'AlignedContig' right: 
'AlignedContig' result: 'AlignedContig' -@dataclass +@dataclass(frozen=True) class Drop: contig: 'AlignedContig' covering: List['AlignedContig'] -@dataclass +@dataclass(frozen=True) class IgnoreGap: contig: 'AlignedContig' gap: 'CigarHit' -@dataclass +@dataclass(frozen=True) class SplitGap: contig: 'AlignedContig' gap: 'CigarHit' @@ -129,12 +136,12 @@ class SplitGap: right: 'AlignedContig' -@dataclass +@dataclass(frozen=True) class Intro: contig: 'GenotypedContig' -@dataclass +@dataclass(frozen=True) class FinalCombine: contigs: List['AlignedContig'] result: 'AlignedContig' @@ -142,4 +149,4 @@ class FinalCombine: AlignmentEvent = Union[NoRef, ZeroHits, StrandConflict, HitNumber, ReverseComplement, Hit] ModifyEvent = Union[LStrip, RStrip] -EventType = Union[Cut, ModifyEvent, Munge, Combine, AlignmentEvent, StitchCut, Overlap, NoOverlap, Stitch, Drop, IgnoreGap, SplitGap, Intro, FinalCombine] +EventType = Union[Cut, ModifyEvent, Munge, Combine, AlignmentEvent, InitialStrip, StitchCut, Overlap, NoOverlap, Stitch, Drop, IgnoreGap, SplitGap, Intro, FinalCombine] From ef78c2dbb0ba54e0efb24d07a197390cb1cd15f6 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 15:22:07 -0800 Subject: [PATCH 187/485] Contig stitcher: colour the reference track depending on coverage --- micall/core/plot_contigs.py | 48 ++++- .../tests/data/exact_parts_contigs.plot.svg | 31 ++- .../test_big_insertion_in_a_single_contig.svg | 17 +- ...est_big_insertion_in_a_single_contig_2.svg | 15 +- .../test_correct_processing_complex_logs.svg | 182 ------------------ ...test_correct_processing_complex_nogaps.svg | 84 ++++++-- ...wo_overlapping_and_one_separate_contig.svg | 27 ++- ..._overlapping_and_one_separate_contig_2.svg | 27 ++- ...g_of_two_partially_overlapping_contigs.svg | 21 +- ...overlapping_different_organism_contigs.svg | 36 +++- ...cross_alignment_around_small_insertion.svg | 27 ++- .../test_gap_around_big_insertion.svg | 27 ++- .../test_gap_around_small_insertion.svg | 27 ++- 
...test_identical_stitching_of_one_contig.svg | 15 +- .../test_partial_align_consensus.svg | 15 +- ...onsensus_multiple_overlaping_sequences.svg | 23 ++- ...ial_align_consensus_multiple_sequences.svg | 21 +- ...stitching_of_non_overlapping_contigs_1.svg | 21 +- ...stitching_of_non_overlapping_contigs_2.svg | 21 +- .../test_stitch_cross_alignment.svg | 21 +- .../test_stitch_with_insertion.svg | 17 +- ..._stitching_contig_with_big_covered_gap.svg | 27 ++- ...itching_contig_with_big_noncovered_gap.svg | 17 +- ...ching_contig_with_big_noncovered_gap_2.svg | 21 +- ...titching_contig_with_small_covered_gap.svg | 29 ++- ..._overlapping_contigs_into_one_sequence.svg | 27 ++- .../test_stitching_of_identical_contigs.svg | 15 +- .../test_stitching_partial_align.svg | 15 +- ...ching_partial_align_multiple_sequences.svg | 21 +- ...e_contig_completely_covered_by_another.svg | 15 +- .../test_stitching_with_empty_contigs.svg | 15 +- 31 files changed, 656 insertions(+), 269 deletions(-) delete mode 100644 micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index ed4f6649f..d94dd6f48 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -789,7 +789,7 @@ def is_ancestor(contig_name, other_names): children = [] for final_contig in final_parts: if final_contig == parent_name or \ - parent_name in reduced_parent_graph.get(final_contig, []): + parent_name in reduced_parent_graph.get(final_contig, [final_contig]): children.append(final_contig) final_children_mapping[parent_name] = children @@ -1042,10 +1042,52 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: color=landmark_colour)) figure.add(Multitrack(subtracks)) - # Drawing the reference sequence. 
+ ############# + # Reference # + ############# + r_st = 0 r_ei = group_refs[group_ref] - figure.add(Track(r_st + position_offset, r_ei + position_offset, label=f"{group_ref}")) + reference_tracks = [] + reference_min = r_st + position_offset + reference_max = r_ei + position_offset + reference_tracks.append(Track(r_st + position_offset, r_ei + position_offset, color="red")) + + for contig_name in final_parts: + contig = contig_map[contig_name] + if contig.group_ref != group_ref: + continue + + if not isinstance(contig, AlignedContig): + continue + + if contig_name in bad_contigs: + continue + + (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) + reference_tracks.append(Track(a_r_st + position_offset, a_r_ei + position_offset, color="yellow")) + reference_min = min(a_r_st + position_offset, reference_min) + reference_max = max(a_r_ei + position_offset, reference_max) + + for contig_name in final_parts: + contig = contig_map[contig_name] + if contig.group_ref != group_ref: + continue + + if not isinstance(contig, AlignedContig): + continue + + if contig_name in bad_contigs: + continue + + (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) + reference_tracks.append(Track(f_r_st + position_offset, f_r_ei + position_offset, color="lightgray")) + reference_min = min(f_r_st + position_offset, reference_min) + reference_max = max(f_r_ei + position_offset, reference_max) + + figure.add(Multitrack(reference_tracks)) + midpoint = round((reference_max - reference_min) / 2 + reference_min) + figure.add(Track(midpoint, midpoint, label=group_ref, color="transparent", h=-11.5)) ########## # Arrows # diff --git a/micall/tests/data/exact_parts_contigs.plot.svg b/micall/tests/data/exact_parts_contigs.plot.svg index 54edb7fd6..d67bb740d 100644 --- a/micall/tests/data/exact_parts_contigs.plot.svg +++ b/micall/tests/data/exact_parts_contigs.plot.svg @@ -1,9 +1,9 @@ + width="970.0" height="195.5" viewBox="0.0 -195.5 970.0 195.5"> - + @@ -35,7 +35,7 @@ - + 
@@ -61,7 +61,7 @@ - + @@ -87,7 +87,7 @@ - + @@ -119,9 +119,26 @@ + + + + + + + + + + + + + + + + + - - + + HIV1-B-FR-K03455-seed diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg index 5bebd90cc..0bdb9b7e2 100644 --- a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg @@ -1,11 +1,22 @@ + width="970.0" height="84.5" viewBox="0.0 -84.5 970.0 84.5"> + + + + + + + + + + + - - + + testref diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg index a529c9c8e..028a2ef7c 100644 --- a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg @@ -1,10 +1,21 @@ + width="970.0000000000001" height="84.5" viewBox="0.0 -84.5 970.0000000000001 84.5"> + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg deleted file mode 100644 index d8de84bba..000000000 --- a/micall/tests/data/stitcher_plots/test_correct_processing_complex_logs.svg +++ /dev/null @@ -1,182 +0,0 @@ - - - - - - - -testref-2 - - - - - - - -1.2 - - - - - -2.2 - - - - - -3.2 - - - - - -4.2 - - - - - - - - - - - - -1.2 - - - - - - - - - - - - - -2.2 - - - - - - - - - - - - - -3.2 - - - - - - - - - - - - - -4.2 - - - - - - -testref-1 - - - - - - - -1.1 - - - - - -2.1 - - - - - -3.1 - - - - - -4.1 - - - - - - - - - - - - -1.1 - - - - - - - - - - - - - -2.1 - - - - - - - - - - - - - -3.1 - - - - - - - - - - - - - -4.1 - - - - \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg 
b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg index d8de84bba..3230d983f 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg @@ -1,15 +1,44 @@ + width="969.9999999999999" height="350.0" viewBox="0.0 -350.0 969.9999999999999 350.0"> - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + testref-2 - + @@ -35,13 +64,13 @@ 4.2 - + - + @@ -49,13 +78,13 @@ - + - + @@ -63,13 +92,13 @@ - + - + @@ -77,13 +106,13 @@ - + - + @@ -91,8 +120,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + testref-1 diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg index 593bba7cc..bf8755c35 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg @@ -1,10 +1,33 @@ + width="969.9999999999999" height="146.5" viewBox="0.0 -146.5 969.9999999999999 146.5"> + + + + + + + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg index 2868289f7..0109eeb68 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg @@ -1,10 +1,33 @@ + width="970.0000000000001" height="146.5" viewBox="0.0 -146.5 970.0000000000001 146.5"> + + + + + + + + + + + + + + + + + + + + + + + - + testref diff --git 
a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg index 49f2ab8f1..6ac677308 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg @@ -1,10 +1,27 @@ + width="970.0000000000001" height="115.5" viewBox="0.0 -115.5 970.0000000000001 115.5"> + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg index 3647ee590..1e422d0bd 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg @@ -1,15 +1,26 @@ + width="970.0000000000001" height="164.0" viewBox="0.0 -164.0 970.0000000000001 164.0"> - - + + + + + + + + + + + + + testref-1 - + @@ -17,13 +28,13 @@ 1 - + - + @@ -31,8 +42,19 @@ + + + + + + + + + + + - + testref-2 diff --git a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg index d44530a93..93391e7b7 100644 --- a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg @@ -1,10 +1,33 @@ + width="970.0" height="225.5" viewBox="0.0 -225.5 970.0 225.5"> + + + + + + + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg 
b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index 6d54f27db..98cf4d95e 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -1,10 +1,33 @@ + width="970.0" height="185.5" viewBox="0.0 -185.5 970.0 185.5"> + + + + + + + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg index 74111b4e8..3d46a9969 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -1,10 +1,33 @@ + width="970.0" height="185.5" viewBox="0.0 -185.5 970.0 185.5"> + + + + + + + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg b/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg index 88eb96ca6..9ca6c6216 100644 --- a/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg +++ b/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg @@ -1,10 +1,21 @@ + width="970.0" height="84.5" viewBox="0.0 -84.5 970.0 84.5"> + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg index aba964bcb..f921a2010 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg @@ -1,10 +1,21 @@ + width="970.0" height="84.5" viewBox="0.0 -84.5 970.0 84.5"> + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg index 
4f27e61e0..4841d31b6 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg @@ -1,11 +1,28 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> + + + + + + + + + + + + + + + + + - - + + testref diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg index 4d53d9e33..8e68713ae 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg @@ -1,10 +1,27 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg index 504060aac..c41cf81cc 100644 --- a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg @@ -1,10 +1,27 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg index 504060aac..420874285 100644 --- a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg @@ -1,10 +1,27 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> + + + + + + + + + + + + + + + + + - + testref diff 
--git a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg index f2aa421ef..45eccaada 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg +++ b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg @@ -1,10 +1,27 @@ + width="970.0" height="194.5" viewBox="0.0 -194.5 970.0 194.5"> + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg b/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg index d2fa07921..aed852d3e 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg @@ -1,11 +1,22 @@ + width="970.0" height="84.5" viewBox="0.0 -84.5 970.0 84.5"> + + + + + + + + + + + - - + + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg index db7c205fc..f9a6d08f5 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg @@ -1,10 +1,33 @@ + width="970.0" height="138.5" viewBox="0.0 -138.5 970.0 138.5"> + + + + + + + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg index f0bef2372..b452e1d25 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg @@ -1,11 +1,22 @@ + width="969.9999999999999" height="84.5" viewBox="0.0 -84.5 969.9999999999999 84.5"> + + + + + + + + + + + - - + + testref diff --git 
a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg index 5e9be4ba7..fe2df981f 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg @@ -1,10 +1,27 @@ + width="969.9999999999999" height="115.5" viewBox="0.0 -115.5 969.9999999999999 115.5"> + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg index 2db694b50..248071e1d 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg @@ -1,11 +1,34 @@ + width="970.0" height="138.5" viewBox="0.0 -138.5 970.0 138.5"> + + + + + + + + + + + + + + + + + + + + + + + - - + + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg index d8b0a4dcb..6f5e767eb 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg @@ -1,10 +1,33 @@ + width="969.9999999999999" height="146.5" viewBox="0.0 -146.5 969.9999999999999 146.5"> + + + + + + + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg index de7ef094c..eb6412601 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg +++ 
b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg @@ -1,10 +1,21 @@ + width="970.0" height="196.5" viewBox="0.0 -196.5 970.0 196.5"> + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg index aba964bcb..f921a2010 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg @@ -1,10 +1,21 @@ + width="970.0" height="84.5" viewBox="0.0 -84.5 970.0 84.5"> + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg index 910d8e6d6..c8722b346 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg @@ -1,10 +1,27 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> + + + + + + + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg index 9035a2abd..f1c6df0dc 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg @@ -1,10 +1,21 @@ + width="970.0000000000001" height="165.5" viewBox="0.0 -165.5 970.0000000000001 165.5"> + + + + + + + + + + + - + testref diff --git a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg index 3cd329d42..8bec656af 100644 --- 
a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg @@ -1,10 +1,21 @@ + width="970.0" height="154.5" viewBox="0.0 -154.5 970.0 154.5"> + + + + + + + + + + + - + testref From 6faaa1e8f3c1c1bcf09b608767570f1deb3729bc Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 15:31:50 -0800 Subject: [PATCH 188/485] Contig stitcher: remove unused variables in the visualizer --- micall/core/plot_contigs.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index d94dd6f48..31bdd29c1 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -927,14 +927,11 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] a_r_ei = f_r_ei return (a_r_st, a_r_ei, f_r_st, f_r_ei) - def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterable[Track]: + def get_tracks(group_ref: str, contig_name: str) -> Iterable[Track]: parts_names = final_children_mapping[contig_name] parts = [contig_map[name] for name in parts_names] parts = list(sorted(parts, key=lambda part: part.alignment.r_st if isinstance(part, AlignedContig) else -1)) for prev_part, part, next_part in sliding_window(parts): - if part.name in repeatset: - continue - if part.name in bad_contigs: continue @@ -944,7 +941,6 @@ def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterabl if part.group_ref != group_ref: continue - repeatset.add(part.name) indexes = name_map[part.name] (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) @@ -956,14 +952,11 @@ def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterabl yield Track(f_r_st + position_offset, f_r_ei + position_offset, label=f"{indexes}") - def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bool) -> Iterable[Arrow]: + def 
get_arrows(group_ref: str, contig_name: str, labels: bool) -> Iterable[Arrow]: parts = final_children_mapping[contig_name] for part_name in parts: part = contig_map[part_name] - if part.name in repeatset: - continue - if part.name in bad_contigs: continue @@ -973,7 +966,6 @@ def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bo if part.group_ref != group_ref: continue - repeatset.add(part.name) indexes = name_map[part.name] if labels else None height = 20 if labels else 1 elevation = 1 if labels else -20 @@ -984,9 +976,8 @@ def get_arrows(repeatset: Set[str], group_ref: str, contig_name: str, labels: bo label=indexes) def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: - repeatset: Set[str] = set() for parent_name in sorted_roots: - yield from get_arrows(repeatset, group_ref, parent_name, labels) + yield from get_arrows(group_ref, parent_name, labels) ################ # Drawing part # @@ -1101,13 +1092,11 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: # Contigs # ########### - repeatset1: Set[str] = set() - repeatset2: Set[str] = set() for parent_name in sorted_roots: - arrows = list(get_arrows(repeatset1, group_ref, parent_name, labels=False)) + arrows = list(get_arrows(group_ref, parent_name, labels=False)) if arrows: figure.add(ArrowGroup(arrows)) - parts = list(get_tracks(repeatset2, group_ref, parent_name)) + parts = list(get_tracks(group_ref, parent_name)) if parts: figure.add(Multitrack(parts)) From 614a0f716cab8740fa9386d502fa0320b0174965 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 15:39:10 -0800 Subject: [PATCH 189/485] Contig stitcher: handle negative drawing coordinates better --- micall/core/plot_contigs.py | 20 +++-- .../test_gap_around_big_insertion.svg | 78 +++++++++---------- .../test_gap_around_small_insertion.svg | 78 +++++++++---------- 3 files changed, 90 insertions(+), 86 deletions(-) diff --git a/micall/core/plot_contigs.py 
b/micall/core/plot_contigs.py index 31bdd29c1..0ce1b431a 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -794,14 +794,6 @@ def is_ancestor(contig_name, other_names): final_children_mapping[parent_name] = children - min_position, max_position = 1, 1 - position_offset = 100 - for _, contig in contig_map.items(): - if isinstance(contig, GenotypedContig) and contig.ref_seq is not None: - max_position = max(max_position, len(contig.ref_seq) + 3 * position_offset) - else: - max_position = max(max_position, len(contig.seq) + 3 * position_offset) - def overlaps(self, other) -> bool: def intervals_overlap(x, y): return x[0] <= y[1] and x[1] >= y[0] @@ -979,6 +971,18 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: for parent_name in sorted_roots: yield from get_arrows(group_ref, parent_name, labels) + min_position, max_position = 0, 1 + for contig_name in final_parts: + contig = contig_map[contig_name] + if isinstance(contig, AlignedContig): + positions = get_contig_coordinates(contig) + max_position = max(max_position, max(positions)) + min_position = min(min_position, min(positions)) + else: + max_position = max(max_position, len(contig.seq)) + + position_offset = -1 * min_position + 100 + ################ # Drawing part # ################ diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index 98cf4d95e..439de04e4 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -5,106 +5,106 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.3 + + + +1.3 - - + + - - + + - + - - + + 1.1 - + - - + + 1.3 - - + + - - + + 2 - - + + discards: - - + + 1.2 diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg 
b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg index 3d46a9969..a948c5471 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -5,106 +5,106 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.3 + + + +1.3 - - + + - - + + - + - - + + 1.1 - + - - + + 1.3 - - + + - - + + 2 - - + + discards: - - + + 1.2 From e277bdcaa4ca0ad858aae28d6f445bfd4eac4c65 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 16:22:23 -0800 Subject: [PATCH 190/485] Contig stitcher: improve visualizer positioning for small images --- micall/core/plot_contigs.py | 5 +- .../tests/data/exact_parts_contigs.plot.svg | 118 ++++++------ .../test_big_insertion_in_a_single_contig.svg | 26 +-- ...est_big_insertion_in_a_single_contig_2.svg | 28 +-- ...test_correct_processing_complex_nogaps.svg | 172 +++++++++--------- ...wo_overlapping_and_one_separate_contig.svg | 68 +++---- ..._overlapping_and_one_separate_contig_2.svg | 74 ++++---- ...g_of_two_partially_overlapping_contigs.svg | 48 ++--- ...overlapping_different_organism_contigs.svg | 54 +++--- ...cross_alignment_around_small_insertion.svg | 82 ++++----- .../test_gap_around_big_insertion.svg | 78 ++++---- .../test_gap_around_small_insertion.svg | 78 ++++---- ...test_identical_stitching_of_one_contig.svg | 26 +-- .../test_partial_align_consensus.svg | 30 +-- ...onsensus_multiple_overlaping_sequences.svg | 50 ++--- ...ial_align_consensus_multiple_sequences.svg | 48 ++--- ...stitching_of_non_overlapping_contigs_1.svg | 46 ++--- ...stitching_of_non_overlapping_contigs_2.svg | 46 ++--- .../test_stitch_cross_alignment.svg | 62 +++---- .../test_stitch_with_insertion.svg | 30 +-- ..._stitching_contig_with_big_covered_gap.svg | 66 +++---- ...itching_contig_with_big_noncovered_gap.svg | 28 +-- ...ching_contig_with_big_noncovered_gap_2.svg | 46 ++--- 
...titching_contig_with_small_covered_gap.svg | 66 +++---- ..._overlapping_contigs_into_one_sequence.svg | 68 +++---- .../test_stitching_of_identical_contigs.svg | 48 ++--- .../test_stitching_of_zero_contigs.svg | 10 +- .../test_stitching_partial_align.svg | 30 +-- ...ching_partial_align_multiple_sequences.svg | 52 +++--- ...e_contig_completely_covered_by_another.svg | 40 ++-- .../test_stitching_with_empty_contigs.svg | 36 ++-- 31 files changed, 833 insertions(+), 826 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 0ce1b431a..a157a18e2 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -981,7 +981,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: else: max_position = max(max_position, len(contig.seq)) - position_offset = -1 * min_position + 100 + position_offset = -1 * min_position + 0.05 * (max_position - min_position) ################ # Drawing part # @@ -1178,7 +1178,8 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: figure.add(Track(r_st + position_offset, r_ei + position_offset, color=colour, label=name)) if not figure.elements: - figure.add(Track(1, max_position, label='No contigs found.', color='none')) + figure.add(Track(0, max_position, label='.', color='none')) + figure.add(Track(0, max_position * 3 / 2, label='No contigs found.', color='none', h=-10)) return figure diff --git a/micall/tests/data/exact_parts_contigs.plot.svg b/micall/tests/data/exact_parts_contigs.plot.svg index d67bb740d..57f0ae5a4 100644 --- a/micall/tests/data/exact_parts_contigs.plot.svg +++ b/micall/tests/data/exact_parts_contigs.plot.svg @@ -5,181 +5,181 @@ - - + + 5' LTR - - + + gag - - + + vif - - + + tat - - + + nef - - + + tat - - + + vpu - - + + rev - - + + 3' LTR - - + + pol - - + + vpr - - + + rev - - + + env - - + + PR - - + + RT - - + + INT - - + + V3 - - + + GP41 - + - + - + - + - + - - + + HIV1-B-FR-K03455-seed - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - 
+ + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg index 0bdb9b7e2..1354a4e59 100644 --- a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg @@ -5,39 +5,39 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg index 028a2ef7c..ee8e6e7ca 100644 --- a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg @@ -1,43 +1,43 @@ + width="970.0" height="84.5" viewBox="0.0 -84.5 970.0 84.5"> - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg index 3230d983f..9ee304317 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg @@ -5,234 +5,234 @@ - + - + - + - + - + - + - + - + - + - - + + testref-2 - - - -1.2 + + + +1.2 - - - -2.2 + + + +2.2 - - - -3.2 + + + +3.2 - - - -4.2 + + + +4.2 - - + + - - + + 1.2 - - + + - - + + 2.2 - - + + - - + + 3.2 - - + + - - + + 4.2 - + - + - + - + - + - + - + - + - + - - + + testref-1 - - - -1.1 + + + +1.1 - - - -2.1 + + + +2.1 - - - -3.1 + + + +3.1 - - - -4.1 + + + +4.1 - - + + - - + + 1.1 - - + + - - + + 2.1 - - + + - - + + 3.1 - - + + - - + + 4.1 diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg 
b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg index bf8755c35..d7982712d 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg @@ -1,95 +1,95 @@ + width="970.0" height="146.5" viewBox="0.0 -146.5 970.0 146.5"> - + - + - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - - -3 + + + +3 - - + + - - + + 1 - - + + - - + + 2 - - + + - - + + 3 diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg index 0109eeb68..494e97909 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg @@ -1,104 +1,104 @@ + width="970.0" height="146.5" viewBox="0.0 -146.5 970.0 146.5"> - + - + - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - - -3 + + + +3 - - + + - + - - + + 1 - - + + - + - - + + 2 - - + + - + - - + + 3 diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg index 6ac677308..6bfe21d0b 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg @@ -1,69 +1,69 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git 
a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg index 1e422d0bd..3b761d3e1 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg @@ -1,82 +1,82 @@ + width="970.0" height="164.0" viewBox="0.0 -164.0 970.0 164.0"> - + - + - + - - + + testref-1 - - - -1 + + + +1 - - + + - - + + 1 - + - + - + - - + + testref-2 - - - -2 + + + +2 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg index 93391e7b7..2b15b873f 100644 --- a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg @@ -5,112 +5,112 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.4 + + + +1.4 - - - -2 + + + +2 - - - -1.2 + + + +1.2 - - + + - - + + - - + + 1.4 - - + + 1.2 - - + + - - + + 2 - - + + discards: - - + + 1.5 - - + + 1.1 - - + + 1.3 diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index 439de04e4..b827f3135 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -5,106 +5,106 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.3 + + + +1.3 - - + + - - + + - + - - + + 1.1 - + - - + + 1.3 - - + + - - + + 2 - - + + discards: - - + + 1.2 diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg 
b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg index a948c5471..458c2f82a 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -5,106 +5,106 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.3 + + + +1.3 - - + + - - + + - + - - + + 1.1 - + - - + + 1.3 - - + + - - + + 2 - - + + discards: - - + + 1.2 diff --git a/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg b/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg index 9ca6c6216..8f6edc705 100644 --- a/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg +++ b/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg @@ -5,39 +5,39 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg index f921a2010..db61ddadf 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg @@ -5,45 +5,45 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - + - + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg index 4841d31b6..2f1fa6751 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg @@ -5,71 +5,71 @@ - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - + - - + + 1 - - + + - + - - + + 2 diff --git 
a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg index 8e68713ae..fa71913ef 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg @@ -1,69 +1,69 @@ + width="970.0000000000001" height="115.5" viewBox="0.0 -115.5 970.0000000000001 115.5"> - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg index c41cf81cc..a36396ded 100644 --- a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg @@ -5,65 +5,65 @@ - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg index 420874285..025e46de5 100644 --- a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg @@ -5,65 +5,65 @@ - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg index 45eccaada..5f6c6d33c 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg +++ 
b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg @@ -5,86 +5,86 @@ - + - + - + - + - + - - + + testref - - - -1.4 + + + +1.4 - - - -1.2 + + + +1.2 - - + + - - + + - - + + 1.4 - - + + 1.2 - - + + discards: - - + + 1.1 - - + + 1.3 - - + + 1.5 diff --git a/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg b/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg index aed852d3e..aad92c783 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg @@ -5,45 +5,45 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - + - + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg index f9a6d08f5..b8b7b1e50 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg @@ -5,87 +5,87 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.2 + + + +1.2 - - + + - - + + - - + + 1.1 - - + + 1.2 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg index b452e1d25..7e663f73b 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg @@ -1,43 +1,43 @@ + width="970.0" height="84.5" viewBox="0.0 -84.5 970.0 84.5"> - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg index fe2df981f..5db9b1daa 100644 --- 
a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg @@ -5,65 +5,65 @@ - + - + - + - + - + - - + + testref - - - -2 + + + +2 - - - -1 + + + +1 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg index 248071e1d..629ad55cc 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg @@ -5,87 +5,87 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.2 + + + +1.2 - - + + - - + + - - + + 1.1 - - + + 1.2 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg index 6f5e767eb..bf13e76e8 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg @@ -1,95 +1,95 @@ + width="970.0" height="146.5" viewBox="0.0 -146.5 970.0 146.5"> - + - + - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - - -3 + + + +3 - - + + - - + + 1 - - + + - - + + 2 - - + + - - + + 3 diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg index eb6412601..0e61a9491 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg @@ -1,70 +1,70 @@ + width="969.9999999999999" height="196.5" viewBox="0.0 -196.5 
969.9999999999999 196.5"> - + - + - + - - + + testref - - - -3 + + + +3 - - + + - - + + 3 - - + + discards: - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_zero_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_of_zero_contigs.svg index edb3d2eaf..3a07b1711 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_zero_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_zero_contigs.svg @@ -4,8 +4,14 @@ - - + + +. + + + + + No contigs found. diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg index f921a2010..db61ddadf 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg @@ -5,45 +5,45 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - + - + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg index c8722b346..6c44de7d5 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg @@ -1,75 +1,75 @@ + width="970.0000000000001" height="115.5" viewBox="0.0 -115.5 970.0000000000001 115.5"> - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - + - - + + 1 - - + + - + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg index f1c6df0dc..891cb5c0d 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg +++ 
b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg @@ -1,60 +1,60 @@ + width="970.0" height="165.5" viewBox="0.0 -165.5 970.0 165.5"> - + - + - + - - + + testref - - - -2 + + + +2 - - + + - - + + 2 - - + + discards: - - + + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg index 8bec656af..c3cf7c090 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg @@ -1,56 +1,56 @@ + width="969.9999999999999" height="154.5" viewBox="0.0 -154.5 969.9999999999999 154.5"> - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 - - + + anomaly: - - + + 2 From 832e560193eec7ed8fc6b3839c116282214610a1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 16:28:56 -0800 Subject: [PATCH 191/485] Contig stitcher: do not draw arrows above discarded contigs --- micall/core/plot_contigs.py | 1 - .../test_stitching_of_identical_contigs.svg | 24 +++++++------------ ...e_contig_completely_covered_by_another.svg | 18 ++++++-------- 3 files changed, 15 insertions(+), 28 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index a157a18e2..8b36de97c 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1123,7 +1123,6 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: name = name_map.get(contig_name, contig_name) if isinstance(contig, AlignedContig) and contig.name not in unaligned: colour = 'lightgrey' - figure.add(Arrow(r_st + position_offset, r_ei + position_offset, elevation=-20, h=1)) else: colour = "yellow" figure.add(Track(f_r_st + position_offset, f_r_ei + position_offset, label=name, color=colour)) diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg 
b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg index 0e61a9491..1a6736275 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg @@ -1,9 +1,9 @@ + width="969.9999999999999" height="174.5" viewBox="0.0 -174.5 969.9999999999999 174.5"> - + @@ -14,13 +14,13 @@ - + testref - + @@ -28,13 +28,13 @@ 3 - + - + @@ -42,26 +42,18 @@ - + discards: - - - - - + 1 - - - - diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg index 891cb5c0d..af7536837 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg @@ -1,9 +1,9 @@ + width="970.0" height="154.5" viewBox="0.0 -154.5 970.0 154.5"> - + @@ -14,13 +14,13 @@ - + testref - + @@ -28,13 +28,13 @@ 2 - + - + @@ -42,16 +42,12 @@ - + discards: - - - - From 63c1695dcb83955b481da71ddbeceba90fc4649d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 16:34:29 -0800 Subject: [PATCH 192/485] Contig stitcher: make sure that names do not repeat --- micall/core/contig_stitcher.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index a826d5635..9d800dd97 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,4 +1,4 @@ -from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal, TypeVar, Callable +from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal, TypeVar, Callable, Set from collections import deque, defaultdict from dataclasses import dataclass, replace from math import ceil, floor @@ -22,11 +22,16 @@ class 
StitcherContext: def __init__(self) -> None: self.name_generator_state: int = 0 + self.nameset: Set[str] = set() self.events: List[events.EventType] = [] def generate_new_name(self) -> str: - self.name_generator_state += 1 - return f"c{self.name_generator_state}" + while True: + self.name_generator_state += 1 + name = f"c{self.name_generator_state}" + if name not in self.nameset: + self.nameset.add(name) + return name def emit(self, event: events.EventType) -> None: self.events.append(event) @@ -644,6 +649,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCont contig.name, contig.seq, contig.ref_name, contig.group_ref, contig.ref_seq, len(contig.seq)) context.get().emit(events.Intro(contig)) + context.get().nameset.add(contig.name) maybe_aligned = list(align_all_to_reference(contigs)) From da8cd99dab745f1b5ff4e576f6112c65e26d5461 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 16:55:04 -0800 Subject: [PATCH 193/485] Contig stitcher: simplify usage of the context --- micall/core/contig_stitcher.py | 25 ++++++++++++++--------- micall/core/denovo.py | 6 ++---- micall/tests/test_contig_stitcher_fuzz.py | 15 +++++++------- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9d800dd97..9dfd2bdd4 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -10,6 +10,7 @@ from Bio import Seq import logging from contextvars import ContextVar, Context +from contextlib import contextmanager from fractions import Fraction from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit @@ -37,15 +38,18 @@ def emit(self, event: events.EventType) -> None: self.events.append(event) -context: ContextVar[StitcherContext] = ContextVar("StitcherContext", default=StitcherContext()) + @staticmethod + @contextmanager + def fresh(): + ctx = StitcherContext() + token = context.set(ctx) + try: + yield ctx + finally: + 
context.reset(token) -def with_fresh_context(body: Callable[[StitcherContext], T]) -> T: - def wrapper(): - ctx = StitcherContext() - context.set(ctx) - return body(ctx) - return Context().run(wrapper) +context: ContextVar[StitcherContext] = ContextVar("StitcherContext") @dataclass(frozen=True) @@ -710,9 +714,10 @@ def main(args): logger.setLevel(logging.WARN) logging.basicConfig(level=logger.level) - write_contig_refs(args.contigs.name, args.stitched_contigs, stitcher_plot_path=args.plot) - args.contigs.close() - args.stitched_contigs.close() + with StitcherContext.fresh(): + write_contig_refs(args.contigs.name, args.stitched_contigs, stitcher_plot_path=args.plot) + args.contigs.close() + args.stitched_contigs.close() if __name__ == '__main__': diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 86e5c5434..e8c82bda4 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -19,7 +19,7 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig -from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, with_fresh_context +from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, StitcherContext from micall.core.plot_contigs import plot_stitcher_coverage IVA = "iva" @@ -83,7 +83,7 @@ def write_contig_refs(contigs_fasta_path, contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") group_refs = {} - def run_stitcher(ctx): + with StitcherContext.fresh() as ctx: genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs) @@ -102,8 +102,6 @@ def run_stitcher(ctx): return len(contigs) - return with_fresh_context(run_stitcher) - def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): """ Use Blastn to search for the genotype of a set of reference sequences. 
diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index 28984d87d..a83a07761 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -1,7 +1,8 @@ import pytest import json import os -from micall.core.contig_stitcher import GenotypedContig, AlignedContig, stitch_consensus, stitch_contigs, split_contigs_with_gaps, drop_completely_covered, combine_overlaps, with_fresh_context, StitcherContext +from micall.core.contig_stitcher import GenotypedContig, AlignedContig, stitch_consensus, stitch_contigs, split_contigs_with_gaps, drop_completely_covered, combine_overlaps, StitcherContext +import micall.core.contig_stitcher as stitcher from micall.core.plot_contigs import build_stitcher_figure from micall.utils.cigar_tools import CigarHit, Cigar from micall.utils.consensus_aligner import CigarActions @@ -14,6 +15,11 @@ def no_aligner(monkeypatch): monkeypatch.setattr('micall.core.contig_stitcher.align_to_reference', lambda x: [x]) +@pytest.fixture(autouse=True) +def stitcher_context(): + stitcher.context.set(StitcherContext()) + + def read_contigs(line): array = json.loads(line) contig_descriptions = [obj['fields'] for obj in array if obj['type'] == 'contig'] @@ -116,13 +122,8 @@ def test_stitching_intervals_prop(no_aligner, description): @pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) def test_visualizer_simple(no_aligner, description): contigs = description['contigs'] - for contig in contigs: - contig.__dict__["group_ref"] = "HIV1-B-FR-K03455-seed" - - def test(ctx: StitcherContext): + with StitcherContext.fresh() as ctx: stitched = list(stitch_consensus(contigs)) assert len(ctx.events) >= len(contigs) figure = build_stitcher_figure(ctx.events) assert len(figure.elements) > len(contigs) + 1 - - with_fresh_context(test) From 5321972aeb986931a11b1948fd6c2b7ddd2a12b9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 30 Jan 2024 17:21:52 -0800 
Subject: [PATCH 194/485] Contig stitcher: draw dashed lines to separate contig sections --- micall/core/plot_contigs.py | 36 ++-- .../tests/data/exact_parts_contigs.plot.svg | 118 +++++------ .../test_big_insertion_in_a_single_contig.svg | 26 +-- ...est_big_insertion_in_a_single_contig_2.svg | 26 +-- ...test_correct_processing_complex_nogaps.svg | 174 ++++++++-------- ...wo_overlapping_and_one_separate_contig.svg | 66 +++---- ..._overlapping_and_one_separate_contig_2.svg | 72 +++---- ...g_of_two_partially_overlapping_contigs.svg | 46 ++--- ...overlapping_different_organism_contigs.svg | 52 ++--- ...cross_alignment_around_small_insertion.svg | 186 +++++++++++++----- .../test_gap_around_big_insertion.svg | 106 +++++++++- .../test_gap_around_small_insertion.svg | 106 +++++++++- ...test_identical_stitching_of_one_contig.svg | 28 +-- .../test_partial_align_consensus.svg | 30 +-- ...ial_align_consensus_multiple_sequences.svg | 48 ++--- ...stitching_of_non_overlapping_contigs_1.svg | 46 ++--- ...stitching_of_non_overlapping_contigs_2.svg | 46 ++--- .../test_stitch_cross_alignment.svg | 162 +++++++++++---- .../test_stitch_with_insertion.svg | 30 +-- ..._stitching_contig_with_big_covered_gap.svg | 66 +++---- ...itching_contig_with_big_noncovered_gap.svg | 26 +-- ...ching_contig_with_big_noncovered_gap_2.svg | 48 ++--- ...titching_contig_with_small_covered_gap.svg | 66 +++---- ..._overlapping_contigs_into_one_sequence.svg | 66 +++---- .../test_stitching_of_identical_contigs.svg | 138 ++++++++++--- .../test_stitching_partial_align.svg | 30 +-- ...ching_partial_align_multiple_sequences.svg | 52 ++--- ...e_contig_completely_covered_by_another.svg | 134 ++++++++++--- .../test_stitching_with_empty_contigs.svg | 134 ++++++++++--- 29 files changed, 1397 insertions(+), 767 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 8b36de97c..66c3fef0e 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -5,7 +5,7 @@ from csv 
import DictReader from io import StringIO from itertools import groupby -from math import log10, copysign +from math import log10, copysign, floor from operator import itemgetter, attrgetter from pathlib import Path import dataclasses @@ -971,7 +971,27 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: for parent_name in sorted_roots: yield from get_arrows(group_ref, parent_name, labels) - min_position, max_position = 0, 1 + def make_ray() -> Element: + screen_size = (max_position - min_position) + position_offset / 2 + single_size = 0.02 * screen_size + + def generate_beams(): + for i in range(floor(screen_size / single_size) + 1): + if i % 2 == 0: + yield Track(i * single_size + min_position + position_offset / 2, (i + 1) * single_size + min_position + position_offset / 2, h=0.1, color="green") + + return Multitrack(list(generate_beams())) + + def add_section(title: str) -> None: + label = LeftLabel(text=title, x=0, font_size=12) + pos = position_offset / 2 + figure.add(Arrow(pos, pos, h=0)) + figure.add(make_ray()) + figure.add(Arrow(pos, pos, h=0)) + figure.add(Track(pos, pos, label=label, h=0)) + + min_position = 0 + max_position = max(group_refs.values(), default=1) for contig_name in final_parts: contig = contig_map[contig_name] if isinstance(contig, AlignedContig): @@ -1109,9 +1129,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: ############# if discarded: - label = LeftLabel(text=f"discards:", x=0, font_size=12) - pos = position_offset / 2 - figure.add(Track(pos, pos, h=40, label=label)) + add_section("discards:") for parent_name in sorted_roots: contigs = final_children_mapping.get(parent_name, []) for contig_name in contigs: @@ -1132,9 +1150,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: ############# if anomaly: - label = LeftLabel(text=f"anomaly:", x=0, font_size=12) - pos = position_offset / 2 - figure.add(Track(pos, pos, h=40, label=label)) + add_section("anomaly:") for 
parent_name in sorted_roots: contigs = final_children_mapping.get(parent_name, []) for contig_name in contigs: @@ -1160,9 +1176,7 @@ def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: ########### if unknown: - label = LeftLabel(text=f"unknown:", x=0, font_size=12) - pos = position_offset / 2 - figure.add(Track(pos, pos, h=40, label=label)) + add_section("unknown:") for parent_name in sorted_roots: contigs = final_children_mapping.get(parent_name, []) for contig_name in contigs: diff --git a/micall/tests/data/exact_parts_contigs.plot.svg b/micall/tests/data/exact_parts_contigs.plot.svg index 57f0ae5a4..50e268a9c 100644 --- a/micall/tests/data/exact_parts_contigs.plot.svg +++ b/micall/tests/data/exact_parts_contigs.plot.svg @@ -5,181 +5,181 @@ - - + + 5' LTR - - + + gag - - + + vif - - + + tat - - + + nef - - + + tat - - + + vpu - - + + rev - - + + 3' LTR - - + + pol - - + + vpr - - + + rev - - + + env - - + + PR - - + + RT - - + + INT - - + + V3 - - + + GP41 - + - + - + - + - + - - + + HIV1-B-FR-K03455-seed - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg index 1354a4e59..be9c6fd17 100644 --- a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg @@ -5,39 +5,39 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg index ee8e6e7ca..7d250074d 100644 --- a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig_2.svg @@ -5,39 +5,39 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - 
+ + 1 diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg index 9ee304317..a0f1d5cd9 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg @@ -1,238 +1,238 @@ + width="970.0" height="350.0" viewBox="0.0 -350.0 970.0 350.0"> - + - + - + - + - + - + - + - + - + - - + + testref-2 - - - -1.2 + + + +1.2 - - - -2.2 + + + +2.2 - - - -3.2 + + + +3.2 - - - -4.2 + + + +4.2 - - + + - - + + 1.2 - - + + - - + + 2.2 - - + + - - + + 3.2 - - + + - - + + 4.2 - + - + - + - + - + - + - + - + - + - - + + testref-1 - - - -1.1 + + + +1.1 - - - -2.1 + + + +2.1 - - - -3.1 + + + +3.1 - - - -4.1 + + + +4.1 - - + + - - + + 1.1 - - + + - - + + 2.1 - - + + - - + + 3.1 - - + + - - + + 4.1 diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg index d7982712d..e44db239f 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg @@ -5,91 +5,91 @@ - + - + - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - - -3 + + + +3 - - + + - - + + 1 - - + + - - + + 2 - - + + - - + + 3 diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg index 494e97909..5be34238b 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg +++ 
b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg @@ -5,100 +5,100 @@ - + - + - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - - -3 + + + +3 - - + + - + - - + + 1 - - + + - + - - + + 2 - - + + - + - - + + 3 diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg index 6bfe21d0b..87df386a2 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg @@ -5,65 +5,65 @@ - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg index 3b761d3e1..00499e4f3 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_different_organism_contigs.svg @@ -5,78 +5,78 @@ - + - + - + - - + + testref-1 - - - -1 + + + +1 - - + + - - + + 1 - + - + - + - - + + testref-2 - - - -2 + + + +2 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg index 2b15b873f..fc5cbd75e 100644 --- a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg @@ -1,116 +1,204 @@ + width="970.0" height="215.6" viewBox="0.0 -215.6 
970.0 215.6"> - + - + - + - + - + - + - + - + - - - + + + testref - + - - - -1.4 + + + +1.4 - - - -2 + + + +2 - - - -1.2 + + + +1.2 - + - - + + - - + + - + - - + + 1.4 - - + + 1.2 - + - - + + - + - - + + 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + discards: - - + + 1.5 - - + + 1.1 - - + + 1.3 diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index b827f3135..9851a4607 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -1,9 +1,9 @@ + width="970.0" height="175.6" viewBox="0.0 -175.6 970.0 175.6"> - + @@ -26,13 +26,13 @@ - + testref - + @@ -52,7 +52,7 @@ 1.3 - + @@ -62,7 +62,7 @@ - + @@ -82,13 +82,13 @@ - + - + @@ -96,8 +96,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + discards: diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg index 458c2f82a..82bb9a3a7 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -1,9 +1,9 @@ + width="970.0" height="175.6" viewBox="0.0 -175.6 970.0 175.6"> - + @@ -26,13 +26,13 @@ - + testref - + @@ -52,7 +52,7 @@ 1.3 - + @@ -62,7 +62,7 @@ - + @@ -82,13 +82,13 @@ - + - + @@ -96,8 +96,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + discards: diff --git 
a/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg b/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg index 8f6edc705..ff865ed0e 100644 --- a/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg +++ b/micall/tests/data/stitcher_plots/test_identical_stitching_of_one_contig.svg @@ -1,43 +1,43 @@ + width="970.0000000000001" height="84.5" viewBox="0.0 -84.5 970.0000000000001 84.5"> - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg index db61ddadf..fa876a30d 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus.svg @@ -5,45 +5,45 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - + - + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg index fa71913ef..533ef9bba 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_sequences.svg @@ -1,69 +1,69 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg index a36396ded..79a8d0e22 100644 --- a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_1.svg @@ -5,65 +5,65 @@ - + - + - + - + - + - - + + 
testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg index 025e46de5..e1a300e9b 100644 --- a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg @@ -5,65 +5,65 @@ - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg index 5f6c6d33c..54860e3a9 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg +++ b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg @@ -1,90 +1,178 @@ + width="970.0" height="184.6" viewBox="0.0 -184.6 970.0 184.6"> - + - + - + - + - + - + - - - + + + testref - + - - - -1.4 + + + +1.4 - - - -1.2 + + + +1.2 - + - - + + - - + + - + - - + + 1.4 - - + + 1.2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + discards: - - + + 1.1 - - + + 1.3 - - + + 1.5 diff --git a/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg b/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg index aad92c783..50bfa70b5 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_stitch_with_insertion.svg @@ -5,45 +5,45 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - + - + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg index 
b8b7b1e50..21629cb29 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_covered_gap.svg @@ -5,87 +5,87 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.2 + + + +1.2 - - + + - - + + - - + + 1.1 - - + + 1.2 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg index 7e663f73b..7c692ab46 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap.svg @@ -5,39 +5,39 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg index 5db9b1daa..ec3fc4060 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg @@ -1,69 +1,69 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> - + - + - + - + - + - - + + testref - - - -2 + + + +2 - - - -1 + + + +1 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg index 629ad55cc..cb87affb3 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg @@ -5,87 +5,87 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.2 + + + +1.2 - - + + - - + + - - + + 1.1 - - + + 
1.2 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg index bf13e76e8..d0d69d99a 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg @@ -5,91 +5,91 @@ - + - + - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - - -3 + + + +3 - - + + - - + + 1 - - + + - - + + 2 - - + + - - + + 3 diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg index 1a6736275..ea3e51ea3 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg @@ -1,62 +1,150 @@ + width="969.9999999999999" height="164.6" viewBox="0.0 -164.6 969.9999999999999 164.6"> - + - + - + - + - - - + + + testref - + - - - -3 + + + +3 - + - - + + - + - - + + 3 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + discards: - - + + 1 - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg index db61ddadf..fa876a30d 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align.svg @@ -5,45 +5,45 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - + - + - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg 
index 6c44de7d5..87669e97c 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg @@ -1,75 +1,75 @@ + width="970.0" height="115.5" viewBox="0.0 -115.5 970.0 115.5"> - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - + + - + - - + + 1 - - + + - + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg index af7536837..b25ffd359 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg @@ -1,56 +1,144 @@ + width="970.0" height="144.6" viewBox="0.0 -144.6 970.0 144.6"> - + - + - + - + - - - + + + testref - + - - - -2 + + + +2 - + - - + + - + - - + + 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + discards: - - + + 1 diff --git a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg index c3cf7c090..5ee083891 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg @@ -1,56 +1,144 @@ + width="970.0" height="144.6" viewBox="0.0 -144.6 970.0 144.6"> - + - + - + - + - - - + + + testref - + - - - -1 + + + +1 - + - - + + - + - - + + 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + anomaly: - - + + 2 From 
fc89f7e825d092c8f44fa983c90518b20ad57377 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 31 Jan 2024 11:21:22 -0800 Subject: [PATCH 195/485] Contig stitcher: simplify contigs size calculations in visualizer --- micall/core/plot_contigs.py | 31 +++--- ...e_contig_completely_covered_by_another.svg | 102 +++++++++--------- micall/tests/test_contig_stitcher.py | 4 +- 3 files changed, 70 insertions(+), 67 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 66c3fef0e..ab996ef90 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -422,7 +422,8 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: overlap_rightparent_map: Dict[str, str] = {} overlap_lefttake_map: Dict[str, str] = {} overlap_righttake_map: Dict[str, str] = {} - overlap_sibling_map: Dict[str, str] = {} + overlap_left_sibling: Dict[str, str] = {} + overlap_right_sibling: Dict[str, str] = {} combine_left_edge: Dict[str, str] = {} combine_right_edge: Dict[str, str] = {} children_join_points: List[str] = [] @@ -617,8 +618,8 @@ def record_rstrip(result: AlignedContig, original: AlignedContig): overlap_rightparent_map[event.right_remainder.name] = event.right.name overlap_lefttake_map[event.left_remainder.name] = event.left_take.name overlap_righttake_map[event.right_remainder.name] = event.right_take.name - overlap_sibling_map[event.left_remainder.name] = event.right_remainder.name - overlap_sibling_map[event.right_remainder.name] = event.left_remainder.name + overlap_left_sibling[event.left_remainder.name] = event.right_remainder.name + overlap_right_sibling[event.right_remainder.name] = event.left_remainder.name elif isinstance(event, events.Drop): record_bad_contig(event.contig, discarded) record_alive(event.contig) @@ -864,34 +865,30 @@ def get_neighbour(part, lookup): if prev_part is not None: r_st = prev_part.alignment.r_st else: - if part.name in bad_contigs: - start_delta = 0 - else: - start_delta = -1 * 
part.alignment.q_st + start_delta = -1 * part.alignment.q_st r_st = part.alignment.r_st + start_delta if next_part is not None: r_ei = next_part.alignment.r_ei else: - if part.name in bad_contigs: - end_delta = 0 - else: - end_delta = len(part.seq) - 1 - part.alignment.q_ei + end_delta = len(part.seq) - 1 - part.alignment.q_ei r_ei = part.alignment.r_ei + end_delta aligned_size_map[part.name] = (r_st, r_ei) - sibling_name = ([overlap_sibling_map[name] for name in eqv_morphism_graph.get(part.name, [part.name]) if name in overlap_sibling_map] or [""])[0] - sibling = sibling_name and contig_map[sibling_name] - prev_part = get_neighbour(sibling, overlap_lefttake_map) - next_part = get_neighbour(sibling, overlap_righttake_map) + sibling_left_name = ([overlap_left_sibling[name] for name in eqv_morphism_graph.get(part.name, [part.name]) if name in overlap_left_sibling] or [""])[0] + sibling_left = sibling_left_name and contig_map[sibling_left_name] + sibling_right_name = ([overlap_right_sibling[name] for name in eqv_morphism_graph.get(part.name, [part.name]) if name in overlap_right_sibling] or [""])[0] + sibling_right = sibling_right_name and contig_map[sibling_right_name] + prev_part = get_neighbour(sibling_right, overlap_lefttake_map) + next_part = get_neighbour(sibling_left, overlap_righttake_map) - if prev_part is not None and prev_part.alignment.r_ei < part.alignment.r_st and prev_part: + if prev_part is not None: r_st = prev_part.alignment.r_st else: r_st = part.alignment.r_st - if next_part is not None and next_part.alignment.r_st > part.alignment.r_ei and next_part: + if next_part is not None: r_ei = next_part.alignment.r_ei else: r_ei = part.alignment.r_ei diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg index b25ffd359..a33294ba0 100644 --- 
a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg @@ -1,144 +1,150 @@ + width="969.9999999999999" height="144.6" viewBox="0.0 -144.6 969.9999999999999 144.6"> - + - + - + - - + + testref - - - -2 + + + +2 - - + + - - + + + + + + + + 2 - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + + - - + + discards: - - + + 1 diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 24009e3ae..3a7bb7c7c 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -435,14 +435,14 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner, contigs = [ GenotypedContig(name='a', - seq='A' * 20 + 'C' * 20, + seq='M' * 10 + 'A' * 20 + 'C' * 20 + 'O' * 10, ref_name='testref', group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', - seq='A' * 50 + 'C' * 50, + seq='P' * 10 + 'A' * 50 + 'C' * 50 + 'Z' * 10, ref_name='testref', group_ref='testref', ref_seq=ref_seq, From bebe947296f6fbf02709e2dda95a5ef9593895da Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 31 Jan 2024 11:20:39 -0800 Subject: [PATCH 196/485] Contig stitcher: show unaligned parts of discarded contigs --- micall/core/plot_contigs.py | 87 +++++++------------ ...wo_overlapping_and_one_separate_contig.svg | 75 +++++++++------- ...cross_alignment_around_small_insertion.svg | 12 ++- .../test_gap_around_big_insertion.svg | 4 +- .../test_gap_around_small_insertion.svg | 4 +- .../test_stitch_cross_alignment.svg | 12 ++- .../test_stitching_of_identical_contigs.svg | 8 +- ...e_contig_completely_covered_by_another.svg | 10 ++- micall/tests/test_contig_stitcher.py | 19 ++-- 9 files changed, 121 insertions(+), 110 deletions(-) diff --git a/micall/core/plot_contigs.py 
b/micall/core/plot_contigs.py index ab996ef90..5f6e1eed5 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -730,7 +730,7 @@ def copy_from_parent(contig: AlignedContig, parent_name: str) -> None: for contig_name in sorted_roots: set_query_position(contig_name) - def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): + def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map, overlap_xsibling): for parent in edge_table: child_remainder = edge_table[parent] for child_remainder_morph in eqv_morphism_graph.get(child_remainder, [child_remainder]): @@ -741,11 +741,12 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map): for parent_remainder in overlap_xparent_map: if overlap_xparent_map[parent_remainder] == parent_morph: overlap_xtake_map[child_remainder_morph] = overlap_xtake_map[parent_remainder] + overlap_xsibling[child_remainder_morph] = overlap_xsibling[parent_remainder] yield True # Closing `takes` by parents - while list(copy_takes_one_side(combine_right_edge, overlap_lefttake_map, overlap_leftparent_map)): pass - while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map)): pass + while list(copy_takes_one_side(combine_right_edge, overlap_lefttake_map, overlap_leftparent_map, overlap_left_sibling)): pass + while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map, overlap_right_sibling)): pass final_nodes: List[str] = [] final_parts: Dict[str, bool] = {} @@ -916,20 +917,8 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] a_r_ei = f_r_ei return (a_r_st, a_r_ei, f_r_st, f_r_ei) - def get_tracks(group_ref: str, contig_name: str) -> Iterable[Track]: - parts_names = final_children_mapping[contig_name] - parts = [contig_map[name] for name in parts_names] - parts = list(sorted(parts, key=lambda part: part.alignment.r_st if isinstance(part, AlignedContig) else -1)) - for 
prev_part, part, next_part in sliding_window(parts): - if part.name in bad_contigs: - continue - - if not isinstance(part, AlignedContig): - continue - - if part.group_ref != group_ref: - continue - + def get_tracks(parts: Iterable[GenotypedContig]) -> Iterable[Track]: + for part in parts: indexes = name_map[part.name] (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) @@ -939,22 +928,15 @@ def get_tracks(group_ref: str, contig_name: str) -> Iterable[Track]: if a_r_ei > f_r_ei: yield Track(min(a_r_ei, f_r_ei) + position_offset, max(a_r_ei, f_r_ei) + position_offset, color="yellow") - yield Track(f_r_st + position_offset, f_r_ei + position_offset, label=f"{indexes}") - - def get_arrows(group_ref: str, contig_name: str, labels: bool) -> Iterable[Arrow]: - parts = final_children_mapping[contig_name] - for part_name in parts: - part = contig_map[part_name] - - if part.name in bad_contigs: - continue - - if not isinstance(part, AlignedContig): - continue + if isinstance(part, AlignedContig) and part.name not in unaligned: + colour = 'lightgrey' + else: + colour = "yellow" - if part.group_ref != group_ref: - continue + yield Track(f_r_st + position_offset, f_r_ei + position_offset, label=f"{indexes}", color=colour) + def get_arrows(parts: Iterable[GenotypedContig], labels: bool) -> Iterable[Arrow]: + for part in parts: indexes = name_map[part.name] if labels else None height = 20 if labels else 1 elevation = 1 if labels else -20 @@ -964,10 +946,6 @@ def get_arrows(group_ref: str, contig_name: str, labels: bool) -> Iterable[Arrow h=height, label=indexes) - def get_all_arrows(group_ref: str, labels: bool) -> Iterable[Arrow]: - for parent_name in sorted_roots: - yield from get_arrows(group_ref, parent_name, labels) - def make_ray() -> Element: screen_size = (max_position - min_position) + position_offset / 2 single_size = 0.02 * screen_size @@ -1105,7 +1083,14 @@ def add_section(title: str) -> None: # Arrows # ########## - ref_arrows = 
list(get_all_arrows(group_ref, labels=True)) + ref_arrows: List[Arrow] = [] + for parent_name in sorted_roots: + parts_names = final_children_mapping[parent_name] + parts_names = [name for name in parts_names if name not in bad_contigs] + parts = [contig_map[name] for name in parts_names] + parts = [part for part in parts if part.group_ref == group_ref] + ref_arrows.extend(get_arrows(parts, labels=True)) + if ref_arrows: figure.add(ArrowGroup(ref_arrows)) @@ -1114,12 +1099,13 @@ def add_section(title: str) -> None: ########### for parent_name in sorted_roots: - arrows = list(get_arrows(group_ref, parent_name, labels=False)) - if arrows: - figure.add(ArrowGroup(arrows)) - parts = list(get_tracks(group_ref, parent_name)) + parts_names = final_children_mapping[parent_name] + parts_names = [name for name in parts_names if name not in bad_contigs] + parts = [contig_map[name] for name in parts_names] + parts = [part for part in parts if part.group_ref == group_ref] if parts: - figure.add(Multitrack(parts)) + figure.add(ArrowGroup(list(get_arrows(parts, labels=False)))) + figure.add(Multitrack(list(get_tracks(parts)))) ############# # Discarded # @@ -1128,19 +1114,12 @@ def add_section(title: str) -> None: if discarded: add_section("discards:") for parent_name in sorted_roots: - contigs = final_children_mapping.get(parent_name, []) - for contig_name in contigs: - if contig_name not in discarded: - continue - - contig = contig_map[contig_name] - (r_st, r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) - name = name_map.get(contig_name, contig_name) - if isinstance(contig, AlignedContig) and contig.name not in unaligned: - colour = 'lightgrey' - else: - colour = "yellow" - figure.add(Track(f_r_st + position_offset, f_r_ei + position_offset, label=name, color=colour)) + parts_names = final_children_mapping[parent_name] + parts_names = [name for name in parts_names if name in discarded] + parts = [contig_map[name] for name in parts_names] + parts = [part for part in 
parts if part.group_ref == group_ref] + for part in parts: + figure.add(Multitrack(list(get_tracks([part])))) ############# # Anomalies # diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg index e44db239f..de65a4f76 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg @@ -5,91 +5,100 @@ - + - + - + - + - + - + - + - - + + testref - - - -1 + + + +1 - - - -2 + + + +2 - - - -3 + + + +3 - - + + - - + + + + + 1 - - + + - - + + + + + 2 - - + + - - + + + + + 3 diff --git a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg index fc5cbd75e..bd2166947 100644 --- a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg @@ -184,22 +184,28 @@ discards: - + + 1.5 - + + + 1.1 - + + + 1.3 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index 9851a4607..9525f81b8 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -190,10 +190,12 @@ discards: - + + 1.2 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg index 82bb9a3a7..ff9b9f9db 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ 
b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -190,10 +190,12 @@ discards: - + + 1.2 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg index 54860e3a9..18415cca0 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg +++ b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg @@ -158,22 +158,28 @@ discards: - + + 1.1 - + + + 1.3 - + + + 1.5 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg index ea3e51ea3..52229e76d 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_identical_contigs.svg @@ -136,16 +136,20 @@ discards: - + + 1 - + + + 2 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg index a33294ba0..d7ed69a7a 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_when_one_contig_completely_covered_by_another.svg @@ -142,10 +142,18 @@ discards: - + + + + + + + + 1 + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 3a7bb7c7c..8c49743e5 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -167,25 +167,25 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner, v def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_aligner, visualizer): # Scenario: Two overlapping contigs are stitched together, 
the non-overlapping is kept separate. - ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + ref_seq = 'Z' * 5 + 'A' * 100 + 'C' * 100 + 'T' * 100 + 'Y' * 5 contigs = [ GenotypedContig(name='a', - seq='A' * 50 + 'C' * 20, + seq='M' * 5 + 'A' * 50 + 'C' * 20 + 'J' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='b', - seq='A' * 20 + 'C' * 50, + seq='Q' * 5 + 'A' * 20 + 'C' * 50 + 'I' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, match_fraction=0.5, ), GenotypedContig(name='c', - seq='C' * 20 + 'T' * 50, + seq='N' * 5 + 'C' * 20 + 'T' * 50 + 'H' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -193,14 +193,9 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_ali ), ] - results = list(stitch_contigs(contigs)) - assert len(results) == 2 - - assert 100 == len(results[0].seq) - assert results[0].seq == 'A' * 50 + 'C' * 50 - - assert results[1].seq == contigs[2].seq - + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == contigs[0].seq.rstrip('J') + 'C' * 30 + contigs[2].seq.lstrip('N') assert len(visualizer().elements) > len(contigs) From cd7d828f84c3f305dd38d6c02193463c9e5ca490 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 1 Feb 2024 16:21:32 -0800 Subject: [PATCH 197/485] Contig stitcher: fix handling of reverse-complement alignments --- micall/core/contig_stitcher.py | 35 +++-- micall/core/plot_contigs.py | 4 +- .../test_forward_and_reverse_match.svg | 117 ++++++++++++++ ...st_multiple_reverse_complement_matches.svg | 51 ++++++ ...everse_complement_matches_out_of_order.svg | 143 +++++++++++++++++ .../test_reverse_complement_match.svg | 45 ++++++ ..._reverse_complement_match_with_padding.svg | 51 ++++++ micall/tests/test_contig_stitcher.py | 145 +++++++++++++++++- micall/utils/contig_stitcher_events.py | 23 ++- 9 files changed, 592 insertions(+), 22 deletions(-) create mode 100644 
micall/tests/data/stitcher_plots/test_forward_and_reverse_match.svg create mode 100644 micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches.svg create mode 100644 micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches_out_of_order.svg create mode 100644 micall/tests/data/stitcher_plots/test_reverse_complement_match.svg create mode 100644 micall/tests/data/stitcher_plots/test_reverse_complement_match_with_padding.svg diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9dfd2bdd4..b833d00f6 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -241,38 +241,51 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: min(x.q_st, x.q_en - 1), max(x.q_st, x.q_en - 1)), "forward" if x.strand == 1 else "reverse") for x in alignments] - connected = connect_cigar_hits([hit for hit, strand in hits_array]) if hits_array else [] - if not connected: + for i, (hit, strand) in enumerate(hits_array): + logger.debug("Part %r of contig %r aligned at %s%s.", + i, contig.name, hit, " (rev)" if strand == "reverse" else "") + context.get().emit(events.InitialHit(contig, hit, strand)) + + if not hits_array: logger.debug("Contig %r not aligned - backend's choice.", contig.name) context.get().emit(events.ZeroHits(contig)) yield contig return - if len(set(map(lambda p: p[1], hits_array))) > 1: + if len(set(strand for hit, strand in hits_array)) > 1: logger.debug("Discarding contig %r because it aligned both in forward and reverse sense.", contig.name) context.get().emit(events.StrandConflict(contig)) yield contig return - logger.debug("Contig %r produced %s aligner hits. 
After connecting them, the number became %s.", - contig.name, len(hits_array), len(connected)) - context.get().emit(events.HitNumber(contig, hits_array, connected)) - strand = hits_array[0][1] if strand == "reverse": - rc = str(Seq(contig.seq).reverse_complement()) + rc = str(Seq.Seq(contig.seq).reverse_complement()) new_contig = replace(contig, seq=rc) + contig = new_contig + hits_array = [(replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1), strand) + for hit, strand in hits_array] + logger.debug("Reverse complemented contig %r.", contig.name) context.get().emit(events.ReverseComplement(contig, new_contig)) - contig = new_contig + + for i, (hit, strand) in enumerate(hits_array): + logger.debug("Part %r of contig %r reverse-aligned at %s%s.", + i, contig.name, hit, " (rev)" if strand == "reverse" else "") + context.get().emit(events.InitialHit(contig, hit, strand)) + + connected = connect_cigar_hits([hit for hit, strand in hits_array]) if hits_array else [] + logger.debug("Contig %r produced %s aligner hits. 
After connecting them, the number became %s.", + contig.name, len(hits_array), len(connected)) + context.get().emit(events.HitNumber(contig, hits_array, connected)) for i, single_hit in enumerate(connected): query = replace(contig, name=context.get().generate_new_name()) part = AlignedContig.make(query, single_hit, strand) - logger.debug("Part %r of contig %r aligned as %r at %s%s.", i, contig.name, + logger.debug("Part %r of contig %r re-aligned as %r at %s%s.", i, contig.name, part.name,part.alignment, " (rev)" if strand == "reverse" else "") - context.get().emit(events.Hit(contig, part, i)) + context.get().emit(events.ConnectedHit(contig, part, i)) yield part diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 5f6e1eed5..4af8bab41 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -584,7 +584,7 @@ def record_rstrip(result: AlignedContig, original: AlignedContig): elif isinstance(event, events.Intro): record_contig(event.contig, []) record_alive(event.contig) - elif isinstance(event, events.Hit): + elif isinstance(event, events.ConnectedHit): record_contig(event.part, [event.contig]) record_alive(event.part) elif isinstance(event, events.NoRef): @@ -640,7 +640,7 @@ def record_rstrip(result: AlignedContig, original: AlignedContig): if event.contigs: combine_left_edge[event.result.name] = event.contigs[0].name combine_right_edge[event.result.name] = event.contigs[-1].name - elif isinstance(event, (events.IgnoreGap, events.NoOverlap)): + elif isinstance(event, (events.IgnoreGap, events.NoOverlap, events.InitialHit)): pass else: x: NoReturn = event diff --git a/micall/tests/data/stitcher_plots/test_forward_and_reverse_match.svg b/micall/tests/data/stitcher_plots/test_forward_and_reverse_match.svg new file mode 100644 index 000000000..bb3a0b700 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_forward_and_reverse_match.svg @@ -0,0 +1,117 @@ + + + + + + + + + + + + +testref + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +anomaly: + + + + + +1 + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches.svg b/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches.svg new file mode 100644 index 000000000..e180d9f76 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches.svg @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches_out_of_order.svg b/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches_out_of_order.svg new file mode 100644 index 000000000..aac68b6dd --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches_out_of_order.svg @@ -0,0 +1,143 @@ + + + + + + + + + + + + +testref + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + + +1.3 + + + + + + + +1.4 + + + + + + + +1.1 + + + + + + + +1.2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_reverse_complement_match.svg b/micall/tests/data/stitcher_plots/test_reverse_complement_match.svg new file mode 100644 index 000000000..2efb6d7ee --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_reverse_complement_match.svg @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_reverse_complement_match_with_padding.svg b/micall/tests/data/stitcher_plots/test_reverse_complement_match_with_padding.svg new 
file mode 100644 index 000000000..de19e9502 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_reverse_complement_match_with_padding.svg @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 8c49743e5..92a5536c6 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -9,7 +9,7 @@ from micall.core.plot_contigs import plot_stitcher_coverage from micall.tests.utils import MockAligner, fixed_random_seed from micall.tests.test_denovo import check_hcv_db # activates the fixture - +from micall.tests.test_remap import load_projects # activates the "projects" fixture logging.getLogger("micall.core.contig_stitcher").setLevel(logging.DEBUG) logging.getLogger("micall.core.plot_contigs").setLevel(logging.DEBUG) @@ -883,6 +883,149 @@ def test_cross_alignment_around_small_insertion(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) +def test_reverse_complement_match(projects, visualizer): + # Scenario: Single contig is aligned in the reverse strand. + + from mappy import revcomp + + hxb2_name = 'HIV1-B-FR-K03455-seed' + ref = projects.getReference(hxb2_name) + ref_part = ref[2000:2200] + seq = revcomp(ref_part) + + contigs = [ + GenotypedContig(name='a', + seq=seq, + ref_name='testref', + group_ref='testref', + ref_seq=ref, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == ref_part + assert len(visualizer().elements) > len(contigs) + + +def test_reverse_complement_match_with_padding(projects, visualizer): + # Scenario: Single contig is aligned in the reverse strand. 
+ + from mappy import revcomp + + hxb2_name = 'HIV1-B-FR-K03455-seed' + ref = projects.getReference(hxb2_name) + ref_part = 'T' * 24 + ref[2000:2200] + 'G' * 27 + seq = revcomp(ref_part) + + contigs = [ + GenotypedContig(name='a', + seq=seq, + ref_name='testref', + group_ref='testref', + ref_seq=ref, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == ref_part + assert len(results[0].lstrip().seq) == len(ref_part) - 24 + assert len(results[0].rstrip().seq) == len(ref_part) - 27 + assert results[0].rstrip().seq == ref_part[:-27] # 27 Gs on the right + assert results[0].lstrip().seq == ref_part[24:] # 24 Ts on the left + assert len(visualizer().elements) > len(contigs) + + +def test_multiple_reverse_complement_matches(projects, visualizer): + # Scenario: Single contig is aligned in the reverse strand in multiple places. + + from mappy import revcomp + + hxb2_name = 'HIV1-B-FR-K03455-seed' + ref = projects.getReference(hxb2_name) + ref_part = 'T' * 24 + ref[2000:2600] + 'A' * 9 + ref[3000:3600] + 'T' * 9 + ref[4000:4600] + 'G' * 27 + seq = revcomp(ref_part) + + contigs = [ + GenotypedContig(name='a', + seq=seq, + ref_name='testref', + group_ref='testref', + ref_seq=ref, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert len(results[0].seq) == len(ref_part) + assert results[0].seq == ref_part + assert len(results[0].lstrip().seq) == len(ref_part) - 24 + assert len(results[0].rstrip().seq) == len(ref_part) - 27 + assert results[0].lstrip().seq == ref_part[24:] + assert results[0].rstrip().seq == ref_part[:-27] + + assert len(visualizer().elements) > len(contigs) + + +def test_multiple_reverse_complement_matches_out_of_order(projects, visualizer): + # Scenario: Single contig is aligned in the reverse strand in multiple places, producing an out of order alignment. 
+ + from mappy import revcomp + + hxb2_name = 'HIV1-B-FR-K03455-seed' + ref = projects.getReference(hxb2_name) + seq = 'A' * 9 + revcomp(ref[2000:2300]) + 'A' * 9 + revcomp(ref[3000:3300]) + 'T' * 9 + revcomp(ref[4000:4300]) + 'G' * 27 + + contigs = [ + GenotypedContig(name='a', + seq=seq, + ref_name='testref', + group_ref='testref', + ref_seq=ref, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 3 + + # Note how parts are stripped because aligned out of order + assert results[0].seq == ref[2000:2300] + assert results[1].seq == ref[3000:3300] + assert results[2].seq == ref[4000:4300] + + assert len(visualizer().elements) > len(contigs) + + +def test_forward_and_reverse_match(projects, visualizer): + # Scenario: Single contig is aligned in both strands. + + from mappy import revcomp + + hxb2_name = 'HIV1-B-FR-K03455-seed' + ref = projects.getReference(hxb2_name) + seq = ref[1000:1100] + revcomp(ref[2000:2200]) + + contigs = [ + GenotypedContig(name='a', + seq=seq, + ref_name='testref', + group_ref='testref', + ref_seq=ref, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == seq + assert len(visualizer().elements) > len(contigs) + + def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index fccfba40d..99ac62256 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -42,30 +42,37 @@ class NoRef: @dataclass(frozen=True) -class ZeroHits: +class InitialHit: contig: 'GenotypedContig' + hit: CigarHit + strand: Literal["forward", "reverse"] @dataclass(frozen=True) -class StrandConflict: +class ZeroHits: contig: 'GenotypedContig' @dataclass(frozen=True) -class HitNumber: +class StrandConflict: contig: 
'GenotypedContig' - initial: List[Tuple[CigarHit, Literal["reverse", "forward"]]] - connected: List[CigarHit] -@dataclass(frozen=True) +@dataclass class ReverseComplement: contig: 'GenotypedContig' result: 'GenotypedContig' @dataclass(frozen=True) -class Hit: +class HitNumber: + contig: 'GenotypedContig' + initial: List[Tuple[CigarHit, Literal["reverse", "forward"]]] + connected: List[CigarHit] + + +@dataclass(frozen=True) +class ConnectedHit: contig: 'GenotypedContig' part: 'AlignedContig' index: int @@ -147,6 +154,6 @@ class FinalCombine: result: 'AlignedContig' -AlignmentEvent = Union[NoRef, ZeroHits, StrandConflict, HitNumber, ReverseComplement, Hit] +AlignmentEvent = Union[NoRef, InitialHit, ZeroHits, StrandConflict, ReverseComplement, HitNumber, ConnectedHit] ModifyEvent = Union[LStrip, RStrip] EventType = Union[Cut, ModifyEvent, Munge, Combine, AlignmentEvent, InitialStrip, StitchCut, Overlap, NoOverlap, Stitch, Drop, IgnoreGap, SplitGap, Intro, FinalCombine] From 8012f12145232b6a1946cda98b342b07a840172f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 1 Feb 2024 08:36:48 -0800 Subject: [PATCH 198/485] Contig stitcher: do not assume that overlaping contigs align to the same strand --- micall/core/contig_stitcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index b833d00f6..2b0af3211 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -173,7 +173,6 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': reference_delta=0) alignment = self_alignment.connect(other_alignment) - assert self.strand == other.strand ret = AlignedContig.make(query=query, alignment=alignment, strand=self.strand) logger.debug("Munged contigs %r at %s with %r at %s resulting in %r at %s.", self.name, self.alignment, other.name, other.alignment, ret.name, ret.alignment) From 77df8c2cc2f5e80627f7e2ccd2fcc3e34093d664 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 1 
Feb 2024 09:05:11 -0800 Subject: [PATCH 199/485] Contig stitcher: fix test input name --- .../{exact_parts_contigs.csv => exact_parts_contigs.fasta} | 0 micall/tests/test_contig_stitcher.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename micall/tests/data/{exact_parts_contigs.csv => exact_parts_contigs.fasta} (100%) diff --git a/micall/tests/data/exact_parts_contigs.csv b/micall/tests/data/exact_parts_contigs.fasta similarity index 100% rename from micall/tests/data/exact_parts_contigs.csv rename to micall/tests/data/exact_parts_contigs.fasta diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 92a5536c6..26fc2679c 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1028,7 +1028,7 @@ def test_forward_and_reverse_match(projects, visualizer): def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) - contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") + contigs = os.path.join(pwd, "data", "exact_parts_contigs.fasta") stitched_contigs = os.path.join(tmp_path, "stitched.csv") stitcher.main([contigs, stitched_contigs]) @@ -1048,7 +1048,7 @@ def test_main_invocation(exact_aligner, tmp_path, hcv_db): def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) - contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") + contigs = os.path.join(pwd, "data", "exact_parts_contigs.fasta") stitched_contigs = os.path.join(tmp_path, "stitched.csv") plot = os.path.join(tmp_path, "exact_parts_contigs.plot.svg") stitcher.main([contigs, stitched_contigs, "--debug", "--plot", plot]) From c6e1dcf7aae4ff7a13343e17dfe4475cc18436c5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 1 Feb 2024 16:12:39 -0800 Subject: [PATCH 200/485] Contig stitcher: optimize transitive closure calculation --- micall/core/plot_contigs.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) 
diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 4af8bab41..3a4f23bf3 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -454,25 +454,19 @@ def remove_transitive_edges(graph): ret[parent] = lst return ret - def get_all_ancestors(recur, lst, graph, ancestor_name): - if ancestor_name not in recur: - recur = recur.copy() - recur.add(ancestor_name) - - if ancestor_name not in lst: - lst.append(ancestor_name) - - existing_ancestors = graph.get(ancestor_name, []) - for existing in existing_ancestors: - get_all_ancestors(recur, lst, graph, existing) + def get_transitive_children(recur, lst, graph, current): + for child in graph.get(current, []): + if child not in recur: + recur.add(child) + lst.append(child) + get_transitive_children(recur, lst, graph, child) def transitive_closure(graph): ret = {} - for parent, children in graph.items(): - lst = [] - for child in children: - get_all_ancestors(set(), lst, graph, child) - ret[parent] = lst + for parent in graph: + children = [] + get_transitive_children(set(), children, graph, parent) + ret[parent] = children return ret def copy_graph(graph): From 58aa1f92a0830ab2e8b1a8691ee2cd1c4badaac8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 1 Feb 2024 16:20:09 -0800 Subject: [PATCH 201/485] Contig stitcher: make sure to draw rc alignments correctly --- micall/core/plot_contigs.py | 6 ++++++ .../test_multiple_reverse_complement_matches.svg | 8 ++++---- .../data/stitcher_plots/test_reverse_complement_match.svg | 8 ++++---- .../test_reverse_complement_match_with_padding.svg | 8 ++++---- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 3a4f23bf3..4e812b60f 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -935,6 +935,12 @@ def get_arrows(parts: Iterable[GenotypedContig], labels: bool) -> Iterable[Arrow height = 20 if labels else 1 elevation = 1 if labels else -20 
(a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) + + if isinstance(part, AlignedContig) and part.strand == "reverse": + tmp = a_r_st + a_r_st = a_r_ei + a_r_ei = tmp + yield Arrow(a_r_st + position_offset, a_r_ei + position_offset, elevation=elevation, h=height, diff --git a/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches.svg b/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches.svg index e180d9f76..17bf82242 100644 --- a/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches.svg +++ b/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches.svg @@ -22,16 +22,16 @@ - + - + 1 - - + + diff --git a/micall/tests/data/stitcher_plots/test_reverse_complement_match.svg b/micall/tests/data/stitcher_plots/test_reverse_complement_match.svg index 2efb6d7ee..1e61ce1b3 100644 --- a/micall/tests/data/stitcher_plots/test_reverse_complement_match.svg +++ b/micall/tests/data/stitcher_plots/test_reverse_complement_match.svg @@ -22,16 +22,16 @@ - + - + 1 - - + + diff --git a/micall/tests/data/stitcher_plots/test_reverse_complement_match_with_padding.svg b/micall/tests/data/stitcher_plots/test_reverse_complement_match_with_padding.svg index de19e9502..cba7989be 100644 --- a/micall/tests/data/stitcher_plots/test_reverse_complement_match_with_padding.svg +++ b/micall/tests/data/stitcher_plots/test_reverse_complement_match_with_padding.svg @@ -22,16 +22,16 @@ - + - + 1 - - + + From 9397b29e4135d5e820bf32d5b661c9971df38634 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 1 Feb 2024 17:16:53 -0800 Subject: [PATCH 202/485] Contig stitcher: remove shortcut handling of rc alignments --- micall/core/contig_stitcher.py | 2 +- micall/core/plot_contigs.py | 4 +- ...everse_complement_matches_out_of_order.svg | 138 +++--------------- micall/tests/test_contig_stitcher.py | 17 ++- 4 files changed, 36 insertions(+), 125 deletions(-) diff --git a/micall/core/contig_stitcher.py 
b/micall/core/contig_stitcher.py index 2b0af3211..f7c469bb9 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -685,7 +685,7 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo consensus_parts: Dict[GroupRef, List[AlignedContig]] = defaultdict(list) for contig in contigs: - if isinstance(contig, AlignedContig) and contig.strand == "forward": + if isinstance(contig, AlignedContig): consensus_parts[contig.group_ref].append(contig) else: yield contig diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 4e812b60f..88812ea78 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -625,6 +625,8 @@ def record_rstrip(result: AlignedContig, original: AlignedContig): elif isinstance(event, events.Stitch): record_contig(event.result, [event.left, event.right]) record_alive(event.result) + elif isinstance(event, events.NoOverlap): + record_alive(event.contig) elif isinstance(event, events.Cut): record_contig(event.left, [event.original]) record_contig(event.right, [event.original]) @@ -634,7 +636,7 @@ def record_rstrip(result: AlignedContig, original: AlignedContig): if event.contigs: combine_left_edge[event.result.name] = event.contigs[0].name combine_right_edge[event.result.name] = event.contigs[-1].name - elif isinstance(event, (events.IgnoreGap, events.NoOverlap, events.InitialHit)): + elif isinstance(event, (events.IgnoreGap, events.InitialHit)): pass else: x: NoReturn = event diff --git a/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches_out_of_order.svg b/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches_out_of_order.svg index aac68b6dd..17bf82242 100644 --- a/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches_out_of_order.svg +++ b/micall/tests/data/stitcher_plots/test_multiple_reverse_complement_matches_out_of_order.svg @@ -1,142 +1,50 @@ + width="970.0" height="84.5" viewBox="0.0 -84.5 
970.0 84.5"> - + - - - - - - -testref - - - - - - - - - - - - - - - - - - - - - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + +testref - - + - - + + + +1 - - - - - - -discards: - - - + - - -1.3 - + + - + - - -1.4 - + - - - - -1.1 + - - - - - -1.2 + + +1 diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 26fc2679c..f9666e7bc 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -978,7 +978,8 @@ def test_multiple_reverse_complement_matches_out_of_order(projects, visualizer): hxb2_name = 'HIV1-B-FR-K03455-seed' ref = projects.getReference(hxb2_name) - seq = 'A' * 9 + revcomp(ref[2000:2300]) + 'A' * 9 + revcomp(ref[3000:3300]) + 'T' * 9 + revcomp(ref[4000:4300]) + 'G' * 27 + ref_part = 'T' * 24 + ref[2000:2600] + 'A' * 9 + ref[3000:3600] + 'T' * 9 + ref[4000:4600] + 'G' * 27 + seq = revcomp(ref_part) contigs = [ GenotypedContig(name='a', @@ -991,13 +992,13 @@ def test_multiple_reverse_complement_matches_out_of_order(projects, visualizer): ] results = list(stitch_consensus(contigs)) - assert len(results) == 3 - - # Note how parts are stripped because aligned out of order - assert results[0].seq == ref[2000:2300] - assert results[1].seq == ref[3000:3300] - assert results[2].seq == ref[4000:4300] - + assert len(results) == 1 + assert len(results[0].seq) == len(ref_part) + assert results[0].seq == ref_part + assert len(results[0].lstrip().seq) == len(ref_part) - 24 + assert len(results[0].rstrip().seq) == len(ref_part) - 27 + assert results[0].lstrip().seq == ref_part[24:] + assert results[0].rstrip().seq == ref_part[:-27] assert len(visualizer().elements) > len(contigs) From d08c22125ef4665665d231a9a7de1776a7366fc8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 2 Feb 2024 08:52:50 -0800 Subject: [PATCH 203/485] Contig stitcher: add more tests with the real aligner --- 
...test_correct_processing_complex_nogaps.svg | 36 ++-- ...titching_of_one_normal_and_one_unknown.svg | 145 +++++++++++++ ...two_partially_overlapping_contigs_real.svg | 187 ++++++++++++++++ ...apping_contigs_with_insignificant_gaps.svg | 190 +++++++++++++++++ ...gs_completely_covered_by_other_contigs.svg | 199 ++++++++++++++++++ micall/tests/test_contig_stitcher.py | 158 +++++++++++++- 6 files changed, 893 insertions(+), 22 deletions(-) create mode 100644 micall/tests/data/stitcher_plots/test_correct_stitching_of_one_normal_and_one_unknown.svg create mode 100644 micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real.svg create mode 100644 micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps.svg create mode 100644 micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg index a0f1d5cd9..468c24c46 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg @@ -35,7 +35,7 @@ -testref-2 +testref-1 @@ -43,25 +43,25 @@ -1.2 +1 -2.2 +3 -3.2 +5 -4.2 +7 @@ -74,7 +74,7 @@ -1.2 +1 @@ -88,7 +88,7 @@ -2.2 +3 @@ -102,7 +102,7 @@ -3.2 +5 @@ -116,7 +116,7 @@ -4.2 +7 @@ -152,7 +152,7 @@ -testref-1 +testref-2 @@ -160,25 +160,25 @@ -1.1 +2 -2.1 +4 -3.1 +6 -4.1 +8 @@ -191,7 +191,7 @@ -1.1 +2 @@ -205,7 +205,7 @@ -2.1 +4 @@ -219,7 +219,7 @@ -3.1 +6 @@ -233,7 +233,7 @@ -4.1 +8 diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_one_normal_and_one_unknown.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_one_normal_and_one_unknown.svg new file mode 100644 index 000000000..3da1ce91b --- /dev/null +++ 
b/micall/tests/data/stitcher_plots/test_correct_stitching_of_one_normal_and_one_unknown.svg @@ -0,0 +1,145 @@ + + + + + + + + + + + + + + + + + + +testref + + + + + + + +1 + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +unknown: + + + + + +2 + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real.svg new file mode 100644 index 000000000..90b68759e --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real.svg @@ -0,0 +1,187 @@ + + + + + + + + +5' LTR + + + + + +gag + + + + + +vif + + + + + +tat + + + + + +nef + + + + + + + +tat + + + + + +vpu + + + + + +rev + + + + + +3' LTR + + + + + + + +pol + + + + + +vpr + + + + + +rev + + + + + +env + + + + + + + +PR + + + + + +RT + + + + + +INT + + + + + +V3 + + + + + +GP41 + + + + + + + + + + + + + + + + + + + + + + + +HIV1-B-ZA-KP109515-seed + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps.svg new file mode 100644 index 000000000..6f1fcd641 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps.svg @@ -0,0 +1,190 @@ + + + + + + + + +5' LTR + + + + + +gag + + + + + +vif + + + + + +tat + + + + + +nef + + + + + + + +tat + + + + + +vpu + + + + + +rev + + + + + +3' LTR + + + + + + + +pol + + + + + +vpr + + + + + +rev 
+ + + + + +env + + + + + + + +PR + + + + + +RT + + + + + +INT + + + + + +V3 + + + + + +GP41 + + + + + + + + + + + + + + + + + + + + + + + +HIV1-B-FR-K03455-seed + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg new file mode 100644 index 000000000..1441ba7d0 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg @@ -0,0 +1,199 @@ + + + + + + + + + + + + + + + + + + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + +2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + + + + + + + + +3 + + + + + + + + + + + + + +4 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index f9666e7bc..971933d8a 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -164,6 +164,79 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner, v assert len(visualizer().elements) > len(contigs) +def test_correct_stitching_of_two_partially_overlapping_contigs_real(projects, visualizer): + # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. Not using exact aligner this time. 
+ + from mappy import revcomp + + ref_name = 'HIV1-B-ZA-KP109515-seed' + ref = projects.getReference(ref_name) + + contigs = [ + GenotypedContig(name='a', + seq=ref[1700:2000], + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref, + match_fraction=0.5, + ), + GenotypedContig(name='b', + seq=ref[1900:2200], + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref, + match_fraction=0.5, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 1 + + result = results[0] + + assert 500 == len(result.seq) + assert result.seq == ref[1700:2200] + + assert len(visualizer().elements) > len(contigs) + + +def test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps(projects, visualizer): + # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence, with insignificant gaps. + + from mappy import revcomp + + hxb2_name = 'HIV1-B-FR-K03455-seed' + ref = projects.getReference(hxb2_name) + gap_ref = ''.join(c if i % 30 > 2 else '' for i, c in enumerate(ref)) + + contigs = [ + GenotypedContig(name='a', + seq=gap_ref[1700:2000], + ref_name=hxb2_name, + group_ref=hxb2_name, + ref_seq=ref, + match_fraction=0.5, + ), + GenotypedContig(name='b', + seq=gap_ref[1900:2200], + ref_name=hxb2_name, + group_ref=hxb2_name, + ref_seq=ref, + match_fraction=0.5, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 1 + + result = results[0] + + assert 500 == len(result.seq) + assert result.seq == gap_ref[1700:2200] + + assert len(visualizer().elements) > len(contigs) + + def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_aligner, visualizer): # Scenario: Two overlapping contigs are stitched together, the non-overlapping is kept separate. 
@@ -373,28 +446,28 @@ def test_correct_processing_complex_nogaps(exact_aligner, visualizer): ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 contigs = [[ - GenotypedContig(name='a', + GenotypedContig(name='a' + ref_name, seq='A' * 50 + 'C' * 20, ref_name=ref_name, group_ref=ref_name, ref_seq=ref_seq, match_fraction=0.5, ), - GenotypedContig(name='b', + GenotypedContig(name='b' + ref_name, seq='A' * 20 + 'C' * 50, ref_name=ref_name, group_ref=ref_name, ref_seq=ref_seq, match_fraction=0.5, ), - GenotypedContig(name='c', + GenotypedContig(name='c' + ref_name, seq='C' * 70 + 'T' * 20, ref_name=ref_name, group_ref=ref_name, ref_seq=ref_seq, match_fraction=0.5, ), - GenotypedContig(name='d', + GenotypedContig(name='d' + ref_name, seq='T' * 20 + 'G' * 50, ref_name=ref_name, group_ref=ref_name, @@ -455,6 +528,48 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner, assert len(visualizer().elements) > len(contigs) +def test_stitching_when_multiple_contigs_completely_covered_by_other_contigs(exact_aligner, visualizer): + # Scenario: If two contigs are completely covered by another two contigs. 
+ + ref_seq = 'A' * 100 + 'B' * 100 + 'C' * 100 + 'D' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='M' * 10 + 'A' * 20 + 'B' * 100 + 'C' * 20 + 'O' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='b', + seq='K' * 10 + 'B' * 20 + 'C' * 100 + 'D' * 20 + 'J' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='c', + seq='I' * 10 + 'B' * 60 + 'C' * 80 + 'P' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='d', + seq='Z' * 10 + 'B' * 80 + 'C' * 60 + 'F' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 1 + + assert len(visualizer().elements) > len(contigs) + + def test_stitching_contig_with_big_noncovered_gap(exact_aligner, visualizer): # Scenario: One contig has a big gap, which is however not covered by anything else. @@ -1027,6 +1142,40 @@ def test_forward_and_reverse_match(projects, visualizer): assert len(visualizer().elements) > len(contigs) +def test_correct_stitching_of_one_normal_and_one_unknown(exact_aligner, visualizer): + # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. 
+ + ref_seq = 'A' * 100 + 'C' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='A' * 50 + 'C' * 20, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='b', + seq='A' * 20 + 'C' * 50, + ref_name=None, + group_ref=None, + ref_seq=None, + match_fraction=0.5, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 2 + + assert 70 == len(results[0].seq) + assert 70 == len(results[1].seq) + + assert {result.seq for result in results} \ + == {contig.seq for contig in contigs} + + assert len(visualizer().elements) > len(contigs) + + def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) contigs = os.path.join(pwd, "data", "exact_parts_contigs.fasta") @@ -1268,6 +1417,7 @@ def generate_random_string_pair(length): ("aaaaaaaa", "aaaaabbb", [0.6, 0.68, 0.7, 0.68, 0.6, 0.29, 0.19, 0.13]), ("aaaaaaaa", "aaabbaaa", [0.56, 0.63, 0.62, 0.39, 0.39, 0.62, 0.63, 0.56]), ("aaaaa", "bbbbb", [0] * 5), + ("", "", []), ] ) def test_concordance_simple(left, right, expected): From 4108bdfaa7ac0c768f3fbd18a20350f7beefdaad Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 2 Feb 2024 09:01:05 -0800 Subject: [PATCH 204/485] Contig stitcher: unify handling of v5s types of bad contigs --- micall/core/plot_contigs.py | 52 +++++-------------- ...titching_of_one_normal_and_one_unknown.svg | 4 +- .../test_forward_and_reverse_match.svg | 4 +- .../test_stitching_with_empty_contigs.svg | 4 +- 4 files changed, 22 insertions(+), 42 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 88812ea78..ac81134f1 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -792,13 +792,6 @@ def is_ancestor(contig_name, other_names): final_children_mapping[parent_name] = children - def overlaps(self, other) -> bool: - def intervals_overlap(x, y): - return x[0] <= y[1] and x[1] >= y[0] - - return 
intervals_overlap((self.alignment.q_st, self.alignment.q_ei), - (other.alignment.q_st, other.alignment.q_ei)) - name_map = {} for i, (parent, children) in enumerate(sorted(final_children_mapping.items(), key=lambda p: p[0])): name_map[parent] = f"{i + 1}" @@ -836,7 +829,7 @@ def get_neighbours(part, lookup): yield contig_map[maybe_name] def get_final_version(contig): - name = reduced_morphism_graph.get(contig.name, [contig.name])[0] # FIXME: why 0??? + [name] = reduced_morphism_graph.get(contig.name, [contig.name]) return contig_map[name] def get_neighbour(part, lookup): @@ -984,8 +977,7 @@ def add_section(title: str) -> None: # Drawing part # ################ - landmarks_path = (Path(__file__).parent.parent / "data" / - "landmark_references.yaml") + landmarks_path = (Path(__file__).parent.parent / "data" / "landmark_references.yaml") landmark_groups = yaml.safe_load(landmarks_path.read_text()) projects = ProjectConfig.loadDefault() figure = Figure() @@ -1130,24 +1122,12 @@ def add_section(title: str) -> None: if anomaly: add_section("anomaly:") for parent_name in sorted_roots: - contigs = final_children_mapping.get(parent_name, []) - for contig_name in contigs: - if contig_name not in anomaly: - continue - - contig = contig_map[contig_name] - (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) - if isinstance(contig, AlignedContig): - colour = "lightgray" - if contig.strand == "reverse": - figure.add(Arrow(a_r_ei + position_offset, a_r_st + position_offset, elevation=-20, h=1)) - else: - figure.add(Arrow(a_r_st + position_offset, a_r_ei + position_offset, elevation=-20, h=1)) - else: - colour = "yellow" - - name = name_map.get(contig_name, contig_name) - figure.add(Track(a_r_st + position_offset, a_r_ei + position_offset, color=colour, label=name)) + parts_names = final_children_mapping[parent_name] + parts_names = [name for name in parts_names if name in anomaly] + parts = [contig_map[name] for name in parts_names] + parts = [part for part in parts 
if part.group_ref == group_ref] + for part in parts: + figure.add(Multitrack(list(get_tracks([part])))) ########### # Unknown # @@ -1156,17 +1136,11 @@ def add_section(title: str) -> None: if unknown: add_section("unknown:") for parent_name in sorted_roots: - contigs = final_children_mapping.get(parent_name, []) - for contig_name in contigs: - if contig_name not in unknown: - continue - - contig = contig_map[contig_name] - r_st = 0 - r_ei = len(contig.seq) - colour = "yellow" - name = name_map.get(contig_name, contig_name) - figure.add(Track(r_st + position_offset, r_ei + position_offset, color=colour, label=name)) + parts_names = final_children_mapping[parent_name] + parts_names = [name for name in parts_names if name in unknown] + parts = [contig_map[name] for name in parts_names] + for part in parts: + figure.add(Multitrack(list(get_tracks([part])))) if not figure.elements: figure.add(Track(0, max_position, label='.', color='none')) diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_one_normal_and_one_unknown.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_one_normal_and_one_unknown.svg index 3da1ce91b..48dbf9a57 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_one_normal_and_one_unknown.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_one_normal_and_one_unknown.svg @@ -136,10 +136,12 @@ unknown: - + + 2 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_forward_and_reverse_match.svg b/micall/tests/data/stitcher_plots/test_forward_and_reverse_match.svg index bb3a0b700..63ad69129 100644 --- a/micall/tests/data/stitcher_plots/test_forward_and_reverse_match.svg +++ b/micall/tests/data/stitcher_plots/test_forward_and_reverse_match.svg @@ -108,10 +108,12 @@ anomaly: - + + 1 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg 
index 5ee083891..4f3fa7839 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_with_empty_contigs.svg @@ -136,10 +136,12 @@ anomaly: - + + 2 + \ No newline at end of file From 417ef9a89a50805007e07b6ecffbded9956d2b0a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 2 Feb 2024 10:43:05 -0800 Subject: [PATCH 205/485] Contig stitcher: fix the issue with overreaching bad_contigs --- micall/core/plot_contigs.py | 2 +- ...test_cross_alignment_around_small_insertion.svg | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index ac81134f1..7972b36d6 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -772,7 +772,7 @@ def is_ancestor(contig_name, other_names): final_nodes.remove(contig_name) for contig_name in final_nodes: - if any(contig_name in transitive_parent_graph.get(bad, []) for bad in bad_contigs): + if any(contig_name in eqv_morphism_graph.get(bad, []) for bad in bad_contigs): continue if any(contig_name in eqv_morphism_graph.get(temp_name, [temp_name]) for temp_name in overlaps_list): diff --git a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg index bd2166947..5808c9040 100644 --- a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg @@ -188,23 +188,23 @@ -1.5 +1.1 - - -1.1 + + +1.3 - - -1.3 + + +1.5 From 1e998af976e2d673e2df07e7151fe3683f90603e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 2 Feb 2024 13:21:22 -0800 Subject: [PATCH 206/485] Contig stitcher: make sure to not double-draw any contigs --- micall/core/plot_contigs.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git 
a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 7972b36d6..49c6ce84e 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -454,6 +454,16 @@ def remove_transitive_edges(graph): ret[parent] = lst return ret + def remove_duplicate_edges(graph): + ret = {} + for parent, children in graph.items(): + lst = [] + for child in children: + if child not in lst: + lst.append(child) + ret[parent] = lst + return ret + def get_transitive_children(recur, lst, graph, current): for child in graph.get(current, []): if child not in recur: @@ -552,7 +562,8 @@ def record_alive(contig: Contig): def record_bad_contig(contig: GenotypedContig, lst: List[str]): complete_contig_map[contig.name] = contig - lst.append(contig.name) + if contig.name not in lst: + lst.append(contig.name) def record_lstrip(result: AlignedContig, original: AlignedContig): lstrip_map[result.name] = original.name @@ -642,7 +653,8 @@ def record_rstrip(result: AlignedContig, original: AlignedContig): x: NoReturn = event raise RuntimeError(f"Unrecognized action or event: {event}") - nodup_parent_graph = remove_transitive_edges(complete_parent_graph) + notransitive_parent_graph = remove_transitive_edges(complete_parent_graph) + nodup_parent_graph = remove_duplicate_edges(notransitive_parent_graph) # Close alive set by parents def extend_alive(contig_name): From 4b4fca411d6e9d362d4b8f63be4ae961bac33f43 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 2 Feb 2024 14:50:50 -0800 Subject: [PATCH 207/485] Contig stitcher: add couple more simple test cases --- ...ially_overlapping_contigs_with_padding.svg | 77 +++++++++++++++++++ micall/tests/test_contig_stitcher.py | 39 +++++++++- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg 
b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg new file mode 100644 index 000000000..298192e39 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + +testref + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 971933d8a..1687a8149 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -164,6 +164,39 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner, v assert len(visualizer().elements) > len(contigs) +def test_correct_stitching_of_two_partially_overlapping_contigs_with_padding(exact_aligner, visualizer): + # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. + + ref_seq = 'A' * 100 + 'C' * 100 + + contigs = [ + GenotypedContig(name='a', + seq='M' * 10 + 'A' * 50 + 'C' * 20 + 'Z' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig(name='b', + seq='J' * 10 + 'A' * 20 + 'C' * 50 + 'N' * 10, + ref_name='testref', + group_ref='testref', + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 1 + + result = results[0] + + assert 120 == len(result.seq) + assert result.seq == 'M' * 10 + 'A' * 50 + 'C' * 50 + 'N' * 10 + + assert len(visualizer().elements) > len(contigs) + + def test_correct_stitching_of_two_partially_overlapping_contigs_real(projects, visualizer): # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. Not using exact aligner this time. 
@@ -1433,13 +1466,17 @@ def test_concordance_simple(left, right, expected): ("a" * 128, "a" * 54 + "b" * 20 + "a" * 54, 28), # two peaks ("a" * 128, "a" * 63 + "b" * 2 + "a" * 63, 32), # two peaks ("a" * 1280, "b" * 640 + "a" * 640, round(1280 * 3 / 4)), + ("a" * 128, "b" * 48 + "a" * 32 + "b" * 48, 64), + ("a" * 128, "b" * 48 + "a" * 15 + "ab" + "a" * 15 + "b" * 48, 48 + 16//2), # two peaks - choosing 1nd + ("a" * 128, "b" * 48 + "a" * 15 + "ba" + "a" * 15 + "b" * 48, 48 + 15 + 16//2), # two peaks - choosing 2nd + ("a" * 128, "b" * 48 + "a" * 15 + "bb" + "a" * 15 + "b" * 48, 48 + 15//2), # two peaks - choosing 1st ] ) def test_concordance_simple_index(left, right, expected): concordance = calculate_concordance(left, right) concordance_d = list(disambiguate_concordance(concordance)) index = max(range(len(concordance)), key=lambda i: concordance_d[i]) - if abs(index - expected) > 3: + if abs(index - expected) > 1: assert index == expected From ad74829dcc40855678cb3631c02495ce97b7eb5c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 5 Feb 2024 12:39:07 -0800 Subject: [PATCH 208/485] Contig stitcher: fix unaligned regions handling in the visualizer Sometimes smaller regions would not show up on the final diagram. 
--- micall/core/plot_contigs.py | 249 ++++++++++-------- ...wo_overlapping_and_one_separate_contig.svg | 144 ++++++++-- ..._overlapping_and_one_separate_contig_2.svg | 144 ++++++++-- ...apping_contigs_with_insignificant_gaps.svg | 246 ++++++++++++----- ...ially_overlapping_contigs_with_padding.svg | 130 ++++++++- ...cross_alignment_around_small_insertion.svg | 62 +++-- .../test_gap_around_big_insertion.svg | 178 +++++++------ .../test_gap_around_small_insertion.svg | 46 ++-- ...onsensus_multiple_overlaping_sequences.svg | 120 ++++++++- .../test_stitch_cross_alignment.svg | 24 +- ...ching_partial_align_multiple_sequences.svg | 130 ++++++++- ...gs_completely_covered_by_other_contigs.svg | 44 ++-- 12 files changed, 1110 insertions(+), 407 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 49c6ce84e..05f8dfa36 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1,5 +1,5 @@ import typing -from typing import Dict, Tuple, List, Set, Iterable, NoReturn, Literal +from typing import Dict, Tuple, List, Set, Iterable, NoReturn, Literal, Union from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType from collections import Counter, defaultdict from csv import DictReader @@ -416,7 +416,7 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: discarded: List[str] = [] unknown: List[str] = [] anomaly: List[str] = [] - unaligned: List[str] = [] + unaligned_map: Dict[str, List[CigarHit]] = {} overlaps_list: List[str] = [] overlap_leftparent_map: Dict[str, str] = {} overlap_rightparent_map: Dict[str, str] = {} @@ -521,32 +521,6 @@ def graph_sum(graph_a, graph_b): def symmetric_closure(graph): return graph_sum(graph, inverse_graph(graph)) - def record_unaligned_parts(original: AlignedContig, q_st: int, r_st: int, length: int): - key = (original.seq, q_st, q_st + length) - if length > 0 and key not in strip_set: - strip_set.add(key) - alignment = 
CigarHit.from_default_alignment(q_st=q_st, q_ei=q_st + length - 1, r_st=r_st, r_ei=r_st-1) - seq = 'A' * alignment.query_length - query = dataclasses.replace(original, name=f"u{len(complete_contig_map)}", seq=seq) - fake_aligned = AlignedContig.make(query, alignment, strand=original.strand) - record_contig(fake_aligned, [original]) - record_bad_contig(fake_aligned, unaligned) - record_alive(fake_aligned) - return fake_aligned - return None - - def record_regular_strip(result: AlignedContig, original: AlignedContig): - length = abs(result.alignment.query_length - original.alignment.query_length) - q_st = original.alignment.q_st - r_st = original.alignment.r_st - return record_unaligned_parts(original, q_st=q_st, r_st=r_st, length=length) - - def record_initial_strip(original: AlignedContig, q_st: int, q_ei: int): - length = q_ei - q_st + 1 - contig = record_unaligned_parts(original, q_st, original.alignment.r_st, length) - if contig: - query_position_map[contig.name] = (q_st, q_ei) - def record_contig(contig: GenotypedContig, parents: List[GenotypedContig]): complete_contig_map[contig.name] = contig if [contig.name] != [parent.name for parent in parents]: @@ -567,15 +541,22 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]): def record_lstrip(result: AlignedContig, original: AlignedContig): lstrip_map[result.name] = original.name - unaligned = record_regular_strip(result, original) - if unaligned: - lstrip_map[unaligned.name] = result.name def record_rstrip(result: AlignedContig, original: AlignedContig): rstrip_map[result.name] = original.name - unaligned = record_regular_strip(result, original) - if unaligned: - rstrip_map[unaligned.name] = result.name + + def hit_to_insertions(contig: GenotypedContig, hit: CigarHit): + yield CigarHit.from_default_alignment(q_st=0, q_ei=hit.q_st - 1, r_st=hit.r_st, r_ei=hit.r_st - 1) + yield from hit.insertions() + yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, r_st=hit.r_ei + 1, 
r_ei=hit.r_ei) + + def hits_to_insertions(contig: GenotypedContig, hits: List[CigarHit]): + for hit in hits: + yield from hit_to_insertions(contig, hit) + + def record_initial_hit(contig: GenotypedContig, hits: List[CigarHit]): + insertions = [gap for gap in hits_to_insertions(contig, hits)] + unaligned_map[contig.name] = insertions for event in logs: if isinstance(event, events.FinalCombine): @@ -605,6 +586,7 @@ def record_rstrip(result: AlignedContig, original: AlignedContig): record_contig(event.result, [event.contig]) record_alive(event.result) elif isinstance(event, events.HitNumber): + record_initial_hit(event.contig, event.connected) record_alive(event.contig) elif isinstance(event, events.Munge): record_contig(event.result, [event.left, event.right]) @@ -615,7 +597,7 @@ def record_rstrip(result: AlignedContig, original: AlignedContig): record_contig(event.result, [event.original]) record_rstrip(event.result, event.original) elif isinstance(event, events.InitialStrip): - record_initial_strip(event.contig, event.q_st, event.q_ei) + pass elif isinstance(event, events.Overlap): overlaps_list.append(event.left_overlap.name) overlaps_list.append(event.right_overlap.name) @@ -671,7 +653,7 @@ def extend_alive(contig_name): parent_graph[contig_name] = nodup_parent_graph[contig_name] contig_map: Dict[str, GenotypedContig] = {k: v for k, v in complete_contig_map.items() if k in alive_set} - bad_contigs = anomaly + discarded + unknown + unaligned + bad_contigs = anomaly + discarded + unknown group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq} children_graph = inverse_graph(parent_graph) transitive_parent_graph = transitive_closure(parent_graph) @@ -708,15 +690,9 @@ def copy_from_parent(contig: AlignedContig, parent_name: str) -> None: current_query_len = abs(current_q_st - current_q_ei) if contig_name in lstrip_map: - if contig_name in unaligned: - query_position_map[contig.name] = (original_q_st - current_query_len 
- 1, original_q_st - 1) - else: - query_position_map[contig.name] = (original_q_ei - current_query_len, original_q_ei) + query_position_map[contig.name] = (original_q_ei - current_query_len, original_q_ei) elif contig_name in rstrip_map: - if contig_name in unaligned: - query_position_map[contig.name] = (original_q_ei + 1, original_q_ei + 1 + current_query_len) - else: - query_position_map[contig.name] = (original_q_st, original_q_st + current_query_len) + query_position_map[contig.name] = (original_q_st, original_q_st + current_query_len) else: query_position_map[contig_name] = query_position_map[parent_name] @@ -804,35 +780,8 @@ def is_ancestor(contig_name, other_names): final_children_mapping[parent_name] = children - name_map = {} - for i, (parent, children) in enumerate(sorted(final_children_mapping.items(), key=lambda p: p[0])): - name_map[parent] = f"{i + 1}" - - unaligned_names = [name for name in children if name in unaligned] - aligned_names = [name for name in children if name not in unaligned] - - todo_names = aligned_names - for contig_name in unaligned_names: - todo_names.append(contig_name) - if contig_name not in discarded: - discarded.append(contig_name) - - todo_names = list(sorted(todo_names, key=lambda name: query_position_map.get(name, (-1, -1)))) - for k, child_name in enumerate(todo_names): - if len(todo_names) > 1: - name_map[child_name] = f"{i + 1}.{k + 1}" - else: - name_map[child_name] = f"{i + 1}" - - for bad_name in bad_contigs: - if bad_name not in children: - if bad_name in transitive_parent_graph \ - and parent in transitive_parent_graph[bad_name]: - k += 1 - name_map[bad_name] = f"{i + 1}.{k + 1}" - - for contig_name, name in name_map.items(): - logger.debug(f"Contig name {contig_name!r} is displayed as {name!r}.") + aligned_size_map: Dict[str, Tuple[int, int]] = {} + full_size_map: Dict[str, Tuple[int, int]] = {} def get_neighbours(part, lookup): for clone in eqv_morphism_graph.get(part.name, [part.name]): @@ -850,8 +799,26 @@ def 
get_neighbour(part, lookup): ret = max(map(get_final_version, lst), key=lambda contig: contig.alignment.ref_length, default=None) return ret - aligned_size_map: Dict[str, Tuple[int, int]] = {} - full_size_map: Dict[str, Tuple[int, int]] = {} + def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int]: + if isinstance(contig, AlignedContig) and contig.alignment.ref_length > 0: + r_st = contig.alignment.r_st + r_ei = contig.alignment.r_ei + if contig.name in aligned_size_map: + a_r_st, a_r_ei = aligned_size_map[contig.name] + else: + a_r_st = r_st + a_r_ei = r_ei + if contig.name in full_size_map: + f_r_st, f_r_ei = full_size_map[contig.name] + else: + f_r_st = r_st - contig.alignment.q_st + f_r_ei = r_ei + (len(contig.seq) - contig.alignment.q_ei) + else: + f_r_st = 0 + f_r_ei = len(contig.seq) + a_r_st = f_r_st + a_r_ei = f_r_ei + return (a_r_st, a_r_ei, f_r_st, f_r_ei) for parent_name in sorted_roots: parts_names = final_children_mapping[parent_name] @@ -897,26 +864,86 @@ def get_neighbour(part, lookup): full_size_map[part.name] = (r_st, r_ei) - def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int]: - if isinstance(contig, AlignedContig) and contig.alignment.ref_length > 0: - r_st = contig.alignment.r_st - r_ei = contig.alignment.r_ei - if contig.name in aligned_size_map: - a_r_st, a_r_ei = aligned_size_map[contig.name] + def carve_gap(gap: CigarHit, aligned_parts: Iterable[AlignedContig]): + for contig in aligned_parts: + (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) + other_coords = query_position_map.get(contig.name, (-1, -2)) + + other_q_st = min(other_coords) - max(0, abs(f_r_st - a_r_st)) + other_q_ei = max(other_coords) + max(0, abs(a_r_ei - f_r_ei)) + + if gap.q_st <= other_q_st and gap.q_ei >= other_q_st: + q_st = gap.q_st + q_ei = other_q_st - 1 + elif gap.q_ei >= other_q_ei and gap.q_ei <= other_q_ei: + q_st = other_q_ei + 1 + q_ei = gap.q_ei + elif gap.q_st >= other_q_st and gap.q_ei 
<= other_q_ei: + return None else: - a_r_st = r_st - a_r_ei = r_ei - if contig.name in full_size_map: - f_r_st, f_r_ei = full_size_map[contig.name] + continue + + if q_st >= other_q_st and q_ei <= other_q_ei: + return None + + if q_st > q_ei: + return None + + gap = CigarHit.from_default_alignment(q_st=q_st, q_ei=q_ei, r_st=gap.r_st, r_ei=gap.r_ei) + + if gap.query_length > 0: + return gap + + def collect_gaps(root: str, children_names: List[str]): + all_children = [contig_map[name] for name in children_names] + children = [child for child in all_children if isinstance(child, AlignedContig)] + for name in unaligned_map: + if reduced_parent_graph.get(name, [name]) == [root]: + for gap in unaligned_map[name]: + carved = carve_gap(gap, children) + if carved is not None: + yield carved + + carved_unaligned_parts: Dict[str, List[str]] = {} + counter = 0 + for root in sorted_roots: + existing: Set[Tuple[int, int]] = set() + children = final_children_mapping[root] + for gap in collect_gaps(root, children): + coords = (gap.q_st, gap.q_ei) + if coords not in existing: + existing.add(coords) + counter += 1 + fake_name = f"u{counter}" + if root not in carved_unaligned_parts: + carved_unaligned_parts[root] = [] + carved_unaligned_parts[root].append(fake_name) + query_position_map[fake_name] = coords + + name_map = {} + for i, root in enumerate(sorted_roots): + children = final_children_mapping[root] + unaligned_children = carved_unaligned_parts.get(root, []) + + name_map[root] = f"{i + 1}" + + todo_names = children + unaligned_children + todo_names = list(sorted(todo_names, key=lambda name: query_position_map.get(name, (-1, -1)))) + for k, child_name in enumerate(todo_names): + if len(todo_names) > 1: + name_map[child_name] = f"{i + 1}.{k + 1}" else: - f_r_st = r_st - contig.alignment.q_st - f_r_ei = r_ei + (len(contig.seq) - contig.alignment.q_ei) - else: - f_r_st = 0 - f_r_ei = len(contig.seq) - a_r_st = f_r_st - a_r_ei = f_r_ei - return (a_r_st, a_r_ei, f_r_st, f_r_ei) + 
name_map[child_name] = f"{i + 1}" + + for bad_name in bad_contigs: + if bad_name not in children: + if bad_name in transitive_parent_graph \ + and root in transitive_parent_graph[bad_name]: + k += 1 + name_map[bad_name] = f"{i + 1}.{k + 1}" + + for contig_name, name in name_map.items(): + logger.debug(f"Contig name {contig_name!r} is displayed as {name!r}.") def get_tracks(parts: Iterable[GenotypedContig]) -> Iterable[Track]: for part in parts: @@ -929,7 +956,7 @@ def get_tracks(parts: Iterable[GenotypedContig]) -> Iterable[Track]: if a_r_ei > f_r_ei: yield Track(min(a_r_ei, f_r_ei) + position_offset, max(a_r_ei, f_r_ei) + position_offset, color="yellow") - if isinstance(part, AlignedContig) and part.name not in unaligned: + if isinstance(part, AlignedContig): colour = 'lightgrey' else: colour = "yellow" @@ -1090,8 +1117,8 @@ def add_section(title: str) -> None: ########## ref_arrows: List[Arrow] = [] - for parent_name in sorted_roots: - parts_names = final_children_mapping[parent_name] + for root in sorted_roots: + parts_names = final_children_mapping[root] parts_names = [name for name in parts_names if name not in bad_contigs] parts = [contig_map[name] for name in parts_names] parts = [part for part in parts if part.group_ref == group_ref] @@ -1104,8 +1131,8 @@ def add_section(title: str) -> None: # Contigs # ########### - for parent_name in sorted_roots: - parts_names = final_children_mapping[parent_name] + for root in sorted_roots: + parts_names = final_children_mapping[root] parts_names = [name for name in parts_names if name not in bad_contigs] parts = [contig_map[name] for name in parts_names] parts = [part for part in parts if part.group_ref == group_ref] @@ -1117,15 +1144,23 @@ def add_section(title: str) -> None: # Discarded # ############# - if discarded: + if discarded or carved_unaligned_parts: add_section("discards:") - for parent_name in sorted_roots: - parts_names = final_children_mapping[parent_name] + for root in sorted_roots: + if 
contig_map[root].group_ref != group_ref: + continue + + parts_names = final_children_mapping[root] parts_names = [name for name in parts_names if name in discarded] - parts = [contig_map[name] for name in parts_names] - parts = [part for part in parts if part.group_ref == group_ref] - for part in parts: - figure.add(Multitrack(list(get_tracks([part])))) + unaligned_parts = carved_unaligned_parts.get(root, []) + for name in sorted(parts_names + unaligned_parts, key=lambda x: name_map[x.name] if isinstance(x, Contig) else name_map[x]): + if name in unaligned_parts: + (q_st, q_ei) = query_position_map[name] + label = name_map[name] + figure.add(Track(position_offset, position_offset + abs(q_ei - q_st), label=label, color="yellow")) + else: + part = contig_map[name] + figure.add(Multitrack(list(get_tracks([part])))) ############# # Anomalies # @@ -1133,8 +1168,8 @@ def add_section(title: str) -> None: if anomaly: add_section("anomaly:") - for parent_name in sorted_roots: - parts_names = final_children_mapping[parent_name] + for root in sorted_roots: + parts_names = final_children_mapping[root] parts_names = [name for name in parts_names if name in anomaly] parts = [contig_map[name] for name in parts_names] parts = [part for part in parts if part.group_ref == group_ref] diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg index de65a4f76..5bbf0b0f9 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg @@ -1,9 +1,9 @@ + width="970.0" height="246.6" viewBox="0.0 -246.6 970.0 246.6"> - + @@ -26,81 +26,193 @@ - + testref - + -1 +1.1 -2 +2.2 -3 +3.2 - + - + -1 +1.1 - + - + -2 +2.2 - + - + -3 +3.2 + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + +1.2 + + + + + +2.1 + + + + + +3.1 + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg index 5be34238b..30864a1b1 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg @@ -1,9 +1,9 @@ + width="970.0" height="246.6" viewBox="0.0 -246.6 970.0 246.6"> - + @@ -26,81 +26,193 @@ - + testref - + -1 +1.1 -2 +2.1 -3 +3.2 - + - + -1 +1.1 - + - + -2 +2.1 - + - + -3 +3.2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + +1.2 + + + + + +2.2 + + + + + +3.1 + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps.svg index 6f1fcd641..2c555bf9d 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps.svg @@ -1,190 +1,290 @@ + width="970.0" height="255.6" viewBox="0.0 -255.6 970.0 255.6"> - + - - + + 5' LTR - - + + gag - - + + vif - - + + tat - - + + nef - + - - + + tat - - + + vpu - - + + rev - - + + 3' LTR - + - - + + pol - - + + vpr - - + + rev - - + + env - + - - + + PR - - 
+ + RT - - + + INT - - + + V3 - - + + GP41 - + - + - + - + - + - + - - - + + + HIV1-B-FR-K03455-seed - + - - - -1 + + + +1.1 - - - -2 + + + +2 - + - - + + - + - + - - -1 + + +1.1 - + - - + + - + - - + + 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + +1.2 + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg index 298192e39..4451d8f16 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg @@ -1,9 +1,9 @@ + width="970.0" height="195.6" viewBox="0.0 -195.6 970.0 195.6"> - + @@ -20,58 +20,164 @@ - + testref - + -1 +1.1 -2 +2.2 - + - + -1 +1.1 - + - + -2 +2.2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + +1.2 + + + + + +2.1 + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg index 5808c9040..bc367f670 100644 --- a/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_cross_alignment_around_small_insertion.svg @@ -1,9 +1,9 @@ + width="970.0" height="255.6" viewBox="0.0 -255.6 970.0 255.6"> - + @@ -26,13 +26,13 @@ - + testref - + @@ -43,7 +43,7 @@ -2 +2.2 @@ -52,7 +52,7 @@ 1.2 - + @@ -62,7 +62,7 @@ - + @@ -76,25 +76,25 @@ - + - + -2 +2.2 - + - + @@ 
-174,38 +174,44 @@ - + - + discards: - - - - + + + 1.1 - - - - - + + + 1.3 - - - - - + + + 1.5 + + + +2.1 + + + + + +2.3 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index 9525f81b8..abf11223d 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -1,201 +1,211 @@ + width="970.0" height="215.6" viewBox="0.0 -215.6 970.0 215.6"> - + - + - + - + - + - + - + - + - - - + + + testref - + - - - -1.1 + + + +1.1 - - - -2 + + + +2.2 - - - -1.3 + + + +1.3 - + - - + + - - + + - + - + - - + + 1.1 - + - - + + 1.3 - + - - + + - + - - -2 + + +2.2 - - - + + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - - + + + - - - + + + discards: - - - - + + + 1.2 + + + +2.1 + + + + + +2.3 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg index ff9b9f9db..e756792fa 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -1,9 +1,9 @@ + width="970.0" height="215.6" viewBox="0.0 -215.6 970.0 215.6"> - + @@ -26,13 +26,13 @@ - + testref - + @@ -43,7 +43,7 @@ -2 +2.2 @@ -52,7 +52,7 @@ 1.3 - + @@ -62,7 +62,7 @@ - + @@ -82,25 +82,25 @@ - + - + -2 +2.2 - + - + @@ -180,22 +180,32 @@ - + - + discards: - - - - + + + 1.2 + + + +2.1 + + + + + +2.3 + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg index 2f1fa6751..a5df4e916 100644 --- 
a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg @@ -1,9 +1,9 @@ + width="970.0" height="175.6" viewBox="0.0 -175.6 970.0 175.6"> - + @@ -20,18 +20,18 @@ - + testref - + -1 +1.1 @@ -40,30 +40,30 @@ 2 - + - + -1 +1.1 - + - + @@ -74,4 +74,104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + +1.2 + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg index 18415cca0..75e2a1f5a 100644 --- a/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg +++ b/micall/tests/data/stitcher_plots/test_stitch_cross_alignment.svg @@ -158,28 +158,22 @@ discards: - - - - + + + 1.1 - - - - - + + + 1.3 - - - - - + + + 1.5 - \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg index 87669e97c..fe2216305 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg @@ -1,9 +1,9 @@ + width="970.0" height="195.6" viewBox="0.0 -195.6 970.0 195.6"> - + @@ -20,58 +20,164 @@ - + testref - + -1 +1.1 -2 +2.2 - + - + -1 +1.1 - + - + -2 +2.2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + +1.2 + + + + + +2.1 + + \ No newline at end of file diff --git 
a/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg index 1441ba7d0..acb07c521 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg @@ -1,9 +1,9 @@ + width="969.9999999999999" height="235.6" viewBox="0.0 -235.6 969.9999999999999 235.6"> - + @@ -20,65 +20,65 @@ - + testref - + -1 +1.1 -2 +2.2 - + - + -1 +1.1 - + - + -2 +2.2 - + - + @@ -158,16 +158,28 @@ - + - + discards: + + + +1.2 + + + + + +2.1 + + From 2b34d331e278fa9cd0378bc501ffb81626325fc2 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 5 Feb 2024 15:17:45 -0800 Subject: [PATCH 209/485] Contig stitcher: further improve drawing of unaligned parts Merge unaligned parts that are inbetween the aligned parts. 
--- micall/core/plot_contigs.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 05f8dfa36..48822e04f 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -905,7 +905,7 @@ def collect_gaps(root: str, children_names: List[str]): yield carved carved_unaligned_parts: Dict[str, List[str]] = {} - counter = 0 + fake_name_counter = 0 for root in sorted_roots: existing: Set[Tuple[int, int]] = set() children = final_children_mapping[root] @@ -913,17 +913,40 @@ def collect_gaps(root: str, children_names: List[str]): coords = (gap.q_st, gap.q_ei) if coords not in existing: existing.add(coords) - counter += 1 - fake_name = f"u{counter}" + fake_name_counter += 1 + fake_name = f"m{fake_name_counter}" if root not in carved_unaligned_parts: carved_unaligned_parts[root] = [] carved_unaligned_parts[root].append(fake_name) query_position_map[fake_name] = coords + merged_unaligned_parts: Dict[str, List[str]] = {} + fake_name_counter = 0 + for root in sorted_roots: + children = final_children_mapping[root] + unaligned_children = carved_unaligned_parts.get(root, []) + todo = children + unaligned_children + todo = list(sorted(todo, key=lambda name: query_position_map.get(name, (-1, -1)))) + current_group = [] + for child_name in todo + [None]: + if child_name in unaligned_children: + coords = query_position_map[child_name] + current_group.append(coords) + elif current_group: + coords = (min(q_st for q_st, q_ei in current_group), + max(q_ei for q_st, q_ei in current_group)) + fake_name_counter += 1 + fake_name = f"u{fake_name_counter}" + query_position_map[fake_name] = coords + if root not in merged_unaligned_parts: + merged_unaligned_parts[root] = [] + merged_unaligned_parts[root].append(fake_name) + current_group = [] + name_map = {} for i, root in enumerate(sorted_roots): children = final_children_mapping[root] - unaligned_children = 
carved_unaligned_parts.get(root, []) + unaligned_children = merged_unaligned_parts.get(root, []) name_map[root] = f"{i + 1}" @@ -1144,7 +1167,7 @@ def add_section(title: str) -> None: # Discarded # ############# - if discarded or carved_unaligned_parts: + if discarded or merged_unaligned_parts: add_section("discards:") for root in sorted_roots: if contig_map[root].group_ref != group_ref: @@ -1152,7 +1175,7 @@ def add_section(title: str) -> None: parts_names = final_children_mapping[root] parts_names = [name for name in parts_names if name in discarded] - unaligned_parts = carved_unaligned_parts.get(root, []) + unaligned_parts = merged_unaligned_parts.get(root, []) for name in sorted(parts_names + unaligned_parts, key=lambda x: name_map[x.name] if isinstance(x, Contig) else name_map[x]): if name in unaligned_parts: (q_st, q_ei) = query_position_map[name] From fe391f4096a4c6e3e1029e49bf8b8ccdf50d68aa Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 16 Feb 2024 10:23:41 -0800 Subject: [PATCH 210/485] Contig stitcher: increase the split gap size threshold --- micall/core/contig_stitcher.py | 5 +- .../test_gap_around_big_insertion.svg | 146 +++++++++--------- .../test_gap_around_small_insertion.svg | 146 +++++++++--------- ...titching_contig_with_small_covered_gap.svg | 68 ++++---- micall/tests/test_contig_stitcher.py | 16 +- 5 files changed, 192 insertions(+), 189 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index f7c469bb9..81d0c02ef 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -613,7 +613,10 @@ def covered(contig, gap): return any(covered_by(gap, other) for other in contigs if other != contig) def significant(gap): - return gap.ref_length > 5 + # The size of the gap is unavoidably, to some point, arbitrary. Here we tried to adjust it to common gaps in HIV, as HIV is the primary test subject in MiCall. 
+ # A notable feature of HIV-1 reverse transcription is the appearance of periodic deletions of approximately 21 nucleotides. These deletions have been reported to occur in the HIV-1 genome and are thought to be influenced by the structure of the viral RNA. Specifically, the secondary structures and foldings of the RNA can lead to pause sites for the reverse transcriptase, resulting in staggered alignment when the enzyme slips. This misalignment can cause the reverse transcriptase to "jump," leading to deletions in the newly synthesized DNA. + # The unusually high frequency of about 21-nucleotide deletions is believed to correspond to the pitch of the RNA helix, which reflects the spatial arrangement of the RNA strands. The 21 nucleotide cycle is an average measure and is thought to be associated with the length of one turn of the RNA helix, meaning that when reverse transcriptase slips and reattaches, it often does so one helical turn away from the original site. + return gap.ref_length > 21 def try_split(contig): for gap in contig.alignment.deletions(): diff --git a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg index abf11223d..77435ae1e 100644 --- a/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_big_insertion.svg @@ -5,206 +5,206 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2.2 + + + +2.2 - - - -1.3 + + + +1.3 - - + + - - + + - + - - + + 1.1 - + - - + + 1.3 - - + + - - + + 2.2 - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + + - - + + discards: - - + + 1.2 - - + + 2.1 - - + + 2.3 diff --git a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg index e756792fa..7cf51be1b 100644 --- 
a/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg +++ b/micall/tests/data/stitcher_plots/test_gap_around_small_insertion.svg @@ -5,206 +5,206 @@ - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2.2 + + + +2.2 - - - -1.3 + + + +1.3 - - + + - - + + - + - - + + 1.1 - + - - + + 1.3 - - + + - - + + 2.2 - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + + - - + + discards: - - + + 1.2 - - + + 2.1 - - + + 2.3 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg index cb87affb3..1323c199e 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_small_covered_gap.svg @@ -1,91 +1,91 @@ + width="970.0000000000001" height="138.5" viewBox="0.0 -138.5 970.0000000000001 138.5"> - + - + - + - + - + - + - + - - + + testref - - - -1.1 + + + +1.1 - - - -2 + + + +2 - - - -1.2 + + + +1.2 - - + + - - + + - - + + 1.1 - - + + 1.2 - - + + - - + + 2 diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 1687a8149..fc516def9 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -693,7 +693,7 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner, visualizer): def test_stitching_contig_with_small_covered_gap(exact_aligner, visualizer): # Scenario: If one contig has a small gap covered by another contig. 
- ref_seq = 'G' * 100 + 'A' * 9 + 'C' * 100 + 'T' * 100 + ref_seq = 'G' * 100 + 'A' * 29 + 'C' * 100 + 'T' * 100 contigs = [ GenotypedContig(name='a', @@ -704,7 +704,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner, visualizer): match_fraction=0.5, ), GenotypedContig(name='b', - seq='A' * 9 + 'C' * 50, + seq='A' * 29 + 'C' * 50, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -907,7 +907,7 @@ def test_big_insertion_in_a_single_contig_2(exact_aligner, visualizer): def test_gap_around_small_insertion(exact_aligner, visualizer): # Scenario: Contig is split around its gap, then stripped. - ref_seq='A' * 10 + 'B' * 20 + 'C' * 10 + ref_seq='A' * 10 + 'B' * 29 + 'C' * 10 contigs = [ GenotypedContig(name='a', @@ -918,7 +918,7 @@ def test_gap_around_small_insertion(exact_aligner, visualizer): match_fraction=0.3, ), GenotypedContig(name='b', - seq='Q' * 5 + 'B' * 20 + 'J' * 5, + seq='Q' * 5 + 'B' * 29 + 'J' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -928,14 +928,14 @@ def test_gap_around_small_insertion(exact_aligner, visualizer): results = list(stitch_consensus(contigs)) assert len(results) == 1 - assert results[0].seq == "P" * 5 + "A" * 10 + "B" * 20 + "C" * 10 + "Z" * 5 + assert results[0].seq == "P" * 5 + "A" * 10 + "B" * 29 + "C" * 10 + "Z" * 5 assert len(visualizer().elements) > len(contigs) def test_gap_around_big_insertion(exact_aligner, visualizer): # Scenario: Contig is split around its gap, then stripped. 
- ref_seq='A' * 10 + 'B' * 20 + 'C' * 10 + ref_seq='A' * 10 + 'B' * 29 + 'C' * 10 contigs = [ GenotypedContig(name='a', @@ -946,7 +946,7 @@ def test_gap_around_big_insertion(exact_aligner, visualizer): match_fraction=0.3, ), GenotypedContig(name='b', - seq='Q' * 5 + 'B' * 20 + 'J' * 5, + seq='Q' * 5 + 'B' * 29 + 'J' * 5, ref_name='testref', group_ref='testref', ref_seq=ref_seq, @@ -956,7 +956,7 @@ def test_gap_around_big_insertion(exact_aligner, visualizer): results = list(stitch_consensus(contigs)) assert len(results) == 1 - assert results[0].seq == "P" * 5 + "A" * 10 + "B" * 20 + "C" * 10 + "Z" * 5 + assert results[0].seq == "P" * 5 + "A" * 10 + "B" * 29 + "C" * 10 + "Z" * 5 assert len(visualizer().elements) > len(contigs) From c4c388647e8a94b9225f3407f8973999b0d42d41 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 20 Feb 2024 10:50:42 -0800 Subject: [PATCH 211/485] Contig stitcher: factor out context into a separate file --- micall/core/contig_stitcher.py | 34 +---------------------- micall/core/denovo.py | 3 +- micall/utils/contig_stitcher_context.py | 37 +++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 34 deletions(-) create mode 100644 micall/utils/contig_stitcher_context.py diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 81d0c02ef..e479c0eaf 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -9,48 +9,16 @@ from queue import LifoQueue from Bio import Seq import logging -from contextvars import ContextVar, Context -from contextlib import contextmanager from fractions import Fraction from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.consensus_aligner import CigarActions +from micall.utils.contig_stitcher_context import context, StitcherContext import micall.utils.contig_stitcher_events as events T = TypeVar("T") logger = logging.getLogger(__name__) -class StitcherContext: - def __init__(self) -> None: - self.name_generator_state: int 
= 0 - self.nameset: Set[str] = set() - self.events: List[events.EventType] = [] - - def generate_new_name(self) -> str: - while True: - self.name_generator_state += 1 - name = f"c{self.name_generator_state}" - if name not in self.nameset: - self.nameset.add(name) - return name - - def emit(self, event: events.EventType) -> None: - self.events.append(event) - - - @staticmethod - @contextmanager - def fresh(): - ctx = StitcherContext() - token = context.set(ctx) - try: - yield ctx - finally: - context.reset(token) - - -context: ContextVar[StitcherContext] = ContextVar("StitcherContext") - @dataclass(frozen=True) class Contig: diff --git a/micall/core/denovo.py b/micall/core/denovo.py index e8c82bda4..2b443dc6a 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -19,8 +19,9 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig -from micall.core.contig_stitcher import GenotypedContig, stitch_consensus, StitcherContext +from micall.core.contig_stitcher import GenotypedContig, stitch_consensus from micall.core.plot_contigs import plot_stitcher_coverage +from micall.utils.contig_stitcher_context import StitcherContext IVA = "iva" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), diff --git a/micall/utils/contig_stitcher_context.py b/micall/utils/contig_stitcher_context.py new file mode 100644 index 000000000..d23bc261c --- /dev/null +++ b/micall/utils/contig_stitcher_context.py @@ -0,0 +1,37 @@ +from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal, TypeVar, Callable, Set +from contextvars import ContextVar +from contextlib import contextmanager + +import micall.utils.contig_stitcher_events as events + + +class StitcherContext: + def __init__(self) -> None: + self.name_generator_state: int = 0 + self.nameset: Set[str] = set() + self.events: List[events.EventType] = [] + + def generate_new_name(self) -> str: + while True: + self.name_generator_state += 1 + name = f"c{self.name_generator_state}" 
+ if name not in self.nameset: + self.nameset.add(name) + return name + + def emit(self, event: events.EventType) -> None: + self.events.append(event) + + + @staticmethod + @contextmanager + def fresh(): + ctx = StitcherContext() + token = context.set(ctx) + try: + yield ctx + finally: + context.reset(token) + + +context: ContextVar[StitcherContext] = ContextVar("StitcherContext") From 5ce730e21e2e8577ec7ef31579a447f15d360f8a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 20 Feb 2024 11:06:19 -0800 Subject: [PATCH 212/485] Contig stitcher: move logging into events module --- micall/core/contig_stitcher.py | 123 +++++++------------------ micall/utils/contig_stitcher_events.py | 71 ++++++++++++++ 2 files changed, 104 insertions(+), 90 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index e479c0eaf..1e3260251 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -16,10 +16,16 @@ from micall.utils.contig_stitcher_context import context, StitcherContext import micall.utils.contig_stitcher_events as events + T = TypeVar("T") logger = logging.getLogger(__name__) +def log(e: events.EventType) -> None: + context.get().emit(e) + logger.debug("%s", e) + + @dataclass(frozen=True) class Contig: name: str @@ -66,12 +72,7 @@ def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedCont alignment_left, alignment_right = self.alignment.cut_reference(cut_point) left = replace(self, name=context.get().generate_new_name(), alignment=alignment_left) right = replace(self, name=context.get().generate_new_name(), alignment=alignment_right) - - logger.debug("Created contigs %r at %s and %r at %s by cutting %r at %s at cut point = %s.", - left.name, left.alignment, right.name, right.alignment, - self.name, self.alignment, round(cut_point, 1)) - context.get().emit(events.Cut(self, left, right)) - + log(events.Cut(self, left, right, cut_point)) return (left, right) @@ -85,10 +86,7 @@ def 
lstrip(self) -> 'AlignedContig': q_remainder, query = self.cut_query(alignment.q_st - 0.5) alignment = alignment.translate(0, -1 * alignment.q_st) result = AlignedContig.make(query, alignment, self.strand) - logger.debug("Doing lstrip of %r at %s (len %s) resulted in %r at %s (len %s).", - self.name, self.alignment, len(self.seq), - result.name, result.alignment, len(result.seq)) - context.get().emit(events.LStrip(self, result)) + log(events.LStrip(self, result)) return result @@ -101,10 +99,7 @@ def rstrip(self) -> 'AlignedContig': alignment = self.alignment.rstrip_reference().rstrip_query() query, q_remainder = self.cut_query(alignment.q_ei + 0.5) result = AlignedContig.make(query, alignment, self.strand) - logger.debug("Doing rstrip of %r at %s (len %s) resulted in %r at %s (len %s).", - self.name, self.alignment, len(self.seq), - result.name, result.alignment, len(result.seq)) - context.get().emit(events.RStrip(self, result)) + log(events.RStrip(self, result)) return result @@ -142,9 +137,7 @@ def munge(self, other: 'AlignedContig') -> 'AlignedContig': alignment = self_alignment.connect(other_alignment) ret = AlignedContig.make(query=query, alignment=alignment, strand=self.strand) - logger.debug("Munged contigs %r at %s with %r at %s resulting in %r at %s.", - self.name, self.alignment, other.name, other.alignment, ret.name, ret.alignment) - context.get().emit(events.Munge(self, other, ret)) + log(events.Munge(self, other, ret)) return ret @@ -179,10 +172,7 @@ def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: stripped_parts.append(part) ret = reduce(AlignedContig.munge, stripped_parts) - logger.debug("Created a frankenstein %r at %s (len %s) from %s.", - ret.name, ret.alignment, len(ret.seq), - [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in stripped_parts]) - context.get().emit(events.Combine(stripped_parts, ret)) + log(events.Combine(stripped_parts, ret)) return ret @@ -195,8 +185,7 @@ def align_to_reference(contig: 
GenotypedContig) -> Iterable[GenotypedContig]: """ if contig.ref_seq is None: - logger.debug("Contig %r not aligned - no reference.", contig.name) - context.get().emit(events.NoRef(contig)) + log(events.NoRef(contig)) yield contig return @@ -209,19 +198,15 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: "forward" if x.strand == 1 else "reverse") for x in alignments] for i, (hit, strand) in enumerate(hits_array): - logger.debug("Part %r of contig %r aligned at %s%s.", - i, contig.name, hit, " (rev)" if strand == "reverse" else "") - context.get().emit(events.InitialHit(contig, hit, strand)) + log(events.InitialHit(contig, i, hit, strand)) if not hits_array: - logger.debug("Contig %r not aligned - backend's choice.", contig.name) - context.get().emit(events.ZeroHits(contig)) + log(events.ZeroHits(contig)) yield contig return if len(set(strand for hit, strand in hits_array)) > 1: - logger.debug("Discarding contig %r because it aligned both in forward and reverse sense.", contig.name) - context.get().emit(events.StrandConflict(contig)) + log(events.StrandConflict(contig)) yield contig return @@ -233,26 +218,17 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: hits_array = [(replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1), strand) for hit, strand in hits_array] - logger.debug("Reverse complemented contig %r.", contig.name) - context.get().emit(events.ReverseComplement(contig, new_contig)) - + log(events.ReverseComplement(contig, new_contig)) for i, (hit, strand) in enumerate(hits_array): - logger.debug("Part %r of contig %r reverse-aligned at %s%s.", - i, contig.name, hit, " (rev)" if strand == "reverse" else "") - context.get().emit(events.InitialHit(contig, hit, strand)) + log(events.InitialHit(contig, i, hit, strand)) connected = connect_cigar_hits([hit for hit, strand in hits_array]) if hits_array else [] - logger.debug("Contig %r produced %s aligner hits. 
After connecting them, the number became %s.", - contig.name, len(hits_array), len(connected)) - context.get().emit(events.HitNumber(contig, hits_array, connected)) + log(events.HitNumber(contig, hits_array, connected)) for i, single_hit in enumerate(connected): query = replace(contig, name=context.get().generate_new_name()) part = AlignedContig.make(query, single_hit, strand) - - logger.debug("Part %r of contig %r re-aligned as %r at %s%s.", i, contig.name, - part.name,part.alignment, " (rev)" if strand == "reverse" else "") - context.get().emit(events.ConnectedHit(contig, part, i)) + log(events.ConnectedHit(contig, part, i)) yield part @@ -282,10 +258,10 @@ def is_out_of_order(name: str) -> bool: if prev_contig is not None or is_out_of_order(original.name): contig = contig.lstrip() - context.get().emit(events.InitialStrip(original, start, original.alignment.q_st - 1)) + log(events.InitialStrip(original, start, original.alignment.q_st - 1)) if next_contig is not None or is_out_of_order(original.name): contig = contig.rstrip() - context.get().emit(events.InitialStrip(original, original.alignment.q_ei + 1, end)) + log(events.InitialStrip(original, original.alignment.q_ei + 1, end)) yield contig @@ -425,15 +401,7 @@ def stitch_2_contigs(left, right): right_overlap = right_overlap.lstrip().rstrip() left_remainder = left_remainder.rstrip() right_remainder = right_remainder.lstrip() - - logger.debug("Stitching %r at %s (len %s) with %r at %s (len %s)." 
- " The left_overlap %r is at %s (len %s)" - " and the right_overlap %r is at %s (len %s).", - left.name, left.alignment, len(left.seq), - right.name, right.alignment, len(right.seq), - left_overlap.name, left_overlap.alignment, len(left_overlap.seq), - right_overlap.name, right_overlap.alignment, len(right_overlap.seq)) - context.get().emit(events.StitchCut(left, right, left_overlap, right_overlap, left_remainder, right_remainder)) + log(events.StitchCut(left, right, left_overlap, right_overlap, left_remainder, right_remainder)) # Align overlapping parts, then recombine based on concordance. aligned_left, aligned_right = align_queries(left_overlap.seq, right_overlap.seq) @@ -447,14 +415,10 @@ def stitch_2_contigs(left, right): average_concordance = Fraction(sum(concordance) / (len(concordance) or 1)) concordance_str = ', '.join(map(lambda x: str(int(round(x * 100)) / 100), concordance)) cut_point_location_scaled = max_concordance_index / (((len(concordance) or 1) - 1) or 1) - logger.debug("Created overlap contigs %r at %s and %r at %s based on parts of %r and %r, with avg. 
concordance %s%%, cut point at %s%%, and full concordance [%s].", - left_overlap_take.name, left_overlap.alignment, right_overlap_take.name, right_overlap_take.alignment, - left.name, right.name, round(average_concordance * 100), - round(cut_point_location_scaled * 100), concordance_str) - context.get().emit(events.Overlap(left, right, left_overlap, right_overlap, - left_remainder, right_remainder, left_overlap_take, - right_overlap_take, concordance, average_concordance, - max_concordance_index, cut_point_location_scaled)) + log(events.Overlap(left, right, left_overlap, right_overlap, + left_remainder, right_remainder, left_overlap_take, + right_overlap_take, concordance, average_concordance, + max_concordance_index, cut_point_location_scaled)) return combine_contigs([left_remainder, left_overlap_take, right_overlap_take, right_remainder]) @@ -473,8 +437,7 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: # Find overlap. If there isn't one - we are done with the current contig. 
overlapping_contig = find_overlapping_contig(current, contigs) if not overlapping_contig: - logger.debug("Nothing overlaps with %r.", current.name) - context.get().emit(events.NoOverlap(current)) + log(events.NoOverlap(current)) yield current continue @@ -482,11 +445,7 @@ def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: new_contig = stitch_2_contigs(current, overlapping_contig) contigs.remove(overlapping_contig) contigs.insert(0, new_contig) - - logger.debug("Stitching %r with %r results in %r at %s (len %s).", - current.name, overlapping_contig.name, - new_contig.name, new_contig.alignment, len(new_contig.seq)) - context.get().emit(events.Stitch(current, overlapping_contig, new_contig)) + log(events.Stitch(current, overlapping_contig, new_contig)) def merge_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]: @@ -554,9 +513,7 @@ def drop_completely_covered(contigs: List[AlignedContig]) -> List[AlignedContig] covered, covering = find_covered_contig(contigs) if covered: contigs.remove(covered) - logger.debug("Droped contig %r as it is completely covered by these contigs: %s.", - covered.name, ", ".join(repr(x.name) for x in covering)) - context.get().emit(events.Drop(covered, covering)) + log(events.Drop(covered, covering)) else: break @@ -594,8 +551,7 @@ def try_split(contig): # overlaps around them. # And we are likely to lose quality with every stitching operation. # By skipping we assert that this gap is aligner's fault. - logger.debug("Ignored insignificant gap of %r, %s.", contig.name, gap) - context.get().emit(events.IgnoreGap(contig, gap)) + log(events.IgnoreGap(contig, gap)) continue if covered(contig, gap): @@ -608,15 +564,7 @@ def try_split(contig): contigs.append(left_part) contigs.append(right_part) process_queue.put(right_part) - - logger.debug("Split contig %r at %s around its gap at [%s, %s]->[%s, %s]. 
" - "Left part: %r at %s, " - "right part: %r at %s.", - contig.name, contig.alignment, - gap.q_st, gap.q_ei, gap.r_st, gap.r_ei, - left_part.name, left_part.alignment, - right_part.name, right_part.alignment) - context.get().emit(events.SplitGap(contig, gap, left_part, right_part)) + log(events.SplitGap(contig, gap, left_part, right_part)) return process_queue: LifoQueue = LifoQueue() @@ -632,10 +580,7 @@ def try_split(contig): def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: contigs = list(contigs) for contig in contigs: - logger.debug("Introduced contig %r (seq = %s) of ref %r, group_ref %r (seq = %s), and length %s.", - contig.name, contig.seq, contig.ref_name, - contig.group_ref, contig.ref_seq, len(contig.seq)) - context.get().emit(events.Intro(contig)) + log(events.Intro(contig)) context.get().nameset.add(contig.name) maybe_aligned = list(align_all_to_reference(contigs)) @@ -664,9 +609,7 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo def combine(group_ref): contigs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) result = combine_contigs(contigs) - logger.debug("Combining these contigs for final output for %r: %s.", - group_ref, [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in contigs]) - context.get().emit(events.FinalCombine(contigs, result)) + log(events.FinalCombine(contigs, result)) return result yield from map(combine, consensus_parts) diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index 99ac62256..5542c8748 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -9,6 +9,10 @@ class Cut: original: 'Contig' left: 'Contig' right: 'Contig' + cut_point: float + + def __str__(self) -> str: + return f"Created contigs {self.left.name} at {self.left.alignment} and {self.right.name} at {self.right.alignment} by cutting {self.original.name} at {self.original.alignment} at 
cut point = {round(self.cut_point, 1)}." @dataclass(frozen=True) @@ -16,12 +20,18 @@ class LStrip: original: 'AlignedContig' result: 'AlignedContig' + def __str__(self) -> str: + return f"Doing lstrip of {self.original.name} at {self.original.alignment} (len {len(self.original.seq)}) resulted in {self.result.name} at {self.result.alignment} (len {len(self.result.seq)})." + @dataclass(frozen=True) class RStrip: original: 'AlignedContig' result: 'AlignedContig' + def __str__(self) -> str: + return f"Doing rstrip of {self.original.name} at {self.original.alignment} (len {len(self.original.seq)}) resulted in {self.result.name} at {self.result.alignment} (len {len(self.result.seq)})." + @dataclass(frozen=True) class Munge: @@ -29,40 +39,62 @@ class Munge: right: 'AlignedContig' result: 'AlignedContig' + def __str__(self) -> str: + return f"Munged contigs {self.left.name} at {self.left.alignment} with {self.right.name} at {self.right.alignment} resulting in {self.result.name} at {self.result.alignment}." + @dataclass(frozen=True) class Combine: contigs: List['AlignedContig'] result: 'AlignedContig' + def __str__(self) -> str: + return f"Created a frankenstein {self.result.name} at {self.result.alignment} (len {len(self.result.seq)}) from {[f'{x.name!r} at {x.alignment} (len {len(x.seq)})' for x in self.contigs]}." + @dataclass(frozen=True) class NoRef: contig: 'GenotypedContig' + def __str__(self) -> str: + return f"Contig {self.contig.name} not aligned - no reference." + @dataclass(frozen=True) class InitialHit: contig: 'GenotypedContig' + index: int hit: CigarHit strand: Literal["forward", "reverse"] + def __str__(self) -> str: + return f"Part {self.index} of contig {self.contig.name} aligned at {self.hit}{'' if self.strand == 'forward' else ' (rev)'}." + @dataclass(frozen=True) class ZeroHits: contig: 'GenotypedContig' + def __str__(self) -> str: + return f"Contig {self.contig.name} not aligned - backend's choice." 
+ @dataclass(frozen=True) class StrandConflict: contig: 'GenotypedContig' + def __str__(self) -> str: + return f"Discarding contig {self.contig.name} because it aligned both in forward and reverse sense." + @dataclass class ReverseComplement: contig: 'GenotypedContig' result: 'GenotypedContig' + def __str__(self) -> str: + return f"Reverse complemented contig {self.contig.name}." + @dataclass(frozen=True) class HitNumber: @@ -70,6 +102,9 @@ class HitNumber: initial: List[Tuple[CigarHit, Literal["reverse", "forward"]]] connected: List[CigarHit] + def __str__(self) -> str: + return f"Contig {self.contig.name} produced {len(self.initial)} aligner hits. After connecting them, the number became {len(self.connected)}." + @dataclass(frozen=True) class ConnectedHit: @@ -77,6 +112,9 @@ class ConnectedHit: part: 'AlignedContig' index: int + def __str__(self) -> str: + return f"Part {self.index} of contig {self.contig.name} re-aligned as {self.part.name} at {self.part.alignment}{'' if self.part.strand == 'forward' else ' (rev)'}." + @dataclass(frozen=True) class InitialStrip: @@ -84,6 +122,9 @@ class InitialStrip: q_st: int q_ei: int + def __str__(self) -> str: + return f"Trimming (strip) contig {self.contig.name} from {self.q_st} to {self.q_ei}." + @dataclass(frozen=True) class StitchCut: @@ -94,6 +135,9 @@ class StitchCut: left_remainder: 'AlignedContig' right_remainder: 'AlignedContig' + def __str__(self) -> str: + return f"Stitching {self.left.name} at {self.left.alignment} (len {len(self.left.seq)}) with {self.right.name} at {self.right.alignment} (len {len(self.right.seq)}). The left_overlap {self.left_overlap.name} is at {self.left_overlap.alignment} (len {len(self.left_overlap.seq)}) and the right_overlap {self.right_overlap.name} is at {self.right_overlap.alignment} (len {len(self.right_overlap.seq)})." 
+ @dataclass(frozen=True) class Overlap: @@ -110,11 +154,20 @@ class Overlap: cut_point: int cut_point_scaled: Fraction + def __str__(self) -> str: + average_concordance = round(self.average * 100) + cut_point_location_scaled = round(self.cut_point_scaled * 100) + concordance_str = ', '.join(str(int(round(x * 100)) / 100) for x in self.concordance) + return f"Created overlap contigs {self.left_take.name} at {self.left_overlap.alignment} and {self.right_take.name} at {self.right_take.alignment} based on parts of {self.left.name} and {self.right.name}, with avg. concordance {average_concordance}%, cut point at {cut_point_location_scaled}%, and full concordance [{concordance_str}]." + @dataclass(frozen=True) class NoOverlap: contig: 'AlignedContig' + def __str__(self) -> str: + return f"Nothing overlaps with {self.contig.name}." + @dataclass(frozen=True) class Stitch: @@ -122,18 +175,27 @@ class Stitch: right: 'AlignedContig' result: 'AlignedContig' + def __str__(self) -> str: + return f"Stitching {self.left.name} with {self.right.name} results in {self.result.name} at {self.result.alignment} (len {len(self.result.seq)})." + @dataclass(frozen=True) class Drop: contig: 'AlignedContig' covering: List['AlignedContig'] + def __str__(self) -> str: + return f"Dropped contig {self.contig.name} as it is completely covered by these contigs: {', '.join(repr(x.name) for x in self.covering)}." + @dataclass(frozen=True) class IgnoreGap: contig: 'AlignedContig' gap: 'CigarHit' + def __str__(self) -> str: + return f"Ignored insignificant gap of {self.contig.name}, {self.gap}." + @dataclass(frozen=True) class SplitGap: @@ -142,17 +204,26 @@ class SplitGap: left: 'AlignedContig' right: 'AlignedContig' + def __str__(self) -> str: + return f"Split contig {self.contig.name} at {self.contig.alignment} around its gap at [{self.gap.q_st}, {self.gap.q_ei}]->[{self.gap.r_st}, {self.gap.r_ei}]. 
Left part: {self.left.name} at {self.left.alignment}, right part: {self.right.name} at {self.right.alignment}." + @dataclass(frozen=True) class Intro: contig: 'GenotypedContig' + def __str__(self) -> str: + return f"Introduced contig {self.contig.name} (seq = {self.contig.seq}) of ref {self.contig.ref_name}, group_ref {self.contig.group_ref} (seq = {self.contig.ref_seq}), and length {len(self.contig.seq)}." + @dataclass(frozen=True) class FinalCombine: contigs: List['AlignedContig'] result: 'AlignedContig' + def __str__(self) -> str: + return f"Combining these contigs for final output for {self.result.group_ref}: {['%r at %s (len %s)' % (x.name, x.alignment, len(x.seq)) for x in self.contigs]}." + AlignmentEvent = Union[NoRef, InitialHit, ZeroHits, StrandConflict, ReverseComplement, HitNumber, ConnectedHit] ModifyEvent = Union[LStrip, RStrip] From b5bcc6a6f52438bed8dd5368bf387414e272cbe8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 20 Feb 2024 11:12:31 -0800 Subject: [PATCH 213/485] Contig stitcher: remove unused imports --- micall/core/contig_stitcher.py | 11 +++++------ micall/core/plot_contigs.py | 5 ++--- micall/utils/contig_stitcher_events.py | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 1e3260251..e93ec7e1e 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,10 +1,10 @@ -from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal, TypeVar, Callable, Set -from collections import deque, defaultdict +from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar +from collections import defaultdict from dataclasses import dataclass, replace -from math import ceil, floor +from math import ceil from mappy import Aligner -from functools import cached_property, reduce -from itertools import accumulate, takewhile, tee, islice, chain +from functools import reduce +from itertools import tee, islice, chain 
from gotoh import align_it from queue import LifoQueue from Bio import Seq @@ -12,7 +12,6 @@ from fractions import Fraction from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit -from micall.utils.consensus_aligner import CigarActions from micall.utils.contig_stitcher_context import context, StitcherContext import micall.utils.contig_stitcher_events as events diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 48822e04f..045e3d47b 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1,5 +1,5 @@ import typing -from typing import Dict, Tuple, List, Set, Iterable, NoReturn, Literal, Union +from typing import Dict, Tuple, List, Set, Iterable, NoReturn from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType from collections import Counter, defaultdict from csv import DictReader @@ -8,7 +8,6 @@ from math import log10, copysign, floor from operator import itemgetter, attrgetter from pathlib import Path -import dataclasses import logging import yaml @@ -21,7 +20,7 @@ from micall.core.project_config import ProjectConfig from micall.utils.alignment_wrapper import align_nucs -from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig, sliding_window +from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig from micall.utils.cigar_tools import CigarHit import micall.utils.contig_stitcher_events as events diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index 5542c8748..e3ffe5b2a 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -1,7 +1,7 @@ from typing import Union, List, Tuple, Literal from dataclasses import dataclass from fractions import Fraction -from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit +from micall.utils.cigar_tools import CigarHit @dataclass(frozen=True) From 0f2184708990a8187aa64048345d1efe56ff0ab3 Mon Sep 17 
00:00:00 2001 From: Vitaliy Mysak Date: Tue, 20 Feb 2024 15:43:14 -0800 Subject: [PATCH 214/485] Contig stitcher: factor out contig structures definitions --- micall/core/contig_stitcher.py | 206 ++++++++++-------------- micall/core/plot_contigs.py | 4 +- micall/tests/test_contig_stitcher.py | 32 ++-- micall/utils/contig_stitcher_contigs.py | 36 +++++ micall/utils/contig_stitcher_events.py | 148 ++++++++--------- 5 files changed, 217 insertions(+), 209 deletions(-) create mode 100644 micall/utils/contig_stitcher_contigs.py diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index e93ec7e1e..18f4752a7 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,6 +1,6 @@ from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar from collections import defaultdict -from dataclasses import dataclass, replace +from dataclasses import replace from math import ceil from mappy import Aligner from functools import reduce @@ -13,6 +13,7 @@ from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.contig_stitcher_context import context, StitcherContext +from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig import micall.utils.contig_stitcher_events as events @@ -25,119 +26,88 @@ def log(e: events.EventType) -> None: logger.debug("%s", e) -@dataclass(frozen=True) -class Contig: - name: str - seq: str +def cut_query(self: GenotypedContig, cut_point: float) -> Tuple[GenotypedContig, GenotypedContig]: + """ Cuts query sequence in two parts with cut_point between them. 
""" + cut_point = max(0, cut_point) + left = replace(self, name=context.get().generate_new_name(), seq=self.seq[:ceil(cut_point)]) + right = replace(self, name=context.get().generate_new_name(), seq=self.seq[ceil(cut_point):]) + return (left, right) -@dataclass(frozen=True) -class GenotypedContig(Contig): - ref_name: str - group_ref: str - ref_seq: Optional[str] # The sequence of self.group_ref. None in cases where the reference organism is unknown. - match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. - def cut_query(self, cut_point: float) -> Tuple['GenotypedContig', 'GenotypedContig']: - """ Cuts query sequence in two parts with cut_point between them. """ +def cut_reference(self: AlignedContig, cut_point: float) -> Tuple[AlignedContig, AlignedContig]: + """ Cuts this alignment in two parts with cut_point between them. """ - cut_point = max(0, cut_point) - left = replace(self, name=context.get().generate_new_name(), seq=self.seq[:ceil(cut_point)]) - right = replace(self, name=context.get().generate_new_name(), seq=self.seq[ceil(cut_point):]) - return (left, right) + alignment_left, alignment_right = self.alignment.cut_reference(cut_point) + left = replace(self, name=context.get().generate_new_name(), alignment=alignment_left) + right = replace(self, name=context.get().generate_new_name(), alignment=alignment_right) + log(events.Cut(self, left, right, cut_point)) + return (left, right) -@dataclass(frozen=True) -class AlignedContig(GenotypedContig): - alignment: CigarHit - strand: Literal["forward", "reverse"] - - @staticmethod - def make(query: GenotypedContig, alignment: CigarHit, strand: Literal["forward", "reverse"]): - return AlignedContig( - alignment=alignment, - strand=strand, - seq=query.seq, - name=query.name, - ref_name=query.ref_name, - group_ref=query.group_ref, - ref_seq=query.ref_seq, - match_fraction=query.match_fraction) - - - 
def cut_reference(self, cut_point: float) -> Tuple['AlignedContig', 'AlignedContig']: - """ Cuts this alignment in two parts with cut_point between them. """ - - alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - left = replace(self, name=context.get().generate_new_name(), alignment=alignment_left) - right = replace(self, name=context.get().generate_new_name(), alignment=alignment_right) - log(events.Cut(self, left, right, cut_point)) - return (left, right) - - - def lstrip(self) -> 'AlignedContig': - """ - Trims the query sequence of the contig from its beginning up to the start of the - alignment. The CIGAR alignment is also updated to reflect the trimming. - """ - - alignment = self.alignment.lstrip_reference().lstrip_query() - q_remainder, query = self.cut_query(alignment.q_st - 0.5) - alignment = alignment.translate(0, -1 * alignment.q_st) - result = AlignedContig.make(query, alignment, self.strand) - log(events.LStrip(self, result)) - return result - +def lstrip(self: AlignedContig) -> AlignedContig: + """ + Trims the query sequence of the contig from its beginning up to the start of the + alignment. The CIGAR alignment is also updated to reflect the trimming. + """ - def rstrip(self) -> 'AlignedContig': - """ - Trims the query sequence of the contig from its end based on the end of the - alignment. The CIGAR alignment is also updated to reflect the trimming. 
- """ + alignment = self.alignment.lstrip_reference().lstrip_query() + q_remainder, query = cut_query(self, alignment.q_st - 0.5) + alignment = alignment.translate(0, -1 * alignment.q_st) + result = AlignedContig.make(query, alignment, self.strand) + log(events.LStrip(self, result)) + return result - alignment = self.alignment.rstrip_reference().rstrip_query() - query, q_remainder = self.cut_query(alignment.q_ei + 0.5) - result = AlignedContig.make(query, alignment, self.strand) - log(events.RStrip(self, result)) - return result +def rstrip(self: AlignedContig) -> AlignedContig: + """ + Trims the query sequence of the contig from its end based on the end of the + alignment. The CIGAR alignment is also updated to reflect the trimming. + """ - def overlaps(self, other) -> bool: - def intervals_overlap(x, y): - return x[0] <= y[1] and x[1] >= y[0] + alignment = self.alignment.rstrip_reference().rstrip_query() + query, q_remainder = cut_query(self, alignment.q_ei + 0.5) + result = AlignedContig.make(query, alignment, self.strand) + log(events.RStrip(self, result)) + return result - if self.group_ref != other.group_ref: - return False - return intervals_overlap((self.alignment.r_st, self.alignment.r_ei), - (other.alignment.r_st, other.alignment.r_ei)) +def overlap(a: AlignedContig, b: AlignedContig) -> bool: + def intervals_overlap(x, y): + return x[0] <= y[1] and x[1] >= y[0] + if a.group_ref != b.group_ref: + return False - def munge(self, other: 'AlignedContig') -> 'AlignedContig': - """ - Combines two adjacent contigs into a single contig by joining their - query sequences and alignments. 
- """ + return intervals_overlap((a.alignment.r_st, a.alignment.r_ei), + (b.alignment.r_st, b.alignment.r_ei)) - match_fraction = min(self.match_fraction, other.match_fraction) - ref_name = max([self, other], key=lambda x: x.alignment.ref_length).ref_name - query = GenotypedContig(seq=self.seq + other.seq, - name=context.get().generate_new_name(), - ref_name=ref_name, - group_ref=self.group_ref, - ref_seq=self.ref_seq, - match_fraction=match_fraction) - self_alignment = self.alignment - other_alignment = \ - other.alignment.translate( - query_delta=(-1 * other.alignment.q_st + self.alignment.q_ei + 1), - reference_delta=0) - alignment = self_alignment.connect(other_alignment) +def munge(self: AlignedContig, other: AlignedContig) -> AlignedContig: + """ + Combines two adjacent contigs into a single contig by joining their + query sequences and alignments. + """ - ret = AlignedContig.make(query=query, alignment=alignment, strand=self.strand) - log(events.Munge(self, other, ret)) - return ret + match_fraction = min(self.match_fraction, other.match_fraction) + ref_name = max([self, other], key=lambda x: x.alignment.ref_length).ref_name + query = GenotypedContig(seq=self.seq + other.seq, + name=context.get().generate_new_name(), + ref_name=ref_name, + group_ref=self.group_ref, + ref_seq=self.ref_seq, + match_fraction=match_fraction) + + self_alignment = self.alignment + other_alignment = \ + other.alignment.translate( + query_delta=(-1 * other.alignment.q_st + self.alignment.q_ei + 1), + reference_delta=0) + alignment = self_alignment.connect(other_alignment) + + ret = AlignedContig.make(query=query, alignment=alignment, strand=self.strand) + log(events.Munge(self, other, ret)) + return ret def sliding_window(sequence: Iterable[T]) -> Iterable[Tuple[Optional[T], T, Optional[T]]]: @@ -159,18 +129,18 @@ def combine_contigs(parts: List[AlignedContig]) -> AlignedContig: Combine a list of contigs into a single AlignedContig by trimming and merging overlapping parts. 
Left-trimming and right-trimming occur at any shared overlapping points - between adjacent parts. AlignedContig.munge() is used to combine contiguous parts without overlap. + between adjacent parts. munge() is used to combine contiguous parts without overlap. """ stripped_parts = [] for prev_part, part, next_part in sliding_window(parts): if prev_part is not None: - part = part.lstrip() + part = lstrip(part) if next_part is not None: - part = part.rstrip() + part = rstrip(part) stripped_parts.append(part) - ret = reduce(AlignedContig.munge, stripped_parts) + ret = reduce(munge, stripped_parts) log(events.Combine(stripped_parts, ret)) return ret @@ -256,10 +226,10 @@ def is_out_of_order(name: str) -> bool: end = next_contig.alignment.q_st - 1 if isinstance(next_contig, AlignedContig) else len(contig.seq) - 1 if prev_contig is not None or is_out_of_order(original.name): - contig = contig.lstrip() + contig = lstrip(contig) log(events.InitialStrip(original, start, original.alignment.q_st - 1)) if next_contig is not None or is_out_of_order(original.name): - contig = contig.rstrip() + contig = rstrip(contig) log(events.InitialStrip(original, original.alignment.q_ei + 1, end)) yield contig @@ -295,18 +265,18 @@ def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: return aseq1, aseq2 -def find_all_overlapping_contigs(self, aligned_contigs): +def find_all_overlapping_contigs(self: AlignedContig, aligned_contigs): """" Yield all contigs from a collection that overlap with a given contig. Contigs are considered overlapping if they have overlapping intervals on the same reference genome. """ for other in aligned_contigs: - if self.overlaps(other): + if overlap(self, other): yield other -def find_overlapping_contig(self, aligned_contigs): +def find_overlapping_contig(self: AlignedContig, aligned_contigs): """ Find the single contig in a collection that overlaps the most with a given contig. 
It returns the contig with the maximum overlapped reference length with the given contig (self). @@ -394,12 +364,12 @@ def stitch_2_contigs(left, right): """ # Cut in 4 parts. - left_remainder, left_overlap = left.cut_reference(right.alignment.r_st - 0.5) - right_overlap, right_remainder = right.cut_reference(left.alignment.r_ei + 0.5) - left_overlap = left_overlap.rstrip().lstrip() - right_overlap = right_overlap.lstrip().rstrip() - left_remainder = left_remainder.rstrip() - right_remainder = right_remainder.lstrip() + left_remainder, left_overlap = cut_reference(left, right.alignment.r_st - 0.5) + right_overlap, right_remainder = cut_reference(right, left.alignment.r_ei + 0.5) + left_overlap = lstrip(rstrip(left_overlap)) + right_overlap = lstrip(rstrip(right_overlap)) + left_remainder = rstrip(left_remainder) + right_remainder = lstrip(right_remainder) log(events.StitchCut(left, right, left_overlap, right_overlap, left_remainder, right_remainder)) # Align overlapping parts, then recombine based on concordance. @@ -407,8 +377,8 @@ def stitch_2_contigs(left, right): concordance = calculate_concordance(aligned_left, aligned_right) aligned_left_cutpoint, aligned_right_cutpoint, max_concordance_index = \ concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned_right, concordance) - left_overlap_take, left_overlap_drop = left_overlap.cut_reference(aligned_left_cutpoint) - right_overlap_drop, right_overlap_take = right_overlap.cut_reference(aligned_right_cutpoint) + left_overlap_take, left_overlap_drop = cut_reference(left_overlap, aligned_left_cutpoint) + right_overlap_drop, right_overlap_take = cut_reference(right_overlap, aligned_right_cutpoint) # Log it. 
average_concordance = Fraction(sum(concordance) / (len(concordance) or 1)) @@ -493,7 +463,7 @@ def calculate_cumulative_coverage(contigs) -> List[Tuple[int, int]]: current_interval = (current.alignment.r_st, current.alignment.r_ei) # Create a map of cumulative coverage for contigs - overlaping_contigs = [x for x in contigs if x != current and x.overlaps(current)] + overlaping_contigs = [x for x in contigs if x != current and overlap(current, x)] cumulative_coverage = calculate_cumulative_coverage(overlaping_contigs) # Check if the current contig is covered by the cumulative coverage intervals @@ -555,9 +525,9 @@ def try_split(contig): if covered(contig, gap): midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + contig.alignment.epsilon - left_part, right_part = contig.cut_reference(midpoint) - left_part = left_part.rstrip() - right_part = right_part.lstrip() + left_part, right_part = cut_reference(contig, midpoint) + left_part = rstrip(left_part) + right_part = lstrip(right_part) contigs.remove(contig) contigs.append(left_part) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 045e3d47b..ffd17c0dc 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -20,7 +20,7 @@ from micall.core.project_config import ProjectConfig from micall.utils.alignment_wrapper import align_nucs -from micall.core.contig_stitcher import Contig, GenotypedContig, AlignedContig +from micall.utils.contig_stitcher_contigs import Contig, GenotypedContig, AlignedContig from micall.utils.cigar_tools import CigarHit import micall.utils.contig_stitcher_events as events @@ -520,7 +520,7 @@ def graph_sum(graph_a, graph_b): def symmetric_closure(graph): return graph_sum(graph, inverse_graph(graph)) - def record_contig(contig: GenotypedContig, parents: List[GenotypedContig]): + def record_contig(contig: GenotypedContig, parents: Iterable[GenotypedContig]): complete_contig_map[contig.name] = contig if [contig.name] != [parent.name for parent in parents]: for 
parent in parents: diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index fc516def9..955655c10 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -5,7 +5,7 @@ import pytest import micall.core.contig_stitcher as stitcher -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig, disambiguate_concordance +from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig, disambiguate_concordance, lstrip, rstrip from micall.core.plot_contigs import plot_stitcher_coverage from micall.tests.utils import MockAligner, fixed_random_seed from micall.tests.test_denovo import check_hcv_db # activates the fixture @@ -721,7 +721,7 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) - assert all(x.seq == x.lstrip().rstrip().seq for x in results) + assert all(x.seq == lstrip(rstrip(x)).seq for x in results) assert { contig.seq for contig in contigs } \ == { contig.seq for contig in results } @@ -746,10 +746,10 @@ def test_stitching_partial_align(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) - assert all(x.seq != x.lstrip().rstrip().seq for x in results) + assert all(x.seq != lstrip(rstrip(x)).seq for x in results) assert { contig.seq for contig in contigs } \ - != { contig.lstrip().rstrip().seq for contig in results } + != { lstrip(rstrip(contig)).seq for contig in results } def test_partial_align_consensus(exact_aligner, visualizer): @@ -1080,10 +1080,10 @@ def test_reverse_complement_match_with_padding(projects, visualizer): results = list(stitch_consensus(contigs)) assert 
len(results) == 1 assert results[0].seq == ref_part - assert len(results[0].lstrip().seq) == len(ref_part) - 24 - assert len(results[0].rstrip().seq) == len(ref_part) - 27 - assert results[0].rstrip().seq == ref_part[:-27] # 27 Gs on the right - assert results[0].lstrip().seq == ref_part[24:] # 24 Ts on the left + assert len(lstrip(results[0]).seq) == len(ref_part) - 24 + assert len(rstrip(results[0]).seq) == len(ref_part) - 27 + assert rstrip(results[0]).seq == ref_part[:-27] # 27 Gs on the right + assert lstrip(results[0]).seq == ref_part[24:] # 24 Ts on the left assert len(visualizer().elements) > len(contigs) @@ -1111,10 +1111,10 @@ def test_multiple_reverse_complement_matches(projects, visualizer): assert len(results) == 1 assert len(results[0].seq) == len(ref_part) assert results[0].seq == ref_part - assert len(results[0].lstrip().seq) == len(ref_part) - 24 - assert len(results[0].rstrip().seq) == len(ref_part) - 27 - assert results[0].lstrip().seq == ref_part[24:] - assert results[0].rstrip().seq == ref_part[:-27] + assert len(lstrip(results[0]).seq) == len(ref_part) - 24 + assert len(rstrip(results[0]).seq) == len(ref_part) - 27 + assert lstrip(results[0]).seq == ref_part[24:] + assert rstrip(results[0]).seq == ref_part[:-27] assert len(visualizer().elements) > len(contigs) @@ -1143,10 +1143,10 @@ def test_multiple_reverse_complement_matches_out_of_order(projects, visualizer): assert len(results) == 1 assert len(results[0].seq) == len(ref_part) assert results[0].seq == ref_part - assert len(results[0].lstrip().seq) == len(ref_part) - 24 - assert len(results[0].rstrip().seq) == len(ref_part) - 27 - assert results[0].lstrip().seq == ref_part[24:] - assert results[0].rstrip().seq == ref_part[:-27] + assert len(lstrip(results[0]).seq) == len(ref_part) - 24 + assert len(rstrip(results[0]).seq) == len(ref_part) - 27 + assert lstrip(results[0]).seq == ref_part[24:] + assert rstrip(results[0]).seq == ref_part[:-27] assert len(visualizer().elements) > len(contigs) 
diff --git a/micall/utils/contig_stitcher_contigs.py b/micall/utils/contig_stitcher_contigs.py new file mode 100644 index 000000000..49117e70f --- /dev/null +++ b/micall/utils/contig_stitcher_contigs.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass +from typing import Optional, Literal + +from micall.utils.cigar_tools import CigarHit + + +@dataclass(frozen=True) +class Contig: + name: str + seq: str + + +@dataclass(frozen=True) +class GenotypedContig(Contig): + ref_name: str + group_ref: str + ref_seq: Optional[str] # The sequence of self.group_ref. None in cases where the reference organism is unknown. + match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. + + +@dataclass(frozen=True) +class AlignedContig(GenotypedContig): + alignment: CigarHit + strand: Literal["forward", "reverse"] + + @staticmethod + def make(query: GenotypedContig, alignment: CigarHit, strand: Literal["forward", "reverse"]): + return AlignedContig( + alignment=alignment, + strand=strand, + seq=query.seq, + name=query.name, + ref_name=query.ref_name, + group_ref=query.group_ref, + ref_seq=query.ref_seq, + match_fraction=query.match_fraction) diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index e3ffe5b2a..c442f30c3 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -1,154 +1,156 @@ from typing import Union, List, Tuple, Literal from dataclasses import dataclass from fractions import Fraction + from micall.utils.cigar_tools import CigarHit +from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig @dataclass(frozen=True) class Cut: - original: 'Contig' - left: 'Contig' - right: 'Contig' + original: AlignedContig + left: AlignedContig + right: AlignedContig cut_point: float def __str__(self) -> str: - return f"Created contigs {self.left.name} at 
{self.left.alignment} and {self.right.name} at {self.right.alignment} by cutting {self.original.name} at {self.original.alignment} at cut point = {round(self.cut_point, 1)}." + return f"Created contigs {self.left.name!r} at {self.left.alignment} and {self.right.name!r} at {self.right.alignment} by cutting {self.original.name!r} at {self.original.alignment} at cut point = {round(self.cut_point, 1)}." @dataclass(frozen=True) class LStrip: - original: 'AlignedContig' - result: 'AlignedContig' + original: AlignedContig + result: AlignedContig def __str__(self) -> str: - return f"Doing lstrip of {self.original.name} at {self.original.alignment} (len {len(self.original.seq)}) resulted in {self.result.name} at {self.result.alignment} (len {len(self.result.seq)})." + return f"Doing lstrip of {self.original.name!r} at {self.original.alignment} (len {len(self.original.seq)}) resulted in {self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)})." @dataclass(frozen=True) class RStrip: - original: 'AlignedContig' - result: 'AlignedContig' + original: AlignedContig + result: AlignedContig def __str__(self) -> str: - return f"Doing rstrip of {self.original.name} at {self.original.alignment} (len {len(self.original.seq)}) resulted in {self.result.name} at {self.result.alignment} (len {len(self.result.seq)})." + return f"Doing rstrip of {self.original.name!r} at {self.original.alignment} (len {len(self.original.seq)}) resulted in {self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)})." @dataclass(frozen=True) class Munge: - left: 'AlignedContig' - right: 'AlignedContig' - result: 'AlignedContig' + left: AlignedContig + right: AlignedContig + result: AlignedContig def __str__(self) -> str: - return f"Munged contigs {self.left.name} at {self.left.alignment} with {self.right.name} at {self.right.alignment} resulting in {self.result.name} at {self.result.alignment}." 
+ return f"Munged contigs {self.left.name!r} at {self.left.alignment} with {self.right.name!r} at {self.right.alignment} resulting in {self.result.name!r} at {self.result.alignment}." @dataclass(frozen=True) class Combine: - contigs: List['AlignedContig'] - result: 'AlignedContig' + contigs: List[AlignedContig] + result: AlignedContig def __str__(self) -> str: - return f"Created a frankenstein {self.result.name} at {self.result.alignment} (len {len(self.result.seq)}) from {[f'{x.name!r} at {x.alignment} (len {len(x.seq)})' for x in self.contigs]}." + return f"Created a frankenstein {self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)}) from {[f'{x.name!r} at {x.alignment} (len {len(x.seq)})' for x in self.contigs]}." @dataclass(frozen=True) class NoRef: - contig: 'GenotypedContig' + contig: GenotypedContig def __str__(self) -> str: - return f"Contig {self.contig.name} not aligned - no reference." + return f"Contig {self.contig.name!r} not aligned - no reference." @dataclass(frozen=True) class InitialHit: - contig: 'GenotypedContig' + contig: GenotypedContig index: int hit: CigarHit strand: Literal["forward", "reverse"] def __str__(self) -> str: - return f"Part {self.index} of contig {self.contig.name} aligned at {self.hit}{'' if self.strand == 'forward' else ' (rev)'}." + return f"Part {self.index} of contig {self.contig.name!r} aligned at {self.hit}{'' if self.strand == 'forward' else ' (rev)'}." @dataclass(frozen=True) class ZeroHits: - contig: 'GenotypedContig' + contig: GenotypedContig def __str__(self) -> str: - return f"Contig {self.contig.name} not aligned - backend's choice." + return f"Contig {self.contig.name!r} not aligned - backend's choice." @dataclass(frozen=True) class StrandConflict: - contig: 'GenotypedContig' + contig: GenotypedContig def __str__(self) -> str: - return f"Discarding contig {self.contig.name} because it aligned both in forward and reverse sense." 
+ return f"Discarding contig {self.contig.name!r} because it aligned both in forward and reverse sense." -@dataclass +@dataclass(frozen=True) class ReverseComplement: - contig: 'GenotypedContig' - result: 'GenotypedContig' + contig: GenotypedContig + result: GenotypedContig def __str__(self) -> str: - return f"Reverse complemented contig {self.contig.name}." + return f"Reverse complemented contig {self.contig.name!r}." @dataclass(frozen=True) class HitNumber: - contig: 'GenotypedContig' + contig: GenotypedContig initial: List[Tuple[CigarHit, Literal["reverse", "forward"]]] connected: List[CigarHit] def __str__(self) -> str: - return f"Contig {self.contig.name} produced {len(self.initial)} aligner hits. After connecting them, the number became {len(self.connected)}." + return f"Contig {self.contig.name!r} produced {len(self.initial)} aligner hits. After connecting them, the number became {len(self.connected)}." @dataclass(frozen=True) class ConnectedHit: - contig: 'GenotypedContig' - part: 'AlignedContig' + contig: GenotypedContig + part: AlignedContig index: int def __str__(self) -> str: - return f"Part {self.index} of contig {self.contig.name} re-aligned as {self.part.name} at {self.part.alignment}{'' if self.part.strand == 'forward' else ' (rev)'}." + return f"Part {self.index} of contig {self.contig.name!r} re-aligned as {self.part.name!r} at {self.part.alignment}{'' if self.part.strand == 'forward' else ' (rev)'}." @dataclass(frozen=True) class InitialStrip: - contig: 'AlignedContig' + contig: AlignedContig q_st: int q_ei: int def __str__(self) -> str: - return f"Trimming (strip) contig {self.contig.name} from {self.q_st} to {self.q_ei}." + return f"Trimming (strip) contig {self.contig.name!r} from {self.q_st} to {self.q_ei}." 
@dataclass(frozen=True) class StitchCut: - left: 'AlignedContig' - right: 'AlignedContig' - left_overlap: 'AlignedContig' - right_overlap: 'AlignedContig' - left_remainder: 'AlignedContig' - right_remainder: 'AlignedContig' + left: AlignedContig + right: AlignedContig + left_overlap: AlignedContig + right_overlap: AlignedContig + left_remainder: AlignedContig + right_remainder: AlignedContig def __str__(self) -> str: - return f"Stitching {self.left.name} at {self.left.alignment} (len {len(self.left.seq)}) with {self.right.name} at {self.right.alignment} (len {len(self.right.seq)}). The left_overlap {self.left_overlap.name} is at {self.left_overlap.alignment} (len {len(self.left_overlap.seq)}) and the right_overlap {self.right_overlap.name} is at {self.right_overlap.alignment} (len {len(self.right_overlap.seq)})." + return f"Stitching {self.left.name!r} at {self.left.alignment} (len {len(self.left.seq)}) with {self.right.name!r} at {self.right.alignment} (len {len(self.right.seq)}). The left_overlap {self.left_overlap.name!r} is at {self.left_overlap.alignment} (len {len(self.left_overlap.seq)}) and the right_overlap {self.right_overlap.name!r} is at {self.right_overlap.alignment} (len {len(self.right_overlap.seq)})." 
@dataclass(frozen=True) class Overlap: - left: 'AlignedContig' - right: 'AlignedContig' - left_overlap: 'AlignedContig' - right_overlap: 'AlignedContig' - left_remainder: 'AlignedContig' - right_remainder: 'AlignedContig' - left_take: 'AlignedContig' - right_take: 'AlignedContig' + left: AlignedContig + right: AlignedContig + left_overlap: AlignedContig + right_overlap: AlignedContig + left_remainder: AlignedContig + right_remainder: AlignedContig + left_take: AlignedContig + right_take: AlignedContig concordance: List[Fraction] average: Fraction cut_point: int @@ -158,68 +160,68 @@ def __str__(self) -> str: average_concordance = round(self.average * 100) cut_point_location_scaled = round(self.cut_point_scaled * 100) concordance_str = ', '.join(str(int(round(x * 100)) / 100) for x in self.concordance) - return f"Created overlap contigs {self.left_take.name} at {self.left_overlap.alignment} and {self.right_take.name} at {self.right_take.alignment} based on parts of {self.left.name} and {self.right.name}, with avg. concordance {average_concordance}%, cut point at {cut_point_location_scaled}%, and full concordance [{concordance_str}]." + return f"Created overlap contigs {self.left_take.name!r} at {self.left_overlap.alignment} and {self.right_take.name!r} at {self.right_take.alignment} based on parts of {self.left.name!r} and {self.right.name!r}, with avg. concordance {average_concordance}%, cut point at {cut_point_location_scaled}%, and full concordance [{concordance_str}]." @dataclass(frozen=True) class NoOverlap: - contig: 'AlignedContig' + contig: AlignedContig def __str__(self) -> str: - return f"Nothing overlaps with {self.contig.name}." + return f"Nothing overlaps with {self.contig.name!r}." 
@dataclass(frozen=True) class Stitch: - left: 'AlignedContig' - right: 'AlignedContig' - result: 'AlignedContig' + left: AlignedContig + right: AlignedContig + result: AlignedContig def __str__(self) -> str: - return f"Stitching {self.left.name} with {self.right.name} results in {self.result.name} at {self.result.alignment} (len {len(self.result.seq)})." + return f"Stitching {self.left.name!r} with {self.right.name!r} results in {self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)})." @dataclass(frozen=True) class Drop: - contig: 'AlignedContig' - covering: List['AlignedContig'] + contig: AlignedContig + covering: List[AlignedContig] def __str__(self) -> str: - return f"Dropped contig {self.contig.name} as it is completely covered by these contigs: {', '.join(repr(x.name) for x in self.covering)}." + return f"Dropped contig {self.contig.name!r} as it is completely covered by these contigs: {', '.join(repr(x.name) for x in self.covering)}." @dataclass(frozen=True) class IgnoreGap: - contig: 'AlignedContig' - gap: 'CigarHit' + contig: AlignedContig + gap: CigarHit def __str__(self) -> str: - return f"Ignored insignificant gap of {self.contig.name}, {self.gap}." + return f"Ignored insignificant gap of {self.contig.name!r}, {self.gap}." @dataclass(frozen=True) class SplitGap: - contig: 'AlignedContig' - gap: 'CigarHit' - left: 'AlignedContig' - right: 'AlignedContig' + contig: AlignedContig + gap: CigarHit + left: AlignedContig + right: AlignedContig def __str__(self) -> str: - return f"Split contig {self.contig.name} at {self.contig.alignment} around its gap at [{self.gap.q_st}, {self.gap.q_ei}]->[{self.gap.r_st}, {self.gap.r_ei}]. Left part: {self.left.name} at {self.left.alignment}, right part: {self.right.name} at {self.right.alignment}." + return f"Split contig {self.contig.name!r} at {self.contig.alignment} around its gap at [{self.gap.q_st}, {self.gap.q_ei}]->[{self.gap.r_st}, {self.gap.r_ei}]. 
Left part: {self.left.name!r} at {self.left.alignment}, right part: {self.right.name!r} at {self.right.alignment}." @dataclass(frozen=True) class Intro: - contig: 'GenotypedContig' + contig: GenotypedContig def __str__(self) -> str: - return f"Introduced contig {self.contig.name} (seq = {self.contig.seq}) of ref {self.contig.ref_name}, group_ref {self.contig.group_ref} (seq = {self.contig.ref_seq}), and length {len(self.contig.seq)}." + return f"Introduced contig {self.contig.name!r} (seq = {self.contig.seq}) of ref {self.contig.ref_name!r}, group_ref {self.contig.group_ref} (seq = {self.contig.ref_seq}), and length {len(self.contig.seq)}." @dataclass(frozen=True) class FinalCombine: - contigs: List['AlignedContig'] - result: 'AlignedContig' + contigs: List[AlignedContig] + result: AlignedContig def __str__(self) -> str: return f"Combining these contigs for final output for {self.result.group_ref}: {['%r at %s (len %s)' % (x.name, x.alignment, len(x.seq)) for x in self.contigs]}." From 49558fc19c49f1a6359b97242da6863022eba760 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 26 Feb 2024 07:31:12 -0800 Subject: [PATCH 215/485] Contig stitcher: fix theoretical bug in plot_contigs.py --- micall/core/plot_contigs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index ffd17c0dc..b1df677e0 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -432,11 +432,12 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: strip_set: Set[Tuple[str, int, int]] = set() def remove_intermediate_edges(graph): + tr_cl = transitive_closure(graph) ret = {} for parent, children in graph.items(): lst = [] for child in children: - if all(other not in graph.get(child, []) for other in children): + if all(other not in tr_cl.get(child, []) for other in children): lst.append(child) ret[parent] = lst return ret From 800070427b2063d7c24f27c2d5cdca996010a929 Mon Sep 17 
00:00:00 2001 From: Vitaliy Mysak Date: Mon, 26 Feb 2024 13:50:15 -0800 Subject: [PATCH 216/485] Cigar tools: remove dead code in the tests Also add a check for the text of the error messages. Co-authored-by: Don Kirkby --- micall/tests/test_cigar_tools.py | 47 ++++++++++---------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 551142ef6..47bf240c3 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -46,8 +46,6 @@ {0: 0, 1: 1, 2: 4, 3: 5, 4: 5, 5: 5, 6: 6, 7: 7}), ('2=1X2N1N2=1H2S', {0: 0, 1: 1, 2: 2, 6: 3, 7: 4}, {0: 0, 1: 1, 2: 2, 3: 2, 4: 2, 5: 2, 6: 3, 7: 4}), - ('2M2D2M2I2M', {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, - {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3, 6: 6, 7: 7}), ('3=1X2N1N2=1H2S', {0: 0, 1: 1, 2: 2, 3: 3, 7: 4, 8: 5}, {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 3, 7: 4, 8: 5}), @@ -65,10 +63,7 @@ @pytest.mark.parametrize("cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases]) def test_cigar_to_coordinate_mapping(cigar_str, expected_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping - assert expected_mapping == mapping.ref_to_query - assert expected_mapping == {i: mapping.ref_to_query[i] - for i in mapping.ref_to_query.keys()} @pytest.mark.parametrize("cigar_str", [x[0] for x in cigar_mapping_cases]) @@ -87,16 +82,9 @@ def test_cigar_to_coordinate_bijection_property(cigar_str): def test_cigar_to_coordinate_mapping_leftmax(cigar_str, expected_leftmax_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping - def test(): - fullrange = {i: mapping.ref_to_query.left_max(i) - for i in mapping.ref_to_query.domain} - assert expected_leftmax_mapping == fullrange - - if isinstance(expected_leftmax_mapping, Exception): - with pytest.raises(type(expected_leftmax_mapping)): - test() - else: - test() + fullrange = {i: mapping.ref_to_query.left_max(i) + for i in mapping.ref_to_query.domain} + assert 
expected_leftmax_mapping == fullrange @pytest.mark.parametrize("cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases]) @@ -119,18 +107,10 @@ def test_cigar_hit_to_coordinate_mapping_leftmax(cigar_str, expected_leftmax_map cigar = Cigar.coerce(cigar_str) hit = CigarHit(cigar, r_st=5, r_ei=(5 + cigar.ref_length - 1), q_st=7, q_ei=(7 + cigar.query_length - 1)) mapping = hit.coordinate_mapping - - def test(expected): - # Coordinates are translated by q_st and r_st. - fullrange = {i: mapping.ref_to_query.left_max(i) - for i in mapping.ref_to_query.domain} - assert expected == fullrange - - if isinstance(expected_leftmax_mapping, Exception): - with pytest.raises(type(expected_leftmax_mapping)): - test(expected_leftmax_mapping) - else: - test({k + hit.r_st: v + hit.q_st if v is not None else v for (k, v) in expected_leftmax_mapping.items()}) + expected = {k + hit.r_st: v + hit.q_st if v is not None else v for (k, v) in expected_leftmax_mapping.items()} + fullrange = {i: mapping.ref_to_query.left_max(i) + for i in mapping.ref_to_query.domain} + assert expected == fullrange def test_invalid_operation_in_cigar_string(): @@ -276,19 +256,19 @@ def parsed_hit(string): # Negative cases ('9M9I9M@1->1', 20.5, - IndexError("20.5 is bigger than reference (18)")), + IndexError("Cut point out of reference bounds")), # 20.5 is bigger than reference (18) ('@2->2', 2.5, - IndexError("Empty string cannot be cut")), + IndexError("Cut point out of reference bounds")), # Empty string cannot be cut ('@2->2', 1.5, - IndexError("Empty string cannot be cut")), + IndexError("Cut point out of reference bounds")), # Empty string cannot be cut ('9I@1->1', 3.5, - IndexError("Out of reference bounds")), + IndexError("Cut point out of reference bounds")), ('9M@1->1', 4, - ValueError("Cut point must not be an integer")), + ValueError("Cut accepts fractions, not integers")), ] @@ -298,8 +278,9 @@ def test_cigar_hit_ref_cut(hit, cut_point, expected_result): hit = parsed_hit(hit) if 
isinstance(expected_result, Exception): - with pytest.raises(type(expected_result)): + with pytest.raises(type(expected_result)) as exc_info: hit.cut_reference(cut_point) + assert exc_info.value.args == expected_result.args else: expected_result = list(map(parsed_hit, expected_result)) From 442c4e097d48fcdc5dc3040748c7a4813f356674 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 27 Feb 2024 08:47:38 -0800 Subject: [PATCH 217/485] Add initial Ruff configuration --- .ruff.toml | 62 +++++++++++++++++++++++++++++++++++++++++++ requirements-test.txt | 1 + 2 files changed, 63 insertions(+) create mode 100644 .ruff.toml diff --git a/.ruff.toml b/.ruff.toml new file mode 100644 index 000000000..cd7e4b28e --- /dev/null +++ b/.ruff.toml @@ -0,0 +1,62 @@ + +[lint.per-file-ignores] + +# These are pre-file ignores that were added for compatibility purposes when ruff was introduced into MiCall. +# TODO: resolve these warnings insteads of ignoring them. + +# Syntax error: ignore unexpected token in this file +"micall/utils/make_hcv_genotype_refs.py" = ["E999"] + +# File with unused imports and local variable assignments +"micall/core/plot_simple.py" = ["F401", "F841"] + +# Ignore bare except usage in project configuration +"micall/core/project_config.py" = ["E722"] + +# Unused import in remap module +"micall/core/remap.py" = ["F401"] + +# Do not compare types and bare excepts in pssm_lib module +"micall/g2p/pssm_lib.py" = ["E722", "E721"] + +# F-string errors without placeholders in tests +"micall/tests/svg_differ.py" = ["F541"] + +# Unused import in test_aln2counts_report module +"micall/tests/test_aln2counts_report.py" = ["F401"] + +# Multiple issues in test_consensus_aligner: unused imports, multi-statement line, and unused local variable +"micall/tests/test_consensus_aligner.py" = ["F401", "E701", "F841"] + +# Unused import in test_primer_tracker module +"micall/tests/test_primer_tracker.py" = ["F401"] + +# F-string errors without placeholders in test_remap module 
+"micall/tests/test_remap.py" = ["F541"] + +# Unused import in test_sample module +"micall/tests/test_sample.py" = ["F401"] + +# Unused imports, undefined name, and type comparison in aln2counts_simplify module +"micall/utils/aln2counts_simplify.py" = ["F401", "F821", "E721"] + +# Unused local variable in compare_mapping module +"micall/utils/compare_mapping.py" = ["F841"] + +# Multiple issues in conseq_compare: unused sys, redefinitions, and unused local variables +"micall/utils/conseq_compare.py" = ["F811", "F401", "F841"] + +# Comparisons to None should use `is` or `is not` in dd module +"micall/utils/dd.py" = ["E711"] + +# Unused imports and local variable in find_reads_in_sam module +"micall/utils/find_reads_in_sam.py" = ["F401", "F841"] + +# F-string errors without placeholders in primer_tracker module +"micall/utils/primer_tracker.py" = ["F541"] + +# F-string errors without placeholders in ref_aligner module +"micall/utils/ref_aligner.py" = ["F541"] + +# Unused local variable in release_test_compare script +"release_test_compare.py" = ["F841"] diff --git a/requirements-test.txt b/requirements-test.txt index d0ab5b830..52e6f08b6 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -7,6 +7,7 @@ coverage==7.2.7 pandas==2.0.2 seaborn==0.12.2 ete3 +ruff==0.2.2 # For reading HCV rules from an Excel file. openpyxl==3.1.2 From e5ac1498210778ae143c7efb14a15c727b2e67a7 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 27 Feb 2024 09:22:48 -0800 Subject: [PATCH 218/485] Run Ruff on CI --- .github/workflows/build-and-test.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index de5ebe481..0a8cb246b 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -60,6 +60,13 @@ jobs: python -m pip install --upgrade pip pip install -r requirements-test.txt + - name: Linter check + # Have to clean up alignment build files. 
Otherwise, they'll get reused + # in Singularity build with the wrong GLIBC libraries. + run: | + pip install -r requirements-test.txt + ruff check + - name: Test with pytest run: coverage run --source=micall/core,micall/g2p,micall/resistance,micall/monitor -m pytest From f44e405a3d5582a6be3aae1dd7124d2f76bb8ece Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 27 Feb 2024 09:28:28 -0800 Subject: [PATCH 219/485] Contig stitcher: fix all Ruff warnings --- micall/core/contig_stitcher.py | 22 +- micall/core/denovo.py | 2 +- micall/core/plot_contigs.py | 14 +- micall/tests/test_cigar_tools.py | 766 +++++---- micall/tests/test_contig_stitcher.py | 1774 +++++++++++---------- micall/tests/test_contig_stitcher_fuzz.py | 92 +- micall/tests/test_tests_utils.py | 1 - micall/tests/utils.py | 9 +- micall/utils/cigar_tools.py | 125 +- micall/utils/contig_stitcher_context.py | 3 +- micall/utils/contig_stitcher_contigs.py | 9 +- micall/utils/contig_stitcher_events.py | 111 +- 12 files changed, 1568 insertions(+), 1360 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 18f4752a7..8f1a2b341 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -250,7 +250,10 @@ def align_all_to_reference(contigs: Iterable[GenotypedContig]) -> Iterable[Genot def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: - """ Globally align two query sequences against each other and return the resulting aligned sequences in MSA format. """ + """ + Globally align two query sequences against each other + and return the resulting aligned sequences in MSA format. 
+ """ gap_open_penalty = 15 gap_extend_penalty = 3 @@ -337,11 +340,12 @@ def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned concordance_d = list(disambiguate_concordance(concordance)) sorted_concordance_indexes = sorted(range(len(concordance)), key=lambda i: concordance_d[i]) - remove_dashes = lambda s: ''.join(c for c in s if c != '-') + def remove_dashes(s): return ''.join(c for c in s if c != '-') for max_concordance_index in reversed(sorted_concordance_indexes): aligned_left_q_index = len(remove_dashes(aligned_left[:max_concordance_index])) - aligned_right_q_index = right_overlap.alignment.query_length - len(remove_dashes(aligned_right[max_concordance_index:])) + 1 + aligned_right_q_index = right_overlap.alignment.query_length - \ + len(remove_dashes(aligned_right[max_concordance_index:])) + 1 aligned_left_r_index = left_overlap.alignment.coordinate_mapping.query_to_ref.left_max(aligned_left_q_index) if aligned_left_r_index is None: aligned_left_r_index = left_overlap.alignment.r_st - 1 @@ -382,7 +386,6 @@ def stitch_2_contigs(left, right): # Log it. average_concordance = Fraction(sum(concordance) / (len(concordance) or 1)) - concordance_str = ', '.join(map(lambda x: str(int(round(x * 100)) / 100), concordance)) cut_point_location_scaled = max_concordance_index / (((len(concordance) or 1) - 1) or 1) log(events.Overlap(left, right, left_overlap, right_overlap, left_remainder, right_remainder, left_overlap_take, @@ -507,9 +510,8 @@ def covered(contig, gap): return any(covered_by(gap, other) for other in contigs if other != contig) def significant(gap): - # The size of the gap is unavoidably, to some point, arbitrary. Here we tried to adjust it to common gaps in HIV, as HIV is the primary test subject in MiCall. - # A notable feature of HIV-1 reverse transcription is the appearance of periodic deletions of approximately 21 nucleotides. 
These deletions have been reported to occur in the HIV-1 genome and are thought to be influenced by the structure of the viral RNA. Specifically, the secondary structures and foldings of the RNA can lead to pause sites for the reverse transcriptase, resulting in staggered alignment when the enzyme slips. This misalignment can cause the reverse transcriptase to "jump," leading to deletions in the newly synthesized DNA. - # The unusually high frequency of about 21-nucleotide deletions is believed to correspond to the pitch of the RNA helix, which reflects the spatial arrangement of the RNA strands. The 21 nucleotide cycle is an average measure and is thought to be associated with the length of one turn of the RNA helix, meaning that when reverse transcriptase slips and reattaches, it often does so one helical turn away from the original site. + # noinspection PyLongLine + # The size of the gap is unavoidably, to some point, arbitrary. Here we tried to adjust it to common gaps in HIV, as HIV is the primary test subject in MiCall. A notable feature of HIV-1 reverse transcription is the appearance of periodic deletions of approximately 21 nucleotides. These deletions have been reported to occur in the HIV-1 genome and are thought to be influenced by the structure of the viral RNA. Specifically, the secondary structures and foldings of the RNA can lead to pause sites for the reverse transcriptase, resulting in staggered alignment when the enzyme slips. This misalignment can cause the reverse transcriptase to "jump," leading to deletions in the newly synthesized DNA. The unusually high frequency of about 21-nucleotide deletions is believed to correspond to the pitch of the RNA helix, which reflects the spatial arrangement of the RNA strands. 
The 21 nucleotide cycle is an average measure and is thought to be associated with the length of one turn of the RNA helix, meaning that when reverse transcriptase slips and reattaches, it often does so one helical turn away from the original site. # noqa: E501 return gap.ref_length > 21 def try_split(contig): @@ -537,7 +539,8 @@ def try_split(contig): return process_queue: LifoQueue = LifoQueue() - for contig in contigs: process_queue.put(contig) + for contig in contigs: + process_queue.put(contig) while not process_queue.empty(): contig = process_queue.get() @@ -565,6 +568,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCont GroupRef = str + def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: contigs = list(stitch_contigs(contigs)) consensus_parts: Dict[GroupRef, List[AlignedContig]] = defaultdict(list) @@ -586,7 +590,7 @@ def combine(group_ref): def main(args): import argparse - from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. + from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. 
parser = argparse.ArgumentParser() parser.add_argument('contigs', type=argparse.FileType('r')) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 2b443dc6a..992aae257 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -47,7 +47,7 @@ def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> ty except KeyError: try: ref_seq = projects.getReference(group_ref) - except: + except KeyError: ref_seq = None yield GenotypedContig(name=record.name, diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index b1df677e0..65f5c0837 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -429,7 +429,6 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: query_position_map: Dict[str, Tuple[int, int]] = {} lstrip_map: Dict[str, str] = {} rstrip_map: Dict[str, str] = {} - strip_set: Set[Tuple[str, int, int]] = set() def remove_intermediate_edges(graph): tr_cl = transitive_closure(graph) @@ -632,7 +631,7 @@ def record_initial_hit(contig: GenotypedContig, hits: List[CigarHit]): elif isinstance(event, (events.IgnoreGap, events.InitialHit)): pass else: - x: NoReturn = event + _x: NoReturn = event raise RuntimeError(f"Unrecognized action or event: {event}") notransitive_parent_graph = remove_transitive_edges(complete_parent_graph) @@ -659,7 +658,6 @@ def extend_alive(contig_name): transitive_parent_graph = transitive_closure(parent_graph) transitive_children_graph = transitive_closure(children_graph) reduced_parent_graph = remove_intermediate_edges(transitive_parent_graph) - eqv_parent_graph = reflexive_closure(symmetric_closure(transitive_parent_graph)) sorted_roots = list(sorted(parent_name for parent_name in contig_map if parent_name not in parent_graph)) @@ -729,8 +727,10 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map, over yield True # Closing `takes` by parents - while list(copy_takes_one_side(combine_right_edge, overlap_lefttake_map, 
overlap_leftparent_map, overlap_left_sibling)): pass - while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map, overlap_right_sibling)): pass + while list(copy_takes_one_side(combine_right_edge, overlap_lefttake_map, overlap_leftparent_map, overlap_left_sibling)): + pass + while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map, overlap_right_sibling)): + pass final_nodes: List[str] = [] final_parts: Dict[str, bool] = {} @@ -794,7 +794,8 @@ def get_final_version(contig): return contig_map[name] def get_neighbour(part, lookup): - if not part: return None + if not part: + return None lst = list(get_neighbours(part, lookup)) ret = max(map(get_final_version, lst), key=lambda contig: contig.alignment.ref_length, default=None) return ret @@ -1041,7 +1042,6 @@ def add_section(title: str) -> None: landmarks_path = (Path(__file__).parent.parent / "data" / "landmark_references.yaml") landmark_groups = yaml.safe_load(landmarks_path.read_text()) - projects = ProjectConfig.loadDefault() figure = Figure() for group_ref in group_refs: matching_groups = [group for group in landmark_groups if group['coordinates'] == group_ref] diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py index 47bf240c3..13f43a8b9 100644 --- a/micall/tests/test_cigar_tools.py +++ b/micall/tests/test_cigar_tools.py @@ -1,66 +1,83 @@ import pytest -from typing import List, Tuple from math import floor -import itertools import re -from micall.utils.consensus_aligner import CigarActions -from micall.utils.cigar_tools import Cigar, CigarHit, connect_cigar_hits, CoordinateMapping +from micall.utils.cigar_tools import ( + Cigar, + CigarHit, + connect_cigar_hits, +) cigar_mapping_cases = [ # Simple cases - ('3M', {0: 0, 1: 1, 2: 2}, # exact mapping - {0: 0, 1: 1, 2: 2}), # closest mapping - ('1M1D1M', {0: 0, 2: 1}, # exact mapping - {0: 0, 1: 0, 2: 1}), # closest mapping - ('1M1I1M', {0: 0, 1: 2}, - {0: 0, 
1: 2}), - ('2M2D2M', {0: 0, 1: 1, 4: 2, 5: 3}, - {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3}), - ('2M2I2M', {0: 0, 1: 1, 2: 4, 3: 5}, - {0: 0, 1: 1, 2: 4, 3: 5}), - ('3M1D3M', {0: 0, 1: 1, 2: 2, 4: 3, 5: 4, 6: 5}, - {0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 5}), - ('3M1I3M', {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6}, - {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6}), - ('7M1I3M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}), - ('5M2D4M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 7: 5, 8: 6, 9: 7, 10: 8}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4, 7: 5, 8: 6, 9: 7, 10: 8}), - ('5M3I4M', {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}), - ('1M1D', {0: 0}, - {0: 0, 1: 0}), - ('1M1I', {0: 0}, - {0: 0}), - ('1I1M', {0: 1}, - {0: 1}), - ('1D1M', {1: 0}, - {1: 0, 0: None}), - + ("3M", {0: 0, 1: 1, 2: 2}, {0: 0, 1: 1, 2: 2}), # exact mapping # closest mapping + ("1M1D1M", {0: 0, 2: 1}, {0: 0, 1: 0, 2: 1}), # exact mapping # closest mapping + ("1M1I1M", {0: 0, 1: 2}, {0: 0, 1: 2}), + ("2M2D2M", {0: 0, 1: 1, 4: 2, 5: 3}, {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3}), + ("2M2I2M", {0: 0, 1: 1, 2: 4, 3: 5}, {0: 0, 1: 1, 2: 4, 3: 5}), + ( + "3M1D3M", + {0: 0, 1: 1, 2: 2, 4: 3, 5: 4, 6: 5}, + {0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 5}, + ), + ( + "3M1I3M", + {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6}, + {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6}, + ), + ( + "7M1I3M", + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}, + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}, + ), + ( + "5M2D4M", + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 7: 5, 8: 6, 9: 7, 10: 8}, + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4, 7: 5, 8: 6, 9: 7, 10: 8}, + ), + ( + "5M3I4M", + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}, + {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}, + ), + ("1M1D", {0: 0}, {0: 0, 1: 0}), + ("1M1I", {0: 0}, {0: 0}), + ("1I1M", {0: 1}, {0: 1}), + 
("1D1M", {1: 0}, {1: 0, 0: None}), # Multiple deletions and insertions - ('2M2D2M2I2M', {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, - {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3, 6: 6, 7: 7}), - ('2M2I2M2D2M', {0: 0, 1: 1, 2: 4, 3: 5, 6: 6, 7: 7}, - {0: 0, 1: 1, 2: 4, 3: 5, 4: 5, 5: 5, 6: 6, 7: 7}), - ('2=1X2N1N2=1H2S', {0: 0, 1: 1, 2: 2, 6: 3, 7: 4}, - {0: 0, 1: 1, 2: 2, 3: 2, 4: 2, 5: 2, 6: 3, 7: 4}), - ('3=1X2N1N2=1H2S', {0: 0, 1: 1, 2: 2, 3: 3, 7: 4, 8: 5}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 3, 7: 4, 8: 5}), - + ( + "2M2D2M2I2M", + {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, + {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3, 6: 6, 7: 7}, + ), + ( + "2M2I2M2D2M", + {0: 0, 1: 1, 2: 4, 3: 5, 6: 6, 7: 7}, + {0: 0, 1: 1, 2: 4, 3: 5, 4: 5, 5: 5, 6: 6, 7: 7}, + ), + ( + "2=1X2N1N2=1H2S", + {0: 0, 1: 1, 2: 2, 6: 3, 7: 4}, + {0: 0, 1: 1, 2: 2, 3: 2, 4: 2, 5: 2, 6: 3, 7: 4}, + ), + ( + "3=1X2N1N2=1H2S", + {0: 0, 1: 1, 2: 2, 3: 3, 7: 4, 8: 5}, + {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 3, 7: 4, 8: 5}, + ), # Edge cases - ('', {}, {}), - ('3I3D', {}, - {0: None, 1: None, 2: None}), - ('3D3I', {}, - {0: None, 1: None, 2: None}), - ('12I', {}, {}), - ('12D', {}, {k: None for k in range(12)}), + ("", {}, {}), + ("3I3D", {}, {0: None, 1: None, 2: None}), + ("3D3I", {}, {0: None, 1: None, 2: None}), + ("12I", {}, {}), + ("12D", {}, {k: None for k in range(12)}), ] -@pytest.mark.parametrize("cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases]) +@pytest.mark.parametrize( + "cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases] +) def test_cigar_to_coordinate_mapping(cigar_str, expected_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping assert expected_mapping == mapping.ref_to_query @@ -68,7 +85,7 @@ def test_cigar_to_coordinate_mapping(cigar_str, expected_mapping): @pytest.mark.parametrize("cigar_str", [x[0] for x in cigar_mapping_cases]) def test_cigar_to_coordinate_bijection_property(cigar_str): - inverse = lambda d: {v: k for k, v in d.items()} + 
def inverse(d): return {v: k for k, v in d.items()} mapping = Cigar.coerce(cigar_str).coordinate_mapping @@ -78,59 +95,86 @@ def test_cigar_to_coordinate_bijection_property(cigar_str): assert mapping.query_to_ref == inverse(inverse(mapping.query_to_ref)) -@pytest.mark.parametrize("cigar_str, expected_leftmax_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) +@pytest.mark.parametrize( + "cigar_str, expected_leftmax_mapping", [(x[0], x[2]) for x in cigar_mapping_cases] +) def test_cigar_to_coordinate_mapping_leftmax(cigar_str, expected_leftmax_mapping): mapping = Cigar.coerce(cigar_str).coordinate_mapping - fullrange = {i: mapping.ref_to_query.left_max(i) - for i in mapping.ref_to_query.domain} + fullrange = { + i: mapping.ref_to_query.left_max(i) for i in mapping.ref_to_query.domain + } assert expected_leftmax_mapping == fullrange -@pytest.mark.parametrize("cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases]) +@pytest.mark.parametrize( + "cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases] +) def test_cigar_hit_to_coordinate_mapping(cigar_str, expected_mapping): cigar = Cigar.coerce(cigar_str) - hit = CigarHit(cigar, r_st=5, r_ei=(5 + cigar.ref_length - 1), q_st=7, q_ei=(7 + cigar.query_length - 1)) + hit = CigarHit( + cigar, + r_st=5, + r_ei=(5 + cigar.ref_length - 1), + q_st=7, + q_ei=(7 + cigar.query_length - 1), + ) mapping = hit.coordinate_mapping # Coordinates are translated by q_st and r_st. 
- expected_mapping = {k + hit.r_st: v + hit.q_st for (k, v) in expected_mapping.items()} - assert mapping.ref_to_query.get(0, None) == None - assert mapping.query_to_ref.get(0, None) == None - assert expected_mapping \ - == {i: mapping.ref_to_query[i] - for i in mapping.ref_to_query.keys()} - - -@pytest.mark.parametrize("cigar_str, expected_leftmax_mapping", [(x[0], x[2]) for x in cigar_mapping_cases]) + expected_mapping = { + k + hit.r_st: v + hit.q_st for (k, v) in expected_mapping.items() + } + assert mapping.ref_to_query.get(0, None) is None + assert mapping.query_to_ref.get(0, None) is None + assert expected_mapping == { + i: mapping.ref_to_query[i] for i in mapping.ref_to_query.keys() + } + + +@pytest.mark.parametrize( + "cigar_str, expected_leftmax_mapping", [(x[0], x[2]) for x in cigar_mapping_cases] +) def test_cigar_hit_to_coordinate_mapping_leftmax(cigar_str, expected_leftmax_mapping): cigar = Cigar.coerce(cigar_str) - hit = CigarHit(cigar, r_st=5, r_ei=(5 + cigar.ref_length - 1), q_st=7, q_ei=(7 + cigar.query_length - 1)) + hit = CigarHit( + cigar, + r_st=5, + r_ei=(5 + cigar.ref_length - 1), + q_st=7, + q_ei=(7 + cigar.query_length - 1), + ) mapping = hit.coordinate_mapping - expected = {k + hit.r_st: v + hit.q_st if v is not None else v for (k, v) in expected_leftmax_mapping.items()} - fullrange = {i: mapping.ref_to_query.left_max(i) - for i in mapping.ref_to_query.domain} + expected = { + k + hit.r_st: v + hit.q_st if v is not None else v + for (k, v) in expected_leftmax_mapping.items() + } + fullrange = { + i: mapping.ref_to_query.left_max(i) for i in mapping.ref_to_query.domain + } assert expected == fullrange def test_invalid_operation_in_cigar_string(): with pytest.raises(ValueError): - Cigar.coerce('3M1Z3M') # Z operation is not implemented + Cigar.coerce("3M1Z3M") # Z operation is not implemented def test_invalid_operation_in_cigar_list(): with pytest.raises(ValueError): - Cigar.coerce([(3, 42)]) # Operation code "42" does not exist + 
Cigar.coerce([(3, 42)]) # Operation code "42" does not exist def test_invalid_cigar_string(): with pytest.raises(ValueError): - Cigar.coerce('3MMMMMM3M') # Too many Ms + Cigar.coerce("3MMMMMM3M") # Too many Ms with pytest.raises(ValueError): - Cigar.coerce('3') # Not enough Ms + Cigar.coerce("3") # Not enough Ms CIGAR_REGEX = re.compile(r"(.*)@([0-9]+)->([0-9]+)") + + def parsed_hit(string): match = CIGAR_REGEX.match(string) assert match, f"Cannot parse {string}" @@ -143,137 +187,75 @@ def parsed_hit(string): cigar_hit_ref_cut_cases = [ # Trivial cases - ('4M@1->1', 2.5, - ['2M@1->1', '2M@3->3']), - - ('9M@1->1', 3.5, - ['3M@1->1', '6M@4->4']), - - ('9M@1->1', 4.5, - ['4M@1->1', '5M@5->5']), - - ('9M@0->0', 3.5, - ['4M@0->0', '5M@4->4']), - + ("4M@1->1", 2.5, ["2M@1->1", "2M@3->3"]), + ("9M@1->1", 3.5, ["3M@1->1", "6M@4->4"]), + ("9M@1->1", 4.5, ["4M@1->1", "5M@5->5"]), + ("9M@0->0", 3.5, ["4M@0->0", "5M@4->4"]), # Simple cases - ('9M9D9M@1->1', 3.5, - ['3M@1->1', '6M9D9M@4->4']), - - ('9M9D9M@1->1', 20.5, - ['9M9D2M@1->1', '7M@12->21']), - - ('9M9I9M@1->1', 3.5, - ['3M@1->1', '6M9I9M@4->4']), - - ('9M9I9M@1->1', 13.5 or 27/2, - ['9M9I4M@1->1', '5M@23->14']), - - ('5M6I@1->1', 3.5, - ['3M@1->1', '2M6I@4->4']), - - ('6I5M@1->1', 3.5, - ['6I3M@1->1', '2M@10->4']), - - ('5M6D@1->1', 3.5, - ['3M@1->1', '2M6D@4->4']), - - ('6D5M@1->1', 3.5, - ['3D@1->1', '3D5M@1->4']), - - ('5M6D@1->1', 7.5, - ['5M2D@1->1', '4D@6->8']), - - ('6D5M@1->1', 7.5, - ['6D1M@1->1', '4M@2->8']), - - ('6D5M@1->1', 6.5, - ['6D@1->1', '5M@1->7']), - + ("9M9D9M@1->1", 3.5, ["3M@1->1", "6M9D9M@4->4"]), + ("9M9D9M@1->1", 20.5, ["9M9D2M@1->1", "7M@12->21"]), + ("9M9I9M@1->1", 3.5, ["3M@1->1", "6M9I9M@4->4"]), + ("9M9I9M@1->1", 13.5 or 27 / 2, ["9M9I4M@1->1", "5M@23->14"]), + ("5M6I@1->1", 3.5, ["3M@1->1", "2M6I@4->4"]), + ("6I5M@1->1", 3.5, ["6I3M@1->1", "2M@10->4"]), + ("5M6D@1->1", 3.5, ["3M@1->1", "2M6D@4->4"]), + ("6D5M@1->1", 3.5, ["3D@1->1", "3D5M@1->4"]), + ("5M6D@1->1", 7.5, ["5M2D@1->1", 
"4D@6->8"]), + ("6D5M@1->1", 7.5, ["6D1M@1->1", "4M@2->8"]), + ("6D5M@1->1", 6.5, ["6D@1->1", "5M@1->7"]), # Ambigous cases - ('9M9D9M@1->1', 13.5 or 27/2, - ['9M4D@1->1', '5D9M@10->14']), - - ('9M9I9M@1->1', 9.2, - ['9M1I@1->1', '8I9M@11->10']), - - ('9M9D9I9M@1->1', 13.5 or 27/2, - ['9M4D@1->1', '5D9I9M@10->14']), - - ('9M9I9D9M@1->1', 13.5 or 27/2, - ['9M9I4D@1->1', '5D9M@19->14']), - - ('9M9D9I9D9I9D9M@1->1', 13.5 or 27/2, - ['9M4D@1->1', '5D9I9D9I9D9M@10->14']), - - ('9M9I9D9I9D9I9M@1->1', 13.5 or 27/2, - ['9M9I4D@1->1', '5D9I9D9I9M@19->14']), - - ('1M1I1D1M@1->1', 1.5, # same as previous 2 cases but smaller - ['1M1I@1->1', '1D1M@3->2']), - - ('1M1D1I1M@1->1', 1.5, # same as previous 2 cases but smaller - ['1M@1->1', '1D1I1M@2->2']), - + ("9M9D9M@1->1", 13.5 or 27 / 2, ["9M4D@1->1", "5D9M@10->14"]), + ("9M9I9M@1->1", 9.2, ["9M1I@1->1", "8I9M@11->10"]), + ("9M9D9I9M@1->1", 13.5 or 27 / 2, ["9M4D@1->1", "5D9I9M@10->14"]), + ("9M9I9D9M@1->1", 13.5 or 27 / 2, ["9M9I4D@1->1", "5D9M@19->14"]), + ("9M9D9I9D9I9D9M@1->1", 13.5 or 27 / 2, ["9M4D@1->1", "5D9I9D9I9D9M@10->14"]), + ("9M9I9D9I9D9I9M@1->1", 13.5 or 27 / 2, ["9M9I4D@1->1", "5D9I9D9I9M@19->14"]), + ( + "1M1I1D1M@1->1", + 1.5, # same as previous 2 cases but smaller + ["1M1I@1->1", "1D1M@3->2"], + ), + ( + "1M1D1I1M@1->1", + 1.5, # same as previous 2 cases but smaller + ["1M@1->1", "1D1I1M@2->2"], + ), # Edge cases - ('9M9I9M@1->1', 9.5, # no middlepoint - ['9M5I@1->1', '4I9M@15->10']), - - ('9M@1->1', 8.5, - ['8M@1->1', '1M@9->9']), - - ('9M@1->1', 9.5, - ['9M@1->1', '@10->10']), - - ('7M@3->3', 2.5, - ['@3->3', '7M@3->3']), - - ('9M@1->1', 0.5, - ['@1->1', '9M@1->1']), - - ('9M@0->0', -0.5, - ['@0->0', '9M@0->0']), - - ('9D@1->1', 3.5, - ['3D@1->1', '6D@1->4']), - - ('9D@0->0', -0.5, - ['@0->0', '9D@0->0']), - - ('1M7I1M@1->1', 1.5, - ['1M4I@1->1', '3I1M@6->2']), - - ('1M6I1M@1->1', 1.5, - ['1M3I@1->1', '3I1M@5->2']), - - ('1M7I1M@1->1', 1.999, - ['1M7I@1->1', '1M@9->2']), - - ('1M7I1M@1->1', 1.001, - 
['1M@1->1', '7I1M@2->2']), - - ('2=1X2N1N2=1H2S@1->1', 3.5, - ['2=1X@1->1', '3N2=1H2S@4->4']), - + ("9M9I9M@1->1", 9.5, ["9M5I@1->1", "4I9M@15->10"]), # no middlepoint + ("9M@1->1", 8.5, ["8M@1->1", "1M@9->9"]), + ("9M@1->1", 9.5, ["9M@1->1", "@10->10"]), + ("7M@3->3", 2.5, ["@3->3", "7M@3->3"]), + ("9M@1->1", 0.5, ["@1->1", "9M@1->1"]), + ("9M@0->0", -0.5, ["@0->0", "9M@0->0"]), + ("9D@1->1", 3.5, ["3D@1->1", "6D@1->4"]), + ("9D@0->0", -0.5, ["@0->0", "9D@0->0"]), + ("1M7I1M@1->1", 1.5, ["1M4I@1->1", "3I1M@6->2"]), + ("1M6I1M@1->1", 1.5, ["1M3I@1->1", "3I1M@5->2"]), + ("1M7I1M@1->1", 1.999, ["1M7I@1->1", "1M@9->2"]), + ("1M7I1M@1->1", 1.001, ["1M@1->1", "7I1M@2->2"]), + ("2=1X2N1N2=1H2S@1->1", 3.5, ["2=1X@1->1", "3N2=1H2S@4->4"]), # Negative cases - ('9M9I9M@1->1', 20.5, - IndexError("Cut point out of reference bounds")), # 20.5 is bigger than reference (18) - - ('@2->2', 2.5, - IndexError("Cut point out of reference bounds")), # Empty string cannot be cut - - ('@2->2', 1.5, - IndexError("Cut point out of reference bounds")), # Empty string cannot be cut - - ('9I@1->1', 3.5, - IndexError("Cut point out of reference bounds")), - - ('9M@1->1', 4, - ValueError("Cut accepts fractions, not integers")), - + ( + "9M9I9M@1->1", + 20.5, + IndexError("Cut point out of reference bounds"), + ), # 20.5 is bigger than reference (18) + ( + "@2->2", + 2.5, + IndexError("Cut point out of reference bounds"), + ), # Empty string cannot be cut + ( + "@2->2", + 1.5, + IndexError("Cut point out of reference bounds"), + ), # Empty string cannot be cut + ("9I@1->1", 3.5, IndexError("Cut point out of reference bounds")), + ("9M@1->1", 4, ValueError("Cut accepts fractions, not integers")), ] -@pytest.mark.parametrize('hit, cut_point, expected_result', cigar_hit_ref_cut_cases) +@pytest.mark.parametrize("hit, cut_point, expected_result", cigar_hit_ref_cut_cases) def test_cigar_hit_ref_cut(hit, cut_point, expected_result): hit = parsed_hit(hit) @@ -290,17 +272,23 @@ def 
test_cigar_hit_ref_cut(hit, cut_point, expected_result): assert expected_right == right -@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]) +@pytest.mark.parametrize( + "hit, cut_point", + [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)], +) def test_cigar_hit_ref_cut_add_prop(hit, cut_point): hit = parsed_hit(hit) left, right = hit.cut_reference(cut_point) assert left + right == hit -@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in - [x for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]]) +@pytest.mark.parametrize( + "hit, cut_point", + [ + (x[0], x[1]) + for x in [x for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] + ], +) def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): hit = parsed_hit(hit) percentage = cut_point - floor(cut_point) @@ -311,119 +299,119 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): lstrip_query_cases = [ - ('9M@1->1', '9M@1->1'), - ('5M6D@1->1', '5M6D@1->1'), - ('6D5M@1->1', '6D5M@1->1'), - ('6I5M@1->1', '5M@7->1'), - ('6I4D5M@1->1', '4D5M@7->1'), - ('6D4I5M@1->1', '6D5M@5->1'), - ('3D3D4I5M@1->1', '6D5M@5->1'), - ('3I3I4D5M@1->1', '4D5M@7->1'), - ('3D2I3D2I5M@1->1', '6D5M@5->1'), - ('3I2D3I2D5M@1->1', '4D5M@7->1'), - ('4D6I5M@1->1', '4D5M@7->1'), - ('4I6D5M@1->1', '6D5M@5->1'), - ('6I4D@1->1', '4D@7->1'), - ('6D4I@1->1', '6D@5->1'), - ('4D6I@1->1', '4D@7->1'), - ('4I6D@1->1', '6D@5->1'), - ('4I@1->1', '@5->1'), - ('4D@1->1', '4D@1->1'), - ('@1->1', '@1->1'), + ("9M@1->1", "9M@1->1"), + ("5M6D@1->1", "5M6D@1->1"), + ("6D5M@1->1", "6D5M@1->1"), + ("6I5M@1->1", "5M@7->1"), + ("6I4D5M@1->1", "4D5M@7->1"), + ("6D4I5M@1->1", "6D5M@5->1"), + ("3D3D4I5M@1->1", "6D5M@5->1"), + ("3I3I4D5M@1->1", "4D5M@7->1"), + ("3D2I3D2I5M@1->1", "6D5M@5->1"), + ("3I2D3I2D5M@1->1", "4D5M@7->1"), + ("4D6I5M@1->1", "4D5M@7->1"), + ("4I6D5M@1->1", "6D5M@5->1"), + 
("6I4D@1->1", "4D@7->1"), + ("6D4I@1->1", "6D@5->1"), + ("4D6I@1->1", "4D@7->1"), + ("4I6D@1->1", "6D@5->1"), + ("4I@1->1", "@5->1"), + ("4D@1->1", "4D@1->1"), + ("@1->1", "@1->1"), ] rstrip_query_cases = [ - ('9M@1->1', '9M@1->1'), - ('5M6D@1->1', '5M6D@1->1'), - ('5M6I@1->1', '5M@1->1'), - ('6D5M@1->1', '6D5M@1->1'), - ('5M4I6D@1->1', '5M6D@1->1'), - ('5M4D6I@1->1', '5M4D@1->1'), - ('5M4I3D3D@1->1', '5M6D@1->1'), - ('5M4D3I3I@1->1', '5M4D@1->1'), - ('5M2I3D2I3D@1->1', '5M6D@1->1'), - ('5M2D3I2D3I@1->1', '5M4D@1->1'), - ('5M6D4I@1->1', '5M6D@1->1'), - ('5M6I4D@1->1', '5M4D@1->1'), - ('6D4I@1->1', '6D@1->1'), - ('6I4D@1->1', '4D@1->1'), - ('4I6D@1->1', '6D@1->1'), - ('4D6I@1->1', '4D@1->1'), - ('4I@1->1', '@1->1'), - ('4D@1->1', '4D@1->1'), - ('@1->1', '@1->1'), + ("9M@1->1", "9M@1->1"), + ("5M6D@1->1", "5M6D@1->1"), + ("5M6I@1->1", "5M@1->1"), + ("6D5M@1->1", "6D5M@1->1"), + ("5M4I6D@1->1", "5M6D@1->1"), + ("5M4D6I@1->1", "5M4D@1->1"), + ("5M4I3D3D@1->1", "5M6D@1->1"), + ("5M4D3I3I@1->1", "5M4D@1->1"), + ("5M2I3D2I3D@1->1", "5M6D@1->1"), + ("5M2D3I2D3I@1->1", "5M4D@1->1"), + ("5M6D4I@1->1", "5M6D@1->1"), + ("5M6I4D@1->1", "5M4D@1->1"), + ("6D4I@1->1", "6D@1->1"), + ("6I4D@1->1", "4D@1->1"), + ("4I6D@1->1", "6D@1->1"), + ("4D6I@1->1", "4D@1->1"), + ("4I@1->1", "@1->1"), + ("4D@1->1", "4D@1->1"), + ("@1->1", "@1->1"), ] lstrip_reference_cases = [ - ('9M@1->1', '9M@1->1'), - ('5M6D@1->1', '5M6D@1->1'), - ('6D5M@1->1', '5M@1->7'), - ('6I5M@1->1', '6I5M@1->1'), - ('6I4D5M@1->1', '6I5M@1->5'), - ('6D4I5M@1->1', '4I5M@1->7'), - ('6D4I5M@1->1', '4I5M@1->7'), - ('6I4D5M@1->1', '6I5M@1->5'), - ('3D2I3D2I5M@1->1', '4I5M@1->7'), - ('3I2D3I2D5M@1->1', '6I5M@1->5'), - ('4D6I5M@1->1', '6I5M@1->5'), - ('4I6D5M@1->1', '4I5M@1->7'), - ('6I4D@1->1', '6I@1->5'), - ('6D4I@1->1', '4I@1->7'), - ('4D6I@1->1', '6I@1->5'), - ('4I6D@1->1', '4I@1->7'), - ('4I@1->1', '4I@1->1'), - ('4D@1->1', '@1->5'), - ('@1->1', '@1->1'), + ("9M@1->1", "9M@1->1"), + ("5M6D@1->1", "5M6D@1->1"), + 
("6D5M@1->1", "5M@1->7"), + ("6I5M@1->1", "6I5M@1->1"), + ("6I4D5M@1->1", "6I5M@1->5"), + ("6D4I5M@1->1", "4I5M@1->7"), + ("6D4I5M@1->1", "4I5M@1->7"), + ("6I4D5M@1->1", "6I5M@1->5"), + ("3D2I3D2I5M@1->1", "4I5M@1->7"), + ("3I2D3I2D5M@1->1", "6I5M@1->5"), + ("4D6I5M@1->1", "6I5M@1->5"), + ("4I6D5M@1->1", "4I5M@1->7"), + ("6I4D@1->1", "6I@1->5"), + ("6D4I@1->1", "4I@1->7"), + ("4D6I@1->1", "6I@1->5"), + ("4I6D@1->1", "4I@1->7"), + ("4I@1->1", "4I@1->1"), + ("4D@1->1", "@1->5"), + ("@1->1", "@1->1"), ] rstrip_reference_cases = [ - ('9M@1->1', '9M@1->1'), - ('5M6D@1->1', '5M@1->1'), - ('5M6I@1->1', '5M6I@1->1'), - ('6D5M@1->1', '6D5M@1->1'), - ('5M4I6D@1->1', '5M4I@1->1'), - ('5M4D6I@1->1', '5M6I@1->1'), - ('5M4I6D@1->1', '5M4I@1->1'), - ('5M4D6I@1->1', '5M6I@1->1'), - ('5M2I3D2I3D@1->1', '5M4I@1->1'), - ('5M2D3I2D3I@1->1', '5M6I@1->1'), - ('5M6D4I@1->1', '5M4I@1->1'), - ('5M6I4D@1->1', '5M6I@1->1'), - ('6D4I@1->1', '4I@1->1'), - ('6I4D@1->1', '6I@1->1'), - ('4I6D@1->1', '4I@1->1'), - ('4D6I@1->1', '6I@1->1'), - ('4I@1->1', '4I@1->1'), - ('4D@1->1', '@1->1'), - ('@1->1', '@1->1'), + ("9M@1->1", "9M@1->1"), + ("5M6D@1->1", "5M@1->1"), + ("5M6I@1->1", "5M6I@1->1"), + ("6D5M@1->1", "6D5M@1->1"), + ("5M4I6D@1->1", "5M4I@1->1"), + ("5M4D6I@1->1", "5M6I@1->1"), + ("5M4I6D@1->1", "5M4I@1->1"), + ("5M4D6I@1->1", "5M6I@1->1"), + ("5M2I3D2I3D@1->1", "5M4I@1->1"), + ("5M2D3I2D3I@1->1", "5M6I@1->1"), + ("5M6D4I@1->1", "5M4I@1->1"), + ("5M6I4D@1->1", "5M6I@1->1"), + ("6D4I@1->1", "4I@1->1"), + ("6I4D@1->1", "6I@1->1"), + ("4I6D@1->1", "4I@1->1"), + ("4D6I@1->1", "6I@1->1"), + ("4I@1->1", "4I@1->1"), + ("4D@1->1", "@1->1"), + ("@1->1", "@1->1"), ] -strip_prop_cases_all = \ - [x[0] for x in cigar_hit_ref_cut_cases] + \ - [x[0] for x in lstrip_query_cases] + \ - [x[0] for x in rstrip_query_cases] +strip_prop_cases_all = ( + [x[0] for x in cigar_hit_ref_cut_cases] + + [x[0] for x in lstrip_query_cases] + + [x[0] for x in rstrip_query_cases] +) -@pytest.mark.parametrize('hit, 
expected', lstrip_query_cases) +@pytest.mark.parametrize("hit, expected", lstrip_query_cases) def test_cigar_hit_lstrip_reference(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) assert expected == hit.lstrip_query() -@pytest.mark.parametrize('hit, expected', rstrip_query_cases) +@pytest.mark.parametrize("hit, expected", rstrip_query_cases) def test_cigar_hit_rstrip_reference(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) assert expected == hit.rstrip_query() - -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_reference_strip_combines_with_connect(hit): hit = parsed_hit(hit) @@ -433,11 +421,13 @@ def test_cigar_hit_reference_strip_combines_with_connect(hit): left = left.rstrip_query() right = right.lstrip_query() - assert left.connect(right).coordinate_mapping.ref_to_query \ + assert ( + left.connect(right).coordinate_mapping.ref_to_query == hit.coordinate_mapping.ref_to_query + ) -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_reference_strip_combines_with_add(hit): hit = parsed_hit(hit) @@ -451,7 +441,7 @@ def test_cigar_hit_reference_strip_combines_with_add(hit): assert left + right == hit -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_reference_strip_never_crashes(hit): hit = parsed_hit(hit) @@ -461,7 +451,7 @@ def test_cigar_hit_reference_strip_never_crashes(hit): hit.rstrip_query().rstrip_query() -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_reference_strip_is_idempotent(hit): hit = parsed_hit(hit) @@ -478,33 +468,34 @@ def test_cigar_hit_reference_strip_is_idempotent(hit): assert h1 == h1.rstrip_query() == h1.lstrip_query() -@pytest.mark.parametrize('hit', strip_prop_cases_all) 
+@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_reference_strips_are_commutative(hit): hit = parsed_hit(hit) if len(hit.cigar.coordinate_mapping.ref_to_query) > 0: - assert hit.rstrip_query().lstrip_query() \ - == hit.lstrip_query().rstrip_query() + assert hit.rstrip_query().lstrip_query() == hit.lstrip_query().rstrip_query() else: - assert hit.rstrip_query().lstrip_query().cigar \ + assert ( + hit.rstrip_query().lstrip_query().cigar == hit.lstrip_query().rstrip_query().cigar + ) -@pytest.mark.parametrize('hit, expected', lstrip_reference_cases) +@pytest.mark.parametrize("hit, expected", lstrip_reference_cases) def test_cigar_hit_lstrip_query(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) assert expected == hit.lstrip_reference() -@pytest.mark.parametrize('hit, expected', rstrip_reference_cases) +@pytest.mark.parametrize("hit, expected", rstrip_reference_cases) def test_cigar_hit_rstrip_query(hit, expected): hit = parsed_hit(hit) expected = parsed_hit(expected) assert expected == hit.rstrip_reference() -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_query_strip_combines_with_connect(hit): hit = parsed_hit(hit) @@ -514,11 +505,13 @@ def test_cigar_hit_query_strip_combines_with_connect(hit): left = left.rstrip_reference() right = right.lstrip_reference() - assert left.connect(right).coordinate_mapping.ref_to_query \ + assert ( + left.connect(right).coordinate_mapping.ref_to_query == hit.coordinate_mapping.ref_to_query + ) -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_query_strip_combines_with_add(hit): hit = parsed_hit(hit) @@ -532,7 +525,7 @@ def test_cigar_hit_query_strip_combines_with_add(hit): assert left + right == hit -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def 
test_cigar_hit_strips_work_together(hit): hit = parsed_hit(hit) @@ -542,7 +535,7 @@ def test_cigar_hit_strips_work_together(hit): assert not lstrip.startswith("I") and not lstrip.startswith("D") -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_query_strip_is_idempotent(hit): hit = parsed_hit(hit) @@ -559,20 +552,26 @@ def test_cigar_hit_query_strip_is_idempotent(hit): assert h1 == h1.rstrip_reference() == h1.lstrip_reference() -@pytest.mark.parametrize('hit', strip_prop_cases_all) +@pytest.mark.parametrize("hit", strip_prop_cases_all) def test_cigar_hit_query_strips_are_commutative(hit): hit = parsed_hit(hit) if len(hit.cigar.coordinate_mapping.ref_to_query) > 0: - assert hit.rstrip_reference().lstrip_reference() \ + assert ( + hit.rstrip_reference().lstrip_reference() == hit.lstrip_reference().rstrip_reference() + ) else: - assert hit.rstrip_reference().lstrip_reference().cigar \ + assert ( + hit.rstrip_reference().lstrip_reference().cigar == hit.lstrip_reference().rstrip_reference().cigar + ) -@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]) +@pytest.mark.parametrize( + "hit, cut_point", + [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)], +) def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): hit = parsed_hit(hit) percentage = cut_point - floor(cut_point) @@ -581,28 +580,31 @@ def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): a, x = hit.cut_reference(ax_cut - percentage) for bc_cut in range(a.r_ei + 1, hit.r_ei + 2): - if x.ref_length == 0: continue + if x.ref_length == 0: + continue b, c = x.cut_reference(bc_cut - percentage) assert (a + b) + c == a + (b + c) -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]) +@pytest.mark.parametrize( + "hit", [x[0] for x in cigar_hit_ref_cut_cases if not 
isinstance(x[2], Exception)] +) def test_cigar_hit_deletions_no_m_or_i(hit): hit = parsed_hit(hit) deletions = list(hit.deletions()) - assert len(deletions) == len([op for op in str(hit.cigar) if op in 'DN']) + assert len(deletions) == len([op for op in str(hit.cigar) if op in "DN"]) for deletion in deletions: - assert 'M' not in str(deletion.cigar) - assert 'I' not in str(deletion.cigar) + assert "M" not in str(deletion.cigar) + assert "I" not in str(deletion.cigar) -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]) +@pytest.mark.parametrize( + "hit", [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] +) def test_cigar_hit_deletions_lengths(hit): hit = parsed_hit(hit) deletions = list(hit.deletions()) @@ -613,22 +615,24 @@ def test_cigar_hit_deletions_lengths(hit): assert deletion.coordinate_mapping.ref_to_query == {} -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]) +@pytest.mark.parametrize( + "hit", [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] +) def test_cigar_hit_insertions_no_m_or_i(hit): hit = parsed_hit(hit) insertions = list(hit.insertions()) - if 'I' in str(hit.cigar): + if "I" in str(hit.cigar): assert len(insertions) > 0 for insertion in insertions: - assert 'M' not in str(insertion.cigar) - assert 'D' not in str(insertion.cigar) + assert "M" not in str(insertion.cigar) + assert "D" not in str(insertion.cigar) -@pytest.mark.parametrize('hit', [x[0] for x in cigar_hit_ref_cut_cases - if not isinstance(x[2], Exception)]) +@pytest.mark.parametrize( + "hit", [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] +) def test_cigar_hit_insertions_lengths(hit): hit = parsed_hit(hit) insertions = list(hit.insertions()) @@ -639,26 +643,42 @@ def test_cigar_hit_insertions_lengths(hit): assert insertion.coordinate_mapping.ref_to_query == {} 
-@pytest.mark.parametrize("reference_seq, query_seq, cigar, expected_reference, expected_query", [ - ('ACTG', 'ACTG', '4M', 'ACTG', 'ACTG'), - ('ACTG', '', '4D', 'ACTG', '----'), - ('', 'ACTG', '4I', '----', 'ACTG'), - ('ACTGAC', 'ACAC', '2M2D2M', 'ACTGAC', 'AC--AC'), - ('ACAC', 'ACTGAC', '2M2I2M', 'AC--AC', 'ACTGAC'), - ('GCTATGGGAA', 'GCTATGGGAA', '5M3D2M', 'GCTATGGGAA', 'GCTAT---GG'), - ('ACTG', 'ACTG', '2M99H77P2M', 'ACTG', 'ACTG'), # Ignores non-consuming operations. -]) -def test_cigar_to_msa(reference_seq, query_seq, cigar, expected_reference, expected_query): - assert Cigar.coerce(cigar).to_msa(reference_seq, query_seq) \ - == (expected_reference, expected_query) - - -@pytest.mark.parametrize("cigar, reference_seq, query_seq", [ - ('10M', 'A' * 3, 'A' * 10), # reference is shorter than CIGAR - ('10M', 'A' * 10, 'A' * 3), # query is shorter than CIGAR - ('10D', 'A' * 3, 'A' * 3), - ('10I', 'A' * 3, 'A' * 3), -]) +@pytest.mark.parametrize( + "reference_seq, query_seq, cigar, expected_reference, expected_query", + [ + ("ACTG", "ACTG", "4M", "ACTG", "ACTG"), + ("ACTG", "", "4D", "ACTG", "----"), + ("", "ACTG", "4I", "----", "ACTG"), + ("ACTGAC", "ACAC", "2M2D2M", "ACTGAC", "AC--AC"), + ("ACAC", "ACTGAC", "2M2I2M", "AC--AC", "ACTGAC"), + ("GCTATGGGAA", "GCTATGGGAA", "5M3D2M", "GCTATGGGAA", "GCTAT---GG"), + ( + "ACTG", + "ACTG", + "2M99H77P2M", + "ACTG", + "ACTG", + ), # Ignores non-consuming operations. 
+ ], +) +def test_cigar_to_msa( + reference_seq, query_seq, cigar, expected_reference, expected_query +): + assert Cigar.coerce(cigar).to_msa(reference_seq, query_seq) == ( + expected_reference, + expected_query, + ) + + +@pytest.mark.parametrize( + "cigar, reference_seq, query_seq", + [ + ("10M", "A" * 3, "A" * 10), # reference is shorter than CIGAR + ("10M", "A" * 10, "A" * 3), # query is shorter than CIGAR + ("10D", "A" * 3, "A" * 3), + ("10I", "A" * 3, "A" * 3), + ], +) def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): with pytest.raises(ValueError): Cigar.coerce(cigar).to_msa(reference_seq, query_seq) @@ -666,67 +686,33 @@ def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): connect_cigar_hits_cases = [ # Non-overlapping hits should be connected with deletions/insertions - ( - ['4M@1->1', '4M@8->10'], - ['4M5D3I4M@1->1'] - ), + (["4M@1->1", "4M@8->10"], ["4M5D3I4M@1->1"]), # Overlapping hits should ignore later ones - ( - ['4M@1->1', '5M@3->3'], - ['4M@1->1'] - ), + (["4M@1->1", "5M@3->3"], ["4M@1->1"]), # Touching hits should be simply concatenated - ( - ['4M@1->1', '4M@5->5'], - ['8M@1->1'] - ), + (["4M@1->1", "4M@5->5"], ["8M@1->1"]), # Hits that touch at only one boundary should combine just fine - ( - ['3M@1->1', '6M@6->4'], - ['3M2I6M@1->1'] - ), + (["3M@1->1", "6M@6->4"], ["3M2I6M@1->1"]), # Hits that are subsets of earlier hits should be ignored - ( - ['8M@1->1', '3M@3->3'], - ['8M@1->1'] - ), + (["8M@1->1", "3M@3->3"], ["8M@1->1"]), # Hits that are out of order should be connected if no overlap - ( - ['3M@6->10', '3M@1->1'], - ['3M6D2I3M@1->1'] - ), + (["3M@6->10", "3M@1->1"], ["3M6D2I3M@1->1"]), # Hits that overlap by a single base should prioritize the first hit and not combine - ( - ['3M@1->1', '3M@3->3'], - ['3M@1->1'] - ), + (["3M@1->1", "3M@3->3"], ["3M@1->1"]), # Non-overlapping hits in the query space but overlapping in reference space - ( - ['5M@1->1', '1M@10->3'], - ['5M@1->1'] - ), + (["5M@1->1", "1M@10->3"], 
["5M@1->1"]), # Combining more than two hits - ( - ['3M@1->1', '3M@7->7', '3M@16->12'], - ['3M3D3I3M2D6I3M@1->1'] - ), + (["3M@1->1", "3M@7->7", "3M@16->12"], ["3M3D3I3M2D6I3M@1->1"]), # Combining hits including hard-clipping, which should be ignored in alignments - ( - ['2H5M1H@3->1', '2H5M1H@13->11'], - ['2H5M1H5D5I2H5M1H@3->1'] - ), + (["2H5M1H@3->1", "2H5M1H@13->11"], ["2H5M1H5D5I2H5M1H@3->1"]), # An empty list of hits should raise a ValueError - ( - [], - ValueError("Expected a non-empty list of cigar hits") - ), + ([], ValueError("Expected a non-empty list of cigar hits")), # Before by reference, after by query - ( - ['4M@8->1', '4M@1->10'], - ['4M@8->1', '4M@1->10'] - ), + (["4M@8->1", "4M@1->10"], ["4M@8->1", "4M@1->10"]), ] -@pytest.mark.parametrize('hits, expected_result', connect_cigar_hits_cases) + + +@pytest.mark.parametrize("hits, expected_result", connect_cigar_hits_cases) def test_connect_cigar_hits(hits, expected_result): hits = list(map(parsed_hit, hits)) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 955655c10..4715f1a59 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,28 +1,44 @@ - import random import logging import os import pytest import micall.core.contig_stitcher as stitcher -from micall.core.contig_stitcher import split_contigs_with_gaps, stitch_contigs, GenotypedContig, merge_intervals, find_covered_contig, stitch_consensus, calculate_concordance, align_all_to_reference, main, AlignedContig, disambiguate_concordance, lstrip, rstrip +from micall.core.contig_stitcher import ( + split_contigs_with_gaps, + stitch_contigs, + GenotypedContig, + merge_intervals, + find_covered_contig, + stitch_consensus, + calculate_concordance, + align_all_to_reference, + AlignedContig, + disambiguate_concordance, + lstrip, + rstrip, +) from micall.core.plot_contigs import plot_stitcher_coverage from micall.tests.utils import MockAligner, fixed_random_seed -from 
micall.tests.test_denovo import check_hcv_db # activates the fixture -from micall.tests.test_remap import load_projects # activates the "projects" fixture +from micall.tests.test_denovo import check_hcv_db # activates the fixture +from micall.tests.test_remap import load_projects # activates the "projects" fixture + logging.getLogger("micall.core.contig_stitcher").setLevel(logging.DEBUG) logging.getLogger("micall.core.plot_contigs").setLevel(logging.DEBUG) +random.choice([check_hcv_db, load_projects]) # make linters not complain about unused imports. + + @pytest.fixture() def exact_aligner(monkeypatch): - monkeypatch.setattr('micall.core.contig_stitcher.Aligner', MockAligner) + monkeypatch.setattr("micall.core.contig_stitcher.Aligner", MockAligner) @pytest.fixture def visualizer(request, tmp_path): - logs = stitcher.context.set(stitcher.StitcherContext()) + stitcher.context.set(stitcher.StitcherContext()) test_name = request.node.name plot_name = test_name + ".svg" pwd = os.path.dirname(__file__) @@ -35,14 +51,15 @@ def check(): logs = stitcher.context.get().events figure = plot_stitcher_coverage(logs, path_to_produced) - with open(path_to_produced, 'r') as produced_file: + with open(path_to_produced, "r") as produced_file: produced_data = produced_file.read() - with open(path_to_expected, 'r') as expected_file: + with open(path_to_expected, "r") as expected_file: expected_data = expected_file.read() - assert produced_data == expected_data, \ - "The contents of the stitched contigs plot" \ + assert produced_data == expected_data, ( + "The contents of the stitched contigs plot" " does not match the expected contents." + ) return figure @@ -53,14 +70,15 @@ def test_identical_stitching_of_one_contig(exact_aligner, visualizer): # Scenario: When stitching one contig, it remains the same. 
contigs = [ - GenotypedContig(name='a', - seq='ACTGACTG' * 100, - ref_name='testref', - group_ref='testref', - ref_seq='T' * 20 + 'ACTGACTG' * 110 + 'T' * 20, - match_fraction=1.0, - ), - ] + GenotypedContig( + name="a", + seq="ACTGACTG" * 100, + ref_name="testref", + group_ref="testref", + ref_seq="T" * 20 + "ACTGACTG" * 110 + "T" * 20, + match_fraction=1.0, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -72,86 +90,93 @@ def test_identical_stitching_of_one_contig(exact_aligner, visualizer): def test_separate_stitching_of_non_overlapping_contigs_1(exact_aligner, visualizer): # Scenario: When stitching multiple non-overlapping contigs, the order doesn't matter. - ref_seq = 'A' * 100 + 'C' * 100 + ref_seq = "A" * 100 + "C" * 100 contigs = [ - GenotypedContig(name='a', - seq='A' * 70, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='C' * 70, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="A" * 70, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="C" * 70, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) # No claims about the output order, so wrap into set() - assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + assert {contig.seq for contig in contigs} == {contig.seq for contig in results} assert len(visualizer().elements) > len(contigs) def test_separate_stitching_of_non_overlapping_contigs_2(exact_aligner, visualizer): - # Scenario: When stitching multiple non-overlapping contigs, the order doesn't matter. + # Scenario: When stitching multiple non-overlapping contigs, + # the order doesn't matter. 
- ref_seq = 'A' * 100 + 'C' * 100 + ref_seq = "A" * 100 + "C" * 100 contigs = [ - GenotypedContig(name='b', - seq='C' * 70, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='a', - seq='A' * 70, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="b", + seq="C" * 70, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="a", + seq="A" * 70, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) # No claims about the output order, so wrap into set() - assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + assert {contig.seq for contig in contigs} == {contig.seq for contig in results} assert len(visualizer().elements) > len(contigs) -def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner, visualizer): +def test_correct_stitching_of_two_partially_overlapping_contigs( + exact_aligner, visualizer +): # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. 
- ref_seq = 'A' * 100 + 'C' * 100 + ref_seq = "A" * 100 + "C" * 100 contigs = [ - GenotypedContig(name='a', - seq='A' * 50 + 'C' * 20, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='A' * 20 + 'C' * 50, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="A" * 50 + "C" * 20, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="A" * 20 + "C" * 50, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -159,32 +184,36 @@ def test_correct_stitching_of_two_partially_overlapping_contigs(exact_aligner, v result = results[0] assert 100 == len(result.seq) - assert result.seq == 'A' * 50 + 'C' * 50 + assert result.seq == "A" * 50 + "C" * 50 assert len(visualizer().elements) > len(contigs) -def test_correct_stitching_of_two_partially_overlapping_contigs_with_padding(exact_aligner, visualizer): +def test_correct_stitching_of_two_partially_overlapping_contigs_with_padding( + exact_aligner, visualizer +): # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. 
- ref_seq = 'A' * 100 + 'C' * 100 + ref_seq = "A" * 100 + "C" * 100 contigs = [ - GenotypedContig(name='a', - seq='M' * 10 + 'A' * 50 + 'C' * 20 + 'Z' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='J' * 10 + 'A' * 20 + 'C' * 50 + 'N' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="M" * 10 + "A" * 50 + "C" * 20 + "Z" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="J" * 10 + "A" * 20 + "C" * 50 + "N" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -192,35 +221,38 @@ def test_correct_stitching_of_two_partially_overlapping_contigs_with_padding(exa result = results[0] assert 120 == len(result.seq) - assert result.seq == 'M' * 10 + 'A' * 50 + 'C' * 50 + 'N' * 10 + assert result.seq == "M" * 10 + "A" * 50 + "C" * 50 + "N" * 10 assert len(visualizer().elements) > len(contigs) -def test_correct_stitching_of_two_partially_overlapping_contigs_real(projects, visualizer): - # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. Not using exact aligner this time. +def test_correct_stitching_of_two_partially_overlapping_contigs_real( + projects, visualizer +): + # Scenario: Two partially overlapping contigs are stitched + # correctly into a single sequence. Not using exact aligner this time. 
- from mappy import revcomp - - ref_name = 'HIV1-B-ZA-KP109515-seed' + ref_name = "HIV1-B-ZA-KP109515-seed" ref = projects.getReference(ref_name) contigs = [ - GenotypedContig(name='a', - seq=ref[1700:2000], - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq=ref[1900:2200], - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq=ref[1700:2000], + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq=ref[1900:2200], + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -233,31 +265,34 @@ def test_correct_stitching_of_two_partially_overlapping_contigs_real(projects, v assert len(visualizer().elements) > len(contigs) -def test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps(projects, visualizer): - # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence, with insignificant gaps. - - from mappy import revcomp +def test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps( + projects, visualizer +): + # Scenario: Two partially overlapping contigs are stitched + # correctly into a single sequence, with insignificant gaps. 
- hxb2_name = 'HIV1-B-FR-K03455-seed' + hxb2_name = "HIV1-B-FR-K03455-seed" ref = projects.getReference(hxb2_name) - gap_ref = ''.join(c if i % 30 > 2 else '' for i, c in enumerate(ref)) + gap_ref = "".join(c if i % 30 > 2 else "" for i, c in enumerate(ref)) contigs = [ - GenotypedContig(name='a', - seq=gap_ref[1700:2000], - ref_name=hxb2_name, - group_ref=hxb2_name, - ref_seq=ref, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq=gap_ref[1900:2200], - ref_name=hxb2_name, - group_ref=hxb2_name, - ref_seq=ref, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq=gap_ref[1700:2000], + ref_name=hxb2_name, + group_ref=hxb2_name, + ref_seq=ref, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq=gap_ref[1900:2200], + ref_name=hxb2_name, + group_ref=hxb2_name, + ref_seq=ref, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -270,104 +305,123 @@ def test_correct_stitching_of_two_partially_overlapping_contigs_with_insignifica assert len(visualizer().elements) > len(contigs) -def test_correct_processing_of_two_overlapping_and_one_separate_contig(exact_aligner, visualizer): +def test_correct_processing_of_two_overlapping_and_one_separate_contig( + exact_aligner, visualizer +): # Scenario: Two overlapping contigs are stitched together, the non-overlapping is kept separate. 
- ref_seq = 'Z' * 5 + 'A' * 100 + 'C' * 100 + 'T' * 100 + 'Y' * 5 + ref_seq = "Z" * 5 + "A" * 100 + "C" * 100 + "T" * 100 + "Y" * 5 contigs = [ - GenotypedContig(name='a', - seq='M' * 5 + 'A' * 50 + 'C' * 20 + 'J' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='Q' * 5 + 'A' * 20 + 'C' * 50 + 'I' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='c', - seq='N' * 5 + 'C' * 20 + 'T' * 50 + 'H' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="M" * 5 + "A" * 50 + "C" * 20 + "J" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="Q" * 5 + "A" * 20 + "C" * 50 + "I" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="c", + seq="N" * 5 + "C" * 20 + "T" * 50 + "H" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 - assert results[0].seq == contigs[0].seq.rstrip('J') + 'C' * 30 + contigs[2].seq.lstrip('N') + assert results[0].seq == contigs[0].seq.rstrip("J") + "C" * 30 + contigs[ + 2 + ].seq.lstrip("N") assert len(visualizer().elements) > len(contigs) -def test_correct_processing_of_two_overlapping_and_one_separate_contig_2(exact_aligner, visualizer): +def test_correct_processing_of_two_overlapping_and_one_separate_contig_2( + exact_aligner, visualizer +): # Scenario: Two overlapping contigs are stitched together, the non-overlapping is kept separate. 
- ref_seq = 'Z' * 5 + 'A' * 100 + 'C' * 100 + 'T' * 100 + 'Y' * 5 + ref_seq = "Z" * 5 + "A" * 100 + "C" * 100 + "T" * 100 + "Y" * 5 contigs = [ - GenotypedContig(name='a', - seq='N' * 5 + 'A' * 50 + 'C' * 20 + 'H' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='M' * 5 + 'C' * 50 + 'T' * 20 + 'J' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='c', - seq='Q' * 5 + 'C' * 20 + 'T' * 50 + 'I' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="N" * 5 + "A" * 50 + "C" * 20 + "H" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="M" * 5 + "C" * 50 + "T" * 20 + "J" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="c", + seq="Q" * 5 + "C" * 20 + "T" * 50 + "I" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 - assert results[0].seq == contigs[0].seq.rstrip('H') + 'C' * 30 + contigs[2].seq.lstrip('Q') + assert results[0].seq == contigs[0].seq.rstrip("H") + "C" * 30 + contigs[ + 2 + ].seq.lstrip("Q") assert len(visualizer().elements) > len(contigs) -def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner, visualizer): +def test_stitching_of_all_overlapping_contigs_into_one_sequence( + exact_aligner, visualizer +): # Scenario: All contigs have some overlapping parts, resulting in one continuous sequence after stitching. 
- ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + ref_seq = "A" * 100 + "C" * 100 + "T" * 100 contigs = [ - GenotypedContig(name='a', - seq='A' * 50 + 'C' * 20, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='A' * 20 + 'C' * 100 + 'T' * 20, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='c', - seq='C' * 20 + 'T' * 50, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="A" * 50 + "C" * 20, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="A" * 20 + "C" * 100 + "T" * 20, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="c", + seq="C" * 20 + "T" * 50, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -375,7 +429,7 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner, v result = results[0] assert 200 == len(result.seq) - assert result.seq == 'A' * 50 + 'C' * 100 + 'T' * 50 + assert result.seq == "A" * 50 + "C" * 100 + "T" * 50 assert len(visualizer().elements) > len(contigs) @@ -383,28 +437,29 @@ def test_stitching_of_all_overlapping_contigs_into_one_sequence(exact_aligner, v def test_stitching_with_empty_contigs(exact_aligner, visualizer): # Scenario: The function is able to handle and ignore empty contigs. 
- ref_seq = 'A' * 100 + ref_seq = "A" * 100 contigs = [ - GenotypedContig(name='a', - seq=ref_seq, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='', - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq=ref_seq, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="", + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) - assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + assert {contig.seq for contig in contigs} == {contig.seq for contig in results} assert len(visualizer().elements) > len(contigs) @@ -413,14 +468,16 @@ def test_stitching_of_identical_contigs(exact_aligner, visualizer): # Scenario: The function correctly handles and avoids duplication when identical contigs are stitched together. contigs = [ - GenotypedContig(name=name, - seq='ACTGACTG' * 100, - ref_name='testref', - group_ref='testref', - ref_seq='ACTGACTG' * 100, - match_fraction=1.0, - ) - for name in ["a", "b", "c"]] + GenotypedContig( + name=name, + seq="ACTGACTG" * 100, + ref_name="testref", + group_ref="testref", + ref_seq="ACTGACTG" * 100, + match_fraction=1.0, + ) + for name in ["a", "b", "c"] + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -439,34 +496,37 @@ def test_stitching_of_zero_contigs(exact_aligner, visualizer): assert len(visualizer().elements) > 0 -def test_correct_stitching_of_two_partially_overlapping_different_organism_contigs(exact_aligner, visualizer): +def test_correct_stitching_of_two_partially_overlapping_different_organism_contigs( + exact_aligner, visualizer +): # Scenario: Two partially overlapping contigs, but which come from different organism, # are not stitched into a single sequence. 
- ref_seq = 'A' * 100 + 'C' * 100 + ref_seq = "A" * 100 + "C" * 100 contigs = [ - GenotypedContig(name='a', - seq='A' * 50 + 'C' * 20, - ref_name='testref-1', - group_ref='testref-1', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='A' * 20 + 'C' * 50, - ref_name='testref-2', - group_ref='testref-2', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="A" * 50 + "C" * 20, + ref_name="testref-1", + group_ref="testref-1", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="A" * 20 + "C" * 50, + ref_name="testref-2", + group_ref="testref-2", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 2 - assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + assert {contig.seq for contig in contigs} == {contig.seq for contig in results} assert len(visualizer().elements) > len(contigs) @@ -476,38 +536,45 @@ def test_correct_processing_complex_nogaps(exact_aligner, visualizer): # Each with 4 contigs. # For each, three overlapping contigs are stitched together, the non-overlapping is kept separate. 
- ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 - - contigs = [[ - GenotypedContig(name='a' + ref_name, - seq='A' * 50 + 'C' * 20, - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b' + ref_name, - seq='A' * 20 + 'C' * 50, - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='c' + ref_name, - seq='C' * 70 + 'T' * 20, - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='d' + ref_name, - seq='T' * 20 + 'G' * 50, - ref_name=ref_name, - group_ref=ref_name, - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] for ref_name in ['testref-1', 'testref-2']] + ref_seq = "A" * 100 + "C" * 100 + "T" * 100 + "G" * 100 + + contigs = [ + [ + GenotypedContig( + name="a" + ref_name, + seq="A" * 50 + "C" * 20, + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b" + ref_name, + seq="A" * 20 + "C" * 50, + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="c" + ref_name, + seq="C" * 70 + "T" * 20, + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="d" + ref_name, + seq="T" * 20 + "G" * 50, + ref_name=ref_name, + group_ref=ref_name, + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] + for ref_name in ["testref-1", "testref-2"] + ] contigs = sum(contigs, start=[]) @@ -515,12 +582,12 @@ def test_correct_processing_complex_nogaps(exact_aligner, visualizer): assert len(results) == 4 assert 170 == len(results[0].seq) - assert results[0].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - assert results[0].group_ref == 'testref-1' + assert results[0].seq == "A" * 50 + "C" * 100 + "T" * 20 + assert results[0].group_ref == "testref-1" assert 170 == len(results[1].seq) - assert results[1].seq == 'A' * 50 + 'C' * 100 + 'T' * 20 - 
assert results[1].group_ref == 'testref-2' + assert results[1].seq == "A" * 50 + "C" * 100 + "T" * 20 + assert results[1].group_ref == "testref-2" assert results[2].seq == contigs[3].seq assert results[3].seq == contigs[7].seq @@ -528,28 +595,32 @@ def test_correct_processing_complex_nogaps(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) -def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner, visualizer): +def test_stitching_when_one_contig_completely_covered_by_another( + exact_aligner, visualizer +): # Scenario: If one contig is completely covered by another contig, # the completely covered contig must be dropped. - ref_seq = 'A' * 100 + 'C' * 100 + ref_seq = "A" * 100 + "C" * 100 contigs = [ - GenotypedContig(name='a', - seq='M' * 10 + 'A' * 20 + 'C' * 20 + 'O' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='P' * 10 + 'A' * 50 + 'C' * 50 + 'Z' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="M" * 10 + "A" * 20 + "C" * 20 + "O" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="P" * 10 + "A" * 50 + "C" * 50 + "Z" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -561,41 +632,47 @@ def test_stitching_when_one_contig_completely_covered_by_another(exact_aligner, assert len(visualizer().elements) > len(contigs) -def test_stitching_when_multiple_contigs_completely_covered_by_other_contigs(exact_aligner, visualizer): +def test_stitching_when_multiple_contigs_completely_covered_by_other_contigs( + exact_aligner, visualizer +): # Scenario: If two contigs are completely covered by another two contigs. 
- ref_seq = 'A' * 100 + 'B' * 100 + 'C' * 100 + 'D' * 100 + ref_seq = "A" * 100 + "B" * 100 + "C" * 100 + "D" * 100 contigs = [ - GenotypedContig(name='a', - seq='M' * 10 + 'A' * 20 + 'B' * 100 + 'C' * 20 + 'O' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='K' * 10 + 'B' * 20 + 'C' * 100 + 'D' * 20 + 'J' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='c', - seq='I' * 10 + 'B' * 60 + 'C' * 80 + 'P' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='d', - seq='Z' * 10 + 'B' * 80 + 'C' * 60 + 'F' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="M" * 10 + "A" * 20 + "B" * 100 + "C" * 20 + "O" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="K" * 10 + "B" * 20 + "C" * 100 + "D" * 20 + "J" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="c", + seq="I" * 10 + "B" * 60 + "C" * 80 + "P" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="d", + seq="Z" * 10 + "B" * 80 + "C" * 60 + "F" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 @@ -606,22 +683,22 @@ def test_stitching_when_multiple_contigs_completely_covered_by_other_contigs(exa def test_stitching_contig_with_big_noncovered_gap(exact_aligner, visualizer): # Scenario: One contig has a big gap, which is however not covered by anything else. 
- ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + ref_seq = "A" * 100 + "C" * 100 + "T" * 100 contigs = [ - GenotypedContig(name='a', - seq= 'A' * 50 + 'T' * 50, # mind the C gap - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="A" * 50 + "T" * 50, # mind the C gap + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) - assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + assert {contig.seq for contig in contigs} == {contig.seq for contig in results} assert len(visualizer().elements) > len(contigs) @@ -629,29 +706,30 @@ def test_stitching_contig_with_big_noncovered_gap(exact_aligner, visualizer): def test_stitching_contig_with_big_noncovered_gap_2(exact_aligner, visualizer): # Scenario: One contig has a big gap, which is however not covered by anything else. - ref_seq = 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 + ref_seq = "A" * 100 + "C" * 100 + "T" * 100 + "G" * 100 contigs = [ - GenotypedContig(name='a', - seq='A' * 50 + 'T' * 50, # mind the C gap - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='B', - seq='G' * 50, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="A" * 50 + "T" * 50, # mind the C gap + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="B", + seq="G" * 50, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) - assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + assert {contig.seq for contig in contigs} == {contig.seq for contig in results} assert len(visualizer().elements) > len(contigs) @@ -659,24 +737,26 @@ def 
test_stitching_contig_with_big_noncovered_gap_2(exact_aligner, visualizer): def test_stitching_contig_with_big_covered_gap(exact_aligner, visualizer): # Scenario: If one contig has a big gap covered by another contig. - ref_seq = 'G' * 100 + 'A' * 100 + 'C' * 100 + 'T' * 100 + 'G' * 100 + ref_seq = "G" * 100 + "A" * 100 + "C" * 100 + "T" * 100 + "G" * 100 contigs = [ - GenotypedContig(name='a', - seq='G' * 50 + 'A' * 50 + 'T' * 100, # mind the gap - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='A' * 100 + 'C' * 100 + 'T' * 50, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="G" * 50 + "A" * 50 + "T" * 100, # mind the gap + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="A" * 100 + "C" * 100 + "T" * 50, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] contigs = list(align_all_to_reference(contigs)) assert len(contigs) == 2 @@ -693,24 +773,26 @@ def test_stitching_contig_with_big_covered_gap(exact_aligner, visualizer): def test_stitching_contig_with_small_covered_gap(exact_aligner, visualizer): # Scenario: If one contig has a small gap covered by another contig. 
- ref_seq = 'G' * 100 + 'A' * 29 + 'C' * 100 + 'T' * 100 + ref_seq = "G" * 100 + "A" * 29 + "C" * 100 + "T" * 100 contigs = [ - GenotypedContig(name='a', - seq='G' * 100 + 'A' * 0 + 'C' * 100, # mind the gap - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='A' * 29 + 'C' * 50, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="G" * 100 + "A" * 0 + "C" * 100, # mind the gap + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="A" * 29 + "C" * 50, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + ] contigs = list(align_all_to_reference(contigs)) assert len(contigs) == 2 @@ -722,22 +804,22 @@ def test_stitching_contig_with_small_covered_gap(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) assert all(x.seq == lstrip(rstrip(x)).seq for x in results) - assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + assert {contig.seq for contig in contigs} == {contig.seq for contig in results} def test_stitching_partial_align(exact_aligner, visualizer): # Scenario: A single contig has a sequence that partially aligns to the reference sequence. 
contigs = [ - GenotypedContig(name='a', - seq='T' * 10 + 'C' * 20 + 'A' * 10, - ref_name='testref', - group_ref='testref', - ref_seq='A' * 20 + 'C' * 20 + 'T' * 20, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="T" * 10 + "C" * 20 + "A" * 10, + ref_name="testref", + group_ref="testref", + ref_seq="A" * 20 + "C" * 20 + "T" * 20, + match_fraction=0.3, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == len(contigs) @@ -748,27 +830,28 @@ def test_stitching_partial_align(exact_aligner, visualizer): assert all(x.seq != lstrip(rstrip(x)).seq for x in results) - assert { contig.seq for contig in contigs } \ - != { lstrip(rstrip(contig)).seq for contig in results } + assert {contig.seq for contig in contigs} != { + lstrip(rstrip(contig)).seq for contig in results + } def test_partial_align_consensus(exact_aligner, visualizer): # Scenario: A single contig partially aligns to the reference sequence, and a consensus sequence is being stitched. contigs = [ - GenotypedContig(name='a', - seq='T' * 10 + 'C' * 20 + 'A' * 10, - ref_name='testref', - group_ref='testref', - ref_seq='A' * 20 + 'C' * 20 + 'T' * 20, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="T" * 10 + "C" * 20 + "A" * 10, + ref_name="testref", + group_ref="testref", + ref_seq="A" * 20 + "C" * 20 + "T" * 20, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == len(contigs) - assert { contig.seq for contig in contigs } \ - == { contig.seq for contig in results } + assert {contig.seq for contig in contigs} == {contig.seq for contig in results} assert len(visualizer().elements) > len(contigs) @@ -776,52 +859,57 @@ def test_partial_align_consensus(exact_aligner, visualizer): def test_stitching_partial_align_multiple_sequences(exact_aligner, visualizer): # Scenario: Multiple contigs have sequences that partially align to the same reference sequence. 
- ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 + ref_seq = "A" * 20 + "C" * 20 + "T" * 20 contigs = [ - GenotypedContig(name='a', - seq='Z' * 5 + 'C' * 20 + 'T' * 5 + 'U' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - GenotypedContig(name='b', - seq='M' * 5 + 'C' * 5 + 'T' * 10 + 'G' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="Z" * 5 + "C" * 20 + "T" * 5 + "U" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig( + name="b", + seq="M" * 5 + "C" * 5 + "T" * 10 + "G" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 1 - assert results[0].seq == 'Z' * 5 + 'C' * 20 + 'T' * 10 + 'G' * 10 + assert results[0].seq == "Z" * 5 + "C" * 20 + "T" * 10 + "G" * 10 assert len(visualizer().elements) > len(contigs) def test_partial_align_consensus_multiple_sequences(exact_aligner, visualizer): - # Scenario: Multiple contigs partially align to the same reference sequence, and a consensus sequence is being stitched from them. + # Scenario: Multiple contigs partially align to the same reference sequence, + # and a consensus sequence is being stitched from them. 
- ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 + ref_seq = "A" * 20 + "C" * 20 + "T" * 20 contigs = [ - GenotypedContig(name='a', - seq='A' * 20, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - GenotypedContig(name='b', - seq='T' * 20, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="A" * 20, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig( + name="b", + seq="T" * 20, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -830,31 +918,38 @@ def test_partial_align_consensus_multiple_sequences(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) -def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner, visualizer): - # Scenario: Multiple contigs partially align to the same reference sequence, and a consensus sequence is being stitched from them. +def test_partial_align_consensus_multiple_overlaping_sequences( + exact_aligner, visualizer +): + # Scenario: Multiple contigs partially align to the same reference sequence, + # and a consensus sequence is being stitched from them. 
- ref_seq='A' * 20 + 'C' * 20 + 'T' * 20 + ref_seq = "A" * 20 + "C" * 20 + "T" * 20 contigs = [ - GenotypedContig(name='a', - seq='T' * 10 + 'A' * 5 + 'C' * 20 + 'A' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - GenotypedContig(name='b', - seq='C' * 20 + 'T' * 5 + 'A' * 10 + 'G' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="T" * 10 + "A" * 5 + "C" * 20 + "A" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig( + name="b", + seq="C" * 20 + "T" * 5 + "A" * 10 + "G" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 - assert results[0].seq == 'T' * 10 + 'A' * 5 + 'C' * 20 + 'T' * 5 + 'A' * 10 + 'G' * 10 + assert ( + results[0].seq == "T" * 10 + "A" * 5 + "C" * 20 + "T" * 5 + "A" * 10 + "G" * 10 + ) assert results[0].seq == contigs[0].seq[:-10] + contigs[1].seq[20:] assert len(visualizer().elements) > len(contigs) @@ -863,17 +958,18 @@ def test_partial_align_consensus_multiple_overlaping_sequences(exact_aligner, vi def test_big_insertion_in_a_single_contig(exact_aligner, visualizer): # Scenario: Single contig produces many alignments. 
- ref_seq='A' * 10 + 'B' * 20 + 'C' * 10 + ref_seq = "A" * 10 + "B" * 20 + "C" * 10 contigs = [ - GenotypedContig(name='a', - seq='B' * 10 + 'D' * 100 + 'B' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="B" * 10 + "D" * 100 + "B" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -885,17 +981,18 @@ def test_big_insertion_in_a_single_contig(exact_aligner, visualizer): def test_big_insertion_in_a_single_contig_2(exact_aligner, visualizer): # Scenario: Single contig produces many alignments. - ref_seq='A' * 10 + 'B' * 20 + 'C' * 10 + ref_seq = "A" * 10 + "B" * 20 + "C" * 10 contigs = [ - GenotypedContig(name='a', - seq='A' * 10 + 'D' * 100 + 'C' * 10, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="A" * 10 + "D" * 100 + "C" * 10, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -907,24 +1004,26 @@ def test_big_insertion_in_a_single_contig_2(exact_aligner, visualizer): def test_gap_around_small_insertion(exact_aligner, visualizer): # Scenario: Contig is split around its gap, then stripped. 
- ref_seq='A' * 10 + 'B' * 29 + 'C' * 10 + ref_seq = "A" * 10 + "B" * 29 + "C" * 10 contigs = [ - GenotypedContig(name='a', - seq='P' * 5 + 'A' * 10 + 'D' * 6 + 'C' * 10 + 'Z' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - GenotypedContig(name='b', - seq='Q' * 5 + 'B' * 29 + 'J' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="P" * 5 + "A" * 10 + "D" * 6 + "C" * 10 + "Z" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig( + name="b", + seq="Q" * 5 + "B" * 29 + "J" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -935,24 +1034,26 @@ def test_gap_around_small_insertion(exact_aligner, visualizer): def test_gap_around_big_insertion(exact_aligner, visualizer): # Scenario: Contig is split around its gap, then stripped. 
- ref_seq='A' * 10 + 'B' * 29 + 'C' * 10 + ref_seq = "A" * 10 + "B" * 29 + "C" * 10 contigs = [ - GenotypedContig(name='a', - seq='P' * 5 + 'A' * 10 + 'D' * 100 + 'C' * 10 + 'Z' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - GenotypedContig(name='b', - seq='Q' * 5 + 'B' * 29 + 'J' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="P" * 5 + "A" * 10 + "D" * 100 + "C" * 10 + "Z" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig( + name="b", + seq="Q" * 5 + "B" * 29 + "J" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -963,17 +1064,18 @@ def test_gap_around_big_insertion(exact_aligner, visualizer): def test_stitch_with_insertion(exact_aligner, visualizer): # Scenario: Contig is aligned with multiple hits, and the borders are correctly handled. - ref_seq='X' * 5 + 'A' * 10 + 'B' * 20 + 'C' * 10 + 'M' * 5 + ref_seq = "X" * 5 + "A" * 10 + "B" * 20 + "C" * 10 + "M" * 5 contigs = [ - GenotypedContig(name='a', - seq='P' * 5 + 'A' * 10 + 'D' * 6 + 'C' * 10 + 'Z' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="P" * 5 + "A" * 10 + "D" * 6 + "C" * 10 + "Z" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -981,21 +1083,21 @@ def test_stitch_with_insertion(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) - def test_stitch_cross_alignment(exact_aligner, visualizer): # Scenario: Single contig is cross-aligned. 
- ref_seq='X' * 5 + 'A' * 10 + 'B' * 20 + 'C' * 10 + 'M' * 5 + ref_seq = "X" * 5 + "A" * 10 + "B" * 20 + "C" * 10 + "M" * 5 contigs = [ - GenotypedContig(name='a', - seq='P' * 5 + 'C' * 10 + 'D' * 6 + 'A' * 10 + 'Z' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="P" * 5 + "C" * 10 + "D" * 6 + "A" * 10 + "Z" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -1006,24 +1108,26 @@ def test_stitch_cross_alignment(exact_aligner, visualizer): def test_cross_alignment_around_small_insertion(exact_aligner, visualizer): # Scenario: Single contig is cross-aligned, then combined with another contig that is between its aligned parts. - ref_seq='X' * 5 + 'A' * 10 + 'B' * 20 + 'C' * 10 + 'M' * 5 + ref_seq = "X" * 5 + "A" * 10 + "B" * 20 + "C" * 10 + "M" * 5 contigs = [ - GenotypedContig(name='a', - seq='P' * 5 + 'C' * 10 + 'D' * 6 + 'A' * 10 + 'Z' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - GenotypedContig(name='b', - seq='Q' * 5 + 'B' * 20 + 'J' * 5, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq="P" * 5 + "C" * 10 + "D" * 6 + "A" * 10 + "Z" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + GenotypedContig( + name="b", + seq="Q" * 5 + "B" * 20 + "J" * 5, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -1036,20 +1140,21 @@ def test_reverse_complement_match(projects, visualizer): from mappy import revcomp - hxb2_name = 'HIV1-B-FR-K03455-seed' + hxb2_name = "HIV1-B-FR-K03455-seed" ref = projects.getReference(hxb2_name) ref_part = ref[2000:2200] seq = revcomp(ref_part) contigs = 
[ - GenotypedContig(name='a', - seq=seq, - ref_name='testref', - group_ref='testref', - ref_seq=ref, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq=seq, + ref_name="testref", + group_ref="testref", + ref_seq=ref, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -1062,28 +1167,29 @@ def test_reverse_complement_match_with_padding(projects, visualizer): from mappy import revcomp - hxb2_name = 'HIV1-B-FR-K03455-seed' + hxb2_name = "HIV1-B-FR-K03455-seed" ref = projects.getReference(hxb2_name) - ref_part = 'T' * 24 + ref[2000:2200] + 'G' * 27 + ref_part = "T" * 24 + ref[2000:2200] + "G" * 27 seq = revcomp(ref_part) contigs = [ - GenotypedContig(name='a', - seq=seq, - ref_name='testref', - group_ref='testref', - ref_seq=ref, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq=seq, + ref_name="testref", + group_ref="testref", + ref_seq=ref, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 assert results[0].seq == ref_part assert len(lstrip(results[0]).seq) == len(ref_part) - 24 assert len(rstrip(results[0]).seq) == len(ref_part) - 27 - assert rstrip(results[0]).seq == ref_part[:-27] # 27 Gs on the right - assert lstrip(results[0]).seq == ref_part[24:] # 24 Ts on the left + assert rstrip(results[0]).seq == ref_part[:-27] # 27 Gs on the right + assert lstrip(results[0]).seq == ref_part[24:] # 24 Ts on the left assert len(visualizer().elements) > len(contigs) @@ -1092,20 +1198,29 @@ def test_multiple_reverse_complement_matches(projects, visualizer): from mappy import revcomp - hxb2_name = 'HIV1-B-FR-K03455-seed' + hxb2_name = "HIV1-B-FR-K03455-seed" ref = projects.getReference(hxb2_name) - ref_part = 'T' * 24 + ref[2000:2600] + 'A' * 9 + ref[3000:3600] + 'T' * 9 + ref[4000:4600] + 'G' * 27 + ref_part = ( + "T" * 24 + + ref[2000:2600] + + "A" * 9 + + ref[3000:3600] + + "T" * 9 + + ref[4000:4600] + + "G" * 27 + ) seq = revcomp(ref_part) 
contigs = [ - GenotypedContig(name='a', - seq=seq, - ref_name='testref', - group_ref='testref', - ref_seq=ref, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq=seq, + ref_name="testref", + group_ref="testref", + ref_seq=ref, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -1124,20 +1239,29 @@ def test_multiple_reverse_complement_matches_out_of_order(projects, visualizer): from mappy import revcomp - hxb2_name = 'HIV1-B-FR-K03455-seed' + hxb2_name = "HIV1-B-FR-K03455-seed" ref = projects.getReference(hxb2_name) - ref_part = 'T' * 24 + ref[2000:2600] + 'A' * 9 + ref[3000:3600] + 'T' * 9 + ref[4000:4600] + 'G' * 27 + ref_part = ( + "T" * 24 + + ref[2000:2600] + + "A" * 9 + + ref[3000:3600] + + "T" * 9 + + ref[4000:4600] + + "G" * 27 + ) seq = revcomp(ref_part) contigs = [ - GenotypedContig(name='a', - seq=seq, - ref_name='testref', - group_ref='testref', - ref_seq=ref, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq=seq, + ref_name="testref", + group_ref="testref", + ref_seq=ref, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -1155,19 +1279,20 @@ def test_forward_and_reverse_match(projects, visualizer): from mappy import revcomp - hxb2_name = 'HIV1-B-FR-K03455-seed' + hxb2_name = "HIV1-B-FR-K03455-seed" ref = projects.getReference(hxb2_name) seq = ref[1000:1100] + revcomp(ref[2000:2200]) contigs = [ - GenotypedContig(name='a', - seq=seq, - ref_name='testref', - group_ref='testref', - ref_seq=ref, - match_fraction=0.3, - ), - ] + GenotypedContig( + name="a", + seq=seq, + ref_name="testref", + group_ref="testref", + ref_seq=ref, + match_fraction=0.3, + ), + ] results = list(stitch_consensus(contigs)) assert len(results) == 1 @@ -1178,24 +1303,26 @@ def test_forward_and_reverse_match(projects, visualizer): def test_correct_stitching_of_one_normal_and_one_unknown(exact_aligner, visualizer): # Scenario: Two partially overlapping 
contigs are stitched correctly into a single sequence. - ref_seq = 'A' * 100 + 'C' * 100 + ref_seq = "A" * 100 + "C" * 100 contigs = [ - GenotypedContig(name='a', - seq='A' * 50 + 'C' * 20, - ref_name='testref', - group_ref='testref', - ref_seq=ref_seq, - match_fraction=0.5, - ), - GenotypedContig(name='b', - seq='A' * 20 + 'C' * 50, - ref_name=None, - group_ref=None, - ref_seq=None, - match_fraction=0.5, - ), - ] + GenotypedContig( + name="a", + seq="A" * 50 + "C" * 20, + ref_name="testref", + group_ref="testref", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="A" * 20 + "C" * 50, + ref_name=None, + group_ref=None, + ref_seq=None, + match_fraction=0.5, + ), + ] results = list(stitch_contigs(contigs)) assert len(results) == 2 @@ -1203,8 +1330,7 @@ def test_correct_stitching_of_one_normal_and_one_unknown(exact_aligner, visualiz assert 70 == len(results[0].seq) assert 70 == len(results[1].seq) - assert {result.seq for result in results} \ - == {contig.seq for contig in contigs} + assert {result.seq for result in results} == {contig.seq for contig in contigs} assert len(visualizer().elements) > len(contigs) @@ -1219,14 +1345,16 @@ def test_main_invocation(exact_aligner, tmp_path, hcv_db): assert os.path.exists(stitched_contigs) # Check the contents of stitched_contigs - with open(stitched_contigs, 'r') as stitched_file: + with open(stitched_contigs, "r") as stitched_file: stitched_data = stitched_file.read() expected_file_path = os.path.join(pwd, "data", "exact_parts_contigs_stitched.csv") - with open(expected_file_path, 'r') as expected_file: + with open(expected_file_path, "r") as expected_file: expected_data = expected_file.read() - assert stitched_data == expected_data, "The contents of the stitched contigs file do not match the expected contents." + assert ( + stitched_data == expected_data + ), "The contents of the stitched contigs file do not match the expected contents." 
def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): @@ -1240,21 +1368,24 @@ def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): assert os.path.exists(stitched_contigs) # Check the contents of stitched_contigs - with open(stitched_contigs, 'r') as stitched_file: + with open(stitched_contigs, "r") as stitched_file: stitched_data = stitched_file.read() expected_file_path = os.path.join(pwd, "data", "exact_parts_contigs_stitched.csv") - with open(expected_file_path, 'r') as expected_file: + with open(expected_file_path, "r") as expected_file: expected_data = expected_file.read() - assert stitched_data == expected_data, "The contents of the stitched contigs file do not match the expected contents." + assert ( + stitched_data == expected_data + ), "The contents of the stitched contigs file do not match the expected contents." # Check the contents of stitched_contigs expected_plot = os.path.join(pwd, "data", "exact_parts_contigs.plot.svg") - with open(plot, 'r') as stitched_file, \ - open(expected_plot, 'r') as expected_file: + with open(plot, "r") as stitched_file, open(expected_plot, "r") as expected_file: stitched_data = stitched_file.read() expected_data = expected_file.read() - assert stitched_data == expected_data, "The contents of the stitched plot file do not match the expected contents." + assert ( + stitched_data == expected_data + ), "The contents of the stitched plot file do not match the expected contents." 
# _ _ _ _ _ _ @@ -1264,68 +1395,48 @@ def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): # \___/|_| |_|_|\__| \__\___||___/\__|___/ # -@pytest.mark.parametrize("intervals, expected", [ - ([], []), - ([(1, 3)], [(1, 3)]), - - # Non-overlapping intervals - ([(1, 3), (5, 6)], [(1, 3), (5, 6)]), - - # Directly overlapping intervals - ([(1, 3), (2, 5)], [(1, 5)]), - - # Adjacent intervals that exactly touch each other - ([(1, 2), (3, 4)], [(1, 4)]), - - # Nested intervals - ([(1, 10), (2, 5)], [(1, 10)]), - - # Multiple merged intervals - ([(1, 3), (2, 4), (6, 8), (10, 11), (11, 12)], - [(1, 4), (6, 8), (10, 12)]), - - # Intervals out of initial order - ([(4, 6), (1, 2)], - [(1, 2), (4, 6)]), - - # Overlapping intervals with out of order inputs - ([(1, 4), (3, 5), (2, 3), (7, 10), (9, 12)], - [(1, 5), (7, 12)]), - - # Large set of intervals with various overlaps - ([(1, 4), (2, 6), (5, 8), (7, 8), (10, 15), (11, 12), (13, 14), (17, 18)], - [(1, 8), (10, 15), (17, 18)]), - # Intervals where end is less than start should return as is or be handled explicitly depending on implementation - ([(5, 3), (1, 2)], - [(1, 2), (5, 3)]), - - # Intervals that are exactly one after the other in sequence / Intervals that are completely disjoint - ([(1, 2), (4, 5), (7, 8)], - [(1, 2), (4, 5), (7, 8)]), - - # Overlapping intervals that merge into one large interval - ([(2, 6), (4, 10), (5, 15), (14, 20)], - [(2, 20)]), - - # Same interval repeated multiple times - ([(1, 5), (1, 5), (1, 5)], - [(1, 5)]), - - # Single point intervals - ([(1, 1), (5, 5), (3, 3)], - [(1, 1), (3, 3), (5, 5)]), - - ([(1, 1), (5, 5), (3, 3), (1, 1), (1, 1)], - [(1, 1), (3, 3), (5, 5)]), - - ([(1, 1), (2, 3)], - [(1, 3)]), - - # Intervals that start with negative numbers - ([(-5, 0), (-2, 3), (1, 7), (9, 12)], - [(-5, 7), (9, 12)]), -]) +@pytest.mark.parametrize( + "intervals, expected", + [ + ([], []), + ([(1, 3)], [(1, 3)]), + # Non-overlapping intervals + ([(1, 3), (5, 6)], [(1, 3), (5, 6)]), + # 
Directly overlapping intervals + ([(1, 3), (2, 5)], [(1, 5)]), + # Adjacent intervals that exactly touch each other + ([(1, 2), (3, 4)], [(1, 4)]), + # Nested intervals + ([(1, 10), (2, 5)], [(1, 10)]), + # Multiple merged intervals + ([(1, 3), (2, 4), (6, 8), (10, 11), (11, 12)], [(1, 4), (6, 8), (10, 12)]), + # Intervals out of initial order + ([(4, 6), (1, 2)], [(1, 2), (4, 6)]), + # Overlapping intervals with out of order inputs + ([(1, 4), (3, 5), (2, 3), (7, 10), (9, 12)], [(1, 5), (7, 12)]), + # Large set of intervals with various overlaps + ( + [(1, 4), (2, 6), (5, 8), (7, 8), (10, 15), (11, 12), (13, 14), (17, 18)], + [(1, 8), (10, 15), (17, 18)], + ), + # Intervals where end is less than start should return + # as is or be handled explicitly depending on implementation + ([(5, 3), (1, 2)], [(1, 2), (5, 3)]), + # Intervals that are exactly one after the other in sequence / Intervals that are completely disjoint + ([(1, 2), (4, 5), (7, 8)], [(1, 2), (4, 5), (7, 8)]), + # Overlapping intervals that merge into one large interval + ([(2, 6), (4, 10), (5, 15), (14, 20)], [(2, 20)]), + # Same interval repeated multiple times + ([(1, 5), (1, 5), (1, 5)], [(1, 5)]), + # Single point intervals + ([(1, 1), (5, 5), (3, 3)], [(1, 1), (3, 3), (5, 5)]), + ([(1, 1), (5, 5), (3, 3), (1, 1), (1, 1)], [(1, 1), (3, 3), (5, 5)]), + ([(1, 1), (2, 3)], [(1, 3)]), + # Intervals that start with negative numbers + ([(-5, 0), (-2, 3), (1, 7), (9, 12)], [(-5, 7), (9, 12)]), + ], +) def test_merge_intervals(intervals, expected): assert merge_intervals(intervals) == expected @@ -1352,72 +1463,94 @@ def create_mock_aligned_contig(ref_name, r_st, r_ei, name="contig"): return MockAlignedContig(ref_name, ref_name, r_st, r_ei, name) -@pytest.mark.parametrize("contigs, expected_covered_name", [ - # No contigs are completely covered. 
- ([('ref1', 0, 100), ('ref1', 101, 200)], None), - ([('ref1', 0, 50), ('ref1', 51, 100)], None), - - # A single contig is completely covered by one other contig. - ([('ref1', 0, 100), ('ref1', 0, 200)], 'contig1'), - ([('ref1', 50, 150), ('ref1', 0, 200)], 'contig1'), - - # A single contig completely covers another, but with different reference names. - ([('ref1', 0, 50), ('ref2', 0, 100)], None), - - # Single coverage with exact match. - ([('ref1', 0, 100), ('ref1', 0, 100)], 'contig1'), - - # A single contig is completely covered at the beginning by one and at the end by another contig. - ([('ref1', 0, 50), ('ref1', 50, 100), ('ref1', 25, 75)], 'contig3'), - - # Contigs overlap but none are completely covered. - ([('ref1', 0, 50), ('ref1', 40, 90), ('ref1', 80, 120)], None), - - # Multiple contigs with some covered completely by a single other contig. - ([('ref1', 0, 200), ('ref1', 10, 30), ('ref1', 170, 190)], 'contig2'), - - # Multiple contigs with complex overlaps and one completely covered. - ([('ref1', 30, 60), ('ref1', 0, 50), ('ref1', 20, 70), ('ref1', 60, 90)], 'contig1'), - - # Edge case where a contig starts where another ends. - ([('ref1', 0, 50), ('ref1', 50, 100)], None), - - # Contigs are completely covered in a nested fashion. - ([('ref1', 0, 200), ('ref1', 50, 150), ('ref1', 100, 125)], 'contig2'), - - # Contigs are adjacent and cover each other completely. - ([('ref1', 0, 100), ('ref1', 101, 200), ('ref1', 0, 200)], 'contig1'), - - # Single large contig covers several smaller non-adjacent contigs. - ([('ref1', 0, 500), ('ref1', 50, 100), ('ref1', 200, 250), ('ref1', 300, 350)], 'contig2'), - - # Single large contig covers several smaller adjacent contigs. - ([('ref1', 50, 100), ('ref1', 70, 300), ('ref1', 101, 199), ('ref1', 200, 350)], 'contig2'), - - # Single small contig is covered by several larger contigs. 
- ([('ref1', 0, 250), ('ref1', 200, 300), ('ref1', 600, 800), ('ref1', 250, 700)], 'contig2'), - - # Complex case with multiple contigs and complete coverage by combinations. - ([('ref1', 0, 100), ('ref1', 30, 130), ('ref1', 60, 160), ('ref1', 90, 190), ('ref1', 120, 220)], 'contig2'), - - # Contigs with same start but different end, where one is covered. - ([('ref1', 0, 100), ('ref1', 0, 50)], 'contig2'), - - # Contigs with same end but different start, where one is covered. - ([('ref1', 50, 100), ('ref1', 0, 100)], 'contig1'), - - # Contig covered by two overlapping contigs that don't individually cover the whole range. - ([('ref1', 0, 75), ('ref1', 25, 100), ('ref1', 0, 100)], 'contig1'), - - # Two contigs are covered completely by one large contig. - ([('ref1', 0, 300), ('ref1', 50, 100), ('ref1', 200, 250)], 'contig2'), - - # No contigs at all. - ([], None), -]) +@pytest.mark.parametrize( + "contigs, expected_covered_name", + [ + # No contigs are completely covered. + ([("ref1", 0, 100), ("ref1", 101, 200)], None), + ([("ref1", 0, 50), ("ref1", 51, 100)], None), + # A single contig is completely covered by one other contig. + ([("ref1", 0, 100), ("ref1", 0, 200)], "contig1"), + ([("ref1", 50, 150), ("ref1", 0, 200)], "contig1"), + # A single contig completely covers another, but with different reference names. + ([("ref1", 0, 50), ("ref2", 0, 100)], None), + # Single coverage with exact match. + ([("ref1", 0, 100), ("ref1", 0, 100)], "contig1"), + # A single contig is completely covered at the beginning by one and at the end by another contig. + ([("ref1", 0, 50), ("ref1", 50, 100), ("ref1", 25, 75)], "contig3"), + # Contigs overlap but none are completely covered. + ([("ref1", 0, 50), ("ref1", 40, 90), ("ref1", 80, 120)], None), + # Multiple contigs with some covered completely by a single other contig. + ([("ref1", 0, 200), ("ref1", 10, 30), ("ref1", 170, 190)], "contig2"), + # Multiple contigs with complex overlaps and one completely covered. 
+ ( + [("ref1", 30, 60), ("ref1", 0, 50), ("ref1", 20, 70), ("ref1", 60, 90)], + "contig1", + ), + # Edge case where a contig starts where another ends. + ([("ref1", 0, 50), ("ref1", 50, 100)], None), + # Contigs are completely covered in a nested fashion. + ([("ref1", 0, 200), ("ref1", 50, 150), ("ref1", 100, 125)], "contig2"), + # Contigs are adjacent and cover each other completely. + ([("ref1", 0, 100), ("ref1", 101, 200), ("ref1", 0, 200)], "contig1"), + # Single large contig covers several smaller non-adjacent contigs. + ( + [ + ("ref1", 0, 500), + ("ref1", 50, 100), + ("ref1", 200, 250), + ("ref1", 300, 350), + ], + "contig2", + ), + # Single large contig covers several smaller adjacent contigs. + ( + [ + ("ref1", 50, 100), + ("ref1", 70, 300), + ("ref1", 101, 199), + ("ref1", 200, 350), + ], + "contig2", + ), + # Single small contig is covered by several larger contigs. + ( + [ + ("ref1", 0, 250), + ("ref1", 200, 300), + ("ref1", 600, 800), + ("ref1", 250, 700), + ], + "contig2", + ), + # Complex case with multiple contigs and complete coverage by combinations. + ( + [ + ("ref1", 0, 100), + ("ref1", 30, 130), + ("ref1", 60, 160), + ("ref1", 90, 190), + ("ref1", 120, 220), + ], + "contig2", + ), + # Contigs with same start but different end, where one is covered. + ([("ref1", 0, 100), ("ref1", 0, 50)], "contig2"), + # Contigs with same end but different start, where one is covered. + ([("ref1", 50, 100), ("ref1", 0, 100)], "contig1"), + # Contig covered by two overlapping contigs that don't individually cover the whole range. + ([("ref1", 0, 75), ("ref1", 25, 100), ("ref1", 0, 100)], "contig1"), + # Two contigs are covered completely by one large contig. + ([("ref1", 0, 300), ("ref1", 50, 100), ("ref1", 200, 250)], "contig2"), + # No contigs at all. 
+ ([], None), + ], +) def test_find_covered(contigs, expected_covered_name): - mock_contigs = [create_mock_aligned_contig(ref_name, r_st, r_ei, f'contig{i+1}') - for i, (ref_name, r_st, r_ei) in enumerate(contigs)] + mock_contigs = [ + create_mock_aligned_contig(ref_name, r_st, r_ei, f"contig{i+1}") + for i, (ref_name, r_st, r_ei) in enumerate(contigs) + ] covered, covering = find_covered_contig(mock_contigs) if expected_covered_name is None: assert covered is None @@ -1428,30 +1561,33 @@ def test_find_covered(contigs, expected_covered_name): def test_concordance_same_length_inputs(): with pytest.raises(ValueError): - calculate_concordance('abc', 'ab') + calculate_concordance("abc", "ab") + def test_concordance_completely_different_strings(): - result = calculate_concordance('a'*30, 'b'*30) + result = calculate_concordance("a" * 30, "b" * 30) assert all(n == 0 for n in result) + def generate_random_string_pair(length): - left = ''.join(random.choice('ACGT') for _ in range(length)) - right = ''.join(random.choice('ACGT') for _ in range(length)) + left = "".join(random.choice("ACGT") for _ in range(length)) + right = "".join(random.choice("ACGT") for _ in range(length)) return left, right @pytest.mark.parametrize( - 'left, right, expected', - [("aaaaa", "aaaaa", [0.6, 0.68, 0.7, 0.68, 0.6]), - ("abcdd", "abcdd", [0.6, 0.68, 0.7, 0.68, 0.6]), - ("aaaaaaaa", "baaaaaab", [0.3, 0.62, 0.71, 0.75, 0.75, 0.71, 0.62, 0.3]), - ("aaaaaaaa", "aaaaaaab", [0.64, 0.73, 0.79, 0.8, 0.79, 0.73, 0.64, 0.31]), - ("aaaaaaaa", "aaaaaaab", [0.64, 0.73, 0.79, 0.8, 0.79, 0.73, 0.64, 0.31]), - ("aaaaaaaa", "aaaaabbb", [0.6, 0.68, 0.7, 0.68, 0.6, 0.29, 0.19, 0.13]), - ("aaaaaaaa", "aaabbaaa", [0.56, 0.63, 0.62, 0.39, 0.39, 0.62, 0.63, 0.56]), - ("aaaaa", "bbbbb", [0] * 5), - ("", "", []), - ] + "left, right, expected", + [ + ("aaaaa", "aaaaa", [0.6, 0.68, 0.7, 0.68, 0.6]), + ("abcdd", "abcdd", [0.6, 0.68, 0.7, 0.68, 0.6]), + ("aaaaaaaa", "baaaaaab", [0.3, 0.62, 0.71, 0.75, 0.75, 0.71, 0.62, 
0.3]), + ("aaaaaaaa", "aaaaaaab", [0.64, 0.73, 0.79, 0.8, 0.79, 0.73, 0.64, 0.31]), + ("aaaaaaaa", "aaaaaaab", [0.64, 0.73, 0.79, 0.8, 0.79, 0.73, 0.64, 0.31]), + ("aaaaaaaa", "aaaaabbb", [0.6, 0.68, 0.7, 0.68, 0.6, 0.29, 0.19, 0.13]), + ("aaaaaaaa", "aaabbaaa", [0.56, 0.63, 0.62, 0.39, 0.39, 0.62, 0.63, 0.56]), + ("aaaaa", "bbbbb", [0] * 5), + ("", "", []), + ], ) def test_concordance_simple(left, right, expected): result = [round(float(x), 2) for x in calculate_concordance(left, right)] @@ -1459,18 +1595,31 @@ def test_concordance_simple(left, right, expected): @pytest.mark.parametrize( - 'left, right, expected', - [("a" * 128, "a" * 128, 64), - ("a" * 128, "a" * 64 + "b" * 64, 32), - ("a" * 128, "a" * 64 + "ba" * 32, 32), - ("a" * 128, "a" * 54 + "b" * 20 + "a" * 54, 28), # two peaks - ("a" * 128, "a" * 63 + "b" * 2 + "a" * 63, 32), # two peaks - ("a" * 1280, "b" * 640 + "a" * 640, round(1280 * 3 / 4)), - ("a" * 128, "b" * 48 + "a" * 32 + "b" * 48, 64), - ("a" * 128, "b" * 48 + "a" * 15 + "ab" + "a" * 15 + "b" * 48, 48 + 16//2), # two peaks - choosing 1nd - ("a" * 128, "b" * 48 + "a" * 15 + "ba" + "a" * 15 + "b" * 48, 48 + 15 + 16//2), # two peaks - choosing 2nd - ("a" * 128, "b" * 48 + "a" * 15 + "bb" + "a" * 15 + "b" * 48, 48 + 15//2), # two peaks - choosing 1st - ] + "left, right, expected", + [ + ("a" * 128, "a" * 128, 64), + ("a" * 128, "a" * 64 + "b" * 64, 32), + ("a" * 128, "a" * 64 + "ba" * 32, 32), + ("a" * 128, "a" * 54 + "b" * 20 + "a" * 54, 28), # two peaks + ("a" * 128, "a" * 63 + "b" * 2 + "a" * 63, 32), # two peaks + ("a" * 1280, "b" * 640 + "a" * 640, round(1280 * 3 / 4)), + ("a" * 128, "b" * 48 + "a" * 32 + "b" * 48, 64), + ( + "a" * 128, + "b" * 48 + "a" * 15 + "ab" + "a" * 15 + "b" * 48, + 48 + 16 // 2, + ), # two peaks - choosing 1nd + ( + "a" * 128, + "b" * 48 + "a" * 15 + "ba" + "a" * 15 + "b" * 48, + 48 + 15 + 16 // 2, + ), # two peaks - choosing 2nd + ( + "a" * 128, + "b" * 48 + "a" * 15 + "bb" + "a" * 15 + "b" * 48, + 48 + 15 // 2, + ), 
# two peaks - choosing 1st + ], ) def test_concordance_simple_index(left, right, expected): concordance = calculate_concordance(left, right) @@ -1485,22 +1634,33 @@ def generate_test_cases(num_cases): length = random.randint(1, 80) return [generate_random_string_pair(length) for _ in range(num_cases)] + concordance_cases = generate_test_cases(num_cases=100) -@pytest.mark.parametrize('left, right', concordance_cases) +@pytest.mark.parametrize("left, right", concordance_cases) def test_concordance_output_range(left, right): result = calculate_concordance(left, right) - assert all(0 <= n <= 1 for n in result), "All values in result should be between 0 and 1" + assert all( + 0 <= n <= 1 for n in result + ), "All values in result should be between 0 and 1" -@pytest.mark.parametrize('left, right', concordance_cases) +@pytest.mark.parametrize("left, right", concordance_cases) def test_concordance_higher_if_more_matches_added(left, right): # Insert exact matches in the middle - matching_sequence = 'A' * 30 + matching_sequence = "A" * 30 insert_position = len(left) // 2 - new_left = left[:insert_position] + matching_sequence + left[insert_position + len(matching_sequence):] - new_right = right[:insert_position] + matching_sequence + right[insert_position + len(matching_sequence):] + new_left = ( + left[:insert_position] + + matching_sequence + + left[insert_position + len(matching_sequence):] + ) + new_right = ( + right[:insert_position] + + matching_sequence + + right[insert_position + len(matching_sequence):] + ) old_conc = calculate_concordance(left, right) new_conc = calculate_concordance(new_left, new_right) @@ -1509,22 +1669,36 @@ def test_concordance_higher_if_more_matches_added(left, right): assert old_average <= new_average -@pytest.mark.parametrize('left, right', concordance_cases) +@pytest.mark.parametrize("left, right", concordance_cases) def test_concordance_higher_in_matching_areas(left, right): # Insert exact matches in the middle - matching_sequence = 'A' * 
30 + matching_sequence = "A" * 30 insert_position = len(left) // 2 - new_left = left[:insert_position] + matching_sequence + left[insert_position + len(matching_sequence):] - new_right = right[:insert_position] + matching_sequence + right[insert_position + len(matching_sequence):] + new_left = ( + left[:insert_position] + + matching_sequence + + left[insert_position + len(matching_sequence):] + ) + new_right = ( + right[:insert_position] + + matching_sequence + + right[insert_position + len(matching_sequence):] + ) concordance_scores = calculate_concordance(new_left, new_right) # Check concordance in the matching area - matching_area_concordance = concordance_scores[insert_position:insert_position + len(matching_sequence)] + matching_area_concordance = concordance_scores[ + insert_position:insert_position + len(matching_sequence) + ] # Calculate average concordance inside and outside the matching area average_inside = sum(matching_area_concordance) / len(matching_sequence) - average_outside = (sum(concordance_scores) - sum(matching_area_concordance)) / (len(concordance_scores) - len(matching_sequence)) + average_outside = (sum(concordance_scores) - sum(matching_area_concordance)) / ( + len(concordance_scores) - len(matching_sequence) + ) # Assert that the concordance is indeed higher in the matching area - assert average_inside > average_outside, "Concordance in matching areas should be higher than in non-matching areas" + assert ( + average_inside > average_outside + ), "Concordance in matching areas should be higher than in non-matching areas" diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index a83a07761..8271a1a88 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -1,7 +1,14 @@ import pytest import json import os -from micall.core.contig_stitcher import GenotypedContig, AlignedContig, stitch_consensus, stitch_contigs, split_contigs_with_gaps, 
drop_completely_covered, combine_overlaps, StitcherContext +from micall.core.contig_stitcher import ( + GenotypedContig, + AlignedContig, + stitch_consensus, + stitch_contigs, + drop_completely_covered, + StitcherContext, +) import micall.core.contig_stitcher as stitcher from micall.core.plot_contigs import build_stitcher_figure from micall.utils.cigar_tools import CigarHit, Cigar @@ -12,7 +19,7 @@ @pytest.fixture def no_aligner(monkeypatch): - monkeypatch.setattr('micall.core.contig_stitcher.align_to_reference', lambda x: [x]) + monkeypatch.setattr("micall.core.contig_stitcher.align_to_reference", lambda x: [x]) @pytest.fixture(autouse=True) @@ -22,28 +29,34 @@ def stitcher_context(): def read_contigs(line): array = json.loads(line) - contig_descriptions = [obj['fields'] for obj in array if obj['type'] == 'contig'] + contig_descriptions = [obj["fields"] for obj in array if obj["type"] == "contig"] for description in contig_descriptions: - start = description['start'] - end = description['end'] - name = description['name'] + start = description["start"] + end = description["end"] + name = description["name"] length = end - start + 1 assert length > 0 - ref_seq = 'A' * 1000 # it does not matter - seq='C' * 10 + 'A' * length + 'T' * 10 + ref_seq = "A" * 1000 # it does not matter + seq = "C" * 10 + "A" * length + "T" * 10 query = GenotypedContig( name=name, seq=seq, - ref_name='commonref', - group_ref='commongroup', + ref_name="commonref", + group_ref="commongroup", ref_seq=ref_seq, - match_fraction=2/3, + match_fraction=2 / 3, + ) + alignment = CigarHit( + Cigar([(length, CigarActions.MATCH)]), + q_st=20, + q_ei=20 + length - 1, + r_st=start, + r_ei=end, ) - alignment = CigarHit(Cigar([(length, CigarActions.MATCH)]), q_st=20, q_ei=20+length-1, r_st=start, r_ei=end) contig = AlignedContig.make(query=query, alignment=alignment, strand="forward") - aidee = f'{start:03d}-{end:03d}' - yield {'contig': contig, 'id': aidee} + aidee = f"{start:03d}-{end:03d}" + yield 
{"contig": contig, "id": aidee} def get_case_descriptions(): @@ -52,26 +65,26 @@ def get_case_descriptions(): with open(jsonfile, "r", encoding="utf8") as reader: for line in reader: read = list(read_contigs(line)) - contigs = [x['contig'] for x in read] - ids = [x['id'] for x in read] - aidee = ','.join(ids) - yield {'contigs': contigs, 'id': aidee} + contigs = [x["contig"] for x in read] + ids = [x["id"] for x in read] + aidee = ",".join(ids) + yield {"contigs": contigs, "id": aidee} all_case_descriptions = list(get_case_descriptions()) -all_case_ids = [x['id'] for x in all_case_descriptions] +all_case_ids = [x["id"] for x in all_case_descriptions] -@pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) +@pytest.mark.parametrize("description", all_case_descriptions, ids=all_case_ids) def test_contig_number_prop(no_aligner, description): - contigs = description['contigs'] + contigs = description["contigs"] stitched = list(stitch_consensus(contigs)) assert len(stitched) <= len(contigs) -@pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) +@pytest.mark.parametrize("description", all_case_descriptions, ids=all_case_ids) def test_contig_number_prop2(no_aligner, description): - contigs = description['contigs'] + contigs = description["contigs"] consensus = list(stitch_consensus(contigs)) stitched = list(stitch_contigs(contigs)) uncovered = list(drop_completely_covered(contigs)) @@ -81,19 +94,22 @@ def test_contig_number_prop2(no_aligner, description): def test_contig_number_prop2_existential(): # This test is just to confirm that our cases cover all sub-actions. 
- contig_sets = [x['contigs'] for x in all_case_descriptions] + contig_sets = [x["contigs"] for x in all_case_descriptions] - assert any(len(list(stitch_contigs(contigs))) > - len(list(stitch_consensus(contigs))) - for contigs in contig_sets) + assert any( + len(list(stitch_contigs(contigs))) > len(list(stitch_consensus(contigs))) + for contigs in contig_sets + ) - assert any(len(list(drop_completely_covered(contigs))) > - len(list(stitch_contigs(contigs))) - for contigs in contig_sets) + assert any( + len(list(drop_completely_covered(contigs))) > len(list(stitch_contigs(contigs))) + for contigs in contig_sets + ) - assert any(len(list(contigs)) > - len(list(drop_completely_covered(contigs))) - for contigs in contig_sets) + assert any( + len(list(contigs)) > len(list(drop_completely_covered(contigs))) + for contigs in contig_sets + ) def get_all_reference_positions(contigs: List[AlignedContig]): @@ -105,9 +121,9 @@ def get_all_reference_positions(contigs: List[AlignedContig]): return ret -@pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) +@pytest.mark.parametrize("description", all_case_descriptions, ids=all_case_ids) def test_stitching_intervals_prop(no_aligner, description): - contigs = description['contigs'] + contigs = description["contigs"] stitched = list(stitch_contigs(contigs)) initial_positions = get_all_reference_positions(contigs) stitched_positions = get_all_reference_positions(stitched) @@ -119,11 +135,11 @@ def test_stitching_intervals_prop(no_aligner, description): assert all(v == 1 for (k, v) in stitched_positions.items()) -@pytest.mark.parametrize('description', all_case_descriptions, ids=all_case_ids) +@pytest.mark.parametrize("description", all_case_descriptions, ids=all_case_ids) def test_visualizer_simple(no_aligner, description): - contigs = description['contigs'] + contigs = description["contigs"] with StitcherContext.fresh() as ctx: - stitched = list(stitch_consensus(contigs)) + list(stitch_consensus(contigs)) 
assert len(ctx.events) >= len(contigs) figure = build_stitcher_figure(ctx.events) assert len(figure.elements) > len(contigs) + 1 diff --git a/micall/tests/test_tests_utils.py b/micall/tests/test_tests_utils.py index 8e7a61edd..3e0c62415 100644 --- a/micall/tests/test_tests_utils.py +++ b/micall/tests/test_tests_utils.py @@ -1,5 +1,4 @@ -import pytest from micall.tests.utils import MockAligner, MockAlignment def test_basic_mapping(): diff --git a/micall/tests/utils.py b/micall/tests/utils.py index 8c7b8b3cf..49fddbc07 100644 --- a/micall/tests/utils.py +++ b/micall/tests/utils.py @@ -1,10 +1,10 @@ from dataclasses import dataclass -from math import floor, ceil from contextlib import contextmanager import random from micall.utils.consensus_aligner import CigarActions + def find_all_occurrences(s, substring): start = 0 while True: @@ -14,9 +14,10 @@ def find_all_occurrences(s, substring): yield start start += len(substring) + @dataclass class MockAlignment: - strand: int # +1 if on the forward strand; -1 if on the reverse strand + strand: int # +1 if on the forward strand; -1 if on the reverse strand mapq: int cigar: list cigar_str: str @@ -37,7 +38,6 @@ def __init__(self, seq, *args, **kwargs): self.max_matches = 5 self.min_length = 3 - def map(self, seq): max_matches = self.max_matches returned = set() @@ -47,13 +47,12 @@ def map(self, seq): substring = seq[start:end] for r_st in find_all_occurrences(self.seq, substring): mapq = 60 - strand = 1 # Doesn't handle reverse complements in this mock. + strand = 1 # Doesn't handle reverse complements in this mock. 
r_en = r_st + len(substring) q_st = start q_en = end cigar = [[q_en - q_st, CigarActions.MATCH]] cigar_str = f'{(q_en - q_st)}M' - al = MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) if (q_st, q_en, r_st, r_en) not in returned: returned.add((q_st, q_en, r_st, r_en)) yield MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index d129039d5..dd2667f4d 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -4,10 +4,9 @@ from math import ceil, floor import re -from typing import Container, Tuple, Iterable, Optional, Set, Dict, List, Union +from typing import Tuple, Iterable, Optional, Set, List, Union from dataclasses import dataclass from functools import cached_property, reduce -from itertools import chain, dropwhile from fractions import Fraction from micall.utils.consensus_aligner import CigarActions @@ -26,8 +25,7 @@ class IntDict(dict): def __init__(self): super().__init__() self.domain: Set[int] = set() # superset of self.keys() - self.codomain: Set[int] = set() # superset of self.values() - + self.codomain: Set[int] = set() # superset of self.values() def extend(self, key: Optional[int], value: Optional[int]): if key is not None and value is not None: @@ -39,15 +37,12 @@ def extend(self, key: Optional[int], value: Optional[int]): if value is not None: self.codomain.add(value) - def left_max(self, index) -> Optional[int]: return max((v for (k, v) in self.items() if k <= index), default=None) - def right_min(self, index) -> Optional[int]: return min((v for (k, v) in self.items() if k >= index), default=None) - def translate(self, domain_delta: int, codomain_delta: int) -> 'IntDict': """ Generates a new IntDict by shifting the entire mapping -- keys and values @@ -84,7 +79,6 @@ def __init__(self): self.ref_to_op = IntDict() self.query_to_op = IntDict() - def extend(self, ref_index: Optional[int], query_index: Optional[int], @@ 
-95,7 +89,6 @@ def extend(self, self.ref_to_op.extend(ref_index, op_index) self.query_to_op.extend(query_index, op_index) - def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMapping': """ Generate a new CoordinateMapping with shifted coordinate spaces. @@ -115,12 +108,10 @@ def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMappin return ret - def __eq__(self, other): return (self.ref_to_op, self.query_to_op) \ == (other.ref_to_op, other.query_to_op) - def __repr__(self): return f'CoordinateMapping({self.ref_to_op},{self.query_to_op})' @@ -146,11 +137,9 @@ class Cigar: CIGAR strings are defined in the SAM specification (https://samtools.github.io/hts-specs/SAMv1.pdf). """ - def __init__(self, data) -> None: self._data: List[Tuple[int, CigarActions]] = list(Cigar.normalize(data)) - @staticmethod def coerce(obj: Union['Cigar', str, Iterable[Tuple[int, CigarActions]]]): if isinstance(obj, Cigar): @@ -164,7 +153,6 @@ def coerce(obj: Union['Cigar', str, Iterable[Tuple[int, CigarActions]]]): raise TypeError(f"Cannot coerce {obj!r} to CIGAR string.") - def iterate_operations(self) -> Iterable[CigarActions]: """ Yields each operation in the CIGAR sequence as a `CigarActions` enum. 
@@ -175,7 +163,6 @@ def iterate_operations(self) -> Iterable[CigarActions]: for _ in range(num): yield operation - def iterate_operations_with_pointers(self) -> Iterable[Tuple[CigarActions, Optional[int], Optional[int]]]: """ Iterates over the operations while tracking the reference and @@ -210,7 +197,6 @@ def iterate_operations_with_pointers(self) -> Iterable[Tuple[CigarActions, Optio else: yield (operation, None, None) - def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': """ Creates a new Cigar object by slicing the current one from start_inclusive to @@ -225,7 +211,6 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': return Cigar([(1, op) for op in self.iterate_operations()] [start_inclusive:end_noninclusive]) - def lstrip_query(self) -> 'Cigar': """ Return a copy of the Cigar with leading (unmatched) query elements removed. """ @@ -237,7 +222,6 @@ def lstrip_query(self) -> 'Cigar': if query_pointer is None or i >= min_op] return Cigar.coerce(ops) - def rstrip_query(self) -> 'Cigar': """ Return a copy of the Cigar with trailing (unmatched) query elements removed. """ @@ -249,7 +233,6 @@ def rstrip_query(self) -> 'Cigar': if query_pointer is None or i <= max_op] return Cigar.coerce(ops) - def lstrip_reference(self) -> 'Cigar': """ Return a copy of the Cigar with leading (unmatched) reference elements removed. """ @@ -261,7 +244,6 @@ def lstrip_reference(self) -> 'Cigar': if ref_pointer is None or i >= min_op] return Cigar.coerce(ops) - def rstrip_reference(self) -> 'Cigar': """ Return a copy of the Cigar with trailing (unmatched) reference elements removed. """ @@ -273,11 +255,11 @@ def rstrip_reference(self) -> 'Cigar': if ref_pointer is None or i <= max_op] return Cigar.coerce(ops) - @cached_property def coordinate_mapping(self) -> CoordinateMapping: """ - Convert this CIGAR string to coordinate mapping representing a reference-to-query and query-to-reference coordinate mappings. 
+ Convert this CIGAR string to coordinate mapping representing + a reference-to-query and query-to-reference coordinate mappings. :param cigar: a CIGAR string. @@ -294,7 +276,6 @@ def coordinate_mapping(self) -> CoordinateMapping: return mapping - def to_msa(self, reference_seq, query_seq) -> Tuple[str, str]: """ Constructs a multiple sequence alignment (MSA) representation for this Cigar, using the original reference @@ -325,42 +306,38 @@ def to_msa(self, reference_seq, query_seq) -> Tuple[str, str]: return reference_msa, query_msa - @cached_property def op_length(self): return sum(1 for x in self.iterate_operations()) - @cached_property def query_length(self): return max((query_pointer + 1 if query_pointer is not None else 0 for (_, _, query_pointer) in self.iterate_operations_with_pointers()), default=0) - @cached_property def ref_length(self): return max((ref_pointer + 1 if ref_pointer is not None else 0 for (_, ref_pointer, _) in self.iterate_operations_with_pointers()), default=0) - # # - # Boring boilerplate code below # - # # + # # + # Boring boilerplate code below # + # # OP_MAPPING = { - 'M': CigarActions.MATCH, # Alignment match (can be a sequence match or mismatch) - 'I': CigarActions.INSERT, # Insertion to the reference - 'D': CigarActions.DELETE, # Deletion from the reference - 'N': CigarActions.SKIPPED, # Skipped region from the reference - 'S': CigarActions.SOFT_CLIPPED, # Soft clip on the read (ignored region, not aligned but present in the read) - 'H': CigarActions.HARD_CLIPPED, # Hard clip on the read (ignored region, not present in the read) - 'P': CigarActions.PADDING, # Padding (silent deletion from padded reference, not applicable for our case) - '=': CigarActions.SEQ_MATCH, # Sequence match - 'X': CigarActions.MISMATCH, # Sequence mismatch + 'M': CigarActions.MATCH, # Alignment match (can be a sequence match or mismatch) + 'I': CigarActions.INSERT, # Insertion to the reference + 'D': CigarActions.DELETE, # Deletion from the reference + 'N': 
CigarActions.SKIPPED, # Skipped region from the reference + 'S': CigarActions.SOFT_CLIPPED, # Soft clip on the read (ignored region, not aligned but present in the read) + 'H': CigarActions.HARD_CLIPPED, # Hard clip on the read (ignored region, not present in the read) + 'P': CigarActions.PADDING, # Padding (silent deletion from padded reference, not applicable for our case) + '=': CigarActions.SEQ_MATCH, # Sequence match + 'X': CigarActions.MISMATCH, # Sequence mismatch } - @staticmethod def parse_operation(operation: str) -> CigarActions: if operation in Cigar.OP_MAPPING: @@ -368,12 +345,10 @@ def parse_operation(operation: str) -> CigarActions: else: raise ValueError(f"Unexpected CIGAR action: {operation}.") - @staticmethod def operation_to_str(op: CigarActions) -> str: return [k for (k, v) in Cigar.OP_MAPPING.items() if v == op][0] - @staticmethod def parse(string) -> 'Cigar': """ @@ -395,7 +370,6 @@ def parse(string) -> 'Cigar': return Cigar(data) - @staticmethod def normalize(cigar_lst) -> Iterable[Tuple[int, CigarActions]]: """ @@ -419,7 +393,7 @@ def normalize(cigar_lst) -> Iterable[Tuple[int, CigarActions]]: if not isinstance(num, int) or not isinstance(operation, CigarActions): raise ValueError(f"Invalid CIGAR list: {item!r} is not a number/operation tuple.") if num < 0: - raise ValueError(f"Invalid CIGAR list: number of operations is negative.") + raise ValueError("Invalid CIGAR list: number of operations is negative.") # Normalization if num == 0: @@ -431,24 +405,22 @@ def normalize(cigar_lst) -> Iterable[Tuple[int, CigarActions]]: last_item = (last_num + num, operation) continue - if last_item: yield (last_item[0], last_item[1]) + if last_item: + yield (last_item[0], last_item[1]) last_item = item - if last_item: yield (last_item[0], last_item[1]) - + if last_item: + yield (last_item[0], last_item[1]) def __eq__(self, other) -> bool: return isinstance(other, Cigar) and self._data == other._data - def __add__(self, other: 'Cigar'): return 
Cigar(self._data + other._data) - def __repr__(self): return f'Cigar({str(self)!r})' - def __str__(self): """ Inverse of Cigar.parse """ return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self._data) @@ -472,10 +444,9 @@ class CigarHit: cigar: Cigar r_st: int - r_ei: int # inclusive + r_ei: int # inclusive q_st: int - q_ei: int # inclusive - + q_ei: int # inclusive def __post_init__(self): if self.ref_length != self.cigar.ref_length: @@ -486,17 +457,14 @@ def __post_init__(self): raise ValueError(f"CIGAR string maps {self.cigar.query_length}" f" query positions, but CIGAR hit range is {self.query_length}") - @property def ref_length(self): return self.r_ei + 1 - self.r_st - @property def query_length(self): return self.q_ei + 1 - self.q_st - @staticmethod def from_default_alignment(r_st, r_ei, q_st, q_ei): """ @@ -511,7 +479,6 @@ def from_default_alignment(r_st, r_ei, q_st, q_ei): return CigarHit(cigar, r_st=r_st, r_ei=r_ei, q_st=q_st, q_ei=q_ei) - def overlaps(self, other) -> bool: """ Determines whether this CigarHit overlaps with another in terms of reference or query coordinates. @@ -526,7 +493,6 @@ def intervals_overlap(x, y): return intervals_overlap((self.r_st, self.r_ei), (other.r_st, other.r_ei)) \ or intervals_overlap((self.q_st, self.q_ei), (other.q_st, other.q_ei)) - def touches(self, other) -> bool: """ Checks if the end of this CigarHit is immediately adjacent to the start of another one. 
@@ -534,8 +500,7 @@ def touches(self, other) -> bool: """ return self.r_ei + 1 == other.r_st \ - and self.q_ei + 1 == other.q_st - + and self.q_ei + 1 == other.q_st def _gaps(self, is_deletions: bool) -> Iterable['CigarHit']: last_query_index = self.q_st @@ -570,7 +535,6 @@ def _gaps(self, is_deletions: bool) -> Iterable['CigarHit']: if op_index in op_to_ref: last_ref_index = op_to_ref[op_index] - def deletions(self) -> Iterable['CigarHit']: return self._gaps(is_deletions=True) @@ -592,7 +556,6 @@ def __add__(self, other): q_st=self.q_st, q_ei=other.q_ei) - def connect(self, other): """ Inserts deletions/insertions between self and other, @@ -605,12 +568,10 @@ def connect(self, other): filler = CigarHit.from_default_alignment(self.r_ei + 1, other.r_st - 1, self.q_ei + 1, other.q_st - 1) return self + filler + other - @property def epsilon(self): return Fraction(1, self.cigar.op_length * 3 + 1) - def _ref_cut_to_op_cut(self, cut_point): mapping = self.coordinate_mapping @@ -622,7 +583,7 @@ def _ref_cut_to_op_cut(self, cut_point): if right_op_cut_point is None: right_op_cut_point = self.cigar.op_length - lerp = lambda start, end, t: (1 - t) * start + t * end + def lerp(start, end, t): return (1 - t) * start + t * end op_cut_point = lerp(left_op_cut_point, right_op_cut_point, cut_point - floor(cut_point)) @@ -632,20 +593,18 @@ def _ref_cut_to_op_cut(self, cut_point): return op_cut_point - def _slice(self, r_st, q_st, o_st, o_ei): cigar = self.cigar.slice_operations(o_st, o_ei + 1) r_ei = r_st + cigar.ref_length - 1 q_ei = q_st + cigar.query_length - 1 return CigarHit(cigar=cigar, - r_st = r_st, - r_ei = r_ei, - q_st = q_st, - q_ei = q_ei, + r_st=r_st, + r_ei=r_ei, + q_st=q_st, + q_ei=q_ei, ) - def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: """ Splits this CigarHit into two non-overlapping parts using a fractional cut point in the reference space. 
@@ -667,48 +626,43 @@ def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: return left, right - def lstrip_query(self) -> 'CigarHit': """ Return a copy of the CigarHit with leading (unmatched) query elements removed. """ cigar = self.cigar.lstrip_query() return CigarHit(cigar, r_st=self.r_ei - cigar.ref_length + 1, r_ei=self.r_ei, - q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) - + q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) def rstrip_query(self) -> 'CigarHit': """ Return a copy of the CigarHit with trailing (unmatched) query elements removed. """ cigar = self.cigar.rstrip_query() return CigarHit(cigar, r_st=self.r_st, r_ei=self.r_st + cigar.ref_length - 1, - q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) - + q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) def lstrip_reference(self) -> 'CigarHit': """ Return a copy of the CigarHit with leading (unmatched) reference elements removed. """ cigar = self.cigar.lstrip_reference() return CigarHit(cigar, r_st=self.r_ei - cigar.ref_length + 1, r_ei=self.r_ei, - q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) - + q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) def rstrip_reference(self) -> 'CigarHit': """ Return a copy of the CigarHit with trailing (unmatched) reference elements removed. """ cigar = self.cigar.rstrip_reference() return CigarHit(cigar, r_st=self.r_st, r_ei=self.r_st + cigar.ref_length - 1, - q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) - + q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) @cached_property def coordinate_mapping(self) -> CoordinateMapping: """ - Convert this alignment to coordinate mapping representing a reference-to-query and query-to-reference coordinate mappings. + Convert this alignment to coordinate mapping representing + a reference-to-query and query-to-reference coordinate mappings. 
""" return self.cigar.coordinate_mapping.translate(self.r_st, self.q_st) - def to_msa(self, reference_seq: str, query_seq: str) -> Tuple[str, str]: """ Constructs a multiple sequence alignment (MSA) representation for this CigarHit, using the original reference @@ -718,7 +672,6 @@ def to_msa(self, reference_seq: str, query_seq: str) -> Tuple[str, str]: return self.cigar.to_msa(reference_seq[self.r_st:], query_seq[self.q_st:]) - def translate(self, reference_delta: int, query_delta: int) -> 'CigarHit': return CigarHit(cigar=self.cigar, r_st=self.r_st + reference_delta, @@ -726,13 +679,13 @@ def translate(self, reference_delta: int, query_delta: int) -> 'CigarHit': q_st=self.q_st + query_delta, q_ei=self.q_ei + query_delta) - def __repr__(self): - return f'CigarHit({str(self.cigar)!r}, r_st={self.r_st!r}, r_ei={self.r_ei!r}, q_st={self.q_st!r}, q_ei={self.q_ei!r})' - + return 'CigarHit(%r, r_st=%r, r_ei=%r, q_st=%r, q_ei=%r)' \ + % (self.cigar, self.r_st, self.r_ei, self.q_st, self.q_ei) def __str__(self): - return f'{str(self.cigar)}@[{self.q_st},{self.q_ei}]->[{self.r_st},{self.r_ei}]' + return '%s@[%d,%d]->[%d,%d]' \ + % (str(self.cigar), self.q_st, self.q_ei, self.r_st, self.r_ei) def connect_cigar_hits(cigar_hits: List[CigarHit]) -> List[CigarHit]: diff --git a/micall/utils/contig_stitcher_context.py b/micall/utils/contig_stitcher_context.py index d23bc261c..a90647247 100644 --- a/micall/utils/contig_stitcher_context.py +++ b/micall/utils/contig_stitcher_context.py @@ -1,4 +1,4 @@ -from typing import Iterable, Optional, Tuple, List, Dict, Union, Literal, TypeVar, Callable, Set +from typing import List, Set from contextvars import ContextVar from contextlib import contextmanager @@ -22,7 +22,6 @@ def generate_new_name(self) -> str: def emit(self, event: events.EventType) -> None: self.events.append(event) - @staticmethod @contextmanager def fresh(): diff --git a/micall/utils/contig_stitcher_contigs.py b/micall/utils/contig_stitcher_contigs.py index 
49117e70f..6c8ff6021 100644 --- a/micall/utils/contig_stitcher_contigs.py +++ b/micall/utils/contig_stitcher_contigs.py @@ -14,8 +14,13 @@ class Contig: class GenotypedContig(Contig): ref_name: str group_ref: str - ref_seq: Optional[str] # The sequence of self.group_ref. None in cases where the reference organism is unknown. - match_fraction: float # Approximated overall concordance between `seq` and `ref_seq`. It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. + + # The sequence of self.group_ref. None in cases where the reference organism is unknown. + ref_seq: Optional[str] + + # Approximated overall concordance between `seq` and `ref_seq`. + # It is calculated by BLAST as qcovhsp/100, where qcovhsp means Query Coverage Per HSP. + match_fraction: float @dataclass(frozen=True) diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index c442f30c3..c000e1a1c 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -14,7 +14,12 @@ class Cut: cut_point: float def __str__(self) -> str: - return f"Created contigs {self.left.name!r} at {self.left.alignment} and {self.right.name!r} at {self.right.alignment} by cutting {self.original.name!r} at {self.original.alignment} at cut point = {round(self.cut_point, 1)}." + return ( + f"Created contigs {self.left.name!r} at {self.left.alignment} and " + f"{self.right.name!r} at {self.right.alignment} by cutting " + f"{self.original.name!r} at {self.original.alignment} at cut point = " + f"{round(self.cut_point, 1)}." + ) @dataclass(frozen=True) @@ -23,7 +28,11 @@ class LStrip: result: AlignedContig def __str__(self) -> str: - return f"Doing lstrip of {self.original.name!r} at {self.original.alignment} (len {len(self.original.seq)}) resulted in {self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)})." 
+ return ( + f"Doing lstrip of {self.original.name!r} at {self.original.alignment} (len " + f"{len(self.original.seq)}) resulted in {self.result.name!r} at " + f"{self.result.alignment} (len {len(self.result.seq)})." + ) @dataclass(frozen=True) @@ -32,7 +41,11 @@ class RStrip: result: AlignedContig def __str__(self) -> str: - return f"Doing rstrip of {self.original.name!r} at {self.original.alignment} (len {len(self.original.seq)}) resulted in {self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)})." + return ( + f"Doing rstrip of {self.original.name!r} at {self.original.alignment} (len " + f"{len(self.original.seq)}) resulted in {self.result.name!r} at " + f"{self.result.alignment} (len {len(self.result.seq)})." + ) @dataclass(frozen=True) @@ -42,7 +55,11 @@ class Munge: result: AlignedContig def __str__(self) -> str: - return f"Munged contigs {self.left.name!r} at {self.left.alignment} with {self.right.name!r} at {self.right.alignment} resulting in {self.result.name!r} at {self.result.alignment}." + return ( + f"Munged contigs {self.left.name!r} at {self.left.alignment} with " + f"{self.right.name!r} at {self.right.alignment} resulting in " + f"{self.result.name!r} at {self.result.alignment}." + ) @dataclass(frozen=True) @@ -51,7 +68,12 @@ class Combine: result: AlignedContig def __str__(self) -> str: - return f"Created a frankenstein {self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)}) from {[f'{x.name!r} at {x.alignment} (len {len(x.seq)})' for x in self.contigs]}." + contigs_str = ', '.join( + [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in self.contigs]) + return ( + f"Created a frankenstein {self.result.name!r} at {self.result.alignment} " + f"(len {len(self.result.seq)}) from [{contigs_str}]." 
+ ) @dataclass(frozen=True) @@ -70,7 +92,11 @@ class InitialHit: strand: Literal["forward", "reverse"] def __str__(self) -> str: - return f"Part {self.index} of contig {self.contig.name!r} aligned at {self.hit}{'' if self.strand == 'forward' else ' (rev)'}." + strand_info = '' if self.strand == 'forward' else ' (rev)' + return ( + f"Part {self.index} of contig {self.contig.name!r} aligned at {self.hit}" + f"{strand_info}." + ) @dataclass(frozen=True) @@ -86,7 +112,10 @@ class StrandConflict: contig: GenotypedContig def __str__(self) -> str: - return f"Discarding contig {self.contig.name!r} because it aligned both in forward and reverse sense." + return ( + f"Discarding contig {self.contig.name!r} because it aligned both in forward " + "and reverse sense." + ) @dataclass(frozen=True) @@ -105,7 +134,10 @@ class HitNumber: connected: List[CigarHit] def __str__(self) -> str: - return f"Contig {self.contig.name!r} produced {len(self.initial)} aligner hits. After connecting them, the number became {len(self.connected)}." + return ( + f"Contig {self.contig.name!r} produced {len(self.initial)} aligner hits. " + f"After connecting them, the number became {len(self.connected)}." + ) @dataclass(frozen=True) @@ -115,7 +147,11 @@ class ConnectedHit: index: int def __str__(self) -> str: - return f"Part {self.index} of contig {self.contig.name!r} re-aligned as {self.part.name!r} at {self.part.alignment}{'' if self.part.strand == 'forward' else ' (rev)'}." + part_strand_info = '' if self.part.strand == 'forward' else ' (rev)' + return ( + f"Part {self.index} of contig {self.contig.name!r} re-aligned as " + f"{self.part.name!r} at {self.part.alignment}{part_strand_info}." + ) @dataclass(frozen=True) @@ -125,7 +161,10 @@ class InitialStrip: q_ei: int def __str__(self) -> str: - return f"Trimming (strip) contig {self.contig.name!r} from {self.q_st} to {self.q_ei}." + return ( + f"Trimming (strip) contig {self.contig.name!r} from {self.q_st} to " + f"{self.q_ei}." 
+ ) @dataclass(frozen=True) @@ -138,7 +177,13 @@ class StitchCut: right_remainder: AlignedContig def __str__(self) -> str: - return f"Stitching {self.left.name!r} at {self.left.alignment} (len {len(self.left.seq)}) with {self.right.name!r} at {self.right.alignment} (len {len(self.right.seq)}). The left_overlap {self.left_overlap.name!r} is at {self.left_overlap.alignment} (len {len(self.left_overlap.seq)}) and the right_overlap {self.right_overlap.name!r} is at {self.right_overlap.alignment} (len {len(self.right_overlap.seq)})." + return ( + f"Stitching {self.left.name!r} at {self.left.alignment} (len {len(self.left.seq)}) " + f"with {self.right.name!r} at {self.right.alignment} (len {len(self.right.seq)}). " + f"The left_overlap {self.left_overlap.name!r} is at {self.left_overlap.alignment} " + f"(len {len(self.left_overlap.seq)}) and the right_overlap {self.right_overlap.name!r} is " + f"at {self.right_overlap.alignment} (len {len(self.right_overlap.seq)})." + ) @dataclass(frozen=True) @@ -160,7 +205,12 @@ def __str__(self) -> str: average_concordance = round(self.average * 100) cut_point_location_scaled = round(self.cut_point_scaled * 100) concordance_str = ', '.join(str(int(round(x * 100)) / 100) for x in self.concordance) - return f"Created overlap contigs {self.left_take.name!r} at {self.left_overlap.alignment} and {self.right_take.name!r} at {self.right_take.alignment} based on parts of {self.left.name!r} and {self.right.name!r}, with avg. concordance {average_concordance}%, cut point at {cut_point_location_scaled}%, and full concordance [{concordance_str}]." + return ( + f"Created overlap contigs {self.left_take.name!r} at {self.left_overlap.alignment} and " + f"{self.right_take.name!r} at {self.right_take.alignment} based on parts of " + f"{self.left.name!r} and {self.right.name!r}, with avg. concordance {average_concordance}%, " + f"cut point at {cut_point_location_scaled}%, and full concordance [{concordance_str}]." 
+ ) @dataclass(frozen=True) @@ -178,7 +228,10 @@ class Stitch: result: AlignedContig def __str__(self) -> str: - return f"Stitching {self.left.name!r} with {self.right.name!r} results in {self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)})." + return ( + f"Stitching {self.left.name!r} with {self.right.name!r} results in " + f"{self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)})." + ) @dataclass(frozen=True) @@ -187,7 +240,11 @@ class Drop: covering: List[AlignedContig] def __str__(self) -> str: - return f"Dropped contig {self.contig.name!r} as it is completely covered by these contigs: {', '.join(repr(x.name) for x in self.covering)}." + covering_contig_names = ', '.join(repr(x.name) for x in self.covering) + return ( + f"Dropped contig {self.contig.name!r} as it is completely covered by these contigs: " + f"{covering_contig_names}." + ) @dataclass(frozen=True) @@ -207,7 +264,12 @@ class SplitGap: right: AlignedContig def __str__(self) -> str: - return f"Split contig {self.contig.name!r} at {self.contig.alignment} around its gap at [{self.gap.q_st}, {self.gap.q_ei}]->[{self.gap.r_st}, {self.gap.r_ei}]. Left part: {self.left.name!r} at {self.left.alignment}, right part: {self.right.name!r} at {self.right.alignment}." + return ( + f"Split contig {self.contig.name!r} at {self.contig.alignment} around its gap at " + f"[{self.gap.q_st}, {self.gap.q_ei}]->[{self.gap.r_st}, {self.gap.r_ei}]. Left part: " + f"{self.left.name!r} at {self.left.alignment}, right part: {self.right.name!r} at " + f"{self.right.alignment}." + ) @dataclass(frozen=True) @@ -215,7 +277,11 @@ class Intro: contig: GenotypedContig def __str__(self) -> str: - return f"Introduced contig {self.contig.name!r} (seq = {self.contig.seq}) of ref {self.contig.ref_name!r}, group_ref {self.contig.group_ref} (seq = {self.contig.ref_seq}), and length {len(self.contig.seq)}." 
+ return ( + f"Introduced contig {self.contig.name!r} (seq = {self.contig.seq}) of ref " + f"{self.contig.ref_name!r}, group_ref {self.contig.group_ref} (seq = {self.contig.ref_seq}), " + f"and length {len(self.contig.seq)}." + ) @dataclass(frozen=True) @@ -224,9 +290,16 @@ class FinalCombine: result: AlignedContig def __str__(self) -> str: - return f"Combining these contigs for final output for {self.result.group_ref}: {['%r at %s (len %s)' % (x.name, x.alignment, len(x.seq)) for x in self.contigs]}." + contigs_str = [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in self.contigs] + contigs_format = ', '.join(contigs_str) + return ( + f"Combining these contigs for final output for {self.result.group_ref}: " + f"[{contigs_format}]." + ) -AlignmentEvent = Union[NoRef, InitialHit, ZeroHits, StrandConflict, ReverseComplement, HitNumber, ConnectedHit] +AlignmentEvent = Union[NoRef, InitialHit, ZeroHits, StrandConflict, ReverseComplement, + HitNumber, ConnectedHit] ModifyEvent = Union[LStrip, RStrip] -EventType = Union[Cut, ModifyEvent, Munge, Combine, AlignmentEvent, InitialStrip, StitchCut, Overlap, NoOverlap, Stitch, Drop, IgnoreGap, SplitGap, Intro, FinalCombine] +EventType = Union[Cut, ModifyEvent, Munge, Combine, AlignmentEvent, InitialStrip, StitchCut, + Overlap, NoOverlap, Stitch, Drop, IgnoreGap, SplitGap, Intro, FinalCombine] From 7aed74f69ce68b01bdab69bcd3d11b7454a0e311 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 28 Feb 2024 13:55:47 -0800 Subject: [PATCH 220/485] Contig stitcher: fix PyCharm warnings There seem to be quite a lot of unique checks that only PyCharm performs, and neither flake8 nor ruff have them. For example: grammar checks! This commit fixes all the errors that I've seen in the stitcher code, while browsing it in PyCharm. 
--- micall/core/contig_stitcher.py | 64 ++++++++++++----------- micall/core/denovo.py | 20 +++---- micall/tests/test_contig_stitcher_fuzz.py | 7 +-- micall/utils/cigar_tools.py | 30 +++++------ micall/utils/contig_stitcher_events.py | 2 +- 5 files changed, 62 insertions(+), 61 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 8f1a2b341..3efa803da 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -29,10 +29,10 @@ def log(e: events.EventType) -> None: def cut_query(self: GenotypedContig, cut_point: float) -> Tuple[GenotypedContig, GenotypedContig]: """ Cuts query sequence in two parts with cut_point between them. """ - cut_point = max(0, cut_point) + cut_point = max(0.0, cut_point) left = replace(self, name=context.get().generate_new_name(), seq=self.seq[:ceil(cut_point)]) right = replace(self, name=context.get().generate_new_name(), seq=self.seq[ceil(cut_point):]) - return (left, right) + return left, right def cut_reference(self: AlignedContig, cut_point: float) -> Tuple[AlignedContig, AlignedContig]: @@ -42,7 +42,7 @@ def cut_reference(self: AlignedContig, cut_point: float) -> Tuple[AlignedContig, left = replace(self, name=context.get().generate_new_name(), alignment=alignment_left) right = replace(self, name=context.get().generate_new_name(), alignment=alignment_right) log(events.Cut(self, left, right, cut_point)) - return (left, right) + return left, right def lstrip(self: AlignedContig) -> AlignedContig: @@ -158,13 +158,15 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: yield contig return + def init_hit(x) -> Tuple[CigarHit, Literal["forward", "reverse"]]: + cigar = CigarHit(Cigar(x.cigar), + min(x.r_st, x.r_en - 1), max(x.r_st, x.r_en - 1), + min(x.q_st, x.q_en - 1), max(x.q_st, x.q_en - 1)) + return cigar, "forward" if x.strand == 1 else "reverse" + aligner = Aligner(seq=contig.ref_seq, preset='map-ont') alignments = list(aligner.map(contig.seq)) 
- hits_array: List[Tuple[CigarHit, Literal["forward", "reverse"]]] = \ - [(CigarHit(Cigar(x.cigar), - min(x.r_st, x.r_en - 1), max(x.r_st, x.r_en - 1), - min(x.q_st, x.q_en - 1), max(x.q_st, x.q_en - 1)), - "forward" if x.strand == 1 else "reverse") for x in alignments] + hits_array = [init_hit(x) for x in alignments] for i, (hit, strand) in enumerate(hits_array): log(events.InitialHit(contig, i, hit, strand)) @@ -208,9 +210,9 @@ def strip_conflicting_mappings(contigs: Iterable[GenotypedContig]) -> Iterable[G def get_indexes(name: str) -> Tuple[int, int]: contig = names[name] if isinstance(contig, AlignedContig): - return (contig.alignment.q_st, contig.alignment.r_st) + return contig.alignment.q_st, contig.alignment.r_st else: - return (-1, -1) + return -1, -1 reference_sorted = list(sorted(names.keys(), key=lambda name: get_indexes(name)[1])) query_sorted = list(sorted(names.keys(), key=lambda name: get_indexes(name)[0])) @@ -269,7 +271,7 @@ def align_queries(seq1: str, seq2: str) -> Tuple[str, str]: def find_all_overlapping_contigs(self: AlignedContig, aligned_contigs): - """" + """ Yield all contigs from a collection that overlap with a given contig. Contigs are considered overlapping if they have overlapping intervals on the same reference genome. 
""" @@ -332,7 +334,7 @@ def slide(start, end): def disambiguate_concordance(concordance: List[float]) -> Iterable[Tuple[float, int]]: for i, x in enumerate(concordance): global_rank = i if i < len(concordance) / 2 else len(concordance) - i - 1 - yield (x, global_rank) + yield x, global_rank def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned_right, concordance): @@ -353,9 +355,9 @@ def remove_dashes(s): return ''.join(c for c in s if c != '-') if aligned_right_r_index is None: aligned_right_r_index = right_overlap.alignment.r_ei + 1 if aligned_right_r_index > aligned_left_r_index: - return (aligned_left_r_index + 0.5, aligned_right_r_index - 0.5, max_concordance_index) + return aligned_left_r_index + 0.5, aligned_right_r_index - 0.5, max_concordance_index - return (left_overlap.alignment.r_st - 1 + 0.5, right_overlap.alignment.r_ei + 1 - 0.5, 0) + return left_overlap.alignment.r_st - 1 + 0.5, right_overlap.alignment.r_ei + 1 - 0.5, 0 def stitch_2_contigs(left, right): @@ -396,7 +398,7 @@ def stitch_2_contigs(left, right): def combine_overlaps(contigs: List[AlignedContig]) -> Iterable[AlignedContig]: - """" + """ Repeatedly combine all overlapping aligned contigs into an iterable collection of contiguous AlignedContigs. It proceeds by iterating through sorted contigs and stitching any overlapping ones until none are left. """ @@ -457,8 +459,8 @@ def find_covered_contig(contigs: List[AlignedContig]) -> Tuple[Optional[AlignedC :return: An AlignedContig if there is one completely covered by others, None otherwise. 
""" - def calculate_cumulative_coverage(contigs) -> List[Tuple[int, int]]: - intervals = [(contig.alignment.r_st, contig.alignment.r_ei) for contig in contigs] + def calculate_cumulative_coverage(others) -> List[Tuple[int, int]]: + intervals = [(contig.alignment.r_st, contig.alignment.r_ei) for contig in others] merged_intervals = merge_intervals(intervals) return merged_intervals @@ -501,41 +503,41 @@ def split_contigs_with_gaps(contigs: List[AlignedContig]) -> List[AlignedContig] """ def covered_by(gap, other): - # Check if any 1 reference coordinate in gap is mapped in other. + # Check if any 1 reference coordinate in gap is mapped in `other`. gap_coords = gap.coordinate_mapping.ref_to_query.domain cover_coords = set(other.alignment.coordinate_mapping.ref_to_query.keys()) return not gap_coords.isdisjoint(cover_coords) - def covered(contig, gap): - return any(covered_by(gap, other) for other in contigs if other != contig) + def covered(self, gap): + return any(covered_by(gap, other) for other in contigs if other != self) def significant(gap): # noinspection PyLongLine # The size of the gap is unavoidably, to some point, arbitrary. Here we tried to adjust it to common gaps in HIV, as HIV is the primary test subject in MiCall. A notable feature of HIV-1 reverse transcription is the appearance of periodic deletions of approximately 21 nucleotides. These deletions have been reported to occur in the HIV-1 genome and are thought to be influenced by the structure of the viral RNA. Specifically, the secondary structures and foldings of the RNA can lead to pause sites for the reverse transcriptase, resulting in staggered alignment when the enzyme slips. This misalignment can cause the reverse transcriptase to "jump," leading to deletions in the newly synthesized DNA. The unusually high frequency of about 21-nucleotide deletions is believed to correspond to the pitch of the RNA helix, which reflects the spatial arrangement of the RNA strands. 
The 21 nucleotide cycle is an average measure and is thought to be associated with the length of one turn of the RNA helix, meaning that when reverse transcriptase slips and reattaches, it often does so one helical turn away from the original site. # noqa: E501 return gap.ref_length > 21 - def try_split(contig): - for gap in contig.alignment.deletions(): + def try_split(self: AlignedContig): + for gap in self.alignment.deletions(): if not significant(gap): # Really we do not want to split on every little deletion # because that would mean that we would need to stitch # overlaps around them. # And we are likely to lose quality with every stitching operation. # By skipping we assert that this gap is aligner's fault. - log(events.IgnoreGap(contig, gap)) + log(events.IgnoreGap(self, gap)) continue - if covered(contig, gap): - midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + contig.alignment.epsilon - left_part, right_part = cut_reference(contig, midpoint) + if covered(self, gap): + midpoint = gap.r_st + (gap.r_ei - gap.r_st) / 2 + self.alignment.epsilon + left_part, right_part = cut_reference(self, midpoint) left_part = rstrip(left_part) right_part = lstrip(right_part) - contigs.remove(contig) + contigs.remove(self) contigs.append(left_part) contigs.append(right_part) process_queue.put(right_part) - log(events.SplitGap(contig, gap, left_part, right_part)) + log(events.SplitGap(self, gap, left_part, right_part)) return process_queue: LifoQueue = LifoQueue() @@ -580,9 +582,9 @@ def stitch_consensus(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCo yield contig def combine(group_ref): - contigs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) - result = combine_contigs(contigs) - log(events.FinalCombine(contigs, result)) + ctgs = sorted(consensus_parts[group_ref], key=lambda x: x.alignment.r_st) + result = combine_contigs(ctgs) + log(events.FinalCombine(ctgs, result)) return result yield from map(combine, consensus_parts) diff --git 
a/micall/core/denovo.py b/micall/core/denovo.py index 992aae257..94ce14aae 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -1,7 +1,7 @@ import argparse import logging import os -import typing +from typing import Optional, TextIO, Iterable, Dict from collections import Counter from csv import DictWriter, DictReader from datetime import datetime @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) -def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> typing.Iterable[GenotypedContig]: +def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> Iterable[GenotypedContig]: projects = ProjectConfig.loadDefault() for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")): @@ -82,7 +82,7 @@ def write_contig_refs(contigs_fasta_path, for i, row in enumerate(contig_reader, 1): contig_name = f'merged-contig-{i}' contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") - group_refs = {} + group_refs: Dict[str, str] = {} with StitcherContext.fresh() as ctx: genotypes = genotype(contigs_fasta_path, @@ -120,7 +120,7 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): fraction of the query that aligned against the reference (matches and mismatches). 
""" - contig_nums = {} # {contig_name: contig_num} + contig_nums: Dict[str, int] = {} # {contig_name: contig_num} with open(fasta) as f: for line in f: if line.startswith('>'): @@ -167,7 +167,7 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): for match in matches} top_refs = set(contig_top_matches.values()) projects = ProjectConfig.loadDefault() - match_scores = Counter() + match_scores: Counter[str] = Counter() for contig_name, contig_matches in groupby(matches, itemgetter('qaccver')): contig_top_ref = contig_top_matches[contig_name] contig_seed_group = projects.getSeedGroup(contig_top_ref) @@ -177,7 +177,7 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): continue match_seed_group = projects.getSeedGroup(ref_name) if match_seed_group == contig_seed_group: - match_scores[ref_name] += float(match['score']) + match_scores[ref_name] += float(match['score']) # type: ignore[assignment] if group_refs is not None: group_top_refs = {projects.getSeedGroup(ref_name): ref_name @@ -207,11 +207,11 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): def denovo(fastq1_path: str, fastq2_path: str, - contigs_csv: typing.TextIO, + contigs_csv: TextIO, work_dir: str = '.', - merged_contigs_csv: typing.TextIO = None, - blast_csv: typing.TextIO = None, - stitcher_plot_path: typing.Optional[str] = None, + merged_contigs_csv: Optional[TextIO] = None, + blast_csv: Optional[TextIO] = None, + stitcher_plot_path: Optional[str] = None, ): """ Use de novo assembly to build contigs from reads. 
diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index 8271a1a88..9b82db2f9 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -112,11 +112,12 @@ def test_contig_number_prop2_existential(): ) -def get_all_reference_positions(contigs: List[AlignedContig]): +def get_all_reference_positions(contigs: List[GenotypedContig]): ret: Dict[int, int] = defaultdict(lambda: 0) for contig in contigs: - for i in contig.alignment.coordinate_mapping.ref_to_query.domain: - ret[i] += 1 + if isinstance(contig, AlignedContig): + for i in contig.alignment.coordinate_mapping.ref_to_query.domain: + ret[i] += 1 return ret diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index dd2667f4d..33c44be3b 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -128,7 +128,7 @@ class Cigar: manipulation (as seen in self.iterate_operations()), while retaining the compact form for storage and return purposes (seen in self.__str__()). - Instances of this class should be created by calling the Cigar.coerce method. + Instances of this class should be created by calling the `Cigar.coerce` method. 
Examples: Cigar.coerce("10M1I5M1D") Cigar.coerce([(10, CigarActions.MATCH), (1, CigarActions.INSERT), ...]) @@ -182,20 +182,20 @@ def iterate_operations_with_pointers(self) -> Iterable[Tuple[CigarActions, Optio for operation in self.iterate_operations(): if operation in (CigarActions.MATCH, CigarActions.SEQ_MATCH, CigarActions.MISMATCH): - yield (operation, ref_pointer, query_pointer) + yield operation, ref_pointer, query_pointer query_pointer += 1 ref_pointer += 1 elif operation in (CigarActions.INSERT, CigarActions.SOFT_CLIPPED): - yield (operation, None, query_pointer) + yield operation, None, query_pointer query_pointer += 1 elif operation in (CigarActions.DELETE, CigarActions.SKIPPED): - yield (operation, ref_pointer, None) + yield operation, ref_pointer, None ref_pointer += 1 else: - yield (operation, None, None) + yield operation, None, None def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': """ @@ -261,8 +261,6 @@ def coordinate_mapping(self) -> CoordinateMapping: Convert this CIGAR string to coordinate mapping representing a reference-to-query and query-to-reference coordinate mappings. - :param cigar: a CIGAR string. - :return: Lists of integers representing the mappings of coordinates from the reference sequence to the query sequence, and back. 
""" @@ -406,11 +404,11 @@ def normalize(cigar_lst) -> Iterable[Tuple[int, CigarActions]]: continue if last_item: - yield (last_item[0], last_item[1]) + yield last_item[0], last_item[1] last_item = item if last_item: - yield (last_item[0], last_item[1]) + yield last_item[0], last_item[1] def __eq__(self, other) -> bool: return isinstance(other, Cigar) and self._data == other._data @@ -422,7 +420,7 @@ def __repr__(self): return f'Cigar({str(self)!r})' def __str__(self): - """ Inverse of Cigar.parse """ + """ Inverse of `Cigar.parse` """ return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self._data) @@ -556,7 +554,7 @@ def __add__(self, other): q_st=self.q_st, q_ei=other.q_ei) - def connect(self, other): + def connect(self, other: 'CigarHit') -> 'CigarHit': """ Inserts deletions/insertions between self and other, then ajusts boundaries appropriately. @@ -718,16 +716,16 @@ def connect_cigar_hits(cigar_hits: List[CigarHit]) -> List[CigarHit]: # Segregate independent matches. sorted_groups: List[List[CigarHit]] = [] - def find_group(hit): + def find_group(phit): for group in sorted_groups: - if hit.q_st > group[-1].q_st: - group.append(hit) + if phit.q_st > group[-1].q_st: + group.append(phit) return - sorted_groups.append([hit]) + sorted_groups.append([phit]) for hit in sorted_parts: find_group(hit) # Collect all intervals back together, connecting them with CigarActions.DELETE. 
- return [reduce(CigarHit.connect, group) for group in sorted_groups] + return [reduce(lambda x, y: x.connect(y), group) for group in sorted_groups] diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index c000e1a1c..b9c889462 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -199,7 +199,7 @@ class Overlap: concordance: List[Fraction] average: Fraction cut_point: int - cut_point_scaled: Fraction + cut_point_scaled: float def __str__(self) -> str: average_concordance = round(self.average * 100) From b90beb31bbd5f8c932175ee8472708a1c960fb62 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 28 Feb 2024 16:56:19 -0800 Subject: [PATCH 221/485] Add new output file: contigs_stitched.csv --- Singularity | 3 +- docs/steps.md | 6 ++- micall/core/contig_stitcher.py | 2 +- micall/core/denovo.py | 84 +++++++++++++++++++++++++-------- micall/drivers/sample.py | 7 ++- micall/monitor/kive_watcher.py | 1 + micall/tests/test_denovo.py | 25 ++++++---- micall/utils/denovo_simplify.py | 2 +- micall_kive.py | 4 ++ 9 files changed, 101 insertions(+), 33 deletions(-) diff --git a/Singularity b/Singularity index e32803957..816f0142c 100644 --- a/Singularity +++ b/Singularity @@ -192,7 +192,8 @@ From: centos:7 failed_csv cascade_csv nuc_csv amino_csv insertions_csv conseq_csv \ conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ - genome_coverage_csv genome_coverage_svg genome_concordance_svg contigs_csv \ + genome_coverage_csv genome_coverage_svg genome_concordance_svg \ + contigs_csv contigs_stitched_csv \ read_entropy_csv conseq_region_csv conseq_stitched_csv KIVE_THREADS 2 KIVE_MEMORY 6000 diff --git a/docs/steps.md b/docs/steps.md index 57d0695c9..1043988f9 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -216,10 +216,14 @@ Individual files are described after the list of steps. 
* insert - the nucleotide sequence that was inserted * qual - the Phred quality scores for the inserted sequence * contigs.csv - * genotype - the reference name with the best BLAST result + * ref - the reference name with the best BLAST result * match - the fraction of the contig that matched in BLAST, negative for reverse-complemented matches + * group_ref - the reference name chosen to best match all of + the contigs in a sample * contig - the nucleotide sequence of the assembled contig +* contigs_stitched.csv + Same as `contigs.csv`, but contigs are stitched by `micall/core/contig_stitcher.py`. * coverage_scores.csv * project - the project this score is defined by * region - the region being displayed diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 3efa803da..b6495387e 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -616,7 +616,7 @@ def main(args): logging.basicConfig(level=logger.level) with StitcherContext.fresh(): - write_contig_refs(args.contigs.name, args.stitched_contigs, stitcher_plot_path=args.plot) + write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=args.plot) args.contigs.close() args.stitched_contigs.close() diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 94ce14aae..20835b1bc 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -1,6 +1,7 @@ import argparse import logging import os +import tempfile from typing import Optional, TextIO, Iterable, Dict from collections import Counter from csv import DictWriter, DictReader @@ -58,30 +59,45 @@ def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> It match_fraction=match_fraction) -def write_contig_refs(contigs_fasta_path, - contigs_csv, - merged_contigs_csv=None, - blast_csv=None, - stitcher_plot_path=None): +def init_contigs_refs(contigs_csv: TextIO): + writer = DictWriter(contigs_csv, + ['ref', 'match', 'group_ref', 'contig'], + 
lineterminator=os.linesep) + writer.writeheader() + return writer + + +def contigs_refs_write(writer, ref: str, match: float, group_ref: str, contig: str): + writer.writerow(dict(ref=ref, match=match, group_ref=group_ref, contig=contig)) + + +def write_contig_refs(contigs_fasta_path: str, + contigs_csv: Optional[TextIO], + contigs_stitched_csv: Optional[TextIO], + merged_contigs_csv: Optional[TextIO] = None, + blast_csv: Optional[TextIO] = None, + stitcher_plot_path: Optional[str] = None) -> int: """ Run BLAST search to identify contig sequences. :param str contigs_fasta_path: path to file to read contig sequences from and append merged contigs to :param contigs_csv: open file to write assembled contigs to + :param contigs_stitched_csv: open file to write stitched contigs to :param merged_contigs_csv: open file to read contigs that were merged from amplicon reads :param blast_csv: open file to write BLAST search results for each contig + :param stitcher_plot_path: open file to write the visualizer plot to """ - writer = DictWriter(contigs_csv, - ['ref', 'match', 'group_ref', 'contig'], - lineterminator=os.linesep) - writer.writeheader() + with open(contigs_fasta_path, 'a') as contigs_fasta: if merged_contigs_csv is not None: contig_reader = DictReader(merged_contigs_csv) for i, row in enumerate(contig_reader, 1): contig_name = f'merged-contig-{i}' contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") + + writer = init_contigs_refs(contigs_csv) if contigs_csv else None + stitched_writer = init_contigs_refs(contigs_stitched_csv) if contigs_stitched_csv else None group_refs: Dict[str, str] = {} with StitcherContext.fresh() as ctx: @@ -90,16 +106,26 @@ def write_contig_refs(contigs_fasta_path, group_refs=group_refs) contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path)) - contigs = list(stitch_consensus(contigs)) - for contig in contigs: - writer.writerow(dict(ref=contig.ref_name, - match=contig.match_fraction, - 
group_ref=contig.group_ref, - contig=contig.seq)) + if writer is not None: + for contig in contigs: + contigs_refs_write(writer, + ref=contig.ref_name, + match=contig.match_fraction, + group_ref=contig.group_ref, + contig=contig.seq) - if stitcher_plot_path is not None: - plot_stitcher_coverage(ctx.events, stitcher_plot_path) + if stitched_writer is not None: + contigs = list(stitch_consensus(contigs)) + for contig in contigs: + contigs_refs_write(stitched_writer, + ref=contig.ref_name, + match=contig.match_fraction, + group_ref=contig.group_ref, + contig=contig.seq) + + if stitcher_plot_path is not None: + plot_stitcher_coverage(ctx.events, stitcher_plot_path) return len(contigs) @@ -120,6 +146,7 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): fraction of the query that aligned against the reference (matches and mismatches). """ + contig_nums: Dict[str, int] = {} # {contig_name: contig_num} with open(fasta) as f: for line in f: @@ -207,7 +234,8 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): def denovo(fastq1_path: str, fastq2_path: str, - contigs_csv: TextIO, + contigs_csv: Optional[TextIO], + contigs_stitched_csv: Optional[TextIO], work_dir: str = '.', merged_contigs_csv: Optional[TextIO] = None, blast_csv: Optional[TextIO] = None, @@ -218,16 +246,29 @@ def denovo(fastq1_path: str, :param fastq1_path: FASTQ file name for read 1 reads :param fastq2_path: FASTQ file name for read 2 reads :param contigs_csv: open file to write assembled contigs to + :param contigs_stitched_csv: open file to write stitched contigs to :param work_dir: path for writing temporary files :param merged_contigs_csv: open file to read contigs that were merged from amplicon reads :param blast_csv: open file to write BLAST search results for each contig + :param stitcher_plot_path: open file to write the visualizer plot to """ + + if contigs_csv is None and contigs_stitched_csv is None: + raise ValueError("Must specify either 
contigs_csv or contigs_stitched_csv") + old_tmp_dirs = glob(os.path.join(work_dir, 'assembly_*')) for old_tmp_dir in old_tmp_dirs: rmtree(old_tmp_dir, ignore_errors=True) tmp_dir = mkdtemp(dir=work_dir, prefix='assembly_') + + if contigs_csv is None: + contigs_csv_tmp = tempfile.NamedTemporaryFile() + contigs_csv = contigs_csv_tmp.name + else: + contigs_csv_tmp = None + start_time = datetime.now() start_dir = os.getcwd() joined_path = os.path.join(tmp_dir, 'joined.fastq') @@ -267,6 +308,7 @@ def denovo(fastq1_path: str, duration = datetime.now() - start_time contig_count = write_contig_refs(contigs_fasta_path, contigs_csv, + contigs_stitched_csv, blast_csv=blast_csv, stitcher_plot_path=stitcher_plot_path) logger.info('Assembled %d contigs in %s (%ds) on %s.', @@ -275,6 +317,9 @@ def denovo(fastq1_path: str, duration.total_seconds(), fastq1_path) + if contigs_csv_tmp: + contigs_csv_tmp.close() + if __name__ == '__main__': logging.basicConfig(level=logging.INFO) @@ -282,7 +327,8 @@ def denovo(fastq1_path: str, parser.add_argument('fastq1') parser.add_argument('fastq2') parser.add_argument('contigs', type=argparse.FileType('w')) + parser.add_argument('--contigs_stitched', type=argparse.FileType('w')) parser.add_argument('--stitcher_plot') args = parser.parse_args() - denovo(args.fastq1, args.fastq2, args.contigs, args.stitcher_plot_path) + denovo(args.fastq1, args.fastq2, args.contigs, args.contigs_stitched, args.stitcher_plot_path) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index fec8eda2b..5d988f5dc 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -250,6 +250,7 @@ def process(self, conseq_ins_csv=(self.conseq_ins_csv, 'r'), remap_conseq_csv=(self.remap_conseq_csv, 'r'), contigs_csv=(self.contigs_csv, 'r') if use_denovo else None, + contigs_stitched_csv=(self.contigs_stitched_csv, 'r') if use_denovo else None, nuc_detail_csv=(self.nuc_details_csv, 'w') if use_denovo else None, amino_csv=(self.amino_csv, 'w'), 
amino_detail_csv=(self.amino_details_csv, 'w') if use_denovo else None, @@ -387,10 +388,12 @@ def run_denovo(self, excluded_seeds): scratch_path = self.get_scratch_path() with open(self.merged_contigs_csv) as merged_contigs_csv, \ open(self.contigs_csv, 'w') as contigs_csv, \ + open(self.contigs_stitched_csv, 'w') as contigs_stitched_csv, \ open(self.blast_csv, 'w') as blast_csv: denovo(self.trimmed1_fastq, self.trimmed2_fastq, contigs_csv, + contigs_stitched_csv, self.scratch_path, merged_contigs_csv, blast_csv=blast_csv) @@ -399,7 +402,7 @@ def run_denovo(self, excluded_seeds): debug_file_prefix = os.path.join(scratch_path, 'debug') else: debug_file_prefix = None - with open(self.contigs_csv) as contigs_csv, \ + with open(self.contigs_stitched_csv) as contigs_stitched_csv, \ open(self.remap_csv, 'w') as remap_csv, \ open(self.remap_counts_csv, 'w') as counts_csv, \ open(self.remap_conseq_csv, 'w') as conseq_csv, \ @@ -408,7 +411,7 @@ def run_denovo(self, excluded_seeds): map_to_contigs(self.trimmed1_fastq, self.trimmed2_fastq, - contigs_csv, + contigs_stitched_csv, remap_csv, counts_csv, conseq_csv, diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index 0e0dbd032..421bdaf41 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -60,6 +60,7 @@ 'wg_fasta', 'mid_fasta', 'contigs_csv', + 'contigs_stitched_csv', 'alignment_svg', 'alignment_png', 'assembly_fasta', diff --git a/micall/tests/test_denovo.py b/micall/tests/test_denovo.py index a7fbff009..06001779b 100644 --- a/micall/tests/test_denovo.py +++ b/micall/tests/test_denovo.py @@ -48,7 +48,8 @@ def test_write_contig_refs_two_sequences(tmpdir, hcv_db): HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - write_contig_refs(str(contigs_fasta), contigs_csv) + contigs_stitched_csv = StringIO() + write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -71,7 +72,8 @@ def 
test_write_contig_refs_two_groups(tmpdir, hcv_db): HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA """ - write_contig_refs(str(contigs_fasta), contigs_csv) + contigs_stitched_csv = StringIO() + write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -88,7 +90,8 @@ def test_write_contig_refs_not_found(tmpdir, hcv_db): unknown,0,,CATCACATAGGAGA """ - write_contig_refs(str(contigs_fasta), contigs_csv) + contigs_stitched_csv = StringIO() + write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -108,7 +111,8 @@ def test_write_contig_refs_partial_match(tmpdir, hcv_db): HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - write_contig_refs(str(contigs_fasta), contigs_csv) + contigs_stitched_csv = StringIO() + write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -129,7 +133,8 @@ def test_write_contig_refs_reversed_match(tmpdir, hcv_db): HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - write_contig_refs(str(contigs_fasta), contigs_csv) + contigs_stitched_csv = StringIO() + write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -177,7 +182,8 @@ def test_write_contig_refs(tmpdir, hcv_db): 1,HCV-1a,41,1.0,100,1,41,8187,8227 """ - write_contig_refs(str(contigs_fasta), contigs_csv, blast_csv=blast_csv) + contigs_stitched_csv = StringIO() + write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv, blast_csv=blast_csv) assert expected_contigs_csv == contigs_csv.getvalue() assert expected_blast_csv == blast_csv.getvalue() @@ -192,7 +198,8 @@ def test_write_contig_refs_none(tmpdir, hcv_db): ref,match,group_ref,contig """ - write_contig_refs(str(contigs_fasta), contigs_csv) + 
contigs_stitched_csv = StringIO() + write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -214,8 +221,9 @@ def test_merged_contig(tmpdir, hcv_db): """ with merged_contigs_path.open() as merged_contigs_csv: + contigs_stitched_csv = StringIO() write_contig_refs(str(contigs_fasta), - contigs_csv, + contigs_csv, contigs_stitched_csv, merged_contigs_csv=merged_contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -239,6 +247,7 @@ def test_denovo_iva(tmpdir, hcv_db): denovo(str(microtest_path / '2160A-HCV_S19_L001_R1_001.fastq'), str(microtest_path / '2160A-HCV_S19_L001_R2_001.fastq'), contigs_csv, + None, tmpdir) assert contigs_csv.getvalue() == expected_contigs_csv diff --git a/micall/utils/denovo_simplify.py b/micall/utils/denovo_simplify.py index 9bbfe83e6..6786a6458 100644 --- a/micall/utils/denovo_simplify.py +++ b/micall/utils/denovo_simplify.py @@ -90,7 +90,7 @@ def _test(self, read_indexes): exception = None # noinspection PyBroadException try: - denovo(trimmed_filename1, trimmed_filename2, contigs_csv, workdir) + denovo(trimmed_filename1, trimmed_filename2, contigs_csv, None, workdir) except Exception as ex: logger.warning('Assembly failed.', exc_info=True) exception = ex diff --git a/micall_kive.py b/micall_kive.py index 3d49b0e39..d6b9e0c52 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -83,6 +83,9 @@ def parse_args(): parser.add_argument('contigs_csv', nargs='?', help='CSV containing contigs built by de novo assembly') + parser.add_argument('contigs_stitched_csv', + nargs='?', + help='CSV containing contigs built by de novo assembly and stitched by our stitcher') parser.add_argument('read_entropy_csv', nargs='?', help='CSV containing read pair length counts') @@ -125,6 +128,7 @@ def load_sample(args): aligned_csv=args.aligned_csv, g2p_aligned_csv=args.g2p_aligned_csv, contigs_csv=args.contigs_csv, + contigs_stitched_csv=args.contigs_stitched_csv, 
genome_coverage_csv=args.genome_coverage_csv, genome_coverage_svg=args.genome_coverage_svg, genome_concordance_svg=args.genome_concordance_svg, From a770f1c3a77b2f089fd0e4505e03d9f4df2db87a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 29 Feb 2024 11:37:35 -0800 Subject: [PATCH 222/485] Add new output file: remap_unstitched_conseq.csv --- Singularity | 4 ++-- docs/steps.md | 5 +++++ micall/core/denovo.py | 6 +++--- micall/drivers/sample.py | 27 +++++++++++++++++++++++++-- micall_docker.py | 1 + micall_kive.py | 3 +++ release_test_microtest.py | 1 + 7 files changed, 40 insertions(+), 7 deletions(-) diff --git a/Singularity b/Singularity index 816f0142c..6a21fa06c 100644 --- a/Singularity +++ b/Singularity @@ -15,7 +15,7 @@ From: centos:7 MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall KIVE_INPUTS sample_info_csv fastq1 fastq2 bad_cycles_csv KIVE_OUTPUTS g2p_csv g2p_summary_csv remap_counts_csv \ - remap_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \ + remap_conseq_csv remap_unstitched_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \ failed_csv cascade_csv nuc_csv amino_csv insertions_csv conseq_csv \ conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ @@ -188,7 +188,7 @@ From: centos:7 %applabels denovo KIVE_INPUTS sample_info_csv fastq1 fastq2 bad_cycles_csv KIVE_OUTPUTS g2p_csv g2p_summary_csv remap_counts_csv \ - remap_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \ + remap_conseq_csv remap_unstitched_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \ failed_csv cascade_csv nuc_csv amino_csv insertions_csv conseq_csv \ conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ diff --git a/docs/steps.md b/docs/steps.md index 1043988f9..aa327065e 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -58,6 +58,8 @@ Individual files are 
described after the list of steps. each stage. * remap_conseq.csv - downloaded - consensus sequence that reads were mapped to on the final iteration + * remap_unstitched_conseq.csv - downloaded - consensus sequence that reads were + mapped to the unstitched contigs. * unmapped1.fastq - FASTQ format (unstructured text) reads that didn't map to any of the final references. * unmapped2.fastq - FASTQ @@ -347,6 +349,9 @@ Individual files are described after the list of steps. * remap_conseq.csv * region - the region mapped to * sequence - the consensus sequence used +* remap_unstitched_conseq.csv + * region - the region mapped to + * sequence - the consensus sequence used * resistance.csv * region - the region code, like PR or RT * drug_class - the drug class code from the HIVdb rules, like NRTI diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 20835b1bc..c9a3ad6f5 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -2,7 +2,7 @@ import logging import os import tempfile -from typing import Optional, TextIO, Iterable, Dict +from typing import Optional, TextIO, Iterable, Dict, cast from collections import Counter from csv import DictWriter, DictReader from datetime import datetime @@ -264,8 +264,8 @@ def denovo(fastq1_path: str, tmp_dir = mkdtemp(dir=work_dir, prefix='assembly_') if contigs_csv is None: - contigs_csv_tmp = tempfile.NamedTemporaryFile() - contigs_csv = contigs_csv_tmp.name + contigs_csv_tmp = tempfile.NamedTemporaryFile("wt") + contigs_csv = cast(TextIO, contigs_csv_tmp.file) else: contigs_csv_tmp = None diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index 5d988f5dc..46419f9c8 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -249,6 +249,7 @@ def process(self, nuc_csv=(self.nuc_csv, 'w'), conseq_ins_csv=(self.conseq_ins_csv, 'r'), remap_conseq_csv=(self.remap_conseq_csv, 'r'), + remap_unstitched_conseq_csv=(self.remap_unstitched_conseq_csv, 'r') if use_denovo else None, 
contigs_csv=(self.contigs_csv, 'r') if use_denovo else None, contigs_stitched_csv=(self.contigs_stitched_csv, 'r') if use_denovo else None, nuc_detail_csv=(self.nuc_details_csv, 'w') if use_denovo else None, @@ -397,15 +398,17 @@ def run_denovo(self, excluded_seeds): self.scratch_path, merged_contigs_csv, blast_csv=blast_csv) + logger.info('Running remap on %s.', self) if self.debug_remap: debug_file_prefix = os.path.join(scratch_path, 'debug') else: debug_file_prefix = None + with open(self.contigs_stitched_csv) as contigs_stitched_csv, \ open(self.remap_csv, 'w') as remap_csv, \ open(self.remap_counts_csv, 'w') as counts_csv, \ - open(self.remap_conseq_csv, 'w') as conseq_csv, \ + open(self.remap_conseq_csv, 'w') as remap_conseq_csv, \ open(self.unmapped1_fastq, 'w') as unmapped1, \ open(self.unmapped2_fastq, 'w') as unmapped2: @@ -414,7 +417,27 @@ def run_denovo(self, excluded_seeds): contigs_stitched_csv, remap_csv, counts_csv, - conseq_csv, + remap_conseq_csv, + unmapped1, + unmapped2, + scratch_path, + debug_file_prefix=debug_file_prefix, + excluded_seeds=excluded_seeds) + + # Mapping the unstitched version too. 
+ with open(self.contigs_csv) as contigs_csv, \ + open(os.devnull, 'w') as remap_csv, \ + open(os.devnull, 'w') as counts_csv, \ + open(self.remap_unstitched_conseq_csv, 'w') as remap_unstitched_conseq_csv, \ + open(os.devnull, 'w') as unmapped1, \ + open(os.devnull, 'w') as unmapped2: + + map_to_contigs(self.trimmed1_fastq, + self.trimmed2_fastq, + contigs_csv, + remap_csv, + counts_csv, + remap_unstitched_conseq_csv, unmapped1, unmapped2, scratch_path, diff --git a/micall_docker.py b/micall_docker.py index 2300f7c51..b8ee54523 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -1075,6 +1075,7 @@ def collate_samples(run_info: RunInfo): """ filenames = ['remap_counts.csv', 'remap_conseq.csv', + 'remap_unstitched_conseq.csv', 'insertions.csv', 'failed_read.csv', 'nuc.csv', diff --git a/micall_kive.py b/micall_kive.py index d6b9e0c52..acf37eb38 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -33,6 +33,8 @@ def parse_args(): help='CSV containing numbers of mapped reads') parser.add_argument('remap_conseq_csv', help='CSV containing mapping consensus sequences') + parser.add_argument('remap_unstitched_conseq_csv', + help='CSV containing mapping unstitched consensus sequences') parser.add_argument('unmapped1_fastq', help='FASTQ R1 of reads that failed to map to any region') parser.add_argument('unmapped2_fastq', @@ -112,6 +114,7 @@ def load_sample(args): g2p_summary_csv=args.g2p_summary_csv, remap_counts_csv=args.remap_counts_csv, remap_conseq_csv=args.remap_conseq_csv, + remap_unstitched_conseq_csv=args.remap_unstitched_conseq_csv, unmapped1_fastq=args.unmapped1_fastq, unmapped2_fastq=args.unmapped2_fastq, insertions_csv=args.insertions_csv, diff --git a/release_test_microtest.py b/release_test_microtest.py index 79d597688..416a10a55 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -442,6 +442,7 @@ def process_sample(self, fastq_file: Path): 'g2p_summary.csv', 'remap_counts.csv', 'remap_conseq.csv', + 'remap_unstitched_conseq.csv', 
'unmapped1.fastq', 'unmapped2.fastq', 'conseq_ins.csv', From 14d52f78fe2bf96e1bdfa279699b7a4995ebfdac Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 4 Mar 2024 09:25:52 -0800 Subject: [PATCH 223/485] Rename contigs.csv to contigs_unstitched.csv And similarly, rename contigs_stitched.csv to contigs.csv --- Singularity | 4 ++-- docs/steps.md | 9 ++++---- micall/core/contig_stitcher.py | 21 ++++++++++------- micall/core/denovo.py | 41 ++++++++++++++++++---------------- micall/drivers/sample.py | 14 ++++++------ micall/monitor/kive_watcher.py | 3 ++- micall_kive.py | 6 ++--- 7 files changed, 54 insertions(+), 44 deletions(-) diff --git a/Singularity b/Singularity index 6a21fa06c..bb3f171bc 100644 --- a/Singularity +++ b/Singularity @@ -15,7 +15,7 @@ From: centos:7 MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall KIVE_INPUTS sample_info_csv fastq1 fastq2 bad_cycles_csv KIVE_OUTPUTS g2p_csv g2p_summary_csv remap_counts_csv \ - remap_conseq_csv remap_unstitched_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \ + remap_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \ failed_csv cascade_csv nuc_csv amino_csv insertions_csv conseq_csv \ conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ @@ -193,7 +193,7 @@ From: centos:7 conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ genome_coverage_csv genome_coverage_svg genome_concordance_svg \ - contigs_csv contigs_stitched_csv \ + contigs_unstitched_csv contigs_csv \ read_entropy_csv conseq_region_csv conseq_stitched_csv KIVE_THREADS 2 KIVE_MEMORY 6000 diff --git a/docs/steps.md b/docs/steps.md index aa327065e..2b8c2c773 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -44,8 +44,9 @@ Individual files are described after the list of steps. 
* in - fastq1 * in - fastq2 * in - merged_contigs.csv - * contigs.csv - the assembled contigs, plus any merged contigs, including + * contigs_unstitched.csv - the assembled contigs, plus any merged contigs, including the best blast results + * contigs.csv - stitched version of `contigs_unstitched` * blast.csv - multiple blast results for each contig * `remap`: iteratively use consensus from previous mapping as reference to try and map more reads. See [remap design] for more details. (The denovo version @@ -217,15 +218,15 @@ Individual files are described after the list of steps. * pos - 1-based position in the consensus sequence that this insertion follows * insert - the nucleotide sequence that was inserted * qual - the Phred quality scores for the inserted sequence -* contigs.csv +* contigs_unstitched.csv * ref - the reference name with the best BLAST result * match - the fraction of the contig that matched in BLAST, negative for reverse-complemented matches * group_ref - the reference name chosen to best match all of the contigs in a sample * contig - the nucleotide sequence of the assembled contig -* contigs_stitched.csv - Same as `contigs.csv`, but contigs are stitched by `micall/core/contig_stitcher.py`. +* contigs.csv + Same as `contigs_unstitched.csv`, but contigs are stitched by `micall/core/contig_stitcher.py`. * coverage_scores.csv * project - the project this score is defined by * region - the region being displayed diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index b6495387e..4a8aa45e3 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -595,14 +595,16 @@ def main(args): from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. 
parser = argparse.ArgumentParser() - parser.add_argument('contigs', type=argparse.FileType('r')) - parser.add_argument('stitched_contigs', type=argparse.FileType('w')) - parser.add_argument('--plot') + parser.add_argument('contigs', type=argparse.FileType('r'), help="Input fasta file with assembled contigs.") + parser.add_argument('stitched_contigs', type=argparse.FileType('w'), + help="Output fasta file with stitched contigs.") + parser.add_argument('--plot', type=argparse.FileType('w'), + help="Output SVG image visualizing the stitching process.") verbosity_group = parser.add_mutually_exclusive_group() - verbosity_group.add_argument('--verbose', action='store_true', help='Increase output verbosity') - verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity', default=True) - verbosity_group.add_argument('--debug', action='store_true', help='Maximum output verbosity') - verbosity_group.add_argument('--quiet', action='store_true', help='Minimize output verbosity') + verbosity_group.add_argument('--verbose', action='store_true', help='Increase output verbosity.') + verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity.', default=True) + verbosity_group.add_argument('--debug', action='store_true', help='Maximum output verbosity.') + verbosity_group.add_argument('--quiet', action='store_true', help='Minimize output verbosity.') args = parser.parse_args(args) if args.quiet: @@ -616,9 +618,12 @@ def main(args): logging.basicConfig(level=logger.level) with StitcherContext.fresh(): - write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=args.plot) + plot_path = args.plot.name if args.plot is not None else None + write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path) args.contigs.close() args.stitched_contigs.close() + if args.plot is not None: + args.plot.close() if __name__ == '__main__': diff --git 
a/micall/core/denovo.py b/micall/core/denovo.py index c9a3ad6f5..1b77c9fb5 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -72,8 +72,8 @@ def contigs_refs_write(writer, ref: str, match: float, group_ref: str, contig: s def write_contig_refs(contigs_fasta_path: str, + contigs_unstitched_csv: Optional[TextIO], contigs_csv: Optional[TextIO], - contigs_stitched_csv: Optional[TextIO], merged_contigs_csv: Optional[TextIO] = None, blast_csv: Optional[TextIO] = None, stitcher_plot_path: Optional[str] = None) -> int: @@ -81,8 +81,8 @@ def write_contig_refs(contigs_fasta_path: str, :param str contigs_fasta_path: path to file to read contig sequences from and append merged contigs to - :param contigs_csv: open file to write assembled contigs to - :param contigs_stitched_csv: open file to write stitched contigs to + :param contigs_unstitched_csv: open file to write assembled contigs to + :param contigs_csv: open file to write stitched contigs to :param merged_contigs_csv: open file to read contigs that were merged from amplicon reads :param blast_csv: open file to write BLAST search results for each contig @@ -96,8 +96,9 @@ def write_contig_refs(contigs_fasta_path: str, contig_name = f'merged-contig-{i}' contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") - writer = init_contigs_refs(contigs_csv) if contigs_csv else None - stitched_writer = init_contigs_refs(contigs_stitched_csv) if contigs_stitched_csv else None + unstitched_writer = init_contigs_refs(contigs_unstitched_csv) \ + if contigs_unstitched_csv else None + stitched_writer = init_contigs_refs(contigs_csv) if contigs_csv else None group_refs: Dict[str, str] = {} with StitcherContext.fresh() as ctx: @@ -107,16 +108,18 @@ def write_contig_refs(contigs_fasta_path: str, contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path)) - if writer is not None: + if unstitched_writer is not None: for contig in contigs: - contigs_refs_write(writer, + 
contigs_refs_write(unstitched_writer, ref=contig.ref_name, match=contig.match_fraction, group_ref=contig.group_ref, contig=contig.seq) - if stitched_writer is not None: + if stitched_writer is not None or stitcher_plot_path is not None: contigs = list(stitch_consensus(contigs)) + + if stitched_writer is not None: for contig in contigs: contigs_refs_write(stitched_writer, ref=contig.ref_name, @@ -124,8 +127,8 @@ def write_contig_refs(contigs_fasta_path: str, group_ref=contig.group_ref, contig=contig.seq) - if stitcher_plot_path is not None: - plot_stitcher_coverage(ctx.events, stitcher_plot_path) + if stitcher_plot_path is not None: + plot_stitcher_coverage(ctx.events, stitcher_plot_path) return len(contigs) @@ -234,8 +237,8 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): def denovo(fastq1_path: str, fastq2_path: str, + contigs_unstitched_csv: Optional[TextIO], contigs_csv: Optional[TextIO], - contigs_stitched_csv: Optional[TextIO], work_dir: str = '.', merged_contigs_csv: Optional[TextIO] = None, blast_csv: Optional[TextIO] = None, @@ -245,8 +248,8 @@ def denovo(fastq1_path: str, :param fastq1_path: FASTQ file name for read 1 reads :param fastq2_path: FASTQ file name for read 2 reads - :param contigs_csv: open file to write assembled contigs to - :param contigs_stitched_csv: open file to write stitched contigs to + :param contigs_unstitched_csv: open file to write assembled contigs to + :param contigs_csv: open file to write stitched contigs to :param work_dir: path for writing temporary files :param merged_contigs_csv: open file to read contigs that were merged from amplicon reads @@ -254,8 +257,8 @@ def denovo(fastq1_path: str, :param stitcher_plot_path: open file to write the visualizer plot to """ - if contigs_csv is None and contigs_stitched_csv is None: - raise ValueError("Must specify either contigs_csv or contigs_stitched_csv") + if contigs_unstitched_csv is None and contigs_csv is None: + raise ValueError("Must specify either 
contigs_csv or contigs_unstitched_csv") old_tmp_dirs = glob(os.path.join(work_dir, 'assembly_*')) for old_tmp_dir in old_tmp_dirs: @@ -307,8 +310,8 @@ def denovo(fastq1_path: str, os.chdir(start_dir) duration = datetime.now() - start_time contig_count = write_contig_refs(contigs_fasta_path, + contigs_unstitched_csv, contigs_csv, - contigs_stitched_csv, blast_csv=blast_csv, stitcher_plot_path=stitcher_plot_path) logger.info('Assembled %d contigs in %s (%ds) on %s.', @@ -326,9 +329,9 @@ def denovo(fastq1_path: str, parser = argparse.ArgumentParser() parser.add_argument('fastq1') parser.add_argument('fastq2') - parser.add_argument('contigs', type=argparse.FileType('w')) - parser.add_argument('--contigs_stitched', type=argparse.FileType('w')) + parser.add_argument('--contigs_unstitched', type=argparse.FileType('w')) + parser.add_argument('--contigs', type=argparse.FileType('w')) parser.add_argument('--stitcher_plot') args = parser.parse_args() - denovo(args.fastq1, args.fastq2, args.contigs, args.contigs_stitched, args.stitcher_plot_path) + denovo(args.fastq1, args.fastq2, args.contigs_unstitched, args.contigs, args.stitcher_plot_path) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index 46419f9c8..ed118fc49 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -250,8 +250,8 @@ def process(self, conseq_ins_csv=(self.conseq_ins_csv, 'r'), remap_conseq_csv=(self.remap_conseq_csv, 'r'), remap_unstitched_conseq_csv=(self.remap_unstitched_conseq_csv, 'r') if use_denovo else None, + contigs_unstitched_csv=(self.contigs_unstitched_csv, 'r') if use_denovo else None, contigs_csv=(self.contigs_csv, 'r') if use_denovo else None, - contigs_stitched_csv=(self.contigs_stitched_csv, 'r') if use_denovo else None, nuc_detail_csv=(self.nuc_details_csv, 'w') if use_denovo else None, amino_csv=(self.amino_csv, 'w'), amino_detail_csv=(self.amino_details_csv, 'w') if use_denovo else None, @@ -388,13 +388,13 @@ def run_denovo(self, excluded_seeds): 
logger.info('Running de novo assembly on %s.', self) scratch_path = self.get_scratch_path() with open(self.merged_contigs_csv) as merged_contigs_csv, \ + open(self.contigs_unstitched_csv, 'w') as contigs_unstitched_csv, \ open(self.contigs_csv, 'w') as contigs_csv, \ - open(self.contigs_stitched_csv, 'w') as contigs_stitched_csv, \ open(self.blast_csv, 'w') as blast_csv: denovo(self.trimmed1_fastq, self.trimmed2_fastq, + contigs_unstitched_csv, contigs_csv, - contigs_stitched_csv, self.scratch_path, merged_contigs_csv, blast_csv=blast_csv) @@ -405,7 +405,7 @@ def run_denovo(self, excluded_seeds): else: debug_file_prefix = None - with open(self.contigs_stitched_csv) as contigs_stitched_csv, \ + with open(self.contigs_csv) as contigs_csv, \ open(self.remap_csv, 'w') as remap_csv, \ open(self.remap_counts_csv, 'w') as counts_csv, \ open(self.remap_conseq_csv, 'w') as remap_conseq_csv, \ @@ -414,7 +414,7 @@ def run_denovo(self, excluded_seeds): map_to_contigs(self.trimmed1_fastq, self.trimmed2_fastq, - contigs_stitched_csv, + contigs_csv, remap_csv, counts_csv, remap_conseq_csv, @@ -425,7 +425,7 @@ def run_denovo(self, excluded_seeds): excluded_seeds=excluded_seeds) # Mapping the unstitched version too. 
- with open(self.contigs_csv) as contigs_csv, \ + with open(self.contigs_unstitched_csv) as contigs_unstitched_csv, \ open(os.devnull, 'w') as remap_csv, \ open(os.devnull, 'w') as counts_csv, \ open(self.remap_unstitched_conseq_csv, 'w') as remap_unstitched_conseq_csv, \ @@ -434,7 +434,7 @@ def run_denovo(self, excluded_seeds): map_to_contigs(self.trimmed1_fastq, self.trimmed2_fastq, - contigs_csv, + contigs_unstitched_csv, remap_csv, counts_csv, remap_unstitched_conseq_csv, diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index 421bdaf41..98479387a 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -33,6 +33,7 @@ MAXIMUM_RETRY_WAIT = timedelta(days=1) MAX_RUN_NAME_LENGTH = 60 DOWNLOADED_RESULTS = ['remap_counts_csv', + 'remap_unstitched_conseq_csv', 'conseq_csv', 'conseq_all_csv', 'conseq_stitched_csv', @@ -60,7 +61,7 @@ 'wg_fasta', 'mid_fasta', 'contigs_csv', - 'contigs_stitched_csv', + 'contigs_unstitched_csv', 'alignment_svg', 'alignment_png', 'assembly_fasta', diff --git a/micall_kive.py b/micall_kive.py index acf37eb38..0d42ed362 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -82,10 +82,10 @@ def parse_args(): action='store_true', help='Use de novo assembly instead of mapping to ' 'reference sequences.') - parser.add_argument('contigs_csv', + parser.add_argument('contigs_unstitched_csv', nargs='?', help='CSV containing contigs built by de novo assembly') - parser.add_argument('contigs_stitched_csv', + parser.add_argument('contigs_csv', nargs='?', help='CSV containing contigs built by de novo assembly and stitched by our stitcher') parser.add_argument('read_entropy_csv', @@ -130,8 +130,8 @@ def load_sample(args): coverage_scores_csv=args.coverage_scores_csv, aligned_csv=args.aligned_csv, g2p_aligned_csv=args.g2p_aligned_csv, + contigs_unstitched_csv=args.contigs_unstitched_csv, contigs_csv=args.contigs_csv, - contigs_stitched_csv=args.contigs_stitched_csv, 
genome_coverage_csv=args.genome_coverage_csv, genome_coverage_svg=args.genome_coverage_svg, genome_concordance_svg=args.genome_concordance_svg, From cb28178b1f9448866f390c64169b191e8c6ed53b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 4 Mar 2024 16:30:02 -0800 Subject: [PATCH 224/485] Contig stitcher: fix visualizer bug ignoring some strip actions --- micall/core/plot_contigs.py | 30 +++++++--- ...wo_overlapping_and_one_separate_contig.svg | 55 ++++++++++--------- ..._overlapping_and_one_separate_contig_2.svg | 55 ++++++++++--------- 3 files changed, 79 insertions(+), 61 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 65f5c0837..ce433e926 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -665,6 +665,9 @@ def extend_alive(contig_name): child_name in contig_map if child_name not in children_graph)) + lstrip_set = set(lstrip_map.keys()) + rstrip_set = set(rstrip_map.keys()) + for contig_name, parents in parent_graph.items(): if len(parents) == 1: morphism_graph[parents[0]] = [contig_name] @@ -712,14 +715,18 @@ def copy_from_parent(contig: AlignedContig, parent_name: str) -> None: for contig_name in sorted_roots: set_query_position(contig_name) - def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map, overlap_xsibling): + def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map, overlap_xsibling, xstrip_set): for parent in edge_table: child_remainder = edge_table[parent] for child_remainder_morph in eqv_morphism_graph.get(child_remainder, [child_remainder]): - if child_remainder_morph in overlap_xtake_map: - continue - for parent_morph in eqv_morphism_graph.get(parent, [parent]): + if child_remainder_morph in xstrip_set: + xstrip_set.add(parent_morph) + if parent_morph in xstrip_set: + xstrip_set.add(child_remainder_morph) + + if child_remainder_morph in overlap_xtake_map: + continue for parent_remainder in overlap_xparent_map: if 
overlap_xparent_map[parent_remainder] == parent_morph: overlap_xtake_map[child_remainder_morph] = overlap_xtake_map[parent_remainder] @@ -727,9 +734,11 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map, over yield True # Closing `takes` by parents - while list(copy_takes_one_side(combine_right_edge, overlap_lefttake_map, overlap_leftparent_map, overlap_left_sibling)): + while list(copy_takes_one_side(combine_right_edge, overlap_lefttake_map, + overlap_leftparent_map, overlap_left_sibling, rstrip_set)): pass - while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map, overlap_right_sibling)): + while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, + overlap_rightparent_map, overlap_right_sibling, lstrip_set)): pass final_nodes: List[str] = [] @@ -823,9 +832,8 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] for parent_name in sorted_roots: parts_names = final_children_mapping[parent_name] - parts = [contig_map[part] for part in parts_names] - - for part in parts: + for part_name in parts_names: + part = contig_map[part_name] if not isinstance(part, AlignedContig): continue @@ -834,12 +842,16 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] if prev_part is not None: r_st = prev_part.alignment.r_st + elif part_name in lstrip_set: + r_st = part.alignment.r_st else: start_delta = -1 * part.alignment.q_st r_st = part.alignment.r_st + start_delta if next_part is not None: r_ei = next_part.alignment.r_ei + elif part_name in rstrip_set: + r_ei = part.alignment.r_ei else: end_delta = len(part.seq) - 1 - part.alignment.q_ei r_ei = part.alignment.r_ei + end_delta diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg index 5bbf0b0f9..e09ed4e9b 100644 --- 
a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg @@ -1,9 +1,9 @@ + width="970.0" height="266.6" viewBox="0.0 -266.6 970.0 266.6"> - + @@ -11,7 +11,7 @@ - + @@ -26,13 +26,13 @@ - + testref - + @@ -40,10 +40,10 @@ 1.1 - - - -2.2 + + + +2.2 @@ -52,13 +52,13 @@ 3.2 - + - + @@ -69,16 +69,13 @@ - + - - + + - - - - + @@ -86,13 +83,13 @@ - + - + @@ -103,11 +100,11 @@ - + - + @@ -187,28 +184,34 @@ - + - + discards: - + 1.2 - + 2.1 + + + +2.3 + + diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg index 30864a1b1..6342c3939 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg @@ -1,14 +1,14 @@ + width="970.0" height="266.6" viewBox="0.0 -266.6 970.0 266.6"> - + - + @@ -26,13 +26,13 @@ - + testref - + @@ -40,10 +40,10 @@ 1.1 - - + + -2.1 +2.2 @@ -52,13 +52,13 @@ 3.2 - + - + @@ -69,30 +69,27 @@ - + - + - - - - + -2.1 +2.2 - + - + @@ -103,11 +100,11 @@ - + - + @@ -187,26 +184,32 @@ - + - + discards: - + 1.2 - + -2.2 +2.1 + + + + + +2.3 From 940b186c51950be14f0c5cf1adee47fc836ace06 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 4 Mar 2024 16:34:51 -0800 Subject: [PATCH 225/485] Contig stitcher: fix landmarks visualization --- micall/core/plot_contigs.py | 26 ++-- micall/data/landmark_reader.py | 7 + ...partially_overlapping_contigs_real_hcv.svg | 145 ++++++++++++++++++ ...artially_overlapping_contigs_real_hiv.svg} | 0 micall/tests/test_contig_stitcher.py | 42 ++++- 5 files changed, 203 insertions(+), 17 deletions(-) create mode 100644 
micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv.svg rename micall/tests/data/stitcher_plots/{test_correct_stitching_of_two_partially_overlapping_contigs_real.svg => test_correct_stitching_of_two_partially_overlapping_contigs_real_hiv.svg} (100%) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index ce433e926..e653c40d1 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -23,6 +23,7 @@ from micall.utils.contig_stitcher_contigs import Contig, GenotypedContig, AlignedContig from micall.utils.cigar_tools import CigarHit import micall.utils.contig_stitcher_events as events +from micall.data.landmark_reader import LandmarkReader logger = logging.getLogger(__name__) @@ -1052,28 +1053,22 @@ def add_section(title: str) -> None: # Drawing part # ################ - landmarks_path = (Path(__file__).parent.parent / "data" / "landmark_references.yaml") - landmark_groups = yaml.safe_load(landmarks_path.read_text()) + landmark_reader = LandmarkReader.load() figure = Figure() for group_ref in group_refs: - matching_groups = [group for group in landmark_groups if group['coordinates'] == group_ref] - if matching_groups: - reference_set = matching_groups[0] - elif "HIV1" in group_ref: - matching_groups = [group for group in landmark_groups if group['coordinates'] == "HIV1-B-FR-K03455-seed"] - reference_set = matching_groups[0] - else: - reference_set = None + try: + landmarks = landmark_reader.get_landmarks(group_ref) + except ValueError: + landmarks = None ############# # Landmarks # ############# - if reference_set: - + if landmarks: # Filling out missing ends. 
prev_landmark = None - for landmark in sorted(reference_set['landmarks'], key=itemgetter('start')): + for landmark in sorted(landmarks, key=itemgetter('start')): landmark.setdefault('frame', 0) if prev_landmark and 'end' not in prev_landmark: prev_landmark['end'] = landmark['start'] - 1 @@ -1081,14 +1076,13 @@ def add_section(title: str) -> None: # Computing the stretching factor. landmark_max = 0 - for landmark in reference_set['landmarks']: + for landmark in landmarks: landmark_max = max(landmark_max, landmark['end']) stretch_c = group_refs[group_ref] / landmark_max # Drawing the landmarks. - for frame, frame_landmarks in groupby(reference_set['landmarks'], - itemgetter('frame')): + for frame, frame_landmarks in groupby(landmarks, itemgetter('frame')): subtracks = [] for landmark in frame_landmarks: landmark_colour = landmark.get('colour') diff --git a/micall/data/landmark_reader.py b/micall/data/landmark_reader.py index 0dbe456d5..592ffa27a 100644 --- a/micall/data/landmark_reader.py +++ b/micall/data/landmark_reader.py @@ -70,6 +70,13 @@ def get_coordinates(self, seed_name: str) -> str: return genotype_landmarks['coordinates'] raise ValueError(f'No landmarks match {seed_name!r}.') + def get_landmarks(self, seed_name: str) -> typing.Dict[str, object]: + for genotype_landmarks in self.landmarks: + seed_pattern = genotype_landmarks['seed_pattern'] + if re.fullmatch(seed_pattern, seed_name): + return genotype_landmarks['landmarks'] + raise ValueError(f'No landmarks match {seed_name!r}.') + def get_region(self, ref_name, position): match_regions = [] matches = [entry diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv.svg new file mode 100644 index 000000000..06b6c481a --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv.svg @@ 
-0,0 +1,145 @@ + + + + + + + + +5' + + + + + +C + + + + + +E1 + + + + + +E2 + + + + + +p7 + + + + + +NS2 + + + + + +NS3 + + + + + +NS4b + + + + + +NS4a + + + + + +NS5a + + + + + +NS5b + + + + + +3' + + + + + + + + + + + + + + + + + + + + + + + +HCV-1a + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + +1 + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hiv.svg similarity index 100% rename from micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real.svg rename to micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hiv.svg diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 4715f1a59..d92414633 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -226,7 +226,7 @@ def test_correct_stitching_of_two_partially_overlapping_contigs_with_padding( assert len(visualizer().elements) > len(contigs) -def test_correct_stitching_of_two_partially_overlapping_contigs_real( +def test_correct_stitching_of_two_partially_overlapping_contigs_real_hiv( projects, visualizer ): # Scenario: Two partially overlapping contigs are stitched @@ -265,6 +265,46 @@ def test_correct_stitching_of_two_partially_overlapping_contigs_real( assert len(visualizer().elements) > len(contigs) +def test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv( + projects, visualizer +): + # Scenario: Two partially overlapping contigs are stitched + # correctly into a single sequence. Not using exact aligner this time. 
+ + ref_name = "HCV-1a" + ref = projects.getReference(ref_name) + group_ref = ref_name + + contigs = [ + GenotypedContig( + name="a", + seq=ref[1700:2000], + ref_name=ref_name, + group_ref=group_ref, + ref_seq=ref, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq=ref[1900:2200], + ref_name=ref_name, + group_ref=group_ref, + ref_seq=ref, + match_fraction=0.5, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 1 + + result = results[0] + + assert 500 == len(result.seq) + assert result.seq == ref[1700:2200] + + assert len(visualizer().elements) > len(contigs) + + def test_correct_stitching_of_two_partially_overlapping_contigs_with_insignificant_gaps( projects, visualizer ): From 6138b57329e6c0cfdb73589140bea1354a7f28c9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 4 Mar 2024 17:18:15 -0800 Subject: [PATCH 226/485] Update proviral pipeline inputs Use the unstitched versions of the files. --- micall/monitor/kive_watcher.py | 7 ++++--- micall/tests/test_kive_watcher.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index 98479387a..d0ffc5fc3 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -938,13 +938,14 @@ def run_proviral_pipeline(self, sample_watcher, folder_watcher, description): run_dataset['argument_name']: run_dataset['dataset'] for run_dataset in main_run['datasets'] if run_dataset['argument_name'] in ('sample_info_csv', - 'conseq_csv', - 'contigs_csv', + 'remap_unstitched_conseq_csv', + 'contigs_unstitched_csv', 'cascade_csv')} input_datasets = { argument_name: self.kive_retry(lambda: self.session.get(url).json()) for argument_name, url in input_dataset_urls.items()} - input_datasets['conseqs_csv'] = input_datasets.pop('conseq_csv') + input_datasets['conseqs_csv'] = input_datasets.pop('remap_unstitched_conseq_csv') + input_datasets['contigs_csv'] = 
input_datasets.pop('contigs_unstitched_csv') run = self.find_or_launch_run( pipeline_id, input_datasets, diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 929b30671..19bad8104 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -1759,10 +1759,10 @@ def test_launch_proviral_run(raw_data_with_two_samples, mock_open_kive): argument_name='sample_info_csv'), dict(dataset='/datasets/111/', argument_type='O', - argument_name='contigs_csv'), + argument_name='contigs_unstitched_csv'), dict(dataset='/datasets/112/', argument_type='O', - argument_name='conseq_csv'), + argument_name='remap_unstitched_conseq_csv'), dict(dataset='/datasets/113/', argument_type='O', argument_name='cascade_csv')]] # run datasets From acc110bf8c90e7fa3a33ff8d6fc1695bbdcb3b0c Mon Sep 17 00:00:00 2001 From: Donaim <15235025+Donaim@users.noreply.github.com> Date: Wed, 6 Mar 2024 21:01:25 -0800 Subject: [PATCH 227/485] Contig stitcher: various code improvements Co-authored-by: Don Kirkby --- micall/core/contig_stitcher.py | 10 ++- .../tests/data/exact_parts_contigs.plot.svg | 24 +++--- ...test_correct_processing_complex_nogaps.svg | 76 +++++++++---------- ...wo_overlapping_and_one_separate_contig.svg | 24 +++--- ..._overlapping_and_one_separate_contig_2.svg | 24 +++--- ...g_of_two_partially_overlapping_contigs.svg | 24 +++--- ...partially_overlapping_contigs_real_hcv.svg | 24 +++--- ...partially_overlapping_contigs_real_hiv.svg | 24 +++--- ...ially_overlapping_contigs_with_padding.svg | 24 +++--- ...onsensus_multiple_overlaping_sequences.svg | 24 +++--- ..._overlapping_contigs_into_one_sequence.svg | 38 +++++----- ...ching_partial_align_multiple_sequences.svg | 24 +++--- ...gs_completely_covered_by_other_contigs.svg | 24 +++--- 13 files changed, 184 insertions(+), 180 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 4a8aa45e3..9ca2cd1aa 100644 --- a/micall/core/contig_stitcher.py 
+++ b/micall/core/contig_stitcher.py @@ -10,6 +10,7 @@ from Bio import Seq import logging from fractions import Fraction +from operator import itemgetter from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.contig_stitcher_context import context, StitcherContext @@ -341,10 +342,13 @@ def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned """ Determine optimal cut points for stitching based on sequence concordance in the overlap region. """ concordance_d = list(disambiguate_concordance(concordance)) - sorted_concordance_indexes = sorted(range(len(concordance)), key=lambda i: concordance_d[i]) - def remove_dashes(s): return ''.join(c for c in s if c != '-') + sorted_concordance_indexes = [i for i, v in sorted(enumerate(concordance_d), + key=itemgetter(1), + reverse=True, + )] + def remove_dashes(s): return s.replace('-', '') - for max_concordance_index in reversed(sorted_concordance_indexes): + for max_concordance_index in sorted_concordance_indexes: aligned_left_q_index = len(remove_dashes(aligned_left[:max_concordance_index])) aligned_right_q_index = right_overlap.alignment.query_length - \ len(remove_dashes(aligned_right[max_concordance_index:])) + 1 diff --git a/micall/tests/data/exact_parts_contigs.plot.svg b/micall/tests/data/exact_parts_contigs.plot.svg index 50e268a9c..7bd2b78a5 100644 --- a/micall/tests/data/exact_parts_contigs.plot.svg +++ b/micall/tests/data/exact_parts_contigs.plot.svg @@ -124,10 +124,10 @@ - + - + @@ -144,22 +144,22 @@ - - - -1 + + + +1 - - + + -2 +2 - - + + @@ -172,7 +172,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg index 468c24c46..c670f0d08 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg @@ -8,13 +8,13 @@ - + - + - + @@ 
-40,22 +40,22 @@ - - - -1 + + + +1 - - - -3 + + + +3 - - + + -5 +5 @@ -66,8 +66,8 @@ - - + + @@ -80,8 +80,8 @@ - - + + @@ -94,7 +94,7 @@ - + @@ -125,13 +125,13 @@ - + - + - + @@ -157,22 +157,22 @@ - - - -2 + + + +2 - - - -4 + + + +4 - - + + -6 +6 @@ -183,8 +183,8 @@ - - + + @@ -197,8 +197,8 @@ - - + + @@ -211,7 +211,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg index e09ed4e9b..49aa21a16 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig.svg @@ -8,10 +8,10 @@ - + - + @@ -34,16 +34,16 @@ - - - -1.1 + + + +1.1 - - + + -2.2 +2.2 @@ -54,8 +54,8 @@ - - + + @@ -71,7 +71,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg index 6342c3939..62cf27e71 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_of_two_overlapping_and_one_separate_contig_2.svg @@ -8,10 +8,10 @@ - + - + @@ -40,16 +40,16 @@ 1.1 - - - -2.2 + + + +2.2 - - + + -3.2 +3.2 @@ -71,8 +71,8 @@ - - + + @@ -85,7 +85,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg index 87df386a2..cf5086981 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg +++ 
b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs.svg @@ -8,10 +8,10 @@ - + - + @@ -28,22 +28,22 @@ - - - -1 + + + +1 - - + + -2 +2 - - + + @@ -56,7 +56,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv.svg index 06b6c481a..eb0036e0e 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hcv.svg @@ -82,10 +82,10 @@ - + - + @@ -102,22 +102,22 @@ - - - -1 + + + +1 - - + + -2 +2 - - + + @@ -130,7 +130,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hiv.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hiv.svg index 90b68759e..662638fc6 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hiv.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_real_hiv.svg @@ -124,10 +124,10 @@ - + - + @@ -144,22 +144,22 @@ - - - -1 + + + +1 - - + + -2 +2 - - + + @@ -172,7 +172,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg index 4451d8f16..473330d16 100644 --- a/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg +++ b/micall/tests/data/stitcher_plots/test_correct_stitching_of_two_partially_overlapping_contigs_with_padding.svg @@ -8,10 +8,10 @@ - + - + @@ -28,22 +28,22 @@ - - - -1.1 + + + +1.1 - - + + -2.2 +2.2 - - + 
+ @@ -59,7 +59,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg index a5df4e916..92a38ddf4 100644 --- a/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_partial_align_consensus_multiple_overlaping_sequences.svg @@ -8,10 +8,10 @@ - + - + @@ -28,22 +28,22 @@ - - - -1.1 + + + +1.1 - - + + -2 +2 - - + + @@ -59,7 +59,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg index d0d69d99a..87a968b81 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_of_all_overlapping_contigs_into_one_sequence.svg @@ -8,13 +8,13 @@ - + - + - + @@ -34,28 +34,28 @@ - - - -1 + + + +1 - - - -2 + + + +2 - - + + -3 +3 - - + + @@ -68,8 +68,8 @@ - - + + @@ -82,7 +82,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg index fe2216305..2d4ff8f91 100644 --- a/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_partial_align_multiple_sequences.svg @@ -8,10 +8,10 @@ - + - + @@ -28,22 +28,22 @@ - - - -1.1 + + + +1.1 - - + + -2.2 +2.2 - - + + @@ -59,7 +59,7 @@ - + diff --git a/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg index acb07c521..3b2f6aaf5 100644 --- 
a/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_when_multiple_contigs_completely_covered_by_other_contigs.svg @@ -8,10 +8,10 @@ - + - + @@ -28,22 +28,22 @@ - - - -1.1 + + + +1.1 - - + + -2.2 +2.2 - - + + @@ -59,7 +59,7 @@ - + From df5eb7523530d6b867ef8f20beb9c0d5c93e1e58 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 11 Mar 2024 14:42:25 -0700 Subject: [PATCH 228/485] Run apt-update on CI Without this step, the following installation steps may fail spontaneously whenever the upstream servers update their IP addresses. --- .github/workflows/build-and-test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index de5ebe481..5ac4bde97 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -16,6 +16,9 @@ jobs: with: go-version: '^1.15.6' + - name: Run apt update + run: sudo apt-get update + - name: Install IVA assembler dependencies run: | sudo apt-get install -qq zlib1g-dev libncurses5-dev libncursesw5-dev mummer ncbi-blast+ From d9298578315d7c07e6080e8877228e0c7a339d4e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 19 Jul 2023 14:38:06 -0700 Subject: [PATCH 229/485] Improve the error message for the case of a missing input Also add a unit test for the negative case of inputs mismatch in a pipeline.
--- micall/monitor/kive_watcher.py | 11 ++++++--- micall/tests/test_kive_watcher.py | 39 +++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index d0ffc5fc3..702cc7a9a 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -983,9 +983,14 @@ def find_or_launch_run(self, for run_dataset in run_datasets): run = None if run is None: - run_datasets = [dict(argument=app_arg, - dataset=inputs[name]['url']) - for name, app_arg in app_args.items()] + try: + run_datasets = [dict(argument=app_arg, + dataset=inputs[name]['url']) + for name, app_arg in app_args.items()] + except KeyError as e: + raise ValueError(f"Pipeline input error: {repr(e)}." + f" The specified app with id {pipeline_id} appears to expect a different set of inputs." + f" Does the run name {repr(run_name)} make sense for it?") run_params = dict(name=run_name, batch=run_batch['url'], groups_allowed=ALLOWED_GROUPS, diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 19bad8104..cd75783b5 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -3160,3 +3160,42 @@ def test_collate_csv_with_sample_already_filled(): KiveWatcher.extract_csv(source2, target, 'ignored', source_count=1) assert target.getvalue() == expected_target + +def test_launch_main_good_pipeline_id(mock_open_kive, default_config): + mock_session = mock_open_kive.return_value + kive_watcher = KiveWatcher(default_config) + kive_watcher.app_urls = { + default_config.micall_filter_quality_pipeline_id: '/containerapps/102'} + kive_watcher.app_args = { + default_config.micall_filter_quality_pipeline_id: dict( + quality_csv='/containerargs/103')} + + inputs = {'quality_csv': {'url': '/datasets/104', 'id': 104}} + run_batch = {'url': '/batches/101'} + kive_watcher.find_or_launch_run(pipeline_id=42, + inputs=inputs, + run_name='MiCall filter quality on 
140101_M01234', + run_batch=run_batch) + +def test_launch_main_bad_pipeline_id(mock_open_kive, default_config): + mock_session = mock_open_kive.return_value + kive_watcher = KiveWatcher(default_config) + kive_watcher.app_urls = { + default_config.micall_filter_quality_pipeline_id: '/containerapps/102'} + kive_watcher.app_args = { + default_config.micall_filter_quality_pipeline_id: dict( + quality_csv='/containerargs/103')} + + inputs = {'quality_csv': {'bad_argument': 777, 'id': 104}} + run_batch = {'url': '/batches/101'} + pipeline_id = 42 + + with pytest.raises(ValueError) as excinfo: + kive_watcher.find_or_launch_run(pipeline_id=pipeline_id, + inputs=inputs, + run_name='MiCall filter quality on 140101_M01234', + run_batch=run_batch) + + assert f'The specified app with id {pipeline_id}' \ + ' appears to expect a different set of inputs' \ + in str(excinfo.value) From e71f156c457c06e5cfb1e6fafd8d025549f45ec6 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 11 Mar 2024 15:33:30 -0700 Subject: [PATCH 230/485] Simplify a kive test case by using pytest's match argument --- micall/tests/test_kive_watcher.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index cd75783b5..2b0e37e12 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -3189,13 +3189,11 @@ def test_launch_main_bad_pipeline_id(mock_open_kive, default_config): inputs = {'quality_csv': {'bad_argument': 777, 'id': 104}} run_batch = {'url': '/batches/101'} pipeline_id = 42 + expected_msg = f'The specified app with id {pipeline_id}' \ + ' appears to expect a different set of inputs' - with pytest.raises(ValueError) as excinfo: + with pytest.raises(ValueError, match=expected_msg) as excinfo: kive_watcher.find_or_launch_run(pipeline_id=pipeline_id, inputs=inputs, run_name='MiCall filter quality on 140101_M01234', run_batch=run_batch) - - assert f'The specified app with id 
{pipeline_id}' \ - ' appears to expect a different set of inputs' \ - in str(excinfo.value) From 3c0ca6c2ee6fb818ba15495bed1b55c35a0856b2 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 7 Mar 2024 11:31:27 -0800 Subject: [PATCH 231/485] Fix singularity arguments order --- Singularity | 4 ++-- micall/monitor/kive_watcher.py | 4 ++-- micall_kive.py | 7 ++++--- release_test_microtest.py | 1 - 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Singularity b/Singularity index bb3f171bc..bfe522316 100644 --- a/Singularity +++ b/Singularity @@ -188,12 +188,12 @@ From: centos:7 %applabels denovo KIVE_INPUTS sample_info_csv fastq1 fastq2 bad_cycles_csv KIVE_OUTPUTS g2p_csv g2p_summary_csv remap_counts_csv \ - remap_conseq_csv remap_unstitched_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \ + remap_conseq_csv unmapped1_fastq unmapped2_fastq conseq_ins_csv \ failed_csv cascade_csv nuc_csv amino_csv insertions_csv conseq_csv \ conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ genome_coverage_csv genome_coverage_svg genome_concordance_svg \ - contigs_unstitched_csv contigs_csv \ + remap_unstitched_conseq_csv contigs_unstitched_csv contigs_csv \ read_entropy_csv conseq_region_csv conseq_stitched_csv KIVE_THREADS 2 KIVE_MEMORY 6000 diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index d0ffc5fc3..a310d03c6 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -33,7 +33,6 @@ MAXIMUM_RETRY_WAIT = timedelta(days=1) MAX_RUN_NAME_LENGTH = 60 DOWNLOADED_RESULTS = ['remap_counts_csv', - 'remap_unstitched_conseq_csv', 'conseq_csv', 'conseq_all_csv', 'conseq_stitched_csv', @@ -60,8 +59,9 @@ 'resistance_consensus_csv', 'wg_fasta', 'mid_fasta', - 'contigs_csv', + 'remap_unstitched_conseq_csv', 'contigs_unstitched_csv', + 'contigs_csv', 'alignment_svg', 'alignment_png', 'assembly_fasta', diff --git a/micall_kive.py 
b/micall_kive.py index 0d42ed362..601bcc2ef 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -33,8 +33,6 @@ def parse_args(): help='CSV containing numbers of mapped reads') parser.add_argument('remap_conseq_csv', help='CSV containing mapping consensus sequences') - parser.add_argument('remap_unstitched_conseq_csv', - help='CSV containing mapping unstitched consensus sequences') parser.add_argument('unmapped1_fastq', help='FASTQ R1 of reads that failed to map to any region') parser.add_argument('unmapped2_fastq', @@ -82,6 +80,9 @@ def parse_args(): action='store_true', help='Use de novo assembly instead of mapping to ' 'reference sequences.') + parser.add_argument('remap_unstitched_conseq_csv', + nargs='?', + help='CSV containing mapping unstitched consensus sequences') parser.add_argument('contigs_unstitched_csv', nargs='?', help='CSV containing contigs built by de novo assembly') @@ -114,7 +115,6 @@ def load_sample(args): g2p_summary_csv=args.g2p_summary_csv, remap_counts_csv=args.remap_counts_csv, remap_conseq_csv=args.remap_conseq_csv, - remap_unstitched_conseq_csv=args.remap_unstitched_conseq_csv, unmapped1_fastq=args.unmapped1_fastq, unmapped2_fastq=args.unmapped2_fastq, insertions_csv=args.insertions_csv, @@ -130,6 +130,7 @@ def load_sample(args): coverage_scores_csv=args.coverage_scores_csv, aligned_csv=args.aligned_csv, g2p_aligned_csv=args.g2p_aligned_csv, + remap_unstitched_conseq_csv=args.remap_unstitched_conseq_csv, contigs_unstitched_csv=args.contigs_unstitched_csv, contigs_csv=args.contigs_csv, genome_coverage_csv=args.genome_coverage_csv, diff --git a/release_test_microtest.py b/release_test_microtest.py index 416a10a55..79d597688 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -442,7 +442,6 @@ def process_sample(self, fastq_file: Path): 'g2p_summary.csv', 'remap_counts.csv', 'remap_conseq.csv', - 'remap_unstitched_conseq.csv', 'unmapped1.fastq', 'unmapped2.fastq', 'conseq_ins.csv', From 
ea77119f7a605c8012f52fe6371de2b578c7636e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 7 Mar 2024 16:56:06 -0800 Subject: [PATCH 232/485] Contig stitcher: small improvements to code quality --- micall/core/contig_stitcher.py | 5 +++-- micall/tests/test_contig_stitcher.py | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 9ca2cd1aa..b03351657 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -332,7 +332,7 @@ def slide(start, end): return result -def disambiguate_concordance(concordance: List[float]) -> Iterable[Tuple[float, int]]: +def disambiguate_concordance(concordance: List[Fraction]) -> Iterable[Tuple[Fraction, int]]: for i, x in enumerate(concordance): global_rank = i if i < len(concordance) / 2 else len(concordance) - i - 1 yield x, global_rank @@ -346,7 +346,8 @@ def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned key=itemgetter(1), reverse=True, )] - def remove_dashes(s): return s.replace('-', '') + + def remove_dashes(s: str): return s.replace('-', '') for max_concordance_index in sorted_concordance_indexes: aligned_left_q_index = len(remove_dashes(aligned_left[:max_concordance_index])) diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index d92414633..5209865ba 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -28,7 +28,9 @@ logging.getLogger("micall.core.plot_contigs").setLevel(logging.DEBUG) -random.choice([check_hcv_db, load_projects]) # make linters not complain about unused imports. +# make linters not complain about unused imports. 
+assert check_hcv_db is not None +assert load_projects is not None @pytest.fixture() @@ -349,6 +351,7 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig( exact_aligner, visualizer ): # Scenario: Two overlapping contigs are stitched together, the non-overlapping is kept separate. + # One contig on the right, and two on the left. ref_seq = "Z" * 5 + "A" * 100 + "C" * 100 + "T" * 100 + "Y" * 5 @@ -391,6 +394,7 @@ def test_correct_processing_of_two_overlapping_and_one_separate_contig_2( exact_aligner, visualizer ): # Scenario: Two overlapping contigs are stitched together, the non-overlapping is kept separate. + # One contig on the left, and two on the right. ref_seq = "Z" * 5 + "A" * 100 + "C" * 100 + "T" * 100 + "Y" * 5 From 705a435fa04b49ab7a82306f1c220d7537b7e117 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Mar 2024 22:49:42 +0000 Subject: [PATCH 233/485] Bump activesupport from 7.0.4.3 to 7.1.3.2 in /docs Bumps [activesupport](https://github.com/rails/rails) from 7.0.4.3 to 7.1.3.2. - [Release notes](https://github.com/rails/rails/releases) - [Changelog](https://github.com/rails/rails/blob/v7.1.3.2/activesupport/CHANGELOG.md) - [Commits](https://github.com/rails/rails/compare/v7.0.4.3...v7.1.3.2) --- updated-dependencies: - dependency-name: activesupport dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 46a78a557..5a06d0262 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -1,22 +1,31 @@ GEM remote: https://rubygems.org/ specs: - activesupport (7.0.4.3) + activesupport (7.1.3.2) + base64 + bigdecimal concurrent-ruby (~> 1.0, >= 1.0.2) + connection_pool (>= 2.2.5) + drb i18n (>= 1.6, < 2) minitest (>= 5.1) + mutex_m tzinfo (~> 2.0) addressable (2.8.1) public_suffix (>= 2.0.2, < 6.0) + base64 (0.2.0) + bigdecimal (3.1.6) coffee-script (2.4.1) coffee-script-source execjs coffee-script-source (1.11.1) colorator (1.1.0) commonmarker (0.23.9) - concurrent-ruby (1.2.2) + concurrent-ruby (1.2.3) + connection_pool (2.4.1) dnsruby (1.61.9) simpleidn (~> 0.1) + drb (2.2.1) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) @@ -86,7 +95,7 @@ GEM activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.8.0) - i18n (1.12.0) + i18n (1.14.4) concurrent-ruby (~> 1.0) jekyll (3.9.3) addressable (~> 2.4) @@ -210,7 +219,8 @@ GEM jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.18.0) + minitest (5.22.2) + mutex_m (0.2.0) nokogiri (1.14.3) mini_portile2 (~> 2.8.0) racc (~> 1.4) From 43c60a639c1337be7dd7efa8f0f750e64f02397b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Mar 2024 22:49:47 +0000 Subject: [PATCH 234/485] Bump drawsvg from 2.2.0 to 2.3.0 Bumps [drawsvg](https://github.com/cduck/drawsvg) from 2.2.0 to 2.3.0. - [Commits](https://github.com/cduck/drawsvg/compare/2.2.0...2.3.0) --- updated-dependencies: - dependency-name: drawsvg dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8941a9e11..73072c094 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,5 +14,5 @@ reportlab==4.0.4 pysam==0.21.0 git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 -drawsvg==2.2.0 +drawsvg==2.3.0 cairosvg==2.7.0 \ No newline at end of file From 805bc294b8ab57e3e7dd18dbf3dfb3a64c197263 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 07:32:19 +0000 Subject: [PATCH 235/485] Bump python-levenshtein from 0.21.1 to 0.25.0 Bumps [python-levenshtein](https://github.com/rapidfuzz/python-Levenshtein) from 0.21.1 to 0.25.0. - [Release notes](https://github.com/rapidfuzz/python-Levenshtein/releases) - [Changelog](https://github.com/rapidfuzz/python-Levenshtein/blob/main/HISTORY.md) - [Commits](https://github.com/rapidfuzz/python-Levenshtein/compare/v0.21.1...v0.25.0) --- updated-dependencies: - dependency-name: python-levenshtein dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 73072c094..a4f87a198 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ numpy==1.24.3 scipy==1.10.1 matplotlib==3.7.3 cutadapt==3.7 -python-Levenshtein==0.21.1 +python-Levenshtein==0.25.0 PyYAML==6.0.1 reportlab==4.0.4 pysam==0.21.0 From ec3fb5aac1b50488634a41fb103b745dabde44fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:09:17 +0000 Subject: [PATCH 236/485] Bump pytest from 7.4.0 to 8.1.1 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.4.0 to 8.1.1. 
- [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.4.0...8.1.1) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index d0ab5b830..9257e8df5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,7 +2,7 @@ -r requirements.txt -r requirements-watcher.txt -pytest==7.4.0 +pytest==8.1.1 coverage==7.2.7 pandas==2.0.2 seaborn==0.12.2 From 7d24d3eeb69a8d3bcd0b10e696f4e3817fe34463 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:11:18 +0000 Subject: [PATCH 237/485] Bump coverage from 7.2.7 to 7.4.3 Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.2.7 to 7.4.3. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.2.7...7.4.3) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 9257e8df5..78fbf3a68 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -3,7 +3,7 @@ -r requirements.txt -r requirements-watcher.txt pytest==8.1.1 -coverage==7.2.7 +coverage==7.4.3 pandas==2.0.2 seaborn==0.12.2 ete3 From 3df103a9a15e4aed52c12d97ae3eb220b596e77c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 22 Nov 2023 11:44:22 -0800 Subject: [PATCH 238/485] Complete typing annotations for several files * consensus_aligner.py * report_amino.py * project_config.py They are all imported by Contig stitcher. --- micall/core/denovo.py | 3 +- micall/core/project_config.py | 5 +- micall/utils/consensus_aligner.py | 95 ++++++++++++++++--------------- micall/utils/report_amino.py | 5 +- 4 files changed, 58 insertions(+), 50 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 1b77c9fb5..8f3be5acc 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -2,6 +2,7 @@ import logging import os import tempfile +import typing from typing import Optional, TextIO, Iterable, Dict, cast from collections import Counter from csv import DictWriter, DictReader @@ -197,7 +198,7 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): for match in matches} top_refs = set(contig_top_matches.values()) projects = ProjectConfig.loadDefault() - match_scores: Counter[str] = Counter() + match_scores: typing.Counter[str] = Counter() for contig_name, contig_matches in groupby(matches, itemgetter('qaccver')): contig_top_ref = contig_top_matches[contig_name] contig_seed_group = projects.getSeedGroup(contig_top_ref) diff --git a/micall/core/project_config.py b/micall/core/project_config.py index 2fea21b39..b336254f9 100644 --- a/micall/core/project_config.py +++ b/micall/core/project_config.py @@ -1,5 +1,6 @@ import json import os +from typing import 
Dict, List G2P_SEED_NAME = "HIV1-CON-XX-Consensus-seed" @@ -53,7 +54,7 @@ def writeSeedFasta(self, fasta_file, excluded_seeds=None): if excluded_seeds: seed_region_set.difference_update(excluded_seeds) seed_region_list = list(seed_region_set) - seed_name_map = {} # {sequence: name} + seed_name_map: Dict[str, str] = {} # {sequence: name} seed_region_list.sort() for name in seed_region_list: region = self.config['regions'][name] @@ -157,7 +158,7 @@ def getProjectRegions(self, seed_name, coordinate_name, excluded_projects=None): project_names = set(self.config['projects']) if excluded_projects is not None: project_names.difference_update(excluded_projects) - project_names = sorted(project_names) + project_names: List[str] = sorted(project_names) # type: ignore[no-redef] for project_name in project_names: project = self.config['projects'][project_name] for region in project['regions']: diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 66e9ca6bf..79c23fbb3 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -1,4 +1,4 @@ -import typing +from typing import Dict, List, Optional, Iterable, Set, Any from dataclasses import dataclass, replace from enum import IntEnum from itertools import count @@ -91,7 +91,7 @@ def __new__(cls, q_st=0, q_en=0, mapq=0, - cigar: typing.Iterable[typing.List[int]] = tuple(), + cigar: Iterable[List[int]] = tuple(), is_primary=True, mlen=0, blen=0, @@ -179,14 +179,14 @@ def __init__(self, self.coordinate_name = self.consensus = self.amino_consensus = '' self.algorithm = '' self.consensus_offset = 0 - self.alignments: typing.List[Alignment] = [] - self.reading_frames: typing.List[typing.List[SeedAmino]] = [] - self.seed_nucs: typing.List[SeedNucleotide] = [] - self.amino_alignments: typing.List[AminoAlignment] = [] + self.alignments: List[Alignment] = [] + self.reading_frames: Dict[int, List[SeedAmino]] = {} + self.seed_nucs: List[SeedNucleotide] = [] + 
self.amino_alignments: List[AminoAlignment] = [] self.contig_name = contig_name # consensus nucleotide positions that were inserts - self.inserts: typing.Set[int] = set() + self.inserts: Set[int] = set() if alignments_file is not None: self.alignments_writer = self._create_alignments_writer(alignments_file) @@ -248,11 +248,9 @@ def _create_alignments_writer(alignments_file, different_columns=None): return writer def start_contig(self, - coordinate_name: str = None, - consensus: str = None, - reading_frames: typing.Dict[ - int, - typing.List[SeedAmino]] = None): + coordinate_name: Optional[str] = None, + consensus: Optional[str] = None, + reading_frames: Optional[Dict[int, List[SeedAmino]]] = None): self.clear() if consensus: @@ -301,7 +299,7 @@ def start_contig(self, "cigar_str": alignment.cigar_str} self.overall_alignments_writer.writerow(row) - def align_gotoh(self, coordinate_seq, consensus): + def align_gotoh(self, coordinate_seq: str, consensus: str): gap_open_penalty = 15 gap_extend_penalty = 3 use_terminal_gap_penalty = 1 @@ -313,12 +311,12 @@ def align_gotoh(self, coordinate_seq, consensus): use_terminal_gap_penalty) if min(len(coordinate_seq), len(consensus)) < score: ref_start = len(aligned_consensus) - len(aligned_consensus.lstrip('-')) - aligned_consensus: str = aligned_consensus[ref_start:] - aligned_coordinate: str = aligned_coordinate[ref_start:] + aligned_consensus: str = aligned_consensus[ref_start:] # type: ignore[no-redef] + aligned_coordinate: str = aligned_coordinate[ref_start:] # type: ignore[no-redef] aligned_consensus = aligned_consensus.rstrip('-') ref_index = ref_start consensus_index = 0 - cigar = [] + cigar: List[List[int]] = [] for ref_nuc, nuc in zip(aligned_coordinate, aligned_consensus): expected_nuc = consensus[consensus_index] ref_index += 1 @@ -346,9 +344,9 @@ def align_gotoh(self, coordinate_seq, consensus): def find_amino_alignments(self, start_pos: int, end_pos: int, - repeat_pos: typing.Optional[int], - skip_pos: 
typing.Optional[int], - amino_ref: str): + repeat_pos: Optional[int], + skip_pos: Optional[int], + amino_ref: Optional[str]): translations = { reading_frame: translate( '-'*(reading_frame + self.consensus_offset) + @@ -527,11 +525,11 @@ def report_region( self, start_pos: int, end_pos: int, - report_nucleotides: typing.List[ReportNucleotide], - report_aminos: typing.List[ReportAmino] = None, - repeat_position: int = None, - skip_position: int = None, - amino_ref: str = None): + report_nucleotides: List[ReportNucleotide], + report_aminos: Optional[List[ReportAmino]] = None, + repeat_position: Optional[int] = None, + skip_position: Optional[int] = None, + amino_ref: Optional[str] = None): """ Add read counts to report counts for a section of the reference. :param start_pos: 1-based position of first nucleotide to report in @@ -565,7 +563,7 @@ def report_region( self.build_nucleotide_report(start_pos, end_pos, report_nucleotides) - else: + elif amino_ref is not None: report_aminos.extend(ReportAmino(SeedAmino(None), i + 1) for i in range(len(amino_ref))) self.build_amino_report(start_pos, @@ -593,13 +591,13 @@ def get_deletion_coverage(self, consensus_nuc_index): def build_amino_report(self, start_pos: int, end_pos: int, - report_nucleotides: typing.List[ReportNucleotide], - report_aminos: typing.List[ReportAmino] = None, - repeat_position: int = None, - skip_position: int = None, - amino_ref: str = None): + report_nucleotides: List[ReportNucleotide], + report_aminos: Optional[List[ReportAmino]] = None, + repeat_position: Optional[int] = None, + skip_position: Optional[int] = None, + amino_ref: Optional[str] = None): """ Add read counts to report counts for a section of the reference. - + Used for regions that translate to amino acids. 
:param start_pos: 1-based position of first nucleotide to report in @@ -657,13 +655,13 @@ def build_amino_report(self, @staticmethod def update_report_amino(coord_index: int, - report_aminos: typing.List[ReportAmino], - report_nucleotides: typing.List[ReportNucleotide], + report_aminos: List[ReportAmino], + report_nucleotides: List[ReportNucleotide], seed_amino: SeedAmino, start_pos: int, - repeat_position: int = None, - skip_position: int = None, - skipped_nuc=None): + repeat_position: Optional[int] = None, + skip_position: Optional[int] = None, + skipped_nuc: Optional[SeedAmino] =None): report_amino = report_aminos[coord_index] report_amino.seed_amino.add(seed_amino) ref_nuc_pos = coord_index * 3 + start_pos @@ -848,7 +846,7 @@ def count_match(self, def build_nucleotide_report(self, start_pos: int, end_pos: int, - report_nucleotides: typing.List[ReportNucleotide]): + report_nucleotides: List[ReportNucleotide]): """ Add read counts to report counts for a section of the reference. Used for regions that don't translate to amino acids. 
@@ -921,14 +919,14 @@ def seed_concordance(self, seed_name, projects, seed_coordinates, excluded_regio continue self.region_seed_concordance(region, seed_name, seed_alignments, seed_ref, start_pos, end_pos) - def coord_concordance(self, half_window_size=10): + def coord_concordance(self, half_window_size: int = 10) -> List[float]: coord_alignments = self.alignments try: coord_ref = self.projects.getGenotypeReference(self.coordinate_name) except KeyError: coord_ref = self.projects.getReference(self.coordinate_name) query_matches = [0] * len(self.consensus) - concordance_list: typing.List[typing.Any] = [None] * len(self.consensus) + concordance_list: List[float] = [0] * len(self.consensus) for alignment in coord_alignments: ref_progress = alignment.r_st @@ -1019,11 +1017,11 @@ class AminoAlignment: ref_end: int action: CigarActions reading_frame: int - query: str = None # Amino sequence - ref: str = None # Amino sequence - aligned_query: str = None - aligned_ref: str = None - ref_amino_start: int = None + query: Optional[str] = None # Amino sequence + ref: Optional[str] = None # Amino sequence + aligned_query: Optional[str] = None + aligned_ref: Optional[str] = None + ref_amino_start: Optional[int] = None def has_overlap(self, start_pos: int, end_pos: int) -> bool: before_end = self.ref_start < end_pos @@ -1073,8 +1071,15 @@ def size(self): def amino_size(self): return (self.size + 2) // 3 - def map_amino_sequences(self) -> typing.Dict[int, int]: + def map_amino_sequences(self) -> Dict[int, int]: """ Map reference amino indexes to query amino indexes. 
""" + + assert self.aligned_ref is not None, "For this operation, aligned_ref must not be None" + assert self.aligned_query is not None, "For this operation, aligned_query must not be None" + assert self.query is not None, "For this operation, query must not be None" + assert self.ref is not None, "For this operation, ref must not be None" + assert self.ref_amino_start is not None, "For this operation, ref_amino_start must not be None" + seq_map = {} query_offset = (self.query_start + self.reading_frame) // 3 ref_index = query_index = 0 diff --git a/micall/utils/report_amino.py b/micall/utils/report_amino.py index 917614915..8c7e70380 100644 --- a/micall/utils/report_amino.py +++ b/micall/utils/report_amino.py @@ -1,4 +1,5 @@ from collections import Counter +from typing import Optional from micall.utils.translation import translate, ambig_dict @@ -19,7 +20,7 @@ def __init__(self, consensus_nuc_index, counts=None): if consensus_nuc_index is not None: self.all_consensus_nuc_indexes.add(consensus_nuc_index) self.counts = counts or Counter() # {amino: count} - self.codon_counts = Counter() # {codon_nucs: count} + self.codon_counts: Counter = Counter() # {codon_nucs: count} self.nucleotides = [] for i in range(3): seed_nuc = SeedNucleotide() @@ -239,7 +240,7 @@ def count_overlap(self, other): class ReportNucleotide: - def __init__(self, position: int, seed_nucleotide: SeedNucleotide = None): + def __init__(self, position: int, seed_nucleotide: Optional[SeedNucleotide] = None): self.position = position if seed_nucleotide is None: self.seed_nucleotide = SeedNucleotide() From 5cfc0205e3c9b395320ae0e126476eaa0bdb209c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 22 Nov 2023 11:44:48 -0800 Subject: [PATCH 239/485] Add .mypy.ini configuration --- .mypy.ini | 12 ++++++++++++ requirements-test.txt | 2 ++ 2 files changed, 14 insertions(+) create mode 100644 .mypy.ini diff --git a/.mypy.ini b/.mypy.ini new file mode 100644 index 000000000..c6f790075 --- /dev/null +++ 
b/.mypy.ini @@ -0,0 +1,12 @@ +[mypy] +files = micall/utils/consensus_aligner.py, + micall/utils/report_amino.py, + micall/core/project_config.py + +check_untyped_defs = True + +[mypy-mappy] +ignore_missing_imports = true + +[mypy-gotoh] +ignore_missing_imports = true diff --git a/requirements-test.txt b/requirements-test.txt index 78fbf3a68..a913a3fcc 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,6 +2,8 @@ -r requirements.txt -r requirements-watcher.txt +mypy==1.7.0 +mypy-extensions==1.0.0 pytest==8.1.1 coverage==7.4.3 pandas==2.0.2 From 9b95c29f915247b65a8b1b460fe10a2363c5afe2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 16:13:55 +0000 Subject: [PATCH 240/485] Bump cairosvg from 2.7.0 to 2.7.1 Bumps [cairosvg](https://github.com/Kozea/CairoSVG) from 2.7.0 to 2.7.1. - [Release notes](https://github.com/Kozea/CairoSVG/releases) - [Changelog](https://github.com/Kozea/CairoSVG/blob/master/NEWS.rst) - [Commits](https://github.com/Kozea/CairoSVG/compare/2.7.0...2.7.1) --- updated-dependencies: - dependency-name: cairosvg dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a4f87a198..a6f5a261e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,4 @@ pysam==0.21.0 git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 drawsvg==2.3.0 -cairosvg==2.7.0 \ No newline at end of file +cairosvg==2.7.1 \ No newline at end of file From 1b8b30bec2af4b0e9d9a46aa8dee67b54a6be4de Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 22 Nov 2023 12:59:55 -0800 Subject: [PATCH 241/485] Run mypy on CI --- .github/workflows/build-and-test.yml | 4 ++++ requirements-dev.txt | 2 ++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 5ac4bde97..b0eaeb01a 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -63,6 +63,10 @@ jobs: python -m pip install --upgrade pip pip install -r requirements-test.txt + - name: Run type checker + run: | + mypy + - name: Test with pytest run: coverage run --source=micall/core,micall/g2p,micall/resistance,micall/monitor -m pytest diff --git a/requirements-dev.txt b/requirements-dev.txt index be9f95100..0d407499a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,3 +3,5 @@ -r requirements-watcher.txt # Used for plotting profiling results. 
gprof2dot==2022.7.29 +mypy==1.7.0 +mypy-extensions==1.0.0 From 4b0fea6036a419d18e6e41327ac1e4026dc4223a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 11 Mar 2024 12:15:20 -0700 Subject: [PATCH 242/485] Typecheck stitcher code --- .mypy.ini | 36 +++++++++++++++++++++++++++++++++- micall/data/landmark_reader.py | 6 +++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/.mypy.ini b/.mypy.ini index c6f790075..4ca29fa84 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -1,12 +1,46 @@ [mypy] files = micall/utils/consensus_aligner.py, micall/utils/report_amino.py, - micall/core/project_config.py + micall/core/project_config.py, + micall/core/contig_stitcher.py check_untyped_defs = True +[mypy-micall.core.plot_contigs] +check_untyped_defs = False + [mypy-mappy] ignore_missing_imports = true [mypy-gotoh] ignore_missing_imports = true + +[mypy-yaml] +ignore_missing_imports = true + +[mypy-Bio] +ignore_missing_imports = true + +[mypy-Bio.Seq] +ignore_missing_imports = True + +[mypy-Bio.Blast.Applications] +ignore_missing_imports = True + +[mypy-Bio.SeqRecord] +ignore_missing_imports = True + +[mypy-genetracks] +ignore_missing_imports = true + +[mypy-genetracks.elements] +ignore_missing_imports = true + +[mypy-drawsvg] +ignore_missing_imports = true + +[mypy-matplotlib] +ignore_missing_imports = true + +[mypy-matplotlib.colors] +ignore_missing_imports = true diff --git a/micall/data/landmark_reader.py b/micall/data/landmark_reader.py index 592ffa27a..9c01b6031 100644 --- a/micall/data/landmark_reader.py +++ b/micall/data/landmark_reader.py @@ -1,5 +1,5 @@ import re -import typing +from typing import Optional, TextIO, Dict from operator import itemgetter from pathlib import Path @@ -10,7 +10,7 @@ class LandmarkReader: @classmethod - def load(cls, f: typing.TextIO = None): + def load(cls, f: Optional[TextIO] = None): """ Load an instance of this class from an open JSON file. :param f: The file to load from, or None to load from the default. 
@@ -70,7 +70,7 @@ def get_coordinates(self, seed_name: str) -> str: return genotype_landmarks['coordinates'] raise ValueError(f'No landmarks match {seed_name!r}.') - def get_landmarks(self, seed_name: str) -> typing.Dict[str, object]: + def get_landmarks(self, seed_name: str) -> Dict[str, object]: for genotype_landmarks in self.landmarks: seed_pattern = genotype_landmarks['seed_pattern'] if re.fullmatch(seed_pattern, seed_name): From ce82ad4930ef8ac0b2f402e33df34e6f50e791c1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 25 Jul 2023 13:31:31 -0700 Subject: [PATCH 243/485] Rename "hivseqinr_results" output to "detailed_results" --- micall/monitor/kive_watcher.py | 2 +- micall/tests/test_kive_watcher.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index e437400fe..981d132d8 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -74,7 +74,7 @@ 'contigs_primers_csv', 'table_precursor_csv', 'proviral_landscape_csv', - 'hivseqinr_results_tar'] + 'detailed_results_tar'] # noinspection PyArgumentList FolderEventType = Enum('FolderEventType', 'ADD_SAMPLE FINISH_FOLDER') diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 2b0e37e12..fdd83ef9c 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -3007,7 +3007,7 @@ def test_collate_denovo_results(raw_data_with_two_samples, default_config, mock_ expected_cascade_path = version_folder / "denovo" / "cascade.csv" expected_done_path = version_folder / "denovo" / "doneprocessing" - proviral_path = version_folder / "denovo" / "hivseqinr_results" + proviral_path = version_folder / "denovo" / "detailed_results" main_scratch_path = version_folder / "scratch" main_scratch_path.mkdir() From 413d062683a83e8a1593579f519b6ec9a2ebc530 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 25 Jul 2023 14:41:05 -0700 Subject: [PATCH 244/485] 
Replace all HIVSeqinR by HIVIntact --- micall/monitor/kive_watcher.py | 2 +- micall/tests/test_kive_watcher.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index 981d132d8..7946c2a4f 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -781,7 +781,7 @@ def run_pipeline(self, run = self.run_proviral_pipeline( sample_watcher, folder_watcher, - 'Proviral HIVSeqinR') + 'Proviral HIVIntact') return run if pipeline_type == PipelineType.RESISTANCE: run = self.run_resistance_pipeline( diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index fdd83ef9c..4056a6e6e 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -1784,7 +1784,7 @@ def test_launch_proviral_run(raw_data_with_two_samples, mock_open_kive): dataset='/datasets/112/'), dict(argument='/containerargs/106', dataset='/datasets/113/')], - name='Proviral HIVSeqinR on 2120A', + name='Proviral HIVIntact on 2120A', batch='/batches/101', groups_allowed=['Everyone'])) From 977edfa0f8e756995059033b07621561eb34fec7 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 12 Mar 2024 10:19:16 -0700 Subject: [PATCH 245/485] Kive: support both HIVSeqinR and HIVIntact versions of proviral --- micall/monitor/kive_watcher.py | 3 ++- micall/tests/test_kive_watcher.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index 7946c2a4f..1a5d98c52 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -74,6 +74,7 @@ 'contigs_primers_csv', 'table_precursor_csv', 'proviral_landscape_csv', + 'hivseqinr_results_tar', # TODO: remove when proviral is finally updated. 
'detailed_results_tar'] # noinspection PyArgumentList @@ -781,7 +782,7 @@ def run_pipeline(self, run = self.run_proviral_pipeline( sample_watcher, folder_watcher, - 'Proviral HIVIntact') + 'Proviral') return run if pipeline_type == PipelineType.RESISTANCE: run = self.run_resistance_pipeline( diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 4056a6e6e..89ec37385 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -1784,7 +1784,7 @@ def test_launch_proviral_run(raw_data_with_two_samples, mock_open_kive): dataset='/datasets/112/'), dict(argument='/containerargs/106', dataset='/datasets/113/')], - name='Proviral HIVIntact on 2120A', + name='Proviral on 2120A', batch='/batches/101', groups_allowed=['Everyone'])) From bb4d920c63e00e19032932ea5cbf92bf7bced603 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 20 Sep 2023 13:54:30 -0700 Subject: [PATCH 246/485] Switch to Debian in Singularity image --- Singularity | 71 +++++++++++++---------------------------------------- 1 file changed, 17 insertions(+), 54 deletions(-) diff --git a/Singularity b/Singularity index bfe522316..352b84d9f 100644 --- a/Singularity +++ b/Singularity @@ -1,6 +1,6 @@ # Generate the Singularity container to run MiCall on Kive. Bootstrap: docker -From: centos:7 +From: python:3.8 %help MiCall maps all the reads from a sample against a set of reference @@ -53,48 +53,28 @@ From: centos:7 %post echo ===== Installing Prerequisites ===== >/dev/null - yum update -q -y - - yum groupinstall -q -y 'development tools' - yum install -q -y epel-release - yum install -q -y unzip wget fontconfig bzip2-devel xz-devel openssl-devel \ - libffi-devel sqlite-devel - - echo ===== Installing Python ===== >/dev/null - wget -q https://www.python.org/ftp/python/3.8.3/Python-3.8.3.tar.xz - tar xJf Python* - rm Python*.xz - cd Python* - ./configure --enable-optimizations - make altinstall - cd .. 
- rm -rf Python* - ln -s /usr/local/bin/python3.8 /usr/local/bin/python3 + apt-get update -q + apt-get install -q -y unzip wget echo ===== Installing blast ===== >/dev/null - cd /root - # Saved our own copy, because download was slow from ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-1.x86_64.rpm - wget -q https://github.com/cfe-lab/MiCall/releases/download/v7.12.dev28/ncbi-blast-2.6.0+-1.x86_64.rpm - yum install -q -y ncbi-blast-2.6.0+-1.x86_64.rpm - rm ncbi-blast-2.6.0+-1.x86_64.rpm - python3 /opt/micall/micall/blast_db/make_blast_db.py + apt-get install -q -y ncbi-blast+ echo ===== Installing Rust and merge-mates ===== >/dev/null - yum install -q -y rust cargo + wget -qO rustup.sh https://sh.rustup.rs + chmod +x /rustup.sh + /rustup.sh -y -q + . /root/.cargo/env + rm rustup.sh cargo install --quiet --root / --git https://github.com/jeff-k/merge-mates.git --rev 2fec61363f645e2008a4adff553d098beae21469 - ## Miniconda (Python 2) (Don't use this) - #wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh - #bash miniconda.sh -b -p /opt/miniconda - echo ===== Installing bowtie2 ===== >/dev/null wget -q -O bowtie2.zip https://github.com/BenLangmead/bowtie2/releases/download/v2.2.8/bowtie2-2.2.8-linux-x86_64.zip - unzip -qq bowtie2.zip -d /opt/ + unzip bowtie2.zip -d /opt/ ln -s /opt/bowtie2-2.2.8/ /opt/bowtie2 rm bowtie2.zip echo ===== Installing IVA dependencies ===== >/dev/null - yum install -q -y tcsh ncurses-devel zlib-devel + apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev cd /bin wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc_dump @@ -123,31 +103,14 @@ From: centos:7 ln -s /opt/smalt-0.7.6-bin/smalt_x86_64 /bin/smalt echo ===== Installing Python packages ===== >/dev/null + # Install dependencies for genetracks/drawsvg + apt-get install -q -y libcairo2-dev # Also trigger matplotlib to build its font cache. 
- wget -q https://bootstrap.pypa.io/get-pip.py - python3 get-pip.py - rm get-pip.py cd /opt - pip install --quiet -r /opt/micall/requirements.txt - ln -s /usr/local/bin/cutadapt /usr/local/bin/cutadapt-1.11 - python3 -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' - - # Install dependencies for genetracks/drawsvg - yum install -q -y cairo-devel cairo cairo-tools zlib-devel - - yum groupremove -q -y 'development tools' - yum remove -q -y epel-release wget unzip - yum autoremove -q -y - yum clean all - - rm -rf /var/cache/yum - - ## CAUTION! This changes the default python command to python3! - ## This breaks many things, including yum! - ## To switch back to python2, use this command: - # sudo alternatives --set python /usr/bin/python2 - alternatives --install /usr/bin/python python /usr/bin/python2 50 - alternatives --install /usr/bin/python python /usr/local/bin/python3 60 + pip install --upgrade pip + pip install -r /opt/micall/requirements-basespace.txt + python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' + python /opt/micall/micall/blast_db/make_blast_db.py %environment export PATH=/opt/bowtie2:/bin:/usr/local/bin From 6eaffb03bbac8bd9f04fa9c3767d78e51b0652c4 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 11 Mar 2024 14:50:03 -0700 Subject: [PATCH 247/485] Singularity: install requirements.txt, not its basespace version We do not need the additional package that requirements-basespace.txt provides over requirements.txt. --- Singularity | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Singularity b/Singularity index 352b84d9f..1e53cb728 100644 --- a/Singularity +++ b/Singularity @@ -108,7 +108,7 @@ From: python:3.8 # Also trigger matplotlib to build its font cache. 
cd /opt pip install --upgrade pip - pip install -r /opt/micall/requirements-basespace.txt + pip install -r /opt/micall/requirements.txt python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' python /opt/micall/micall/blast_db/make_blast_db.py From 0b728a32865aecc66fb3280b7a4e342da161ac07 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 26 Feb 2024 12:55:59 -0800 Subject: [PATCH 248/485] Add flake8 configuration file We ignore a lot of warnings in order to assert that the current quality of code is acceptable. --- .flake8 | 82 ++++++++++++++++++++++++++++++++++ micall/core/contig_stitcher.py | 3 +- 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..d9a369289 --- /dev/null +++ b/.flake8 @@ -0,0 +1,82 @@ +[flake8] +ignore = E226,E302,E41 +max-line-length = 120 +per-file-ignores = + micall/core/aln2counts.py: W504,E225,E126,W293 + micall/core/amplicon_finder.py: E126,W504 + micall/core/cascade_report.py: W293 + micall/core/filter_quality.py: W504 + micall/core/plot_contigs.py: W504,E501,F841,E231,E261,E262,E265,E302,E303,E305,E999,W291,W292,F401 + micall/core/plot_simple.py: F401,F841,W292 + micall/core/project_config.py: E722,E261 + micall/core/remap.py: F401 + micall/core/sam2aln.py: E128 + micall/core/trim_fastqs.py: W504 + micall/drivers/run_info.py: E225 + micall/g2p/pssm_lib.py: E722 + micall/monitor/find_groups.py: W504 + micall/monitor/kive_watcher.py: W504,E126,E501,W503 + micall/monitor/sample_watcher.py: W503,W504 + micall/monitor/update_qai.py: W503,W504 + micall/resistance/asi_algorithm.py: W504 + micall/resistance/pdfreport.py: E241,W504 + micall/resistance/resistance.py: W504 + micall/tcr/igblast.py: E265,E231,E502,E128,W291,E305 + micall/tests/microtest/make_sample.py: W504 + micall/tests/microtest/reverse_and_complement.py: E126 + micall/tests/microtest/translate_nuc.py: E126 + micall/tests/svg_differ.py: E126,F541 + 
micall/tests/test_align_it_gotoh.py: E261,E262,E225,E303,E501,W292 + micall/tests/test_aln2counts_report.py: W504,F401 + micall/tests/test_aln2counts_seed_amino.py: W293 + micall/tests/test_aln2counts_seed_nucleotide.py: W293 + micall/tests/test_cigar_tools.py: E704,W503 + micall/tests/test_consensus_aligner.py: F841,E126,W504,E701,F401 + micall/tests/test_contig_stitcher.py: W503 + micall/tests/test_error_metrics_parser.py: E231 + micall/tests/test_fastq_g2p.py: W293 + micall/tests/test_hcv_reference_tree.py: W293 + micall/tests/test_hcv_rules_import.py: E241 + micall/tests/test_kive_watcher.py: W504,E126,E127,F841 + micall/tests/test_plot_contigs.py: E303 + micall/tests/test_primer_tracker.py: F401 + micall/tests/test_projects_dump.py: W504 + micall/tests/test_quality_metrics_parser.py: E231 + micall/tests/test_remap.py: F541 + micall/tests/test_resistance.py: W504 + micall/tests/test_report_amino.py: W292 + micall/tests/test_sam2aln.py: E121 + micall/tests/test_sample.py: F401 + micall/tests/test_sample_sheet_parser.py: E123 + micall/tests/test_tile_metrics_parser.py: E231 + micall/tests/test_trim_fastqs.py: E121,E123 + micall/tests/test_update_qai.py: E127 + micall/utils/aln2counts_simplify.py: F401,E261,E126,F821 + micall/utils/big_counter.py: W293 + micall/utils/cigar_tools.py: E704 + micall/utils/compare_454_samples.py: W504,E305 + micall/utils/compare_mapping.py: W504,F841 + micall/utils/concordance_evaluation.py: E303 + micall/utils/consensus_aligner.py: W504,W293,E126,E303,F401,E261,E252 + micall/utils/conseq_compare.py: F811,E305,W503,F841,F401 + micall/utils/contig_summary.py: W504 + micall/utils/coverage_data.py: W504 + micall/utils/dd.py: E711,W504,E211,E303 + micall/utils/fetch_sequences.py: W504,E126,E303,E265 + micall/utils/find_chimera.py: W504 + micall/utils/find_reads_in_sam.py: F401,W293,F841 + micall/utils/hcv_reference_tree.py: W504 + micall/utils/hcv_rules_import.py: W503,W504,E126,E241 + micall/utils/make_fda_refs.py: E211,E261,W293,W292 + 
micall/utils/make_hcv_genotype_refs.py: E999 + micall/utils/primer_checks.py: E303,F541 + micall/utils/primer_tracker.py: F541 + micall/utils/projects_dump.py: W504 + micall/utils/ref_aligner.py: F541 + micall/utils/reference_distances.py: E305 + micall/utils/report_amino.py: W504 + micall/utils/sam_g2p_simplify.py: W293,E265,E303,E305 + micall/utils/sample_fastq.py: E305 + micall/utils/sample_sheet_parser.py: E275,W504 + micall/utils/scan_run_folders.py: W504 + micall/utils/v3loop_alignment_scores/plot.py: W293 diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index b03351657..82793748c 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -347,7 +347,8 @@ def concordance_to_cut_points(left_overlap, right_overlap, aligned_left, aligned reverse=True, )] - def remove_dashes(s: str): return s.replace('-', '') + def remove_dashes(s: str): + return s.replace('-', '') for max_concordance_index in sorted_concordance_indexes: aligned_left_q_index = len(remove_dashes(aligned_left[:max_concordance_index])) From 326d7fa29894bd401432d089785e0b2416517fa5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 12 Mar 2024 09:21:11 -0700 Subject: [PATCH 249/485] Run flake8 on CI --- .github/workflows/build-and-test.yml | 4 ++++ requirements-test.txt | 1 + 2 files changed, 5 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index b0eaeb01a..fc81cfa6f 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -67,6 +67,10 @@ jobs: run: | mypy + - name: Run linter + run: | + flake8 micall/ + - name: Test with pytest run: coverage run --source=micall/core,micall/g2p,micall/resistance,micall/monitor -m pytest diff --git a/requirements-test.txt b/requirements-test.txt index a913a3fcc..f326b5ca9 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -9,6 +9,7 @@ coverage==7.4.3 pandas==2.0.2 seaborn==0.12.2 ete3 +flake8==7.0.0 # 
For reading HCV rules from an Excel file. openpyxl==3.1.2 From eb4dee36ec0d96386187b6d3cad5a9102de2b456 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 18:19:09 +0000 Subject: [PATCH 250/485] Bump nokogiri from 1.14.3 to 1.16.2 in /docs Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.14.3 to 1.16.2. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.14.3...v1.16.2) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 5a06d0262..391815cfd 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -214,15 +214,15 @@ GEM rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.8.1) + mini_portile2 (2.8.5) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) minitest (5.22.2) mutex_m (0.2.0) - nokogiri (1.14.3) - mini_portile2 (~> 2.8.0) + nokogiri (1.16.2) + mini_portile2 (~> 2.8.2) racc (~> 1.4) octokit (4.25.1) faraday (>= 1, < 3) @@ -230,7 +230,7 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (4.0.7) - racc (1.6.2) + racc (1.7.3) rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) From 9ba785fc7d6be9ee592124725a5c5abe67590b47 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 18:44:48 +0000 Subject: [PATCH 251/485] Bump pytz from 2023.3 to 2024.1 Bumps [pytz](https://github.com/stub42/pytz) from 2023.3 to 2024.1. 
- [Release notes](https://github.com/stub42/pytz/releases) - [Commits](https://github.com/stub42/pytz/compare/release_2023.3...release_2024.1) --- updated-dependencies: - dependency-name: pytz dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a6f5a261e..6543e7b77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Requirements for running the bare pipeline under Kive or BaseSpace -pytz==2023.3 +pytz==2024.1 git+https://github.com/cfe-lab/iva.git@v1.1.1 biopython==1.81 git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh From 5ad9f5ae0e50dd26c3818af07aa858c44feaf10f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 19:17:02 +0000 Subject: [PATCH 252/485] Bump commonmarker from 0.23.9 to 0.23.10 in /docs Bumps [commonmarker](https://github.com/gjtorikian/commonmarker) from 0.23.9 to 0.23.10. - [Release notes](https://github.com/gjtorikian/commonmarker/releases) - [Changelog](https://github.com/gjtorikian/commonmarker/blob/v0.23.10/CHANGELOG.md) - [Commits](https://github.com/gjtorikian/commonmarker/compare/v0.23.9...v0.23.10) --- updated-dependencies: - dependency-name: commonmarker dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 391815cfd..49b736e1f 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -20,7 +20,7 @@ GEM execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.23.9) + commonmarker (0.23.10) concurrent-ruby (1.2.3) connection_pool (2.4.1) dnsruby (1.61.9) From e5435d47240015d8d78e2478c79579e971835aa4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 13 Mar 2024 07:16:30 +0000 Subject: [PATCH 253/485] Bump biopython from 1.81 to 1.83 Bumps [biopython](https://github.com/biopython/biopython) from 1.81 to 1.83. - [Changelog](https://github.com/biopython/biopython/blob/master/NEWS.rst) - [Commits](https://github.com/biopython/biopython/commits) --- updated-dependencies: - dependency-name: biopython dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6543e7b77..2d66bfd61 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Requirements for running the bare pipeline under Kive or BaseSpace pytz==2024.1 git+https://github.com/cfe-lab/iva.git@v1.1.1 -biopython==1.81 +biopython==1.83 git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2 numpy==1.24.3 From 089910aa98929e2330715cd85844ecc038f001e0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 13 Mar 2024 07:18:23 +0000 Subject: [PATCH 254/485] Bump github-pages and jekyll in /docs Bumps [github-pages](https://github.com/github/pages-gem) and [jekyll](https://github.com/jekyll/jekyll). These dependencies needed to be updated together. 
Updates `github-pages` from 228 to 231 - [Release notes](https://github.com/github/pages-gem/releases) - [Commits](https://github.com/github/pages-gem/compare/v228...v231) Updates `jekyll` from 3.9.3 to 3.9.5 - [Release notes](https://github.com/jekyll/jekyll/releases) - [Changelog](https://github.com/jekyll/jekyll/blob/v3.9.5/History.markdown) - [Commits](https://github.com/jekyll/jekyll/compare/v3.9.3...v3.9.5) --- updated-dependencies: - dependency-name: github-pages dependency-type: direct:development update-type: version-update:semver-major - dependency-name: jekyll dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 90 ++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 49b736e1f..cd39f2718 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -11,20 +11,20 @@ GEM minitest (>= 5.1) mutex_m tzinfo (~> 2.0) - addressable (2.8.1) + addressable (2.8.6) public_suffix (>= 2.0.2, < 6.0) base64 (0.2.0) bigdecimal (3.1.6) coffee-script (2.4.1) coffee-script-source execjs - coffee-script-source (1.11.1) + coffee-script-source (1.12.2) colorator (1.1.0) commonmarker (0.23.10) concurrent-ruby (1.2.3) connection_pool (2.4.1) - dnsruby (1.61.9) - simpleidn (~> 0.1) + dnsruby (1.71.0) + simpleidn (~> 0.2.1) drb (2.2.1) em-websocket (0.5.3) eventmachine (>= 0.12.9) @@ -32,24 +32,24 @@ GEM ethon (0.16.0) ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.8.1) - faraday (2.7.4) - faraday-net_http (>= 2.0, < 3.1) - ruby2_keywords (>= 0.0.4) - faraday-net_http (3.0.2) - ffi (1.15.5) + execjs (2.9.1) + faraday (2.9.0) + faraday-net_http (>= 2.0, < 3.2) + faraday-net_http (3.1.0) + net-http + ffi (1.16.3) forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (228) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.3) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) + gemoji 
(4.1.0) + github-pages (231) + github-pages-health-check (= 1.18.2) + jekyll (= 3.9.5) + jekyll-avatar (= 0.8.0) + jekyll-coffeescript (= 1.2.2) jekyll-commonmark-ghpages (= 0.4.0) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) + jekyll-default-layout (= 0.1.5) + jekyll-feed (= 0.17.0) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) + jekyll-github-metadata (= 2.16.1) jekyll-include-cache (= 0.2.1) jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) @@ -76,20 +76,20 @@ GEM jekyll-theme-tactile (= 0.2.0) jekyll-theme-time-machine (= 0.2.0) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.2) + jemoji (= 0.13.0) + kramdown (= 2.4.0) kramdown-parser-gfm (= 1.1.0) liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) nokogiri (>= 1.13.6, < 2.0) - rouge (= 3.26.0) + rouge (= 3.30.0) terminal-table (~> 1.4) - github-pages-health-check (1.17.9) + github-pages-health-check (1.18.2) addressable (~> 2.3) dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) + octokit (>= 4, < 8) + public_suffix (>= 3.0, < 6.0) typhoeus (~> 1.3) html-pipeline (2.14.3) activesupport (>= 2) @@ -97,7 +97,7 @@ GEM http_parser.rb (0.8.0) i18n (1.14.4) concurrent-ruby (~> 1.0) - jekyll (3.9.3) + jekyll (3.9.5) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) @@ -110,11 +110,11 @@ GEM pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) + jekyll-avatar (0.8.0) jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) + jekyll-coffeescript (1.2.2) coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) + coffee-script-source (~> 1.12) jekyll-commonmark (1.4.0) commonmarker (~> 0.22) jekyll-commonmark-ghpages (0.4.0) @@ -122,15 +122,15 @@ GEM jekyll (~> 3.9.0) jekyll-commonmark (~> 1.4.0) rouge (>= 2.0, < 5.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) + jekyll-default-layout (0.1.5) + jekyll (>= 3.0, < 5.0) + jekyll-feed (0.17.0) jekyll (>= 3.7, < 5.0) 
jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.13.0) + jekyll-github-metadata (2.16.1) jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) + octokit (>= 4, < 7, != 4.4.0) jekyll-include-cache (0.2.1) jekyll (>= 3.7, < 5.0) jekyll-mentions (1.6.0) @@ -201,16 +201,16 @@ GEM jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) + jemoji (0.13.0) + gemoji (>= 3, < 5) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (2.3.2) + kramdown (2.4.0) rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) liquid (4.0.4) - listen (3.8.0) + listen (3.9.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) @@ -221,6 +221,8 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.22.2) mutex_m (0.2.0) + net-http (0.4.1) + uri nokogiri (1.16.2) mini_portile2 (~> 2.8.2) racc (~> 1.4) @@ -229,14 +231,13 @@ GEM sawyer (~> 0.9) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (4.0.7) + public_suffix (5.0.4) racc (1.7.3) rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.5) - rouge (3.26.0) - ruby2_keywords (0.0.5) + rexml (3.2.6) + rouge (3.30.0) rubyzip (2.3.2) safe_yaml (1.0.5) sass (3.7.4) @@ -251,14 +252,15 @@ GEM unf (~> 0.1.4) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) - typhoeus (1.4.0) + typhoeus (1.4.1) ethon (>= 0.9.0) tzinfo (2.0.6) concurrent-ruby (~> 1.0) unf (0.1.4) unf_ext - unf_ext (0.0.8.2) + unf_ext (0.0.9.1) unicode-display_width (1.8.0) + uri (0.13.0) PLATFORMS ruby From ab38d118a97625ac9072a7e722b3dfbc4290eb68 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 13 Mar 2024 15:10:59 +0000 Subject: [PATCH 255/485] Bump mypy from 1.7.0 to 1.9.0 Bumps [mypy](https://github.com/python/mypy) from 1.7.0 to 1.9.0. 
- [Changelog](https://github.com/python/mypy/blob/master/CHANGELOG.md) - [Commits](https://github.com/python/mypy/compare/v1.7.0...1.9.0) --- updated-dependencies: - dependency-name: mypy dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- requirements-test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 0d407499a..f8e370d54 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,5 +3,5 @@ -r requirements-watcher.txt # Used for plotting profiling results. gprof2dot==2022.7.29 -mypy==1.7.0 +mypy==1.9.0 mypy-extensions==1.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index f326b5ca9..448aed78f 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,7 +2,7 @@ -r requirements.txt -r requirements-watcher.txt -mypy==1.7.0 +mypy==1.9.0 mypy-extensions==1.0.0 pytest==8.1.1 coverage==7.4.3 From 4697d1954b179c4dbd9d6523340dc9b7572fc7ba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 14 Mar 2024 15:29:50 +0000 Subject: [PATCH 256/485] Bump seaborn from 0.12.2 to 0.13.2 Bumps [seaborn](https://github.com/mwaskom/seaborn) from 0.12.2 to 0.13.2. - [Release notes](https://github.com/mwaskom/seaborn/releases) - [Commits](https://github.com/mwaskom/seaborn/compare/v0.12.2...v0.13.2) --- updated-dependencies: - dependency-name: seaborn dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index f326b5ca9..ec8bf0bc0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -7,7 +7,7 @@ mypy-extensions==1.0.0 pytest==8.1.1 coverage==7.4.3 pandas==2.0.2 -seaborn==0.12.2 +seaborn==0.13.2 ete3 flake8==7.0.0 From 3fe11bbafcd006f9667dc4c21a2627764cf9462d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 14 Mar 2024 19:10:49 +0000 Subject: [PATCH 257/485] Bump reportlab from 4.0.4 to 4.1.0 Bumps [reportlab](https://www.reportlab.com/) from 4.0.4 to 4.1.0. --- updated-dependencies: - dependency-name: reportlab dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2d66bfd61..4bac31ef9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ matplotlib==3.7.3 cutadapt==3.7 python-Levenshtein==0.25.0 PyYAML==6.0.1 -reportlab==4.0.4 +reportlab==4.1.0 pysam==0.21.0 git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 From 836bd5720ee4f440f40cbbc1b63a7184d1e01c19 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 14 Mar 2024 22:08:45 +0000 Subject: [PATCH 258/485] Bump pysam from 0.21.0 to 0.22.0 Bumps [pysam](https://github.com/pysam-developers/pysam) from 0.21.0 to 0.22.0. - [Release notes](https://github.com/pysam-developers/pysam/releases) - [Changelog](https://github.com/pysam-developers/pysam/blob/master/NEWS) - [Commits](https://github.com/pysam-developers/pysam/compare/v0.21.0...v0.22.0) --- updated-dependencies: - dependency-name: pysam dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4bac31ef9..e7d3c0128 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ cutadapt==3.7 python-Levenshtein==0.25.0 PyYAML==6.0.1 reportlab==4.1.0 -pysam==0.21.0 +pysam==0.22.0 git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 drawsvg==2.3.0 From 6e4f54d96ec5241a8fe87233e41b66e57707db3c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 15 Mar 2024 07:33:54 +0000 Subject: [PATCH 259/485] Bump coverage from 7.4.3 to 7.4.4 Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.4.3 to 7.4.4. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.4.3...7.4.4) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index fa2b9ccca..5cd1368c6 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,7 +5,7 @@ mypy==1.9.0 mypy-extensions==1.0.0 pytest==8.1.1 -coverage==7.4.3 +coverage==7.4.4 pandas==2.0.2 seaborn==0.13.2 ete3 From c5b61c36ce44c5ba2525710b34252675d68c1211 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 15 Mar 2024 07:33:59 +0000 Subject: [PATCH 260/485] Bump cutadapt from 3.7 to 4.7 Bumps [cutadapt](https://github.com/marcelm/cutadapt) from 3.7 to 4.7. 
- [Release notes](https://github.com/marcelm/cutadapt/releases) - [Changelog](https://github.com/marcelm/cutadapt/blob/main/CHANGES.rst) - [Commits](https://github.com/marcelm/cutadapt/compare/v3.7...v4.7) --- updated-dependencies: - dependency-name: cutadapt dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e7d3c0128..f74fb0256 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2 numpy==1.24.3 scipy==1.10.1 matplotlib==3.7.3 -cutadapt==3.7 +cutadapt==4.7 python-Levenshtein==0.25.0 PyYAML==6.0.1 reportlab==4.1.0 From 036968a808c68f03c882227e7c751af7a0498abd Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 18 Mar 2024 10:13:48 -0700 Subject: [PATCH 261/485] Move CigarActions definition to cigar_tools.py --- micall/core/aln2counts.py | 3 ++- micall/tests/test_consensus_aligner.py | 3 ++- micall/tests/test_contig_stitcher_fuzz.py | 3 +-- micall/tests/utils.py | 2 +- micall/utils/cigar_tools.py | 11 ++++++++--- micall/utils/consensus_aligner.py | 10 ++-------- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py index ee20ea053..a52e2d404 100755 --- a/micall/core/aln2counts.py +++ b/micall/core/aln2counts.py @@ -28,7 +28,8 @@ from micall.core.remap import PARTIAL_CONTIG_SUFFIX, REVERSED_CONTIG_SUFFIX from micall.data.landmark_reader import LandmarkReader from micall.utils.big_counter import BigCounter -from micall.utils.consensus_aligner import CigarActions, ConsensusAligner +from micall.utils.consensus_aligner import ConsensusAligner +from micall.utils.cigar_tools import CigarActions from micall.utils.report_amino import ReportAmino, MAX_CUTOFF, SeedAmino, AMINO_ALPHABET, ReportNucleotide, \ SeedNucleotide from micall.utils.spring_beads import 
Wire, Bead diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 0d6b36e63..015af9318 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -5,7 +5,8 @@ from pytest import approx from micall.core.aln2counts import SeedAmino, ReportAmino -from micall.utils.consensus_aligner import ConsensusAligner, AlignmentWrapper, CigarActions, AminoAlignment +from micall.utils.consensus_aligner import ConsensusAligner, AlignmentWrapper, AminoAlignment +from micall.utils.cigar_tools import CigarActions from micall.core.project_config import ProjectConfig # noinspection PyUnresolvedReferences diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index 9b82db2f9..2b361cbde 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -11,8 +11,7 @@ ) import micall.core.contig_stitcher as stitcher from micall.core.plot_contigs import build_stitcher_figure -from micall.utils.cigar_tools import CigarHit, Cigar -from micall.utils.consensus_aligner import CigarActions +from micall.utils.cigar_tools import CigarHit, Cigar, CigarActions from typing import Dict, List from collections import defaultdict diff --git a/micall/tests/utils.py b/micall/tests/utils.py index 49fddbc07..7192ea9ad 100644 --- a/micall/tests/utils.py +++ b/micall/tests/utils.py @@ -2,7 +2,7 @@ from contextlib import contextmanager import random -from micall.utils.consensus_aligner import CigarActions +from micall.utils.cigar_tools import CigarActions def find_all_occurrences(s, substring): diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py index 33c44be3b..41b0f3bc9 100644 --- a/micall/utils/cigar_tools.py +++ b/micall/utils/cigar_tools.py @@ -1,7 +1,7 @@ """ Module for handling CIGAR strings and related alignment formats. 
""" - +from enum import IntEnum from math import ceil, floor import re from typing import Tuple, Iterable, Optional, Set, List, Union @@ -9,8 +9,6 @@ from functools import cached_property, reduce from fractions import Fraction -from micall.utils.consensus_aligner import CigarActions - class IntDict(dict): """ @@ -116,6 +114,13 @@ def __repr__(self): return f'CoordinateMapping({self.ref_to_op},{self.query_to_op})' +# Mapping as defined in https://samtools.github.io/hts-specs/SAMv1.pdf, page 8 +CigarActions = IntEnum( + 'CigarActions', + 'MATCH INSERT DELETE SKIPPED SOFT_CLIPPED HARD_CLIPPED PADDING SEQ_MATCH MISMATCH', + start=0) + + class Cigar: """ Represents an alignment between a query sequence and a reference sequence using the diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 79c23fbb3..adc2fa794 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -1,6 +1,5 @@ -from typing import Dict, List, Optional, Iterable, Set, Any +from typing import Dict, List, Optional, Iterable, Set from dataclasses import dataclass, replace -from enum import IntEnum from itertools import count from operator import attrgetter import csv @@ -11,6 +10,7 @@ from mappy import Alignment, Aligner from micall.core.project_config import ProjectConfig +from micall.utils.cigar_tools import CigarActions from micall.utils.report_amino import SeedAmino, ReportAmino, ReportNucleotide, SeedNucleotide from micall.utils.translation import translate @@ -23,12 +23,6 @@ # Most codons in an insertion or deletion that is still aligned in amino acids. 
MAXIMUM_AMINO_GAP = 10 -# Mapping as defined in https://samtools.github.io/hts-specs/SAMv1.pdf, page 8 -CigarActions = IntEnum( - 'CigarActions', - 'MATCH INSERT DELETE SKIPPED SOFT_CLIPPED HARD_CLIPPED PADDING SEQ_MATCH MISMATCH', - start=0) - def align_aminos(reference: str, query: str, From 1727e951e1fe1c009cfcc24ea764a45ac7411a3b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 19 Mar 2024 15:40:03 -0700 Subject: [PATCH 262/485] Add aligntools to the list of dependencies --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f74fb0256..c8499af3d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ pysam==0.22.0 git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 drawsvg==2.3.0 -cairosvg==2.7.1 \ No newline at end of file +cairosvg==2.7.1 +git+https://github.com/cfe-lab/aligntools.git@v1.0.0 From ba01749dad784ed7e6457678bba7d967e2532037 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 19 Mar 2024 15:46:04 -0700 Subject: [PATCH 263/485] Switch to using aligntools instead of utils.cigar_tools --- .mypy.ini | 3 +++ micall/core/aln2counts.py | 2 +- micall/core/contig_stitcher.py | 2 +- micall/core/plot_contigs.py | 2 +- micall/tests/test_consensus_aligner.py | 2 +- micall/tests/test_contig_stitcher_fuzz.py | 2 +- micall/tests/utils.py | 3 +-- micall/utils/consensus_aligner.py | 2 +- micall/utils/contig_stitcher_contigs.py | 2 +- micall/utils/contig_stitcher_events.py | 2 +- 10 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.mypy.ini b/.mypy.ini index 4ca29fa84..54091654f 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -9,6 +9,9 @@ check_untyped_defs = True [mypy-micall.core.plot_contigs] check_untyped_defs = False +[mypy-aligntools] +ignore_missing_imports = true + [mypy-mappy] ignore_missing_imports = true diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py index a52e2d404..44a467123 100755 --- a/micall/core/aln2counts.py 
+++ b/micall/core/aln2counts.py @@ -29,7 +29,7 @@ from micall.data.landmark_reader import LandmarkReader from micall.utils.big_counter import BigCounter from micall.utils.consensus_aligner import ConsensusAligner -from micall.utils.cigar_tools import CigarActions +from aligntools import CigarActions from micall.utils.report_amino import ReportAmino, MAX_CUTOFF, SeedAmino, AMINO_ALPHABET, ReportNucleotide, \ SeedNucleotide from micall.utils.spring_beads import Wire, Bead diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 82793748c..8cec40242 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -11,8 +11,8 @@ import logging from fractions import Fraction from operator import itemgetter +from aligntools import Cigar, connect_cigar_hits, CigarHit -from micall.utils.cigar_tools import Cigar, connect_cigar_hits, CigarHit from micall.utils.contig_stitcher_context import context, StitcherContext from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig import micall.utils.contig_stitcher_events as events diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index e653c40d1..cc879b3f3 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -11,6 +11,7 @@ import logging import yaml +from aligntools import CigarHit from genetracks import Figure, Track, Multitrack, Coverage # noinspection PyPep8Naming import drawsvg as draw @@ -21,7 +22,6 @@ from micall.core.project_config import ProjectConfig from micall.utils.alignment_wrapper import align_nucs from micall.utils.contig_stitcher_contigs import Contig, GenotypedContig, AlignedContig -from micall.utils.cigar_tools import CigarHit import micall.utils.contig_stitcher_events as events from micall.data.landmark_reader import LandmarkReader diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 015af9318..301b25390 100644 --- a/micall/tests/test_consensus_aligner.py 
+++ b/micall/tests/test_consensus_aligner.py @@ -6,7 +6,7 @@ from micall.core.aln2counts import SeedAmino, ReportAmino from micall.utils.consensus_aligner import ConsensusAligner, AlignmentWrapper, AminoAlignment -from micall.utils.cigar_tools import CigarActions +from aligntools import CigarActions from micall.core.project_config import ProjectConfig # noinspection PyUnresolvedReferences diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index 2b361cbde..d23ea9cd3 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -11,7 +11,7 @@ ) import micall.core.contig_stitcher as stitcher from micall.core.plot_contigs import build_stitcher_figure -from micall.utils.cigar_tools import CigarHit, Cigar, CigarActions +from aligntools import CigarHit, Cigar, CigarActions from typing import Dict, List from collections import defaultdict diff --git a/micall/tests/utils.py b/micall/tests/utils.py index 7192ea9ad..39eda8940 100644 --- a/micall/tests/utils.py +++ b/micall/tests/utils.py @@ -1,8 +1,7 @@ from dataclasses import dataclass from contextlib import contextmanager import random - -from micall.utils.cigar_tools import CigarActions +from aligntools import CigarActions def find_all_occurrences(s, substring): diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index adc2fa794..7d182e466 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -5,12 +5,12 @@ import csv import os import logging +from aligntools import CigarActions from gotoh import align_it, align_it_aa from mappy import Alignment, Aligner from micall.core.project_config import ProjectConfig -from micall.utils.cigar_tools import CigarActions from micall.utils.report_amino import SeedAmino, ReportAmino, ReportNucleotide, SeedNucleotide from micall.utils.translation import translate diff --git a/micall/utils/contig_stitcher_contigs.py 
b/micall/utils/contig_stitcher_contigs.py index 6c8ff6021..42a6807f2 100644 --- a/micall/utils/contig_stitcher_contigs.py +++ b/micall/utils/contig_stitcher_contigs.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from typing import Optional, Literal -from micall.utils.cigar_tools import CigarHit +from aligntools import CigarHit @dataclass(frozen=True) diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index b9c889462..931d00a88 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -1,8 +1,8 @@ from typing import Union, List, Tuple, Literal from dataclasses import dataclass from fractions import Fraction +from aligntools import CigarHit -from micall.utils.cigar_tools import CigarHit from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig From 2c799fd1dcd2c576894040a031c4c234abaede3f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 19 Mar 2024 15:46:31 -0700 Subject: [PATCH 264/485] Remove cigar_tools.py --- micall/tests/test_cigar_tools.py | 725 ------------------------------ micall/utils/cigar_tools.py | 736 ------------------------------- 2 files changed, 1461 deletions(-) delete mode 100644 micall/tests/test_cigar_tools.py delete mode 100644 micall/utils/cigar_tools.py diff --git a/micall/tests/test_cigar_tools.py b/micall/tests/test_cigar_tools.py deleted file mode 100644 index 13f43a8b9..000000000 --- a/micall/tests/test_cigar_tools.py +++ /dev/null @@ -1,725 +0,0 @@ -import pytest -from math import floor -import re - -from micall.utils.cigar_tools import ( - Cigar, - CigarHit, - connect_cigar_hits, -) - - -cigar_mapping_cases = [ - # Simple cases - ("3M", {0: 0, 1: 1, 2: 2}, {0: 0, 1: 1, 2: 2}), # exact mapping # closest mapping - ("1M1D1M", {0: 0, 2: 1}, {0: 0, 1: 0, 2: 1}), # exact mapping # closest mapping - ("1M1I1M", {0: 0, 1: 2}, {0: 0, 1: 2}), - ("2M2D2M", {0: 0, 1: 1, 4: 2, 5: 3}, {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3}), - 
("2M2I2M", {0: 0, 1: 1, 2: 4, 3: 5}, {0: 0, 1: 1, 2: 4, 3: 5}), - ( - "3M1D3M", - {0: 0, 1: 1, 2: 2, 4: 3, 5: 4, 6: 5}, - {0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 5}, - ), - ( - "3M1I3M", - {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6}, - {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6}, - ), - ( - "7M1I3M", - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8, 8: 9, 9: 10}, - ), - ( - "5M2D4M", - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 7: 5, 8: 6, 9: 7, 10: 8}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4, 7: 5, 8: 6, 9: 7, 10: 8}, - ), - ( - "5M3I4M", - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8, 6: 9, 7: 10, 8: 11}, - ), - ("1M1D", {0: 0}, {0: 0, 1: 0}), - ("1M1I", {0: 0}, {0: 0}), - ("1I1M", {0: 1}, {0: 1}), - ("1D1M", {1: 0}, {1: 0, 0: None}), - # Multiple deletions and insertions - ( - "2M2D2M2I2M", - {0: 0, 1: 1, 4: 2, 5: 3, 6: 6, 7: 7}, - {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 3, 6: 6, 7: 7}, - ), - ( - "2M2I2M2D2M", - {0: 0, 1: 1, 2: 4, 3: 5, 6: 6, 7: 7}, - {0: 0, 1: 1, 2: 4, 3: 5, 4: 5, 5: 5, 6: 6, 7: 7}, - ), - ( - "2=1X2N1N2=1H2S", - {0: 0, 1: 1, 2: 2, 6: 3, 7: 4}, - {0: 0, 1: 1, 2: 2, 3: 2, 4: 2, 5: 2, 6: 3, 7: 4}, - ), - ( - "3=1X2N1N2=1H2S", - {0: 0, 1: 1, 2: 2, 3: 3, 7: 4, 8: 5}, - {0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 3, 7: 4, 8: 5}, - ), - # Edge cases - ("", {}, {}), - ("3I3D", {}, {0: None, 1: None, 2: None}), - ("3D3I", {}, {0: None, 1: None, 2: None}), - ("12I", {}, {}), - ("12D", {}, {k: None for k in range(12)}), -] - - -@pytest.mark.parametrize( - "cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases] -) -def test_cigar_to_coordinate_mapping(cigar_str, expected_mapping): - mapping = Cigar.coerce(cigar_str).coordinate_mapping - assert expected_mapping == mapping.ref_to_query - - -@pytest.mark.parametrize("cigar_str", [x[0] for x in cigar_mapping_cases]) -def test_cigar_to_coordinate_bijection_property(cigar_str): - def inverse(d): return {v: k 
for k, v in d.items()} - - mapping = Cigar.coerce(cigar_str).coordinate_mapping - - assert mapping.query_to_ref == inverse(mapping.ref_to_query) - assert mapping.ref_to_query == inverse(mapping.query_to_ref) - assert mapping.ref_to_query == inverse(inverse(mapping.ref_to_query)) - assert mapping.query_to_ref == inverse(inverse(mapping.query_to_ref)) - - -@pytest.mark.parametrize( - "cigar_str, expected_leftmax_mapping", [(x[0], x[2]) for x in cigar_mapping_cases] -) -def test_cigar_to_coordinate_mapping_leftmax(cigar_str, expected_leftmax_mapping): - mapping = Cigar.coerce(cigar_str).coordinate_mapping - - fullrange = { - i: mapping.ref_to_query.left_max(i) for i in mapping.ref_to_query.domain - } - assert expected_leftmax_mapping == fullrange - - -@pytest.mark.parametrize( - "cigar_str, expected_mapping", [(x[0], x[1]) for x in cigar_mapping_cases] -) -def test_cigar_hit_to_coordinate_mapping(cigar_str, expected_mapping): - cigar = Cigar.coerce(cigar_str) - hit = CigarHit( - cigar, - r_st=5, - r_ei=(5 + cigar.ref_length - 1), - q_st=7, - q_ei=(7 + cigar.query_length - 1), - ) - mapping = hit.coordinate_mapping - - # Coordinates are translated by q_st and r_st. 
- expected_mapping = { - k + hit.r_st: v + hit.q_st for (k, v) in expected_mapping.items() - } - assert mapping.ref_to_query.get(0, None) is None - assert mapping.query_to_ref.get(0, None) is None - assert expected_mapping == { - i: mapping.ref_to_query[i] for i in mapping.ref_to_query.keys() - } - - -@pytest.mark.parametrize( - "cigar_str, expected_leftmax_mapping", [(x[0], x[2]) for x in cigar_mapping_cases] -) -def test_cigar_hit_to_coordinate_mapping_leftmax(cigar_str, expected_leftmax_mapping): - cigar = Cigar.coerce(cigar_str) - hit = CigarHit( - cigar, - r_st=5, - r_ei=(5 + cigar.ref_length - 1), - q_st=7, - q_ei=(7 + cigar.query_length - 1), - ) - mapping = hit.coordinate_mapping - expected = { - k + hit.r_st: v + hit.q_st if v is not None else v - for (k, v) in expected_leftmax_mapping.items() - } - fullrange = { - i: mapping.ref_to_query.left_max(i) for i in mapping.ref_to_query.domain - } - assert expected == fullrange - - -def test_invalid_operation_in_cigar_string(): - with pytest.raises(ValueError): - Cigar.coerce("3M1Z3M") # Z operation is not implemented - - -def test_invalid_operation_in_cigar_list(): - with pytest.raises(ValueError): - Cigar.coerce([(3, 42)]) # Operation code "42" does not exist - - -def test_invalid_cigar_string(): - with pytest.raises(ValueError): - Cigar.coerce("3MMMMMM3M") # Too many Ms - with pytest.raises(ValueError): - Cigar.coerce("3") # Not enough Ms - - -CIGAR_REGEX = re.compile(r"(.*)@([0-9]+)->([0-9]+)") - - -def parsed_hit(string): - match = CIGAR_REGEX.match(string) - assert match, f"Cannot parse {string}" - cigar_str, q_st, r_st = match.groups() - cigar = Cigar.coerce(cigar_str) - r_ei = int(r_st) + cigar.ref_length - 1 - q_ei = int(q_st) + cigar.query_length - 1 - return CigarHit(cigar, int(r_st), int(r_ei), int(q_st), int(q_ei)) - - -cigar_hit_ref_cut_cases = [ - # Trivial cases - ("4M@1->1", 2.5, ["2M@1->1", "2M@3->3"]), - ("9M@1->1", 3.5, ["3M@1->1", "6M@4->4"]), - ("9M@1->1", 4.5, ["4M@1->1", "5M@5->5"]), - 
("9M@0->0", 3.5, ["4M@0->0", "5M@4->4"]), - # Simple cases - ("9M9D9M@1->1", 3.5, ["3M@1->1", "6M9D9M@4->4"]), - ("9M9D9M@1->1", 20.5, ["9M9D2M@1->1", "7M@12->21"]), - ("9M9I9M@1->1", 3.5, ["3M@1->1", "6M9I9M@4->4"]), - ("9M9I9M@1->1", 13.5 or 27 / 2, ["9M9I4M@1->1", "5M@23->14"]), - ("5M6I@1->1", 3.5, ["3M@1->1", "2M6I@4->4"]), - ("6I5M@1->1", 3.5, ["6I3M@1->1", "2M@10->4"]), - ("5M6D@1->1", 3.5, ["3M@1->1", "2M6D@4->4"]), - ("6D5M@1->1", 3.5, ["3D@1->1", "3D5M@1->4"]), - ("5M6D@1->1", 7.5, ["5M2D@1->1", "4D@6->8"]), - ("6D5M@1->1", 7.5, ["6D1M@1->1", "4M@2->8"]), - ("6D5M@1->1", 6.5, ["6D@1->1", "5M@1->7"]), - # Ambigous cases - ("9M9D9M@1->1", 13.5 or 27 / 2, ["9M4D@1->1", "5D9M@10->14"]), - ("9M9I9M@1->1", 9.2, ["9M1I@1->1", "8I9M@11->10"]), - ("9M9D9I9M@1->1", 13.5 or 27 / 2, ["9M4D@1->1", "5D9I9M@10->14"]), - ("9M9I9D9M@1->1", 13.5 or 27 / 2, ["9M9I4D@1->1", "5D9M@19->14"]), - ("9M9D9I9D9I9D9M@1->1", 13.5 or 27 / 2, ["9M4D@1->1", "5D9I9D9I9D9M@10->14"]), - ("9M9I9D9I9D9I9M@1->1", 13.5 or 27 / 2, ["9M9I4D@1->1", "5D9I9D9I9M@19->14"]), - ( - "1M1I1D1M@1->1", - 1.5, # same as previous 2 cases but smaller - ["1M1I@1->1", "1D1M@3->2"], - ), - ( - "1M1D1I1M@1->1", - 1.5, # same as previous 2 cases but smaller - ["1M@1->1", "1D1I1M@2->2"], - ), - # Edge cases - ("9M9I9M@1->1", 9.5, ["9M5I@1->1", "4I9M@15->10"]), # no middlepoint - ("9M@1->1", 8.5, ["8M@1->1", "1M@9->9"]), - ("9M@1->1", 9.5, ["9M@1->1", "@10->10"]), - ("7M@3->3", 2.5, ["@3->3", "7M@3->3"]), - ("9M@1->1", 0.5, ["@1->1", "9M@1->1"]), - ("9M@0->0", -0.5, ["@0->0", "9M@0->0"]), - ("9D@1->1", 3.5, ["3D@1->1", "6D@1->4"]), - ("9D@0->0", -0.5, ["@0->0", "9D@0->0"]), - ("1M7I1M@1->1", 1.5, ["1M4I@1->1", "3I1M@6->2"]), - ("1M6I1M@1->1", 1.5, ["1M3I@1->1", "3I1M@5->2"]), - ("1M7I1M@1->1", 1.999, ["1M7I@1->1", "1M@9->2"]), - ("1M7I1M@1->1", 1.001, ["1M@1->1", "7I1M@2->2"]), - ("2=1X2N1N2=1H2S@1->1", 3.5, ["2=1X@1->1", "3N2=1H2S@4->4"]), - # Negative cases - ( - "9M9I9M@1->1", - 20.5, - IndexError("Cut point out 
of reference bounds"), - ), # 20.5 is bigger than reference (18) - ( - "@2->2", - 2.5, - IndexError("Cut point out of reference bounds"), - ), # Empty string cannot be cut - ( - "@2->2", - 1.5, - IndexError("Cut point out of reference bounds"), - ), # Empty string cannot be cut - ("9I@1->1", 3.5, IndexError("Cut point out of reference bounds")), - ("9M@1->1", 4, ValueError("Cut accepts fractions, not integers")), -] - - -@pytest.mark.parametrize("hit, cut_point, expected_result", cigar_hit_ref_cut_cases) -def test_cigar_hit_ref_cut(hit, cut_point, expected_result): - hit = parsed_hit(hit) - - if isinstance(expected_result, Exception): - with pytest.raises(type(expected_result)) as exc_info: - hit.cut_reference(cut_point) - assert exc_info.value.args == expected_result.args - - else: - expected_result = list(map(parsed_hit, expected_result)) - expected_left, expected_right = expected_result - left, right = hit.cut_reference(cut_point) - assert expected_left == left - assert expected_right == right - - -@pytest.mark.parametrize( - "hit, cut_point", - [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)], -) -def test_cigar_hit_ref_cut_add_prop(hit, cut_point): - hit = parsed_hit(hit) - left, right = hit.cut_reference(cut_point) - assert left + right == hit - - -@pytest.mark.parametrize( - "hit, cut_point", - [ - (x[0], x[1]) - for x in [x for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] - ], -) -def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point): - hit = parsed_hit(hit) - percentage = cut_point - floor(cut_point) - - for cut_point in range(hit.r_st, hit.r_ei + 2): - left, right = hit.cut_reference(cut_point - percentage) - assert left + right == hit - - -lstrip_query_cases = [ - ("9M@1->1", "9M@1->1"), - ("5M6D@1->1", "5M6D@1->1"), - ("6D5M@1->1", "6D5M@1->1"), - ("6I5M@1->1", "5M@7->1"), - ("6I4D5M@1->1", "4D5M@7->1"), - ("6D4I5M@1->1", "6D5M@5->1"), - ("3D3D4I5M@1->1", "6D5M@5->1"), - ("3I3I4D5M@1->1", 
"4D5M@7->1"), - ("3D2I3D2I5M@1->1", "6D5M@5->1"), - ("3I2D3I2D5M@1->1", "4D5M@7->1"), - ("4D6I5M@1->1", "4D5M@7->1"), - ("4I6D5M@1->1", "6D5M@5->1"), - ("6I4D@1->1", "4D@7->1"), - ("6D4I@1->1", "6D@5->1"), - ("4D6I@1->1", "4D@7->1"), - ("4I6D@1->1", "6D@5->1"), - ("4I@1->1", "@5->1"), - ("4D@1->1", "4D@1->1"), - ("@1->1", "@1->1"), -] - - -rstrip_query_cases = [ - ("9M@1->1", "9M@1->1"), - ("5M6D@1->1", "5M6D@1->1"), - ("5M6I@1->1", "5M@1->1"), - ("6D5M@1->1", "6D5M@1->1"), - ("5M4I6D@1->1", "5M6D@1->1"), - ("5M4D6I@1->1", "5M4D@1->1"), - ("5M4I3D3D@1->1", "5M6D@1->1"), - ("5M4D3I3I@1->1", "5M4D@1->1"), - ("5M2I3D2I3D@1->1", "5M6D@1->1"), - ("5M2D3I2D3I@1->1", "5M4D@1->1"), - ("5M6D4I@1->1", "5M6D@1->1"), - ("5M6I4D@1->1", "5M4D@1->1"), - ("6D4I@1->1", "6D@1->1"), - ("6I4D@1->1", "4D@1->1"), - ("4I6D@1->1", "6D@1->1"), - ("4D6I@1->1", "4D@1->1"), - ("4I@1->1", "@1->1"), - ("4D@1->1", "4D@1->1"), - ("@1->1", "@1->1"), -] - - -lstrip_reference_cases = [ - ("9M@1->1", "9M@1->1"), - ("5M6D@1->1", "5M6D@1->1"), - ("6D5M@1->1", "5M@1->7"), - ("6I5M@1->1", "6I5M@1->1"), - ("6I4D5M@1->1", "6I5M@1->5"), - ("6D4I5M@1->1", "4I5M@1->7"), - ("6D4I5M@1->1", "4I5M@1->7"), - ("6I4D5M@1->1", "6I5M@1->5"), - ("3D2I3D2I5M@1->1", "4I5M@1->7"), - ("3I2D3I2D5M@1->1", "6I5M@1->5"), - ("4D6I5M@1->1", "6I5M@1->5"), - ("4I6D5M@1->1", "4I5M@1->7"), - ("6I4D@1->1", "6I@1->5"), - ("6D4I@1->1", "4I@1->7"), - ("4D6I@1->1", "6I@1->5"), - ("4I6D@1->1", "4I@1->7"), - ("4I@1->1", "4I@1->1"), - ("4D@1->1", "@1->5"), - ("@1->1", "@1->1"), -] - - -rstrip_reference_cases = [ - ("9M@1->1", "9M@1->1"), - ("5M6D@1->1", "5M@1->1"), - ("5M6I@1->1", "5M6I@1->1"), - ("6D5M@1->1", "6D5M@1->1"), - ("5M4I6D@1->1", "5M4I@1->1"), - ("5M4D6I@1->1", "5M6I@1->1"), - ("5M4I6D@1->1", "5M4I@1->1"), - ("5M4D6I@1->1", "5M6I@1->1"), - ("5M2I3D2I3D@1->1", "5M4I@1->1"), - ("5M2D3I2D3I@1->1", "5M6I@1->1"), - ("5M6D4I@1->1", "5M4I@1->1"), - ("5M6I4D@1->1", "5M6I@1->1"), - ("6D4I@1->1", "4I@1->1"), - ("6I4D@1->1", "6I@1->1"), - 
("4I6D@1->1", "4I@1->1"), - ("4D6I@1->1", "6I@1->1"), - ("4I@1->1", "4I@1->1"), - ("4D@1->1", "@1->1"), - ("@1->1", "@1->1"), -] - - -strip_prop_cases_all = ( - [x[0] for x in cigar_hit_ref_cut_cases] - + [x[0] for x in lstrip_query_cases] - + [x[0] for x in rstrip_query_cases] -) - - -@pytest.mark.parametrize("hit, expected", lstrip_query_cases) -def test_cigar_hit_lstrip_reference(hit, expected): - hit = parsed_hit(hit) - expected = parsed_hit(expected) - assert expected == hit.lstrip_query() - - -@pytest.mark.parametrize("hit, expected", rstrip_query_cases) -def test_cigar_hit_rstrip_reference(hit, expected): - hit = parsed_hit(hit) - expected = parsed_hit(expected) - assert expected == hit.rstrip_query() - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_reference_strip_combines_with_connect(hit): - hit = parsed_hit(hit) - - for cut_point in range(hit.r_st - 1, hit.r_ei): - left, right = hit.cut_reference(cut_point + hit.epsilon) - - left = left.rstrip_query() - right = right.lstrip_query() - - assert ( - left.connect(right).coordinate_mapping.ref_to_query - == hit.coordinate_mapping.ref_to_query - ) - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_reference_strip_combines_with_add(hit): - hit = parsed_hit(hit) - - for cut_point in range(hit.r_st - 1, hit.r_ei): - left, right = hit.cut_reference(cut_point + hit.epsilon) - - left = left.rstrip_query() - right = right.lstrip_query() - - if left.touches(right): - assert left + right == hit - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_reference_strip_never_crashes(hit): - hit = parsed_hit(hit) - - hit.rstrip_query().lstrip_query() - hit.lstrip_query().rstrip_query() - hit.lstrip_query().lstrip_query() - hit.rstrip_query().rstrip_query() - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_reference_strip_is_idempotent(hit): - hit = parsed_hit(hit) - - h1 = hit.rstrip_query() - assert h1 == 
h1.rstrip_query() == h1.rstrip_query().rstrip_query() - - h1 = hit.lstrip_query() - assert h1 == h1.lstrip_query() == h1.lstrip_query().lstrip_query() - - h1 = hit.lstrip_query().rstrip_query() - assert h1 == h1.lstrip_query() == h1.rstrip_query() - - h1 = hit.rstrip_query().lstrip_query() - assert h1 == h1.rstrip_query() == h1.lstrip_query() - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_reference_strips_are_commutative(hit): - hit = parsed_hit(hit) - - if len(hit.cigar.coordinate_mapping.ref_to_query) > 0: - assert hit.rstrip_query().lstrip_query() == hit.lstrip_query().rstrip_query() - else: - assert ( - hit.rstrip_query().lstrip_query().cigar - == hit.lstrip_query().rstrip_query().cigar - ) - - -@pytest.mark.parametrize("hit, expected", lstrip_reference_cases) -def test_cigar_hit_lstrip_query(hit, expected): - hit = parsed_hit(hit) - expected = parsed_hit(expected) - assert expected == hit.lstrip_reference() - - -@pytest.mark.parametrize("hit, expected", rstrip_reference_cases) -def test_cigar_hit_rstrip_query(hit, expected): - hit = parsed_hit(hit) - expected = parsed_hit(expected) - assert expected == hit.rstrip_reference() - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_query_strip_combines_with_connect(hit): - hit = parsed_hit(hit) - - for cut_point in range(hit.r_st - 1, hit.r_ei): - left, right = hit.cut_reference(cut_point + hit.epsilon) - - left = left.rstrip_reference() - right = right.lstrip_reference() - - assert ( - left.connect(right).coordinate_mapping.ref_to_query - == hit.coordinate_mapping.ref_to_query - ) - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_query_strip_combines_with_add(hit): - hit = parsed_hit(hit) - - for cut_point in range(hit.r_st - 1, hit.r_ei): - left, right = hit.cut_reference(cut_point + hit.epsilon) - - left = left.rstrip_reference() - right = right.lstrip_reference() - - if left.touches(right): - assert left + right == hit - - 
-@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_strips_work_together(hit): - hit = parsed_hit(hit) - - rstrip = str(hit.rstrip_reference().rstrip_query().cigar) - assert not rstrip.endswith("I") and not rstrip.endswith("D") - lstrip = str(hit.lstrip_reference().lstrip_query().cigar) - assert not lstrip.startswith("I") and not lstrip.startswith("D") - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_query_strip_is_idempotent(hit): - hit = parsed_hit(hit) - - h1 = hit.rstrip_reference() - assert h1 == h1.rstrip_reference() == h1.rstrip_reference().rstrip_reference() - - h1 = hit.lstrip_reference() - assert h1 == h1.lstrip_reference() == h1.lstrip_reference().lstrip_reference() - - h1 = hit.lstrip_reference().rstrip_reference() - assert h1 == h1.lstrip_reference() == h1.rstrip_reference() - - h1 = hit.rstrip_reference().lstrip_reference() - assert h1 == h1.rstrip_reference() == h1.lstrip_reference() - - -@pytest.mark.parametrize("hit", strip_prop_cases_all) -def test_cigar_hit_query_strips_are_commutative(hit): - hit = parsed_hit(hit) - - if len(hit.cigar.coordinate_mapping.ref_to_query) > 0: - assert ( - hit.rstrip_reference().lstrip_reference() - == hit.lstrip_reference().rstrip_reference() - ) - else: - assert ( - hit.rstrip_reference().lstrip_reference().cigar - == hit.lstrip_reference().rstrip_reference().cigar - ) - - -@pytest.mark.parametrize( - "hit, cut_point", - [(x[0], x[1]) for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)], -) -def test_cigar_hit_ref_cut_add_associativity(hit, cut_point): - hit = parsed_hit(hit) - percentage = cut_point - floor(cut_point) - - for ax_cut in range(hit.r_st, hit.r_ei + 2): - a, x = hit.cut_reference(ax_cut - percentage) - - for bc_cut in range(a.r_ei + 1, hit.r_ei + 2): - if x.ref_length == 0: - continue - - b, c = x.cut_reference(bc_cut - percentage) - - assert (a + b) + c == a + (b + c) - - -@pytest.mark.parametrize( - "hit", [x[0] for x in 
cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] -) -def test_cigar_hit_deletions_no_m_or_i(hit): - hit = parsed_hit(hit) - deletions = list(hit.deletions()) - - assert len(deletions) == len([op for op in str(hit.cigar) if op in "DN"]) - - for deletion in deletions: - assert "M" not in str(deletion.cigar) - assert "I" not in str(deletion.cigar) - - -@pytest.mark.parametrize( - "hit", [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] -) -def test_cigar_hit_deletions_lengths(hit): - hit = parsed_hit(hit) - deletions = list(hit.deletions()) - - for deletion in deletions: - assert deletion.query_length == 0 - assert deletion.ref_length > 0 - assert deletion.coordinate_mapping.ref_to_query == {} - - -@pytest.mark.parametrize( - "hit", [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] -) -def test_cigar_hit_insertions_no_m_or_i(hit): - hit = parsed_hit(hit) - insertions = list(hit.insertions()) - - if "I" in str(hit.cigar): - assert len(insertions) > 0 - - for insertion in insertions: - assert "M" not in str(insertion.cigar) - assert "D" not in str(insertion.cigar) - - -@pytest.mark.parametrize( - "hit", [x[0] for x in cigar_hit_ref_cut_cases if not isinstance(x[2], Exception)] -) -def test_cigar_hit_insertions_lengths(hit): - hit = parsed_hit(hit) - insertions = list(hit.insertions()) - - for insertion in insertions: - assert insertion.ref_length == 0 - assert insertion.query_length > 0 - assert insertion.coordinate_mapping.ref_to_query == {} - - -@pytest.mark.parametrize( - "reference_seq, query_seq, cigar, expected_reference, expected_query", - [ - ("ACTG", "ACTG", "4M", "ACTG", "ACTG"), - ("ACTG", "", "4D", "ACTG", "----"), - ("", "ACTG", "4I", "----", "ACTG"), - ("ACTGAC", "ACAC", "2M2D2M", "ACTGAC", "AC--AC"), - ("ACAC", "ACTGAC", "2M2I2M", "AC--AC", "ACTGAC"), - ("GCTATGGGAA", "GCTATGGGAA", "5M3D2M", "GCTATGGGAA", "GCTAT---GG"), - ( - "ACTG", - "ACTG", - "2M99H77P2M", - "ACTG", - "ACTG", - ), # Ignores 
non-consuming operations. - ], -) -def test_cigar_to_msa( - reference_seq, query_seq, cigar, expected_reference, expected_query -): - assert Cigar.coerce(cigar).to_msa(reference_seq, query_seq) == ( - expected_reference, - expected_query, - ) - - -@pytest.mark.parametrize( - "cigar, reference_seq, query_seq", - [ - ("10M", "A" * 3, "A" * 10), # reference is shorter than CIGAR - ("10M", "A" * 10, "A" * 3), # query is shorter than CIGAR - ("10D", "A" * 3, "A" * 3), - ("10I", "A" * 3, "A" * 3), - ], -) -def test_illigal_cigar_to_msa(cigar, reference_seq, query_seq): - with pytest.raises(ValueError): - Cigar.coerce(cigar).to_msa(reference_seq, query_seq) - - -connect_cigar_hits_cases = [ - # Non-overlapping hits should be connected with deletions/insertions - (["4M@1->1", "4M@8->10"], ["4M5D3I4M@1->1"]), - # Overlapping hits should ignore later ones - (["4M@1->1", "5M@3->3"], ["4M@1->1"]), - # Touching hits should be simply concatenated - (["4M@1->1", "4M@5->5"], ["8M@1->1"]), - # Hits that touch at only one boundary should combine just fine - (["3M@1->1", "6M@6->4"], ["3M2I6M@1->1"]), - # Hits that are subsets of earlier hits should be ignored - (["8M@1->1", "3M@3->3"], ["8M@1->1"]), - # Hits that are out of order should be connected if no overlap - (["3M@6->10", "3M@1->1"], ["3M6D2I3M@1->1"]), - # Hits that overlap by a single base should prioritize the first hit and not combine - (["3M@1->1", "3M@3->3"], ["3M@1->1"]), - # Non-overlapping hits in the query space but overlapping in reference space - (["5M@1->1", "1M@10->3"], ["5M@1->1"]), - # Combining more than two hits - (["3M@1->1", "3M@7->7", "3M@16->12"], ["3M3D3I3M2D6I3M@1->1"]), - # Combining hits including hard-clipping, which should be ignored in alignments - (["2H5M1H@3->1", "2H5M1H@13->11"], ["2H5M1H5D5I2H5M1H@3->1"]), - # An empty list of hits should raise a ValueError - ([], ValueError("Expected a non-empty list of cigar hits")), - # Before by reference, after by query - (["4M@8->1", "4M@1->10"], 
["4M@8->1", "4M@1->10"]), -] - - -@pytest.mark.parametrize("hits, expected_result", connect_cigar_hits_cases) -def test_connect_cigar_hits(hits, expected_result): - hits = list(map(parsed_hit, hits)) - - if isinstance(expected_result, Exception): - with pytest.raises(type(expected_result)): - connect_cigar_hits(hits) - else: - expected_result = list(map(parsed_hit, expected_result)) - result = connect_cigar_hits(hits) - assert expected_result == result diff --git a/micall/utils/cigar_tools.py b/micall/utils/cigar_tools.py deleted file mode 100644 index 41b0f3bc9..000000000 --- a/micall/utils/cigar_tools.py +++ /dev/null @@ -1,736 +0,0 @@ -""" -Module for handling CIGAR strings and related alignment formats. -""" -from enum import IntEnum -from math import ceil, floor -import re -from typing import Tuple, Iterable, Optional, Set, List, Union -from dataclasses import dataclass -from functools import cached_property, reduce -from fractions import Fraction - - -class IntDict(dict): - """ - An extension of the basic Python dictionary designed for integer-to-integer mappings. - - The IntDict maintains not just key-value pairs (as in a normal dictionary) but also - tracks additional sets called `domain` and `codomain`. These sets are supersets - of the keys and values respectively, as they include integers that might not be used - directly in mappings but are within the range of interest for the domain and codomain. 
- """ - - def __init__(self): - super().__init__() - self.domain: Set[int] = set() # superset of self.keys() - self.codomain: Set[int] = set() # superset of self.values() - - def extend(self, key: Optional[int], value: Optional[int]): - if key is not None and value is not None: - self[key] = value - - if key is not None: - self.domain.add(key) - - if value is not None: - self.codomain.add(value) - - def left_max(self, index) -> Optional[int]: - return max((v for (k, v) in self.items() if k <= index), default=None) - - def right_min(self, index) -> Optional[int]: - return min((v for (k, v) in self.items() if k >= index), default=None) - - def translate(self, domain_delta: int, codomain_delta: int) -> 'IntDict': - """ - Generates a new IntDict by shifting the entire mapping -- keys and values - are incremented by domain_delta and codomain_delta, respectively. - This shift operation preserves the inherent ordering and relative spacing within the mapping, - effectively repositioning the dataset within the integer space. - """ - - ret = IntDict() - - for k, v in self.items(): - ret.extend(k + domain_delta, v + codomain_delta) - - for k in self.domain: - ret.extend(k + domain_delta, None) - - for v in self.codomain: - ret.extend(None, v + codomain_delta) - - return ret - - -class CoordinateMapping: - """ - Manages bidirectional mappings between reference and query coordinates, as well as operation indices. - - The mapping enables conversion from reference to query coordinates and vice versa. It also manages the - association of these coordinates with their respective operations in the alignment process. 
- """ - - def __init__(self): - self.ref_to_query = IntDict() - self.query_to_ref = IntDict() - self.ref_to_op = IntDict() - self.query_to_op = IntDict() - - def extend(self, - ref_index: Optional[int], - query_index: Optional[int], - op_index: int): - - self.ref_to_query.extend(ref_index, query_index) - self.query_to_ref.extend(query_index, ref_index) - self.ref_to_op.extend(ref_index, op_index) - self.query_to_op.extend(query_index, op_index) - - def translate(self, reference_delta: int, query_delta: int) -> 'CoordinateMapping': - """ - Generate a new CoordinateMapping with shifted coordinate spaces. - - This method creates a new mapping where each original coordinate in - the reference and query sequences is shifted. This allows for adapting - the CoordinateMapping to account for changes or offsets in sequence positions, - such as when sequences are trimmed or extended. - """ - - ret = CoordinateMapping() - - ret.ref_to_query = self.ref_to_query.translate(reference_delta, query_delta) - ret.query_to_ref = self.query_to_ref.translate(query_delta, reference_delta) - ret.ref_to_op = self.ref_to_op.translate(reference_delta, 0) - ret.query_to_op = self.query_to_op.translate(query_delta, 0) - - return ret - - def __eq__(self, other): - return (self.ref_to_op, self.query_to_op) \ - == (other.ref_to_op, other.query_to_op) - - def __repr__(self): - return f'CoordinateMapping({self.ref_to_op},{self.query_to_op})' - - -# Mapping as defined in https://samtools.github.io/hts-specs/SAMv1.pdf, page 8 -CigarActions = IntEnum( - 'CigarActions', - 'MATCH INSERT DELETE SKIPPED SOFT_CLIPPED HARD_CLIPPED PADDING SEQ_MATCH MISMATCH', - start=0) - - -class Cigar: - """ - Represents an alignment between a query sequence and a reference sequence using the - Compact Idiosyncratic Gapped Alignment Report (CIGAR) string format. - - A CIGAR string is a sequence of operation codes ('M', 'I', 'D', etc.) each preceded by - the number of bases or residues to which the operation applies. 
- - The class abstracts a CIGAR string as a sequence of discrete operations for convenient - manipulation (as seen in self.iterate_operations()), while retaining the compact - form for storage and return purposes (seen in self.__str__()). - - Instances of this class should be created by calling the `Cigar.coerce` method. - Examples: - Cigar.coerce("10M1I5M1D") - Cigar.coerce([(10, CigarActions.MATCH), (1, CigarActions.INSERT), ...]) - Cigar.coerce(existing_cigar_object) - - CIGAR strings are defined in the SAM specification (https://samtools.github.io/hts-specs/SAMv1.pdf). - """ - - def __init__(self, data) -> None: - self._data: List[Tuple[int, CigarActions]] = list(Cigar.normalize(data)) - - @staticmethod - def coerce(obj: Union['Cigar', str, Iterable[Tuple[int, CigarActions]]]): - if isinstance(obj, Cigar): - return obj - - if isinstance(obj, str): - return Cigar.parse(obj) - - if isinstance(obj, list) or isinstance(obj, tuple): - return Cigar(obj) - - raise TypeError(f"Cannot coerce {obj!r} to CIGAR string.") - - def iterate_operations(self) -> Iterable[CigarActions]: - """ - Yields each operation in the CIGAR sequence as a `CigarActions` enum. - The resulting sequence is a decoded version of the initial run-length encoded sequence. - """ - - for num, operation in self._data: - for _ in range(num): - yield operation - - def iterate_operations_with_pointers(self) -> Iterable[Tuple[CigarActions, Optional[int], Optional[int]]]: - """ - Iterates over the operations while tracking the reference and - query sequence positions affected by each operation. - - Example: - For a Cigar instance representing "1M1I1M", this method would yield: - (CigarActions.MATCH, 0, 0), (CigarActions.INSERT, None, 1), (CigarActions.MATCH, 1, 2) - - :return: Tuple of type (CigarActions, reference_pointer, query_pointer) for each operation in the - CIGAR sequence. Pointers can be None if the operation does not map to a sequence - position (e.g., insertions, deletions). 
- """ - - ref_pointer = 0 - query_pointer = 0 - - for operation in self.iterate_operations(): - if operation in (CigarActions.MATCH, CigarActions.SEQ_MATCH, CigarActions.MISMATCH): - yield operation, ref_pointer, query_pointer - query_pointer += 1 - ref_pointer += 1 - - elif operation in (CigarActions.INSERT, CigarActions.SOFT_CLIPPED): - yield operation, None, query_pointer - query_pointer += 1 - - elif operation in (CigarActions.DELETE, CigarActions.SKIPPED): - yield operation, ref_pointer, None - ref_pointer += 1 - - else: - yield operation, None, None - - def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar': - """ - Creates a new Cigar object by slicing the current one from start_inclusive to - end_noninclusive. Note that slicing is done at the level of individual operations, - not at the level of counts within operations. - - Example: - Given a Cigar instance representing "10M5D5M", slicing from 2 to 11 would result in a new - Cigar object representing "8M1D". - """ - - return Cigar([(1, op) for op in self.iterate_operations()] - [start_inclusive:end_noninclusive]) - - def lstrip_query(self) -> 'Cigar': - """ Return a copy of the Cigar with leading (unmatched) query elements removed. """ - - min_r = min(self.coordinate_mapping.ref_to_query.keys(), default=None) - min_op = self.coordinate_mapping.ref_to_op.get(min_r, float("inf")) - - ops = [(1, op) for i, (op, ref_pointer, query_pointer) - in enumerate(self.iterate_operations_with_pointers()) - if query_pointer is None or i >= min_op] - return Cigar.coerce(ops) - - def rstrip_query(self) -> 'Cigar': - """ Return a copy of the Cigar with trailing (unmatched) query elements removed. 
""" - - max_r = max(self.coordinate_mapping.ref_to_query.keys(), default=None) - max_op = self.coordinate_mapping.ref_to_op.get(max_r, float("-inf")) - - ops = [(1, op) for i, (op, ref_pointer, query_pointer) - in enumerate(self.iterate_operations_with_pointers()) - if query_pointer is None or i <= max_op] - return Cigar.coerce(ops) - - def lstrip_reference(self) -> 'Cigar': - """ Return a copy of the Cigar with leading (unmatched) reference elements removed. """ - - min_q = min(self.coordinate_mapping.query_to_ref.keys(), default=None) - min_op = self.coordinate_mapping.query_to_op.get(min_q, float("inf")) - - ops = [(1, op) for i, (op, ref_pointer, query_pointer) - in enumerate(self.iterate_operations_with_pointers()) - if ref_pointer is None or i >= min_op] - return Cigar.coerce(ops) - - def rstrip_reference(self) -> 'Cigar': - """ Return a copy of the Cigar with trailing (unmatched) reference elements removed. """ - - max_q = max(self.coordinate_mapping.query_to_ref.keys(), default=None) - max_op = self.coordinate_mapping.query_to_op.get(max_q, float("-inf")) - - ops = [(1, op) for i, (op, ref_pointer, query_pointer) - in enumerate(self.iterate_operations_with_pointers()) - if ref_pointer is None or i <= max_op] - return Cigar.coerce(ops) - - @cached_property - def coordinate_mapping(self) -> CoordinateMapping: - """ - Convert this CIGAR string to coordinate mapping representing - a reference-to-query and query-to-reference coordinate mappings. - - :return: Lists of integers representing the mappings of coordinates from the reference - sequence to the query sequence, and back. 
- """ - - mapping = CoordinateMapping() - - for op_pointer, (operation, ref_pointer, query_pointer) in enumerate(self.iterate_operations_with_pointers()): - mapping.extend(ref_pointer, - query_pointer, - op_pointer) - - return mapping - - def to_msa(self, reference_seq, query_seq) -> Tuple[str, str]: - """ - Constructs a multiple sequence alignment (MSA) representation for this Cigar, using the original reference - and query sequences. It aligns the sequences according to the CIGAR operations, introducing gaps ('-') - as necessary to reflect insertions or deletions. - """ - - reference_msa = '' - query_msa = '' - - for operation, ref_pointer, query_pointer in self.iterate_operations_with_pointers(): - if ref_pointer is None and query_pointer is None: - continue - - try: - if ref_pointer is not None: - reference_msa += reference_seq[ref_pointer] - else: - reference_msa += '-' - - if query_pointer is not None: - query_msa += query_seq[query_pointer] - else: - query_msa += '-' - - except IndexError: - raise ValueError("CIGAR string corresponds to a larger match than either reference or query.") - - return reference_msa, query_msa - - @cached_property - def op_length(self): - return sum(1 for x in self.iterate_operations()) - - @cached_property - def query_length(self): - return max((query_pointer + 1 if query_pointer is not None else 0 for (_, _, query_pointer) - in self.iterate_operations_with_pointers()), - default=0) - - @cached_property - def ref_length(self): - return max((ref_pointer + 1 if ref_pointer is not None else 0 for (_, ref_pointer, _) - in self.iterate_operations_with_pointers()), - default=0) - - # # - # Boring boilerplate code below # - # # - - OP_MAPPING = { - 'M': CigarActions.MATCH, # Alignment match (can be a sequence match or mismatch) - 'I': CigarActions.INSERT, # Insertion to the reference - 'D': CigarActions.DELETE, # Deletion from the reference - 'N': CigarActions.SKIPPED, # Skipped region from the reference - 'S': CigarActions.SOFT_CLIPPED, 
# Soft clip on the read (ignored region, not aligned but present in the read) - 'H': CigarActions.HARD_CLIPPED, # Hard clip on the read (ignored region, not present in the read) - 'P': CigarActions.PADDING, # Padding (silent deletion from padded reference, not applicable for our case) - '=': CigarActions.SEQ_MATCH, # Sequence match - 'X': CigarActions.MISMATCH, # Sequence mismatch - } - - @staticmethod - def parse_operation(operation: str) -> CigarActions: - if operation in Cigar.OP_MAPPING: - return Cigar.OP_MAPPING[operation] - else: - raise ValueError(f"Unexpected CIGAR action: {operation}.") - - @staticmethod - def operation_to_str(op: CigarActions) -> str: - return [k for (k, v) in Cigar.OP_MAPPING.items() if v == op][0] - - @staticmethod - def parse(string) -> 'Cigar': - """ - Parses a CIGAR string into a Cigar object. - - :param string: A CIGAR string with the format '(\\d+[MIDNSHPX=])+', where each operation code - is preceded by a number indicating how many times the operation should be applied. - """ - - data = [] - while string: - match = re.match(r'([0-9]+)([^0-9])', string) - if match: - num, operation = match.groups() - data.append((int(num), Cigar.parse_operation(operation))) - string = string[match.end():] - else: - raise ValueError(f"Invalid CIGAR string. Invalid part: {string[:20]}") - - return Cigar(data) - - @staticmethod - def normalize(cigar_lst) -> Iterable[Tuple[int, CigarActions]]: - """ - Goes through the list appending operations to the CIGAR sequence, - checking for type correctness and performing normalization - by merging consecutive identical operations. 
- """ - - last_item = None - - for item in cigar_lst: - # Type checking - if not isinstance(item, list) and not isinstance(item, tuple): - raise ValueError(f"Invalid CIGAR list: {item!r} is not a tuple.") - if len(item) != 2: - raise ValueError(f"Invalid CIGAR list: {item!r} has a bad length.") - - num, operation = item - if isinstance(operation, int): - operation = CigarActions(operation) - if not isinstance(num, int) or not isinstance(operation, CigarActions): - raise ValueError(f"Invalid CIGAR list: {item!r} is not a number/operation tuple.") - if num < 0: - raise ValueError("Invalid CIGAR list: number of operations is negative.") - - # Normalization - if num == 0: - continue - - if last_item: - last_num, last_operation = last_item - if operation == last_operation: - last_item = (last_num + num, operation) - continue - - if last_item: - yield last_item[0], last_item[1] - last_item = item - - if last_item: - yield last_item[0], last_item[1] - - def __eq__(self, other) -> bool: - return isinstance(other, Cigar) and self._data == other._data - - def __add__(self, other: 'Cigar'): - return Cigar(self._data + other._data) - - def __repr__(self): - return f'Cigar({str(self)!r})' - - def __str__(self): - """ Inverse of `Cigar.parse` """ - return ''.join('{}{}'.format(num, Cigar.operation_to_str(op)) for num, op in self._data) - - -@dataclass(frozen=True) -class CigarHit: - """ - This class provides an abstraction over the complex details involved in working with sequence alignments - expressed as CIGAR strings. It implements operations for alignment handling that are conceptually - straightforward but challenging to implement ad-hoc. 
- - The main tasks handled by this class are: - - Precisely dividing an alignment into two contiguous segments - at any given reference position (`cut_reference()`), - - Removing portions of the query sequence that do not align with - the reference sequence from either end - while preserving the alignment context (`lstrip*()` and `rstrip*()`), - - Enumerating gaps in the alignment (`gaps()`). - """ - - cigar: Cigar - r_st: int - r_ei: int # inclusive - q_st: int - q_ei: int # inclusive - - def __post_init__(self): - if self.ref_length != self.cigar.ref_length: - raise ValueError(f"CIGAR string maps {self.cigar.ref_length}" - f" reference positions, but CIGAR hit range is {self.ref_length}") - - if self.query_length != self.cigar.query_length: - raise ValueError(f"CIGAR string maps {self.cigar.query_length}" - f" query positions, but CIGAR hit range is {self.query_length}") - - @property - def ref_length(self): - return self.r_ei + 1 - self.r_st - - @property - def query_length(self): - return self.q_ei + 1 - self.q_st - - @staticmethod - def from_default_alignment(r_st, r_ei, q_st, q_ei): - """ - A convenience method that creates a CigarHit instance representing a default alignment, - where there are only deletions in the reference sequence and only insertions in the query. - """ - - ref_length = r_ei - r_st + 1 - query_length = q_ei - q_st + 1 - cigar = Cigar.coerce([(ref_length, CigarActions.DELETE), - (query_length, CigarActions.INSERT)]) - - return CigarHit(cigar, r_st=r_st, r_ei=r_ei, q_st=q_st, q_ei=q_ei) - - def overlaps(self, other) -> bool: - """ - Determines whether this CigarHit overlaps with another in terms of reference or query coordinates. - Two hits are considered overlapping if their alignment ranges on the reference or query sequence overlap. - Note: Assumes that both CigarHit instances pertain to the same pair of reference and query sequences. - """ - - def intervals_overlap(x, y): - """ Check if two intervals [x0, x1] and [y0, y1] overlap. 
""" - return x[0] <= y[1] and x[1] >= y[0] - - return intervals_overlap((self.r_st, self.r_ei), (other.r_st, other.r_ei)) \ - or intervals_overlap((self.q_st, self.q_ei), (other.q_st, other.q_ei)) - - def touches(self, other) -> bool: - """ - Checks if the end of this CigarHit is immediately adjacent to the start of another one. - Note: Assumes that both CigarHit instances pertain to the same pair of reference and query sequences. - """ - - return self.r_ei + 1 == other.r_st \ - and self.q_ei + 1 == other.q_st - - def _gaps(self, is_deletions: bool) -> Iterable['CigarHit']: - last_query_index = self.q_st - last_ref_index = self.r_st - gap_start: Optional[int] = None - op_to_ref = {v: k for k, v in self.coordinate_mapping.ref_to_op.items()} - op_to_query = {v: k for k, v in self.coordinate_mapping.query_to_op.items()} - present = op_to_ref if is_deletions else op_to_query - missing = op_to_query if is_deletions else op_to_ref - - for op_index in sorted(self.coordinate_mapping.query_to_op.codomain) + [None]: - if op_index in present and \ - op_index not in missing: - if gap_start is None: - gap_start = op_index - else: - if gap_start is not None: - cigar = self.cigar.slice_operations(gap_start, op_index) - if is_deletions: - q_st = last_query_index - r_st = present[gap_start] - else: - q_st = present[gap_start] - r_st = last_ref_index - q_ei = q_st + cigar.query_length - 1 - r_ei = r_st + cigar.ref_length - 1 - yield CigarHit(cigar, q_st=q_st, q_ei=q_ei, r_st=r_st, r_ei=r_ei) - gap_start = None - - if op_index in op_to_query: - last_query_index = op_to_query[op_index] - if op_index in op_to_ref: - last_ref_index = op_to_ref[op_index] - - def deletions(self) -> Iterable['CigarHit']: - return self._gaps(is_deletions=True) - - def insertions(self) -> Iterable['CigarHit']: - return self._gaps(is_deletions=False) - - def __add__(self, other): - """ - Only adds CigarHits that are touching. 
- The addition is simply a concatenation of two Cigar strings, and adjustment of hit coordinates. - """ - - if not self.touches(other): - raise ValueError("Cannot combine CIGAR hits that do not touch in both reference and query coordinates") - - return CigarHit(cigar=self.cigar + other.cigar, - r_st=self.r_st, - r_ei=other.r_ei, - q_st=self.q_st, - q_ei=other.q_ei) - - def connect(self, other: 'CigarHit') -> 'CigarHit': - """ - Inserts deletions/insertions between self and other, - then ajusts boundaries appropriately. - """ - - if self.overlaps(other): - raise ValueError("Cannot combine overlapping CIGAR hits") - - filler = CigarHit.from_default_alignment(self.r_ei + 1, other.r_st - 1, self.q_ei + 1, other.q_st - 1) - return self + filler + other - - @property - def epsilon(self): - return Fraction(1, self.cigar.op_length * 3 + 1) - - def _ref_cut_to_op_cut(self, cut_point): - mapping = self.coordinate_mapping - - left_op_cut_point = mapping.ref_to_op.left_max(floor(cut_point)) - right_op_cut_point = mapping.ref_to_op.right_min(ceil(cut_point)) - - if left_op_cut_point is None: - left_op_cut_point = -1 - if right_op_cut_point is None: - right_op_cut_point = self.cigar.op_length - - def lerp(start, end, t): return (1 - t) * start + t * end - op_cut_point = lerp(left_op_cut_point, right_op_cut_point, - cut_point - floor(cut_point)) - - if float(op_cut_point).is_integer(): - # Disambiguate to the right. - op_cut_point += self.epsilon - - return op_cut_point - - def _slice(self, r_st, q_st, o_st, o_ei): - cigar = self.cigar.slice_operations(o_st, o_ei + 1) - r_ei = r_st + cigar.ref_length - 1 - q_ei = q_st + cigar.query_length - 1 - - return CigarHit(cigar=cigar, - r_st=r_st, - r_ei=r_ei, - q_st=q_st, - q_ei=q_ei, - ) - - def cut_reference(self, cut_point: float) -> Tuple['CigarHit', 'CigarHit']: - """ - Splits this CigarHit into two non-overlapping parts using a fractional cut point in the reference space. - Resulting parts of CigarHits are touching at cut point. 
- The two parts do not share any elements, and no element is "lost". - """ - - fcut_point: Fraction = Fraction(cut_point) - if fcut_point.denominator == 1: - raise ValueError("Cut accepts fractions, not integers") - - if self.ref_length == 0 or \ - not (self.r_st - 1 < fcut_point < self.r_ei + 1): - raise IndexError("Cut point out of reference bounds") - - op_fcut_point = self._ref_cut_to_op_cut(fcut_point) - left = self._slice(self.r_st, self.q_st, 0, floor(op_fcut_point)) - right = self._slice(left.r_ei + 1, left.q_ei + 1, ceil(op_fcut_point), self.cigar.op_length) - - return left, right - - def lstrip_query(self) -> 'CigarHit': - """ Return a copy of the CigarHit with leading (unmatched) query elements removed. """ - - cigar = self.cigar.lstrip_query() - return CigarHit(cigar, r_st=self.r_ei - cigar.ref_length + 1, r_ei=self.r_ei, - q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) - - def rstrip_query(self) -> 'CigarHit': - """ Return a copy of the CigarHit with trailing (unmatched) query elements removed. """ - - cigar = self.cigar.rstrip_query() - return CigarHit(cigar, r_st=self.r_st, r_ei=self.r_st + cigar.ref_length - 1, - q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) - - def lstrip_reference(self) -> 'CigarHit': - """ Return a copy of the CigarHit with leading (unmatched) reference elements removed. """ - - cigar = self.cigar.lstrip_reference() - return CigarHit(cigar, r_st=self.r_ei - cigar.ref_length + 1, r_ei=self.r_ei, - q_st=self.q_ei - cigar.query_length + 1, q_ei=self.q_ei) - - def rstrip_reference(self) -> 'CigarHit': - """ Return a copy of the CigarHit with trailing (unmatched) reference elements removed. 
""" - - cigar = self.cigar.rstrip_reference() - return CigarHit(cigar, r_st=self.r_st, r_ei=self.r_st + cigar.ref_length - 1, - q_st=self.q_st, q_ei=self.q_st + cigar.query_length - 1) - - @cached_property - def coordinate_mapping(self) -> CoordinateMapping: - """ - Convert this alignment to coordinate mapping representing - a reference-to-query and query-to-reference coordinate mappings. - """ - - return self.cigar.coordinate_mapping.translate(self.r_st, self.q_st) - - def to_msa(self, reference_seq: str, query_seq: str) -> Tuple[str, str]: - """ - Constructs a multiple sequence alignment (MSA) representation for this CigarHit, using the original reference - and query sequences. It aligns the sequences according to the CIGAR operations, introducing gaps ('-') - as necessary to reflect insertions or deletions. - """ - - return self.cigar.to_msa(reference_seq[self.r_st:], query_seq[self.q_st:]) - - def translate(self, reference_delta: int, query_delta: int) -> 'CigarHit': - return CigarHit(cigar=self.cigar, - r_st=self.r_st + reference_delta, - r_ei=self.r_ei + reference_delta, - q_st=self.q_st + query_delta, - q_ei=self.q_ei + query_delta) - - def __repr__(self): - return 'CigarHit(%r, r_st=%r, r_ei=%r, q_st=%r, q_ei=%r)' \ - % (self.cigar, self.r_st, self.r_ei, self.q_st, self.q_ei) - - def __str__(self): - return '%s@[%d,%d]->[%d,%d]' \ - % (str(self.cigar), self.q_st, self.q_ei, self.r_st, self.r_ei) - - -def connect_cigar_hits(cigar_hits: List[CigarHit]) -> List[CigarHit]: - """ - This function exists to deal with the fact that mappy does not always - connect big gaps, and returns surrounding parts as two separate alignment hits. - - For those cases we simply connect all the parts that do not overlap. - - Order of cigar_hits matters because we ignore alignments - that overlap with previously found alignments. 
- """ - - if len(cigar_hits) == 0: - raise ValueError("Expected a non-empty list of cigar hits") - - accumulator: List[CigarHit] = [] - - # Collect non-overlaping parts. - # Earlier matches have priority over ones that come after. - for hit in cigar_hits: - if any(earlier.overlaps(hit) for earlier in accumulator): - continue - - accumulator.append(hit) - - # Sort by interval start positions. - sorted_parts = sorted(accumulator, key=lambda p: p.r_st) - - # Segregate independent matches. - sorted_groups: List[List[CigarHit]] = [] - - def find_group(phit): - for group in sorted_groups: - if phit.q_st > group[-1].q_st: - group.append(phit) - return - - sorted_groups.append([phit]) - - for hit in sorted_parts: - find_group(hit) - - # Collect all intervals back together, connecting them with CigarActions.DELETE. - return [reduce(lambda x, y: x.connect(y), group) for group in sorted_groups] From 92eb86c289d97d4dca2cf5d79c22f01f41f79ae8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 19 Mar 2024 19:46:17 -0700 Subject: [PATCH 265/485] Bump aligntools to v1.0.2 --- .mypy.ini | 3 --- requirements.txt | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.mypy.ini b/.mypy.ini index 54091654f..4ca29fa84 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -9,9 +9,6 @@ check_untyped_defs = True [mypy-micall.core.plot_contigs] check_untyped_defs = False -[mypy-aligntools] -ignore_missing_imports = true - [mypy-mappy] ignore_missing_imports = true diff --git a/requirements.txt b/requirements.txt index c8499af3d..d13a7d749 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,4 @@ git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 drawsvg==2.3.0 cairosvg==2.7.1 -git+https://github.com/cfe-lab/aligntools.git@v1.0.0 +git+https://github.com/cfe-lab/aligntools.git@v1.0.2 From d3d510927bc2305c2fe9d227523af9424afb0fec Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Apr 2024 07:24:47 
+0000 Subject: [PATCH 266/485] Bump python-levenshtein from 0.25.0 to 0.25.1 Bumps [python-levenshtein](https://github.com/rapidfuzz/python-Levenshtein) from 0.25.0 to 0.25.1. - [Release notes](https://github.com/rapidfuzz/python-Levenshtein/releases) - [Changelog](https://github.com/rapidfuzz/python-Levenshtein/blob/main/HISTORY.md) - [Commits](https://github.com/rapidfuzz/python-Levenshtein/compare/v0.25.0...v0.25.1) --- updated-dependencies: - dependency-name: python-levenshtein dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d13a7d749..5dc375be5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ numpy==1.24.3 scipy==1.10.1 matplotlib==3.7.3 cutadapt==4.7 -python-Levenshtein==0.25.0 +python-Levenshtein==0.25.1 PyYAML==6.0.1 reportlab==4.1.0 pysam==0.22.0 From d0b6d2706302affe35eeb054eaad54b89c856e5a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 12 Apr 2024 11:09:00 -0700 Subject: [PATCH 267/485] Handle floating point numbers in `update_qai.py` This commit addresses an issue in `update_qai.py` where the script failed when encountering floating point numbers instead of exact integers, which became apparent when running the script independently outside of its typical pipeline. The problem lies in the strict interpretation of certain numeric fields (namely 'count', 'demultiplexed', 'v3loop', 'on.score', 'off.score', 'min.coverage', 'which.key.pos') as integers. When a floating point number was encountered in one of these fields, it created a ValueError, terminating the execution of the script. This falls under the category of 'data type mismatch' problems, when the data provided to a function or operation doesn't match the expected type. 
To fix this, a new function `read_int` has been introduced which tries to convert numeric inputs into integers but tolerates floating point numbers without automatic truncation or rounding errors. If a number cannot be converted exactly into an integer, a ValueError is raised. This modification improves the script's flexibility and resilience, now accommodating floating point numbers in these specified fields without error. Data type mismatch issues, where the data provided to a function or operation doesn't match the expected type, can pose significant challenges in script execution. A common example of this error surfaces when a function expressly expects an integer input but receives a floating-point number. At its core, this problem arises from the inherent rigidity of type systems in programming, which while ensuring data consistency and integrity, can sometimes limit flexibility. This particular revision serves as a practical example of addressing the data type mismatch problem, showcasing that with mindful updates, it's possible to maintain data integrity while allowing for greater operational flexibility in our codebases. It highlights that while strict typing can prevent certain types of errors, it's equally important to ensure our code is resilient to variations in data type, especially when interfacing with varying data sources or when scripts run in isolation outside their regular pipelines. 
--- micall/monitor/update_qai.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/micall/monitor/update_qai.py b/micall/monitor/update_qai.py index 860035ff4..cbc9a9750 100644 --- a/micall/monitor/update_qai.py +++ b/micall/monitor/update_qai.py @@ -164,10 +164,16 @@ def build_review_decisions(coverage_file, collated_counts_file, cascade_file, # noinspection PyTypeChecker sample_names = dict(map(itemgetter('tags', 'filename'), sample_sheet['DataSplit'])) + def read_int(table, name): + ret = float(table[name]) + if float(int(ret)) != ret: + raise ValueError(f"Bad value for {name!r}: {ret!r}. Expected an integer.") + return int(ret) + counts_map = {} # {tags: raw, (tags, seed): mapped]} # sample,type,count for counts in csv.DictReader(collated_counts_file): - count = int(counts['count']) + count = read_int(counts, 'count') tags = sample_tags[counts['sample']] count_type = counts['type'] if count_type not in ('raw', 'unmapped'): @@ -178,11 +184,11 @@ def build_review_decisions(coverage_file, collated_counts_file, cascade_file, unreported_tags = set() for counts in csv.DictReader(cascade_file): tags = sample_tags[counts['sample']] - counts_map[tags] = int(counts['demultiplexed']) * 2 + counts_map[tags] = read_int(counts, 'demultiplexed') * 2 unreported_tags.add(tags) key = tags, G2P_SEED_NAME - counts_map[key] = int(counts['v3loop']) * 2 + counts_map[key] = read_int(counts, 'v3loop') * 2 sequencing_map = defaultdict(dict) # {tags: {project: sequencing}} for sequencing in sequencings: @@ -201,9 +207,9 @@ def build_review_decisions(coverage_file, collated_counts_file, cascade_file, "tagged layouts missing?" 
% (tags, coverage_file.name)) sequencing = project_map.get(coverage['project']) if sequencing is not None: - score = int(coverage['on.score']) + score = read_int(coverage, 'on.score') else: - score = int(coverage['off.score']) + score = read_int(coverage, 'off.score') first_project = sorted(project_map.keys())[0] sequencing = project_map[first_project] project_region_id = project_region_map[(coverage['project'], @@ -226,8 +232,8 @@ def build_review_decisions(coverage_file, collated_counts_file, cascade_file, 'seed_region_id': seed_region_id, 'sample_name': coverage['sample'], 'score': score, - 'min_coverage': int(coverage['min.coverage']), - 'min_coverage_pos': int(coverage['which.key.pos']), + 'min_coverage': read_int(coverage, 'min.coverage'), + 'min_coverage_pos': read_int(coverage, 'which.key.pos'), 'raw_reads': raw_count, 'mapped_reads': mapped_count } From 482bea287281791f83d807748be0a0d31661eee5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Apr 2024 07:51:06 +0000 Subject: [PATCH 268/485] Bump cutadapt from 4.7 to 4.8 Bumps [cutadapt](https://github.com/marcelm/cutadapt) from 4.7 to 4.8. - [Release notes](https://github.com/marcelm/cutadapt/releases) - [Changelog](https://github.com/marcelm/cutadapt/blob/main/CHANGES.rst) - [Commits](https://github.com/marcelm/cutadapt/compare/v4.7...v4.8) --- updated-dependencies: - dependency-name: cutadapt dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5dc375be5..02cb729b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2 numpy==1.24.3 scipy==1.10.1 matplotlib==3.7.3 -cutadapt==4.7 +cutadapt==4.8 python-Levenshtein==0.25.1 PyYAML==6.0.1 reportlab==4.1.0 From f565ac3613dea8326100fbaf4908b9d0ace5c2e0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 19 Apr 2024 07:10:52 +0000 Subject: [PATCH 269/485] Bump reportlab from 4.1.0 to 4.2.0 Bumps [reportlab](https://www.reportlab.com/) from 4.1.0 to 4.2.0. --- updated-dependencies: - dependency-name: reportlab dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 02cb729b2..faa24526a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ matplotlib==3.7.3 cutadapt==4.8 python-Levenshtein==0.25.1 PyYAML==6.0.1 -reportlab==4.1.0 +reportlab==4.2.0 pysam==0.22.0 git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 From 9daf84ae2a6b5c2bbf483e22573ad43ae39dc6e9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 24 Apr 2024 07:37:54 +0000 Subject: [PATCH 270/485] Bump pysam from 0.22.0 to 0.22.1 Bumps [pysam](https://github.com/pysam-developers/pysam) from 0.22.0 to 0.22.1. - [Release notes](https://github.com/pysam-developers/pysam/releases) - [Changelog](https://github.com/pysam-developers/pysam/blob/master/NEWS) - [Commits](https://github.com/pysam-developers/pysam/compare/v0.22.0...v0.22.1) --- updated-dependencies: - dependency-name: pysam dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index faa24526a..48b804eb8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ cutadapt==4.8 python-Levenshtein==0.25.1 PyYAML==6.0.1 reportlab==4.2.0 -pysam==0.22.0 +pysam==0.22.1 git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 drawsvg==2.3.0 From 4a9489aa73f3caf2a957756a8e0a79ce5d86c250 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Apr 2024 10:08:39 -0700 Subject: [PATCH 271/485] Split singularity test from the rest of the workflow on CI --- .github/workflows/build-and-test.yml | 37 +++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index fc81cfa6f..d9138cb23 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -1,7 +1,7 @@ name: Build and Test MiCall on: push jobs: - build: + main: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 @@ -12,10 +12,6 @@ jobs: - run: mkdir -p ~/bin - run: echo ~/bin >> $GITHUB_PATH - - uses: actions/setup-go@v2 - with: - go-version: '^1.15.6' - - name: Run apt update run: sudo apt-get update @@ -74,6 +70,29 @@ jobs: - name: Test with pytest run: coverage run --source=micall/core,micall/g2p,micall/resistance,micall/monitor -m pytest + - name: Report code coverage + run: | + rm -rf micall/tests/microtest/scratch + python -m pip install codecov + codecov + + singularity-test: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - uses: actions/setup-go@v2 + with: + go-version: '^1.15.6' + + - name: Run apt update + run: sudo apt-get update + - name: Install Singularity run: | sudo apt-get install -qq \ @@ -95,8 +114,10 @@ jobs: cd .. 
rm -rf singularity singularity --version + - name: Singularity build run: sudo singularity build micall.simg Singularity + - name: Upload Singularity Image uses: actions/upload-artifact@v2 with: @@ -105,9 +126,3 @@ jobs: - name: Singularity test run: python release_test_microtest.py micall.simg - - - name: Report code coverage - run: | - rm -rf micall/tests/microtest/scratch - python -m pip install codecov - codecov From 1d244968d959deca50ac17b8a930dffce7eb4abe Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Apr 2024 10:09:42 -0700 Subject: [PATCH 272/485] Only upload Singularity images that pass the test --- .github/workflows/build-and-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index d9138cb23..f0b41a2ca 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -118,11 +118,11 @@ jobs: - name: Singularity build run: sudo singularity build micall.simg Singularity + - name: Singularity test + run: python release_test_microtest.py micall.simg + - name: Upload Singularity Image uses: actions/upload-artifact@v2 with: name: micall.simg path: micall.simg - - - name: Singularity test - run: python release_test_microtest.py micall.simg From b03221c30d929f7a6de4887d9ebb7b07af5d9e4f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 24 Apr 2024 17:43:14 -0700 Subject: [PATCH 273/485] Fix arguments order in microtest.py --- release_test_microtest.py | 86 ++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/release_test_microtest.py b/release_test_microtest.py index 79d597688..6f5fd25e8 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -438,29 +438,69 @@ def process_sample(self, fastq_file: Path): fields = sections[0].split('-') project_code = fields[-1] writer.writerow(dict(sample=sample_name, project=project_code)) - output_names = 
['g2p.csv', - 'g2p_summary.csv', - 'remap_counts.csv', - 'remap_conseq.csv', - 'unmapped1.fastq', - 'unmapped2.fastq', - 'conseq_ins.csv', - 'failed.csv', - 'cascade.csv', - 'nuc.csv', - 'amino.csv', - 'insertions.csv', - 'conseq.csv', - 'conseq_all.csv', - 'conseq_region.csv', - 'conseq_stitched.csv' - 'failed_align.csv', - 'coverage_scores.csv', - 'coverage_maps.tar', - 'aligned.csv', - 'g2p_aligned.csv', - 'genome_coverage.csv', - 'genome_coverage.svg'] + + if self.is_denovo: + output_names = [ + 'g2p.csv', + 'g2p_summary.csv', + 'remap_counts.csv', + 'remap_conseq.csv', + 'unmapped1.fastq', + 'unmapped2.fastq', + 'conseq_ins.csv', + 'failed.csv', + 'cascade.csv', + 'nuc.csv', + 'amino.csv', + 'insertions.csv', + 'conseq.csv', + 'conseq_all.csv', + 'concordance.csv', + 'concordance_seed.csv', + 'failed_align.csv', + 'coverage_scores.csv', + 'coverage_maps.tar', + 'aligned.csv', + 'g2p_aligned.csv', + 'genome_coverage.csv', + 'genome_coverage.svg', + 'genome_concordance.svg', + 'remap_unstitched_conseq.csv', + 'contigs_unstitched.csv', + 'contigs.csv', + 'read_entropy.csv', + 'conseq_region.csv', + 'conseq_stitched.csv', + ] + + else: + output_names = [ + 'g2p.csv', + 'g2p_summary.csv', + 'remap_counts.csv', + 'remap_conseq.csv', + 'unmapped1.fastq', + 'unmapped2.fastq', + 'conseq_ins.csv', + 'failed.csv', + 'cascade.csv', + 'nuc.csv', + 'amino.csv', + 'insertions.csv', + 'conseq.csv', + 'conseq_all.csv', + 'concordance.csv', + 'concordance_seed.csv', + 'failed_align.csv', + 'coverage_scores.csv', + 'coverage_maps.tar', + 'aligned.csv', + 'g2p_aligned.csv', + 'genome_coverage.csv', + 'genome_coverage.svg', + 'genome_concordance.svg', + ] + output_paths = [output_path/name for name in output_names] app_name = 'denovo' if self.is_denovo else None run_with_retries(self.build_command([sample_info_path, From 281a875a8103dd37e5b7bf9c261c03ba2cc9e5c8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Apr 2024 13:35:14 -0700 Subject: [PATCH 274/485] Check that 
output files exist in microtest.py --- release_test_microtest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/release_test_microtest.py b/release_test_microtest.py index 6f5fd25e8..f8745706f 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -509,6 +509,15 @@ def process_sample(self, fastq_file: Path): self.bad_cycles_path], output_paths, app_name)) + + for path in output_paths: + + if path == (output_path/"conseq_ins.csv"): + # This file is special. See https://github.com/cfe-lab/MiCall/issues/1085 + path = output_path/"scratch"/"conseq_ins.csv" + + assert os.path.exists(path), f"Expected output file {path!r} to be created." + return sample_name def process_resistance(self, sample_group: SampleGroup): From de759d3e1537cacd860dee13ca4d99615178eb05 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 23 Apr 2024 15:02:08 -0700 Subject: [PATCH 275/485] Split Sample.process method Now it has the initial process() and the additional process_post_assembly() methods. This should help us in splitting the outputs into stitched_ and unstitched_ versions. 
--- micall/drivers/sample.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index ed118fc49..a4cd4e54a 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -229,6 +229,13 @@ def process(self, else: self.run_mapping(excluded_seeds) + return self.process_post_assembly(use_denovo=use_denovo, + excluded_projects=excluded_projects) + + def process_post_assembly(self, + use_denovo: bool, + excluded_projects=(), + ): logger.info('Running sam2aln on %s.', self) with open(self.remap_csv) as remap_csv, \ open(self.aligned_csv, 'w') as aligned_csv, \ From 92c03af6fc361d211b669eba6aaea9ac38296a01 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 25 Apr 2024 15:59:11 -0700 Subject: [PATCH 276/485] Only print the finish message of sample.py once --- micall/drivers/sample.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index a4cd4e54a..cb1fbc808 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -232,6 +232,8 @@ def process(self, return self.process_post_assembly(use_denovo=use_denovo, excluded_projects=excluded_projects) + logger.info('Finished sample %s.', self) + def process_post_assembly(self, use_denovo: bool, excluded_projects=(), @@ -346,7 +348,6 @@ def process_post_assembly(self, cascade_report.remap_counts_csv = remap_counts_csv cascade_report.aligned_csv = aligned_csv cascade_report.generate() - logger.info('Finished sample %s.', self) def load_sample_info(self): path = Path(self.sample_info_csv) From 1dc42394e37e730300cb462a6f44406647577674 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 25 Apr 2024 16:23:59 -0700 Subject: [PATCH 277/485] Rename `$A_unstitched_$B` to `unstitched_$A_$B` The following names have been changed: - `contigs_unstitched_csv` to `unstitched_contigs_csv` - `remap_unstitched_conseq_csv` to `unstitched_remap_conseq_csv` This will make it easier to split the 
unstitched files from the stitched ones. --- Singularity | 2 +- docs/steps.md | 12 ++++++------ micall/core/denovo.py | 22 +++++++++++----------- micall/core/trim_fastqs.py | 6 +++--- micall/drivers/sample.py | 16 ++++++++-------- micall/monitor/kive_watcher.py | 12 ++++++------ micall/tests/test_kive_watcher.py | 4 ++-- micall_docker.py | 2 +- micall_kive.py | 8 ++++---- release_test_microtest.py | 4 ++-- 10 files changed, 44 insertions(+), 44 deletions(-) diff --git a/Singularity b/Singularity index 1e53cb728..52aa87528 100644 --- a/Singularity +++ b/Singularity @@ -156,7 +156,7 @@ From: python:3.8 conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ genome_coverage_csv genome_coverage_svg genome_concordance_svg \ - remap_unstitched_conseq_csv contigs_unstitched_csv contigs_csv \ + unstitched_remap_conseq_csv unstitched_contigs_csv contigs_csv \ read_entropy_csv conseq_region_csv conseq_stitched_csv KIVE_THREADS 2 KIVE_MEMORY 6000 diff --git a/docs/steps.md b/docs/steps.md index 2b8c2c773..d59c3a916 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -44,9 +44,9 @@ Individual files are described after the list of steps. * in - fastq1 * in - fastq2 * in - merged_contigs.csv - * contigs_unstitched.csv - the assembled contigs, plus any merged contigs, including + * unstitched_contigs.csv - the assembled contigs, plus any merged contigs, including the best blast results - * contigs.csv - stitched version of `contigs_unstitched` + * contigs.csv - stitched version of `unstitched_contigs` * blast.csv - multiple blast results for each contig * `remap`: iteratively use consensus from previous mapping as reference to try and map more reads. See [remap design] for more details. (The denovo version @@ -59,7 +59,7 @@ Individual files are described after the list of steps. each stage. 
* remap_conseq.csv - downloaded - consensus sequence that reads were mapped to on the final iteration - * remap_unstitched_conseq.csv - downloaded - consensus sequence that reads were + * unstitched_remap_conseq.csv - downloaded - consensus sequence that reads were mapped to the unstitched contigs. * unmapped1.fastq - FASTQ format (unstructured text) reads that didn't map to any of the final references. @@ -218,7 +218,7 @@ Individual files are described after the list of steps. * pos - 1-based position in the consensus sequence that this insertion follows * insert - the nucleotide sequence that was inserted * qual - the Phred quality scores for the inserted sequence -* contigs_unstitched.csv +* unstitched_contigs.csv * ref - the reference name with the best BLAST result * match - the fraction of the contig that matched in BLAST, negative for reverse-complemented matches @@ -226,7 +226,7 @@ Individual files are described after the list of steps. the contigs in a sample * contig - the nucleotide sequence of the assembled contig * contigs.csv - Same as `contigs_unstitched.csv`, but contigs are stitched by `micall/core/contig_stitcher.py`. + Same as `unstitched_contigs.csv`, but contigs are stitched by `micall/core/contig_stitcher.py`. * coverage_scores.csv * project - the project this score is defined by * region - the region being displayed @@ -350,7 +350,7 @@ Individual files are described after the list of steps. 
* remap_conseq.csv * region - the region mapped to * sequence - the consensus sequence used -* remap_unstitched_conseq.csv +* unstitched_remap_conseq.csv * region - the region mapped to * sequence - the consensus sequence used * resistance.csv diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 8f3be5acc..e672e6548 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -73,7 +73,7 @@ def contigs_refs_write(writer, ref: str, match: float, group_ref: str, contig: s def write_contig_refs(contigs_fasta_path: str, - contigs_unstitched_csv: Optional[TextIO], + unstitched_contigs_csv: Optional[TextIO], contigs_csv: Optional[TextIO], merged_contigs_csv: Optional[TextIO] = None, blast_csv: Optional[TextIO] = None, @@ -82,7 +82,7 @@ def write_contig_refs(contigs_fasta_path: str, :param str contigs_fasta_path: path to file to read contig sequences from and append merged contigs to - :param contigs_unstitched_csv: open file to write assembled contigs to + :param unstitched_contigs_csv: open file to write assembled contigs to :param contigs_csv: open file to write stitched contigs to :param merged_contigs_csv: open file to read contigs that were merged from amplicon reads @@ -97,8 +97,8 @@ def write_contig_refs(contigs_fasta_path: str, contig_name = f'merged-contig-{i}' contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") - unstitched_writer = init_contigs_refs(contigs_unstitched_csv) \ - if contigs_unstitched_csv else None + unstitched_writer = init_contigs_refs(unstitched_contigs_csv) \ + if unstitched_contigs_csv else None stitched_writer = init_contigs_refs(contigs_csv) if contigs_csv else None group_refs: Dict[str, str] = {} @@ -238,7 +238,7 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): def denovo(fastq1_path: str, fastq2_path: str, - contigs_unstitched_csv: Optional[TextIO], + unstitched_contigs_csv: Optional[TextIO], contigs_csv: Optional[TextIO], work_dir: str = '.', merged_contigs_csv: Optional[TextIO] = 
None, @@ -249,7 +249,7 @@ def denovo(fastq1_path: str, :param fastq1_path: FASTQ file name for read 1 reads :param fastq2_path: FASTQ file name for read 2 reads - :param contigs_unstitched_csv: open file to write assembled contigs to + :param unstitched_contigs_csv: open file to write assembled contigs to :param contigs_csv: open file to write stitched contigs to :param work_dir: path for writing temporary files :param merged_contigs_csv: open file to read contigs that were merged from @@ -258,8 +258,8 @@ def denovo(fastq1_path: str, :param stitcher_plot_path: open file to write the visualizer plot to """ - if contigs_unstitched_csv is None and contigs_csv is None: - raise ValueError("Must specify either contigs_csv or contigs_unstitched_csv") + if unstitched_contigs_csv is None and contigs_csv is None: + raise ValueError("Must specify either contigs_csv or unstitched_contigs_csv") old_tmp_dirs = glob(os.path.join(work_dir, 'assembly_*')) for old_tmp_dir in old_tmp_dirs: @@ -311,7 +311,7 @@ def denovo(fastq1_path: str, os.chdir(start_dir) duration = datetime.now() - start_time contig_count = write_contig_refs(contigs_fasta_path, - contigs_unstitched_csv, + unstitched_contigs_csv, contigs_csv, blast_csv=blast_csv, stitcher_plot_path=stitcher_plot_path) @@ -330,9 +330,9 @@ def denovo(fastq1_path: str, parser = argparse.ArgumentParser() parser.add_argument('fastq1') parser.add_argument('fastq2') - parser.add_argument('--contigs_unstitched', type=argparse.FileType('w')) + parser.add_argument('--unstitched_contigs', type=argparse.FileType('w')) parser.add_argument('--contigs', type=argparse.FileType('w')) parser.add_argument('--stitcher_plot') args = parser.parse_args() - denovo(args.fastq1, args.fastq2, args.contigs_unstitched, args.contigs, args.stitcher_plot_path) + denovo(args.fastq1, args.fastq2, args.unstitched_contigs, args.contigs, args.stitcher_plot_path) diff --git a/micall/core/trim_fastqs.py b/micall/core/trim_fastqs.py index ff063b835..8e30f3b72 100755 --- 
a/micall/core/trim_fastqs.py +++ b/micall/core/trim_fastqs.py @@ -60,9 +60,9 @@ def trim(original_fastq_filenames: typing.Sequence[str], bad_cycles_filename: str, trimmed_fastq_filenames: typing.Sequence[str], use_gzip: bool = True, - summary_file: typing.TextIO = None, - skip: typing.Tuple[str] = (), - project_code: str = None): + summary_file: typing.Optional[typing.TextIO] = None, + skip: typing.Iterable[str] = (), + project_code: typing.Optional[str] = None): """ :param original_fastq_filenames: sequence of two filenames, containing diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index cb1fbc808..e647a809e 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -258,8 +258,8 @@ def process_post_assembly(self, nuc_csv=(self.nuc_csv, 'w'), conseq_ins_csv=(self.conseq_ins_csv, 'r'), remap_conseq_csv=(self.remap_conseq_csv, 'r'), - remap_unstitched_conseq_csv=(self.remap_unstitched_conseq_csv, 'r') if use_denovo else None, - contigs_unstitched_csv=(self.contigs_unstitched_csv, 'r') if use_denovo else None, + unstitched_remap_conseq_csv=(self.unstitched_remap_conseq_csv, 'r') if use_denovo else None, + unstitched_contigs_csv=(self.unstitched_contigs_csv, 'r') if use_denovo else None, contigs_csv=(self.contigs_csv, 'r') if use_denovo else None, nuc_detail_csv=(self.nuc_details_csv, 'w') if use_denovo else None, amino_csv=(self.amino_csv, 'w'), @@ -396,12 +396,12 @@ def run_denovo(self, excluded_seeds): logger.info('Running de novo assembly on %s.', self) scratch_path = self.get_scratch_path() with open(self.merged_contigs_csv) as merged_contigs_csv, \ - open(self.contigs_unstitched_csv, 'w') as contigs_unstitched_csv, \ + open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \ open(self.contigs_csv, 'w') as contigs_csv, \ open(self.blast_csv, 'w') as blast_csv: denovo(self.trimmed1_fastq, self.trimmed2_fastq, - contigs_unstitched_csv, + unstitched_contigs_csv, contigs_csv, self.scratch_path, merged_contigs_csv, @@ -433,19 
+433,19 @@ def run_denovo(self, excluded_seeds): excluded_seeds=excluded_seeds) # Mapping the unstitched version too. - with open(self.contigs_unstitched_csv) as contigs_unstitched_csv, \ + with open(self.unstitched_contigs_csv) as unstitched_contigs_csv, \ open(os.devnull, 'w') as remap_csv, \ open(os.devnull, 'w') as counts_csv, \ - open(self.remap_unstitched_conseq_csv, 'w') as remap_unstitched_conseq_csv, \ + open(self.unstitched_remap_conseq_csv, 'w') as unstitched_remap_conseq_csv, \ open(os.devnull, 'w') as unmapped1, \ open(os.devnull, 'w') as unmapped2: map_to_contigs(self.trimmed1_fastq, self.trimmed2_fastq, - contigs_unstitched_csv, + unstitched_contigs_csv, remap_csv, counts_csv, - remap_unstitched_conseq_csv, + unstitched_remap_conseq_csv, unmapped1, unmapped2, scratch_path, diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index 1a5d98c52..6fbc6da10 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -59,8 +59,8 @@ 'resistance_consensus_csv', 'wg_fasta', 'mid_fasta', - 'remap_unstitched_conseq_csv', - 'contigs_unstitched_csv', + 'unstitched_remap_conseq_csv', + 'unstitched_contigs_csv', 'contigs_csv', 'alignment_svg', 'alignment_png', @@ -939,14 +939,14 @@ def run_proviral_pipeline(self, sample_watcher, folder_watcher, description): run_dataset['argument_name']: run_dataset['dataset'] for run_dataset in main_run['datasets'] if run_dataset['argument_name'] in ('sample_info_csv', - 'remap_unstitched_conseq_csv', - 'contigs_unstitched_csv', + 'unstitched_remap_conseq_csv', + 'unstitched_contigs_csv', 'cascade_csv')} input_datasets = { argument_name: self.kive_retry(lambda: self.session.get(url).json()) for argument_name, url in input_dataset_urls.items()} - input_datasets['conseqs_csv'] = input_datasets.pop('remap_unstitched_conseq_csv') - input_datasets['contigs_csv'] = input_datasets.pop('contigs_unstitched_csv') + input_datasets['conseqs_csv'] = 
input_datasets.pop('unstitched_remap_conseq_csv') + input_datasets['contigs_csv'] = input_datasets.pop('unstitched_contigs_csv') run = self.find_or_launch_run( pipeline_id, input_datasets, diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 89ec37385..831c561e0 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -1759,10 +1759,10 @@ def test_launch_proviral_run(raw_data_with_two_samples, mock_open_kive): argument_name='sample_info_csv'), dict(dataset='/datasets/111/', argument_type='O', - argument_name='contigs_unstitched_csv'), + argument_name='unstitched_contigs_csv'), dict(dataset='/datasets/112/', argument_type='O', - argument_name='remap_unstitched_conseq_csv'), + argument_name='unstitched_remap_conseq_csv'), dict(dataset='/datasets/113/', argument_type='O', argument_name='cascade_csv')]] # run datasets diff --git a/micall_docker.py b/micall_docker.py index b8ee54523..d640a1186 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -1075,7 +1075,7 @@ def collate_samples(run_info: RunInfo): """ filenames = ['remap_counts.csv', 'remap_conseq.csv', - 'remap_unstitched_conseq.csv', + 'unstitched_remap_conseq.csv', 'insertions.csv', 'failed_read.csv', 'nuc.csv', diff --git a/micall_kive.py b/micall_kive.py index 601bcc2ef..5a2a93980 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -80,10 +80,10 @@ def parse_args(): action='store_true', help='Use de novo assembly instead of mapping to ' 'reference sequences.') - parser.add_argument('remap_unstitched_conseq_csv', + parser.add_argument('unstitched_remap_conseq_csv', nargs='?', help='CSV containing mapping unstitched consensus sequences') - parser.add_argument('contigs_unstitched_csv', + parser.add_argument('unstitched_contigs_csv', nargs='?', help='CSV containing contigs built by de novo assembly') parser.add_argument('contigs_csv', @@ -130,8 +130,8 @@ def load_sample(args): coverage_scores_csv=args.coverage_scores_csv, 
aligned_csv=args.aligned_csv, g2p_aligned_csv=args.g2p_aligned_csv, - remap_unstitched_conseq_csv=args.remap_unstitched_conseq_csv, - contigs_unstitched_csv=args.contigs_unstitched_csv, + unstitched_remap_conseq_csv=args.unstitched_remap_conseq_csv, + unstitched_contigs_csv=args.unstitched_contigs_csv, contigs_csv=args.contigs_csv, genome_coverage_csv=args.genome_coverage_csv, genome_coverage_svg=args.genome_coverage_svg, diff --git a/release_test_microtest.py b/release_test_microtest.py index f8745706f..a87ff0a90 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -465,8 +465,8 @@ def process_sample(self, fastq_file: Path): 'genome_coverage.csv', 'genome_coverage.svg', 'genome_concordance.svg', - 'remap_unstitched_conseq.csv', - 'contigs_unstitched.csv', + 'unstitched_remap_conseq.csv', + 'unstitched_contigs.csv', 'contigs.csv', 'read_entropy.csv', 'conseq_region.csv', From bba28035d643e52acca29a77812aa125a42a4ecc Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 25 Apr 2024 16:34:58 -0700 Subject: [PATCH 278/485] Fix some typechecking errors in sample.py --- micall/drivers/sample.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index e647a809e..85c924d4f 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -61,10 +61,10 @@ def open_files(**files): raise IOError -def exclude_extra_seeds(excluded_seeds: typing.Sequence[str], - project_code: str = None) -> typing.Sequence[str]: +def exclude_extra_seeds(excluded_seeds: typing.Iterable[str], + project_code: typing.Optional[str] = None) -> typing.Sequence[str]: if project_code == 'HIVGHA': - return excluded_seeds + return tuple(excluded_seeds) projects = ProjectConfig.loadDefault() hivgha_seeds = projects.getProjectSeeds('HIVGHA') extra_exclusions = {seed @@ -81,7 +81,7 @@ def __init__(self, rank=None, debug_remap=False, scratch_path=None, - skip: typing.Tuple[str] = (), + skip: 
typing.Iterable[str] = (), **paths): """ Record the details. @@ -98,13 +98,13 @@ def __init__(self, fastq1 = paths.get('fastq1') if 'fastq2' in paths: pass - elif 'fastq1' in paths: + elif fastq1: if '_R1_' not in fastq1: raise ValueError( "fastq2 not given, and fastq1 does not contain '_R1_'.") paths['fastq2'] = fastq1.replace('_R1_', '_R2_') if fastq1: - self.name = '_'.join(os.path.basename(fastq1).split('_')[:2]) + self.name: typing.Optional[str] = '_'.join(os.path.basename(fastq1).split('_')[:2]) else: self.name = None self.basespace_id = basespace_id @@ -158,8 +158,8 @@ def get_scratch_path(self): def process(self, pssm, - excluded_seeds=(), - excluded_projects=(), + excluded_seeds: typing.Iterable[str] = (), + excluded_projects: typing.Iterable[str] = (), force_gzip=False, use_denovo=False): """ Process a single sample. @@ -236,7 +236,7 @@ def process(self, def process_post_assembly(self, use_denovo: bool, - excluded_projects=(), + excluded_projects: typing.Iterable[str] = (), ): logger.info('Running sam2aln on %s.', self) with open(self.remap_csv) as remap_csv, \ From 792c3359e6c3b865383d3aa458d3dbd8ff3c8d92 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 25 Apr 2024 16:27:52 -0700 Subject: [PATCH 279/485] Split unstitched files from the stitched ones This change basically forks all generated files after denovo step. One branch will generate the stitched files, as previously, but the other one will also generate the unstitched files, which will be named the same as the stitched ones, except for the "unstitched_" prefix.
--- micall/drivers/sample.py | 137 ++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 58 deletions(-) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index 85c924d4f..fecffd769 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -24,6 +24,11 @@ logger = logging.getLogger(__name__) +def prepend_prefix_to_basename(prefix: str, path: str): + dir_name, base_name = os.path.split(path) + return os.path.join(dir_name, prefix + base_name) + + @contextmanager def open_files(**files): """ Context manager that will open files and close them at the end. @@ -229,21 +234,32 @@ def process(self, else: self.run_mapping(excluded_seeds) - return self.process_post_assembly(use_denovo=use_denovo, - excluded_projects=excluded_projects) + self.process_post_assembly(prefix="", + use_denovo=use_denovo, + excluded_projects=excluded_projects) + + if use_denovo: + self.process_post_assembly(prefix="unstitched_", + use_denovo=use_denovo, + excluded_projects=excluded_projects) logger.info('Finished sample %s.', self) def process_post_assembly(self, use_denovo: bool, - excluded_projects: typing.Iterable[str] = (), + excluded_projects: typing.Iterable[str], + prefix: str, ): + + def with_prefix(path): + return prepend_prefix_to_basename(prefix, path) + logger.info('Running sam2aln on %s.', self) - with open(self.remap_csv) as remap_csv, \ - open(self.aligned_csv, 'w') as aligned_csv, \ - open(self.conseq_ins_csv, 'w') as conseq_ins_csv, \ - open(self.failed_csv, 'w') as failed_csv, \ - open(self.clipping_csv, 'w') as clipping_csv: + with open(with_prefix(self.remap_csv)) as remap_csv, \ + open(with_prefix(self.aligned_csv), 'w') as aligned_csv, \ + open(with_prefix(self.conseq_ins_csv), 'w') as conseq_ins_csv, \ + open(with_prefix(self.failed_csv), 'w') as failed_csv, \ + open(with_prefix(self.clipping_csv), 'w') as clipping_csv: sam2aln(remap_csv, aligned_csv, @@ -252,34 +268,35 @@ def process_post_assembly(self, 
clipping_csv=clipping_csv) logger.info('Running aln2counts on %s.', self) - with open_files(aligned_csv=(self.aligned_csv, 'r'), + with open_files(aligned_csv=(with_prefix(self.aligned_csv), 'r'), + + # Does not need a prefix because it is produced before the denovo/remap split. g2p_aligned_csv=(self.g2p_aligned_csv, 'r'), - clipping_csv=(self.clipping_csv, 'r'), - nuc_csv=(self.nuc_csv, 'w'), - conseq_ins_csv=(self.conseq_ins_csv, 'r'), - remap_conseq_csv=(self.remap_conseq_csv, 'r'), - unstitched_remap_conseq_csv=(self.unstitched_remap_conseq_csv, 'r') if use_denovo else None, - unstitched_contigs_csv=(self.unstitched_contigs_csv, 'r') if use_denovo else None, - contigs_csv=(self.contigs_csv, 'r') if use_denovo else None, - nuc_detail_csv=(self.nuc_details_csv, 'w') if use_denovo else None, - amino_csv=(self.amino_csv, 'w'), - amino_detail_csv=(self.amino_details_csv, 'w') if use_denovo else None, - insertions_csv=(self.insertions_csv, 'w'), - conseq_csv=(self.conseq_csv, 'w'), - conseq_region_csv=(self.conseq_region_csv, 'w') if use_denovo else None, - failed_align_csv=(self.failed_align_csv, 'w'), - coverage_summary_csv=(self.coverage_summary_csv, 'w'), - genome_coverage_csv=(self.genome_coverage_csv, 'w'), - conseq_all_csv=(self.conseq_all_csv, 'w'), - conseq_stitched_csv=(self.conseq_stitched_csv, 'w') if use_denovo else None, - minimap_hits_csv=(self.minimap_hits_csv, 'w'), - alignments_csv=(self.alignments_csv, 'w'), - alignments_unmerged_csv=(self.alignments_unmerged_csv, 'w'), - alignments_intermediate_csv=(self.alignments_intermediate_csv, 'w'), - alignments_overall_csv=(self.alignments_overall_csv, 'w'), - concordance_csv=(self.concordance_csv, 'w'), - concordance_detailed_csv=(self.concordance_detailed_csv, 'w'), - concordance_seed_csv=(self.concordance_seed_csv, 'w')) as opened_files: + + clipping_csv=(with_prefix(self.clipping_csv), 'r'), + nuc_csv=(with_prefix(self.nuc_csv), 'w'), + conseq_ins_csv=(with_prefix(self.conseq_ins_csv), 'r'), + 
remap_conseq_csv=(with_prefix(self.remap_conseq_csv), 'r'), + contigs_csv=(with_prefix(self.contigs_csv), 'r') if use_denovo else None, + nuc_detail_csv=(with_prefix(self.nuc_details_csv), 'w') if use_denovo else None, + amino_csv=(with_prefix(self.amino_csv), 'w'), + amino_detail_csv=(with_prefix(self.amino_details_csv), 'w') if use_denovo else None, + insertions_csv=(with_prefix(self.insertions_csv), 'w'), + conseq_csv=(with_prefix(self.conseq_csv), 'w'), + conseq_region_csv=(with_prefix(self.conseq_region_csv), 'w') if use_denovo else None, + failed_align_csv=(with_prefix(self.failed_align_csv), 'w'), + coverage_summary_csv=(with_prefix(self.coverage_summary_csv), 'w'), + genome_coverage_csv=(with_prefix(self.genome_coverage_csv), 'w'), + conseq_all_csv=(with_prefix(self.conseq_all_csv), 'w'), + conseq_stitched_csv=(with_prefix(self.conseq_stitched_csv), 'w') if use_denovo else None, + minimap_hits_csv=(with_prefix(self.minimap_hits_csv), 'w'), + alignments_csv=(with_prefix(self.alignments_csv), 'w'), + alignments_unmerged_csv=(with_prefix(self.alignments_unmerged_csv), 'w'), + alignments_intermediate_csv=(with_prefix(self.alignments_intermediate_csv), 'w'), + alignments_overall_csv=(with_prefix(self.alignments_overall_csv), 'w'), + concordance_csv=(with_prefix(self.concordance_csv), 'w'), + concordance_detailed_csv=(with_prefix(self.concordance_detailed_csv), 'w'), + concordance_seed_csv=(with_prefix(self.concordance_seed_csv), 'w')) as opened_files: aln2counts(opened_files['aligned_csv'], opened_files['nuc_csv'], @@ -309,40 +326,42 @@ def process_post_assembly(self, concordance_seed_csv=opened_files['concordance_seed_csv']) logger.info('Running coverage_plots on %s.', self) - os.makedirs(self.coverage_maps) - with open(self.amino_csv) as amino_csv, \ - open(self.coverage_scores_csv, 'w') as coverage_scores_csv: + os.makedirs(with_prefix(self.coverage_maps)) + with open(with_prefix(self.amino_csv)) as amino_csv, \ + open(with_prefix(self.coverage_scores_csv), 
'w') as coverage_scores_csv: coverage_plot(amino_csv, coverage_scores_csv, coverage_maps_path=self.coverage_maps, coverage_maps_prefix=self.name, excluded_projects=excluded_projects) - with open(self.genome_coverage_csv) as genome_coverage_csv, \ - open(self.minimap_hits_csv) as minimap_hits_csv: + with open(with_prefix(self.genome_coverage_csv)) as genome_coverage_csv, \ + open(with_prefix(self.minimap_hits_csv)) as minimap_hits_csv: if not use_denovo: minimap_hits_csv = None plot_genome_coverage(genome_coverage_csv, minimap_hits_csv, - self.genome_coverage_svg) + with_prefix(self.genome_coverage_svg)) - with open(self.genome_coverage_csv) as genome_coverage_csv, \ - open(self.minimap_hits_csv) as minimap_hits_csv: + with open(with_prefix(self.genome_coverage_csv)) as genome_coverage_csv, \ + open(with_prefix(self.minimap_hits_csv)) as minimap_hits_csv: if not use_denovo: minimap_hits_csv = None plot_genome_coverage(genome_coverage_csv, minimap_hits_csv, - self.genome_concordance_svg, + with_prefix(self.genome_concordance_svg), use_concordance=True) - with open(self.concordance_detailed_csv) as concordance_detailed_csv: - concordance_plot(concordance_detailed_csv, plot_path=self.coverage_maps, concordance_prefix=self.name) + with open(with_prefix(self.concordance_detailed_csv)) as concordance_detailed_csv: + concordance_plot(concordance_detailed_csv, + plot_path=with_prefix(self.coverage_maps), + concordance_prefix=self.name) logger.info('Running cascade_report on %s.', self) with open(self.g2p_summary_csv) as g2p_summary_csv, \ - open(self.remap_counts_csv) as remap_counts_csv, \ - open(self.aligned_csv) as aligned_csv, \ - open(self.cascade_csv, 'w') as cascade_csv: + open(with_prefix(self.remap_counts_csv)) as remap_counts_csv, \ + open(with_prefix(self.aligned_csv)) as aligned_csv, \ + open(with_prefix(self.cascade_csv), 'w') as cascade_csv: cascade_report = CascadeReport(cascade_csv) cascade_report.g2p_summary_csv = g2p_summary_csv 
cascade_report.remap_counts_csv = remap_counts_csv @@ -432,20 +451,22 @@ def run_denovo(self, excluded_seeds): debug_file_prefix=debug_file_prefix, excluded_seeds=excluded_seeds) - # Mapping the unstitched version too. - with open(self.unstitched_contigs_csv) as unstitched_contigs_csv, \ - open(os.devnull, 'w') as remap_csv, \ - open(os.devnull, 'w') as counts_csv, \ - open(self.unstitched_remap_conseq_csv, 'w') as unstitched_remap_conseq_csv, \ - open(os.devnull, 'w') as unmapped1, \ - open(os.devnull, 'w') as unmapped2: + def with_prefix(path): + return prepend_prefix_to_basename("unstitched_", path) + + with open(self.unstitched_contigs_csv) as contigs_csv, \ + open(with_prefix(self.remap_csv), 'w') as remap_csv, \ + open(with_prefix(self.remap_counts_csv), 'w') as counts_csv, \ + open(with_prefix(self.remap_conseq_csv), 'w') as remap_conseq_csv, \ + open(with_prefix(self.unmapped1_fastq), 'w') as unmapped1, \ + open(with_prefix(self.unmapped2_fastq), 'w') as unmapped2: map_to_contigs(self.trimmed1_fastq, self.trimmed2_fastq, - unstitched_contigs_csv, + contigs_csv, remap_csv, counts_csv, - unstitched_remap_conseq_csv, + remap_conseq_csv, unmapped1, unmapped2, scratch_path, From b444bdfc2478ca1848c8c3f4b17c807801e05728 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Apr 2024 09:06:09 -0700 Subject: [PATCH 280/485] Download unstitched_conseq from Kive instead of .._remap_conseq --- micall/monitor/kive_watcher.py | 6 +++--- micall/tests/test_kive_watcher.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index 6fbc6da10..fd22956fc 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -59,7 +59,7 @@ 'resistance_consensus_csv', 'wg_fasta', 'mid_fasta', - 'unstitched_remap_conseq_csv', + 'unstitched_conseq_csv', 'unstitched_contigs_csv', 'contigs_csv', 'alignment_svg', @@ -939,13 +939,13 @@ def run_proviral_pipeline(self, sample_watcher, 
folder_watcher, description): run_dataset['argument_name']: run_dataset['dataset'] for run_dataset in main_run['datasets'] if run_dataset['argument_name'] in ('sample_info_csv', - 'unstitched_remap_conseq_csv', + 'unstitched_conseq_csv', 'unstitched_contigs_csv', 'cascade_csv')} input_datasets = { argument_name: self.kive_retry(lambda: self.session.get(url).json()) for argument_name, url in input_dataset_urls.items()} - input_datasets['conseqs_csv'] = input_datasets.pop('unstitched_remap_conseq_csv') + input_datasets['conseqs_csv'] = input_datasets.pop('unstitched_conseq_csv') input_datasets['contigs_csv'] = input_datasets.pop('unstitched_contigs_csv') run = self.find_or_launch_run( pipeline_id, diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 831c561e0..64a1e542b 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -1762,7 +1762,7 @@ def test_launch_proviral_run(raw_data_with_two_samples, mock_open_kive): argument_name='unstitched_contigs_csv'), dict(dataset='/datasets/112/', argument_type='O', - argument_name='unstitched_remap_conseq_csv'), + argument_name='unstitched_conseq_csv'), dict(dataset='/datasets/113/', argument_type='O', argument_name='cascade_csv')]] # run datasets From 970dc5da0f2efca7dc7284adfe991bfa6a5630ed Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 26 Apr 2024 13:25:22 -0700 Subject: [PATCH 281/485] Replace all usages of unstitched_remap_conseq by unstitched_conseq --- Singularity | 2 +- docs/steps.md | 4 ++-- micall_docker.py | 2 +- micall_kive.py | 4 ++-- release_test_microtest.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Singularity b/Singularity index 52aa87528..3b267501c 100644 --- a/Singularity +++ b/Singularity @@ -156,7 +156,7 @@ From: python:3.8 conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ genome_coverage_csv genome_coverage_svg 
genome_concordance_svg \ - unstitched_remap_conseq_csv unstitched_contigs_csv contigs_csv \ + unstitched_conseq_csv unstitched_contigs_csv contigs_csv \ read_entropy_csv conseq_region_csv conseq_stitched_csv KIVE_THREADS 2 KIVE_MEMORY 6000 diff --git a/docs/steps.md b/docs/steps.md index d59c3a916..747f7665c 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -59,7 +59,7 @@ Individual files are described after the list of steps. each stage. * remap_conseq.csv - downloaded - consensus sequence that reads were mapped to on the final iteration - * unstitched_remap_conseq.csv - downloaded - consensus sequence that reads were + * unstitched_conseq.csv - downloaded - consensus sequence that reads were mapped to the unstitched contigs. * unmapped1.fastq - FASTQ format (unstructured text) reads that didn't map to any of the final references. @@ -350,7 +350,7 @@ Individual files are described after the list of steps. * remap_conseq.csv * region - the region mapped to * sequence - the consensus sequence used -* unstitched_remap_conseq.csv +* unstitched_conseq.csv * region - the region mapped to * sequence - the consensus sequence used * resistance.csv diff --git a/micall_docker.py b/micall_docker.py index d640a1186..2a5f88fee 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -1075,7 +1075,7 @@ def collate_samples(run_info: RunInfo): """ filenames = ['remap_counts.csv', 'remap_conseq.csv', - 'unstitched_remap_conseq.csv', + 'unstitched_conseq.csv', 'insertions.csv', 'failed_read.csv', 'nuc.csv', diff --git a/micall_kive.py b/micall_kive.py index 5a2a93980..60275cc11 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -80,7 +80,7 @@ def parse_args(): action='store_true', help='Use de novo assembly instead of mapping to ' 'reference sequences.') - parser.add_argument('unstitched_remap_conseq_csv', + parser.add_argument('unstitched_conseq_csv', nargs='?', help='CSV containing mapping unstitched consensus sequences') parser.add_argument('unstitched_contigs_csv', @@ -130,7 
+130,7 @@ def load_sample(args): coverage_scores_csv=args.coverage_scores_csv, aligned_csv=args.aligned_csv, g2p_aligned_csv=args.g2p_aligned_csv, - unstitched_remap_conseq_csv=args.unstitched_remap_conseq_csv, + unstitched_conseq_csv=args.unstitched_conseq_csv, unstitched_contigs_csv=args.unstitched_contigs_csv, contigs_csv=args.contigs_csv, genome_coverage_csv=args.genome_coverage_csv, diff --git a/release_test_microtest.py b/release_test_microtest.py index a87ff0a90..d9e6440af 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -465,7 +465,7 @@ def process_sample(self, fastq_file: Path): 'genome_coverage.csv', 'genome_coverage.svg', 'genome_concordance.svg', - 'unstitched_remap_conseq.csv', + 'unstitched_conseq.csv', 'unstitched_contigs.csv', 'contigs.csv', 'read_entropy.csv', From e43a9e8b36fac2779bc3237051a2967769908f75 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 07:55:21 +0000 Subject: [PATCH 282/485] Bump pytest from 8.1.1 to 8.2.0 Bumps [pytest](https://github.com/pytest-dev/pytest) from 8.1.1 to 8.2.0. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.1.1...8.2.0) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 5cd1368c6..ee0d4e026 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,7 +4,7 @@ -r requirements-watcher.txt mypy==1.9.0 mypy-extensions==1.0.0 -pytest==8.1.1 +pytest==8.2.0 coverage==7.4.4 pandas==2.0.2 seaborn==0.13.2 From 563f77028063e38604371b1b5542f3cda0efc8d0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 29 Apr 2024 09:15:29 -0700 Subject: [PATCH 283/485] Update micall_docker.py to recognize the unstitched_conseq --- micall_docker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/micall_docker.py b/micall_docker.py index 2a5f88fee..f27ec9f6b 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -1075,7 +1075,6 @@ def collate_samples(run_info: RunInfo): """ filenames = ['remap_counts.csv', 'remap_conseq.csv', - 'unstitched_conseq.csv', 'insertions.csv', 'failed_read.csv', 'nuc.csv', @@ -1096,7 +1095,7 @@ def collate_samples(run_info: RunInfo): 'concordance.csv', 'concordance_seed.csv'] if run_info.is_denovo: - filenames += ['conseq_stitched.csv', 'conseq_region.csv'] + filenames += ['conseq_stitched.csv', 'conseq_region.csv', 'unstitched_conseq.csv'] for filename in filenames: out_path = run_info.output_path with open(os.path.join(out_path, filename), 'w') as fout: From 8c7cdef38f90fa2032462f55bb2944c93d348f9b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 17:28:49 +0000 Subject: [PATCH 284/485] Bump coverage from 7.4.4 to 7.5.0 Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.4.4 to 7.5.0. 
- [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.4.4...7.5.0) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index ee0d4e026..3abc899ba 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,7 +5,7 @@ mypy==1.9.0 mypy-extensions==1.0.0 pytest==8.2.0 -coverage==7.4.4 +coverage==7.5.0 pandas==2.0.2 seaborn==0.13.2 ete3 From 0b1794df428fa048f83d4e327a3092ddff53b0f6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 18:07:38 +0000 Subject: [PATCH 285/485] Bump mypy from 1.9.0 to 1.10.0 Bumps [mypy](https://github.com/python/mypy) from 1.9.0 to 1.10.0. - [Changelog](https://github.com/python/mypy/blob/master/CHANGELOG.md) - [Commits](https://github.com/python/mypy/compare/1.9.0...v1.10.0) --- updated-dependencies: - dependency-name: mypy dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- requirements-test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index f8e370d54..6a5be4fb9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,5 +3,5 @@ -r requirements-watcher.txt # Used for plotting profiling results. 
gprof2dot==2022.7.29 -mypy==1.9.0 +mypy==1.10.0 mypy-extensions==1.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 3abc899ba..e9ee24556 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,7 +2,7 @@ -r requirements.txt -r requirements-watcher.txt -mypy==1.9.0 +mypy==1.10.0 mypy-extensions==1.0.0 pytest==8.2.0 coverage==7.5.0 From 4b5e23065ef231a2440c358ef638faa36c51f5d5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 May 2024 15:21:06 -0700 Subject: [PATCH 286/485] micall_docker: do not spawn subprocesses for n=1 --- micall_docker.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/micall_docker.py b/micall_docker.py index f27ec9f6b..18a70829c 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -584,18 +584,21 @@ def process_run(run_info, args): logger.info('Summarizing run.') run_summary = summarize_run(run_info) - with ProcessPoolExecutor(max_workers=args.max_active) as pool: - for _ in pool.map(functools.partial(process_sample, - args=args, - pssm=pssm, - use_denovo=run_info.is_denovo), - run_info.get_all_samples()): - pass - - for _ in pool.map(functools.partial(process_resistance, - run_info=run_info), - run_info.sample_groups): - pass + def runner(func, inputs): + if args.max_active > 1 and len(inputs) > 1: + with ProcessPoolExecutor(max_workers=args.max_active) as pool: + list(pool.map(func, inputs)) + else: + list(map(func, inputs)) + + runner(functools.partial(process_sample, + args=args, + pssm=pssm, + use_denovo=run_info.is_denovo), + run_info.get_all_samples()) + + runner(functools.partial(process_resistance, run_info=run_info), + run_info.sample_groups) collate_samples(run_info) if run_summary is not None: From 9ae95d3d873eb7ab76b3f95a76cb2ba9e52af557 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 May 2024 16:01:13 -0700 Subject: [PATCH 287/485] docs: pin jekyll theme version to prevent unexpected changes --- docs/_config.yml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_config.yml b/docs/_config.yml index f8f858e88..e807070c5 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1,2 +1,2 @@ -remote_theme: chrisrhymes/bulma-clean-theme +remote_theme: chrisrhymes/bulma-clean-theme:v0.14.0 title: MiCall From a180d4d4ed5d4a935000a55f43d9826660254849 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 9 May 2024 15:19:16 -0700 Subject: [PATCH 288/485] Fix contig stitcher CLI help message for output arg --- micall/core/contig_stitcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 8cec40242..665730e60 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -603,7 +603,7 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument('contigs', type=argparse.FileType('r'), help="Input fasta file with assembled contigs.") parser.add_argument('stitched_contigs', type=argparse.FileType('w'), - help="Output fasta file with stitched contigs.") + help="Output CSV file with stitched contigs.") parser.add_argument('--plot', type=argparse.FileType('w'), help="Output SVG image visualizing the stitching process.") verbosity_group = parser.add_mutually_exclusive_group() From a9d990d8db6f8459f6362586ccecec82b706bfb4 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 May 2024 13:54:04 -0700 Subject: [PATCH 289/485] Contig stitcher: fix reverse complement logging --- micall/core/contig_stitcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 665730e60..e0f5204a5 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -185,12 +185,13 @@ def init_hit(x) -> Tuple[CigarHit, Literal["forward", "reverse"]]: strand = hits_array[0][1] if strand == "reverse": rc = str(Seq.Seq(contig.seq).reverse_complement()) + original_contig = contig 
new_contig = replace(contig, seq=rc) contig = new_contig hits_array = [(replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1), strand) for hit, strand in hits_array] - log(events.ReverseComplement(contig, new_contig)) + log(events.ReverseComplement(original_contig, new_contig)) for i, (hit, strand) in enumerate(hits_array): log(events.InitialHit(contig, i, hit, strand)) From 06826a6650d2723366bafaf0053804ff83705205 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 May 2024 15:52:40 -0700 Subject: [PATCH 290/485] Contig stitcher: rely on unique identifiers instead of contig names --- micall/core/contig_stitcher.py | 38 +- micall/core/plot_contigs.py | 443 +++++++++--------- ...test_correct_processing_complex_nogaps.svg | 24 +- ...stitching_of_non_overlapping_contigs_2.svg | 20 +- ...ching_contig_with_big_noncovered_gap_2.svg | 20 +- micall/tests/test_contig_stitcher.py | 7 +- micall/tests/test_contig_stitcher_fuzz.py | 2 +- micall/utils/contig_stitcher_context.py | 51 +- micall/utils/contig_stitcher_contigs.py | 30 +- micall/utils/contig_stitcher_events.py | 74 +-- 10 files changed, 381 insertions(+), 328 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index e0f5204a5..0e1197c91 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -31,8 +31,8 @@ def cut_query(self: GenotypedContig, cut_point: float) -> Tuple[GenotypedContig, """ Cuts query sequence in two parts with cut_point between them. 
""" cut_point = max(0.0, cut_point) - left = replace(self, name=context.get().generate_new_name(), seq=self.seq[:ceil(cut_point)]) - right = replace(self, name=context.get().generate_new_name(), seq=self.seq[ceil(cut_point):]) + left = replace(self, name=None, seq=self.seq[:ceil(cut_point)]) + right = replace(self, name=None, seq=self.seq[ceil(cut_point):]) return left, right @@ -40,8 +40,8 @@ def cut_reference(self: AlignedContig, cut_point: float) -> Tuple[AlignedContig, """ Cuts this alignment in two parts with cut_point between them. """ alignment_left, alignment_right = self.alignment.cut_reference(cut_point) - left = replace(self, name=context.get().generate_new_name(), alignment=alignment_left) - right = replace(self, name=context.get().generate_new_name(), alignment=alignment_right) + left = replace(self, name=None, alignment=alignment_left) + right = replace(self, name=None, alignment=alignment_right) log(events.Cut(self, left, right, cut_point)) return left, right @@ -93,7 +93,7 @@ def munge(self: AlignedContig, other: AlignedContig) -> AlignedContig: match_fraction = min(self.match_fraction, other.match_fraction) ref_name = max([self, other], key=lambda x: x.alignment.ref_length).ref_name query = GenotypedContig(seq=self.seq + other.seq, - name=context.get().generate_new_name(), + name=None, ref_name=ref_name, group_ref=self.group_ref, ref_seq=self.ref_seq, @@ -199,7 +199,7 @@ def init_hit(x) -> Tuple[CigarHit, Literal["forward", "reverse"]]: log(events.HitNumber(contig, hits_array, connected)) for i, single_hit in enumerate(connected): - query = replace(contig, name=context.get().generate_new_name()) + query = replace(contig, name=None) part = AlignedContig.make(query, single_hit, strand) log(events.ConnectedHit(contig, part, i)) yield part @@ -207,32 +207,32 @@ def init_hit(x) -> Tuple[CigarHit, Literal["forward", "reverse"]]: def strip_conflicting_mappings(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: contigs = list(contigs) - 
names = {contig.name: contig for contig in contigs} + names = {contig.id: contig for contig in contigs} - def get_indexes(name: str) -> Tuple[int, int]: - contig = names[name] + def get_indexes(id: int) -> Tuple[int, int]: + contig = names[id] if isinstance(contig, AlignedContig): return contig.alignment.q_st, contig.alignment.r_st else: return -1, -1 - reference_sorted = list(sorted(names.keys(), key=lambda name: get_indexes(name)[1])) - query_sorted = list(sorted(names.keys(), key=lambda name: get_indexes(name)[0])) + reference_sorted = list(sorted(names.keys(), key=lambda id: get_indexes(id)[1])) + query_sorted = list(sorted(names.keys(), key=lambda id: get_indexes(id)[0])) - def is_out_of_order(name: str) -> bool: - return reference_sorted.index(name) != query_sorted.index(name) + def is_out_of_order(id: int) -> bool: + return reference_sorted.index(id) != query_sorted.index(id) - sorted_by_query = sorted(contigs, key=lambda contig: get_indexes(contig.name)) + sorted_by_query = sorted(contigs, key=lambda contig: get_indexes(contig.id)) for prev_contig, contig, next_contig in sliding_window(sorted_by_query): if isinstance(contig, AlignedContig): original = contig start = prev_contig.alignment.q_ei + 1 if isinstance(prev_contig, AlignedContig) else 0 end = next_contig.alignment.q_st - 1 if isinstance(next_contig, AlignedContig) else len(contig.seq) - 1 - if prev_contig is not None or is_out_of_order(original.name): + if prev_contig is not None or is_out_of_order(original.id): contig = lstrip(contig) log(events.InitialStrip(original, start, original.alignment.q_st - 1)) - if next_contig is not None or is_out_of_order(original.name): + if next_contig is not None or is_out_of_order(original.id): contig = rstrip(contig) log(events.InitialStrip(original, original.alignment.q_ei + 1, end)) @@ -475,7 +475,7 @@ def calculate_cumulative_coverage(others) -> List[Tuple[int, int]]: current_interval = (current.alignment.r_st, current.alignment.r_ei) # Create a map of 
cumulative coverage for contigs - overlaping_contigs = [x for x in contigs if x != current and overlap(current, x)] + overlaping_contigs = [x for x in contigs if x.id != current.id and overlap(current, x)] cumulative_coverage = calculate_cumulative_coverage(overlaping_contigs) # Check if the current contig is covered by the cumulative coverage intervals @@ -562,7 +562,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCont contigs = list(contigs) for contig in contigs: log(events.Intro(contig)) - context.get().nameset.add(contig.name) + contig.register() maybe_aligned = list(align_all_to_reference(contigs)) @@ -602,7 +602,7 @@ def main(args): from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. parser = argparse.ArgumentParser() - parser.add_argument('contigs', type=argparse.FileType('r'), help="Input fasta file with assembled contigs.") + parser.add_argument('contigs', type=argparse.FileType('r'), help="Input FASTA file with assembled contigs.") parser.add_argument('stitched_contigs', type=argparse.FileType('w'), help="Output CSV file with stitched contigs.") parser.add_argument('--plot', type=argparse.FileType('w'), diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index cc879b3f3..4e12edbb8 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -22,6 +22,7 @@ from micall.core.project_config import ProjectConfig from micall.utils.alignment_wrapper import align_nucs from micall.utils.contig_stitcher_contigs import Contig, GenotypedContig, AlignedContig +from micall.utils.contig_stitcher_context import StitcherContext, context import micall.utils.contig_stitcher_events as events from micall.data.landmark_reader import LandmarkReader @@ -400,36 +401,37 @@ def build_coverage_figure(genome_coverage_csv, blast_csv=None, use_concordance=F def plot_stitcher_coverage(logs: Iterable[events.EventType], genome_coverage_svg_path: str): - f = 
build_stitcher_figure(logs) - f.show(w=970).save_svg(genome_coverage_svg_path, context=draw.Context(invert_y=True)) - return f + with StitcherContext.stage(): + f = build_stitcher_figure(logs) + f.show(w=970).save_svg(genome_coverage_svg_path, context=draw.Context(invert_y=True)) + return f def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure: - complete_contig_map: Dict[str, GenotypedContig] = {} - name_map: Dict[str, str] = {} - complete_parent_graph: Dict[str, List[str]] = {} - alive_set: Set[str] = set() - morphism_graph: Dict[str, List[str]] = {} - reduced_parent_graph: Dict[str, List[str]] = {} - transitive_parent_graph: Dict[str, List[str]] = {} - discarded: List[str] = [] - unknown: List[str] = [] - anomaly: List[str] = [] - unaligned_map: Dict[str, List[CigarHit]] = {} - overlaps_list: List[str] = [] - overlap_leftparent_map: Dict[str, str] = {} - overlap_rightparent_map: Dict[str, str] = {} - overlap_lefttake_map: Dict[str, str] = {} - overlap_righttake_map: Dict[str, str] = {} - overlap_left_sibling: Dict[str, str] = {} - overlap_right_sibling: Dict[str, str] = {} - combine_left_edge: Dict[str, str] = {} - combine_right_edge: Dict[str, str] = {} - children_join_points: List[str] = [] - query_position_map: Dict[str, Tuple[int, int]] = {} - lstrip_map: Dict[str, str] = {} - rstrip_map: Dict[str, str] = {} + complete_contig_map: Dict[int, GenotypedContig] = {} + name_map: Dict[int, str] = {} + complete_parent_graph: Dict[int, List[int]] = {} + alive_set: Set[int] = set() + morphism_graph: Dict[int, List[int]] = {} + reduced_parent_graph: Dict[int, List[int]] = {} + transitive_parent_graph: Dict[int, List[int]] = {} + discarded: List[int] = [] + unknown: List[int] = [] + anomaly: List[int] = [] + unaligned_map: Dict[int, List[CigarHit]] = {} + overlaps_list: List[int] = [] + overlap_leftparent_map: Dict[int, int] = {} + overlap_rightparent_map: Dict[int, int] = {} + overlap_lefttake_map: Dict[int, int] = {} + overlap_righttake_map: Dict[int, 
int] = {} + overlap_left_sibling: Dict[int, int] = {} + overlap_right_sibling: Dict[int, int] = {} + combine_left_edge: Dict[int, int] = {} + combine_right_edge: Dict[int, int] = {} + children_join_points: List[int] = [] + query_position_map: Dict[int, Tuple[int, int]] = {} + lstrip_map: Dict[int, int] = {} + rstrip_map: Dict[int, int] = {} def remove_intermediate_edges(graph): tr_cl = transitive_closure(graph) @@ -448,7 +450,9 @@ def remove_transitive_edges(graph): for parent, children in graph.items(): lst = [] for child in children: - is_transitive = any(child in tr_cl.get(other_node, []) for other_node in children if other_node != child) + is_transitive = any(child in tr_cl.get(other_node, []) + for other_node in children + if other_node != child) if not is_transitive: lst.append(child) ret[parent] = lst @@ -522,33 +526,34 @@ def symmetric_closure(graph): return graph_sum(graph, inverse_graph(graph)) def record_contig(contig: GenotypedContig, parents: Iterable[GenotypedContig]): - complete_contig_map[contig.name] = contig - if [contig.name] != [parent.name for parent in parents]: + complete_contig_map[contig.id] = contig + if [contig.id] != [parent.id for parent in parents]: for parent in parents: - complete_contig_map[parent.name] = parent - if contig.name not in complete_parent_graph: - complete_parent_graph[contig.name] = [] + complete_contig_map[parent.id] = parent + if contig.id not in complete_parent_graph: + complete_parent_graph[contig.id] = [] - complete_parent_graph[contig.name].append(parent.name) + complete_parent_graph[contig.id].append(parent.id) def record_alive(contig: Contig): - alive_set.add(contig.name) + alive_set.add(contig.id) - def record_bad_contig(contig: GenotypedContig, lst: List[str]): - complete_contig_map[contig.name] = contig - if contig.name not in lst: - lst.append(contig.name) + def record_bad_contig(contig: GenotypedContig, lst: List[int]): + complete_contig_map[contig.id] = contig + if contig.id not in lst: + 
lst.append(contig.id) def record_lstrip(result: AlignedContig, original: AlignedContig): - lstrip_map[result.name] = original.name + lstrip_map[result.id] = original.id def record_rstrip(result: AlignedContig, original: AlignedContig): - rstrip_map[result.name] = original.name + rstrip_map[result.id] = original.id def hit_to_insertions(contig: GenotypedContig, hit: CigarHit): yield CigarHit.from_default_alignment(q_st=0, q_ei=hit.q_st - 1, r_st=hit.r_st, r_ei=hit.r_st - 1) yield from hit.insertions() - yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, r_st=hit.r_ei + 1, r_ei=hit.r_ei) + yield CigarHit.from_default_alignment(q_st=hit.q_ei + 1, q_ei=len(contig.seq) - 1, + r_st=hit.r_ei + 1, r_ei=hit.r_ei) def hits_to_insertions(contig: GenotypedContig, hits: List[CigarHit]): for hit in hits: @@ -556,7 +561,7 @@ def hits_to_insertions(contig: GenotypedContig, hits: List[CigarHit]): def record_initial_hit(contig: GenotypedContig, hits: List[CigarHit]): insertions = [gap for gap in hits_to_insertions(contig, hits)] - unaligned_map[contig.name] = insertions + unaligned_map[contig.id] = insertions for event in logs: if isinstance(event, events.FinalCombine): @@ -599,14 +604,14 @@ def record_initial_hit(contig: GenotypedContig, hits: List[CigarHit]): elif isinstance(event, events.InitialStrip): pass elif isinstance(event, events.Overlap): - overlaps_list.append(event.left_overlap.name) - overlaps_list.append(event.right_overlap.name) - overlap_leftparent_map[event.left_remainder.name] = event.left.name - overlap_rightparent_map[event.right_remainder.name] = event.right.name - overlap_lefttake_map[event.left_remainder.name] = event.left_take.name - overlap_righttake_map[event.right_remainder.name] = event.right_take.name - overlap_left_sibling[event.left_remainder.name] = event.right_remainder.name - overlap_right_sibling[event.right_remainder.name] = event.left_remainder.name + overlaps_list.append(event.left_overlap.id) + 
overlaps_list.append(event.right_overlap.id) + overlap_leftparent_map[event.left_remainder.id] = event.left.id + overlap_rightparent_map[event.right_remainder.id] = event.right.id + overlap_lefttake_map[event.left_remainder.id] = event.left_take.id + overlap_righttake_map[event.right_remainder.id] = event.right_take.id + overlap_left_sibling[event.left_remainder.id] = event.right_remainder.id + overlap_right_sibling[event.right_remainder.id] = event.left_remainder.id elif isinstance(event, events.Drop): record_bad_contig(event.contig, discarded) record_alive(event.contig) @@ -627,8 +632,8 @@ def record_initial_hit(contig: GenotypedContig, hits: List[CigarHit]): record_alive(event.result) record_contig(event.result, event.contigs) if event.contigs: - combine_left_edge[event.result.name] = event.contigs[0].name - combine_right_edge[event.result.name] = event.contigs[-1].name + combine_left_edge[event.result.id] = event.contigs[0].id + combine_right_edge[event.result.id] = event.contigs[-1].id elif isinstance(event, (events.IgnoreGap, events.InitialHit)): pass else: @@ -639,82 +644,82 @@ def record_initial_hit(contig: GenotypedContig, hits: List[CigarHit]): nodup_parent_graph = remove_duplicate_edges(notransitive_parent_graph) # Close alive set by parents - def extend_alive(contig_name): - alive_set.add(contig_name) - for parent_name in nodup_parent_graph.get(contig_name, []): - extend_alive(parent_name) + def extend_alive(contig_id): + alive_set.add(contig_id) + for parent_id in nodup_parent_graph.get(contig_id, []): + extend_alive(parent_id) - for contig_name in alive_set.copy(): - extend_alive(contig_name) + for contig_id in alive_set.copy(): + extend_alive(contig_id) - parent_graph: Dict[str, List[str]] = {} - for contig_name in nodup_parent_graph: - if contig_name in alive_set: - parent_graph[contig_name] = nodup_parent_graph[contig_name] + parent_graph: Dict[int, List[int]] = {} + for contig_id in nodup_parent_graph: + if contig_id in alive_set: + 
parent_graph[contig_id] = nodup_parent_graph[contig_id] - contig_map: Dict[str, GenotypedContig] = {k: v for k, v in complete_contig_map.items() if k in alive_set} + contig_map: Dict[int, GenotypedContig] = {k: v for k, v in complete_contig_map.items() if k in alive_set} bad_contigs = anomaly + discarded + unknown group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq} children_graph = inverse_graph(parent_graph) transitive_parent_graph = transitive_closure(parent_graph) transitive_children_graph = transitive_closure(children_graph) reduced_parent_graph = remove_intermediate_edges(transitive_parent_graph) - sorted_roots = list(sorted(parent_name for - parent_name in contig_map - if parent_name not in parent_graph)) - sorted_sinks = list(sorted(child_name for - child_name in contig_map - if child_name not in children_graph)) + sorted_roots = list(sorted(parent_id for + parent_id in contig_map + if parent_id not in parent_graph)) + sorted_sinks = list(sorted(child_id for + child_id in contig_map + if child_id not in children_graph)) lstrip_set = set(lstrip_map.keys()) rstrip_set = set(rstrip_map.keys()) - for contig_name, parents in parent_graph.items(): + for contig_id, parents in parent_graph.items(): if len(parents) == 1: - morphism_graph[parents[0]] = [contig_name] + morphism_graph[parents[0]] = [contig_id] transitive_morphism_graph = transitive_closure(morphism_graph) reduced_morphism_graph = remove_intermediate_edges(transitive_morphism_graph) eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_morphism_graph)) - for contig_name, parents in parent_graph.items(): + for contig_id, parents in parent_graph.items(): if len(parents) > 1: - children_join_points.append(contig_name) + children_join_points.append(contig_id) - def set_query_position(contig_name: str) -> None: - contig = contig_map[contig_name] - children_names = children_graph.get(contig.name, []) + def set_query_position(contig_id: int) -> 
None: + contig = contig_map[contig_id] + children_ids = children_graph.get(contig.id, []) - def copy_from_parent(contig: AlignedContig, parent_name: str) -> None: - if parent_name in query_position_map: - (original_q_st, original_q_ei) = query_position_map[parent_name] + def copy_from_parent(contig: AlignedContig, parent_id: int) -> None: + if parent_id in query_position_map: + (original_q_st, original_q_ei) = query_position_map[parent_id] (current_q_st, current_q_ei) = (contig.alignment.q_st, contig.alignment.q_ei) current_query_len = abs(current_q_st - current_q_ei) - if contig_name in lstrip_map: - query_position_map[contig.name] = (original_q_ei - current_query_len, original_q_ei) - elif contig_name in rstrip_map: - query_position_map[contig.name] = (original_q_st, original_q_st + current_query_len) + if contig_id in lstrip_map: + query_position_map[contig.id] = (original_q_ei - current_query_len, original_q_ei) + elif contig_id in rstrip_map: + query_position_map[contig.id] = (original_q_st, original_q_st + current_query_len) else: - query_position_map[contig_name] = query_position_map[parent_name] + query_position_map[contig_id] = query_position_map[parent_id] - if contig_name not in query_position_map: + if contig_id not in query_position_map: if isinstance(contig, AlignedContig): - regular_parents_names = parent_graph.get(contig_name, []) - regular_parents_names = [name for name in regular_parents_names if name in query_position_map] - strip_parents_names = lstrip_map.get(contig_name, None) or rstrip_map.get(contig_name, None) - parents_names = (strip_parents_names and [strip_parents_names]) or regular_parents_names - if parents_names: - for parent_name in parents_names: - copy_from_parent(contig, parent_name) + regular_parents_ids = parent_graph.get(contig_id, []) + regular_parents_ids = [name for name in regular_parents_ids if name in query_position_map] + strip_parents_ids = lstrip_map.get(contig_id, None) or rstrip_map.get(contig_id, None) + parents_ids 
= (strip_parents_ids and [strip_parents_ids]) or regular_parents_ids + if parents_ids: + for parent_id in parents_ids: + copy_from_parent(contig, parent_id) else: - query_position_map[contig_name] = (contig.alignment.q_st, contig.alignment.q_ei) + query_position_map[contig_id] = (contig.alignment.q_st, contig.alignment.q_ei) - for child_name in children_names: - set_query_position(child_name) + for child_id in children_ids: + set_query_position(child_id) - for contig_name in sorted_roots: - set_query_position(contig_name) + for contig_id in sorted_roots: + set_query_position(contig_id) def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map, overlap_xsibling, xstrip_set): for parent in edge_table: @@ -742,65 +747,65 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map, over overlap_rightparent_map, overlap_right_sibling, lstrip_set)): pass - final_nodes: List[str] = [] - final_parts: Dict[str, bool] = {} - final_children_mapping: Dict[str, List[str]] = {} + final_nodes: List[int] = [] + final_parts: Dict[int, bool] = {} + final_children_mapping: Dict[int, List[int]] = {} - def add_join_parents(join_name): - if join_name in children_join_points: - for contig_name in parent_graph.get(join_name, [join_name]): - add_join_parents(contig_name) + def add_join_parents(join_id): + if join_id in children_join_points: + for contig_id in parent_graph.get(join_id, [join_id]): + add_join_parents(contig_id) else: - final_nodes.append(join_name) + final_nodes.append(join_id) - for join_name in children_join_points + sorted_sinks: - add_join_parents(join_name) + for join_id in children_join_points + sorted_sinks: + add_join_parents(join_id) - def is_ancestor(contig_name, other_names): - for other in other_names: - if other == contig_name: + def is_ancestor(contig_id, other_ids): + for other in other_ids: + if other == contig_id: continue - if contig_name in transitive_children_graph.get(other, []): + if contig_id in 
transitive_children_graph.get(other, []): return True return False - for contig_name in final_nodes[:]: - if is_ancestor(contig_name, final_nodes): - final_nodes.remove(contig_name) + for contig_id in final_nodes[:]: + if is_ancestor(contig_id, final_nodes): + final_nodes.remove(contig_id) - for contig_name in final_nodes: - if any(contig_name in eqv_morphism_graph.get(bad, []) for bad in bad_contigs): + for contig_id in final_nodes: + if any(contig_id in eqv_morphism_graph.get(bad, []) for bad in bad_contigs): continue - if any(contig_name in eqv_morphism_graph.get(temp_name, [temp_name]) for temp_name in overlaps_list): + if any(contig_id in eqv_morphism_graph.get(temp_id, [temp_id]) for temp_id in overlaps_list): continue - final_parts[contig_name] = True + final_parts[contig_id] = True - for contig_name in bad_contigs: - final_parts[contig_name] = True + for contig_id in bad_contigs: + final_parts[contig_id] = True - for parent_name in sorted_roots: + for parent_id in sorted_roots: children = [] for final_contig in final_parts: - if final_contig == parent_name or \ - parent_name in reduced_parent_graph.get(final_contig, [final_contig]): + if final_contig == parent_id or \ + parent_id in reduced_parent_graph.get(final_contig, [final_contig]): children.append(final_contig) - final_children_mapping[parent_name] = children + final_children_mapping[parent_id] = children - aligned_size_map: Dict[str, Tuple[int, int]] = {} - full_size_map: Dict[str, Tuple[int, int]] = {} + aligned_size_map: Dict[int, Tuple[int, int]] = {} + full_size_map: Dict[int, Tuple[int, int]] = {} def get_neighbours(part, lookup): - for clone in eqv_morphism_graph.get(part.name, [part.name]): - maybe_name = lookup.get(clone, None) - if maybe_name is not None: - yield contig_map[maybe_name] + for clone in eqv_morphism_graph.get(part.id, [part.id]): + maybe_id = lookup.get(clone, None) + if maybe_id is not None: + yield contig_map[maybe_id] def get_final_version(contig): - [name] = 
reduced_morphism_graph.get(contig.name, [contig.name]) + [name] = reduced_morphism_graph.get(contig.id, [contig.id]) return contig_map[name] def get_neighbour(part, lookup): @@ -814,13 +819,13 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] if isinstance(contig, AlignedContig) and contig.alignment.ref_length > 0: r_st = contig.alignment.r_st r_ei = contig.alignment.r_ei - if contig.name in aligned_size_map: - a_r_st, a_r_ei = aligned_size_map[contig.name] + if contig.id in aligned_size_map: + a_r_st, a_r_ei = aligned_size_map[contig.id] else: a_r_st = r_st a_r_ei = r_ei - if contig.name in full_size_map: - f_r_st, f_r_ei = full_size_map[contig.name] + if contig.id in full_size_map: + f_r_st, f_r_ei = full_size_map[contig.id] else: f_r_st = r_st - contig.alignment.q_st f_r_ei = r_ei + (len(contig.seq) - contig.alignment.q_ei) @@ -831,10 +836,10 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] a_r_ei = f_r_ei return (a_r_st, a_r_ei, f_r_st, f_r_ei) - for parent_name in sorted_roots: - parts_names = final_children_mapping[parent_name] - for part_name in parts_names: - part = contig_map[part_name] + for parent_id in sorted_roots: + parts_ids = final_children_mapping[parent_id] + for part_id in parts_ids: + part = contig_map[part_id] if not isinstance(part, AlignedContig): continue @@ -843,7 +848,7 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] if prev_part is not None: r_st = prev_part.alignment.r_st - elif part_name in lstrip_set: + elif part_id in lstrip_set: r_st = part.alignment.r_st else: start_delta = -1 * part.alignment.q_st @@ -851,18 +856,22 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] if next_part is not None: r_ei = next_part.alignment.r_ei - elif part_name in rstrip_set: + elif part_id in rstrip_set: r_ei = part.alignment.r_ei else: end_delta = len(part.seq) - 1 - part.alignment.q_ei r_ei = part.alignment.r_ei + 
end_delta - aligned_size_map[part.name] = (r_st, r_ei) + aligned_size_map[part.id] = (r_st, r_ei) - sibling_left_name = ([overlap_left_sibling[name] for name in eqv_morphism_graph.get(part.name, [part.name]) if name in overlap_left_sibling] or [""])[0] - sibling_left = sibling_left_name and contig_map[sibling_left_name] - sibling_right_name = ([overlap_right_sibling[name] for name in eqv_morphism_graph.get(part.name, [part.name]) if name in overlap_right_sibling] or [""])[0] - sibling_right = sibling_right_name and contig_map[sibling_right_name] + sibling_left_id = ([overlap_left_sibling[name] + for name in eqv_morphism_graph.get(part.id, [part.id]) + if name in overlap_left_sibling] or [0])[0] + sibling_left = sibling_left_id and contig_map[sibling_left_id] + sibling_right_id = ([overlap_right_sibling[name] + for name in eqv_morphism_graph.get(part.id, [part.id]) + if name in overlap_right_sibling] or [0])[0] + sibling_right = sibling_right_id and contig_map[sibling_right_id] prev_part = get_neighbour(sibling_right, overlap_lefttake_map) next_part = get_neighbour(sibling_left, overlap_righttake_map) @@ -876,12 +885,12 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int] else: r_ei = part.alignment.r_ei - full_size_map[part.name] = (r_st, r_ei) + full_size_map[part.id] = (r_st, r_ei) def carve_gap(gap: CigarHit, aligned_parts: Iterable[AlignedContig]): for contig in aligned_parts: (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) - other_coords = query_position_map.get(contig.name, (-1, -2)) + other_coords = query_position_map.get(contig.id, (-1, -2)) other_q_st = min(other_coords) - max(0, abs(f_r_st - a_r_st)) other_q_ei = max(other_coords) + max(0, abs(a_r_ei - f_r_ei)) @@ -908,8 +917,8 @@ def carve_gap(gap: CigarHit, aligned_parts: Iterable[AlignedContig]): if gap.query_length > 0: return gap - def collect_gaps(root: str, children_names: List[str]): - all_children = [contig_map[name] for name in children_names] + 
def collect_gaps(root: int, children_ids: List[int]): + all_children = [contig_map[name] for name in children_ids] children = [child for child in all_children if isinstance(child, AlignedContig)] for name in unaligned_map: if reduced_parent_graph.get(name, [name]) == [root]: @@ -918,8 +927,7 @@ def collect_gaps(root: str, children_names: List[str]): if carved is not None: yield carved - carved_unaligned_parts: Dict[str, List[str]] = {} - fake_name_counter = 0 + carved_unaligned_parts: Dict[int, List[int]] = {} for root in sorted_roots: existing: Set[Tuple[int, int]] = set() children = final_children_mapping[root] @@ -927,34 +935,31 @@ def collect_gaps(root: str, children_names: List[str]): coords = (gap.q_st, gap.q_ei) if coords not in existing: existing.add(coords) - fake_name_counter += 1 - fake_name = f"m{fake_name_counter}" if root not in carved_unaligned_parts: carved_unaligned_parts[root] = [] - carved_unaligned_parts[root].append(fake_name) - query_position_map[fake_name] = coords + fake = Contig(name=None, seq="") + carved_unaligned_parts[root].append(fake.id) + query_position_map[fake.id] = coords - merged_unaligned_parts: Dict[str, List[str]] = {} - fake_name_counter = 0 + merged_unaligned_parts: Dict[int, List[int]] = {} for root in sorted_roots: children = final_children_mapping[root] unaligned_children = carved_unaligned_parts.get(root, []) todo = children + unaligned_children todo = list(sorted(todo, key=lambda name: query_position_map.get(name, (-1, -1)))) current_group = [] - for child_name in todo + [None]: - if child_name in unaligned_children: - coords = query_position_map[child_name] + for child_id in todo + [None]: + if child_id in unaligned_children: + coords = query_position_map[child_id] current_group.append(coords) elif current_group: coords = (min(q_st for q_st, q_ei in current_group), max(q_ei for q_st, q_ei in current_group)) - fake_name_counter += 1 - fake_name = f"u{fake_name_counter}" - query_position_map[fake_name] = coords if root 
not in merged_unaligned_parts: merged_unaligned_parts[root] = [] - merged_unaligned_parts[root].append(fake_name) + fake = Contig(name=None, seq="") + query_position_map[fake.id] = coords + merged_unaligned_parts[root].append(fake.id) current_group = [] name_map = {} @@ -964,45 +969,49 @@ def collect_gaps(root: str, children_names: List[str]): name_map[root] = f"{i + 1}" - todo_names = children + unaligned_children - todo_names = list(sorted(todo_names, key=lambda name: query_position_map.get(name, (-1, -1)))) - for k, child_name in enumerate(todo_names): - if len(todo_names) > 1: - name_map[child_name] = f"{i + 1}.{k + 1}" + todo_ids = children + unaligned_children + todo_ids = list(sorted(todo_ids, key=lambda name: query_position_map.get(name, (-1, -1)))) + for k, child_id in enumerate(todo_ids): + if len(todo_ids) > 1: + name_map[child_id] = f"{i + 1}.{k + 1}" else: - name_map[child_name] = f"{i + 1}" + name_map[child_id] = f"{i + 1}" - for bad_name in bad_contigs: - if bad_name not in children: - if bad_name in transitive_parent_graph \ - and root in transitive_parent_graph[bad_name]: + for bad_id in bad_contigs: + if bad_id not in children: + if bad_id in transitive_parent_graph \ + and root in transitive_parent_graph[bad_id]: k += 1 - name_map[bad_name] = f"{i + 1}.{k + 1}" + name_map[bad_id] = f"{i + 1}.{k + 1}" - for contig_name, name in name_map.items(): - logger.debug(f"Contig name {contig_name!r} is displayed as {name!r}.") + for contig_id, name in name_map.items(): + if contig_id in complete_contig_map: + contig = complete_contig_map[contig_id] + logger.debug(f"Contig name {contig.unique_name} is displayed as {name!r}.") def get_tracks(parts: Iterable[GenotypedContig]) -> Iterable[Track]: for part in parts: - indexes = name_map[part.name] + name = name_map[part.id] (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) if a_r_st < f_r_st: - yield Track(min(a_r_st, f_r_st) + position_offset, max(a_r_st, f_r_st) + position_offset, color="yellow") 
+ yield Track(min(a_r_st, f_r_st) + position_offset, + max(a_r_st, f_r_st) + position_offset, color="yellow") if a_r_ei > f_r_ei: - yield Track(min(a_r_ei, f_r_ei) + position_offset, max(a_r_ei, f_r_ei) + position_offset, color="yellow") + yield Track(min(a_r_ei, f_r_ei) + position_offset, + max(a_r_ei, f_r_ei) + position_offset, color="yellow") if isinstance(part, AlignedContig): colour = 'lightgrey' else: colour = "yellow" - yield Track(f_r_st + position_offset, f_r_ei + position_offset, label=f"{indexes}", color=colour) + yield Track(f_r_st + position_offset, f_r_ei + position_offset, label=f"{name}", color=colour) def get_arrows(parts: Iterable[GenotypedContig], labels: bool) -> Iterable[Arrow]: for part in parts: - indexes = name_map[part.name] if labels else None + name = name_map[part.id] if labels else None height = 20 if labels else 1 elevation = 1 if labels else -20 (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(part) @@ -1015,7 +1024,7 @@ def get_arrows(parts: Iterable[GenotypedContig], labels: bool) -> Iterable[Arrow yield Arrow(a_r_st + position_offset, a_r_ei + position_offset, elevation=elevation, h=height, - label=indexes) + label=name) def make_ray() -> Element: screen_size = (max_position - min_position) + position_offset / 2 @@ -1024,7 +1033,9 @@ def make_ray() -> Element: def generate_beams(): for i in range(floor(screen_size / single_size) + 1): if i % 2 == 0: - yield Track(i * single_size + min_position + position_offset / 2, (i + 1) * single_size + min_position + position_offset / 2, h=0.1, color="green") + yield Track(i * single_size + min_position + position_offset / 2, + (i + 1) * single_size + min_position + position_offset / 2, + h=0.1, color="green") return Multitrack(list(generate_beams())) @@ -1038,8 +1049,8 @@ def add_section(title: str) -> None: min_position = 0 max_position = max(group_refs.values(), default=1) - for contig_name in final_parts: - contig = contig_map[contig_name] + for contig_id in final_parts: + contig = 
contig_map[contig_id] if isinstance(contig, AlignedContig): positions = get_contig_coordinates(contig) max_position = max(max_position, max(positions)) @@ -1105,15 +1116,15 @@ def add_section(title: str) -> None: reference_max = r_ei + position_offset reference_tracks.append(Track(r_st + position_offset, r_ei + position_offset, color="red")) - for contig_name in final_parts: - contig = contig_map[contig_name] + for contig_id in final_parts: + contig = contig_map[contig_id] if contig.group_ref != group_ref: continue if not isinstance(contig, AlignedContig): continue - if contig_name in bad_contigs: + if contig_id in bad_contigs: continue (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) @@ -1121,15 +1132,15 @@ def add_section(title: str) -> None: reference_min = min(a_r_st + position_offset, reference_min) reference_max = max(a_r_ei + position_offset, reference_max) - for contig_name in final_parts: - contig = contig_map[contig_name] + for contig_id in final_parts: + contig = contig_map[contig_id] if contig.group_ref != group_ref: continue if not isinstance(contig, AlignedContig): continue - if contig_name in bad_contigs: + if contig_id in bad_contigs: continue (a_r_st, a_r_ei, f_r_st, f_r_ei) = get_contig_coordinates(contig) @@ -1147,9 +1158,9 @@ def add_section(title: str) -> None: ref_arrows: List[Arrow] = [] for root in sorted_roots: - parts_names = final_children_mapping[root] - parts_names = [name for name in parts_names if name not in bad_contigs] - parts = [contig_map[name] for name in parts_names] + parts_ids = final_children_mapping[root] + parts_ids = [name for name in parts_ids if name not in bad_contigs] + parts = [contig_map[name] for name in parts_ids] parts = [part for part in parts if part.group_ref == group_ref] ref_arrows.extend(get_arrows(parts, labels=True)) @@ -1161,9 +1172,9 @@ def add_section(title: str) -> None: ########### for root in sorted_roots: - parts_names = final_children_mapping[root] - parts_names = [name for name 
in parts_names if name not in bad_contigs] - parts = [contig_map[name] for name in parts_names] + parts_ids = final_children_mapping[root] + parts_ids = [name for name in parts_ids if name not in bad_contigs] + parts = [contig_map[name] for name in parts_ids] parts = [part for part in parts if part.group_ref == group_ref] if parts: figure.add(ArrowGroup(list(get_arrows(parts, labels=False)))) @@ -1179,16 +1190,18 @@ def add_section(title: str) -> None: if contig_map[root].group_ref != group_ref: continue - parts_names = final_children_mapping[root] - parts_names = [name for name in parts_names if name in discarded] + parts_ids = final_children_mapping[root] + parts_ids = [id for id in parts_ids if id in discarded] unaligned_parts = merged_unaligned_parts.get(root, []) - for name in sorted(parts_names + unaligned_parts, key=lambda x: name_map[x.name] if isinstance(x, Contig) else name_map[x]): - if name in unaligned_parts: - (q_st, q_ei) = query_position_map[name] - label = name_map[name] - figure.add(Track(position_offset, position_offset + abs(q_ei - q_st), label=label, color="yellow")) + for id in sorted(parts_ids + unaligned_parts, + key=lambda x: name_map[x.id] if isinstance(x, Contig) else name_map[x]): + if id in unaligned_parts: + (q_st, q_ei) = query_position_map[id] + label = name_map[id] + figure.add(Track(position_offset, position_offset + abs(q_ei - q_st), + label=label, color="yellow")) else: - part = contig_map[name] + part = contig_map[id] figure.add(Multitrack(list(get_tracks([part])))) ############# @@ -1198,9 +1211,9 @@ def add_section(title: str) -> None: if anomaly: add_section("anomaly:") for root in sorted_roots: - parts_names = final_children_mapping[root] - parts_names = [name for name in parts_names if name in anomaly] - parts = [contig_map[name] for name in parts_names] + parts_ids = final_children_mapping[root] + parts_ids = [name for name in parts_ids if name in anomaly] + parts = [contig_map[name] for name in parts_ids] parts = [part 
for part in parts if part.group_ref == group_ref] for part in parts: figure.add(Multitrack(list(get_tracks([part])))) @@ -1211,10 +1224,10 @@ def add_section(title: str) -> None: if unknown: add_section("unknown:") - for parent_name in sorted_roots: - parts_names = final_children_mapping[parent_name] - parts_names = [name for name in parts_names if name in unknown] - parts = [contig_map[name] for name in parts_names] + for parent_id in sorted_roots: + parts_ids = final_children_mapping[parent_id] + parts_ids = [name for name in parts_ids if name in unknown] + parts = [contig_map[name] for name in parts_ids] for part in parts: figure.add(Multitrack(list(get_tracks([part])))) diff --git a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg index c670f0d08..60521ecf3 100644 --- a/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg +++ b/micall/tests/data/stitcher_plots/test_correct_processing_complex_nogaps.svg @@ -49,19 +49,19 @@ -3 +2 -5 +3 -7 +4 @@ -88,7 +88,7 @@ -3 +2 @@ -102,7 +102,7 @@ -5 +3 @@ -116,7 +116,7 @@ -7 +4 @@ -160,19 +160,19 @@ -2 +5 -4 +6 -6 +7 @@ -191,7 +191,7 @@ -2 +5 @@ -205,7 +205,7 @@ -4 +6 @@ -219,7 +219,7 @@ -6 +7 diff --git a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg index e1a300e9b..fddc82407 100644 --- a/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg +++ b/micall/tests/data/stitcher_plots/test_separate_stitching_of_non_overlapping_contigs_2.svg @@ -31,39 +31,39 @@ -1 +2 -2 +1 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg index ec3fc4060..c0e986950 100644 --- 
a/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg +++ b/micall/tests/data/stitcher_plots/test_stitching_contig_with_big_noncovered_gap_2.svg @@ -31,39 +31,39 @@ -2 +1 -1 +2 - - + + - - + + 1 - - + + - - + + 2 diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 5209865ba..36c8aae93 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -13,7 +13,6 @@ stitch_consensus, calculate_concordance, align_all_to_reference, - AlignedContig, disambiguate_concordance, lstrip, rstrip, @@ -40,7 +39,7 @@ def exact_aligner(monkeypatch): @pytest.fixture def visualizer(request, tmp_path): - stitcher.context.set(stitcher.StitcherContext()) + stitcher.context.set(stitcher.StitcherContext.make()) test_name = request.node.name plot_name = test_name + ".svg" pwd = os.path.dirname(__file__) @@ -1491,9 +1490,7 @@ def __init__(self, ref_name, group_ref, r_st, r_ei, name="contig"): self.group_ref = group_ref self.alignment = MockAlignment(r_st, r_ei) self.name = name - - def overlaps(self, other): - return AlignedContig.overlaps(self, other) + self.id = id(self) class MockAlignment: diff --git a/micall/tests/test_contig_stitcher_fuzz.py b/micall/tests/test_contig_stitcher_fuzz.py index d23ea9cd3..dde2af769 100644 --- a/micall/tests/test_contig_stitcher_fuzz.py +++ b/micall/tests/test_contig_stitcher_fuzz.py @@ -23,7 +23,7 @@ def no_aligner(monkeypatch): @pytest.fixture(autouse=True) def stitcher_context(): - stitcher.context.set(StitcherContext()) + stitcher.context.set(StitcherContext.make()) def read_contigs(line): diff --git a/micall/utils/contig_stitcher_context.py b/micall/utils/contig_stitcher_context.py index a90647247..7672733da 100644 --- a/micall/utils/contig_stitcher_context.py +++ b/micall/utils/contig_stitcher_context.py @@ -1,31 +1,48 @@ -from typing import List, Set +from typing import List, Dict from contextvars import ContextVar from contextlib import 
contextmanager +from dataclasses import dataclass +from copy import deepcopy -import micall.utils.contig_stitcher_events as events +import micall.utils.contig_stitcher_events as st_events +@dataclass class StitcherContext: - def __init__(self) -> None: - self.name_generator_state: int = 0 - self.nameset: Set[str] = set() - self.events: List[events.EventType] = [] - - def generate_new_name(self) -> str: - while True: - self.name_generator_state += 1 - name = f"c{self.name_generator_state}" - if name not in self.nameset: - self.nameset.add(name) - return name - - def emit(self, event: events.EventType) -> None: + uniq_dict: Dict[object, Dict[object, int]] + events: List[st_events.EventType] + + def register(self, key: object, value: object) -> int: + if value not in self.uniq_dict: + self.uniq_dict[value] = {} + + existing = self.uniq_dict[value] + if key not in existing: + existing[key] = len(existing) + 1 + + return existing[key] + + def emit(self, event: st_events.EventType) -> None: self.events.append(event) + @staticmethod + def make() -> 'StitcherContext': + return StitcherContext(events=[], uniq_dict={}) + @staticmethod @contextmanager def fresh(): - ctx = StitcherContext() + ctx = StitcherContext.make() + token = context.set(ctx) + try: + yield ctx + finally: + context.reset(token) + + @staticmethod + @contextmanager + def stage(): + ctx = deepcopy(context.get()) token = context.set(ctx) try: yield ctx diff --git a/micall/utils/contig_stitcher_contigs.py b/micall/utils/contig_stitcher_contigs.py index 42a6807f2..c79e835ed 100644 --- a/micall/utils/contig_stitcher_contigs.py +++ b/micall/utils/contig_stitcher_contigs.py @@ -1,14 +1,40 @@ from dataclasses import dataclass from typing import Optional, Literal - +from functools import cached_property from aligntools import CigarHit +ID_STATE = 0 + +def generate_new_id() -> int: + global ID_STATE + ID_STATE += 1 + return ID_STATE + + @dataclass(frozen=True) class Contig: - name: str + name: Optional[str] seq: str 
+ @cached_property + def id(self) -> int: + return generate_new_id() + + @cached_property + def unique_name(self) -> str: + index = self.register() + unqualified = repr(self.name) if self.name is not None else "" + if index == 1 and self.name: + return unqualified + else: + return unqualified + f'({index})' + + def register(self) -> int: + from micall.utils.contig_stitcher_context import context + ctx = context.get() + return ctx.register(key=self.id, value=self.name) + @dataclass(frozen=True) class GenotypedContig(Contig): diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index 931d00a88..68c1c5cc0 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -15,9 +15,9 @@ class Cut: def __str__(self) -> str: return ( - f"Created contigs {self.left.name!r} at {self.left.alignment} and " - f"{self.right.name!r} at {self.right.alignment} by cutting " - f"{self.original.name!r} at {self.original.alignment} at cut point = " + f"Created contigs {self.left.unique_name} at {self.left.alignment} and " + f"{self.right.unique_name} at {self.right.alignment} by cutting " + f"{self.original.unique_name} at {self.original.alignment} at cut point = " f"{round(self.cut_point, 1)}." ) @@ -29,8 +29,8 @@ class LStrip: def __str__(self) -> str: return ( - f"Doing lstrip of {self.original.name!r} at {self.original.alignment} (len " - f"{len(self.original.seq)}) resulted in {self.result.name!r} at " + f"Doing lstrip of {self.original.unique_name} at {self.original.alignment} (len " + f"{len(self.original.seq)}) resulted in {self.result.unique_name} at " f"{self.result.alignment} (len {len(self.result.seq)})." 
) @@ -42,8 +42,8 @@ class RStrip: def __str__(self) -> str: return ( - f"Doing rstrip of {self.original.name!r} at {self.original.alignment} (len " - f"{len(self.original.seq)}) resulted in {self.result.name!r} at " + f"Doing rstrip of {self.original.unique_name} at {self.original.alignment} (len " + f"{len(self.original.seq)}) resulted in {self.result.unique_name} at " f"{self.result.alignment} (len {len(self.result.seq)})." ) @@ -56,9 +56,9 @@ class Munge: def __str__(self) -> str: return ( - f"Munged contigs {self.left.name!r} at {self.left.alignment} with " - f"{self.right.name!r} at {self.right.alignment} resulting in " - f"{self.result.name!r} at {self.result.alignment}." + f"Munged contigs {self.left.unique_name} at {self.left.alignment} with " + f"{self.right.unique_name} at {self.right.alignment} resulting in " + f"{self.result.unique_name} at {self.result.alignment}." ) @@ -69,9 +69,9 @@ class Combine: def __str__(self) -> str: contigs_str = ', '.join( - [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in self.contigs]) + [f"{x.unique_name} at {x.alignment} (len {len(x.seq)})" for x in self.contigs]) return ( - f"Created a frankenstein {self.result.name!r} at {self.result.alignment} " + f"Created a frankenstein {self.result.unique_name} at {self.result.alignment} " f"(len {len(self.result.seq)}) from [{contigs_str}]." ) @@ -81,7 +81,7 @@ class NoRef: contig: GenotypedContig def __str__(self) -> str: - return f"Contig {self.contig.name!r} not aligned - no reference." + return f"Contig {self.contig.unique_name} not aligned - no reference." @dataclass(frozen=True) @@ -94,7 +94,7 @@ class InitialHit: def __str__(self) -> str: strand_info = '' if self.strand == 'forward' else ' (rev)' return ( - f"Part {self.index} of contig {self.contig.name!r} aligned at {self.hit}" + f"Part {self.index} of contig {self.contig.unique_name} aligned at {self.hit}" f"{strand_info}." 
) @@ -104,7 +104,7 @@ class ZeroHits: contig: GenotypedContig def __str__(self) -> str: - return f"Contig {self.contig.name!r} not aligned - backend's choice." + return f"Contig {self.contig.unique_name} not aligned - backend's choice." @dataclass(frozen=True) @@ -113,7 +113,7 @@ class StrandConflict: def __str__(self) -> str: return ( - f"Discarding contig {self.contig.name!r} because it aligned both in forward " + f"Discarding contig {self.contig.unique_name} because it aligned both in forward " "and reverse sense." ) @@ -124,7 +124,7 @@ class ReverseComplement: result: GenotypedContig def __str__(self) -> str: - return f"Reverse complemented contig {self.contig.name!r}." + return f"Reverse complemented contig {self.contig.unique_name}." @dataclass(frozen=True) @@ -135,7 +135,7 @@ class HitNumber: def __str__(self) -> str: return ( - f"Contig {self.contig.name!r} produced {len(self.initial)} aligner hits. " + f"Contig {self.contig.unique_name} produced {len(self.initial)} aligner hits. " f"After connecting them, the number became {len(self.connected)}." ) @@ -149,8 +149,8 @@ class ConnectedHit: def __str__(self) -> str: part_strand_info = '' if self.part.strand == 'forward' else ' (rev)' return ( - f"Part {self.index} of contig {self.contig.name!r} re-aligned as " - f"{self.part.name!r} at {self.part.alignment}{part_strand_info}." + f"Part {self.index} of contig {self.contig.unique_name} re-aligned as " + f"{self.part.unique_name} at {self.part.alignment}{part_strand_info}." ) @@ -162,7 +162,7 @@ class InitialStrip: def __str__(self) -> str: return ( - f"Trimming (strip) contig {self.contig.name!r} from {self.q_st} to " + f"Trimming (strip) contig {self.contig.unique_name} from {self.q_st} to " f"{self.q_ei}." ) @@ -178,10 +178,10 @@ class StitchCut: def __str__(self) -> str: return ( - f"Stitching {self.left.name!r} at {self.left.alignment} (len {len(self.left.seq)}) " - f"with {self.right.name!r} at {self.right.alignment} (len {len(self.right.seq)}). 
" - f"The left_overlap {self.left_overlap.name!r} is at {self.left_overlap.alignment} " - f"(len {len(self.left_overlap.seq)}) and the right_overlap {self.right_overlap.name!r} is " + f"Stitching {self.left.unique_name} at {self.left.alignment} (len {len(self.left.seq)}) " + f"with {self.right.unique_name} at {self.right.alignment} (len {len(self.right.seq)}). " + f"The left_overlap {self.left_overlap.unique_name} is at {self.left_overlap.alignment} " + f"(len {len(self.left_overlap.seq)}) and the right_overlap {self.right_overlap.unique_name} is " f"at {self.right_overlap.alignment} (len {len(self.right_overlap.seq)})." ) @@ -206,9 +206,9 @@ def __str__(self) -> str: cut_point_location_scaled = round(self.cut_point_scaled * 100) concordance_str = ', '.join(str(int(round(x * 100)) / 100) for x in self.concordance) return ( - f"Created overlap contigs {self.left_take.name!r} at {self.left_overlap.alignment} and " - f"{self.right_take.name!r} at {self.right_take.alignment} based on parts of " - f"{self.left.name!r} and {self.right.name!r}, with avg. concordance {average_concordance}%, " + f"Created overlap contigs {self.left_take.unique_name} at {self.left_overlap.alignment} and " + f"{self.right_take.unique_name} at {self.right_take.alignment} based on parts of " + f"{self.left.unique_name} and {self.right.unique_name}, with avg. concordance {average_concordance}%, " f"cut point at {cut_point_location_scaled}%, and full concordance [{concordance_str}]." ) @@ -218,7 +218,7 @@ class NoOverlap: contig: AlignedContig def __str__(self) -> str: - return f"Nothing overlaps with {self.contig.name!r}." + return f"Nothing overlaps with {self.contig.unique_name}." @dataclass(frozen=True) @@ -229,8 +229,8 @@ class Stitch: def __str__(self) -> str: return ( - f"Stitching {self.left.name!r} with {self.right.name!r} results in " - f"{self.result.name!r} at {self.result.alignment} (len {len(self.result.seq)})." 
+ f"Stitching {self.left.unique_name} with {self.right.unique_name} results in " + f"{self.result.unique_name} at {self.result.alignment} (len {len(self.result.seq)})." ) @@ -242,7 +242,7 @@ class Drop: def __str__(self) -> str: covering_contig_names = ', '.join(repr(x.name) for x in self.covering) return ( - f"Dropped contig {self.contig.name!r} as it is completely covered by these contigs: " + f"Dropped contig {self.contig.unique_name} as it is completely covered by these contigs: " f"{covering_contig_names}." ) @@ -253,7 +253,7 @@ class IgnoreGap: gap: CigarHit def __str__(self) -> str: - return f"Ignored insignificant gap of {self.contig.name!r}, {self.gap}." + return f"Ignored insignificant gap of {self.contig.unique_name}, {self.gap}." @dataclass(frozen=True) @@ -265,9 +265,9 @@ class SplitGap: def __str__(self) -> str: return ( - f"Split contig {self.contig.name!r} at {self.contig.alignment} around its gap at " + f"Split contig {self.contig.unique_name} at {self.contig.alignment} around its gap at " f"[{self.gap.q_st}, {self.gap.q_ei}]->[{self.gap.r_st}, {self.gap.r_ei}]. Left part: " - f"{self.left.name!r} at {self.left.alignment}, right part: {self.right.name!r} at " + f"{self.left.unique_name} at {self.left.alignment}, right part: {self.right.unique_name} at " f"{self.right.alignment}." ) @@ -278,7 +278,7 @@ class Intro: def __str__(self) -> str: return ( - f"Introduced contig {self.contig.name!r} (seq = {self.contig.seq}) of ref " + f"Introduced contig {self.contig.unique_name} (seq = {self.contig.seq}) of ref " f"{self.contig.ref_name!r}, group_ref {self.contig.group_ref} (seq = {self.contig.ref_seq}), " f"and length {len(self.contig.seq)}." 
) @@ -290,7 +290,7 @@ class FinalCombine: result: AlignedContig def __str__(self) -> str: - contigs_str = [f"{x.name!r} at {x.alignment} (len {len(x.seq)})" for x in self.contigs] + contigs_str = [f"{x.unique_name} at {x.alignment} (len {len(x.seq)})" for x in self.contigs] contigs_format = ', '.join(contigs_str) return ( f"Combining these contigs for final output for {self.result.group_ref}: " From 41f0c0252add4cb750d6e460d05b1468917da656 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 10 May 2024 16:12:03 -0700 Subject: [PATCH 291/485] Add test cases for non-unique names in Contig stitcher --- ...tching_of_completely_identical_contigs.svg | 155 ++++++++++++++++++ micall/tests/test_contig_stitcher.py | 23 +++ 2 files changed, 178 insertions(+) create mode 100644 micall/tests/data/stitcher_plots/test_stitching_of_completely_identical_contigs.svg diff --git a/micall/tests/data/stitcher_plots/test_stitching_of_completely_identical_contigs.svg b/micall/tests/data/stitcher_plots/test_stitching_of_completely_identical_contigs.svg new file mode 100644 index 000000000..52229e76d --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_stitching_of_completely_identical_contigs.svg @@ -0,0 +1,155 @@ + + + + + + + + + + + + + + + + + + +testref + + + + + + + +3 + + + + + + + + + + + + +3 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + + +1 + + + + + + + +2 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 36c8aae93..faaed9f48 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -529,6 +529,29 @@ def test_stitching_of_identical_contigs(exact_aligner, visualizer): assert len(visualizer().elements) > len(contigs) +def test_stitching_of_completely_identical_contigs(exact_aligner, 
visualizer): + # Scenario: The function correctly handles and avoids duplication when completely identical contigs + # are stitched together. + + contigs = [ + GenotypedContig( + name="x", + seq="ACTGACTG" * 100, + ref_name="testref", + group_ref="testref", + ref_seq="ACTGACTG" * 100, + match_fraction=1.0, + ) + for copy in [1, 2, 3] + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 1 + assert results[0].seq == contigs[2].seq + + assert len(visualizer().elements) > len(contigs) + + def test_stitching_of_zero_contigs(exact_aligner, visualizer): # Scenario: The function does not crash if no contigs given. From 1f0d296a9e72f54a71eab6b6bb6876a6e15c31f1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 20:11:18 +0000 Subject: [PATCH 292/485] Bump coverage from 7.5.0 to 7.5.1 Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.0 to 7.5.1. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.0...7.5.1) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index e9ee24556..5063facb4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,7 +5,7 @@ mypy==1.10.0 mypy-extensions==1.0.0 pytest==8.2.0 -coverage==7.5.0 +coverage==7.5.1 pandas==2.0.2 seaborn==0.13.2 ete3 From 2786211e2f4e63943d0c4be66538529fb51dfa00 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 21:46:04 +0000 Subject: [PATCH 293/485] Bump nokogiri from 1.16.2 to 1.16.5 in /docs Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.16.2 to 1.16.5. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.16.2...v1.16.5) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index cd39f2718..c806cec91 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -214,7 +214,7 @@ GEM rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.8.5) + mini_portile2 (2.8.6) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) @@ -223,7 +223,7 @@ GEM mutex_m (0.2.0) net-http (0.4.1) uri - nokogiri (1.16.2) + nokogiri (1.16.5) mini_portile2 (~> 2.8.2) racc (~> 1.4) octokit (4.25.1) From 917787f8d127ab634430894c5e5d29615d02a3da Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 14 May 2024 14:55:12 -0700 Subject: [PATCH 294/485] Fix theme version syntax in the docs configuration --- docs/_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_config.yml b/docs/_config.yml index e807070c5..1b1548bf2 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1,2 +1,2 @@ -remote_theme: chrisrhymes/bulma-clean-theme:v0.14.0 +remote_theme: chrisrhymes/bulma-clean-theme@v0.14.0 title: MiCall From 3f6280b47a4f8c81db167de79a2f9d22745df517 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 May 2024 07:04:17 +0000 Subject: [PATCH 295/485] Bump pytest from 8.2.0 to 8.2.1 Bumps [pytest](https://github.com/pytest-dev/pytest) from 8.2.0 to 8.2.1. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.2.0...8.2.1) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 5063facb4..05a70fe5a 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,7 +4,7 @@ -r requirements-watcher.txt mypy==1.10.0 mypy-extensions==1.0.0 -pytest==8.2.0 +pytest==8.2.1 coverage==7.5.1 pandas==2.0.2 seaborn==0.13.2 From 17d4404c543027d1b2c11db514ef59dd0340b427 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 07:42:12 +0000 Subject: [PATCH 296/485] --- updated-dependencies: - dependency-name: requests dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- requirements-basespace.txt | 2 +- requirements-watcher.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-basespace.txt b/requirements-basespace.txt index 67a1f20c2..8ee75d522 100644 --- a/requirements-basespace.txt +++ b/requirements-basespace.txt @@ -1,3 +1,3 @@ # Requirements for running the micall_basespace.py script -r requirements.txt -requests==2.31.0 +requests==2.32.1 diff --git a/requirements-watcher.txt b/requirements-watcher.txt index 65613ba4c..a4e0c2e0d 100644 --- a/requirements-watcher.txt +++ b/requirements-watcher.txt @@ -1,3 +1,3 @@ # Requirements for running the MISEQ_MONITOR.py script git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api -requests==2.31.0 +requests==2.32.1 From ea8d87a12e7b5d763c6e206542e2d949fe8f6f18 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 May 2024 15:05:40 -0700 Subject: [PATCH 297/485] Add initial stitcher documentation --- docs/_data/navigation.yml | 2 + docs/design/stitcher.md | 561 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 563 insertions(+) create mode 100644 docs/design/stitcher.md diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index 
b59910515..ca43a60ef 100644 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -11,6 +11,8 @@ link: /design/remap.html - name: De Novo Assembly link: /design/assembly.html + - name: Contig stitcher + link: /design/stitcher.html - name: Resistance link: /design/resistance.html - name: Genome Coverage diff --git a/docs/design/stitcher.md b/docs/design/stitcher.md new file mode 100644 index 000000000..0288f4186 --- /dev/null +++ b/docs/design/stitcher.md @@ -0,0 +1,561 @@ +--- +title: Contig Stitching in MiCall +--- + +DeNovo assembly does not invariably translate input reads into a +single contiguous sequence akin to a genomic consensus. Typically, +errors in input data lead to fragmented sequences — referred to as +contigs — which furthermore may overlap, thus encoding the same region +of a genome more than once. Assembling a unified consensus sequence +necessitates the systematic arrangement of these contigs while +addressing discrepancies within overlapping regions. That is the +Stitcher's function. + +# Structure + +The Stitcher is a specialized component within the MiCall system. It +is designed to operate as an independent module which processes the +assembled contigs, generally derived from DeNovo assembler outputs, +and produce a singular, coherent sequence. + +## Modular Aspect + +The Stitcher maintains a distinct and autonomous role within +MiCall. Its implementation is fully isolated to the +`contig_stitcher*.py` files within the MiCall's source code. The +stitcher module can be run as a CLI script, separately from the rest +of the pipeline. The following command runs the Stitcher: + +```sh +PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher --help +``` + + + + + + + + + + +## Interaction + +Stitching is initiated either as a pipeline step in MiCall, or as a +command line call given above. In each case: + +**Input:** The Stitcher receives a single input file in FASTA +format. 
This file contains 1 or more contigs that are the outcomes of +the previous assembly step. These contigs are essentially segments of +DNA sequences. They can vary significantly in length. + +**Output:** The sole output from the Stitcher is a CSV +file. This file holds the stitched sequences -- longer or fully +continuous sequences that represent the genomic consensus formed by +merging the initial fragmented contigs, and additional metadata, +such as the inferred reference genome's name. + + + +# Operational procedure + +To clarify operations of the Stitcher, the subsequent section +introduces a vocabulary that is necessary for a precise description. + +## Definitions + +- An **input nucleotide** refers to a nucleotide of an initial + assembly contig sequence. +- A **reference nucleotide** refers to a nucleotide of a reference + genome sequence. +- A **non-conflicting nucleotide** is a **reference nucleotide** that + has at most one candidate **input nucleotide**. +- A **non-ambiguous nucleotide** is an **input nucleotide**, which has + a clear positioning with respect to all **input nucleotides** of all + other contigs associated with the same reference genome. In + particular, all **conflicting nucleotides** are **ambiguous + nucleotides** because they do not have a clear positioning with + respect to their competing **conflicting nucleotide**. +- An **overlap** is a continuous segment of **conflicting + nucleotides**. +- **Multidirectional alignment** is a property of a contig such that: + 1. the contig has aligned in multiple parts. + 2. some parts have been aligned to the forward strand, and some to + the reverse strand of the reference genome. +- **Cross-alignment** is a property of a contig such that: + 1. the contig has aligned in multiple parts. + 2. the contig-order of the aligned parts does not agree with the + reference-order of the aligned parts.
+- A **non-aligned contig** is a contig that has been assigned a + reference sequence, but did not align to it. +- An **invalid contig** is a contig with **multidirectional + alignment**. +- A **stitched consensus** is a **valid contig** in the output of the + Stitcher. +- The **final output** refers to the contents of the only output CSV + file produced by the Stitcher. + +## Principles + +The reason the Stitcher operates effectively is due to its utilization +of reference genomes as an additional source of truth. More precisely, +the Stitcher integrates two sets of data: + +1. Sequences generated by the initial assembly. +2. Sequences of reference genomes to which assembled contigs get aligned. + +We will say that 1. is the assembler's data, and 2. is the aligner's. + +The core belief is that a reference genome can be used to enhance the +quality of and resolve conflicts within initial assembly contigs. + +In applying this approach, the Stitcher is guided by the following principles: + +### Principle of Scale-Dependent Credibility + +The reliability of sequence alignments increases as the length of the +aligned segment increases. +Therefore: + +- **Micro Scale**: For shorter segments, assembler's findings are more + reliable, because of expected abundance of small, local mutations + not present in the reference genome. + +- **Macro Scale**: For longer segments, the aligner's interpretations + are prioritized. The exponential decrease in alignment errors with + increased sequence length makes long alignments particularly + trustworthy. + +### Principle of Length Prioritization + +A longer contig typically arises from a greater number of reads +spanning a larger genomic region. While this does not imply more reads +per individual position, it suggests that the initial set of reads has +successfully assembled over a more extensive sequence, reflecting a +broader and more robust dataset.
Moreover, aligning a longer sequence +to the reference genome is statistically less probable, compared to a +shorter sequence. This means that a successful alignment of a longer +contig to the reference genome provides further confidence in its +accuracy. + +Therefore in scenarios where multiple contigs cover the same region of +the reference genome, longer contigs are prioritized over shorter +ones. + +### Ambiguity Omission Principle + +To mitigate the potential propagation of uncertainties, any data that +lacks a definitive, unambiguous position within the reference genome +should be entirely excluded. This approach acknowledges that absolute +certainty in complex genomic datasets is often unattainable, and tries +to establish a reasonable default. + +## Regulations + +Guided by the previously outlined principles, +several precise regulations governing the Stitcher can be extracted: + +1. For every reference genome, at most one **stitched consensus** + must result. +2. No **ambiguous, non-conflicting nucleotide** + shall be included into the **final output**. +3. Every **non-conflicting-** and **non-ambiguous-** nucleotide + pertaining to a **valid contig** is required to be included in the + **stitched consensus** for the associated reference genome. +4. The relative positions of **non-conflicting-** and + **non-ambiguous-** nucleotides must be preserved in the **final output**. +5. All nucleotides present in the **final output** must exclusively + originate from the initial assembly data. + +## Setup + +The setup process for the Stitcher ensures that each contig is +properly aligned and prepared for the stitching process. The steps are +as follows: + +1. **Determine Reference Genome**: Identify the best matching + reference genome for each contig based on its sequence data. + +2.
**Align Contigs**: Align each contig to its corresponding reference + genome to approximate their positions within a global reference + framework, allowing for spatial comparison between different contigs. + +3. **Split Multi-Alignment Contigs**: Split contigs that align to + multiple distinct parts of the reference genome into separate + segments. + +4. **Handle Reverse Complement**: Reverse complement contigs that + align to the reverse strand of the reference genome to ensure all + sequences are oriented in the same direction. + +5. **Sort Contigs**: Arrange the contigs based on their starting + positions along the reference genome. + +6. **Group by Reference**: Group contigs such that all contigs + associated with the same reference genome are processed together. + +These setup steps perform minimal alteration to the original contigs +and are primarily guided by straightforward, logical +considerations. Therefore, they do not require extensive +rationalization compared to the subsequent rules. + +## Rules of operation + +Stitching is an iterative process, governed by the following rules: + +### Rule 1: Merge Non-Overlapping Contigs + +1. **Verify Non-Overlap**: Ensure that the end of the first contig is + less than or equal to the start of the second contig according to their + positions on the reference genome. + +2. **Delete adjacent non-aligned parts**: Filter out any non-aligned + nucleotides positioned after the first contig's aligned part and + before the second contig's aligned part. + +3. **Concatenate Sequences**: Directly join the end of the first + contig to the start of the second contig. + +#### Example: + +**Input:** +- Contig A: Sequence = `GG[ATGCCC]AA`, aligned to Reference X at + position 10, with first two and last two nucleotides not aligned. +- Contig B: Sequence = `AC[TTAG]TA`, aligned to Reference X at position + 30, with first two and last two nucleotides not aligned. + +**Procedure:** +- Verify that Contig A ends before Contig B begins.
+- Delete non-aligned nucleotides resulting in Contig A = `GG[ATGCCC]` and Contig B = `[TTAG]TA`. +- Concatenate Contig A and Contig B to form `GG[ATGCCC][TTAG]TA`. + +**Result:** +- The new sequence, `GG[ATGCCCTTAG]TA`, spans positions 10 to 34 on the reference genome. + +#### Rationale + +There aren't many alternative actions available to us in these circumstances. +This enables us to consider all of them: + +1. **Leaving contigs as separate**: + + Separate contigs would result in multiple consensus outputs for one genome. + Thus it fails to comply with **regulation 1**. + +2. **Omitting the strip step**: + + Note that the adjacent non-aligned nucleotides of the two sequences + are **ambiguous, non-conflicting nucleotides**. Therefore, leaving + them in place violates **regulation 2**. + +3. **Introducing additional modifications**: + + Since given contigs do not overlap, every nucleotide in them is **non-conflicting**. + Additionally, we have stripped all the **ambiguous nucleotides**. + Therefore, all modifications that can be introduced + would either violate **regulation 3**, **regulation 4** or **regulation 5**. + +### Rule 2: Merge Overlapping Contigs + +1. **Verify Overlap**: Check if the ending position of the first + contig is greater than the starting position of the second contig. + +2. **Delete adjacent non-aligned parts**: Filter out any non-aligned + nucleotides positioned after the first contig's aligned part and + before the second contig's aligned part. + +3. **Align Overlapping Regions**: + - Extract the sequences from the overlapping region in both + contigs. + - Use a global alignment method to align these overlapping + sub-sequences. + +4. **Calculate Concordance Scores**: + - Compute concordance scores for each position within the + overlapping region. Importantly, the concordance calculation is + done purely between the aligned overlapping subsequences of the + contigs, with no regard to the reference genome sequence.
The + concordance score represents how well the nucleotides from the + two contigs match at each position. + - The score is calculated using a sliding average approach, + emphasizing regions with high sequence agreement. + +5. **Determine Optimal Cut Point**: + - Identify the cut point based on the concordance scores such that + it lies in the middle of regions with the highest + concordance. + - This means making cuts as far away from disagreeing nucleotides + as possible. + +6. **Segment and Combine**: + - Segment the overlapping sequences at the determined cut point. + - Concatenate the non-overlapping parts of the contigs with the + segmented parts from the overlapping region. + +#### Example + +**Input:** + +- Contig A: Sequence = `G[GGC A--TAC]T T`, aligned to Reference X from positions 10 to 19. +- Contig B: Sequence = `C CCA[AAATAC C]GGG`, aligned to Reference X from positions 14 to 20. + +**Procedure:** + +1. **Verify Overlap**: + - Contig A ends at position 19, and Contig B starts at position 14 + (both on Reference X), resulting in an overlap from positions 14 + to 19. + +2. **Delete adjacent non-aligned parts**: Contig A is right-stripped + to become `G[GGCA--TAC]`, contig B is left-stripped to become + `[AAATACC]GGG`. + +3. **Align Overlapping Regions**: + - The overlapping sequence is `A--TAC` from contig A, and `AAATAC` + from contig B. + - Align them globally to produce the following alignments: `--ATAC` + and `AAATAC` + +4. **Calculate Concordance**: + - Calculate concordance scores for positions 14 to 19, considering + only the overlap between the two aligned sequences. + - Approximate concordance: `[0.1, 0.2, 0.3, 0.8, 0.8, 0.3]`. + +5. **Determine Cut Point**: + - Use the computed concordance scores to identify the cut point. + - In this example, the highest concordance scores are around + positions with the score 0.8, so choose it as the cut point.
+ + ``` + Aligned sequences: + + A: --ATAC + B: AAATAC + + Concordance: + 0.1 0.2 0.3 0.8 0.8 0.3 + + Based on the concordance, cut between the positions: + A: --AT|AC + B: AAAT|AC + ``` + +6. **Segment and Combine**: + - Cut the sequences at the determined cut points. + - Combine sequence parts: `G[GGC][--AT][AC][C]GGG`. + +**Result:** +- The new sequence `G[GGC--ATACC]GGG` spans positions 10 to 20 on Reference X, + representing the most accurate combined sequence. + +#### Rationale + +This rule is similar to Rule 1, but deals with overlapping +regions. When contigs overlap, there is a need to choose a cut point +due to: + +1. **Aligner Constraints**: The aligner constrains the size of the + overlapping sequence (by the **Principle of Scale-Dependent + Credibility**), making it impossible to keep both versions of the + overlapping region simultaneously. +2. **Small scale adjustments**: Overlaps are usually small enough that + assembler data is the highest quality data we have for the + nucleotide positions within it. Thus interleaving segments from + both contigs would again violate the **Principle of Scale Dependent + Credibility**. + +We base the choice on concordance +scores, which measure the degree of agreement between the overlapping +sequences of the two contigs. We look for the highest concordance +because: + +**Choice of Cut Point**: +- If a cut point is chosen where concordance is lower than the + maximum, it implies that in the neighboring region around the cut + point, either to the left or right, there will almost certainly be + some incorrect nucleotides due to disagreement between the contigs. +- Conversely, if the concordance is high at the chosen cut point, the + neighboring region is similar between the two contigs. 
The selected + extensions (left of the cut point from the left contig and right of + the cut point from the right contig) are longer than the alternative + from the conflicting contig, ensuring greater trust in these regions + based on their length (by the **Principle of Length Prioritization**). + +While this method of choosing the cut point based on concordance +scores aligns with the Principles, we acknowledge that there might be +other ways to determine the optimal cut point. However, given the +complexity of overlapping regions and the necessity to preserve +relative ordering, this concordance-based approach is the best we have +identified so far. + +### Rule 3: Split Contigs with Gaps Which Are Covered by Other Contigs + +1. **Identify Large Gaps**: + - For each contig, identify regions within its alignment to the + reference genome that lack coverage, i.e., gaps. Both small gaps + resulting from sequencing errors and large gaps are recognized. + - Significant gaps are determined based on a pre-defined + threshold. In the context of HIV genome analysis, a gap size of + greater than 21 nucleotides is considered significant due to + common RNA secondary structure phenomena. + +2. **Verify Coverage by Other Contigs**: + - For each identified significant gap, check if other contigs span + or cover this gap. Specifically, check if other contigs have + aligned reference coordinates that overlap with the coordinates + of the gap. + +3. **Split Contig at Gap Midpoint**: + - If a significant gap is covered by another contig, split the + contig containing the gap into two separate contigs at the + midpoint of the gap. + - Left-trim the new right contig segment and right-trim the new + left contig segment to remove ambiguity from their ends. + +4. **Update Contig List**: + - Replace the original contig with its two new segments in the list + of contigs. 
+ +#### Example + +**Input:** + +- Contig A: Sequence = `AGC[TTAC---------------------GGCACATATCATA]CTA`, + aligned to Reference X from positions 10 to 48. +- Contig B: Sequence = `G[TGAC-----GGACG-TCGTCG--TACGATCAG]G`, + aligned to Reference X from positions 8 to 40. + +**Procedure:** + +1. **Identify Large Gaps**: + - Contig A has a significant gap between positions 14 and 35. + +2. **Verify Coverage by Other Contigs**: + - Contig B covers the gap region from positions 8 to 40. + +3. **Split Contig at Gap Midpoint**: + - Split Contig A into two parts at the midpoint of the gap (i.e., position 24). + This creates two new contigs: + - Contig A1: Sequence = `AGC[TTAC----------]`, + aligned to Reference X from positions 10 to 24. + - Contig A2: Sequence = `[-----------GGCACATATCATA]CTA`, + aligned to Reference X from positions 25 to 48. + - Trim the new segments: + - Contig A1 becomes `AGC[TTAC]`. + - Contig A2 becomes `[GGCACATATCATA]CTA`. + +4. **Update Contig List**: + - Discard the original Contig A and add Contig A1 and Contig A2 to + the list of contigs. + +**Result:** + +- Modified list of contigs now includes Contig B, Contig A1, and Contig A2. + +#### Rationale + +The decision to split contigs at large gaps covered by other contigs +is grounded in the **Principle of Scale-Dependent +Credibility**. Assemblers can occasionally join sequence fragments +incorrectly if the end of one segment appears similar to the start of +another. Relying on the aligner's macro-scale credibility helps +identify these erroneous joins. Large gaps within a contig are +suspicious and suggest potential assembler errors, whereas small gaps +are generally due to sequencing errors or micro-scale mutations and do +not warrant splitting. By leveraging the aligner's high reliability on +a macro scale, we can effectively pinpoint these errors. If other +contigs cover large gaps, it confirms the aligner's indication that +the assembly might have joined unrelated segments. 
Splitting contigs +at the midpoint of significant gaps ensures that only those segments +supported by both the assembler's micro-scale data and the aligner's +macro-scale alignment are included in the final stitched consensus. + +The threshold for considering a gap significant is set at 21 +nucleotides. This value was chosen because it correlates with the +average pitch of the RNA helix, which reflects how reverse +transcription periodic deletions are structured around 21 nucleotides +in HIV sequences. Choosing this cutoff recognizes that deletions of +approximately this length are a common feature due to RNA secondary +structures and should not automatically warrant a split. This way, we +avoid splitting on every small gap, which is expected given the nature +of micro-scale mutations, but effectively identify and act on larger, +suspect gaps indicative of potential assembler errors. + +### Rule 4: Discard Contigs That Are Fully Covered By Other Contigs + +1. **Identify Covered Contigs**: + - For each contig in the input set, calculate its aligned interval on the reference genome. + - Identify intervals (regions) that are completely covered by input contigs. + +2. **Compare Intervals**: + - Assess the intervals of each contig to find any contig that falls entirely within the span of other contig intervals. + These are the contigs that are fully covered by others. + +3. **Discard Fully Covered Contigs**: + - Once identified, remove the covered contigs. + +#### Example + +**Input:** + +- Contig A: Sequence = `A[ATCGA]GCT`, aligned to Reference X from positions 10 to 15. +- Contig B: Sequence = `C[TAGTTG]A`, aligned to Reference X from positions 14 to 19. +- Contig C: Sequence = `G[CGTACC]G`, aligned to Reference X from positions 12 to 17. + +**Procedure:** + +1. **Identify Covered Contigs**: + - Calculate the intervals: + - Contig A: `[10-15]` + - Contig B: `[14-19]` + - Contig C: `[12-17]` + +2. 
**Compare Intervals**: + - Assess intervals and find Contig C: `[12-17]` is completely within the intervals `[10-15]` of Contig A and `[14-19]` of Contig B. + +3. **Discard Fully Covered Contigs**: + - Remove Contig C from the analysis. + +**Result:** + +- Unchanged remaining contigs Contig A and Contig C. + +#### Rationale + +The underlying idea for this rule is founded on the two following principles: + +1. **Principle of Length Prioritization**: longer contigs are + inherently more reliable. + +2. **Principle of Scale-Dependent Credibility**: Fully covered contigs + might introduce small-scale inconsistencies that the longer + contig can resolve more credibly, given the enhanced reliability + associated with its length and alignment. + +Moreover, keeping all contigs would violate **Regulation 1**. + +--- + +**Note**: rules apply to contigs that are in the same group. + + + + + + + + + + + + + + + + + + + + + + From b2e101042d9bf87bcda50b8190038cda4a9ed168 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 May 2024 15:02:35 -0700 Subject: [PATCH 298/485] Add illustrations for the stitcher documentation --- docs/design/stitcher.md | 81 ++++++----- docs/design/stitcher_rule_1_input.svg | 83 +++++++++++ docs/design/stitcher_rule_1_result.svg | 51 +++++++ docs/design/stitcher_rule_2_input.svg | 83 +++++++++++ docs/design/stitcher_rule_2_result.svg | 51 +++++++ docs/design/stitcher_rule_3_input.svg | 83 +++++++++++ docs/design/stitcher_rule_3_result.svg | 109 +++++++++++++++ docs/design/stitcher_rule_4_input.svg | 115 +++++++++++++++ docs/design/stitcher_rule_4_result.svg | 185 +++++++++++++++++++++++++ 9 files changed, 810 insertions(+), 31 deletions(-) create mode 100644 docs/design/stitcher_rule_1_input.svg create mode 100644 docs/design/stitcher_rule_1_result.svg create mode 100644 docs/design/stitcher_rule_2_input.svg create mode 100644 docs/design/stitcher_rule_2_result.svg create mode 100644 docs/design/stitcher_rule_3_input.svg create mode 100644 
docs/design/stitcher_rule_3_result.svg create mode 100644 docs/design/stitcher_rule_4_input.svg create mode 100644 docs/design/stitcher_rule_4_result.svg diff --git a/docs/design/stitcher.md b/docs/design/stitcher.md index 0288f4186..a6b8618d9 100644 --- a/docs/design/stitcher.md +++ b/docs/design/stitcher.md @@ -219,17 +219,23 @@ Stitching is an iterative process, governed by the following rules: #### Example: **Input:** -- Contig A: Sequence = `GG[ATGCCC]AA`, aligned to Referece X at + +![non overlaping example input illustration](stitcher_rule_1_input.svg) + +- Contig 1: Sequence = `GG[ATGCCC]AA`, aligned to Referece X at position 10, with first two and last two nucleotides not aligned. -- Contig B: Sequence = `AC[TTAG]TA`, aligned to Referece X at position +- Contig 2: Sequence = `AC[TTAG]TA`, aligned to Referece X at position 30, with first two and last two nucleotides not aligned. **Procedure:** -- Verify that Contig A ends before Contig B begins. -- Delete non-aligned nucleotides resulting in Contig A = `GG[ATGCCC]` and Contig B = `[TTAG]TA`. -- Concatenate Contig A and Contig B to form `GG[ATGCCC][TTAG]TA`. +- Verify that Contig 1 ends before Contig 2 begins. +- Delete non-aligned nucleotides resulting in Contig 1 = `GG[ATGCCC]` and Contig 2 = `[TTAG]TA`. +- Concatenate Contig 1 and Contig 2 to form `GG[ATGCCC][TTAG]TA`. **Result:** + +![non overlaping example result illustration](stitcher_rule_1_result.svg) + - The new sequence, `GG[ATGCCCTTAG]TA`, spans positions 10 to 34 on the reference genome. #### Rationale @@ -296,18 +302,20 @@ This enables us to consider all of them: **Input:** -- Contig A: Sequence = `G[GGC A--TAC]T T`, aligned to Reference X from positions 10 to 19. -- Contig B: Sequence = `C CCA[AAATAC C]GGG`, aligned to Reference X from positions 14 to 20. +![overlaping example input illustration](stitcher_rule_2_input.svg) + +- Contig 1: Sequence = `G[GGCC A--TAC]T T`, aligned to Reference X from positions 10 to 19. 
+- Contig 2: Sequence = `--CCAC[AAATAC C]GGG`, aligned to Reference X from positions 14 to 20. **Procedure:** 1. **Verify Overlap**: - - Contig A ends at position 19, and Contig B starts at position 14 + - Contig 1 ends at position 19, and Contig 2 starts at position 14 (both on Reference X), resulting in an overlap from positions 14 to 19. -2. **Delete adjacent non-aligned parts**: Contig A is right-stripped - to become `G[GGCA--TAC]`, contig B is left-stripped to become +2. **Delete adjacent non-aligned parts**: Contig 1 is right-stripped + to become `G[GGCCA--TAC]`, Contig 2 is left-stripped to become `[AAATACC]GGG`. 3. **Align Overlapping Regions**: @@ -342,9 +350,12 @@ This enables us to consider all of them: 6. **Segment and Combine**: - Cut the sequences at the determined cut points. - - Combine sequence parts: `G[GGC][--AT][AC][C]GGG`. + - Combine sequence parts: `G[GGCC][--AT][AC][C]GGG`. **Result:** + +![overlapping example result illustration](stitcher_rule_2_result.svg) + - The new sequence `G[GGC--ATACC]GGG` spans positions 10 to 20 on Reference X, representing the most accurate combined sequence. @@ -420,37 +431,41 @@ identified so far. **Input:** -- Contig A: Sequence = `AGC[TTAC---------------------GGCACATATCATA]CTA`, +![gap example input illustration](stitcher_rule_3_input.svg) + +- Contig 1: Sequence = `AGC[TTAC---------------------GGCACATATCATA]CTA`, aligned to Reference X from positions 10 to 48. -- Contig B: Sequence = `G[TGAC-----GGACG-TCGTCG--TACGATCAG]G`, +- Contig 2: Sequence = `G[TGAC-----GGACG-TCGTCG--TACGATCAG]G`, aligned to Reference X from positions 8 to 40. **Procedure:** 1. **Identify Large Gaps**: - - Contig A has a significant gap between positions 14 and 35. + - Contig 1 has a significant gap between positions 14 and 35. 2. **Verify Coverage by Other Contigs**: - - Contig B covers the gap region from positions 8 to 40. + - Contig 2 covers the gap region from positions 8 to 40. 3.
**Split Contig at Gap Midpoint**: - - Split Contig A into two parts at the midpoint of the gap (i.e., position 24). + - Split Contig 1 into two parts at the midpoint of the gap (i.e., position 24). This creates two new contigs: - - Contig A1: Sequence = `AGC[TTAC----------]`, + - Contig 1a: Sequence = `AGC[TTAC----------]`, aligned to Reference X from positions 10 to 24. - - Contig A2: Sequence = `[-----------GGCACATATCATA]CTA`, + - Contig 1b: Sequence = `[-----------GGCACATATCATA]CTA`, aligned to Reference X from positions 25 to 48. - Trim the new segments: - - Contig A1 becomes `AGC[TTAC]`. - - Contig A2 becomes `[GGCACATATCATA]CTA`. + - Contig 1a becomes `AGC[TTAC]`. + - Contig 1b becomes `[GGCACATATCATA]CTA`. 4. **Update Contig List**: - - Discard the original Contig A and add Contig A1 and Contig A2 to + - Discard the original Contig 1 and add Contig 1a and Contig 1b to the list of contigs. **Result:** -- Modified list of contigs now includes Contig B, Contig A1, and Contig A2. +![gap example result illustration](stitcher_rule_3_result.svg) + +- Modified list of contigs now includes Contig 2, Contig 1a, and Contig 1b. #### Rationale @@ -498,27 +513,31 @@ suspect gaps indicative of potential assembler errors. **Input:** -- Contig A: Sequence = `A[ATCGA]GCT`, aligned to Reference X from positions 10 to 15. -- Contig B: Sequence = `C[TAGTTG]A`, aligned to Reference X from positions 14 to 19. -- Contig C: Sequence = `G[CGTACC]G`, aligned to Reference X from positions 12 to 17. +![covered example input illustration](stitcher_rule_4_input.svg) + +- Contig 1: Sequence = `A[ATCGA]GCT`, aligned to Reference X from positions 10 to 15. +- Contig 2: Sequence = `C[TAGTTG]A`, aligned to Reference X from positions 14 to 19. +- Contig 3: Sequence = `G[CGTACC]G`, aligned to Reference X from positions 12 to 17. **Procedure:** 1.
**Identify Covered Contigs**: - Calculate the intervals: - - Contig A: `[10-15]` - - Contig B: `[14-19]` - - Contig C: `[12-17]` + - Contig 1: `[10-15]` + - Contig 2: `[14-19]` + - Contig 3: `[12-17]` 2. **Compare Intervals**: - - Assess intervals and find Contig C: `[12-17]` is completely within the intervals `[10-15]` of Contig A and `[14-19]` of Contig B. + - Assess intervals and find Contig 3: `[12-17]` is completely within the intervals `[10-15]` of Contig 1 and `[14-19]` of Contig 2. 3. **Discard Fully Covered Contigs**: - - Remove Contig C from the analysis. + - Remove Contig 3 from the analysis. **Result:** -- Unchanged remaining contigs Contig A and Contig C. +![covered example result illustration](stitcher_rule_4_result.svg) + +- Unchanged remaining contigs Contig 1 and Contig 3. #### Rationale diff --git a/docs/design/stitcher_rule_1_input.svg b/docs/design/stitcher_rule_1_input.svg new file mode 100644 index 000000000..1916fe621 --- /dev/null +++ b/docs/design/stitcher_rule_1_input.svg @@ -0,0 +1,83 @@ + + + + + + + + + + + + + + + + + + + + + + + + +Reference X + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/docs/design/stitcher_rule_1_result.svg b/docs/design/stitcher_rule_1_result.svg new file mode 100644 index 000000000..f0adf1210 --- /dev/null +++ b/docs/design/stitcher_rule_1_result.svg @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + +Reference X + + + + + + + +1 + + + + + + + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/docs/design/stitcher_rule_2_input.svg b/docs/design/stitcher_rule_2_input.svg new file mode 100644 index 000000000..f6fdba0f2 --- /dev/null +++ b/docs/design/stitcher_rule_2_input.svg @@ -0,0 +1,83 @@ + + + + + + + + + + + + + + + + + + + + + + + + +Reference X + + + + + + + +1 + + + + + +2 + + + + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + + + + +2 + + + + \ No 
newline at end of file diff --git a/docs/design/stitcher_rule_2_result.svg b/docs/design/stitcher_rule_2_result.svg new file mode 100644 index 000000000..cfec558b1 --- /dev/null +++ b/docs/design/stitcher_rule_2_result.svg @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + +Reference X + + + + + + + +1 + + + + + + + + + + + + + + + + + + +1 + + + + \ No newline at end of file diff --git a/docs/design/stitcher_rule_3_input.svg b/docs/design/stitcher_rule_3_input.svg new file mode 100644 index 000000000..be44cfe22 --- /dev/null +++ b/docs/design/stitcher_rule_3_input.svg @@ -0,0 +1,83 @@ + + + + + + + + + + + + + + + + + + + + + + + + +Reference X + + + + + + + +2 + + + + + +1 + + + + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + + + + +2 + + + + \ No newline at end of file diff --git a/docs/design/stitcher_rule_3_result.svg b/docs/design/stitcher_rule_3_result.svg new file mode 100644 index 000000000..f9289a002 --- /dev/null +++ b/docs/design/stitcher_rule_3_result.svg @@ -0,0 +1,109 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Reference X + + + + + + + +1 + + + + + +3 + + + + + +2 + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + +2 + + + + + + + + + + + + + + + + + + + +3 + + + + \ No newline at end of file diff --git a/docs/design/stitcher_rule_4_input.svg b/docs/design/stitcher_rule_4_input.svg new file mode 100644 index 000000000..e7acfad78 --- /dev/null +++ b/docs/design/stitcher_rule_4_input.svg @@ -0,0 +1,115 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Reference X + + + + + + + +1 + + + + + +3 + + + + + +2 + + + + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + + + + +2 + + + + + + + + + + + + + + + + + + + +3 + + + + \ No newline at end of file diff --git a/docs/design/stitcher_rule_4_result.svg b/docs/design/stitcher_rule_4_result.svg new file mode 100644 index 000000000..a29e15d60 --- /dev/null +++ b/docs/design/stitcher_rule_4_result.svg @@ -0,0 +1,185 @@ + + + + + + 
+ + + + + + + + + + + + + + + + + + +Reference X + + + + + + + +1 + + + + + +3 + + + + + + + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + + + + +3 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discarded: + + + + + + +2 + + + + \ No newline at end of file From e70643cc0814e3dfda1841bbb324a466b7552eb3 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 May 2024 15:32:33 -0700 Subject: [PATCH 299/485] Describe visualizer in the stitcher documentation --- docs/design/stitcher.md | 71 ++- docs/design/stitcher_practical_plot.svg | 582 ++++++++++++++++++++++++ 2 files changed, 647 insertions(+), 6 deletions(-) create mode 100644 docs/design/stitcher_practical_plot.svg diff --git a/docs/design/stitcher.md b/docs/design/stitcher.md index a6b8618d9..57571304a 100644 --- a/docs/design/stitcher.md +++ b/docs/design/stitcher.md @@ -557,17 +557,76 @@ Moreover, keeping all contigs would violate **Regulation 1**. **Note**: rules apply to contigs that are in the same group. - +# Visualizer - +The Stitcher includes a visualizer tool that helps to see and verify +its decisions by producing a visual representation of the stitching +process. This tool is particularly useful for understanding how +contigs are combined or discarded at each step of the stitching +process, providing a visual context to the resulting stitched output. - +## The Optional Visualizer Tool - +The visualizer can be enabled through the `--plot` flag when running +the Stitcher executable. Running the Stitcher with this flag will +produce an SVG file that visualizes the stitching process, helping to +confirm and debug the Stitcher's operations. - +To use the visualizer, run the Stitcher with an additional argument +specifying the path to the output plot file. 
Here's an example of how +to stitch contigs and retrieve a visualizer plot: - +```sh +PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher "contigs.fasta" "stitched_contigs.csv" --plot "visualized.svg" +``` + +**Command Line Arguments:** + +- `contigs.fasta`: Input file in FASTA format containing assembled + contigs. +- `stitched_contigs.csv`: Output CSV file that will contain the + stitched contigs. +- `--plot visualized.svg`: The optional argument to generate a visual + representation of the stitching process, saved as `visualized.svg`. + +### Understanding the Output + +In practice, a visualizer plot might look something like this: + +![practical visualizer plot](stitcher_practical_plot.svg) + +From such a diagram, you can gain insights into the following aspects +of the stitching process: + +- **Reference genome**: The best matching reference genome for this + group of contigs was determined to be `HIV1-A1-RW-KF716472`. + +- **Dropped Contigs**: Contigs that were dropped due to being fully + covered by other contigs, as per Rule 4. In the example plot: + - Contigs 2, 4, 7, 8, and 6 were dropped. + +- **Split Contigs**: Contigs split at large gaps covered by other + contigs, according to Rule 3. The resulting parts are shown as + individual segments. + - Contig 1 was split around Contig 3, producing segments labeled as + 1.1 and 1.3. + +- **Joined Contigs**: Contigs that were merged due to overlap: + - Contigs 1 and 3, which were joined as per Rule 2, with + **ambiguous, non-conflicting** nucleotides discarded, shown as + segments labeled 1.2 and 3.1. + +- **Unaligned Contigs**: Contigs that failed to align to the reference + genome during the alignment step of the setup. + - Contig 5 failed to align. + +- **Contigs without a Reference**: Contigs for which a reference + genome could not be determined during the reference detection step + of the setup. + - Contigs 9 and 10 failed to determine a reference genome. 
+ +Understanding these basics will help to interpret other scenarios +displayed by the visualizer plot. diff --git a/docs/design/stitcher_practical_plot.svg b/docs/design/stitcher_practical_plot.svg new file mode 100644 index 000000000..c2a7587f7 --- /dev/null +++ b/docs/design/stitcher_practical_plot.svg @@ -0,0 +1,582 @@ + + + + + + + + +5' LTR + + + + + +gag + + + + + +vif + + + + + +tat + + + + + +nef + + + + + + + +tat + + + + + +vpu + + + + + +rev + + + + + +3' LTR + + + + + + + +pol + + + + + +vpr + + + + + +rev + + + + + +env + + + + + + + +PR + + + + + +RT + + + + + +INT + + + + + +V3 + + + + + +GP41 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +HIV1-A1-RW-KF716472-seed + + + + + + + +1.1 + + + + + +3.2 + + + + + +1.3 + + + + + + + + + + + + + + + + + + + +1.1 + + + + + + + + +1.3 + + + + + + + + + + + + + +3.2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + +1.2 + + + + + + +2 + + + + + + +3.1 + + + + + + +4 + + + + + + + + + + +6 + + + + + + + + + + +7 + + + + + + + + + + +8 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +anomaly: + + + + + + +5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +unknown: + + + + + + +9 + + + + + + + +10 + + + + \ No newline at end of file From 58eeab0221ad253b09670a51419de7c9f19e4486 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 May 2024 16:04:40 -0700 Subject: [PATCH 300/485] Describe logging diagnostics in the stitcher documentation --- docs/design/stitcher.md | 85 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) diff 
--git a/docs/design/stitcher.md b/docs/design/stitcher.md index 57571304a..2f851fbf5 100644 --- a/docs/design/stitcher.md +++ b/docs/design/stitcher.md @@ -557,13 +557,13 @@ Moreover, keeping all contigs would violate **Regulation 1**. **Note**: rules apply to contigs that are in the same group. -# Visualizer +# Diagnostics -The Stitcher includes a visualizer tool that helps to see and verify -its decisions by producing a visual representation of the stitching -process. This tool is particularly useful for understanding how -contigs are combined or discarded at each step of the stitching -process, providing a visual context to the resulting stitched output. +The Stitcher includes diagnostic tools to ensure transparency and +correctness throughout the stitching process. Two primary methods are +used for diagnostics: visualizer plots and traditional log +files. These tools help users understand and verify the decisions made +by the Stitcher during the stitching process. ## The Optional Visualizer Tool @@ -628,6 +628,79 @@ of the stitching process: Understanding these basics will help to interpret other scenarios displayed by the visualizer plot. +## Traditional Logs + +In addition to visual tools, the Stitcher produces traditional log +files that provide textual details of the stitching process. These +logs are crucial for debugging and understanding the sequence of +operations performed by the Stitcher. The verbosity of logs can be +adjusted using command-line options (`--verbose`, `--debug`, `--quiet`). + +Here is an example of typical log entries: + +```text +DEBUG:micall.core.contig_stitcher:Introduced contig 'contig.00001' (seq = TA...CA) of ref 'HIV1-C-BR-JX140663-seed', group_ref HIV1-A1-RW-KF716472-seed (seq = GA...AC), and length 7719. +DEBUG:micall.core.contig_stitcher:Introduced contig 'contig.00002' (seq = CG...AG) of ref 'HIV1-A1-RW-KF716472-seed', group_ref HIV1-A1-RW-KF716472-seed (seq = GA...AC), and length 1634. +... 
+DEBUG:micall.core.contig_stitcher:Contig 'contig.00006' produced 1 aligner hits. After connecting them, the number became 1. +DEBUG:micall.core.contig_stitcher:Part 0 of contig 'contig.00006' re-aligned as (5) at 7M...3D@[8,1433]->[7461,8946]. +DEBUG:micall.core.contig_stitcher:Part 0 of contig 'contig.00007' aligned at 76M...3D@[0,732]->[5536,6277]. +DEBUG:micall.core.contig_stitcher:Contig 'contig.00007' produced 1 aligner hits. After connecting them, the number became 1. +DEBUG:micall.core.contig_stitcher:Part 0 of contig 'contig.00007' re-aligned as (6) at 76M...3D@[0,732]->[5536,6277]. +... +DEBUG:micall.core.contig_stitcher:Ignored insignificant gap of (5), 3D@[790,789]->[8280,8282]. +DEBUG:micall.core.contig_stitcher:Ignored insignificant gap of (5), 19D@[1324,1323]->[8817,8835]. +DEBUG:micall.core.contig_stitcher:Ignored insignificant gap of (5), 2D@[1354,1353]->[8866,8867]. +... +DEBUG:micall.core.contig_stitcher:Created contigs (8) at 24M...1I@[14,3864]->[0,4558] and (9) at 708D...92I@[3865,7691]->[4559,9032] by cutting (1) at 24M...1I@[14,7691]->[0,9032] at cut point = 4558.5. +DEBUG:micall.core.contig_stitcher:Doing rstrip of (8) at 24M...1I@[14,3864]->[0,4558] (len 7719) resulted in (10) at 24M...1I@[14,3864]->[0,3850] (len 3865). +DEBUG:micall.core.contig_stitcher:Doing lstrip of (9) at 708D...92I@[3865,7691]->[4559,9032] (len 7719) resulted in (11) at 14M...1I@[0,3734]->[5267,9032] (len 3762). +DEBUG:micall.core.contig_stitcher:Split contig (1) at 24M...1I@[14,7691]->[0,9032] around its gap at [3864, 3863]->[3851, 5266]. Left part: (10) at 24M...1I@[14,3864]->[0,3850], right part: (11) at 14M...1I@[0,3734]->[5267,9032]. +... +DEBUG:micall.core.contig_stitcher:Created a frankenstein (34) at 24M...1I@[14,4185]->[0,4171] (len 4186) from [(26) at 24M...1I@[14,3041]->[0,3027] (len 3042), (28) at 271M2D3M2I395M@[0,670]->[3028,3698] (len 671), (30) at 152M@[0,151]->[3699,3850] (len 152), (31) at 321M@[0,320]->[3851,4171] (len 321)]. 
+DEBUG:micall.core.plot_contigs:Contig name (26) is displayed as '1.1'. +DEBUG:micall.core.plot_contigs:Contig name (36) is displayed as '1.3'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00002' is displayed as '2'. +DEBUG:micall.core.plot_contigs:Contig name (2) is displayed as '2'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00003' is displayed as '3'. +DEBUG:micall.core.plot_contigs:Contig name (31) is displayed as '3.2'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00004' is displayed as '4'. +DEBUG:micall.core.plot_contigs:Contig name (4) is displayed as '4'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00005' is displayed as '5'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00006' is displayed as '6'. +DEBUG:micall.core.plot_contigs:Contig name (5) is displayed as '6'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00007' is displayed as '7'. +DEBUG:micall.core.plot_contigs:Contig name (6) is displayed as '7'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00008' is displayed as '8'. +DEBUG:micall.core.plot_contigs:Contig name (7) is displayed as '8'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00009' is displayed as '9'. +DEBUG:micall.core.plot_contigs:Contig name 'contig.00010' is displayed as '10'. +``` + +The following points illustrate how these logs can facilitate +understanding the stitching process: + +- **Contig Introduction**: Provides details about the contigs + introduced for stitching. + - `Introduced contig 'contig.00001'...` + +- **Alignment Details**: Shows the alignment results for each contig. + - `Part 0 of contig 'contig.00006' re-aligned as (5) at + 7M...3D@[8,1433]->[7461,8946].` + +- **Gap Handling**: Indicates which gaps were ignored as + insignificant. + - `Ignored insignificant gap of (5), 3D@[790,789]->[8280,8282].` + +- **Splitting and Merging Contigs**: Documents the splitting of + contigs at identified gaps and merging of overlapping segments. 
+ - `Split contig (1) at 24M...1I@[14,7691]->[0,9032]...` + - `Created a frankenstein (34) at 24M...1I@[14,4185]->[0,4171]...` + +- **Visualizer Compatibility**: The visualizer diagrams are produced + exclusively from these logs, ensuring compatibility and consistency + between the logs and visual output. + From 56cbb48bcb4c27fcecda284d1a87dd2edbe7ce44 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 15 May 2024 16:24:32 -0700 Subject: [PATCH 301/485] Add Limitations section to the stitcher documentation --- docs/design/stitcher.md | 52 ++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/docs/design/stitcher.md b/docs/design/stitcher.md index 2f851fbf5..1a676a795 100644 --- a/docs/design/stitcher.md +++ b/docs/design/stitcher.md @@ -701,12 +701,46 @@ understanding the stitching process: exclusively from these logs, ensuring compatibility and consistency between the logs and visual output. - - - - - - - - - +# Limitations + +The following limitations stem from the choice of principles and various +assumptions that guide the Stitcher's operation. Understanding them +allows users to better interpret the results and apply post-processing +steps to mitigate potential issues. + +One of the critical challenges is the handling of ambiguous +nucleotides. The Stitcher's **Ambiguity Omission Principle**, which +aims to avoid propagating uncertainties, might lead to the exclusion +of significant sequence data, resulting in the loss of potentially +valuable variations or mutations. + +Moreover, the calculation of concordance in overlapping regions +assumes that local concordance is the best indicator of the correct +sequence. This approach may not fully account for complex genomic +rearrangements or context outside the overlap, potentially +compromising the accuracy of the stitched sequence.
+ +The predefined threshold for significant gaps, based on specific +assumptions about RNA secondary structures of organisms like HIV, +might not generalize well to other organisms or genomic regions. This +can lead to over-splitting or under-splitting contigs, further +fragmenting the consensus sequence. + +Additionally, the Stitcher's principle of scale-dependent credibility +might overlook important small-scale variations, such as single +nucleotide polymorphisms (SNPs) or small indels, especially if they +are lost in longer contigs deemed more reliable. + +Another critical limitation arises in the context of pipelines dealing +with proviral sequences. The Stitcher might attempt to "fix" sequences +that are inherently "broken", such as those that are scrambled, +contain long deletions, or exhibit hypermutation. In such cases, the +tool's corrective measures may not be desirable, as they risk +introducing inaccuracies. This limitation makes the Stitcher +unsuitable for certain pipelines where the integrity of such broken +sequences should be preserved without alteration. + +Finally, the handling of multidirectional and cross-alignments may +fall short when addressing complex genomic rearrangements, such as +translocations or inversions, potentially resulting in misalignments +and stitching errors in the consensus sequence. From 12ca09097661f4f5495da154776a2763e19042ff Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 22 May 2024 18:11:20 -0700 Subject: [PATCH 302/485] Contig stitcher: initialize from CSV data This is a first step in making the Stitcher being able to accept CSV files as inputs.
--- micall/core/contig_stitcher.py | 54 +++++++++++++++++++++++++++- micall/core/denovo.py | 65 ++++++++++++++++------------------ 2 files changed, 83 insertions(+), 36 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 0e1197c91..682855adc 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,5 +1,6 @@ -from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar +from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar, TextIO from collections import defaultdict +import csv from dataclasses import replace from math import ceil from mappy import Aligner @@ -13,6 +14,8 @@ from operator import itemgetter from aligntools import Cigar, connect_cigar_hits, CigarHit +from micall.core.project_config import ProjectConfig +from micall.core.plot_contigs import plot_stitcher_coverage from micall.utils.contig_stitcher_context import context, StitcherContext from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig import micall.utils.contig_stitcher_events as events @@ -597,6 +600,55 @@ def combine(group_ref): yield from map(combine, consensus_parts) +def output_contigs(writer, contigs: Iterable[GenotypedContig]): + for contig in contigs: + writer.writerow(dict(ref=contig.ref_name, + match=contig.match_fraction, + group_ref=contig.group_ref, + contig=contig.seq)) + + +def input_contigs(input_csv: TextIO) -> Iterable[GenotypedContig]: + projects = ProjectConfig.loadDefault() + + for row in csv.DictReader(input_csv): + seq = row['contig'] + ref_name = row['ref'] + group_ref = row['group_ref'] + match_fraction = float(row['match']) + + try: + ref_seq = projects.getGenotypeReference(group_ref) + except KeyError: + try: + ref_seq = projects.getReference(group_ref) + except KeyError: + ref_seq = None + + yield GenotypedContig(name=None, + seq=seq, + ref_name=ref_name, + group_ref=group_ref, + ref_seq=str(ref_seq) if ref_seq is not None else None, + 
match_fraction=match_fraction) + + +def parse_and_run(input_csv: TextIO, stitched_writer: TextIO, stitcher_plot_path: Optional[str]) -> int: + with StitcherContext.fresh() as ctx: + contigs = list(input_contigs(input_csv)) + + if stitched_writer is not None or stitcher_plot_path is not None: + contigs = list(stitch_consensus(contigs)) + + if stitched_writer is not None: + output_contigs(stitched_writer, contigs) + + if stitcher_plot_path is not None: + plot_stitcher_coverage(ctx.events, stitcher_plot_path) + + return len(contigs) + + def main(args): import argparse from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. diff --git a/micall/core/denovo.py b/micall/core/denovo.py index e672e6548..aa7fdb1ba 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -11,9 +11,9 @@ from io import StringIO from itertools import groupby from operator import itemgetter -from shutil import rmtree +from shutil import rmtree, copyfileobj from subprocess import run, PIPE, CalledProcessError, STDOUT -from tempfile import mkdtemp +from tempfile import mkdtemp, NamedTemporaryFile from Bio import SeqIO from Bio.Blast.Applications import NcbiblastnCommandline @@ -21,9 +21,8 @@ from Bio.SeqRecord import SeqRecord from micall.core.project_config import ProjectConfig -from micall.core.contig_stitcher import GenotypedContig, stitch_consensus -from micall.core.plot_contigs import plot_stitcher_coverage -from micall.utils.contig_stitcher_context import StitcherContext +from micall.utils.contig_stitcher_contigs import GenotypedContig +import micall.core.contig_stitcher as stitcher IVA = "iva" DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), @@ -68,8 +67,18 @@ def init_contigs_refs(contigs_csv: TextIO): return writer -def contigs_refs_write(writer, ref: str, match: float, group_ref: str, contig: str): - writer.writerow(dict(ref=ref, match=match, group_ref=group_ref, contig=contig)) +def write_unstitched_contigs(writer, + group_refs, + 
genotypes, + contigs_fasta_path + ): + + for contig in read_assembled_contigs(group_refs, genotypes, contigs_fasta_path): + writer.writerow(dict(ref=contig.ref_name, + match=contig.match_fraction, + group_ref=contig.group_ref, + contig=contig.seq + )) def write_contig_refs(contigs_fasta_path: str, @@ -97,41 +106,27 @@ def write_contig_refs(contigs_fasta_path: str, contig_name = f'merged-contig-{i}' contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") - unstitched_writer = init_contigs_refs(unstitched_contigs_csv) \ - if unstitched_contigs_csv else None - stitched_writer = init_contigs_refs(contigs_csv) if contigs_csv else None - group_refs: Dict[str, str] = {} + with NamedTemporaryFile(mode='wt') as temporary_unstitched_csv: + unstitched_writer = init_contigs_refs(cast(TextIO, temporary_unstitched_csv)) + stitched_writer = init_contigs_refs(contigs_csv) if contigs_csv else None + group_refs: Dict[str, str] = {} - with StitcherContext.fresh() as ctx: genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs) - contigs = list(read_assembled_contigs(group_refs, genotypes, contigs_fasta_path)) - - if unstitched_writer is not None: - for contig in contigs: - contigs_refs_write(unstitched_writer, - ref=contig.ref_name, - match=contig.match_fraction, - group_ref=contig.group_ref, - contig=contig.seq) - - if stitched_writer is not None or stitcher_plot_path is not None: - contigs = list(stitch_consensus(contigs)) - - if stitched_writer is not None: - for contig in contigs: - contigs_refs_write(stitched_writer, - ref=contig.ref_name, - match=contig.match_fraction, - group_ref=contig.group_ref, - contig=contig.seq) + write_unstitched_contigs(unstitched_writer, + group_refs, + genotypes, + contigs_fasta_path) + temporary_unstitched_csv.flush() - if stitcher_plot_path is not None: - plot_stitcher_coverage(ctx.events, stitcher_plot_path) + if unstitched_contigs_csv: + with open(temporary_unstitched_csv.name) as input_csv: + copyfileobj(input_csv, 
unstitched_contigs_csv) - return len(contigs) + with open(temporary_unstitched_csv.name) as input_csv: + return stitcher.parse_and_run(input_csv, stitched_writer, stitcher_plot_path) def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): From 7c99e15446c9c339e8467cd7086e1378ec1e9cce Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 22 May 2024 18:14:56 -0700 Subject: [PATCH 303/485] Contig stitcher: add --input-type argument This argument is used to choose between CSV and FASTA input types. --- micall/core/contig_stitcher.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 682855adc..a14d2acd8 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -651,19 +651,24 @@ def parse_and_run(input_csv: TextIO, stitched_writer: TextIO, stitcher_plot_path def main(args): import argparse + import os from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. parser = argparse.ArgumentParser() - parser.add_argument('contigs', type=argparse.FileType('r'), help="Input FASTA file with assembled contigs.") + parser.add_argument('contigs', type=argparse.FileType('r'), help="Input file with assembled contigs.") parser.add_argument('stitched_contigs', type=argparse.FileType('w'), help="Output CSV file with stitched contigs.") parser.add_argument('--plot', type=argparse.FileType('w'), help="Output SVG image visualizing the stitching process.") + parser.add_argument('--input-type', choices=['csv', 'fasta'], + help='Type of the input file: either "csv" or "fasta".' 
+ ' If not specified, the type is inferred from the file extension.') verbosity_group = parser.add_mutually_exclusive_group() verbosity_group.add_argument('--verbose', action='store_true', help='Increase output verbosity.') verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity.', default=True) verbosity_group.add_argument('--debug', action='store_true', help='Maximum output verbosity.') verbosity_group.add_argument('--quiet', action='store_true', help='Minimize output verbosity.') + args = parser.parse_args(args) if args.quiet: @@ -676,9 +681,22 @@ def main(args): logger.setLevel(logging.WARN) logging.basicConfig(level=logger.level) + + # Infer input type from file extension if not specified + if args.input_type is None: + _, ext = os.path.splitext(args.contigs.name) + if ext.lower() in ['.csv']: + args.input_type = 'csv' + elif ext.lower() in ['.fasta', '.fa']: + args.input_type = 'fasta' + else: + parser.error('Unable to infer the input type from file extension. 
Please provide "--input-type".') + with StitcherContext.fresh(): plot_path = args.plot.name if args.plot is not None else None + write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path) + args.contigs.close() args.stitched_contigs.close() if args.plot is not None: From 4afe5321ee9ec10370cb4e1c76f50031d7e18853 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 22 May 2024 18:18:40 -0700 Subject: [PATCH 304/485] Contig stitcher: handle CSV files --- micall/core/contig_stitcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index a14d2acd8..633660c37 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -695,7 +695,10 @@ def main(args): with StitcherContext.fresh(): plot_path = args.plot.name if args.plot is not None else None - write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path) + if args.input_type == 'csv': + parse_and_run(args.contigs, args.stitched_contigs, plot_path) + else: + write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path) args.contigs.close() args.stitched_contigs.close() From 6de2dc4f77a402cddc68ac0db2300d04abaa2280 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 09:06:23 -0700 Subject: [PATCH 305/485] Contig stitcher: make group_ref optional --- micall/core/contig_stitcher.py | 2 +- micall/utils/contig_stitcher_contigs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 633660c37..0f92639a3 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -578,7 +578,7 @@ def stitch_contigs(contigs: Iterable[GenotypedContig]) -> Iterable[GenotypedCont yield from combine_overlaps(aligned) -GroupRef = str +GroupRef = Optional[str] def stitch_consensus(contigs: 
Iterable[GenotypedContig]) -> Iterable[GenotypedContig]: diff --git a/micall/utils/contig_stitcher_contigs.py b/micall/utils/contig_stitcher_contigs.py index c79e835ed..a05c5bdc4 100644 --- a/micall/utils/contig_stitcher_contigs.py +++ b/micall/utils/contig_stitcher_contigs.py @@ -39,7 +39,7 @@ def register(self) -> int: @dataclass(frozen=True) class GenotypedContig(Contig): ref_name: str - group_ref: str + group_ref: Optional[str] # The sequence of self.group_ref. None in cases where the reference organism is unknown. ref_seq: Optional[str] From 2f3e6f841504a1aec7550c62b6684f76e04b0c45 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 08:18:00 -0700 Subject: [PATCH 306/485] Factor out fasta_to_csv.py script --- micall/core/contig_stitcher.py | 17 ++- micall/core/denovo.py | 212 ++++----------------------------- micall/utils/fasta_to_csv.py | 200 +++++++++++++++++++++++++++++++ 3 files changed, 235 insertions(+), 194 deletions(-) create mode 100644 micall/utils/fasta_to_csv.py diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 0f92639a3..527d7dfbb 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,6 +1,7 @@ from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar, TextIO from collections import defaultdict import csv +import os from dataclasses import replace from math import ceil from mappy import Aligner @@ -600,7 +601,11 @@ def combine(group_ref): yield from map(combine, consensus_parts) -def output_contigs(writer, contigs: Iterable[GenotypedContig]): +def output_contigs(output_csv: TextIO, contigs: Iterable[GenotypedContig]): + writer = csv.DictWriter(output_csv, + ['ref', 'match', 'group_ref', 'contig'], + lineterminator=os.linesep) + writer.writeheader() for contig in contigs: writer.writerow(dict(ref=contig.ref_name, match=contig.match_fraction, @@ -633,15 +638,15 @@ def input_contigs(input_csv: TextIO) -> Iterable[GenotypedContig]: 
match_fraction=match_fraction) -def parse_and_run(input_csv: TextIO, stitched_writer: TextIO, stitcher_plot_path: Optional[str]) -> int: +def run(input_csv: TextIO, output_csv: TextIO, stitcher_plot_path: Optional[str]) -> int: with StitcherContext.fresh() as ctx: contigs = list(input_contigs(input_csv)) - if stitched_writer is not None or stitcher_plot_path is not None: + if output_csv is not None or stitcher_plot_path is not None: contigs = list(stitch_consensus(contigs)) - if stitched_writer is not None: - output_contigs(stitched_writer, contigs) + if output_csv is not None: + output_contigs(output_csv, contigs) if stitcher_plot_path is not None: plot_stitcher_coverage(ctx.events, stitcher_plot_path) @@ -696,7 +701,7 @@ def main(args): plot_path = args.plot.name if args.plot is not None else None if args.input_type == 'csv': - parse_and_run(args.contigs, args.stitched_contigs, plot_path) + run(args.contigs, args.stitched_contigs, plot_path) else: write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index aa7fdb1ba..61dca8ec4 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -2,85 +2,28 @@ import logging import os import tempfile -import typing -from typing import Optional, TextIO, Iterable, Dict, cast -from collections import Counter -from csv import DictWriter, DictReader +from typing import Optional, TextIO, cast +from csv import DictReader from datetime import datetime from glob import glob -from io import StringIO -from itertools import groupby -from operator import itemgetter from shutil import rmtree, copyfileobj -from subprocess import run, PIPE, CalledProcessError, STDOUT +from subprocess import PIPE, CalledProcessError, STDOUT +import subprocess from tempfile import mkdtemp, NamedTemporaryFile from Bio import SeqIO -from Bio.Blast.Applications import NcbiblastnCommandline from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from 
micall.core.project_config import ProjectConfig -from micall.utils.contig_stitcher_contigs import GenotypedContig +from micall.utils.fasta_to_csv import run, DEFAULT_DATABASE, genotype import micall.core.contig_stitcher as stitcher +assert DEFAULT_DATABASE +assert genotype is not None + IVA = "iva" -DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), - '..', - 'blast_db', - 'refs.fasta') logger = logging.getLogger(__name__) - -def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> Iterable[GenotypedContig]: - projects = ProjectConfig.loadDefault() - - for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")): - (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0)) - seq = record.seq - if match_fraction < 0: - seq = seq.reverse_complement() - match_fraction *= -1 - - group_ref = group_refs.get(ref_name) - try: - ref_seq = projects.getGenotypeReference(group_ref) - except KeyError: - try: - ref_seq = projects.getReference(group_ref) - except KeyError: - ref_seq = None - - yield GenotypedContig(name=record.name, - seq=str(seq), - ref_name=ref_name, - group_ref=group_ref, - ref_seq=str(ref_seq) if ref_seq is not None else None, - match_fraction=match_fraction) - - -def init_contigs_refs(contigs_csv: TextIO): - writer = DictWriter(contigs_csv, - ['ref', 'match', 'group_ref', 'contig'], - lineterminator=os.linesep) - writer.writeheader() - return writer - - -def write_unstitched_contigs(writer, - group_refs, - genotypes, - contigs_fasta_path - ): - - for contig in read_assembled_contigs(group_refs, genotypes, contigs_fasta_path): - writer.writerow(dict(ref=contig.ref_name, - match=contig.match_fraction, - group_ref=contig.group_ref, - contig=contig.seq - )) - - def write_contig_refs(contigs_fasta_path: str, unstitched_contigs_csv: Optional[TextIO], contigs_csv: Optional[TextIO], @@ -107,128 +50,21 @@ def write_contig_refs(contigs_fasta_path: str, contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") with 
NamedTemporaryFile(mode='wt') as temporary_unstitched_csv: - unstitched_writer = init_contigs_refs(cast(TextIO, temporary_unstitched_csv)) - stitched_writer = init_contigs_refs(contigs_csv) if contigs_csv else None - group_refs: Dict[str, str] = {} - - genotypes = genotype(contigs_fasta_path, - blast_csv=blast_csv, - group_refs=group_refs) - - write_unstitched_contigs(unstitched_writer, - group_refs, - genotypes, - contigs_fasta_path) - temporary_unstitched_csv.flush() + run(contigs_fasta_path, + cast(TextIO, temporary_unstitched_csv), + blast_csv) if unstitched_contigs_csv: with open(temporary_unstitched_csv.name) as input_csv: copyfileobj(input_csv, unstitched_contigs_csv) - with open(temporary_unstitched_csv.name) as input_csv: - return stitcher.parse_and_run(input_csv, stitched_writer, stitcher_plot_path) - - -def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): - """ Use Blastn to search for the genotype of a set of reference sequences. + if contigs_csv: + output_csv = contigs_csv + else: + output_csv = open("/dev/null", "wt") - :param str fasta: file path of the FASTA file containing the query - sequences - :param str db: file path of the database to search for matches - :param blast_csv: open file to write the blast matches to, or None - :param dict group_refs: {contig_ref: group_ref} or None. The dictionary - will get filled in with the mapping from each contig's reference name - to the best matched reference for the whole seed group. - :return: {query_name: (ref_name, matched_fraction)} where query_name is a - sequence header from the query sequences FASTA file, ref_name is the - name of the best match from the database, and matched_fraction is the - fraction of the query that aligned against the reference (matches and - mismatches). 
- """ - - contig_nums: Dict[str, int] = {} # {contig_name: contig_num} - with open(fasta) as f: - for line in f: - if line.startswith('>'): - contig_name = line[1:-1] - contig_nums[contig_name] = len(contig_nums) + 1 - blast_columns = ['qaccver', - 'saccver', - 'pident', - 'score', - 'qcovhsp', - 'qstart', - 'qend', - 'sstart', - 'send'] - cline = NcbiblastnCommandline(query=fasta, - db=db, - outfmt=f'"10 {" ".join(blast_columns)}"', - evalue=0.0001, - gapopen=5, - gapextend=2, - penalty=-3, - reward=1, - max_target_seqs=5000) - stdout, _ = cline() - samples = {} # {query_name: (subject_name, matched_fraction)} - matches = sorted(DictReader(StringIO(stdout), blast_columns), - key=lambda row: (row['qaccver'], float(row['score']))) - if not blast_csv: - blast_writer = None - else: - blast_writer = DictWriter(blast_csv, - ['contig_num', - 'ref_name', - 'score', - 'match', - 'pident', - 'start', - 'end', - 'ref_start', - 'ref_end'], - lineterminator=os.linesep) - blast_writer.writeheader() - contig_top_matches = {match['qaccver']: match['saccver'] - for match in matches} - top_refs = set(contig_top_matches.values()) - projects = ProjectConfig.loadDefault() - match_scores: typing.Counter[str] = Counter() - for contig_name, contig_matches in groupby(matches, itemgetter('qaccver')): - contig_top_ref = contig_top_matches[contig_name] - contig_seed_group = projects.getSeedGroup(contig_top_ref) - for match in contig_matches: - ref_name = match['saccver'] - if ref_name not in top_refs: - continue - match_seed_group = projects.getSeedGroup(ref_name) - if match_seed_group == contig_seed_group: - match_scores[ref_name] += float(match['score']) # type: ignore[assignment] - - if group_refs is not None: - group_top_refs = {projects.getSeedGroup(ref_name): ref_name - for ref_name, count in reversed(match_scores.most_common())} - for ref_name in contig_top_matches.values(): - group_refs[ref_name] = group_top_refs[projects.getSeedGroup(ref_name)] - - for match in matches: - 
matched_fraction = float(match['qcovhsp']) / 100 - if int(match['send']) < int(match['sstart']): - matched_fraction *= -1 - pident = round(float(match['pident'])) - contig_name = match['qaccver'] - samples[contig_name] = (match['saccver'], matched_fraction) - if blast_writer: - blast_writer.writerow(dict(contig_num=contig_nums[contig_name], - ref_name=match['saccver'], - score=match['score'], - match=matched_fraction, - pident=pident, - start=match['qstart'], - end=match['qend'], - ref_start=match['sstart'], - ref_end=match['send'])) - return samples + with open(temporary_unstitched_csv.name) as input_csv: + return stitcher.run(input_csv, output_csv, stitcher_plot_path) def denovo(fastq1_path: str, @@ -273,12 +109,12 @@ def denovo(fastq1_path: str, joined_path = os.path.join(tmp_dir, 'joined.fastq') if stitcher_plot_path is None: stitcher_plot_path = os.path.join(tmp_dir, "stitcher_plot.svg") - run(['merge-mates', - fastq1_path, - fastq2_path, - '--interleave', - '-o', joined_path], - check=True) + subprocess.run(['merge-mates', + fastq1_path, + fastq2_path, + '--interleave', + '-o', joined_path], + check=True) iva_out_path = os.path.join(tmp_dir, 'iva_out') contigs_fasta_path = os.path.join(iva_out_path, 'contigs.fasta') iva_args = [IVA, '--fr', joined_path, '-t', '2'] @@ -294,7 +130,7 @@ def denovo(fastq1_path: str, iva_args.extend(['--contigs', seeds_fasta_path, '--make_new_seeds']) iva_args.append(iva_out_path) try: - run(iva_args, check=True, stdout=PIPE, stderr=STDOUT) + subprocess.run(iva_args, check=True, stdout=PIPE, stderr=STDOUT) except CalledProcessError as ex: output = ex.output and ex.output.decode('UTF8') if output != 'Failed to make first seed. 
Cannot continue\n': diff --git a/micall/utils/fasta_to_csv.py b/micall/utils/fasta_to_csv.py new file mode 100644 index 000000000..44361d211 --- /dev/null +++ b/micall/utils/fasta_to_csv.py @@ -0,0 +1,200 @@ +import typing +from typing import Optional, TextIO, Iterable, Dict, cast +from collections import Counter +from csv import DictWriter, DictReader +from io import StringIO +from itertools import groupby +from operator import itemgetter +import os + +from Bio import SeqIO +from Bio.Blast.Applications import NcbiblastnCommandline + +from micall.core.project_config import ProjectConfig +from micall.utils.contig_stitcher_contigs import GenotypedContig + + +DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), + '..', + 'blast_db', + 'refs.fasta') + + +def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> Iterable[GenotypedContig]: + projects = ProjectConfig.loadDefault() + + for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")): + (ref_name, match_fraction) = genotypes.get(record.name, ('unknown', 0)) + seq = record.seq + if match_fraction < 0: + seq = seq.reverse_complement() + match_fraction *= -1 + + group_ref = group_refs.get(ref_name) + try: + ref_seq = projects.getGenotypeReference(group_ref) + except KeyError: + try: + ref_seq = projects.getReference(group_ref) + except KeyError: + ref_seq = None + + yield GenotypedContig(name=record.name, + seq=str(seq), + ref_name=ref_name, + group_ref=group_ref, + ref_seq=str(ref_seq) if ref_seq is not None else None, + match_fraction=match_fraction) + + +def init_contigs_refs(contigs_csv: TextIO): + writer = DictWriter(contigs_csv, + ['ref', 'match', 'group_ref', 'contig'], + lineterminator=os.linesep) + writer.writeheader() + return writer + + +def write_unstitched_contigs(writer, + group_refs, + genotypes, + contigs_fasta_path + ): + + for contig in read_assembled_contigs(group_refs, genotypes, contigs_fasta_path): + writer.writerow(dict(ref=contig.ref_name, + 
match=contig.match_fraction, + group_ref=contig.group_ref, + contig=contig.seq + )) + + +def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): + """ Use Blastn to search for the genotype of a set of reference sequences. + + :param str fasta: file path of the FASTA file containing the query + sequences + :param str db: file path of the database to search for matches + :param blast_csv: open file to write the blast matches to, or None + :param dict group_refs: {contig_ref: group_ref} or None. The dictionary + will get filled in with the mapping from each contig's reference name + to the best matched reference for the whole seed group. + :return: {query_name: (ref_name, matched_fraction)} where query_name is a + sequence header from the query sequences FASTA file, ref_name is the + name of the best match from the database, and matched_fraction is the + fraction of the query that aligned against the reference (matches and + mismatches). + """ + + contig_nums: Dict[str, int] = {} # {contig_name: contig_num} + with open(fasta) as f: + for line in f: + if line.startswith('>'): + contig_name = line[1:-1] + contig_nums[contig_name] = len(contig_nums) + 1 + blast_columns = ['qaccver', + 'saccver', + 'pident', + 'score', + 'qcovhsp', + 'qstart', + 'qend', + 'sstart', + 'send'] + cline = NcbiblastnCommandline(query=fasta, + db=db, + outfmt=f'"10 {" ".join(blast_columns)}"', + evalue=0.0001, + gapopen=5, + gapextend=2, + penalty=-3, + reward=1, + max_target_seqs=5000) + stdout, _ = cline() + samples = {} # {query_name: (subject_name, matched_fraction)} + matches = sorted(DictReader(StringIO(stdout), blast_columns), + key=lambda row: (row['qaccver'], float(row['score']))) + if not blast_csv: + blast_writer = None + else: + blast_writer = DictWriter(blast_csv, + ['contig_num', + 'ref_name', + 'score', + 'match', + 'pident', + 'start', + 'end', + 'ref_start', + 'ref_end'], + lineterminator=os.linesep) + blast_writer.writeheader() + contig_top_matches = 
{match['qaccver']: match['saccver'] + for match in matches} + top_refs = set(contig_top_matches.values()) + projects = ProjectConfig.loadDefault() + match_scores: typing.Counter[str] = Counter() + for contig_name, contig_matches in groupby(matches, itemgetter('qaccver')): + contig_top_ref = contig_top_matches[contig_name] + contig_seed_group = projects.getSeedGroup(contig_top_ref) + for match in contig_matches: + ref_name = match['saccver'] + if ref_name not in top_refs: + continue + match_seed_group = projects.getSeedGroup(ref_name) + if match_seed_group == contig_seed_group: + match_scores[ref_name] += float(match['score']) # type: ignore[assignment] + + if group_refs is not None: + group_top_refs = {projects.getSeedGroup(ref_name): ref_name + for ref_name, count in reversed(match_scores.most_common())} + for ref_name in contig_top_matches.values(): + group_refs[ref_name] = group_top_refs[projects.getSeedGroup(ref_name)] + + for match in matches: + matched_fraction = float(match['qcovhsp']) / 100 + if int(match['send']) < int(match['sstart']): + matched_fraction *= -1 + pident = round(float(match['pident'])) + contig_name = match['qaccver'] + samples[contig_name] = (match['saccver'], matched_fraction) + if blast_writer: + blast_writer.writerow(dict(contig_num=contig_nums[contig_name], + ref_name=match['saccver'], + score=match['score'], + match=matched_fraction, + pident=pident, + start=match['qstart'], + end=match['qend'], + ref_start=match['sstart'], + ref_end=match['send'])) + return samples + + +def run(contigs_fasta_path: str, + unstitched_contigs_csv: TextIO, + blast_csv: Optional[TextIO] = None): + """ Run BLAST search to identify contig sequences. 
+ + :param str contigs_fasta_path: path to file to read contig sequences from + and append merged contigs to + :param unstitched_contigs_csv: open file to write assembled contigs to + :param contigs_csv: open file to write stitched contigs to + :param merged_contigs_csv: open file to read contigs that were merged from + amplicon reads + :param blast_csv: open file to write BLAST search results for each contig + :param stitcher_plot_path: open file to write the visualizer plot to + """ + + unstitched_writer = init_contigs_refs(cast(TextIO, unstitched_contigs_csv)) + group_refs: Dict[str, str] = {} + + genotypes = genotype(contigs_fasta_path, + blast_csv=blast_csv, + group_refs=group_refs) + + write_unstitched_contigs(unstitched_writer, + group_refs, + genotypes, + contigs_fasta_path) + unstitched_contigs_csv.flush() From 91be20c86563de1e5000802095b68ec426be15e8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 09:07:54 -0700 Subject: [PATCH 307/485] Cleanup the fasta_to_csv script Update documentation and improve typing. 
--- micall/utils/fasta_to_csv.py | 135 +++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 53 deletions(-) diff --git a/micall/utils/fasta_to_csv.py b/micall/utils/fasta_to_csv.py index 44361d211..bb7c9819e 100644 --- a/micall/utils/fasta_to_csv.py +++ b/micall/utils/fasta_to_csv.py @@ -1,11 +1,14 @@ +import argparse +import logging +import os import typing -from typing import Optional, TextIO, Iterable, Dict, cast +from typing import Optional, TextIO, Iterable, Dict, cast, Sequence from collections import Counter from csv import DictWriter, DictReader -from io import StringIO from itertools import groupby from operator import itemgetter -import os + +from io import StringIO from Bio import SeqIO from Bio.Blast.Applications import NcbiblastnCommandline @@ -14,13 +17,22 @@ from micall.utils.contig_stitcher_contigs import GenotypedContig -DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), - '..', - 'blast_db', - 'refs.fasta') +DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), '..', 'blast_db', 'refs.fasta') + +def read_assembled_contigs(group_refs: Dict[str, str], + genotypes: Dict[str, typing.Tuple[str, float]], + contigs_fasta_path: str) -> Iterable[GenotypedContig]: + """Read assembled contigs and generate GenotypedContig objects. -def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> Iterable[GenotypedContig]: + Args: + group_refs (Dict[str, str]): Mapping of reference names to group references. + genotypes (Dict[str, Tuple[str, float]]): Mapping of contig names to (reference name, match fraction). + contigs_fasta_path (str): Path to the FASTA file containing contig sequences. + + Returns: + Iterable[GenotypedContig]: An iterable of GenotypedContig objects. 
+ """ projects = ProjectConfig.loadDefault() for i, record in enumerate(SeqIO.parse(contigs_fasta_path, "fasta")): @@ -47,7 +59,15 @@ def read_assembled_contigs(group_refs, genotypes, contigs_fasta_path: str) -> It match_fraction=match_fraction) -def init_contigs_refs(contigs_csv: TextIO): +def init_contigs_refs(contigs_csv: TextIO) -> DictWriter: + """Initialize a CSV writer with header for contig references. + + Args: + contigs_csv (TextIO): Open file object to write the contig references. + + Returns: + DictWriter: A CSV DictWriter object initialized with the headers. + """ writer = DictWriter(contigs_csv, ['ref', 'match', 'group_ref', 'contig'], lineterminator=os.linesep) @@ -55,35 +75,39 @@ def init_contigs_refs(contigs_csv: TextIO): return writer -def write_unstitched_contigs(writer, - group_refs, - genotypes, - contigs_fasta_path - ): +def write_unstitched_contigs(writer: DictWriter, + group_refs: Dict[str, str], + genotypes: Dict[str, typing.Tuple[str, float]], + contigs_fasta_path: str): + """Write unstitched contigs to a CSV file. + Args: + writer (DictWriter): CSV writer to write contigs. + group_refs (Dict[str, str]): Mapping of reference names to group references. + genotypes (Dict[str, Tuple[str, float]]): Mapping of contig names to (reference name, match fraction). + contigs_fasta_path (str): Path to the FASTA file containing contig sequences. + """ for contig in read_assembled_contigs(group_refs, genotypes, contigs_fasta_path): writer.writerow(dict(ref=contig.ref_name, match=contig.match_fraction, group_ref=contig.group_ref, - contig=contig.seq - )) - - -def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): - """ Use Blastn to search for the genotype of a set of reference sequences. 
- - :param str fasta: file path of the FASTA file containing the query - sequences - :param str db: file path of the database to search for matches - :param blast_csv: open file to write the blast matches to, or None - :param dict group_refs: {contig_ref: group_ref} or None. The dictionary - will get filled in with the mapping from each contig's reference name - to the best matched reference for the whole seed group. - :return: {query_name: (ref_name, matched_fraction)} where query_name is a - sequence header from the query sequences FASTA file, ref_name is the - name of the best match from the database, and matched_fraction is the - fraction of the query that aligned against the reference (matches and - mismatches). + contig=contig.seq)) + + +def genotype(fasta: str, db: str = DEFAULT_DATABASE, + blast_csv: Optional[TextIO] = None, + group_refs: Optional[Dict[str, str]] = None) -> Dict[str, typing.Tuple[str, float]]: + """Use Blastn to search for the genotype of a set of reference sequences. + + Args: + fasta (str): File path of the FASTA file containing the query sequences. + db (str): File path of the database to search for matches. + blast_csv (Optional[TextIO]): Open file to write the blast matches to, or None. + group_refs (Optional[Dict[str, str]]): Dictionary to fill with the mapping from + each contig's reference name to the best matched reference for the whole seed group. + + Returns: + Dict[str, Tuple[str, float]]: Mapping of query name to (reference name, matched fraction). """ contig_nums: Dict[str, int] = {} # {contig_name: contig_num} @@ -171,30 +195,35 @@ def genotype(fasta, db=DEFAULT_DATABASE, blast_csv=None, group_refs=None): return samples -def run(contigs_fasta_path: str, - unstitched_contigs_csv: TextIO, - blast_csv: Optional[TextIO] = None): - """ Run BLAST search to identify contig sequences. 
+def run(contigs_fasta_path: str, unstitched_contigs_csv: TextIO, blast_csv: Optional[TextIO] = None): + """Run BLAST search to identify contig sequences and write them to CSV. - :param str contigs_fasta_path: path to file to read contig sequences from - and append merged contigs to - :param unstitched_contigs_csv: open file to write assembled contigs to - :param contigs_csv: open file to write stitched contigs to - :param merged_contigs_csv: open file to read contigs that were merged from - amplicon reads - :param blast_csv: open file to write BLAST search results for each contig - :param stitcher_plot_path: open file to write the visualizer plot to + Args: + contigs_fasta_path (str): Path to the FASTA file containing contig sequences. + unstitched_contigs_csv (TextIO): Open file to write assembled contigs to. + blast_csv (Optional[TextIO]): Open file to write BLAST search results for each contig. """ - unstitched_writer = init_contigs_refs(cast(TextIO, unstitched_contigs_csv)) group_refs: Dict[str, str] = {} - genotypes = genotype(contigs_fasta_path, - blast_csv=blast_csv, - group_refs=group_refs) + genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs) - write_unstitched_contigs(unstitched_writer, - group_refs, - genotypes, - contigs_fasta_path) + write_unstitched_contigs(unstitched_writer, group_refs, genotypes, contigs_fasta_path) unstitched_contigs_csv.flush() + + +def main(argv: Sequence[str]): + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser(description="Convert contigs from FASTA to CSV format with BLAST annotations.") + parser.add_argument('contigs_fasta', help="Input FASTA file with contig sequences.") + parser.add_argument('unstitched_contigs_csv', type=argparse.FileType('w'), + help="Output CSV file to write assembled contigs.") + parser.add_argument('--blast_csv', type=argparse.FileType('w'), + help="Optional CSV file to write BLAST search results.") + args = parser.parse_args(argv) + 
run(args.contigs_fasta, args.unstitched_contigs_csv, args.blast_csv) + + +if __name__ == "__main__": + import sys + main(sys.argv[1:]) From 8db85c5a49bf5b4db02d7f2750efe80cd2e9a6f9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 09:03:28 -0700 Subject: [PATCH 308/485] Contig stitcher: small code improvements --- micall/core/contig_stitcher.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 527d7dfbb..de7dd2100 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -1,4 +1,4 @@ -from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar, TextIO +from typing import Iterable, Optional, Tuple, List, Dict, Literal, TypeVar, TextIO, Sequence from collections import defaultdict import csv import os @@ -601,7 +601,7 @@ def combine(group_ref): yield from map(combine, consensus_parts) -def output_contigs(output_csv: TextIO, contigs: Iterable[GenotypedContig]): +def write_contigs(output_csv: TextIO, contigs: Iterable[GenotypedContig]): writer = csv.DictWriter(output_csv, ['ref', 'match', 'group_ref', 'contig'], lineterminator=os.linesep) @@ -612,8 +612,10 @@ def output_contigs(output_csv: TextIO, contigs: Iterable[GenotypedContig]): group_ref=contig.group_ref, contig=contig.seq)) + output_csv.flush() -def input_contigs(input_csv: TextIO) -> Iterable[GenotypedContig]: + +def read_contigs(input_csv: TextIO) -> Iterable[GenotypedContig]: projects = ProjectConfig.loadDefault() for row in csv.DictReader(input_csv): @@ -640,13 +642,13 @@ def input_contigs(input_csv: TextIO) -> Iterable[GenotypedContig]: def run(input_csv: TextIO, output_csv: TextIO, stitcher_plot_path: Optional[str]) -> int: with StitcherContext.fresh() as ctx: - contigs = list(input_contigs(input_csv)) + contigs = list(read_contigs(input_csv)) if output_csv is not None or stitcher_plot_path is not None: contigs = 
list(stitch_consensus(contigs)) if output_csv is not None: - output_contigs(output_csv, contigs) + write_contigs(output_csv, contigs) if stitcher_plot_path is not None: plot_stitcher_coverage(ctx.events, stitcher_plot_path) @@ -654,7 +656,7 @@ def run(input_csv: TextIO, output_csv: TextIO, stitcher_plot_path: Optional[str] return len(contigs) -def main(args): +def main(argv: Sequence[str]): import argparse import os from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. @@ -674,7 +676,7 @@ def main(args): verbosity_group.add_argument('--debug', action='store_true', help='Maximum output verbosity.') verbosity_group.add_argument('--quiet', action='store_true', help='Minimize output verbosity.') - args = parser.parse_args(args) + args = parser.parse_args(argv) if args.quiet: logger.setLevel(logging.ERROR) @@ -697,18 +699,11 @@ def main(args): else: parser.error('Unable to infer the input type from file extension. Please provide "--input-type".') - with StitcherContext.fresh(): - plot_path = args.plot.name if args.plot is not None else None - - if args.input_type == 'csv': - run(args.contigs, args.stitched_contigs, plot_path) - else: - write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path) - - args.contigs.close() - args.stitched_contigs.close() - if args.plot is not None: - args.plot.close() + plot_path = args.plot.name if args.plot is not None else None + if args.input_type == 'csv': + run(args.contigs, args.stitched_contigs, plot_path) + else: + write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path) if __name__ == '__main__': From 74a531945246e614bd61764cea36f802f832d382 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 08:55:47 -0700 Subject: [PATCH 309/485] Contig stitcher: only accept CSV files as inputs --- micall/core/contig_stitcher.py | 22 ++-------------------- micall/tests/data/exact_parts_contigs.csv | 3 +++ 
micall/tests/test_contig_stitcher.py | 4 ++-- 3 files changed, 7 insertions(+), 22 deletions(-) create mode 100644 micall/tests/data/exact_parts_contigs.csv diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index de7dd2100..90ddb4bab 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -658,18 +658,13 @@ def run(input_csv: TextIO, output_csv: TextIO, stitcher_plot_path: Optional[str] def main(argv: Sequence[str]): import argparse - import os - from micall.core.denovo import write_contig_refs # TODO(vitalik): move denovo stuff here. parser = argparse.ArgumentParser() - parser.add_argument('contigs', type=argparse.FileType('r'), help="Input file with assembled contigs.") + parser.add_argument('contigs', type=argparse.FileType('r'), help="Input CSV file with assembled contigs.") parser.add_argument('stitched_contigs', type=argparse.FileType('w'), help="Output CSV file with stitched contigs.") parser.add_argument('--plot', type=argparse.FileType('w'), help="Output SVG image visualizing the stitching process.") - parser.add_argument('--input-type', choices=['csv', 'fasta'], - help='Type of the input file: either "csv" or "fasta".' - ' If not specified, the type is inferred from the file extension.') verbosity_group = parser.add_mutually_exclusive_group() verbosity_group.add_argument('--verbose', action='store_true', help='Increase output verbosity.') verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity.', default=True) @@ -689,21 +684,8 @@ def main(argv: Sequence[str]): logging.basicConfig(level=logger.level) - # Infer input type from file extension if not specified - if args.input_type is None: - _, ext = os.path.splitext(args.contigs.name) - if ext.lower() in ['.csv']: - args.input_type = 'csv' - elif ext.lower() in ['.fasta', '.fa']: - args.input_type = 'fasta' - else: - parser.error('Unable to infer the input type from file extension. 
Please provide "--input-type".') - plot_path = args.plot.name if args.plot is not None else None - if args.input_type == 'csv': - run(args.contigs, args.stitched_contigs, plot_path) - else: - write_contig_refs(args.contigs.name, None, args.stitched_contigs, stitcher_plot_path=plot_path) + run(args.contigs, args.stitched_contigs, plot_path) if __name__ == '__main__': diff --git a/micall/tests/data/exact_parts_contigs.csv b/micall/tests/data/exact_parts_contigs.csv new file mode 100644 index 000000000..cb9745df6 --- /dev/null +++ b/micall/tests/data/exact_parts_contigs.csv @@ -0,0 +1,3 @@ +ref,match,group_ref,contig +HIV1-B-FR-K03455-seed,1.0,HIV1-B-FR-K03455-seed,GGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACAT +HIV1-B-FR-K03455-seed,1.0,HIV1-B-FR-K03455-seed,ACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAG diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index faaed9f48..f893dab00 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1403,7 +1403,7 @@ def test_correct_stitching_of_one_normal_and_one_unknown(exact_aligner, visualiz def test_main_invocation(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) - 
contigs = os.path.join(pwd, "data", "exact_parts_contigs.fasta") + contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") stitched_contigs = os.path.join(tmp_path, "stitched.csv") stitcher.main([contigs, stitched_contigs]) @@ -1425,7 +1425,7 @@ def test_main_invocation(exact_aligner, tmp_path, hcv_db): def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): pwd = os.path.dirname(__file__) - contigs = os.path.join(pwd, "data", "exact_parts_contigs.fasta") + contigs = os.path.join(pwd, "data", "exact_parts_contigs.csv") stitched_contigs = os.path.join(tmp_path, "stitched.csv") plot = os.path.join(tmp_path, "exact_parts_contigs.plot.svg") stitcher.main([contigs, stitched_contigs, "--debug", "--plot", plot]) From a701f28334e55877b35aab877feaa72e78070671 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 09:23:10 -0700 Subject: [PATCH 310/485] Docs: update contig input description --- docs/design/stitcher.md | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/docs/design/stitcher.md b/docs/design/stitcher.md index 1a676a795..265cad691 100644 --- a/docs/design/stitcher.md +++ b/docs/design/stitcher.md @@ -44,10 +44,11 @@ PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher - Stitching is initiated either as a pipeline step in MiCall, or as a command line call given above. In each case: -**Input:** The Stitcher receives a single input file in FASTA +**Input:** The Stitcher receives a single input file in CSV format. This file contains 1 or more contigs that are the outcomes of -the previous assembly step. These contigs are essentially segments of -DNA sequences. They can vary significantly in length. +the previous assembly step, together with associated reference genome +information. These contigs are essentially segments of DNA +sequences. They can vary significantly in length. **Output:** The sole output from the Stitcher is a CSV file. 
This file holds the stitched sequences -- longer or fully @@ -173,25 +174,22 @@ The setup process for the Stitcher ensures that each contig is properly aligned and prepared for the stitching process. The steps are as follows: -1. **Determine Reference Genome**: Identify a the best maching - reference genome for each contig based on its sequence data. - -2. **Align Contigs**: Align each contig to its corresponding reference +1. **Align Contigs**: Align each contig to its corresponding reference genome to approximate their positions within a global reference framework, allowing for spatial comparison between different contigs. -3. **Split Multi-Alignment Contigs**: Split contigs that align to +2. **Split Multi-Alignment Contigs**: Split contigs that align to multiple distinct parts of the reference genome into separate segments. -4. **Handle Reverse Complement**: Reverse complement contigs that +3. **Handle Reverse Complement**: Reverse complement contigs that align to the reverse strand of the reference genome to ensure all sequences are oriented in the same direction. -5. **Sort Contigs**: Arrange the contigs based on their starting +4. **Sort Contigs**: Arrange the contigs based on their starting positions along the reference genome. -6. **Group by Reference**: Group contigs such that all contigs +5. **Group by Reference**: Group contigs such that all contigs associated with the same reference genome are processed together. These setup steps perform minimal alteration to the original contigs @@ -577,13 +575,13 @@ specifying the path to the output plot file. 
Here's an example of how to stitch contigs and retrieve a visualizer plot: ```sh -PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher "contigs.fasta" "stitched_contigs.csv" --plot "visualized.svg" +PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher "contigs.csv" "stitched_contigs.csv" --plot "visualized.svg" ``` **Command Line Arguments:** -- `contigs.fasta`: Input file in FASTA format containing assembled - contigs. +- `contigs.csv`: Input file in CSV format containing assembled + contigs and related information. - `stitched_contigs.csv`: Output CSV file that will contain the stitched contigs. - `--plot visualized.svg`: The optional argument to generate a visual From 5be1d87e982e1eb2306a287cf23ff14e399346da Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 10:14:06 -0700 Subject: [PATCH 311/485] Fix syntax error crashing mypy --- micall/utils/make_hcv_genotype_refs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/utils/make_hcv_genotype_refs.py b/micall/utils/make_hcv_genotype_refs.py index e2705c2fd..db65c6ded 100644 --- a/micall/utils/make_hcv_genotype_refs.py +++ b/micall/utils/make_hcv_genotype_refs.py @@ -58,7 +58,7 @@ def convert_fasta (lines): if subtype in processed_subtypes: continue for region, refseq in h77.iteritems(): - print subtype, region + print(subtype, region) aquery, aref, ascore = hyphyAlign.pair_align(hyphy, refseq, genome) left, right = hyphyAlign.get_boundaries(aref) outfile.write('>%s-%s\n%s\n' % (subtype, From 6aa480f09630b42a98a16a45b4dbbd06c3ea199e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 10:42:28 -0700 Subject: [PATCH 312/485] Update flake8 config --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index d9a369289..34deca975 100644 --- a/.flake8 +++ b/.flake8 @@ -68,7 +68,7 @@ per-file-ignores = micall/utils/hcv_reference_tree.py: W504 
micall/utils/hcv_rules_import.py: W503,W504,E126,E241 micall/utils/make_fda_refs.py: E211,E261,W293,W292 - micall/utils/make_hcv_genotype_refs.py: E999 + micall/utils/make_hcv_genotype_refs.py: E211,E261,E231,E722,E305,E127,E251,E127 micall/utils/primer_checks.py: E303,F541 micall/utils/primer_tracker.py: F541 micall/utils/projects_dump.py: W504 From fcc6712ebb24497a76b7d83d21880d25e5c13cc0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 10:38:45 -0700 Subject: [PATCH 313/485] Mypy: exclude negative inputs instead of including positive This ensures that newly added files are automatically checked by mypy. It also increases the current total number of checked files from 5 to 35. --- .mypy.ini | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/.mypy.ini b/.mypy.ini index 4ca29fa84..b9f38fe59 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -1,8 +1,75 @@ [mypy] -files = micall/utils/consensus_aligner.py, - micall/utils/report_amino.py, - micall/core/project_config.py, - micall/core/contig_stitcher.py +files = micall/ + +exclude = (?x) + ^micall/core/aln2counts[.]py$| + ^micall/core/amplicon_finder[.]py$| + ^micall/core/cascade_report[.]py$| + ^micall/core/consensus_builder[.]py$| + ^micall/core/coverage_plots[.]py$| + ^micall/core/plot_simple[.]py$| + ^micall/core/prelim_map[.]py$| + ^micall/core/remap[.]py$| + ^micall/core/sam2aln[.]py$| + ^micall/core/trim_fastqs[.]py$| + ^micall/drivers/run_info[.]py$| + ^micall/drivers/sample_group[.]py$| + ^micall/drivers/sample[.]py$| + ^micall/g2p/fastq_g2p[.]py$| + ^micall/g2p/pssm_lib[.]py$| + ^micall/monitor/find_groups[.]py$| + ^micall/monitor/kive_watcher[.]py$| + ^micall/monitor/qai_helper[.]py$| + ^micall/monitor/sample_watcher[.]py$| + ^micall/monitor/update_qai[.]py$| + ^micall/resistance/asi_algorithm[.]py$| + ^micall/resistance/genreport[.]py$| + ^micall/resistance/pdfreport[.]py$| + ^micall/resistance/resistance[.]py$| + 
^micall/tests/.*| + ^micall/utils/aln2counts_simplify[.]py$| + ^micall/utils/amplicon_finder[.]py$| + ^micall/utils/big_counter[.]py$| + ^micall/utils/compare_454_samples[.]py$| + ^micall/utils/compare_mapping[.]py$| + ^micall/utils/concordance_evaluation[.]py$| + ^micall/utils/conseq_compare[.]py$| + ^micall/utils/contig_blaster[.]py$| + ^micall/utils/contig_counts[.]py$| + ^micall/utils/contig_summary[.]py$| + ^micall/utils/convert_primers_cutadapt[.]py$| + ^micall/utils/coverage_data[.]py$| + ^micall/utils/dd[.]py$| + ^micall/utils/denovo_simplify[.]py$| + ^micall/utils/externals[.]py$| + ^micall/utils/fetch_sequences[.]py$| + ^micall/utils/find_by_coverage[.]py$| + ^micall/utils/find_chimera[.]py$| + ^micall/utils/find_missing_samples[.]py$| + ^micall/utils/find_reads_in_sam[.]py$| + ^micall/utils/genreport_rerun[.]py$| + ^micall/utils/hcv_reference_tree[.]py$| + ^micall/utils/hcv_rules_display[.]py$| + ^micall/utils/hcv_rules_import[.]py$| + ^micall/utils/make_fda_refs[.]py$| + ^micall/utils/make_hcv_genotype_refs[.]py$| + ^micall/utils/primer_tracker[.]py$| + ^micall/utils/probe_finder[.]py$| + ^micall/utils/projects_dump[.]py$| + ^micall/utils/project_seeds_from_compendium[.]py$| + ^micall/utils/projects_upload[.]py$| + ^micall/utils/ref_aligner[.]py$| + ^micall/utils/reference_distances[.]py$| + ^micall/utils/remap_fastq_simplify[.]py$| + ^micall/utils/sam_g2p_simplify[.]py$| + ^micall/utils/sample_fastq[.]py$| + ^micall/utils/sample_project_summary[.]py$| + ^micall/utils/samples_from_454[.]py$| + ^micall/utils/sample_sheet_parser[.]py$| + ^micall/utils/scan_run_folders[.]py$| + ^micall/utils/seed_alignments[.]py$| + ^micall/utils/spring_beads[.]py$| + ^micall/utils/v3loop_alignment_scores/plot[.]py$ check_untyped_defs = True From d868628bf1bef25c23409318c300e7175dd62630 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 16 May 2024 21:59:59 +0000 Subject: [PATCH 314/485] Bump rexml from 3.2.6 to 
3.2.8 in /docs Bumps [rexml](https://github.com/ruby/rexml) from 3.2.6 to 3.2.8. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.2.6...v3.2.8) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index c806cec91..ea3f48825 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -236,7 +236,8 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.6) + rexml (3.2.8) + strscan (>= 3.0.9) rouge (3.30.0) rubyzip (2.3.2) safe_yaml (1.0.5) @@ -250,6 +251,7 @@ GEM faraday (>= 0.17.3, < 3) simpleidn (0.2.1) unf (~> 0.1.4) + strscan (3.1.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) typhoeus (1.4.1) From 0c6f5f89c2c0422f965dbed81ba4126ea218cc85 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 23 May 2024 20:59:07 -0700 Subject: [PATCH 315/485] Remove outdated travis CI links --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 1778dc796..a928002f5 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,9 @@ # MiCall # ## Processing FASTQ data from an Illumina MiSeq ## -[![Build Status]][travis] [![Code Coverage]][codecov] [![DOI]][zenodo] -[Build Status]: https://travis-ci.com/cfe-lab/MiCall.svg?branch=master -[travis]: https://travis-ci.com/cfe-lab/MiCall [Code Coverage]: https://codecov.io/github/cfe-lab/MiCall/coverage.svg?branch=master [codecov]: https://codecov.io/github/cfe-lab/MiCall?branch=master [DOI]: https://zenodo.org/badge/DOI/10.5281/zenodo.1289989.svg From 2d1b7c5bded651191ab1750f8bc720dadb833253 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 May 2024 07:54:11 +0000 Subject: [PATCH 316/485] Bump 
coverage from 7.5.1 to 7.5.2 Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.1 to 7.5.2. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.1...7.5.2) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 05a70fe5a..59ac54223 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,7 +5,7 @@ mypy==1.10.0 mypy-extensions==1.0.0 pytest==8.2.1 -coverage==7.5.1 +coverage==7.5.2 pandas==2.0.2 seaborn==0.13.2 ete3 From d2d65bd4cc8ab6610d053625d24f4cb1248e6463 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 27 May 2024 14:38:50 -0700 Subject: [PATCH 317/485] Add a comment describing MICALL_VERSION --- release_test_compare.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/release_test_compare.py b/release_test_compare.py index d8914460e..1722671bd 100644 --- a/release_test_compare.py +++ b/release_test_compare.py @@ -23,6 +23,11 @@ from micall_docker import get_available_memory MICALL_VERSION = '7.15' +# ^^^^^^ Version of the MiCall release being tested. +# This is the new version against which older versions are compared. +# The version for the older revision is determined dynamically in the `find_runs` function. +# The source folder is inspected to find all previous result versions for each run. +# These versions are then sorted and the latest one is selected for comparison. 
MiseqRun = namedtuple('MiseqRun', 'source_path target_path is_done') MiseqRun.__new__.__defaults__ = (None,) * 3 From f5c903a4d5cf0da67a5398b0792e1f9ab99df67a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 27 May 2024 15:03:49 -0700 Subject: [PATCH 318/485] Improve parsing in `parse_version` of release_test_compare Enhance the `parse_version` function to handle version names that include a dash '-' followed by modifiers. This ensures the function correctly parses version names like "1.0.0-rc1" by focusing on the version number and ignoring the modifiers. --- release_test_compare.py | 1 + 1 file changed, 1 insertion(+) diff --git a/release_test_compare.py b/release_test_compare.py index 1722671bd..03355836f 100644 --- a/release_test_compare.py +++ b/release_test_compare.py @@ -109,6 +109,7 @@ def parse_version(version_name): version_text = version_name.split('_')[-1] if version_text.endswith('.zip'): version_text = version_text[:-4] + version_text, possible_dash, possible_modifiers = version_text.partition("-") return tuple(map(int, version_text.split('.'))) From 3397708a61b44b9fb4cf47366d7dce36ad0df48f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 27 May 2024 15:14:15 -0700 Subject: [PATCH 319/485] Add logger to release_test_compare.py --- release_test_compare.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/release_test_compare.py b/release_test_compare.py index 03355836f..bacf968a3 100644 --- a/release_test_compare.py +++ b/release_test_compare.py @@ -10,6 +10,7 @@ from glob import glob from operator import itemgetter import os +import logging import matplotlib.pyplot as plt import pandas as pd @@ -44,6 +45,8 @@ 'scenarios', # {Scenarios: [description]} 'consensus_distances']) # [ConsensusDistance] +logger = logging.getLogger(__name__) + class Scenarios(IntEnum): NONE = 0 @@ -75,6 +78,13 @@ def parse_args(default_max_active): default=default_max_active, type=int, help='Number of parallel workers to process the 
samples.') + + verbosity_group = parser.add_mutually_exclusive_group() + verbosity_group.add_argument('--verbose', action='store_true', help='Increase output verbosity.') + verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity.', default=True) + verbosity_group.add_argument('--debug', action='store_true', help='Maximum output verbosity.') + verbosity_group.add_argument('--quiet', action='store_true', help='Minimize output verbosity.') + return parser.parse_args() @@ -102,6 +112,8 @@ def find_runs(source_folder, target_folder, use_denovo): message = f'Unexpected results file name in {run_name}.' raise ValueError(message) from ex source_path = os.path.join(source_results_path, source_versions[-1]) + + logger.debug("Comparing %r with %r.", source_path, target_path) yield MiseqRun(source_path, target_path, is_done) @@ -683,6 +695,15 @@ def main(): default_max_active = max(1, available_memory // recommended_memory) args = parse_args(default_max_active) + if args.quiet: + logger.setLevel(logging.ERROR) + elif args.verbose: + logger.setLevel(logging.INFO) + elif args.debug: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.WARN) + with ProcessPoolExecutor() as pool: runs = find_runs(args.source_folder, args.target_folder, args.denovo) runs = report_source_versions(runs) From 9cc1bd6ed6ef4fc0d80609d88c689b46c3dfe86e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 27 May 2024 15:19:15 -0700 Subject: [PATCH 320/485] Make sure to compare denovo with denovo in release_test_compare.py --- release_test_compare.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/release_test_compare.py b/release_test_compare.py index bacf968a3..bedf6e0c9 100644 --- a/release_test_compare.py +++ b/release_test_compare.py @@ -96,10 +96,10 @@ def find_runs(source_folder, target_folder, use_denovo): target_path = os.path.join(run_path, 'Results', 'version_' + MICALL_VERSION) + done_path = 
os.path.join(target_path, 'doneprocessing') is_done = os.path.exists(done_path) - if use_denovo: - target_path = os.path.join(target_path, 'denovo') + source_results_path = os.path.join(source_folder, 'MiSeq', 'runs', @@ -113,6 +113,10 @@ def find_runs(source_folder, target_folder, use_denovo): raise ValueError(message) from ex source_path = os.path.join(source_results_path, source_versions[-1]) + if use_denovo: + target_path = os.path.join(target_path, 'denovo') + source_path = os.path.join(source_path, 'denovo') + logger.debug("Comparing %r with %r.", source_path, target_path) yield MiseqRun(source_path, target_path, is_done) From 0273d691353bfdd3cb934939699c87b1cc598c53 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 28 May 2024 14:24:45 -0700 Subject: [PATCH 321/485] Take modifiers into account when sorting versions in release_test_compare.py --- release_test_compare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/release_test_compare.py b/release_test_compare.py index bedf6e0c9..3f4fc15c7 100644 --- a/release_test_compare.py +++ b/release_test_compare.py @@ -126,7 +126,8 @@ def parse_version(version_name): if version_text.endswith('.zip'): version_text = version_text[:-4] version_text, possible_dash, possible_modifiers = version_text.partition("-") - return tuple(map(int, version_text.split('.'))) + version_numbers = tuple(map(int, version_text.split('.'))) + return (version_numbers, possible_modifiers) def report_source_versions(runs): From d67d42ef8ba625bcc6a97e5ab603c4ae508ba63d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 28 May 2024 17:07:11 -0700 Subject: [PATCH 322/485] Contig stitcher: improve the big insertion test --- .../test_big_insertion_in_a_single_contig.svg | 26 +++++++++---------- micall/tests/test_contig_stitcher.py | 8 +++--- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg 
b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg index be9c6fd17..f3571995d 100644 --- a/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg +++ b/micall/tests/data/stitcher_plots/test_big_insertion_in_a_single_contig.svg @@ -5,39 +5,39 @@ - + - + - + - - + + testref - - - -1 + + + +1 - - + + - - + + 1 diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index f893dab00..b44711505 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1021,15 +1021,17 @@ def test_partial_align_consensus_multiple_overlaping_sequences( assert len(visualizer().elements) > len(contigs) -def test_big_insertion_in_a_single_contig(exact_aligner, visualizer): +def test_big_insertion_in_a_single_contig(projects, visualizer): # Scenario: Single contig produces many alignments. - ref_seq = "A" * 10 + "B" * 20 + "C" * 10 + hxb2_name = "HIV1-B-FR-K03455-seed" + ref_seq = projects.getReference(hxb2_name) + seq = ref_seq[2000:3000] + 'C' * 300 + ref_seq[3100:4000] contigs = [ GenotypedContig( name="a", - seq="B" * 10 + "D" * 100 + "B" * 10, + seq=seq, ref_name="testref", group_ref="testref", ref_seq=ref_seq, From 4e9b0bcb16accaf2b262f87f39e4a2fc327005c0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 28 May 2024 16:14:37 -0700 Subject: [PATCH 323/485] Bump aligntools to v1.0.3 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 48b804eb8..1f42f161d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,4 @@ git+https://github.com/cfe-lab/genetracks.git@v0.4 mappy==2.17 drawsvg==2.3.0 cairosvg==2.7.1 -git+https://github.com/cfe-lab/aligntools.git@v1.0.2 +git+https://github.com/cfe-lab/aligntools.git@v1.0.3 From 00cc7425b650e5eb2c48fb9bd2d6f5eb0474b4c1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 28 May 2024 16:57:11 -0700 Subject: [PATCH 324/485] Contig stitcher: 
add a test for reference overlapping alignments --- .../test_overlaping_in_reference_space.svg | 173 ++++++++++++++++++ micall/tests/test_contig_stitcher.py | 58 +++++- 2 files changed, 228 insertions(+), 3 deletions(-) create mode 100644 micall/tests/data/stitcher_plots/test_overlaping_in_reference_space.svg diff --git a/micall/tests/data/stitcher_plots/test_overlaping_in_reference_space.svg b/micall/tests/data/stitcher_plots/test_overlaping_in_reference_space.svg new file mode 100644 index 000000000..6d416d0a5 --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_overlaping_in_reference_space.svg @@ -0,0 +1,173 @@ + + + + + + + + + + + + + + + + + + + + + + + + +testref + + + + + + + +1.1 + + + + + +1.3 + + + + + + + + + + + + + + + + + + + +1.1 + + + + + + + + +1.3 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + +1.2 + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index b44711505..16af8c722 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1,7 +1,11 @@ import random +from dataclasses import dataclass import logging import os import pytest +from typing import Iterable, List, Tuple + +from aligntools import CigarActions, CigarHit, Cigar import micall.core.contig_stitcher as stitcher from micall.core.contig_stitcher import ( @@ -1368,6 +1372,54 @@ def test_forward_and_reverse_match(projects, visualizer): assert len(visualizer().elements) > len(contigs) +def test_overlaping_in_reference_space(projects, visualizer, monkeypatch): + # Scenario: Single contig is aligned in two parts that overlap in reference space. 
+ + @dataclass + class MockMappyHit: + strand: int + q_st: int + q_en: int + r_st: int + r_en: int + cigar: List[Tuple[int, CigarActions]] + + @dataclass + class MockAligner: + seq: str + preset: str + + def map(self, seq: str) -> Iterable[MockMappyHit]: + return [MockMappyHit(1, 100, 300, 200, 400, [(200, CigarActions.MATCH)]), + MockMappyHit(1, 300, 500, 300, 500, [(200, CigarActions.MATCH)]), + ] + + monkeypatch.setattr("micall.core.contig_stitcher.Aligner", MockAligner) + + ref = 'A' * 700 + seq = 'C' * 600 + + contigs = [ + GenotypedContig( + name="a", + seq=seq, + ref_name="testref", + group_ref="testref", + ref_seq=ref, + match_fraction=0.3, + ), + ] + + results = list(stitch_consensus(contigs)) + assert len(results) == 1 + assert results[0].seq == 'C' * 500 + + assert isinstance(results[0], stitcher.AlignedContig) + assert results[0].alignment == CigarHit(Cigar.parse('300M'), r_st=200, r_ei=499, q_st=100, q_ei=399) + + assert len(visualizer().elements) > len(contigs) + + def test_correct_stitching_of_one_normal_and_one_unknown(exact_aligner, visualizer): # Scenario: Two partially overlapping contigs are stitched correctly into a single sequence. @@ -1518,10 +1570,10 @@ def __init__(self, ref_name, group_ref, r_st, r_ei, name="contig"): self.id = id(self) +@dataclass class MockAlignment: - def __init__(self, r_st, r_ei): - self.r_st = r_st - self.r_ei = r_ei + r_st: int + r_ei: int # Simple function to create mock AlignedContig objects for testing, including ref_name. 
From 2fca5e4ad3561665d62dd899c8ff30aa215fe7b2 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 31 May 2024 11:19:35 -0700 Subject: [PATCH 325/485] Do not crash stitcher visualizer on group_ref = None --- micall/core/plot_contigs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 4e12edbb8..4c0783e47 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1068,7 +1068,10 @@ def add_section(title: str) -> None: figure = Figure() for group_ref in group_refs: try: - landmarks = landmark_reader.get_landmarks(group_ref) + if group_ref is not None: + landmarks = landmark_reader.get_landmarks(group_ref) + else: + landmarks = None except ValueError: landmarks = None From a706ec56b4b6c03cb87ed8929c5b15b5551ee349 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 31 May 2024 11:10:37 -0700 Subject: [PATCH 326/485] Contig stitcher: fix discards,anomalies labels in multiple groups cases --- micall/core/plot_contigs.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 4c0783e47..2d0db2ae2 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1187,8 +1187,7 @@ def add_section(title: str) -> None: # Discarded # ############# - if discarded or merged_unaligned_parts: - add_section("discards:") + def get_group_discards(group_ref): for root in sorted_roots: if contig_map[root].group_ref != group_ref: continue @@ -1201,25 +1200,36 @@ def add_section(title: str) -> None: if id in unaligned_parts: (q_st, q_ei) = query_position_map[id] label = name_map[id] - figure.add(Track(position_offset, position_offset + abs(q_ei - q_st), - label=label, color="yellow")) + yield Track(position_offset, position_offset + abs(q_ei - q_st), + label=label, color="yellow") else: part = contig_map[id] - figure.add(Multitrack(list(get_tracks([part])))) + 
yield Multitrack(list(get_tracks([part]))) + + disc = list(get_group_discards(group_ref)) + if disc: + add_section("discards:") + for element in disc: + figure.add(element) ############# # Anomalies # ############# - if anomaly: - add_section("anomaly:") + def get_group_anomalies(group_ref): for root in sorted_roots: parts_ids = final_children_mapping[root] parts_ids = [name for name in parts_ids if name in anomaly] parts = [contig_map[name] for name in parts_ids] parts = [part for part in parts if part.group_ref == group_ref] for part in parts: - figure.add(Multitrack(list(get_tracks([part])))) + yield Multitrack(list(get_tracks([part]))) + + anom = list(get_group_anomalies(group_ref)) + if anom: + add_section("anomaly:") + for element in anom: + figure.add(element) ########### # Unknown # From ad8ccb5631a4740bb2c8535c868e550e327cd848 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 31 May 2024 11:31:30 -0700 Subject: [PATCH 327/485] Contig stitcher: add a visualizer test --- ...labeling_of_different_organism_contigs.svg | 500 ++++++++++++++++++ micall/tests/test_contig_stitcher.py | 64 +++ 2 files changed, 564 insertions(+) create mode 100644 micall/tests/data/stitcher_plots/test_visualizer_correct_labeling_of_different_organism_contigs.svg diff --git a/micall/tests/data/stitcher_plots/test_visualizer_correct_labeling_of_different_organism_contigs.svg b/micall/tests/data/stitcher_plots/test_visualizer_correct_labeling_of_different_organism_contigs.svg new file mode 100644 index 000000000..e550e9a1a --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_visualizer_correct_labeling_of_different_organism_contigs.svg @@ -0,0 +1,500 @@ + + + + + + + + + + + + + + + + + + +testref-1 + + + + + + + +1 + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +anomaly: + + + + + + +3 + + + + + + + + + + + + 
+ + + + + +testref-2 + + + + + + + +2 + + + + + + + + + + + + +2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + + +4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +anomaly: + + + + + + +5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +unknown: + + + + + + +6 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 16af8c722..a57c2d05b 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1508,6 +1508,70 @@ def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): ), "The contents of the stitched plot file do not match the expected contents." +def test_visualizer_correct_labeling_of_different_organism_contigs( + exact_aligner, visualizer +): + # Scenario: Some discarded and anomaly contigs correctly labelled. 
+ + ref_seq = "A" * 100 + "C" * 100 + + contigs = [ + GenotypedContig( + name="a", + seq="A" * 50, + ref_name="testref-1", + group_ref="testref-1", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="C" * 50, + ref_name="testref-2", + group_ref="testref-2", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="a_anomaly", + seq="D" * 50, + ref_name="testref-1", + group_ref="testref-1", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b_discarded", + seq="C" * 20, + ref_name="testref-2", + group_ref="testref-2", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="some_anomaly", + seq="T" * 20, + ref_name='unknown', + group_ref=None, + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="some_unknown", + seq="T" * 20, + ref_name='unknown', + group_ref=None, + ref_seq=None, + match_fraction=0.5, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 5 + + assert len(visualizer().elements) > len(contigs) + + # _ _ _ _ _ _ # | | | |_ __ (_) |_ | |_ ___ ___| |_ ___ # | | | | '_ \| | __| | __/ _ \/ __| __/ __| From 617bcd2ee46957780c7faaa3039c9793d4aae106 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 31 May 2024 13:57:50 -0700 Subject: [PATCH 328/485] Update conseq.csv:region field documentation --- docs/steps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/steps.md b/docs/steps.md index 747f7665c..28c87a1d6 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -181,7 +181,7 @@ Individual files are described after the list of steps. * remap - mapped to other references after remapping * aligned - aligned with a reference and merged with mate * conseq.csv - * region - seed region it mapped to + * region - the name of the contig. Includes the name of the reference seed, plus an optional prefix, which is a number that makes the name unique. 
* q-cutoff - minimum quality score * consensus-percent-cutoff - to be included in a mixture, a variant must make up at least this fraction of the total valid counts From 5f05025ca8caecf0e294b1ffbb636030f715a1b0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 5 Jun 2024 08:00:07 +0000 Subject: [PATCH 329/485] Bump pytest from 8.2.1 to 8.2.2 Bumps [pytest](https://github.com/pytest-dev/pytest) from 8.2.1 to 8.2.2. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.2.1...8.2.2) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 59ac54223..234244fd5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,7 +4,7 @@ -r requirements-watcher.txt mypy==1.10.0 mypy-extensions==1.0.0 -pytest==8.2.1 +pytest==8.2.2 coverage==7.5.2 pandas==2.0.2 seaborn==0.13.2 From d22ffab9bec973af1dc13f5fece4dc015dfc6bec Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 07:15:31 +0000 Subject: [PATCH 330/485] Bump gprof2dot from 2022.7.29 to 2024.6.6 Bumps [gprof2dot](https://github.com/jrfonseca/gprof2dot) from 2022.7.29 to 2024.6.6. - [Release notes](https://github.com/jrfonseca/gprof2dot/releases) - [Commits](https://github.com/jrfonseca/gprof2dot/compare/2022.07.29...2024.06.06) --- updated-dependencies: - dependency-name: gprof2dot dependency-type: direct:development update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 6a5be4fb9..0ce57d1fb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,6 +2,6 @@ -r requirements-test.txt -r requirements-watcher.txt # Used for plotting profiling results. -gprof2dot==2022.7.29 +gprof2dot==2024.6.6 mypy==1.10.0 mypy-extensions==1.0.0 From eca5c03f7c402f8454206d5de24f51f79defe4c0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:07:59 +0000 Subject: [PATCH 331/485] Bump coverage from 7.5.2 to 7.5.3 Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.2 to 7.5.3. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.2...7.5.3) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 234244fd5..55e681f42 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,7 +5,7 @@ mypy==1.10.0 mypy-extensions==1.0.0 pytest==8.2.2 -coverage==7.5.2 +coverage==7.5.3 pandas==2.0.2 seaborn==0.13.2 ete3 From 5a10055e1993653f1951ad03b6562ee9051ff4ae Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 6 Jun 2024 13:12:26 -0700 Subject: [PATCH 332/485] Move some definitions out of denovo.py --- micall/core/denovo.py | 11 ++++------- micall/tests/test_denovo.py | 3 ++- micall/utils/contig_summary.py | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 61dca8ec4..7336b2611 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -15,12 +15,9 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from micall.utils.fasta_to_csv import run, DEFAULT_DATABASE, genotype +import micall.utils.fasta_to_csv as fasta_to_csv import micall.core.contig_stitcher as stitcher -assert DEFAULT_DATABASE -assert genotype is not None - IVA = "iva" logger = logging.getLogger(__name__) @@ -50,9 +47,9 @@ def write_contig_refs(contigs_fasta_path: str, contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") with NamedTemporaryFile(mode='wt') as temporary_unstitched_csv: - run(contigs_fasta_path, - cast(TextIO, temporary_unstitched_csv), - blast_csv) + fasta_to_csv.run(contigs_fasta_path, + cast(TextIO, temporary_unstitched_csv), + blast_csv) if unstitched_contigs_csv: with open(temporary_unstitched_csv.name) as input_csv: diff --git a/micall/tests/test_denovo.py b/micall/tests/test_denovo.py index 06001779b..0c1c400fe 100644 --- a/micall/tests/test_denovo.py +++ b/micall/tests/test_denovo.py @@ -4,7 +4,8 @@ from Bio import SeqIO from pytest import fixture, mark -from micall.core.denovo import write_contig_refs, denovo, 
DEFAULT_DATABASE, genotype +from micall.utils.fasta_to_csv import DEFAULT_DATABASE, genotype +from micall.core.denovo import write_contig_refs, denovo from micall.blast_db.make_blast_db import make_blast_db, DEFAULT_PROJECTS diff --git a/micall/utils/contig_summary.py b/micall/utils/contig_summary.py index a798139d7..9249ae964 100644 --- a/micall/utils/contig_summary.py +++ b/micall/utils/contig_summary.py @@ -5,7 +5,7 @@ from Bio.Blast.Applications import NcbiblastnCommandline -from micall.core.denovo import DEFAULT_DATABASE +from micall.utils.fasta_to_csv import DEFAULT_DATABASE import matplotlib matplotlib.use('Agg') From ae97aea371153e31da23560b4d44567b24fb3dea Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:53:26 +0000 Subject: [PATCH 333/485] Bump openpyxl from 3.1.2 to 3.1.3 Bumps [openpyxl](https://openpyxl.readthedocs.io) from 3.1.2 to 3.1.3. --- updated-dependencies: - dependency-name: openpyxl dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 55e681f42..6484b879e 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -12,4 +12,4 @@ ete3 flake8==7.0.0 # For reading HCV rules from an Excel file. 
-openpyxl==3.1.2 +openpyxl==3.1.3 From 37637223cef3041f14eb3cd76fd5960648ad3cd8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 6 Jun 2024 14:14:59 -0700 Subject: [PATCH 334/485] Factor out test_fasta_to_csv.py --- micall/tests/test_contig_stitcher.py | 2 +- micall/tests/test_denovo.py | 54 +++------------------------- micall/tests/test_fasta_to_csv.py | 54 ++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 51 deletions(-) create mode 100644 micall/tests/test_fasta_to_csv.py diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index a57c2d05b..05db47949 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -23,7 +23,7 @@ ) from micall.core.plot_contigs import plot_stitcher_coverage from micall.tests.utils import MockAligner, fixed_random_seed -from micall.tests.test_denovo import check_hcv_db # activates the fixture +from micall.tests.test_fasta_to_csv import check_hcv_db # activates the fixture from micall.tests.test_remap import load_projects # activates the "projects" fixture diff --git a/micall/tests/test_denovo.py b/micall/tests/test_denovo.py index 0c1c400fe..d7d800f18 100644 --- a/micall/tests/test_denovo.py +++ b/micall/tests/test_denovo.py @@ -1,38 +1,13 @@ from io import StringIO from pathlib import Path -from Bio import SeqIO -from pytest import fixture, mark +from pytest import mark -from micall.utils.fasta_to_csv import DEFAULT_DATABASE, genotype from micall.core.denovo import write_contig_refs, denovo -from micall.blast_db.make_blast_db import make_blast_db, DEFAULT_PROJECTS - - -@fixture(scope='session', name='hcv_db') -def check_hcv_db(): - db_path = Path(DEFAULT_DATABASE) - index_path = db_path.parent / "refs.fasta.nin" - build_needed = not index_path.exists() - if not build_needed: - projects_date = Path(DEFAULT_PROJECTS).stat().st_mtime - index_date = index_path.stat().st_mtime - build_needed = index_date < projects_date - if build_needed: - with 
open(DEFAULT_PROJECTS) as projects_json, \ - open(DEFAULT_DATABASE, 'w') as refs_fasta: - make_blast_db(projects_json, refs_fasta) - assert index_path.exists() - return db_path - - -def test_make_blast_db_excludes_hivgha(hcv_db): - fasta_path = Path(DEFAULT_DATABASE) - with fasta_path.open() as f: - for reference in SeqIO.parse(f, 'fasta'): - # Exclude the Ghana project, because they're recombinant. - assert reference.name != 'HIV1-CRF02_AG-GH-AB286855-seed' +from micall.tests.test_fasta_to_csv import check_hcv_db # activates the fixture +# make linters not complain about unused imports. +assert check_hcv_db def test_write_contig_refs_two_sequences(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" @@ -140,27 +115,6 @@ def test_write_contig_refs_reversed_match(tmpdir, hcv_db): assert expected_contigs_csv == contigs_csv.getvalue() -def test_genotype(tmpdir, hcv_db): - contigs_fasta = Path(tmpdir) / "contigs.fasta" - contigs_fasta.write_text("""\ ->foo -TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA ->bar -CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC -""") - blast_csv = StringIO() - expected_blast_csv = """\ -contig_num,ref_name,score,match,pident,start,end,ref_start,ref_end -2,HCV-1g,37,0.67,100,19,55,8506,8542 -2,HCV-1a,41,0.75,100,15,55,8518,8558 -1,HCV-1a,41,1.0,100,1,41,8187,8227 -""" - - genotype(str(contigs_fasta), blast_csv=blast_csv) - - assert expected_blast_csv == blast_csv.getvalue() - - def test_write_contig_refs(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ diff --git a/micall/tests/test_fasta_to_csv.py b/micall/tests/test_fasta_to_csv.py new file mode 100644 index 000000000..0045a7db2 --- /dev/null +++ b/micall/tests/test_fasta_to_csv.py @@ -0,0 +1,54 @@ +from io import StringIO +from pathlib import Path + +from Bio import SeqIO +import pytest + +from micall.utils.fasta_to_csv import DEFAULT_DATABASE, genotype +from micall.blast_db.make_blast_db import make_blast_db, DEFAULT_PROJECTS + 
+ +@pytest.fixture(scope='session', name='hcv_db') +def check_hcv_db(): + db_path = Path(DEFAULT_DATABASE) + index_path = db_path.parent / "refs.fasta.nin" + build_needed = not index_path.exists() + if not build_needed: + projects_date = Path(DEFAULT_PROJECTS).stat().st_mtime + index_date = index_path.stat().st_mtime + build_needed = index_date < projects_date + if build_needed: + with open(DEFAULT_PROJECTS) as projects_json, \ + open(DEFAULT_DATABASE, 'w') as refs_fasta: + make_blast_db(projects_json, refs_fasta) + assert index_path.exists() + return db_path + + +def test_make_blast_db_excludes_hivgha(hcv_db): + fasta_path = Path(DEFAULT_DATABASE) + with fasta_path.open() as f: + for reference in SeqIO.parse(f, 'fasta'): + # Exclude the Ghana project, because they're recombinant. + assert reference.name != 'HIV1-CRF02_AG-GH-AB286855-seed' + + +def test_genotype(tmpdir, hcv_db): + contigs_fasta = Path(tmpdir) / "contigs.fasta" + contigs_fasta.write_text("""\ +>foo +TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +>bar +CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC +""") + blast_csv = StringIO() + expected_blast_csv = """\ +contig_num,ref_name,score,match,pident,start,end,ref_start,ref_end +2,HCV-1g,37,0.67,100,19,55,8506,8542 +2,HCV-1a,41,0.75,100,15,55,8518,8558 +1,HCV-1a,41,1.0,100,1,41,8187,8227 +""" + + genotype(str(contigs_fasta), blast_csv=blast_csv) + + assert expected_blast_csv == blast_csv.getvalue() From 9e49af5ea81467d271c22aae046abdbad435549e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 21:09:22 +0000 Subject: [PATCH 335/485] Bump requests from 2.32.1 to 2.32.3 Bumps [requests](https://github.com/psf/requests) from 2.32.1 to 2.32.3. 
- [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.32.1...v2.32.3) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements-basespace.txt | 2 +- requirements-watcher.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-basespace.txt b/requirements-basespace.txt index 8ee75d522..8613fa60d 100644 --- a/requirements-basespace.txt +++ b/requirements-basespace.txt @@ -1,3 +1,3 @@ # Requirements for running the micall_basespace.py script -r requirements.txt -requests==2.32.1 +requests==2.32.3 diff --git a/requirements-watcher.txt b/requirements-watcher.txt index a4e0c2e0d..7cc63208e 100644 --- a/requirements-watcher.txt +++ b/requirements-watcher.txt @@ -1,3 +1,3 @@ # Requirements for running the MISEQ_MONITOR.py script git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api -requests==2.32.1 +requests==2.32.3 From bfc931ed390af4def312985de448b1489863c090 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 6 Jun 2024 22:22:58 -0700 Subject: [PATCH 336/485] Make denovo.py only do the assembly --- micall/core/denovo.py | 97 +++------------ micall/drivers/sample.py | 27 ++-- micall/tests/test_denovo.py | 196 +++--------------------------- micall/tests/test_fasta_to_csv.py | 169 +++++++++++++++++++++++++- micall/utils/fasta_to_csv.py | 37 ++++-- 5 files changed, 246 insertions(+), 280 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 7336b2611..52c33e8d8 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -1,111 +1,54 @@ import argparse import logging import os -import tempfile -from typing import Optional, TextIO, cast +from typing import Optional, TextIO from csv import DictReader from datetime import datetime from glob 
import glob from shutil import rmtree, copyfileobj from subprocess import PIPE, CalledProcessError, STDOUT import subprocess -from tempfile import mkdtemp, NamedTemporaryFile +from tempfile import mkdtemp from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -import micall.utils.fasta_to_csv as fasta_to_csv -import micall.core.contig_stitcher as stitcher IVA = "iva" logger = logging.getLogger(__name__) -def write_contig_refs(contigs_fasta_path: str, - unstitched_contigs_csv: Optional[TextIO], - contigs_csv: Optional[TextIO], - merged_contigs_csv: Optional[TextIO] = None, - blast_csv: Optional[TextIO] = None, - stitcher_plot_path: Optional[str] = None) -> int: - """ Run BLAST search to identify contig sequences. - - :param str contigs_fasta_path: path to file to read contig sequences from - and append merged contigs to - :param unstitched_contigs_csv: open file to write assembled contigs to - :param contigs_csv: open file to write stitched contigs to - :param merged_contigs_csv: open file to read contigs that were merged from - amplicon reads - :param blast_csv: open file to write BLAST search results for each contig - :param stitcher_plot_path: open file to write the visualizer plot to - """ - - with open(contigs_fasta_path, 'a') as contigs_fasta: - if merged_contigs_csv is not None: - contig_reader = DictReader(merged_contigs_csv) - for i, row in enumerate(contig_reader, 1): - contig_name = f'merged-contig-{i}' - contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") - - with NamedTemporaryFile(mode='wt') as temporary_unstitched_csv: - fasta_to_csv.run(contigs_fasta_path, - cast(TextIO, temporary_unstitched_csv), - blast_csv) - - if unstitched_contigs_csv: - with open(temporary_unstitched_csv.name) as input_csv: - copyfileobj(input_csv, unstitched_contigs_csv) - if contigs_csv: - output_csv = contigs_csv - else: - output_csv = open("/dev/null", "wt") - - with open(temporary_unstitched_csv.name) as input_csv: - return 
stitcher.run(input_csv, output_csv, stitcher_plot_path) +def count_fasta_sequences(file_path): + with open(file_path, 'r') as file: + return sum(1 for line in file if line.startswith('>')) def denovo(fastq1_path: str, fastq2_path: str, - unstitched_contigs_csv: Optional[TextIO], - contigs_csv: Optional[TextIO], + fasta: TextIO, work_dir: str = '.', merged_contigs_csv: Optional[TextIO] = None, - blast_csv: Optional[TextIO] = None, - stitcher_plot_path: Optional[str] = None, ): """ Use de novo assembly to build contigs from reads. - :param fastq1_path: FASTQ file name for read 1 reads - :param fastq2_path: FASTQ file name for read 2 reads - :param unstitched_contigs_csv: open file to write assembled contigs to - :param contigs_csv: open file to write stitched contigs to + :param fastq1: FASTQ file for read 1 reads + :param fastq2: FASTQ file for read 2 reads + :param fasta: file to write assembled contigs to :param work_dir: path for writing temporary files :param merged_contigs_csv: open file to read contigs that were merged from amplicon reads - :param blast_csv: open file to write BLAST search results for each contig - :param stitcher_plot_path: open file to write the visualizer plot to """ - if unstitched_contigs_csv is None and contigs_csv is None: - raise ValueError("Must specify either contigs_csv or unstitched_contigs_csv") - old_tmp_dirs = glob(os.path.join(work_dir, 'assembly_*')) for old_tmp_dir in old_tmp_dirs: rmtree(old_tmp_dir, ignore_errors=True) tmp_dir = mkdtemp(dir=work_dir, prefix='assembly_') - if contigs_csv is None: - contigs_csv_tmp = tempfile.NamedTemporaryFile("wt") - contigs_csv = cast(TextIO, contigs_csv_tmp.file) - else: - contigs_csv_tmp = None - start_time = datetime.now() start_dir = os.getcwd() joined_path = os.path.join(tmp_dir, 'joined.fastq') - if stitcher_plot_path is None: - stitcher_plot_path = os.path.join(tmp_dir, "stitcher_plot.svg") subprocess.run(['merge-mates', fastq1_path, fastq2_path, @@ -136,31 +79,25 @@ def 
denovo(fastq1_path: str, with open(contigs_fasta_path, 'a'): pass + with open(contigs_fasta_path) as reader: + copyfileobj(reader, fasta) + os.chdir(start_dir) duration = datetime.now() - start_time - contig_count = write_contig_refs(contigs_fasta_path, - unstitched_contigs_csv, - contigs_csv, - blast_csv=blast_csv, - stitcher_plot_path=stitcher_plot_path) + contig_count = count_fasta_sequences(contigs_fasta_path) logger.info('Assembled %d contigs in %s (%ds) on %s.', contig_count, duration, duration.total_seconds(), fastq1_path) - if contigs_csv_tmp: - contigs_csv_tmp.close() - if __name__ == '__main__': logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() - parser.add_argument('fastq1') - parser.add_argument('fastq2') - parser.add_argument('--unstitched_contigs', type=argparse.FileType('w')) - parser.add_argument('--contigs', type=argparse.FileType('w')) - parser.add_argument('--stitcher_plot') + parser.add_argument('fastq1', type=argparse.FileType('r')) + parser.add_argument('fastq2', type=argparse.FileType('r')) + parser.add_argument('fasta', type=argparse.FileType('w')) args = parser.parse_args() - denovo(args.fastq1, args.fastq2, args.unstitched_contigs, args.contigs, args.stitcher_plot_path) + denovo(args.fastq1.name, args.fastq2.name, args.fasta) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index fecffd769..611fa9154 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -19,7 +19,9 @@ from micall.core.denovo import denovo from micall.g2p.fastq_g2p import fastq_g2p, DEFAULT_MIN_COUNT, MIN_VALID, MIN_VALID_PERCENT from micall.utils.driver_utils import makedirs +from micall.utils.fasta_to_csv import write_contig_refs from contextlib import contextmanager +import micall.core.contig_stitcher as stitcher logger = logging.getLogger(__name__) @@ -414,17 +416,28 @@ def run_mapping(self, excluded_seeds): def run_denovo(self, excluded_seeds): logger.info('Running de novo assembly on %s.', self) scratch_path = 
self.get_scratch_path() - with open(self.merged_contigs_csv) as merged_contigs_csv, \ - open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \ - open(self.contigs_csv, 'w') as contigs_csv, \ - open(self.blast_csv, 'w') as blast_csv: + + with open(self.unstitched_contigs_fasta, 'w') as unstitched_contigs_fasta, \ + open(self.merged_contigs_csv, 'r') as merged_contigs_csv: denovo(self.trimmed1_fastq, self.trimmed2_fastq, - unstitched_contigs_csv, - contigs_csv, + unstitched_contigs_fasta, self.scratch_path, merged_contigs_csv, - blast_csv=blast_csv) + ) + + with open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \ + open(self.merged_contigs_csv, 'r') as merged_contigs_csv, \ + open(self.blast_csv, 'w') as blast_csv: + write_contig_refs(self.unstitched_contigs_fasta, + unstitched_contigs_csv, + merged_contigs_csv, + blast_csv=blast_csv, + ) + + with open(self.unstitched_contigs_csv, 'r') as unstitched_contigs_csv, \ + open(self.contigs_csv, 'w') as contigs_csv: + stitcher.run(unstitched_contigs_csv, contigs_csv, self.stitcher_plot_svg) logger.info('Running remap on %s.', self) if self.debug_remap: diff --git a/micall/tests/test_denovo.py b/micall/tests/test_denovo.py index d7d800f18..81f6a1135 100644 --- a/micall/tests/test_denovo.py +++ b/micall/tests/test_denovo.py @@ -1,196 +1,31 @@ from io import StringIO from pathlib import Path +import re from pytest import mark -from micall.core.denovo import write_contig_refs, denovo +from micall.core.denovo import denovo from micall.tests.test_fasta_to_csv import check_hcv_db # activates the fixture # make linters not complain about unused imports. 
assert check_hcv_db -def test_write_contig_refs_two_sequences(tmpdir, hcv_db): - contigs_fasta = Path(tmpdir) / "contigs.fasta" - contigs_fasta.write_text("""\ ->foo -TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA ->bar -CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC -""") - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -HCV-1a,1.0,HCV-1a,TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA -HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC -""" - - contigs_stitched_csv = StringIO() - write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) - - assert expected_contigs_csv == contigs_csv.getvalue() - - -def test_write_contig_refs_two_groups(tmpdir, hcv_db): - contigs_fasta = Path(tmpdir) / "contigs.fasta" - contigs_fasta.write_text("""\ ->foo -ACCCGCCCCTAATAGGGGCGACACTCCGCCATGAATC ->bar -ACCATGGATCACTCCCCTGTGAGGAACTACTGTCTT ->baz -TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA -""") - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -HCV-2a,1.0,HCV-2b,ACCCGCCCCTAATAGGGGCGACACTCCGCCATGAATC -HCV-1g,1.0,HCV-1g,ACCATGGATCACTCCCCTGTGAGGAACTACTGTCTT -HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA -""" - - contigs_stitched_csv = StringIO() - write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) - - assert expected_contigs_csv == contigs_csv.getvalue() - - -def test_write_contig_refs_not_found(tmpdir, hcv_db): - contigs_fasta = Path(tmpdir) / "contigs.fasta" - contigs_fasta.write_text("""\ ->foo -CATCACATAGGAGA -""") - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -unknown,0,,CATCACATAGGAGA -""" - - contigs_stitched_csv = StringIO() - write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) - - assert expected_contigs_csv == contigs_csv.getvalue() - - -def test_write_contig_refs_partial_match(tmpdir, hcv_db): - contigs_fasta = Path(tmpdir) / "contigs.fasta" - contigs_fasta.write_text("""\ ->foo 
-TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA ->bar -CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC -""") - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -HCV-1a,1.0,HCV-1a,TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA -HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC -""" - - contigs_stitched_csv = StringIO() - write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) - - assert expected_contigs_csv == contigs_csv.getvalue() - - -def test_write_contig_refs_reversed_match(tmpdir, hcv_db): - """ If BLAST match is reversed, then reverse the contig before reporting. """ - contigs_fasta = Path(tmpdir) / "contigs.fasta" - contigs_fasta.write_text("""\ ->foo -TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA ->bar -GTCGTCGCCACACACGAGCATGGTGCAGTCCTGGAGCCCTGTCTCCTATGTGATG -""") - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -HCV-1a,1.0,HCV-1a,TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA -HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC -""" - - contigs_stitched_csv = StringIO() - write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) - - assert expected_contigs_csv == contigs_csv.getvalue() - - -def test_write_contig_refs(tmpdir, hcv_db): - contigs_fasta = Path(tmpdir) / "contigs.fasta" - contigs_fasta.write_text("""\ ->foo -TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA ->bar -CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC -""") - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -HCV-1a,1.0,HCV-1a,TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA -HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC -""" - blast_csv = StringIO() - expected_blast_csv = """\ -contig_num,ref_name,score,match,pident,start,end,ref_start,ref_end -2,HCV-1g,37,0.67,100,19,55,8506,8542 -2,HCV-1a,41,0.75,100,15,55,8518,8558 -1,HCV-1a,41,1.0,100,1,41,8187,8227 -""" - - contigs_stitched_csv = 
StringIO() - write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv, blast_csv=blast_csv) - - assert expected_contigs_csv == contigs_csv.getvalue() - assert expected_blast_csv == blast_csv.getvalue() - - -def test_write_contig_refs_none(tmpdir, hcv_db): - contigs_fasta = Path(tmpdir) / 'contigs.fasta' - assert not contigs_fasta.exists() - - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -""" - - contigs_stitched_csv = StringIO() - write_contig_refs(str(contigs_fasta), contigs_csv, contigs_stitched_csv) - - assert expected_contigs_csv == contigs_csv.getvalue() - - -def test_merged_contig(tmpdir, hcv_db): - contigs_fasta = Path(tmpdir) / 'contigs.fasta' - assert not contigs_fasta.exists() - - merged_contigs_path = Path(tmpdir) / 'merged_contigs.csv' - merged_contigs_path.write_text("""\ -contig -TGCACAAGACCCAACAACAATACAAGAAAAAGTATAAGGATAGGACCAGGA -""") - - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -HIV1-C-BR-JX140663-seed,1.0,HIV1-C-BR-JX140663-seed,TGCACAAGACCCAACAACAATACAAGAAAAAGTATAAGGATAGGACCAGGA -""" - - with merged_contigs_path.open() as merged_contigs_csv: - contigs_stitched_csv = StringIO() - write_contig_refs(str(contigs_fasta), - contigs_csv, contigs_stitched_csv, - merged_contigs_csv=merged_contigs_csv) - assert expected_contigs_csv == contigs_csv.getvalue() +def normalize_fasta(content: str) -> str: + result = re.sub(r'^>.*$', '>', + content, + flags=re.MULTILINE) + result = ''.join(result.split('\n')) + return result @mark.iva() # skip with -k-iva def test_denovo_iva(tmpdir, hcv_db): microtest_path = Path(__file__).parent / 'microtest' - contigs_csv = StringIO() - expected_contigs_csv = """\ -ref,match,group_ref,contig -HCV-2a,1.0,HCV-2a,TGAGGGCCAAAAAGGTAACTTTTGATAGGATGCAAGTGC\ + contigs_fasta = StringIO() + expected_contigs_fasta = """\ +>contig.00001 +TGAGGGCCAAAAAGGTAACTTTTGATAGGATGCAAGTGC\ TCGACGCTCATTACGACTCAGTCTTAAAGGACATCAAGCTAGCGGCCTCCAAGGTCTCCG\ 
CGAGGCTCCTCACCCTGGAGGAGGCATGCCAGCTAACTCCACCCCATTCTGCAAGATCCAAATATGGGTTTGGGGCTA\ AGGAGGTGCGCAGCTTGTCCGGGAGGGCCGTTAACCACATCAAGTCCGTGTGGAAGGACCTCCTGGAAGACTCACAAA\ @@ -201,8 +36,9 @@ def test_denovo_iva(tmpdir, hcv_db): denovo(str(microtest_path / '2160A-HCV_S19_L001_R1_001.fastq'), str(microtest_path / '2160A-HCV_S19_L001_R2_001.fastq'), - contigs_csv, - None, + contigs_fasta, tmpdir) - assert contigs_csv.getvalue() == expected_contigs_csv + result = contigs_fasta.getvalue() + expected = expected_contigs_fasta + assert normalize_fasta(result) == normalize_fasta(expected) diff --git a/micall/tests/test_fasta_to_csv.py b/micall/tests/test_fasta_to_csv.py index 0045a7db2..a283b8589 100644 --- a/micall/tests/test_fasta_to_csv.py +++ b/micall/tests/test_fasta_to_csv.py @@ -4,7 +4,7 @@ from Bio import SeqIO import pytest -from micall.utils.fasta_to_csv import DEFAULT_DATABASE, genotype +from micall.utils.fasta_to_csv import DEFAULT_DATABASE, genotype, write_contig_refs from micall.blast_db.make_blast_db import make_blast_db, DEFAULT_PROJECTS @@ -52,3 +52,170 @@ def test_genotype(tmpdir, hcv_db): genotype(str(contigs_fasta), blast_csv=blast_csv) assert expected_blast_csv == blast_csv.getvalue() + + +def test_write_contig_refs_two_sequences(tmpdir, hcv_db): + contigs_fasta = Path(tmpdir) / "contigs.fasta" + contigs_fasta.write_text("""\ +>foo +TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +>bar +CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC +""") + contigs_csv = StringIO() + expected_contigs_csv = """\ +ref,match,group_ref,contig +HCV-1a,1.0,HCV-1a,TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC +""" + + write_contig_refs(str(contigs_fasta), contigs_csv) + + assert expected_contigs_csv == contigs_csv.getvalue() + + +def test_write_contig_refs_two_groups(tmpdir, hcv_db): + contigs_fasta = Path(tmpdir) / "contigs.fasta" + contigs_fasta.write_text("""\ +>foo +ACCCGCCCCTAATAGGGGCGACACTCCGCCATGAATC +>bar 
+ACCATGGATCACTCCCCTGTGAGGAACTACTGTCTT +>baz +TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA +""") + contigs_csv = StringIO() + expected_contigs_csv = """\ +ref,match,group_ref,contig +HCV-2a,1.0,HCV-2b,ACCCGCCCCTAATAGGGGCGACACTCCGCCATGAATC +HCV-1g,1.0,HCV-1g,ACCATGGATCACTCCCCTGTGAGGAACTACTGTCTT +HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA +""" + + write_contig_refs(str(contigs_fasta), contigs_csv) + + assert expected_contigs_csv == contigs_csv.getvalue() + + +def test_write_contig_refs_not_found(tmpdir, hcv_db): + contigs_fasta = Path(tmpdir) / "contigs.fasta" + contigs_fasta.write_text("""\ +>foo +CATCACATAGGAGA +""") + contigs_csv = StringIO() + expected_contigs_csv = """\ +ref,match,group_ref,contig +unknown,0,,CATCACATAGGAGA +""" + + write_contig_refs(str(contigs_fasta), contigs_csv) + + assert expected_contigs_csv == contigs_csv.getvalue() + + +def test_write_contig_refs_partial_match(tmpdir, hcv_db): + contigs_fasta = Path(tmpdir) / "contigs.fasta" + contigs_fasta.write_text("""\ +>foo +TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +>bar +CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC +""") + contigs_csv = StringIO() + expected_contigs_csv = """\ +ref,match,group_ref,contig +HCV-1a,1.0,HCV-1a,TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC +""" + + write_contig_refs(str(contigs_fasta), contigs_csv) + + assert expected_contigs_csv == contigs_csv.getvalue() + + +def test_write_contig_refs_reversed_match(tmpdir, hcv_db): + """ If BLAST match is reversed, then reverse the contig before reporting. 
""" + contigs_fasta = Path(tmpdir) / "contigs.fasta" + contigs_fasta.write_text("""\ +>foo +TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +>bar +GTCGTCGCCACACACGAGCATGGTGCAGTCCTGGAGCCCTGTCTCCTATGTGATG +""") + contigs_csv = StringIO() + expected_contigs_csv = """\ +ref,match,group_ref,contig +HCV-1a,1.0,HCV-1a,TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC +""" + + write_contig_refs(str(contigs_fasta), contigs_csv) + + assert expected_contigs_csv == contigs_csv.getvalue() + + +def test_write_contig_refs(tmpdir, hcv_db): + contigs_fasta = Path(tmpdir) / "contigs.fasta" + contigs_fasta.write_text("""\ +>foo +TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +>bar +CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC +""") + contigs_csv = StringIO() + expected_contigs_csv = """\ +ref,match,group_ref,contig +HCV-1a,1.0,HCV-1a,TCACCAGGACAGCGGGTTGAATTCCTCGTGCAAGCGTGGAA +HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC +""" + blast_csv = StringIO() + expected_blast_csv = """\ +contig_num,ref_name,score,match,pident,start,end,ref_start,ref_end +2,HCV-1g,37,0.67,100,19,55,8506,8542 +2,HCV-1a,41,0.75,100,15,55,8518,8558 +1,HCV-1a,41,1.0,100,1,41,8187,8227 +""" + + write_contig_refs(str(contigs_fasta), contigs_csv, blast_csv=blast_csv) + + assert expected_contigs_csv == contigs_csv.getvalue() + assert expected_blast_csv == blast_csv.getvalue() + + +def test_write_contig_refs_none(tmpdir, hcv_db): + contigs_fasta = Path(tmpdir) / 'contigs.fasta' + assert not contigs_fasta.exists() + + contigs_csv = StringIO() + expected_contigs_csv = """\ +ref,match,group_ref,contig +""" + + write_contig_refs(str(contigs_fasta), contigs_csv) + + assert expected_contigs_csv == contigs_csv.getvalue() + + +def test_merged_contig(tmpdir, hcv_db): + contigs_fasta = Path(tmpdir) / 'contigs.fasta' + assert not contigs_fasta.exists() + + merged_contigs_path = Path(tmpdir) / 'merged_contigs.csv' + 
merged_contigs_path.write_text("""\ +contig +TGCACAAGACCCAACAACAATACAAGAAAAAGTATAAGGATAGGACCAGGA +""") + + contigs_csv = StringIO() + expected_contigs_csv = """\ +ref,match,group_ref,contig +HIV1-C-BR-JX140663-seed,1.0,HIV1-C-BR-JX140663-seed,TGCACAAGACCCAACAACAATACAAGAAAAAGTATAAGGATAGGACCAGGA +""" + + with merged_contigs_path.open() as merged_contigs_csv: + write_contig_refs(str(contigs_fasta), + contigs_csv, + merged_contigs_csv=merged_contigs_csv) + + assert expected_contigs_csv == contigs_csv.getvalue() diff --git a/micall/utils/fasta_to_csv.py b/micall/utils/fasta_to_csv.py index bb7c9819e..41f3048eb 100644 --- a/micall/utils/fasta_to_csv.py +++ b/micall/utils/fasta_to_csv.py @@ -75,11 +75,11 @@ def init_contigs_refs(contigs_csv: TextIO) -> DictWriter: return writer -def write_unstitched_contigs(writer: DictWriter, - group_refs: Dict[str, str], - genotypes: Dict[str, typing.Tuple[str, float]], - contigs_fasta_path: str): - """Write unstitched contigs to a CSV file. +def write_contigs(writer: DictWriter, + group_refs: Dict[str, str], + genotypes: Dict[str, typing.Tuple[str, float]], + contigs_fasta_path: str): + """Write contigs to a CSV file. Args: writer (DictWriter): CSV writer to write contigs. @@ -195,33 +195,46 @@ def genotype(fasta: str, db: str = DEFAULT_DATABASE, return samples -def run(contigs_fasta_path: str, unstitched_contigs_csv: TextIO, blast_csv: Optional[TextIO] = None): +def write_contig_refs(contigs_fasta_path: str, + contigs_csv: TextIO, + merged_contigs_csv: Optional[TextIO] = None, + blast_csv: Optional[TextIO] = None) -> None: """Run BLAST search to identify contig sequences and write them to CSV. Args: contigs_fasta_path (str): Path to the FASTA file containing contig sequences. - unstitched_contigs_csv (TextIO): Open file to write assembled contigs to. + contigs_csv (TextIO): Open file to write assembled contigs to. blast_csv (Optional[TextIO]): Open file to write BLAST search results for each contig. 
""" - unstitched_writer = init_contigs_refs(cast(TextIO, unstitched_contigs_csv)) + + with open(contigs_fasta_path, 'a') as contigs_fasta: + if merged_contigs_csv is not None: + contig_reader = DictReader(merged_contigs_csv) + for i, row in enumerate(contig_reader, 1): + contig_name = f'merged-contig-{i}' + contigs_fasta.write(f">{contig_name}\n{row['contig']}\n") + + writer = init_contigs_refs(cast(TextIO, contigs_csv)) group_refs: Dict[str, str] = {} genotypes = genotype(contigs_fasta_path, blast_csv=blast_csv, group_refs=group_refs) - write_unstitched_contigs(unstitched_writer, group_refs, genotypes, contigs_fasta_path) - unstitched_contigs_csv.flush() + write_contigs(writer, group_refs, genotypes, contigs_fasta_path) + contigs_csv.flush() def main(argv: Sequence[str]): logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser(description="Convert contigs from FASTA to CSV format with BLAST annotations.") parser.add_argument('contigs_fasta', help="Input FASTA file with contig sequences.") - parser.add_argument('unstitched_contigs_csv', type=argparse.FileType('w'), + parser.add_argument('contigs_csv', type=argparse.FileType('w'), help="Output CSV file to write assembled contigs.") + parser.add_argument('--merged_contigs_csv', type=argparse.FileType('r'), + help="Optional CSV file with contigs that were merged from amplicon reads.") parser.add_argument('--blast_csv', type=argparse.FileType('w'), help="Optional CSV file to write BLAST search results.") args = parser.parse_args(argv) - run(args.contigs_fasta, args.unstitched_contigs_csv, args.blast_csv) + write_contig_refs(args.contigs_fasta, args.contigs_csv, args.merged_contigs_csv, args.blast_csv) if __name__ == "__main__": From 584a579bbef525c5420d98851a32a7719d01ae97 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 6 Jun 2024 22:41:48 -0700 Subject: [PATCH 337/485] Rename "write_contig_refs" to "fasta_to_csv" --- micall/drivers/sample.py | 12 +++++------ micall/tests/test_fasta_to_csv.py | 
36 +++++++++++++++---------------- micall/utils/contig_blaster.py | 4 ++-- micall/utils/fasta_to_csv.py | 11 +++++----- 4 files changed, 32 insertions(+), 31 deletions(-) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index 611fa9154..eefa799ad 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -19,7 +19,7 @@ from micall.core.denovo import denovo from micall.g2p.fastq_g2p import fastq_g2p, DEFAULT_MIN_COUNT, MIN_VALID, MIN_VALID_PERCENT from micall.utils.driver_utils import makedirs -from micall.utils.fasta_to_csv import write_contig_refs +from micall.utils.fasta_to_csv import fasta_to_svg from contextlib import contextmanager import micall.core.contig_stitcher as stitcher @@ -429,11 +429,11 @@ def run_denovo(self, excluded_seeds): with open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \ open(self.merged_contigs_csv, 'r') as merged_contigs_csv, \ open(self.blast_csv, 'w') as blast_csv: - write_contig_refs(self.unstitched_contigs_fasta, - unstitched_contigs_csv, - merged_contigs_csv, - blast_csv=blast_csv, - ) + fasta_to_svg(self.unstitched_contigs_fasta, + unstitched_contigs_csv, + merged_contigs_csv, + blast_csv=blast_csv, + ) with open(self.unstitched_contigs_csv, 'r') as unstitched_contigs_csv, \ open(self.contigs_csv, 'w') as contigs_csv: diff --git a/micall/tests/test_fasta_to_csv.py b/micall/tests/test_fasta_to_csv.py index a283b8589..436251c3e 100644 --- a/micall/tests/test_fasta_to_csv.py +++ b/micall/tests/test_fasta_to_csv.py @@ -4,7 +4,7 @@ from Bio import SeqIO import pytest -from micall.utils.fasta_to_csv import DEFAULT_DATABASE, genotype, write_contig_refs +from micall.utils.fasta_to_csv import DEFAULT_DATABASE, genotype, fasta_to_svg from micall.blast_db.make_blast_db import make_blast_db, DEFAULT_PROJECTS @@ -54,7 +54,7 @@ def test_genotype(tmpdir, hcv_db): assert expected_blast_csv == blast_csv.getvalue() -def test_write_contig_refs_two_sequences(tmpdir, hcv_db): +def 
test_fasta_to_svg_two_sequences(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -69,12 +69,12 @@ def test_write_contig_refs_two_sequences(tmpdir, hcv_db): HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - write_contig_refs(str(contigs_fasta), contigs_csv) + fasta_to_svg(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_write_contig_refs_two_groups(tmpdir, hcv_db): +def test_fasta_to_svg_two_groups(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -92,12 +92,12 @@ def test_write_contig_refs_two_groups(tmpdir, hcv_db): HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA """ - write_contig_refs(str(contigs_fasta), contigs_csv) + fasta_to_svg(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_write_contig_refs_not_found(tmpdir, hcv_db): +def test_fasta_to_svg_not_found(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -109,12 +109,12 @@ def test_write_contig_refs_not_found(tmpdir, hcv_db): unknown,0,,CATCACATAGGAGA """ - write_contig_refs(str(contigs_fasta), contigs_csv) + fasta_to_svg(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_write_contig_refs_partial_match(tmpdir, hcv_db): +def test_fasta_to_svg_partial_match(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -129,12 +129,12 @@ def test_write_contig_refs_partial_match(tmpdir, hcv_db): HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - write_contig_refs(str(contigs_fasta), contigs_csv) + fasta_to_svg(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_write_contig_refs_reversed_match(tmpdir, hcv_db): +def test_fasta_to_svg_reversed_match(tmpdir, hcv_db): """ If 
BLAST match is reversed, then reverse the contig before reporting. """ contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ @@ -150,12 +150,12 @@ def test_write_contig_refs_reversed_match(tmpdir, hcv_db): HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - write_contig_refs(str(contigs_fasta), contigs_csv) + fasta_to_svg(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_write_contig_refs(tmpdir, hcv_db): +def test_fasta_to_svg(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -177,13 +177,13 @@ def test_write_contig_refs(tmpdir, hcv_db): 1,HCV-1a,41,1.0,100,1,41,8187,8227 """ - write_contig_refs(str(contigs_fasta), contigs_csv, blast_csv=blast_csv) + fasta_to_svg(str(contigs_fasta), contigs_csv, blast_csv=blast_csv) assert expected_contigs_csv == contigs_csv.getvalue() assert expected_blast_csv == blast_csv.getvalue() -def test_write_contig_refs_none(tmpdir, hcv_db): +def test_fasta_to_svg_none(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / 'contigs.fasta' assert not contigs_fasta.exists() @@ -192,7 +192,7 @@ def test_write_contig_refs_none(tmpdir, hcv_db): ref,match,group_ref,contig """ - write_contig_refs(str(contigs_fasta), contigs_csv) + fasta_to_svg(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -214,8 +214,8 @@ def test_merged_contig(tmpdir, hcv_db): """ with merged_contigs_path.open() as merged_contigs_csv: - write_contig_refs(str(contigs_fasta), - contigs_csv, - merged_contigs_csv=merged_contigs_csv) + fasta_to_svg(str(contigs_fasta), + contigs_csv, + merged_contigs_csv=merged_contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() diff --git a/micall/utils/contig_blaster.py b/micall/utils/contig_blaster.py index 3a872f8af..7158d25ef 100644 --- a/micall/utils/contig_blaster.py +++ b/micall/utils/contig_blaster.py @@ -5,7 +5,7 @@ from operator import 
itemgetter from tempfile import NamedTemporaryFile -from micall.core.denovo import write_contig_refs +from micall.utils.fasta_to_csv import fasta_to_svg def parse_args(): @@ -44,7 +44,7 @@ def main(): fasta_file.flush() new_contigs_csv = StringIO() blast_csv = StringIO() - write_contig_refs(fasta_file.name, new_contigs_csv, blast_csv=blast_csv) + fasta_to_svg(fasta_file.name, new_contigs_csv, blast_csv=blast_csv) blast_csv.seek(0) for source_contig_num, contig_rows in groupby(DictReader(blast_csv), itemgetter('contig_num')): diff --git a/micall/utils/fasta_to_csv.py b/micall/utils/fasta_to_csv.py index 41f3048eb..8bca5968c 100644 --- a/micall/utils/fasta_to_csv.py +++ b/micall/utils/fasta_to_csv.py @@ -195,15 +195,16 @@ def genotype(fasta: str, db: str = DEFAULT_DATABASE, return samples -def write_contig_refs(contigs_fasta_path: str, - contigs_csv: TextIO, - merged_contigs_csv: Optional[TextIO] = None, - blast_csv: Optional[TextIO] = None) -> None: +def fasta_to_svg(contigs_fasta_path: str, + contigs_csv: TextIO, + merged_contigs_csv: Optional[TextIO] = None, + blast_csv: Optional[TextIO] = None) -> None: """Run BLAST search to identify contig sequences and write them to CSV. Args: contigs_fasta_path (str): Path to the FASTA file containing contig sequences. contigs_csv (TextIO): Open file to write assembled contigs to. + merged_contigs_csv: open file to read contigs that were merged from amplicon reads. blast_csv (Optional[TextIO]): Open file to write BLAST search results for each contig. 
""" @@ -234,7 +235,7 @@ def main(argv: Sequence[str]): parser.add_argument('--blast_csv', type=argparse.FileType('w'), help="Optional CSV file to write BLAST search results.") args = parser.parse_args(argv) - write_contig_refs(args.contigs_fasta, args.contigs_csv, args.merged_contigs_csv, args.blast_csv) + fasta_to_svg(args.contigs_fasta, args.contigs_csv, args.merged_contigs_csv, args.blast_csv) if __name__ == "__main__": From 16d8e9db7a2b74985f2342a88f69dd2eabc0d830 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 6 Jun 2024 22:49:03 -0700 Subject: [PATCH 338/485] Rename "stitcher.run" to "contig_stitcher" --- micall/core/contig_stitcher.py | 4 ++-- micall/drivers/sample.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 90ddb4bab..7cf9c3418 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -640,7 +640,7 @@ def read_contigs(input_csv: TextIO) -> Iterable[GenotypedContig]: match_fraction=match_fraction) -def run(input_csv: TextIO, output_csv: TextIO, stitcher_plot_path: Optional[str]) -> int: +def contig_stitcher(input_csv: TextIO, output_csv: TextIO, stitcher_plot_path: Optional[str]) -> int: with StitcherContext.fresh() as ctx: contigs = list(read_contigs(input_csv)) @@ -685,7 +685,7 @@ def main(argv: Sequence[str]): logging.basicConfig(level=logger.level) plot_path = args.plot.name if args.plot is not None else None - run(args.contigs, args.stitched_contigs, plot_path) + contig_stitcher(args.contigs, args.stitched_contigs, plot_path) if __name__ == '__main__': diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index eefa799ad..41f73c997 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -9,6 +9,7 @@ from micall.core.aln2counts import aln2counts from micall.core.amplicon_finder import write_merge_lengths_plot, merge_for_entropy from micall.core.cascade_report import CascadeReport +from 
micall.core.contig_stitcher import contig_stitcher from micall.core.coverage_plots import coverage_plot, concordance_plot from micall.core.plot_contigs import plot_genome_coverage from micall.core.prelim_map import prelim_map @@ -21,7 +22,6 @@ from micall.utils.driver_utils import makedirs from micall.utils.fasta_to_csv import fasta_to_svg from contextlib import contextmanager -import micall.core.contig_stitcher as stitcher logger = logging.getLogger(__name__) @@ -437,7 +437,7 @@ def run_denovo(self, excluded_seeds): with open(self.unstitched_contigs_csv, 'r') as unstitched_contigs_csv, \ open(self.contigs_csv, 'w') as contigs_csv: - stitcher.run(unstitched_contigs_csv, contigs_csv, self.stitcher_plot_svg) + contig_stitcher(unstitched_contigs_csv, contigs_csv, self.stitcher_plot_svg) logger.info('Running remap on %s.', self) if self.debug_remap: From a17238a0d0c04f03d1443bdc1cb1a1e56bc09430 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 7 Jun 2024 09:29:07 -0700 Subject: [PATCH 339/485] Add help messages to denovo.py arguments --- micall/core/denovo.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 52c33e8d8..3614575a8 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -94,10 +94,24 @@ def denovo(fastq1_path: str, if __name__ == '__main__': logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser() - parser.add_argument('fastq1', type=argparse.FileType('r')) - parser.add_argument('fastq2', type=argparse.FileType('r')) - parser.add_argument('fasta', type=argparse.FileType('w')) + parser = argparse.ArgumentParser( + description="A script to perform de novo assembly of reads to build contigs." + ) + parser.add_argument( + 'fastq1', + type=argparse.FileType('r'), + help="Path to the FASTQ file containing read 1 of paired-end sequencing data." 
+ ) + parser.add_argument( + 'fastq2', + type=argparse.FileType('r'), + help="Path to the FASTQ file containing read 2 of paired-end sequencing data." + ) + parser.add_argument( + 'fasta', + type=argparse.FileType('w'), + help="Path to the output FASTA file where assembled contigs will be written." + ) args = parser.parse_args() denovo(args.fastq1.name, args.fastq2.name, args.fasta) From 344edd8fef5154f24e6d6d6e829f046d3f620c5a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 7 Jun 2024 10:09:01 -0700 Subject: [PATCH 340/485] Update change notes in Singularity image --- Singularity | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Singularity b/Singularity index 3b267501c..3180869e2 100644 --- a/Singularity +++ b/Singularity @@ -9,7 +9,8 @@ From: python:3.8 This Singularity container can be run on Kive: http://cfe-lab.github.io/Kive - Change Notes: Fix alignment bugs, and updated to HIVdb 9.4. + Change Notes: Comprehensive updates to the contig stitcher, + including bug fixes and visualization enhancements. 
%labels MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall From d78926fc39263e31bfd49ec77a4552da4d2d3a80 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 7 Jun 2024 14:34:53 -0700 Subject: [PATCH 341/485] Use unstitched_cascade.csv for proviral pipeline --- Singularity | 2 +- docs/steps.md | 7 +++++++ micall/monitor/kive_watcher.py | 6 ++++-- micall/tests/test_kive_watcher.py | 2 +- micall_docker.py | 3 ++- micall_kive.py | 3 +++ release_test_microtest.py | 1 + 7 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Singularity b/Singularity index 3180869e2..253586ab8 100644 --- a/Singularity +++ b/Singularity @@ -157,7 +157,7 @@ From: python:3.8 conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ genome_coverage_csv genome_coverage_svg genome_concordance_svg \ - unstitched_conseq_csv unstitched_contigs_csv contigs_csv \ + unstitched_cascade_csv unstitched_conseq_csv unstitched_contigs_csv contigs_csv \ read_entropy_csv conseq_region_csv conseq_stitched_csv KIVE_THREADS 2 KIVE_MEMORY 6000 diff --git a/docs/steps.md b/docs/steps.md index 28c87a1d6..bed3813e1 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -353,6 +353,13 @@ Individual files are described after the list of steps. 
* unstitched_conseq.csv * region - the region mapped to * sequence - the consensus sequence used +* unstitched_cascade.csv - number of read pairs that flow through the pipeline steps + * demultiplexed - count from the raw FASTQ + * v3loop - aligned with V3LOOP + * g2p - valid reads to count in G2P + * prelim_map - mapped to other references on first pass + * remap - mapped to other references after remapping + * aligned - aligned with a reference and merged with mate * resistance.csv * region - the region code, like PR or RT * drug_class - the drug class code from the HIVdb rules, like NRTI diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index fd22956fc..003a6526d 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -59,6 +59,7 @@ 'resistance_consensus_csv', 'wg_fasta', 'mid_fasta', + 'unstitched_cascade_csv', 'unstitched_conseq_csv', 'unstitched_contigs_csv', 'contigs_csv', @@ -939,12 +940,13 @@ def run_proviral_pipeline(self, sample_watcher, folder_watcher, description): run_dataset['argument_name']: run_dataset['dataset'] for run_dataset in main_run['datasets'] if run_dataset['argument_name'] in ('sample_info_csv', + 'unstitched_cascade_csv', 'unstitched_conseq_csv', - 'unstitched_contigs_csv', - 'cascade_csv')} + 'unstitched_contigs_csv')} input_datasets = { argument_name: self.kive_retry(lambda: self.session.get(url).json()) for argument_name, url in input_dataset_urls.items()} + input_datasets['cascade_csv'] = input_datasets.pop('unstitched_cascade_csv') input_datasets['conseqs_csv'] = input_datasets.pop('unstitched_conseq_csv') input_datasets['contigs_csv'] = input_datasets.pop('unstitched_contigs_csv') run = self.find_or_launch_run( diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 64a1e542b..4ec503c34 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -1765,7 +1765,7 @@ def 
test_launch_proviral_run(raw_data_with_two_samples, mock_open_kive): argument_name='unstitched_conseq_csv'), dict(dataset='/datasets/113/', argument_type='O', - argument_name='cascade_csv')]] # run datasets + argument_name='unstitched_cascade_csv')]] # run datasets mock_session.get.return_value.json.side_effect = [ dict(url='/datasets/110/', id=110), dict(url='/datasets/111/', id=111), diff --git a/micall_docker.py b/micall_docker.py index 18a70829c..4a32e2e39 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -1098,7 +1098,8 @@ def collate_samples(run_info: RunInfo): 'concordance.csv', 'concordance_seed.csv'] if run_info.is_denovo: - filenames += ['conseq_stitched.csv', 'conseq_region.csv', 'unstitched_conseq.csv'] + filenames += ['conseq_stitched.csv', 'conseq_region.csv', + 'unstitched_cascade.csv', 'unstitched_conseq.csv', 'unstitched_contigs.csv'] for filename in filenames: out_path = run_info.output_path with open(os.path.join(out_path, filename), 'w') as fout: diff --git a/micall_kive.py b/micall_kive.py index 60275cc11..b6650cfb6 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -80,6 +80,9 @@ def parse_args(): action='store_true', help='Use de novo assembly instead of mapping to ' 'reference sequences.') + parser.add_argument('unstitched_cascade_csv', + nargs='?', + help='count of reads at each step') parser.add_argument('unstitched_conseq_csv', nargs='?', help='CSV containing mapping unstitched consensus sequences') diff --git a/release_test_microtest.py b/release_test_microtest.py index d9e6440af..157cd13de 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -465,6 +465,7 @@ def process_sample(self, fastq_file: Path): 'genome_coverage.csv', 'genome_coverage.svg', 'genome_concordance.svg', + 'unstitched_cascade.csv', 'unstitched_conseq.csv', 'unstitched_contigs.csv', 'contigs.csv', From 3f924e3815041609ae42b7cf2b5fdbede2917d83 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 12 Jun 2024 10:13:56 -0700 Subject: [PATCH 
342/485] Fix the name of the debug_debug.sam to prevent overrides --- micall/drivers/sample.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index fecffd769..98b07e6f9 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -452,7 +452,7 @@ def run_denovo(self, excluded_seeds): excluded_seeds=excluded_seeds) def with_prefix(path): - return prepend_prefix_to_basename("unstitched_", path) + return path and prepend_prefix_to_basename("unstitched_", path) with open(self.unstitched_contigs_csv) as contigs_csv, \ open(with_prefix(self.remap_csv), 'w') as remap_csv, \ @@ -470,5 +470,5 @@ def with_prefix(path): unmapped1, unmapped2, scratch_path, - debug_file_prefix=debug_file_prefix, + debug_file_prefix=with_prefix(debug_file_prefix), excluded_seeds=excluded_seeds) From bcd45c4a05238bd3e154b3ea39a1253664876563 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 7 Jun 2024 10:09:01 -0700 Subject: [PATCH 343/485] Update change notes in Singularity image --- Singularity | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Singularity b/Singularity index 3b267501c..3180869e2 100644 --- a/Singularity +++ b/Singularity @@ -9,7 +9,8 @@ From: python:3.8 This Singularity container can be run on Kive: http://cfe-lab.github.io/Kive - Change Notes: Fix alignment bugs, and updated to HIVdb 9.4. + Change Notes: Comprehensive updates to the contig stitcher, + including bug fixes and visualization enhancements. 
%labels MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall From 03d8de4e6b87755c4fcd6b7ccb349de8707cbedd Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 7 Jun 2024 14:34:53 -0700 Subject: [PATCH 344/485] Use unstitched_cascade.csv for proviral pipeline --- Singularity | 2 +- docs/steps.md | 7 +++++++ micall/monitor/kive_watcher.py | 6 ++++-- micall/tests/test_kive_watcher.py | 2 +- micall_docker.py | 3 ++- micall_kive.py | 3 +++ release_test_microtest.py | 1 + 7 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Singularity b/Singularity index 3180869e2..253586ab8 100644 --- a/Singularity +++ b/Singularity @@ -157,7 +157,7 @@ From: python:3.8 conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \ coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \ genome_coverage_csv genome_coverage_svg genome_concordance_svg \ - unstitched_conseq_csv unstitched_contigs_csv contigs_csv \ + unstitched_cascade_csv unstitched_conseq_csv unstitched_contigs_csv contigs_csv \ read_entropy_csv conseq_region_csv conseq_stitched_csv KIVE_THREADS 2 KIVE_MEMORY 6000 diff --git a/docs/steps.md b/docs/steps.md index 747f7665c..c0c748f7e 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -353,6 +353,13 @@ Individual files are described after the list of steps. 
* unstitched_conseq.csv * region - the region mapped to * sequence - the consensus sequence used +* unstitched_cascade.csv - number of read pairs that flow through the pipeline steps + * demultiplexed - count from the raw FASTQ + * v3loop - aligned with V3LOOP + * g2p - valid reads to count in G2P + * prelim_map - mapped to other references on first pass + * remap - mapped to other references after remapping + * aligned - aligned with a reference and merged with mate * resistance.csv * region - the region code, like PR or RT * drug_class - the drug class code from the HIVdb rules, like NRTI diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py index fd22956fc..003a6526d 100644 --- a/micall/monitor/kive_watcher.py +++ b/micall/monitor/kive_watcher.py @@ -59,6 +59,7 @@ 'resistance_consensus_csv', 'wg_fasta', 'mid_fasta', + 'unstitched_cascade_csv', 'unstitched_conseq_csv', 'unstitched_contigs_csv', 'contigs_csv', @@ -939,12 +940,13 @@ def run_proviral_pipeline(self, sample_watcher, folder_watcher, description): run_dataset['argument_name']: run_dataset['dataset'] for run_dataset in main_run['datasets'] if run_dataset['argument_name'] in ('sample_info_csv', + 'unstitched_cascade_csv', 'unstitched_conseq_csv', - 'unstitched_contigs_csv', - 'cascade_csv')} + 'unstitched_contigs_csv')} input_datasets = { argument_name: self.kive_retry(lambda: self.session.get(url).json()) for argument_name, url in input_dataset_urls.items()} + input_datasets['cascade_csv'] = input_datasets.pop('unstitched_cascade_csv') input_datasets['conseqs_csv'] = input_datasets.pop('unstitched_conseq_csv') input_datasets['contigs_csv'] = input_datasets.pop('unstitched_contigs_csv') run = self.find_or_launch_run( diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 64a1e542b..4ec503c34 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -1765,7 +1765,7 @@ def 
test_launch_proviral_run(raw_data_with_two_samples, mock_open_kive): argument_name='unstitched_conseq_csv'), dict(dataset='/datasets/113/', argument_type='O', - argument_name='cascade_csv')]] # run datasets + argument_name='unstitched_cascade_csv')]] # run datasets mock_session.get.return_value.json.side_effect = [ dict(url='/datasets/110/', id=110), dict(url='/datasets/111/', id=111), diff --git a/micall_docker.py b/micall_docker.py index 18a70829c..4a32e2e39 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -1098,7 +1098,8 @@ def collate_samples(run_info: RunInfo): 'concordance.csv', 'concordance_seed.csv'] if run_info.is_denovo: - filenames += ['conseq_stitched.csv', 'conseq_region.csv', 'unstitched_conseq.csv'] + filenames += ['conseq_stitched.csv', 'conseq_region.csv', + 'unstitched_cascade.csv', 'unstitched_conseq.csv', 'unstitched_contigs.csv'] for filename in filenames: out_path = run_info.output_path with open(os.path.join(out_path, filename), 'w') as fout: diff --git a/micall_kive.py b/micall_kive.py index 60275cc11..b6650cfb6 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -80,6 +80,9 @@ def parse_args(): action='store_true', help='Use de novo assembly instead of mapping to ' 'reference sequences.') + parser.add_argument('unstitched_cascade_csv', + nargs='?', + help='count of reads at each step') parser.add_argument('unstitched_conseq_csv', nargs='?', help='CSV containing mapping unstitched consensus sequences') diff --git a/release_test_microtest.py b/release_test_microtest.py index d9e6440af..157cd13de 100644 --- a/release_test_microtest.py +++ b/release_test_microtest.py @@ -465,6 +465,7 @@ def process_sample(self, fastq_file: Path): 'genome_coverage.csv', 'genome_coverage.svg', 'genome_concordance.svg', + 'unstitched_cascade.csv', 'unstitched_conseq.csv', 'unstitched_contigs.csv', 'contigs.csv', From fe7bbbeea0e60391bda2c64f5372aa16c38d49d8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 17 Jun 2024 13:16:06 -0700 Subject: [PATCH 
345/485] Fix names of dropped contigs in contig stitcher logs --- micall/utils/contig_stitcher_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/utils/contig_stitcher_events.py b/micall/utils/contig_stitcher_events.py index 68c1c5cc0..bfa5c5ed2 100644 --- a/micall/utils/contig_stitcher_events.py +++ b/micall/utils/contig_stitcher_events.py @@ -240,7 +240,7 @@ class Drop: covering: List[AlignedContig] def __str__(self) -> str: - covering_contig_names = ', '.join(repr(x.name) for x in self.covering) + covering_contig_names = ', '.join(repr(x.unique_name) for x in self.covering) return ( f"Dropped contig {self.contig.unique_name} as it is completely covered by these contigs: " f"{covering_contig_names}." From 9b16499371164ce89b95f95eb66b0943a9f555bd Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 25 Jun 2024 10:48:59 -0700 Subject: [PATCH 346/485] Copy SampleSheetOverrides file in release_test_setup.py --- release_test_setup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/release_test_setup.py b/release_test_setup.py index 60480cbea..9d2df4578 100644 --- a/release_test_setup.py +++ b/release_test_setup.py @@ -173,9 +173,14 @@ def setup_run(self): os.symlink(interop_source, interop_target) for filename in ('RunInfo.xml', 'SampleSheet.csv', + 'SampleSheetOverrides.csv', 'needsprocessing'): - copy(os.path.join(self.run_name, filename), - os.path.join(target_run_path, filename)) + + source = os.path.join(self.run_name, filename) + target = os.path.join(target_run_path, filename) + if os.path.exists(source): + copy(source, target) + results_path = os.path.join(target_run_path, 'Results', 'version_' + self.config.pipeline_version) From 7a3fc2f8456d7c945f9aabe170943c77b3926304 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 26 Jun 2024 14:46:32 -0700 Subject: [PATCH 347/485] Save git version and history in Singularity container This commit adds functionality to save the current git version and a 
portion of the git history within the Singularity container to improve traceability and reproducibility. We capture the git commit hash and the last 10 git commit logs into a specific directory inside the container. The `.git` directory is utilized temporarily for this purpose and then removed to keep the container lightweight. --- Singularity | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Singularity b/Singularity index 253586ab8..090a80498 100644 --- a/Singularity +++ b/Singularity @@ -31,6 +31,9 @@ From: python:3.8 mkdir ${SINGULARITY_ROOTFS}/opt/micall/micall %files + ## Git files + .git /opt/micall/.git + ## MiCall micall_docker.py /opt/micall/ micall_kive.py /opt/micall/ @@ -57,6 +60,13 @@ From: python:3.8 apt-get update -q apt-get install -q -y unzip wget + echo ===== Saving git version ===== >/dev/null + # Git is expected to be already installed. + mkdir -p /etc/micall + git -C /opt/micall/ rev-parse HEAD > /etc/micall/git-version + git -C /opt/micall/ log -n 10 > /etc/micall/git-log + rm -rf /opt/micall/.git + echo ===== Installing blast ===== >/dev/null apt-get install -q -y ncbi-blast+ From c73f3e35bf92acbd3cadc08bfe2d23643188ce24 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 26 Jun 2024 16:28:45 -0700 Subject: [PATCH 348/485] Record `git describe` output in Singularity container --- Singularity | 1 + 1 file changed, 1 insertion(+) diff --git a/Singularity b/Singularity index 090a80498..6ac3a38df 100644 --- a/Singularity +++ b/Singularity @@ -64,6 +64,7 @@ From: python:3.8 # Git is expected to be already installed. 
mkdir -p /etc/micall git -C /opt/micall/ rev-parse HEAD > /etc/micall/git-version + git -C /opt/micall/ -c 'core.fileMode=false' describe --tags --dirty 1>&2 > /etc/micall/git-describe || true git -C /opt/micall/ log -n 10 > /etc/micall/git-log rm -rf /opt/micall/.git From f5b361f26b92da5305ab3bf0b970f550e90e72f5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 31 May 2024 11:19:35 -0700 Subject: [PATCH 349/485] Do not crash stitcher visualizer on group_ref = None --- micall/core/plot_contigs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 4e12edbb8..4c0783e47 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1068,7 +1068,10 @@ def add_section(title: str) -> None: figure = Figure() for group_ref in group_refs: try: - landmarks = landmark_reader.get_landmarks(group_ref) + if group_ref is not None: + landmarks = landmark_reader.get_landmarks(group_ref) + else: + landmarks = None except ValueError: landmarks = None From 40b0a4c8309fb04d51becd491a68fa7f33371778 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 31 May 2024 11:10:37 -0700 Subject: [PATCH 350/485] Contig stitcher: fix discards,anomalies labels in multiple groups cases --- micall/core/plot_contigs.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/micall/core/plot_contigs.py b/micall/core/plot_contigs.py index 4c0783e47..2d0db2ae2 100644 --- a/micall/core/plot_contigs.py +++ b/micall/core/plot_contigs.py @@ -1187,8 +1187,7 @@ def add_section(title: str) -> None: # Discarded # ############# - if discarded or merged_unaligned_parts: - add_section("discards:") + def get_group_discards(group_ref): for root in sorted_roots: if contig_map[root].group_ref != group_ref: continue @@ -1201,25 +1200,36 @@ def add_section(title: str) -> None: if id in unaligned_parts: (q_st, q_ei) = query_position_map[id] label = name_map[id] - 
figure.add(Track(position_offset, position_offset + abs(q_ei - q_st), - label=label, color="yellow")) + yield Track(position_offset, position_offset + abs(q_ei - q_st), + label=label, color="yellow") else: part = contig_map[id] - figure.add(Multitrack(list(get_tracks([part])))) + yield Multitrack(list(get_tracks([part]))) + + disc = list(get_group_discards(group_ref)) + if disc: + add_section("discards:") + for element in disc: + figure.add(element) ############# # Anomalies # ############# - if anomaly: - add_section("anomaly:") + def get_group_anomalies(group_ref): for root in sorted_roots: parts_ids = final_children_mapping[root] parts_ids = [name for name in parts_ids if name in anomaly] parts = [contig_map[name] for name in parts_ids] parts = [part for part in parts if part.group_ref == group_ref] for part in parts: - figure.add(Multitrack(list(get_tracks([part])))) + yield Multitrack(list(get_tracks([part]))) + + anom = list(get_group_anomalies(group_ref)) + if anom: + add_section("anomaly:") + for element in anom: + figure.add(element) ########### # Unknown # From b57c63ba5c8b7c1c1c81e4cbda968d34fce80f4f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 31 May 2024 11:31:30 -0700 Subject: [PATCH 351/485] Contig stitcher: add a visualizer test --- ...labeling_of_different_organism_contigs.svg | 500 ++++++++++++++++++ micall/tests/test_contig_stitcher.py | 64 +++ 2 files changed, 564 insertions(+) create mode 100644 micall/tests/data/stitcher_plots/test_visualizer_correct_labeling_of_different_organism_contigs.svg diff --git a/micall/tests/data/stitcher_plots/test_visualizer_correct_labeling_of_different_organism_contigs.svg b/micall/tests/data/stitcher_plots/test_visualizer_correct_labeling_of_different_organism_contigs.svg new file mode 100644 index 000000000..e550e9a1a --- /dev/null +++ b/micall/tests/data/stitcher_plots/test_visualizer_correct_labeling_of_different_organism_contigs.svg @@ -0,0 +1,500 @@ + + + + + + + + + + + + + + + + + + +testref-1 
+ + + + + + + +1 + + + + + + + + + + + + +1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +anomaly: + + + + + + +3 + + + + + + + + + + + + + + + + + +testref-2 + + + + + + + +2 + + + + + + + + + + + + +2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +discards: + + + + + + +4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +anomaly: + + + + + + +5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +unknown: + + + + + + +6 + + + + \ No newline at end of file diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index 16af8c722..a57c2d05b 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -1508,6 +1508,70 @@ def test_visualizer_simple(exact_aligner, tmp_path, hcv_db): ), "The contents of the stitched plot file do not match the expected contents." +def test_visualizer_correct_labeling_of_different_organism_contigs( + exact_aligner, visualizer +): + # Scenario: Some discarded and anomaly contigs correctly labelled. 
+ + ref_seq = "A" * 100 + "C" * 100 + + contigs = [ + GenotypedContig( + name="a", + seq="A" * 50, + ref_name="testref-1", + group_ref="testref-1", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b", + seq="C" * 50, + ref_name="testref-2", + group_ref="testref-2", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="a_anomaly", + seq="D" * 50, + ref_name="testref-1", + group_ref="testref-1", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="b_discarded", + seq="C" * 20, + ref_name="testref-2", + group_ref="testref-2", + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="some_anomaly", + seq="T" * 20, + ref_name='unknown', + group_ref=None, + ref_seq=ref_seq, + match_fraction=0.5, + ), + GenotypedContig( + name="some_unknown", + seq="T" * 20, + ref_name='unknown', + group_ref=None, + ref_seq=None, + match_fraction=0.5, + ), + ] + + results = list(stitch_contigs(contigs)) + assert len(results) == 5 + + assert len(visualizer().elements) > len(contigs) + + # _ _ _ _ _ _ # | | | |_ __ (_) |_ | |_ ___ ___| |_ ___ # | | | | '_ \| | __| | __/ _ \/ __| __/ __| From 67cbed99203821d90b6b70cf3775dcdf9c18adbd Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 31 May 2024 13:57:50 -0700 Subject: [PATCH 352/485] Update conseq.csv:region field documentation --- docs/steps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/steps.md b/docs/steps.md index c0c748f7e..bed3813e1 100644 --- a/docs/steps.md +++ b/docs/steps.md @@ -181,7 +181,7 @@ Individual files are described after the list of steps. * remap - mapped to other references after remapping * aligned - aligned with a reference and merged with mate * conseq.csv - * region - seed region it mapped to + * region - the name of the contig. Includes the name of the reference seed, plus an optional prefix, which is a number that makes the name unique. 
* q-cutoff - minimum quality score * consensus-percent-cutoff - to be included in a mixture, a variant must make up at least this fraction of the total valid counts From 4b88a891b835dc19b27abf4aceaecd14c1ce9a41 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 07:15:31 +0000 Subject: [PATCH 353/485] Bump gprof2dot from 2022.7.29 to 2024.6.6 Bumps [gprof2dot](https://github.com/jrfonseca/gprof2dot) from 2022.7.29 to 2024.6.6. - [Release notes](https://github.com/jrfonseca/gprof2dot/releases) - [Commits](https://github.com/jrfonseca/gprof2dot/compare/2022.07.29...2024.06.06) --- updated-dependencies: - dependency-name: gprof2dot dependency-type: direct:development update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 6a5be4fb9..0ce57d1fb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,6 +2,6 @@ -r requirements-test.txt -r requirements-watcher.txt # Used for plotting profiling results. -gprof2dot==2022.7.29 +gprof2dot==2024.6.6 mypy==1.10.0 mypy-extensions==1.0.0 From 52dfc8d69256c5bd564d232694af43a03503d903 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 5 Jun 2024 08:00:07 +0000 Subject: [PATCH 354/485] Bump pytest from 8.2.1 to 8.2.2 Bumps [pytest](https://github.com/pytest-dev/pytest) from 8.2.1 to 8.2.2. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.2.1...8.2.2) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 59ac54223..234244fd5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -4,7 +4,7 @@ -r requirements-watcher.txt mypy==1.10.0 mypy-extensions==1.0.0 -pytest==8.2.1 +pytest==8.2.2 coverage==7.5.2 pandas==2.0.2 seaborn==0.13.2 From 05a7d876ba522b90f28af8acc86b7e174a30f888 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:07:59 +0000 Subject: [PATCH 355/485] Bump coverage from 7.5.2 to 7.5.3 Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.2 to 7.5.3. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.2...7.5.3) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 234244fd5..55e681f42 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,7 +5,7 @@ mypy==1.10.0 mypy-extensions==1.0.0 pytest==8.2.2 -coverage==7.5.2 +coverage==7.5.3 pandas==2.0.2 seaborn==0.13.2 ete3 From becc2b778e2bd407fdd9d24c3b74fa00bd3ebd75 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:53:26 +0000 Subject: [PATCH 356/485] Bump openpyxl from 3.1.2 to 3.1.3 Bumps [openpyxl](https://openpyxl.readthedocs.io) from 3.1.2 to 3.1.3. --- updated-dependencies: - dependency-name: openpyxl dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 55e681f42..6484b879e 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -12,4 +12,4 @@ ete3 flake8==7.0.0 # For reading HCV rules from an Excel file. -openpyxl==3.1.2 +openpyxl==3.1.3 From 06a10647b21821a26b9a0f408893756aa24f2677 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Jun 2024 21:09:22 +0000 Subject: [PATCH 357/485] Bump requests from 2.32.1 to 2.32.3 Bumps [requests](https://github.com/psf/requests) from 2.32.1 to 2.32.3. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.32.1...v2.32.3) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- requirements-basespace.txt | 2 +- requirements-watcher.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-basespace.txt b/requirements-basespace.txt index 8ee75d522..8613fa60d 100644 --- a/requirements-basespace.txt +++ b/requirements-basespace.txt @@ -1,3 +1,3 @@ # Requirements for running the micall_basespace.py script -r requirements.txt -requests==2.32.1 +requests==2.32.3 diff --git a/requirements-watcher.txt b/requirements-watcher.txt index a4e0c2e0d..7cc63208e 100644 --- a/requirements-watcher.txt +++ b/requirements-watcher.txt @@ -1,3 +1,3 @@ # Requirements for running the MISEQ_MONITOR.py script git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api -requests==2.32.1 +requests==2.32.3 From a69f035bd135206fe4968fe61b58f5bfe9c91fe0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 10 Jul 2024 13:27:58 -0700 Subject: [PATCH 358/485] Add a failing test for sample sheet parser This case tests the behaviour described in #888 When the issue is solved, we should unmark this case as "Failure". --- micall/tests/test_sample_sheet_parser.py | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/micall/tests/test_sample_sheet_parser.py b/micall/tests/test_sample_sheet_parser.py index 8364aeed1..45bab7911 100644 --- a/micall/tests/test_sample_sheet_parser.py +++ b/micall/tests/test_sample_sheet_parser.py @@ -730,6 +730,43 @@ def test_extra_commas(self): ss = sample_sheet_parser(StringIO(stub_sample_sheet)) self.assertEqual(ss["Experiment Name"], "10-Jul-2014") + @unittest.expectedFailure + def test_underscores_in_sample_name(self): + """ + Extracts the correct project code having multiple options. 
+ """ + + stub_sample_sheet = """ +[Header] +IEMFileVersion,3 +Investigator Name,RL +Project Name,10-Jul-2014_v1test +Experiment Name,10-Jul-2014_v1test +Date,07/10/2014 +Workflow,GenerateFASTQ +Assay,Nextera +Description,Nextera +Chemistry,Amplicon +[Reads] +251 +251 +[Settings] +[Data] +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder +CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,TGCATGCA,\ +10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \ +Disablecontamcheck:Sample1_Proj1:FALSE, +CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,CCCCTTTT,\ +10-Jul-2014_testing,Research:Sample2_Foo_Proj2:FALSE Comments:Sample2_Foo_Proj2:thisiscommenttwo \ +Chemistry:Sample2_Foo_Proj2:BreakingBad Disablecontamcheck:Sample2_Foo_Proj2:TRUE, +""" + + ss = sample_sheet_parser(StringIO(stub_sample_sheet)) + split_rows = ss['DataSplit'] + assert len(split_rows) == 2 + assert split_rows[0]['project'] == 'Proj1' + assert split_rows[1]['project'] == 'Proj2' + def test_read_sample_sheet_overrides(tmpdir): sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv' From 6e452f2bcbcd41af0bb95cf686a0af59ab28c0fd Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Aug 2023 16:03:58 -0700 Subject: [PATCH 359/485] Make samplesheet parser tolerable to underscores in enum names Solves #888 --- micall/tests/test_sample_sheet_parser.py | 1 - micall/utils/sample_sheet_parser.py | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/micall/tests/test_sample_sheet_parser.py b/micall/tests/test_sample_sheet_parser.py index 45bab7911..d853887f9 100644 --- a/micall/tests/test_sample_sheet_parser.py +++ b/micall/tests/test_sample_sheet_parser.py @@ -730,7 +730,6 @@ def test_extra_commas(self): ss = sample_sheet_parser(StringIO(stub_sample_sheet)) self.assertEqual(ss["Experiment Name"], "10-Jul-2014") - 
@unittest.expectedFailure def test_underscores_in_sample_name(self): """ Extracts the correct project code having multiple options. diff --git a/micall/utils/sample_sheet_parser.py b/micall/utils/sample_sheet_parser.py index 45ce675dd..f6edfec57 100755 --- a/micall/utils/sample_sheet_parser.py +++ b/micall/utils/sample_sheet_parser.py @@ -179,9 +179,12 @@ def sample_sheet_parser(handle): samp, proj, val = None, None, None if sample_sheet_version == 1: sj, val = elem.split(':') - samp, proj = sj.split(project_delimiter_v1) + components = sj.split(project_delimiter_v1) + samp, proj = (project_delimiter_v1.join(components[:-1]), components[-1]) elif sample_sheet_version == 2: - samp, proj, val = elem.split(project_delimiter_v2) + components = elem.split(project_delimiter_v2) + samp, proj, val = (project_delimiter_v2.join(components[:-2]), + components[-2], components[-1]) if samp == entry['sample'] and proj == entry['project']: if name == 'Research': From c9c6dee01bd27c5441130f43010f244f341f9804 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 10 Jul 2024 13:09:31 -0700 Subject: [PATCH 360/485] Add more tests for sample sheet parsing with underscores --- micall/tests/test_sample_sheet_parser.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/micall/tests/test_sample_sheet_parser.py b/micall/tests/test_sample_sheet_parser.py index d853887f9..ed401d451 100644 --- a/micall/tests/test_sample_sheet_parser.py +++ b/micall/tests/test_sample_sheet_parser.py @@ -732,7 +732,7 @@ def test_extra_commas(self): def test_underscores_in_sample_name(self): """ - Extracts the correct project code having multiple options. + Extracts the correct project code and sample name in presence of underscores. 
""" stub_sample_sheet = """ @@ -763,9 +763,19 @@ def test_underscores_in_sample_name(self): ss = sample_sheet_parser(StringIO(stub_sample_sheet)) split_rows = ss['DataSplit'] assert len(split_rows) == 2 + + assert split_rows[0]['filename'] == 'Sample1-Proj1_S1' + assert split_rows[1]['filename'] == 'Sample2-Proj2_S2' + assert split_rows[0]['project'] == 'Proj1' assert split_rows[1]['project'] == 'Proj2' + assert split_rows[0]['sample'] == 'Sample1' + assert split_rows[1]['sample'] == 'Sample2' + + assert split_rows[0]['sample_number'] == 'S1' + assert split_rows[1]['sample_number'] == 'S2' + def test_read_sample_sheet_overrides(tmpdir): sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv' From 9062708fe127b22066653d1c1f5c1073249b7eeb Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 10 Jul 2024 13:27:58 -0700 Subject: [PATCH 361/485] Add a failing test for sample sheet parser This case tests the behaviour described in #888 When the issue is solved, we should unmark this case as "Failure". --- micall/tests/test_sample_sheet_parser.py | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/micall/tests/test_sample_sheet_parser.py b/micall/tests/test_sample_sheet_parser.py index 8364aeed1..45bab7911 100644 --- a/micall/tests/test_sample_sheet_parser.py +++ b/micall/tests/test_sample_sheet_parser.py @@ -730,6 +730,43 @@ def test_extra_commas(self): ss = sample_sheet_parser(StringIO(stub_sample_sheet)) self.assertEqual(ss["Experiment Name"], "10-Jul-2014") + @unittest.expectedFailure + def test_underscores_in_sample_name(self): + """ + Extracts the correct project code having multiple options. 
+ """ + + stub_sample_sheet = """ +[Header] +IEMFileVersion,3 +Investigator Name,RL +Project Name,10-Jul-2014_v1test +Experiment Name,10-Jul-2014_v1test +Date,07/10/2014 +Workflow,GenerateFASTQ +Assay,Nextera +Description,Nextera +Chemistry,Amplicon +[Reads] +251 +251 +[Settings] +[Data] +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,index,index2,Sample_Project,Description,GenomeFolder +CFE_SomeId_10-Jul-2014_N501-N701_Sample1_Proj1,Sample1_Proj1,10-Jul-2014_testing,N/A,ACGTACGT,TGCATGCA,\ +10-Jul-2014_testing,Research:Sample1_Proj1:TRUE Comments:Sample1_Proj1:thisiscommentone \ +Disablecontamcheck:Sample1_Proj1:FALSE, +CFE_SomeId_10-Jul-2014_N501-N702_Sample2_Proj2,Sample2_Proj2,10-Jul-2014_testing,N/A,AAAAGGGG,CCCCTTTT,\ +10-Jul-2014_testing,Research:Sample2_Foo_Proj2:FALSE Comments:Sample2_Foo_Proj2:thisiscommenttwo \ +Chemistry:Sample2_Foo_Proj2:BreakingBad Disablecontamcheck:Sample2_Foo_Proj2:TRUE, +""" + + ss = sample_sheet_parser(StringIO(stub_sample_sheet)) + split_rows = ss['DataSplit'] + assert len(split_rows) == 2 + assert split_rows[0]['project'] == 'Proj1' + assert split_rows[1]['project'] == 'Proj2' + def test_read_sample_sheet_overrides(tmpdir): sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv' From df04f9f3c703178a16b239dd4f4f58281874cc7f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Aug 2023 16:03:58 -0700 Subject: [PATCH 362/485] Make samplesheet parser tolerable to underscores in enum names Solves #888 --- micall/tests/test_sample_sheet_parser.py | 1 - micall/utils/sample_sheet_parser.py | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/micall/tests/test_sample_sheet_parser.py b/micall/tests/test_sample_sheet_parser.py index 45bab7911..d853887f9 100644 --- a/micall/tests/test_sample_sheet_parser.py +++ b/micall/tests/test_sample_sheet_parser.py @@ -730,7 +730,6 @@ def test_extra_commas(self): ss = sample_sheet_parser(StringIO(stub_sample_sheet)) self.assertEqual(ss["Experiment Name"], "10-Jul-2014") - 
@unittest.expectedFailure def test_underscores_in_sample_name(self): """ Extracts the correct project code having multiple options. diff --git a/micall/utils/sample_sheet_parser.py b/micall/utils/sample_sheet_parser.py index 45ce675dd..f6edfec57 100755 --- a/micall/utils/sample_sheet_parser.py +++ b/micall/utils/sample_sheet_parser.py @@ -179,9 +179,12 @@ def sample_sheet_parser(handle): samp, proj, val = None, None, None if sample_sheet_version == 1: sj, val = elem.split(':') - samp, proj = sj.split(project_delimiter_v1) + components = sj.split(project_delimiter_v1) + samp, proj = (project_delimiter_v1.join(components[:-1]), components[-1]) elif sample_sheet_version == 2: - samp, proj, val = elem.split(project_delimiter_v2) + components = elem.split(project_delimiter_v2) + samp, proj, val = (project_delimiter_v2.join(components[:-2]), + components[-2], components[-1]) if samp == entry['sample'] and proj == entry['project']: if name == 'Research': From b9d9b1be73c17b48d6a5e19d60705fa6ffa97b44 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 10 Jul 2024 13:09:31 -0700 Subject: [PATCH 363/485] Add more tests for sample sheet parsing with underscores --- micall/tests/test_sample_sheet_parser.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/micall/tests/test_sample_sheet_parser.py b/micall/tests/test_sample_sheet_parser.py index d853887f9..ed401d451 100644 --- a/micall/tests/test_sample_sheet_parser.py +++ b/micall/tests/test_sample_sheet_parser.py @@ -732,7 +732,7 @@ def test_extra_commas(self): def test_underscores_in_sample_name(self): """ - Extracts the correct project code having multiple options. + Extracts the correct project code and sample name in presence of underscores. 
""" stub_sample_sheet = """ @@ -763,9 +763,19 @@ def test_underscores_in_sample_name(self): ss = sample_sheet_parser(StringIO(stub_sample_sheet)) split_rows = ss['DataSplit'] assert len(split_rows) == 2 + + assert split_rows[0]['filename'] == 'Sample1-Proj1_S1' + assert split_rows[1]['filename'] == 'Sample2-Proj2_S2' + assert split_rows[0]['project'] == 'Proj1' assert split_rows[1]['project'] == 'Proj2' + assert split_rows[0]['sample'] == 'Sample1' + assert split_rows[1]['sample'] == 'Sample2' + + assert split_rows[0]['sample_number'] == 'S1' + assert split_rows[1]['sample_number'] == 'S2' + def test_read_sample_sheet_overrides(tmpdir): sample_sheet_path = Path(str(tmpdir)) / 'SampleSheet.csv' From 9fb50bd9b4281245f55e57aa9e0c1711dcad719e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 22 Aug 2024 20:41:33 +0000 Subject: [PATCH 364/485] Bump rexml from 3.2.8 to 3.3.6 in /docs Bumps [rexml](https://github.com/ruby/rexml) from 3.2.8 to 3.3.6. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.2.8...v3.3.6) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index ea3f48825..f975d6ac5 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -236,8 +236,8 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.8) - strscan (>= 3.0.9) + rexml (3.3.6) + strscan rouge (3.30.0) rubyzip (2.3.2) safe_yaml (1.0.5) From dd73d657f0b0c557af3d034279a87435f9fac625 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 10 Sep 2024 11:32:14 -0700 Subject: [PATCH 365/485] Add a useful comment to release_test_compare.py script --- release_test_compare.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/release_test_compare.py b/release_test_compare.py index 3f4fc15c7..197ac857e 100644 --- a/release_test_compare.py +++ b/release_test_compare.py @@ -485,6 +485,8 @@ def compare_consensus(sample: Sample, continue source_nucs = [] target_nucs = [] + + # Note: if either source or target region is missing, it might be because its coverage score is below 4. 
if source_details is None: has_big_change = True target_nucs = [nuc for nuc, row in target_details] From 6e090b394114473131668880f03347f147cf1c4a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 19 Sep 2024 14:07:20 -0700 Subject: [PATCH 366/485] Bump CI/github/upload-artifacts version --- .github/workflows/build-and-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index f0b41a2ca..efe6ff494 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -122,7 +122,7 @@ jobs: run: python release_test_microtest.py micall.simg - name: Upload Singularity Image - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: micall.simg path: micall.simg From 5ad8431cf76a400f74e082d0c130eb52773d7734 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 18:34:31 +0000 Subject: [PATCH 367/485] Bump jekyll from 3.9.5 to 3.10.0 in /docs Bumps [jekyll](https://github.com/jekyll/jekyll) from 3.9.5 to 3.10.0. - [Release notes](https://github.com/jekyll/jekyll/releases) - [Changelog](https://github.com/jekyll/jekyll/blob/master/History.markdown) - [Commits](https://github.com/jekyll/jekyll/compare/v3.9.5...v3.10.0) --- updated-dependencies: - dependency-name: jekyll dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index f975d6ac5..12f102f9d 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -11,8 +11,8 @@ GEM minitest (>= 5.1) mutex_m tzinfo (~> 2.0) - addressable (2.8.6) - public_suffix (>= 2.0.2, < 6.0) + addressable (2.8.7) + public_suffix (>= 2.0.2, < 7.0) base64 (0.2.0) bigdecimal (3.1.6) coffee-script (2.4.1) @@ -21,8 +21,9 @@ GEM coffee-script-source (1.12.2) colorator (1.1.0) commonmarker (0.23.10) - concurrent-ruby (1.2.3) + concurrent-ruby (1.3.4) connection_pool (2.4.1) + csv (3.3.0) dnsruby (1.71.0) simpleidn (~> 0.2.1) drb (2.2.1) @@ -37,15 +38,15 @@ GEM faraday-net_http (>= 2.0, < 3.2) faraday-net_http (3.1.0) net-http - ffi (1.16.3) + ffi (1.17.0) forwardable-extended (2.6.0) gemoji (4.1.0) - github-pages (231) + github-pages (232) github-pages-health-check (= 1.18.2) - jekyll (= 3.9.5) + jekyll (= 3.10.0) jekyll-avatar (= 0.8.0) jekyll-coffeescript (= 1.2.2) - jekyll-commonmark-ghpages (= 0.4.0) + jekyll-commonmark-ghpages (= 0.5.1) jekyll-default-layout (= 0.1.5) jekyll-feed (= 0.17.0) jekyll-gist (= 1.5.0) @@ -82,9 +83,10 @@ GEM liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) - nokogiri (>= 1.13.6, < 2.0) + nokogiri (>= 1.16.2, < 2.0) rouge (= 3.30.0) terminal-table (~> 1.4) + webrick (~> 1.8) github-pages-health-check (1.18.2) addressable (~> 2.3) dnsruby (~> 1.60) @@ -95,11 +97,12 @@ GEM activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.8.0) - i18n (1.14.4) + i18n (1.14.6) concurrent-ruby (~> 1.0) - jekyll (3.9.5) + jekyll (3.10.0) addressable (~> 2.4) colorator (~> 1.0) + csv (~> 3.0) em-websocket (~> 0.5) i18n (>= 0.7, < 2) jekyll-sass-converter (~> 1.0) @@ -110,6 +113,7 @@ GEM pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) + webrick (>= 1.0) jekyll-avatar (0.8.0) jekyll (>= 3.0, < 5.0) jekyll-coffeescript (1.2.2) @@ -117,9 
+121,9 @@ GEM coffee-script-source (~> 1.12) jekyll-commonmark (1.4.0) commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.4.0) - commonmarker (~> 0.23.7) - jekyll (~> 3.9.0) + jekyll-commonmark-ghpages (0.5.1) + commonmarker (>= 0.23.7, < 1.1.0) + jekyll (>= 3.9, < 4.0) jekyll-commonmark (~> 1.4.0) rouge (>= 2.0, < 5.0) jekyll-default-layout (0.1.5) @@ -231,13 +235,12 @@ GEM sawyer (~> 0.9) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (5.0.4) + public_suffix (5.1.1) racc (1.7.3) rb-fsevent (0.11.2) - rb-inotify (0.10.1) + rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.3.6) - strscan + rexml (3.3.7) rouge (3.30.0) rubyzip (2.3.2) safe_yaml (1.0.5) @@ -251,7 +254,6 @@ GEM faraday (>= 0.17.3, < 3) simpleidn (0.2.1) unf (~> 0.1.4) - strscan (3.1.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) typhoeus (1.4.1) @@ -263,6 +265,7 @@ GEM unf_ext (0.0.9.1) unicode-display_width (1.8.0) uri (0.13.0) + webrick (1.8.2) PLATFORMS ruby From 72514d9e0ede890fb407bd6589230a9276a88fea Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 1 Oct 2024 11:00:25 -0700 Subject: [PATCH 368/485] Add "MiCall at BC CfE" page to docs --- docs/_data/navigation.yml | 4 +- docs/micall_at_bccfe.md | 83 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 docs/micall_at_bccfe.md diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index ca43a60ef..b2df72767 100644 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -17,5 +17,7 @@ link: /design/resistance.html - name: Genome Coverage link: /genome_coverage.html +- name: MiCall at BC CfE + link: /micall_at_bccfe.html - name: GitHub - link: https://github.com/cfe-lab/MiCall \ No newline at end of file + link: https://github.com/cfe-lab/MiCall diff --git a/docs/micall_at_bccfe.md b/docs/micall_at_bccfe.md new file mode 100644 index 000000000..5d7e96850 --- /dev/null +++ b/docs/micall_at_bccfe.md @@ -0,0 +1,83 @@ + +# MiCall at BC 
CfE + +The **MiCall** tool is designed to process **FASTQ** data generated by +the **Illumina MiSeq** sequencing platform. MiCall is primarily +utilized for **HIV** resistance testing, alongside research into +various types of sequence analysis, such as the examination of +**proviral** sequences for defects and other characteristics. As +open-source software, MiCall offers transparency and adaptability, +meeting diverse research needs by supporting the sequencing of various +organisms and accommodating the unique requirements of different +studies. The tool is specialized in deep sequencing of viral samples, +differentiating it from the lab's other sequencing tool, **ReCall**, +which is employed for population-level genomic sequencing. Our +laboratory uses MiCall for sequencing **HIV**, **HCV**, and +**SARS-CoV-2**. + +While the results obtained from MiCall are mainly intended for +research purposes and are not used in clinical settings, there is an +exception for specific **V3 loop** analysis requests. This particular +analysis aids in drug prescription decisions based on mutations in the +**V3** region of **HIV**. However, for general clinical results, the +lab relies on **ReCall**. MiCall remains an invaluable research tool, +offering comprehensive deep sequencing capabilities and robust +analysis of **proviral** sequences. + +### MiCall operates in two modes: + +- **Remapping Mode** + In this mode, MiCall maps all reads from a sample against a set of + reference sequences, updates the sequences, remaps the reads, then + combines the reads into consensus sequences and coverage maps. This + mode is typically used for clinical decisions. + +- **De-Novo Assembly Mode** + This mode assembles sequences from scratch without relying on a + reference genome, then uses them to provide the same kinds of + consensus sequences and coverage maps. Currently, it is used + exclusively for research purposes. 
+ +The operational behavior of MiCall is highly automated to ensure that +results are generated without the need for manual intervention. This +automation allows researchers to focus on interpreting sequencing data +rather than managing the pipeline itself. This is particularly +beneficial for labs with high throughput, providing timely and +consistent data processing. + +### MiCall programmatically interacts with several systems: + +- **QAI** - our general **Laboratory Information Management System**. + Files that define individual runs (these are the `SampleSheet.xml` + files) are produced via QAI's graphical interface and placed in a + specific network location (`/MiSeq/runs/` directory on the + network-mounted **RAW_DATA** drive) monitored by + MiCall. Additionally, MiCall utilizes QAI's web server **REST** + interface to update the database with new run results. + +- **CFE-scripts** - a collection of scripts responsible for producing resistance interpretation reports. + These scripts monitor available and unprocessed results produced by + MiCall, extract and reshape them, and upload them to the + laboratory's database. More specifically, the `miseq_gen_results.rb` + script polls MiCall's resistance interpretation scores, stored in + the same location as all other inputs and outputs, and uploads all + fresh ones. + +- **Kive** - our platform for version control of bioinformatic pipelines. + MiCall uses the **Python** interface of Kive to initiate new jobs, + upload inputs, and download processing results. + +- **MiSeq Hardware** - the physical machines that perform sequencing for the laboratory. + Internally, MiCall consistently monitors specific network locations + for new data from the **MiSeq** sequencers. This data, in the form + of **FASTQ** files (and supporting files, like those containing the + read quality information), triggers MiCall to interact with Kive to + initiate and manage analysis workflows. 
+ +MiCall incorporates several features that enhance the quality and +reliability of sequence analysis. It can be executed using **Docker** +for simpler setups or via **Singularity** containers within the +**Kive** platform for production use. Singularity provides lightweight +and reproducible environments suited for high-performance computing +contexts. Combined with Kive, this ensures that MiCall can operate +efficiently at scale, facilitating reliable and scalable deployment. From f0952e208e058e1a9122a412006ee6ee4497e895 Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Wed, 2 Oct 2024 15:54:09 -0700 Subject: [PATCH 369/485] Document similar source for 2060 microtest. --- micall/tests/microtest/make_sample.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/micall/tests/microtest/make_sample.py b/micall/tests/microtest/make_sample.py index 41fa7e26e..bc8d64bf4 100644 --- a/micall/tests/microtest/make_sample.py +++ b/micall/tests/microtest/make_sample.py @@ -106,6 +106,14 @@ def main(): 315, 20, (CodonMutation(207, 'TCT'),)))), + # FastqFile('2060A-V3LOOP_S8_L001_R1_001.fastq', + # '02060', + # False, + # [FastqSection('HIV1-B-FR-K03455-seed', 7106, 7169, 10)]), + # FastqFile('2060A-V3LOOP_S8_L001_R2_001.fastq', + # '02060', + # True, + # [FastqSection('HIV1-B-FR-K03455-seed', 7160, 7217, 10)]), FastqFile('2070A-PR_S9_L001_R1_001.fastq', '2070', False, @@ -268,6 +276,8 @@ def main(): True, sections_2210_2)] for fastq_file in fastq_files: + # if not fastq_file.name.startswith('2060'): + # continue with open(fastq_file.name, 'w') as f: next_cluster = 1 for section in fastq_file.sections: From 461cfd57e4d2919515f90b3649c71a88b646be84 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 11 Oct 2024 10:19:22 -0700 Subject: [PATCH 370/485] Add get_list_of_executables script --- micall/utils/get_list_of_executables.py | 94 +++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 micall/utils/get_list_of_executables.py diff --git 
a/micall/utils/get_list_of_executables.py b/micall/utils/get_list_of_executables.py new file mode 100644 index 000000000..ff4bd5da7 --- /dev/null +++ b/micall/utils/get_list_of_executables.py @@ -0,0 +1,94 @@ +#! /usr/bin/env python3 + +import argparse +import sys +from typing import Sequence +import os +from pathlib import Path +import re + + +def dir_path(string: str) -> Path: + if os.path.isdir(string): + return Path(string) + else: + raise ValueError("Path %r is not a directory.", string) + + +def parse_arguments(argv: Sequence[str]) -> argparse.Namespace: + """ + Parse command-line arguments. + + Args: + argv: A list of command-line arguments. + + Returns: + A Namespace object containing parsed arguments. + """ + + script_path: Path = Path(__file__).resolve() + base_dir = script_path.parent.parent.parent + + parser = argparse.ArgumentParser( + description="List executable Python scripts.") + parser.add_argument( + '-d', '--directory', type=dir_path, default=base_dir, + help='The root directory to search for Python files (default: parent of the current script\'s directory).' + ) + + return parser.parse_args(argv) + + +def is_executable_script(content: str) -> bool: + if content.startswith("#!"): + return True + + if re.findall(r'__name__\s*==\s*[\'"]__main__', content): + return True + + if 'import argparse' in content: + return True + + if 'from argparse' in content: + return True + + return False + + +def main(argv: Sequence[str]) -> int: + """ + Main function to list the script files. + + Args: + argv: A list of command-line arguments. + + Returns: + An exit status code (0 for success). 
+ """ + + args = parse_arguments(argv) + base_dir: Path = args.directory + + # Iterate over all files in the specified directory + for root, _, files in os.walk(base_dir): + for file in files: + # Process only files with a .py extension + if not file.endswith('.py'): + continue + + file_path = os.path.join(root, file) + try: + with open(file_path, 'r') as f: + content = f.read() + except (IOError, OSError) as e: + print(f"Error reading {file_path}: {e}", file=sys.stderr) + continue + + if is_executable_script(content): + print(file_path) + + return 0 + + +if __name__ == "__main__": + exit(main(sys.argv[1:])) From 50d744a20571a3df301fa6b94f3aa47fc21ca71c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 11 Oct 2024 10:23:37 -0700 Subject: [PATCH 371/485] Simplify get_list_of_executables code --- micall/utils/get_list_of_executables.py | 75 ++++++++----------------- 1 file changed, 24 insertions(+), 51 deletions(-) diff --git a/micall/utils/get_list_of_executables.py b/micall/utils/get_list_of_executables.py index ff4bd5da7..f8a12895d 100644 --- a/micall/utils/get_list_of_executables.py +++ b/micall/utils/get_list_of_executables.py @@ -2,43 +2,12 @@ import argparse import sys -from typing import Sequence +from typing import Sequence, Iterator import os from pathlib import Path import re -def dir_path(string: str) -> Path: - if os.path.isdir(string): - return Path(string) - else: - raise ValueError("Path %r is not a directory.", string) - - -def parse_arguments(argv: Sequence[str]) -> argparse.Namespace: - """ - Parse command-line arguments. - - Args: - argv: A list of command-line arguments. - - Returns: - A Namespace object containing parsed arguments. 
- """ - - script_path: Path = Path(__file__).resolve() - base_dir = script_path.parent.parent.parent - - parser = argparse.ArgumentParser( - description="List executable Python scripts.") - parser.add_argument( - '-d', '--directory', type=dir_path, default=base_dir, - help='The root directory to search for Python files (default: parent of the current script\'s directory).' - ) - - return parser.parse_args(argv) - - def is_executable_script(content: str) -> bool: if content.startswith("#!"): return True @@ -55,6 +24,25 @@ def is_executable_script(content: str) -> bool: return False +def iterate_executables() -> Iterator[Path]: + script_path: Path = Path(__file__).resolve() + base_dir = script_path.parent.parent.parent + + # Iterate over all files in the base directory. + for root, _, files in os.walk(base_dir): + for file in files: + + # Process only files with a .py extension. + if not file.endswith('.py'): + continue + + file_path = Path(root) / file + content = file_path.read_text() + + if is_executable_script(content): + yield file_path + + def main(argv: Sequence[str]) -> int: """ Main function to list the script files. @@ -66,26 +54,11 @@ def main(argv: Sequence[str]) -> int: An exit status code (0 for success). 
""" - args = parse_arguments(argv) - base_dir: Path = args.directory + parser = argparse.ArgumentParser(description="List executable Python scripts.") + parser.parse_args(argv) - # Iterate over all files in the specified directory - for root, _, files in os.walk(base_dir): - for file in files: - # Process only files with a .py extension - if not file.endswith('.py'): - continue - - file_path = os.path.join(root, file) - try: - with open(file_path, 'r') as f: - content = f.read() - except (IOError, OSError) as e: - print(f"Error reading {file_path}: {e}", file=sys.stderr) - continue - - if is_executable_script(content): - print(file_path) + for path in iterate_executables(): + print(path) return 0 From 3b2f0fcfaccf093c2264adcb892a8b6e734e83cf Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 11:01:36 -0700 Subject: [PATCH 372/485] Print relative paths in get_list_of_executables --- micall/utils/get_list_of_executables.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/micall/utils/get_list_of_executables.py b/micall/utils/get_list_of_executables.py index f8a12895d..d43206b0f 100644 --- a/micall/utils/get_list_of_executables.py +++ b/micall/utils/get_list_of_executables.py @@ -40,7 +40,8 @@ def iterate_executables() -> Iterator[Path]: content = file_path.read_text() if is_executable_script(content): - yield file_path + relative = file_path.relative_to(base_dir) + yield relative def main(argv: Sequence[str]) -> int: From d4b668007a8aa652e0dd54f0bde2c0105fb8b743 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 14:30:15 -0700 Subject: [PATCH 373/485] Implement initial pyproject.toml --- micall/main.py | 3 +++ pyproject.toml | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 micall/main.py create mode 100644 pyproject.toml diff --git a/micall/main.py b/micall/main.py new file mode 100644 index 000000000..7285bac13 --- /dev/null +++ b/micall/main.py @@ -0,0 
+1,3 @@ + +def cli(): + print("Hello from MiCall!") # TODO: implement main. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..efd5ec803 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,66 @@ +[project] +name = "micall" +version = "7.17.0" +description = "Pipeline for processing FASTQ data from an Illumina MiSeq to genotype human RNA viruses like HIV and hepatitis C." +authors = [ + { name = "British Columbia Centre for Excellence in HIV/AIDS", email = "vmysak@bccfe.ca" }, +] +license = {text = "AGPL-3.0-or-later"} +readme = "README.md" +requires-python = ">=3.7" +classifiers = [ + "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Operating System :: OS Independent", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] + +dependencies = [ + "requests", +] + +[project.optional-dependencies] +test = [ + # Dependencies required for running the test suite + "pytest>=6.0", + "pytest-cov", +] +dev = [ + # Dependencies required for development (linting, type checking, etc.) + "flake8", + "mypy>=0.800", +] + +[project.scripts] +micall = "micall.main:cli" + +[tool.hatch.build] +include = [ + "micall/", + "basespace/", # TODO: do we really need this in the package? + "gui/", # TODO: do we really need this in the package? 
+ "*.py", +] +exclude = [ + "micall/tests/", +] +skip-excluded-dirs = true + +[project.urls] +homepage = "https://cfe-lab.github.io/MiCall" +repository = "https://github.com/cfe-lab/MiCall" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.metadata] +allow-direct-references = true From 82c8d212cf11af8b1f1fc05cceb698a2efa75092 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 14:43:36 -0700 Subject: [PATCH 374/485] Try calling a top-level script in micall/main.py --- micall/main.py | 10 ++++++++++ micall_kive.py | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/micall/main.py b/micall/main.py index 7285bac13..38f9c0024 100644 --- a/micall/main.py +++ b/micall/main.py @@ -1,3 +1,13 @@ +import sys + +import micall.core.contig_stitcher as stitcher +import release_test_publish as pub + +assert stitcher +assert sys + + def cli(): print("Hello from MiCall!") # TODO: implement main. + pub.main() # Note: just a test. diff --git a/micall_kive.py b/micall_kive.py index b6650cfb6..4c302cc75 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -164,4 +164,5 @@ def main(): tar.add(image_path, archive_path) -main() +if __name__ == '__main__': + main() From b198e96c398c75b9a970a5350b1fc2ac87fced44 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 15:18:19 -0700 Subject: [PATCH 375/485] Add basic installation test --- micall/tests/test_installation.py | 78 +++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 micall/tests/test_installation.py diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py new file mode 100644 index 000000000..8940ef818 --- /dev/null +++ b/micall/tests/test_installation.py @@ -0,0 +1,78 @@ +#! /usr/bin/env python + +""" + +This test is supposed to verify that installation of MiCall is not broken. + +This tests assumes Debian-compatible operating system, such as Ubuntu. 
+It also assumes that python3 and python3-venv are installed. + +It then: + 1. Creates a temporary virtual environment. + 2. Activates the environment. + 3. Installs MiCall via pip. (path to micall is `__file__.parent.parent.parent`) + 4. Runs `micall --version` to check the installation. + +""" + + +import subprocess +import venv +import shutil +from pathlib import Path +from typing import Sequence +import pytest +import shlex + + +@pytest.fixture(scope="function") +def temp_venv(tmpdir: Path): + """ + Fixture for creating and cleaning up a virtual environment. + + This fixture creates a virtual environment in a temporary directory, + provides context to run commands in this environment, and cleans up after the test. + """ + + # Create the virtual environment + venv_dir = tmpdir / "temp_test_venv" + venv.create(venv_dir, with_pip=True) + + # Yield the environment setup to the test function + yield venv_dir / "bin" / "activate" + + # Cleanup the virtual environment after the test + shutil.rmtree(venv_dir) + + +def run_command(command: Sequence[str]): + """Executes a shell command within a provided environment and returns output, error, and return code.""" + + result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return result.stdout.decode('utf-8').strip(), result.stderr.decode('utf-8').strip(), result.returncode + + +def test_micall_installation(temp_venv): + """ + Test to verify installation of MiCall. + + This test installs MiCall in an isolated virtual environment and verifies the installation + by executing the command `micall --version`. + """ + + # Path to MiCall directory (3 levels up from the current script file) + script_path = Path(__file__).resolve() + micall_path = script_path.parent.parent.parent + + # Function to quote shell arguments. + def q(s: object) -> str: + return shlex.quote(str(s)) + + # Install MiCall using pip from the local path + stdout, stderr, returncode = run_command(f". 
{q(temp_venv)} && pip install -- {q(micall_path)}") + assert returncode == 0, f"Failed to install MiCall:\n{stderr}" + + # Check MiCall version to verify installation + stdout, stderr, returncode = run_command(f". {q(temp_venv)} && command -v micall") + assert returncode == 0, f"MiCall version command failed:\n{stderr}" + assert 'micall' in stdout, "Unexpected output for micall path check." From 16203819a61f1240025f1c92cefcd0f6e49900a5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 15:22:02 -0700 Subject: [PATCH 376/485] Make sure no micall installation is leaking in install test --- micall/tests/test_installation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index 8940ef818..a0c3d0222 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -69,10 +69,10 @@ def q(s: object) -> str: return shlex.quote(str(s)) # Install MiCall using pip from the local path - stdout, stderr, returncode = run_command(f". {q(temp_venv)} && pip install -- {q(micall_path)}") + stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && pip install -- {q(micall_path)}") assert returncode == 0, f"Failed to install MiCall:\n{stderr}" # Check MiCall version to verify installation - stdout, stderr, returncode = run_command(f". {q(temp_venv)} && command -v micall") + stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && command -v micall") assert returncode == 0, f"MiCall version command failed:\n{stderr}" assert 'micall' in stdout, "Unexpected output for micall path check." 
From 93415bf2668cc377d73785b80280370197d389a4 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 15:24:11 -0700 Subject: [PATCH 377/485] Fix documentation for micall installation test --- micall/tests/test_installation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index a0c3d0222..d10bd0568 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -10,8 +10,8 @@ It then: 1. Creates a temporary virtual environment. 2. Activates the environment. - 3. Installs MiCall via pip. (path to micall is `__file__.parent.parent.parent`) - 4. Runs `micall --version` to check the installation. + 3. Installs MiCall via pip. + 4. Runs `command -v micall` to check the installation. """ @@ -57,7 +57,7 @@ def test_micall_installation(temp_venv): Test to verify installation of MiCall. This test installs MiCall in an isolated virtual environment and verifies the installation - by executing the command `micall --version`. + by executing the command `command -v micall`. """ # Path to MiCall directory (3 levels up from the current script file) @@ -75,4 +75,4 @@ def q(s: object) -> str: # Check MiCall version to verify installation stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && command -v micall") assert returncode == 0, f"MiCall version command failed:\n{stderr}" - assert 'micall' in stdout, "Unexpected output for micall path check." + assert stdout.endswith('micall'), "Unexpected output for micall path check." 
From adc3be5af325527c22d43b3026812345d37613bb Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 15:26:36 -0700 Subject: [PATCH 378/485] Make sure no micall is installed before our installation test --- micall/tests/test_installation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index d10bd0568..d901b1d90 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -68,6 +68,10 @@ def test_micall_installation(temp_venv): def q(s: object) -> str: return shlex.quote(str(s)) + # Check that MiCall is not installed. + stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && command -v micall") + assert returncode != 0, "Unexpected MiCall installation." + # Install MiCall using pip from the local path stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && pip install -- {q(micall_path)}") assert returncode == 0, f"Failed to install MiCall:\n{stderr}" From 22915b44509403c9b39648539c472e6312801a2e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 15:27:40 -0700 Subject: [PATCH 379/485] No need to remove venv dir in installation test --- micall/tests/test_installation.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index d901b1d90..881026342 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -18,7 +18,6 @@ import subprocess import venv -import shutil from pathlib import Path from typing import Sequence import pytest @@ -41,9 +40,6 @@ def temp_venv(tmpdir: Path): # Yield the environment setup to the test function yield venv_dir / "bin" / "activate" - # Cleanup the virtual environment after the test - shutil.rmtree(venv_dir) - def run_command(command: Sequence[str]): """Executes a shell command within a provided environment and returns output, error, and return code.""" From 
96e62f346db08e209d2efeffe918b985dc7b896f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 15:48:16 -0700 Subject: [PATCH 380/485] Copy requirements* to pyproject.toml --- micall/tests/test_installation.py | 2 +- pyproject.toml | 49 +++++++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index 881026342..daf4be42e 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -69,7 +69,7 @@ def q(s: object) -> str: assert returncode != 0, "Unexpected MiCall installation." # Install MiCall using pip from the local path - stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && pip install -- {q(micall_path)}") + stdout, stderr, returncode = run_command(f". {q(temp_venv)} && pip install -- {q(micall_path)}") assert returncode == 0, f"Failed to install MiCall:\n{stderr}" # Check MiCall version to verify installation diff --git a/pyproject.toml b/pyproject.toml index efd5ec803..084d46203 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,19 +24,58 @@ classifiers = [ ] dependencies = [ - "requests", + # Requirements for running the bare pipeline under Kive. 
+ "pytz==2024.1", + "iva @ git+https://github.com/cfe-lab/iva.git@v1.1.1", + "biopython==1.83", + "gotoh @ git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh", + "pyvdrm @ git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2", + "numpy==1.24.3", + "scipy==1.10.1", + "matplotlib==3.7.3", + "cutadapt==4.8", + "python-Levenshtein==0.25.1", + "PyYAML==6.0.1", + "reportlab==4.2.0", + "pysam==0.22.1", + "genetracks @ git+https://github.com/cfe-lab/genetracks.git@v0.4", + "mappy==2.17", + "drawsvg==2.3.0", + "cairosvg==2.7.1", + "aligntools", ] [project.optional-dependencies] test = [ # Dependencies required for running the test suite - "pytest>=6.0", - "pytest-cov", + "pytest==8.2.2", + "coverage==7.5.3", + "pandas==2.0.2", + "seaborn==0.13.2", + "ete3", + # For reading HCV rules from an Excel file. + "openpyxl==3.1.3", + + # Also includes watcher's dependencies. + "kiveapi @ git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api", + "requests==2.32.3", ] dev = [ # Dependencies required for development (linting, type checking, etc.) 
- "flake8", - "mypy>=0.800", + "flake8==7.0.0", + "mypy==1.10.0", + "mypy-extensions==1.0.0", + "gprof2dot==2024.6.6", + "utilities", +] +watcher = [ + # Requirements for running the MISEQ_MONITOR.py script + "kiveapi @ git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api", + "requests==2.32.3", +] +basespace = [ + # Requirements for running the micall_basespace.py script + "requests==2.32.3", ] [project.scripts] From c3d655420cc6aefe6e6326dd8d2f69ab0ad9391c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 16:38:07 -0700 Subject: [PATCH 381/485] Test MiCall --version output --- micall/main.py | 13 +++++++++++-- micall/tests/test_installation.py | 32 ++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/micall/main.py b/micall/main.py index 38f9c0024..84b50896e 100644 --- a/micall/main.py +++ b/micall/main.py @@ -1,13 +1,22 @@ import sys +from importlib.metadata import version import micall.core.contig_stitcher as stitcher import release_test_publish as pub assert stitcher assert sys +assert pub + + +def get_version() -> str: + if __package__ is None: + return "development" + else: + return str(version(__package__)) def cli(): - print("Hello from MiCall!") # TODO: implement main. - pub.main() # Note: just a test. + print(get_version()) + print('Bye!') diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index daf4be42e..41608c778 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -6,12 +6,13 @@ This tests assumes Debian-compatible operating system, such as Ubuntu. It also assumes that python3 and python3-venv are installed. +For the version check it also assumes that git is installed, and that the test is run in micall git repository. It then: 1. Creates a temporary virtual environment. 2. Activates the environment. 3. Installs MiCall via pip. - 4. Runs `command -v micall` to check the installation. + 4. 
Runs various shell commands to check the installation. """ @@ -22,6 +23,7 @@ from typing import Sequence import pytest import shlex +import re @pytest.fixture(scope="function") @@ -76,3 +78,31 @@ def q(s: object) -> str: stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && command -v micall") assert returncode == 0, f"MiCall version command failed:\n{stderr}" assert stdout.endswith('micall'), "Unexpected output for micall path check." + + +def test_micall_version(temp_venv): + """ + Test to verify installation of MiCall. + + This test installs MiCall in an isolated virtual environment and verifies the installation + by executing the command `micall --version`. + """ + + # Path to MiCall directory (3 levels up from the current script file) + script_path = Path(__file__).resolve() + micall_path = script_path.parent.parent.parent + + # Function to quote shell arguments. + def q(s: object) -> str: + return shlex.quote(str(s)) + + # Install MiCall using pip from the local path + stdout, stderr, returncode = run_command(f". {q(temp_venv)} && pip install -- {q(micall_path)}") + assert returncode == 0, f"Failed to install MiCall:\n{stderr}" + + # Check MiCall version to verify installation + stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && micall --version") + assert returncode == 0, f"MiCall version command failed:\n{stderr}" + lines = [line.strip() for line in stdout.split('\n')] + first_line = lines[0].strip() + assert re.match(r'(\d+[.]\d+[.]\d+)|development', first_line), "Unexpected output for micall version check." 
From 2d2d28d9842640c46a7af43ceaa2341113c11f5e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 15 Oct 2024 16:56:13 -0700 Subject: [PATCH 382/485] Update installation test source comments --- micall/tests/test_installation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index 41608c778..2247ecf77 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -6,7 +6,6 @@ This tests assumes Debian-compatible operating system, such as Ubuntu. It also assumes that python3 and python3-venv are installed. -For the version check it also assumes that git is installed, and that the test is run in micall git repository. It then: 1. Creates a temporary virtual environment. @@ -74,9 +73,9 @@ def q(s: object) -> str: stdout, stderr, returncode = run_command(f". {q(temp_venv)} && pip install -- {q(micall_path)}") assert returncode == 0, f"Failed to install MiCall:\n{stderr}" - # Check MiCall version to verify installation + # Check MiCall executable path to verify installation stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && command -v micall") - assert returncode == 0, f"MiCall version command failed:\n{stderr}" + assert returncode == 0, f"Cound not find MiCall installation:\n{stderr}" assert stdout.endswith('micall'), "Unexpected output for micall path check." From 27a2cfde0be5a808646a64db60d7fd2dc61dda8b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 10:30:56 -0700 Subject: [PATCH 383/485] Implement ctl part of main.py --- micall/main.py | 140 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 132 insertions(+), 8 deletions(-) diff --git a/micall/main.py b/micall/main.py index 84b50896e..49cd64727 100644 --- a/micall/main.py +++ b/micall/main.py @@ -1,13 +1,101 @@ +#! 
/usr/bin/env python import sys +import argparse +import os +from typing import Sequence +from pathlib import Path from importlib.metadata import version -import micall.core.contig_stitcher as stitcher -import release_test_publish as pub -assert stitcher -assert sys -assert pub +EXECUTABLES = [ + "release_test_publish.py", + "micall_kive.py", + "micall_watcher.py", + "release_test_microtest.py", + "docker_build.py", + "release_test_setup.py", + "release_test_compare.py", + "micall_kive_resistance.py", + "micall_docker.py", + "micall/main.py", + "micall/resistance/genreport.py", + "micall/resistance/resistance.py", + "micall/resistance/pdfreport.py", + "micall/core/filter_quality.py", + "micall/core/sam2aln.py", + "micall/core/denovo.py", + "micall/core/trim_fastqs.py", + "micall/core/plot_contigs.py", + "micall/core/cascade_report.py", + "micall/core/remap.py", + "micall/core/prelim_map.py", + "micall/core/aln2counts.py", + "micall/core/contig_stitcher.py", + "micall/core/coverage_plots.py", + "micall/core/plot_simple.py", + "micall/tests/test_installation.py", + "micall/tests/test_hcv_rules_import.py", + "micall/g2p/fastq_g2p.py", + "micall/blast_db/make_blast_db.py", + "micall/utils/concordance_evaluation.py", + "micall/utils/basespace_upload.py", + "micall/utils/compare_mapping.py", + "micall/utils/project_seeds_from_compendium.py", + "micall/utils/fasta_to_csv.py", + "micall/utils/hcv_rules_import.py", + "micall/utils/dd.py", + "micall/utils/find_reads_in_sam.py", + "micall/utils/hcv_rules_display.py", + "micall/utils/coverage_data.py", + "micall/utils/find_by_coverage.py", + "micall/utils/primer_locations.py", + "micall/utils/fetch_sequences.py", + "micall/utils/sam_g2p_simplify.py", + "micall/utils/contig_summary.py", + "micall/utils/compare_454_samples.py", + "micall/utils/genreport_rerun.py", + "micall/utils/remove_dupe_dirs.py", + "micall/utils/find_missing_samples.py", + "micall/utils/denovo_simplify.py", + "micall/utils/sort_sam.py", + 
"micall/utils/sample_fastq.py", + "micall/utils/sample_sheet_parser.py", + "micall/utils/projects_upload.py", + "micall/utils/projects_dump.py", + "micall/utils/find_chimera.py", + "micall/utils/probe_finder.py", + "micall/utils/aln2counts_simplify.py", + "micall/utils/samples_from_454.py", + "micall/utils/amplicon_finder.py", + "micall/utils/driver_utils.py", + "micall/utils/seed_alignments.py", + "micall/utils/remap_fastq_simplify.py", + "micall/utils/contig_counts.py", + "micall/utils/ref_aligner.py", + "micall/utils/scan_run_folders.py", + "micall/utils/contig_blaster.py", + "micall/utils/hcv_reference_tree.py", + "micall/utils/sample_project_summary.py", + "micall/utils/get_list_of_executables.py", + "micall/monitor/update_qai.py", + "micall/tcr/igblast.py", +] + + +def executable_name(path: str) -> str: + file_name = Path(path).name + name, extension = os.path.splitext(file_name) + return name + + +def executable_module(path: str) -> str: + noext, extension = os.path.splitext(path) + pythized = noext.replace(os.path.sep, '.') + return pythized + + +EXECUTABLES_MAP = {executable_name(path): path for path in EXECUTABLES} def get_version() -> str: @@ -17,6 +105,42 @@ def get_version() -> str: return str(version(__package__)) -def cli(): - print(get_version()) - print('Bye!') +def get_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Run MiCall script.", add_help=False) + parser.add_argument("--version", action="store_true", help="Print version and exit.") + parser.add_argument('--help', action='store_true', help='Show this help message and exit.') + parser.add_argument("program", nargs='?', choices=EXECUTABLES_MAP.keys(), help="Program name.") + parser.add_argument("arguments", nargs=argparse.REMAINDER, help="Program arguments.") + return parser + + +def main(argv: Sequence[str]) -> int: + parser = get_parser() + args = parser.parse_args(argv) + + if args.version: + print(get_version()) + return 0 + + elif args.help: + 
parser.print_help() + return 0 + + elif EXECUTABLES_MAP.get(args.program): + path = EXECUTABLES_MAP[args.program] + mod = executable_module(path) + evalstring = f'__import__({mod!r}, fromlist=[""])' + evaluated_module = eval(evalstring) + return evaluated_module.main(args.arguments) + + else: + parser.print_help() + return 1 + + +def cli() -> int: + return main(sys.argv[1:]) + + +if __name__ == '__main__': + exit(cli()) From 72a24cce7d8ec65fa9c9434ceba24595054da9c7 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 10:47:20 -0700 Subject: [PATCH 384/485] main: do not assume that imported module has main --- micall/main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/micall/main.py b/micall/main.py index 49cd64727..52cd1606a 100644 --- a/micall/main.py +++ b/micall/main.py @@ -3,6 +3,7 @@ import sys import argparse import os +import runpy from typing import Sequence from pathlib import Path from importlib.metadata import version @@ -98,6 +99,12 @@ def executable_module(path: str) -> str: EXECUTABLES_MAP = {executable_name(path): path for path in EXECUTABLES} +def execute_module_as_main(module_name: str, arguments: Sequence[str]) -> int: + sys.argv = [module_name] + list(arguments) + runpy.run_module(module_name, run_name='__main__', alter_sys=True) + return 0 + + def get_version() -> str: if __package__ is None: return "development" @@ -129,9 +136,7 @@ def main(argv: Sequence[str]) -> int: elif EXECUTABLES_MAP.get(args.program): path = EXECUTABLES_MAP[args.program] mod = executable_module(path) - evalstring = f'__import__({mod!r}, fromlist=[""])' - evaluated_module = eval(evalstring) - return evaluated_module.main(args.arguments) + return execute_module_as_main(mod, args.arguments) else: parser.print_help() From 66843b15c2d7361266fa7d8a5ad48c4f7d2f0cca Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 10:58:09 -0700 Subject: [PATCH 385/485] Add a test for main --help --- 
micall/tests/test_installation.py | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index 2247ecf77..7d2543003 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -23,6 +23,8 @@ import pytest import shlex import re +import os +from micall.utils.get_list_of_executables import iterate_executables @pytest.fixture(scope="function") @@ -105,3 +107,34 @@ def q(s: object) -> str: lines = [line.strip() for line in stdout.split('\n')] first_line = lines[0].strip() assert re.match(r'(\d+[.]\d+[.]\d+)|development', first_line), "Unexpected output for micall version check." + + +def test_micall_help(temp_venv): + """ + Test to verify installation of MiCall. + + This test installs MiCall in an isolated virtual environment and verifies the installation + by executing the command `micall --help`. + """ + + # These are supposed to be listed in output of --help. + executables = [os.path.splitext(path.name)[0] for path in iterate_executables()] + + # Path to MiCall directory (3 levels up from the current script file) + script_path = Path(__file__).resolve() + micall_path = script_path.parent.parent.parent + + # Function to quote shell arguments. + def q(s: object) -> str: + return shlex.quote(str(s)) + + # Install MiCall using pip from the local path + stdout, stderr, returncode = run_command(f". {q(temp_venv)} && pip install -- {q(micall_path)}") + assert returncode == 0, f"Failed to install MiCall:\n{stderr}" + + # Check MiCall help to verify installation + stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && micall --help") + assert returncode == 0, f"MiCall help command failed:\n{stderr}" + + for executable in executables: + assert executable in stdout, f"Executable {executable!r} not listed in micall --help." 
From b94c02329ff600665a52eeb96e99e32301e8ff50 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 11:12:16 -0700 Subject: [PATCH 386/485] Document main.py --- micall/main.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/micall/main.py b/micall/main.py index 52cd1606a..d52786b9f 100644 --- a/micall/main.py +++ b/micall/main.py @@ -1,5 +1,11 @@ #! /usr/bin/env python +""" +This script serves as the primary command-line interface for MiCall, +enabling users to execute various MiCall components directly +from the terminal. +""" + import sys import argparse import os @@ -8,7 +14,8 @@ from pathlib import Path from importlib.metadata import version - +# Run micall/utils/get_list_of_executables.py to get the up-to-date list of these executables. +# The consistency of this list is verified in micall/tests/test_installation.py EXECUTABLES = [ "release_test_publish.py", "micall_kive.py", From 94149137162b4fc9a53a63c6f6006a52f69bf94e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 11:28:59 -0700 Subject: [PATCH 387/485] Remove nonexistant dev dependency --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 084d46203..83b0e5844 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,6 @@ dev = [ "mypy==1.10.0", "mypy-extensions==1.0.0", "gprof2dot==2024.6.6", - "utilities", ] watcher = [ # Requirements for running the MISEQ_MONITOR.py script From 8a3bd953bbedcc0088b0d7adc229a5b24adc2a15 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 11:19:47 -0700 Subject: [PATCH 388/485] Switch to pyproject.toml installation --- .devcontainer/devcontainer.json | 4 ++-- .github/workflows/build-and-test.yml | 2 +- Dockerfile | 4 ++-- Singularity | 26 +++++++++++++------------- dev.dockerfile | 4 ++-- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ebd16c0b4..561ade051 
100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -15,7 +15,7 @@ // "forwardPorts": [], // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "python -m pip install -r requirements-dev.txt", + "postCreateCommand": "python -m pip install .[dev]", // Configure tool-specific properties. "customizations": { @@ -26,4 +26,4 @@ // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root" -} \ No newline at end of file +} diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index efe6ff494..fb75d6fc1 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -57,7 +57,7 @@ jobs: # in Singularity build with the wrong GLIBC libraries. run: | python -m pip install --upgrade pip - pip install -r requirements-test.txt + pip install .[dev,test] - name: Run type checker run: | diff --git a/Dockerfile b/Dockerfile index 2a0c7ea41..dc71d699e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -86,12 +86,12 @@ RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \ ## Install dependencies for genetracks/drawsvg RUN apt-get install -q -y libcairo2-dev -COPY requirements.txt requirements-basespace.txt /opt/micall/ +COPY pyproject.toml /opt/micall/ ## Python packages, plus trigger matplotlib to build its font cache WORKDIR /opt RUN pip install --upgrade pip && \ - pip install -r /opt/micall/requirements-basespace.txt && \ + pip install .[basespace] && \ python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' ## MiCall diff --git a/Singularity b/Singularity index 6ac3a38df..9d4aba101 100644 --- a/Singularity +++ b/Singularity @@ -35,22 +35,22 @@ From: python:3.8 .git /opt/micall/.git ## MiCall - micall_docker.py /opt/micall/ - micall_kive.py /opt/micall/ + micall_docker.py /opt/micall/ + micall_kive.py /opt/micall/ micall_kive_resistance.py 
/opt/micall/ - micall/__init__.py /opt/micall/micall/ - micall/project* /opt/micall/micall/ + micall/__init__.py /opt/micall/micall/ + micall/project* /opt/micall/micall/ - micall/core /opt/micall/micall/core - micall/data /opt/micall/micall/data + micall/core /opt/micall/micall/core + micall/data /opt/micall/micall/data micall/drivers /opt/micall/micall/drivers - micall/g2p /opt/micall/micall/g2p - micall/resistance /opt/micall/micall/resistance - micall/monitor /opt/micall/micall/monitor - micall/utils /opt/micall/micall/utils + micall/g2p /opt/micall/micall/g2p + micall/resistance /opt/micall/micall/resistance + micall/monitor /opt/micall/micall/monitor + micall/utils /opt/micall/micall/utils - requirements.txt /opt/micall/ - requirements-basespace.txt /opt/micall/ + README.md /opt/micall/ + pyproject.toml /opt/micall/ ## HCV genotyping database micall/blast_db /opt/micall/micall/blast_db @@ -120,7 +120,7 @@ From: python:3.8 # Also trigger matplotlib to build its font cache. cd /opt pip install --upgrade pip - pip install -r /opt/micall/requirements.txt + pip install /opt/micall python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' python /opt/micall/micall/blast_db/make_blast_db.py diff --git a/dev.dockerfile b/dev.dockerfile index ca63d75ba..31b4b2906 100644 --- a/dev.dockerfile +++ b/dev.dockerfile @@ -17,5 +17,5 @@ FROM docker.illumina.com/cfe_lab/micall:latest ## Add the dev packages. 
-COPY requirements-test.txt requirements-watcher.txt requirements-dev.txt /opt/micall/ -RUN pip install -r /opt/micall/requirements-dev.txt +COPY pyproject.toml /opt/micall/ +RUN pip install -r .[dev] From 460490369354a8ef1fef3f184359ae7914b7fe8d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 11:14:09 -0700 Subject: [PATCH 389/485] Delete pythons's requirements*.txt --- requirements-basespace.txt | 3 --- requirements-dev.txt | 7 ------- requirements-test.txt | 15 --------------- requirements-watcher.txt | 3 --- requirements.txt | 19 ------------------- 5 files changed, 47 deletions(-) delete mode 100644 requirements-basespace.txt delete mode 100644 requirements-dev.txt delete mode 100644 requirements-test.txt delete mode 100644 requirements-watcher.txt delete mode 100644 requirements.txt diff --git a/requirements-basespace.txt b/requirements-basespace.txt deleted file mode 100644 index 8613fa60d..000000000 --- a/requirements-basespace.txt +++ /dev/null @@ -1,3 +0,0 @@ -# Requirements for running the micall_basespace.py script --r requirements.txt -requests==2.32.3 diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 0ce57d1fb..000000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ -# Requirements for running the tests, doing development, and using utilities --r requirements-test.txt --r requirements-watcher.txt -# Used for plotting profiling results. -gprof2dot==2024.6.6 -mypy==1.10.0 -mypy-extensions==1.0.0 diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index 6484b879e..000000000 --- a/requirements-test.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Requirements for running the tests - --r requirements.txt --r requirements-watcher.txt -mypy==1.10.0 -mypy-extensions==1.0.0 -pytest==8.2.2 -coverage==7.5.3 -pandas==2.0.2 -seaborn==0.13.2 -ete3 -flake8==7.0.0 - -# For reading HCV rules from an Excel file. 
-openpyxl==3.1.3 diff --git a/requirements-watcher.txt b/requirements-watcher.txt deleted file mode 100644 index 7cc63208e..000000000 --- a/requirements-watcher.txt +++ /dev/null @@ -1,3 +0,0 @@ -# Requirements for running the MISEQ_MONITOR.py script -git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api -requests==2.32.3 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1f42f161d..000000000 --- a/requirements.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Requirements for running the bare pipeline under Kive or BaseSpace -pytz==2024.1 -git+https://github.com/cfe-lab/iva.git@v1.1.1 -biopython==1.83 -git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh -git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2 -numpy==1.24.3 -scipy==1.10.1 -matplotlib==3.7.3 -cutadapt==4.8 -python-Levenshtein==0.25.1 -PyYAML==6.0.1 -reportlab==4.2.0 -pysam==0.22.1 -git+https://github.com/cfe-lab/genetracks.git@v0.4 -mappy==2.17 -drawsvg==2.3.0 -cairosvg==2.7.1 -git+https://github.com/cfe-lab/aligntools.git@v1.0.3 From 09991fa701896a670f560c36b6389213445086d5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 11:16:06 -0700 Subject: [PATCH 390/485] Update installation instructions --- docs/admin.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/admin.md b/docs/admin.md index 5841e9dd3..2c5bf6a19 100644 --- a/docs/admin.md +++ b/docs/admin.md @@ -72,7 +72,7 @@ Create a Python 3.6 virtual environment to run MiCall. $ cd /usr/local/share $ sudo python3.6 -m venv venv-micall $ cd MiCall - $ sudo ../venv-micall/bin/pip install -r requirements-watcher.txt + $ sudo ../venv-micall/bin/pip install .[watcher] Copy the logging configuration if you want to change any of the settings. 
From d1d7a0e1465ea6f45eb608282dd492368dbcaf46 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 12:57:56 -0700 Subject: [PATCH 391/485] Add a test for duplicate executables names --- micall/tests/test_installation.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index 7d2543003..feb82b8ff 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -24,6 +24,7 @@ import shlex import re import os +from itertools import groupby from micall.utils.get_list_of_executables import iterate_executables @@ -138,3 +139,18 @@ def q(s: object) -> str: for executable in executables: assert executable in stdout, f"Executable {executable!r} not listed in micall --help." + + +def test_executables_names(): + """ + Verify that there is no duplication in names of executables. + """ + + def get_name(path: Path) -> str: + return os.path.splitext(path.name)[0] + + executables = list(iterate_executables()) + + for key, group in groupby(executables, key=get_name): + group = list(map(str, group)) + assert len(group) == 1, f"Scripts {group!r} share the same executable name." 
From f29d5d0d4862ec1946eabafa47773d7fd70feb79 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 13:56:58 -0700 Subject: [PATCH 392/485] Move mypy config to pyproject.toml --- .mypy.ini | 113 ------------------------------------------- pyproject.toml | 128 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 113 deletions(-) delete mode 100644 .mypy.ini diff --git a/.mypy.ini b/.mypy.ini deleted file mode 100644 index b9f38fe59..000000000 --- a/.mypy.ini +++ /dev/null @@ -1,113 +0,0 @@ -[mypy] -files = micall/ - -exclude = (?x) - ^micall/core/aln2counts[.]py$| - ^micall/core/amplicon_finder[.]py$| - ^micall/core/cascade_report[.]py$| - ^micall/core/consensus_builder[.]py$| - ^micall/core/coverage_plots[.]py$| - ^micall/core/plot_simple[.]py$| - ^micall/core/prelim_map[.]py$| - ^micall/core/remap[.]py$| - ^micall/core/sam2aln[.]py$| - ^micall/core/trim_fastqs[.]py$| - ^micall/drivers/run_info[.]py$| - ^micall/drivers/sample_group[.]py$| - ^micall/drivers/sample[.]py$| - ^micall/g2p/fastq_g2p[.]py$| - ^micall/g2p/pssm_lib[.]py$| - ^micall/monitor/find_groups[.]py$| - ^micall/monitor/kive_watcher[.]py$| - ^micall/monitor/qai_helper[.]py$| - ^micall/monitor/sample_watcher[.]py$| - ^micall/monitor/update_qai[.]py$| - ^micall/resistance/asi_algorithm[.]py$| - ^micall/resistance/genreport[.]py$| - ^micall/resistance/pdfreport[.]py$| - ^micall/resistance/resistance[.]py$| - ^micall/tests/.*| - ^micall/utils/aln2counts_simplify[.]py$| - ^micall/utils/amplicon_finder[.]py$| - ^micall/utils/big_counter[.]py$| - ^micall/utils/compare_454_samples[.]py$| - ^micall/utils/compare_mapping[.]py$| - ^micall/utils/concordance_evaluation[.]py$| - ^micall/utils/conseq_compare[.]py$| - ^micall/utils/contig_blaster[.]py$| - ^micall/utils/contig_counts[.]py$| - ^micall/utils/contig_summary[.]py$| - ^micall/utils/convert_primers_cutadapt[.]py$| - ^micall/utils/coverage_data[.]py$| - ^micall/utils/dd[.]py$| - 
^micall/utils/denovo_simplify[.]py$| - ^micall/utils/externals[.]py$| - ^micall/utils/fetch_sequences[.]py$| - ^micall/utils/find_by_coverage[.]py$| - ^micall/utils/find_chimera[.]py$| - ^micall/utils/find_missing_samples[.]py$| - ^micall/utils/find_reads_in_sam[.]py$| - ^micall/utils/genreport_rerun[.]py$| - ^micall/utils/hcv_reference_tree[.]py$| - ^micall/utils/hcv_rules_display[.]py$| - ^micall/utils/hcv_rules_import[.]py$| - ^micall/utils/make_fda_refs[.]py$| - ^micall/utils/make_hcv_genotype_refs[.]py$| - ^micall/utils/primer_tracker[.]py$| - ^micall/utils/probe_finder[.]py$| - ^micall/utils/projects_dump[.]py$| - ^micall/utils/project_seeds_from_compendium[.]py$| - ^micall/utils/projects_upload[.]py$| - ^micall/utils/ref_aligner[.]py$| - ^micall/utils/reference_distances[.]py$| - ^micall/utils/remap_fastq_simplify[.]py$| - ^micall/utils/sam_g2p_simplify[.]py$| - ^micall/utils/sample_fastq[.]py$| - ^micall/utils/sample_project_summary[.]py$| - ^micall/utils/samples_from_454[.]py$| - ^micall/utils/sample_sheet_parser[.]py$| - ^micall/utils/scan_run_folders[.]py$| - ^micall/utils/seed_alignments[.]py$| - ^micall/utils/spring_beads[.]py$| - ^micall/utils/v3loop_alignment_scores/plot[.]py$ - -check_untyped_defs = True - -[mypy-micall.core.plot_contigs] -check_untyped_defs = False - -[mypy-mappy] -ignore_missing_imports = true - -[mypy-gotoh] -ignore_missing_imports = true - -[mypy-yaml] -ignore_missing_imports = true - -[mypy-Bio] -ignore_missing_imports = true - -[mypy-Bio.Seq] -ignore_missing_imports = True - -[mypy-Bio.Blast.Applications] -ignore_missing_imports = True - -[mypy-Bio.SeqRecord] -ignore_missing_imports = True - -[mypy-genetracks] -ignore_missing_imports = true - -[mypy-genetracks.elements] -ignore_missing_imports = true - -[mypy-drawsvg] -ignore_missing_imports = true - -[mypy-matplotlib] -ignore_missing_imports = true - -[mypy-matplotlib.colors] -ignore_missing_imports = true diff --git a/pyproject.toml b/pyproject.toml index 
83b0e5844..51a154cf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,3 +102,131 @@ build-backend = "hatchling.build" [tool.hatch.metadata] allow-direct-references = true + +[tool.mypy] +files = ["micall/"] + +exclude = '''(?x) + ^micall/core/aln2counts[.]py$| + ^micall/core/amplicon_finder[.]py$| + ^micall/core/cascade_report[.]py$| + ^micall/core/consensus_builder[.]py$| + ^micall/core/coverage_plots[.]py$| + ^micall/core/plot_simple[.]py$| + ^micall/core/prelim_map[.]py$| + ^micall/core/remap[.]py$| + ^micall/core/sam2aln[.]py$| + ^micall/core/trim_fastqs[.]py$| + ^micall/drivers/run_info[.]py$| + ^micall/drivers/sample_group[.]py$| + ^micall/drivers/sample[.]py$| + ^micall/g2p/fastq_g2p[.]py$| + ^micall/g2p/pssm_lib[.]py$| + ^micall/monitor/find_groups[.]py$| + ^micall/monitor/kive_watcher[.]py$| + ^micall/monitor/qai_helper[.]py$| + ^micall/monitor/sample_watcher[.]py$| + ^micall/monitor/update_qai[.]py$| + ^micall/resistance/asi_algorithm[.]py$| + ^micall/resistance/genreport[.]py$| + ^micall/resistance/pdfreport[.]py$| + ^micall/resistance/resistance[.]py$| + ^micall/tests/.*| + ^micall/utils/aln2counts_simplify[.]py$| + ^micall/utils/amplicon_finder[.]py$| + ^micall/utils/big_counter[.]py$| + ^micall/utils/compare_454_samples[.]py$| + ^micall/utils/compare_mapping[.]py$| + ^micall/utils/concordance_evaluation[.]py$| + ^micall/utils/conseq_compare[.]py$| + ^micall/utils/contig_blaster[.]py$| + ^micall/utils/contig_counts[.]py$| + ^micall/utils/contig_summary[.]py$| + ^micall/utils/convert_primers_cutadapt[.]py$| + ^micall/utils/coverage_data[.]py$| + ^micall/utils/dd[.]py$| + ^micall/utils/denovo_simplify[.]py$| + ^micall/utils/externals[.]py$| + ^micall/utils/fetch_sequences[.]py$| + ^micall/utils/find_by_coverage[.]py$| + ^micall/utils/find_chimera[.]py$| + ^micall/utils/find_missing_samples[.]py$| + ^micall/utils/find_reads_in_sam[.]py$| + ^micall/utils/genreport_rerun[.]py$| + ^micall/utils/hcv_reference_tree[.]py$| + 
^micall/utils/hcv_rules_display[.]py$| + ^micall/utils/hcv_rules_import[.]py$| + ^micall/utils/make_fda_refs[.]py$| + ^micall/utils/make_hcv_genotype_refs[.]py$| + ^micall/utils/primer_tracker[.]py$| + ^micall/utils/probe_finder[.]py$| + ^micall/utils/projects_dump[.]py$| + ^micall/utils/project_seeds_from_compendium[.]py$| + ^micall/utils/projects_upload[.]py$| + ^micall/utils/ref_aligner[.]py$| + ^micall/utils/reference_distances[.]py$| + ^micall/utils/remap_fastq_simplify[.]py$| + ^micall/utils/sam_g2p_simplify[.]py$| + ^micall/utils/sample_fastq[.]py$| + ^micall/utils/sample_project_summary[.]py$| + ^micall/utils/samples_from_454[.]py$| + ^micall/utils/sample_sheet_parser[.]py$| + ^micall/utils/scan_run_folders[.]py$| + ^micall/utils/seed_alignments[.]py$| + ^micall/utils/spring_beads[.]py$| + ^micall/utils/v3loop_alignment_scores/plot[.]py$ +''' + +check_untyped_defs = true + +[[tool.mypy.overrides]] +module = "micall.core.plot_contigs" +check_untyped_defs = false + +[[tool.mypy.overrides]] +module = "mappy" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "gotoh" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "yaml" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "Bio" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "Bio.Seq" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "Bio.Blast.Applications" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "Bio.SeqRecord" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "genetracks" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "genetracks.elements" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "drawsvg" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "matplotlib" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "matplotlib.colors" +ignore_missing_imports = true From 
b220d6adf1b70bf91e63031e6e8391a3c805cab5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 14:04:59 -0700 Subject: [PATCH 393/485] Move pytest config to pyproject.toml --- pyproject.toml | 5 +++++ pytest.ini | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) delete mode 100644 pytest.ini diff --git a/pyproject.toml b/pyproject.toml index 51a154cf5..f7acd8c35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,6 +103,11 @@ build-backend = "hatchling.build" [tool.hatch.metadata] allow-direct-references = true +[tool.pytest.ini_options] +markers = [ + "iva: test requires IVA (deselect with '-m \"not iva\"')" +] + [tool.mypy] files = ["micall/"] diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index e556e61ca..000000000 --- a/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -markers = - iva: test requires IVA (deselect with '-m "not iva"') From cf58bfabda0bacfc7526751dde97fb63579a5a00 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 14:11:06 -0700 Subject: [PATCH 394/485] Specify more concrete Python version --- pyproject.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f7acd8c35..3a1bc8372 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,15 +7,12 @@ authors = [ ] license = {text = "AGPL-3.0-or-later"} readme = "README.md" -requires-python = ">=3.7" +requires-python = ">=3.8,<3.9" classifiers = [ "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", "Intended Audience :: Science/Research", From 65e5a3af82dd2225702e020a88492bb0e20b9e7e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak 
Date: Wed, 16 Oct 2024 14:35:05 -0700 Subject: [PATCH 395/485] Optimize installation tests --- micall/tests/test_installation.py | 87 ++++++++++++++----------------- 1 file changed, 39 insertions(+), 48 deletions(-) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index feb82b8ff..c76d0cdb1 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -28,8 +28,13 @@ from micall.utils.get_list_of_executables import iterate_executables -@pytest.fixture(scope="function") -def temp_venv(tmpdir: Path): +# Function to quote shell arguments. +def quote(s: object) -> str: + return shlex.quote(str(s)) + + +@pytest.fixture(scope="session") +def temp_venv(tmpdir_factory): """ Fixture for creating and cleaning up a virtual environment. @@ -38,13 +43,36 @@ def temp_venv(tmpdir: Path): """ # Create the virtual environment - venv_dir = tmpdir / "temp_test_venv" + venv_dir = tmpdir_factory.mktemp("venv") venv.create(venv_dir, with_pip=True) # Yield the environment setup to the test function yield venv_dir / "bin" / "activate" +@pytest.fixture(scope="session") +def micall_installation(temp_venv: Path): + """ + Ensures an installed micall executable. + """ + + q = quote + + # Check that MiCall is not installed. + stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && command -v micall") + assert returncode != 0, "Unexpected MiCall installation." + + # Path to MiCall directory (3 levels up from the current script file) + script_path = Path(__file__).resolve() + micall_path = script_path.parent.parent.parent + + # Install MiCall using pip from the local path + stdout, stderr, returncode = run_command(f". 
{q(temp_venv)} && pip install -- {q(micall_path)}") + assert returncode == 0, f"Failed to install MiCall:\n{stderr}" + + yield "micall" + + def run_command(command: Sequence[str]): """Executes a shell command within a provided environment and returns output, error, and return code.""" @@ -52,7 +80,7 @@ def run_command(command: Sequence[str]): return result.stdout.decode('utf-8').strip(), result.stderr.decode('utf-8').strip(), result.returncode -def test_micall_installation(temp_venv): +def test_micall_installation(temp_venv, micall_installation): """ Test to verify installation of MiCall. @@ -60,29 +88,14 @@ def test_micall_installation(temp_venv): by executing the command `command -v micall`. """ - # Path to MiCall directory (3 levels up from the current script file) - script_path = Path(__file__).resolve() - micall_path = script_path.parent.parent.parent - - # Function to quote shell arguments. - def q(s: object) -> str: - return shlex.quote(str(s)) - - # Check that MiCall is not installed. - stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && command -v micall") - assert returncode != 0, "Unexpected MiCall installation." - - # Install MiCall using pip from the local path - stdout, stderr, returncode = run_command(f". {q(temp_venv)} && pip install -- {q(micall_path)}") - assert returncode == 0, f"Failed to install MiCall:\n{stderr}" - # Check MiCall executable path to verify installation + q = quote stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && command -v micall") assert returncode == 0, f"Cound not find MiCall installation:\n{stderr}" assert stdout.endswith('micall'), "Unexpected output for micall path check." -def test_micall_version(temp_venv): +def test_micall_version(temp_venv, micall_installation): """ Test to verify installation of MiCall. @@ -90,19 +103,8 @@ def test_micall_version(temp_venv): by executing the command `micall --version`. 
""" - # Path to MiCall directory (3 levels up from the current script file) - script_path = Path(__file__).resolve() - micall_path = script_path.parent.parent.parent - - # Function to quote shell arguments. - def q(s: object) -> str: - return shlex.quote(str(s)) - - # Install MiCall using pip from the local path - stdout, stderr, returncode = run_command(f". {q(temp_venv)} && pip install -- {q(micall_path)}") - assert returncode == 0, f"Failed to install MiCall:\n{stderr}" - # Check MiCall version to verify installation + q = quote stdout, stderr, returncode = run_command(f"export PATH= ; . {q(temp_venv)} && micall --version") assert returncode == 0, f"MiCall version command failed:\n{stderr}" lines = [line.strip() for line in stdout.split('\n')] @@ -110,7 +112,7 @@ def q(s: object) -> str: assert re.match(r'(\d+[.]\d+[.]\d+)|development', first_line), "Unexpected output for micall version check." -def test_micall_help(temp_venv): +def test_micall_help(temp_venv, micall_installation): """ Test to verify installation of MiCall. @@ -121,19 +123,8 @@ def test_micall_help(temp_venv): # These are supposed to be listed in output of --help. executables = [os.path.splitext(path.name)[0] for path in iterate_executables()] - # Path to MiCall directory (3 levels up from the current script file) - script_path = Path(__file__).resolve() - micall_path = script_path.parent.parent.parent - - # Function to quote shell arguments. - def q(s: object) -> str: - return shlex.quote(str(s)) - - # Install MiCall using pip from the local path - stdout, stderr, returncode = run_command(f". {q(temp_venv)} && pip install -- {q(micall_path)}") - assert returncode == 0, f"Failed to install MiCall:\n{stderr}" - # Check MiCall help to verify installation + q = quote stdout, stderr, returncode = run_command(f"export PATH= ; . 
{q(temp_venv)} && micall --help") assert returncode == 0, f"MiCall help command failed:\n{stderr}" @@ -152,5 +143,5 @@ def get_name(path: Path) -> str: executables = list(iterate_executables()) for key, group in groupby(executables, key=get_name): - group = list(map(str, group)) - assert len(group) == 1, f"Scripts {group!r} share the same executable name." + paths = list(map(str, group)) + assert len(paths) == 1, f"Scripts {group!r} share the same executable name." From 9806ca9fab5c89dcd23735a16c493d0814922d15 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 14:39:39 -0700 Subject: [PATCH 396/485] Add one more name test to check installation --- micall/tests/test_installation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/micall/tests/test_installation.py b/micall/tests/test_installation.py index c76d0cdb1..b9971876e 100644 --- a/micall/tests/test_installation.py +++ b/micall/tests/test_installation.py @@ -26,6 +26,7 @@ import os from itertools import groupby from micall.utils.get_list_of_executables import iterate_executables +from micall.main import EXECUTABLES # Function to quote shell arguments. @@ -133,6 +134,14 @@ def test_micall_help(temp_venv, micall_installation): def test_executables_names(): + """ + Verify that all and only those executables found by `iterate_executables()` are used in micall/main.py. + """ + + assert set(EXECUTABLES) == set(map(str, iterate_executables())) + + +def test_executables_duplicates(): """ Verify that there is no duplication in names of executables. 
""" From 69961d34fc571655d70ad22217431bdd5b0a4449 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 15:06:57 -0700 Subject: [PATCH 397/485] Move flake8 config to pyproject --- .flake8 | 82 ------------------------------------------------ pyproject.toml | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 82 deletions(-) delete mode 100644 .flake8 diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 34deca975..000000000 --- a/.flake8 +++ /dev/null @@ -1,82 +0,0 @@ -[flake8] -ignore = E226,E302,E41 -max-line-length = 120 -per-file-ignores = - micall/core/aln2counts.py: W504,E225,E126,W293 - micall/core/amplicon_finder.py: E126,W504 - micall/core/cascade_report.py: W293 - micall/core/filter_quality.py: W504 - micall/core/plot_contigs.py: W504,E501,F841,E231,E261,E262,E265,E302,E303,E305,E999,W291,W292,F401 - micall/core/plot_simple.py: F401,F841,W292 - micall/core/project_config.py: E722,E261 - micall/core/remap.py: F401 - micall/core/sam2aln.py: E128 - micall/core/trim_fastqs.py: W504 - micall/drivers/run_info.py: E225 - micall/g2p/pssm_lib.py: E722 - micall/monitor/find_groups.py: W504 - micall/monitor/kive_watcher.py: W504,E126,E501,W503 - micall/monitor/sample_watcher.py: W503,W504 - micall/monitor/update_qai.py: W503,W504 - micall/resistance/asi_algorithm.py: W504 - micall/resistance/pdfreport.py: E241,W504 - micall/resistance/resistance.py: W504 - micall/tcr/igblast.py: E265,E231,E502,E128,W291,E305 - micall/tests/microtest/make_sample.py: W504 - micall/tests/microtest/reverse_and_complement.py: E126 - micall/tests/microtest/translate_nuc.py: E126 - micall/tests/svg_differ.py: E126,F541 - micall/tests/test_align_it_gotoh.py: E261,E262,E225,E303,E501,W292 - micall/tests/test_aln2counts_report.py: W504,F401 - micall/tests/test_aln2counts_seed_amino.py: W293 - micall/tests/test_aln2counts_seed_nucleotide.py: W293 - micall/tests/test_cigar_tools.py: E704,W503 - micall/tests/test_consensus_aligner.py: 
F841,E126,W504,E701,F401 - micall/tests/test_contig_stitcher.py: W503 - micall/tests/test_error_metrics_parser.py: E231 - micall/tests/test_fastq_g2p.py: W293 - micall/tests/test_hcv_reference_tree.py: W293 - micall/tests/test_hcv_rules_import.py: E241 - micall/tests/test_kive_watcher.py: W504,E126,E127,F841 - micall/tests/test_plot_contigs.py: E303 - micall/tests/test_primer_tracker.py: F401 - micall/tests/test_projects_dump.py: W504 - micall/tests/test_quality_metrics_parser.py: E231 - micall/tests/test_remap.py: F541 - micall/tests/test_resistance.py: W504 - micall/tests/test_report_amino.py: W292 - micall/tests/test_sam2aln.py: E121 - micall/tests/test_sample.py: F401 - micall/tests/test_sample_sheet_parser.py: E123 - micall/tests/test_tile_metrics_parser.py: E231 - micall/tests/test_trim_fastqs.py: E121,E123 - micall/tests/test_update_qai.py: E127 - micall/utils/aln2counts_simplify.py: F401,E261,E126,F821 - micall/utils/big_counter.py: W293 - micall/utils/cigar_tools.py: E704 - micall/utils/compare_454_samples.py: W504,E305 - micall/utils/compare_mapping.py: W504,F841 - micall/utils/concordance_evaluation.py: E303 - micall/utils/consensus_aligner.py: W504,W293,E126,E303,F401,E261,E252 - micall/utils/conseq_compare.py: F811,E305,W503,F841,F401 - micall/utils/contig_summary.py: W504 - micall/utils/coverage_data.py: W504 - micall/utils/dd.py: E711,W504,E211,E303 - micall/utils/fetch_sequences.py: W504,E126,E303,E265 - micall/utils/find_chimera.py: W504 - micall/utils/find_reads_in_sam.py: F401,W293,F841 - micall/utils/hcv_reference_tree.py: W504 - micall/utils/hcv_rules_import.py: W503,W504,E126,E241 - micall/utils/make_fda_refs.py: E211,E261,W293,W292 - micall/utils/make_hcv_genotype_refs.py: E211,E261,E231,E722,E305,E127,E251,E127 - micall/utils/primer_checks.py: E303,F541 - micall/utils/primer_tracker.py: F541 - micall/utils/projects_dump.py: W504 - micall/utils/ref_aligner.py: F541 - micall/utils/reference_distances.py: E305 - micall/utils/report_amino.py: 
W504 - micall/utils/sam_g2p_simplify.py: W293,E265,E303,E305 - micall/utils/sample_fastq.py: E305 - micall/utils/sample_sheet_parser.py: E275,W504 - micall/utils/scan_run_folders.py: W504 - micall/utils/v3loop_alignment_scores/plot.py: W293 diff --git a/pyproject.toml b/pyproject.toml index 3a1bc8372..053fc6af1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ test = [ dev = [ # Dependencies required for development (linting, type checking, etc.) "flake8==7.0.0", + "Flake8-pyproject==1.2.3", "mypy==1.10.0", "mypy-extensions==1.0.0", "gprof2dot==2024.6.6", @@ -105,6 +106,90 @@ markers = [ "iva: test requires IVA (deselect with '-m \"not iva\"')" ] +[tool.flake8] +ignore = ['E226', 'E302', 'E41'] +max-line-length = 120 +per-file-ignores = [ + 'micall/core/aln2counts.py: W504,E225,E126,W293', + 'micall/core/amplicon_finder.py: E126,W504', + 'micall/core/cascade_report.py: W293', + 'micall/core/filter_quality.py: W504', + 'micall/core/plot_contigs.py: W504,E501,F841,E231,E261,E262,E265,E302,E303,E305,E999,W291,W292,F401', + 'micall/core/plot_simple.py: F401,F841,W292', + 'micall/core/project_config.py: E722,E261', + 'micall/core/remap.py: F401', + 'micall/core/sam2aln.py: E128', + 'micall/core/trim_fastqs.py: W504', + 'micall/drivers/run_info.py: E225', + 'micall/g2p/pssm_lib.py: E722', + 'micall/monitor/find_groups.py: W504', + 'micall/monitor/kive_watcher.py: W504,E126,E501,W503', + 'micall/monitor/sample_watcher.py: W503,W504', + 'micall/monitor/update_qai.py: W503,W504', + 'micall/resistance/asi_algorithm.py: W504', + 'micall/resistance/pdfreport.py: E241,W504', + 'micall/resistance/resistance.py: W504', + 'micall/tcr/igblast.py: E265,E231,E502,E128,W291,E305', + 'micall/tests/microtest/make_sample.py: W504', + 'micall/tests/microtest/reverse_and_complement.py: E126', + 'micall/tests/microtest/translate_nuc.py: E126', + 'micall/tests/svg_differ.py: E126,F541', + 'micall/tests/test_align_it_gotoh.py: E261,E262,E225,E303,E501,W292', + 
'micall/tests/test_aln2counts_report.py: W504,F401', + 'micall/tests/test_aln2counts_seed_amino.py: W293', + 'micall/tests/test_aln2counts_seed_nucleotide.py: W293', + 'micall/tests/test_cigar_tools.py: E704,W503', + 'micall/tests/test_consensus_aligner.py: F841,E126,W504,E701,F401', + 'micall/tests/test_contig_stitcher.py: W503', + 'micall/tests/test_error_metrics_parser.py: E231', + 'micall/tests/test_fastq_g2p.py: W293', + 'micall/tests/test_hcv_reference_tree.py: W293', + 'micall/tests/test_hcv_rules_import.py: E241', + 'micall/tests/test_kive_watcher.py: W504,E126,E127,F841', + 'micall/tests/test_plot_contigs.py: E303', + 'micall/tests/test_primer_tracker.py: F401', + 'micall/tests/test_projects_dump.py: W504', + 'micall/tests/test_quality_metrics_parser.py: E231', + 'micall/tests/test_remap.py: F541', + 'micall/tests/test_resistance.py: W504', + 'micall/tests/test_report_amino.py: W292', + 'micall/tests/test_sam2aln.py: E121', + 'micall/tests/test_sample.py: F401', + 'micall/tests/test_sample_sheet_parser.py: E123', + 'micall/tests/test_tile_metrics_parser.py: E231', + 'micall/tests/test_trim_fastqs.py: E121,E123', + 'micall/tests/test_update_qai.py: E127', + 'micall/utils/aln2counts_simplify.py: F401,E261,E126,F821', + 'micall/utils/big_counter.py: W293', + 'micall/utils/cigar_tools.py: E704', + 'micall/utils/compare_454_samples.py: W504,E305', + 'micall/utils/compare_mapping.py: W504,F841', + 'micall/utils/concordance_evaluation.py: E303', + 'micall/utils/consensus_aligner.py: W504,W293,E126,E303,F401,E261,E252', + 'micall/utils/conseq_compare.py: F811,E305,W503,F841,F401', + 'micall/utils/contig_summary.py: W504', + 'micall/utils/coverage_data.py: W504', + 'micall/utils/dd.py: E711,W504,E211,E303', + 'micall/utils/fetch_sequences.py: W504,E126,E303,E265', + 'micall/utils/find_chimera.py: W504', + 'micall/utils/find_reads_in_sam.py: F401,W293,F841', + 'micall/utils/hcv_reference_tree.py: W504', + 'micall/utils/hcv_rules_import.py: W503,W504,E126,E241', + 
'micall/utils/make_fda_refs.py: E211,E261,W293,W292', + 'micall/utils/make_hcv_genotype_refs.py: E211,E261,E231,E722,E305,E127,E251,E127', + 'micall/utils/primer_checks.py: E303,F541', + 'micall/utils/primer_tracker.py: F541', + 'micall/utils/projects_dump.py: W504', + 'micall/utils/ref_aligner.py: F541', + 'micall/utils/reference_distances.py: E305', + 'micall/utils/report_amino.py: W504', + 'micall/utils/sam_g2p_simplify.py: W293,E265,E303,E305', + 'micall/utils/sample_fastq.py: E305', + 'micall/utils/sample_sheet_parser.py: E275,W504', + 'micall/utils/scan_run_folders.py: W504', + 'micall/utils/v3loop_alignment_scores/plot.py: W293', +] + [tool.mypy] files = ["micall/"] From 596d5b8db8f25f354b39023ac51dda5233d9a92c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 16 Oct 2024 16:28:11 -0700 Subject: [PATCH 398/485] Fix license in pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 053fc6af1..ff5d185e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,11 +5,11 @@ description = "Pipeline for processing FASTQ data from an Illumina MiSeq to geno authors = [ { name = "British Columbia Centre for Excellence in HIV/AIDS", email = "vmysak@bccfe.ca" }, ] -license = {text = "AGPL-3.0-or-later"} +license = {text = "AGPL-3.0"} readme = "README.md" requires-python = ">=3.8,<3.9" classifiers = [ - "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", + "License :: OSI Approved :: GNU Affero General Public License v3", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.8", From 2c083302d37c270ccee83c674edf8bdc1006c03d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 17 Oct 2024 08:03:25 -0700 Subject: [PATCH 399/485] Add license file to pyproject.toml --- Singularity | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Singularity b/Singularity index 
9d4aba101..928a15ef6 100644 --- a/Singularity +++ b/Singularity @@ -50,6 +50,7 @@ From: python:3.8 micall/utils /opt/micall/micall/utils README.md /opt/micall/ + LICENSE.txt /opt/micall/ pyproject.toml /opt/micall/ ## HCV genotyping database diff --git a/pyproject.toml b/pyproject.toml index ff5d185e4..be0707e2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ authors = [ { name = "British Columbia Centre for Excellence in HIV/AIDS", email = "vmysak@bccfe.ca" }, ] license = {text = "AGPL-3.0"} +license-files = { paths = ["LICENSE.txt"] } readme = "README.md" requires-python = ">=3.8,<3.9" classifiers = [ From 1446c9ce1dcad7774e5dcf1ce51cb123f8eb9561 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 17 Oct 2024 09:04:26 -0700 Subject: [PATCH 400/485] Make sure that modules are loadable from main.py --- micall/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/micall/main.py b/micall/main.py index d52786b9f..04acd4b4d 100644 --- a/micall/main.py +++ b/micall/main.py @@ -108,6 +108,9 @@ def executable_module(path: str) -> str: def execute_module_as_main(module_name: str, arguments: Sequence[str]) -> int: sys.argv = [module_name] + list(arguments) + micall_directory = str(Path(__file__).parent.parent) + if micall_directory not in sys.path: + sys.path.append(micall_directory) runpy.run_module(module_name, run_name='__main__', alter_sys=True) return 0 From c50992539bd04467866888eb2cc0ef0b05fa2688 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 17 Oct 2024 09:24:48 -0700 Subject: [PATCH 401/485] Change location for local assets If micall is installed as a package, then the old path won't work because it will be based on system's "site-packages" directory. This commit makes assets work for both the installed and uninstalled micall cases. 
--- micall/utils/externals.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/micall/utils/externals.py b/micall/utils/externals.py index e0f1bf01d..9bc08dee5 100644 --- a/micall/utils/externals.py +++ b/micall/utils/externals.py @@ -3,6 +3,7 @@ import sys import re from subprocess import CalledProcessError +from pathlib import Path class AssetWrapper(object): @@ -10,10 +11,10 @@ class AssetWrapper(object): def __init__(self, path, **kwargs): # noinspection PyArgumentList super(AssetWrapper, self).__init__(**kwargs) - app_dir = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) - local_path = os.path.join(app_dir, path) - if os.path.exists(local_path): - self.path = local_path + app_dir = Path(__file__).parent.parent / "assets" + local_path = app_dir / path + if local_path.exists(): + self.path = str(local_path) else: self.path = os.path.join(getattr(sys, '_MEIPASS', ''), path) From c0b854768a20f63fbe5577093f5ee9f78d0dbbf8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 17 Oct 2024 09:56:03 -0700 Subject: [PATCH 402/485] Improve location value for DEFAULT_DATABASE --- micall/utils/fasta_to_csv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/micall/utils/fasta_to_csv.py b/micall/utils/fasta_to_csv.py index bb7c9819e..d03cc19aa 100644 --- a/micall/utils/fasta_to_csv.py +++ b/micall/utils/fasta_to_csv.py @@ -7,6 +7,7 @@ from csv import DictWriter, DictReader from itertools import groupby from operator import itemgetter +from pathlib import Path from io import StringIO @@ -17,7 +18,7 @@ from micall.utils.contig_stitcher_contigs import GenotypedContig -DEFAULT_DATABASE = os.path.join(os.path.dirname(__file__), '..', 'blast_db', 'refs.fasta') +DEFAULT_DATABASE = os.path.normpath((Path(__file__).parent.parent / 'blast_db' / 'refs.fasta').resolve()) def read_assembled_contigs(group_refs: Dict[str, str], From ca29f0dc006c7f033c9dd395b4d1568cf468a967 Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Thu, 17 Oct 
2024 16:18:03 -0700 Subject: [PATCH 403/485] Handle other pipeline groups in update_qai.py. --- micall/monitor/update_qai.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/micall/monitor/update_qai.py b/micall/monitor/update_qai.py index cbc9a9750..3926388ba 100644 --- a/micall/monitor/update_qai.py +++ b/micall/monitor/update_qai.py @@ -8,10 +8,11 @@ from collections import defaultdict from datetime import datetime import logging +from functools import partial from pathlib import Path from micall.monitor.sample_watcher import PipelineType -from operator import itemgetter +from operator import itemgetter, getitem import os from micall.monitor import qai_helper @@ -24,6 +25,7 @@ def parse_args(): import argparse + pipeline_parser = partial(getitem, PipelineType) parser = argparse.ArgumentParser( description="Update the Oracle database with conseq information", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -34,7 +36,7 @@ def parse_args(): help='version suffix for batch names and folder names') parser.add_argument('--pipeline_group', default=PipelineType.MAIN, - type=PipelineType, + type=pipeline_parser, choices=(PipelineType.MAIN, PipelineType.DENOVO_MAIN, PipelineType.PROVIRAL), From e27e35f503541794f831995627a8bb9b3489ba9c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 17 Oct 2024 08:34:07 -0700 Subject: [PATCH 404/485] Do not include unused files and folders in pyproject.toml --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be0707e2b..21d077560 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,9 +82,6 @@ micall = "micall.main:cli" [tool.hatch.build] include = [ "micall/", - "basespace/", # TODO: do we really need this in the package? - "gui/", # TODO: do we really need this in the package? 
- "*.py", ] exclude = [ "micall/tests/", From 085190b50bdbd815f7b3c99b6d2e68eaf74ba0be Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 17 Oct 2024 08:34:56 -0700 Subject: [PATCH 405/485] Add source documentation for some top level scripts --- micall_docker.py | 6 ++++++ micall_kive.py | 7 +++++++ micall_kive_resistance.py | 7 +++++++ 3 files changed, 20 insertions(+) diff --git a/micall_docker.py b/micall_docker.py index 4a32e2e39..ce44356d4 100644 --- a/micall_docker.py +++ b/micall_docker.py @@ -1,3 +1,9 @@ +#! /usr/bin/env python + +""" +Entry script that serves as an entry point of MiCall's Docker image. +""" + from argparse import ArgumentParser import csv import errno diff --git a/micall_kive.py b/micall_kive.py index 4c302cc75..b846acd2a 100644 --- a/micall_kive.py +++ b/micall_kive.py @@ -1,3 +1,10 @@ +#! /usr/bin/env python + +""" +Entry script that serves as an entry point of MiCall's Singularity image. +This file is run by Kive. +""" + import logging import shutil import tarfile diff --git a/micall_kive_resistance.py b/micall_kive_resistance.py index 74008623c..c2ce3ddec 100644 --- a/micall_kive_resistance.py +++ b/micall_kive_resistance.py @@ -1,3 +1,10 @@ +#! /usr/bin/env python + +""" +Entry script that serves as an entry point of MiCall's Singularity image. +This file is run by Kive. +""" + import logging import shutil from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter From 56d836e4d0aea05f179ee63533e2ba21dd89ec66 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 18 Oct 2024 13:33:36 -0700 Subject: [PATCH 406/485] Remove gui/ directory Looks like this directory is not used anymore. 
--- gui/README.md | 14 -------------- gui/micall.ico | Bin 293950 -> 0 bytes gui/micall.xcf | Bin 121234 -> 0 bytes 3 files changed, 14 deletions(-) delete mode 100644 gui/README.md delete mode 100644 gui/micall.ico delete mode 100644 gui/micall.xcf diff --git a/gui/README.md b/gui/README.md deleted file mode 100644 index 77700267e..000000000 --- a/gui/README.md +++ /dev/null @@ -1,14 +0,0 @@ -To create an icon, create a GIMP image with the [following sizes][sizes], then export as a .ico file with the listed colour settings for each layer. - - - 256x256 will be saved as 32bpp 8bit alpha - - 48x48 will be saved as 32bpp 8bit alpha - - 48x48 will be saved as 8bpp 1bit alpha - - 32x32 will be saved as 32bpp 8bit alpha - - 32x32 will be saved as 8bpp 1bit alpha - - 32x32 will be saved as 4bpp 1bit alpha - - 16x16 will be saved as 32bpp 8bit alpha - - 16x16 will be saved as 8bpp 1bit alpha - - 16x16 will be saved as 4bpp 1bit alpha - -[sizes]: http://stackoverflow.com/a/10819673/4794 - diff --git a/gui/micall.ico b/gui/micall.ico deleted file mode 100644 index d603925258ab2f41e9e90305d660ff2cd1c0dd96..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 293950 zcmeEP2VfP&)*gFBk%tuvg4mx5HdJ^hV!;BUbW~8Pf>e~|`+wie&hG5qdz0{zJSBft53_g6%PMb3uAhJqdZsK zewGlUg}`rt-xrEj=LwOHW9oNl(dt|wZoNQ=(xuh)2hJ8^=q3EEu8+M?i1#kjze|f) z)N|w2Lfm(seIDml+@^oyd^sW7K7;s}qU(NNpDXNgoiV&svJ4>E*?gsM=!D!q#Hb#-U=vhN)uTp5@}$ zvHh|}OK)Z5cGIze0|yQoII!mNe&KhH95t}r@yNH&A3hxAq{q%nuGPKW@u*VwFTTHF zyW=UB-naDXM(vKzJ?EVJUVXpa@wDyRw{Ls3-SN!rJFwI4_}cARS=%$)9pAnqE9=O1 z97l=c$EIgyZp&P4S@+5ftqYNX@$GZEbdC_4>-onCgP+ZI&N=6ZOE0}dlq`9E*^g99`V?tcZo+Iy-!qntb%y)c_C`N zATZ7qM9mk4s8v&lr=F@Ho~vFSbia7z<$e2nhY+te zdRDY~yM$=c_zCewlNzGydzXl=T`m#byIv{Y?{=l=*}atL`Tq5ySC5-Tr;g8w-aX6U znkPi(cb^k|dfg`a_P$N@>vI>5y(0Scy<2?H_g>NegL}n?ACwm#_OBp%bgzf<0WqNe zgJQsk4~UOGd`Ju&@G$5l5f^=-2p>~lL`^Oao~S1#j;kT2$CnZb@z;x~lb;q-COs#@ 
zKkEQqxmzTI5~h_G<0CqW*yx&KX3|4qR`O$F_KYebDWML^>S9`aa}gKYL!{1oNuH&^%08~wG+#hbPyTo!>~VCEY27rRxax;K3~yStXTe;SP5Fc zwvWhMJw&Wp86nnZ4ijH&2oq~oM~RK=KNB0)PZVEm{sd)$*t}`7_zfkPkI}hbD z&ZtQi>xDCLEFTS zS=+?VKYuTN0UbTMPyG7pA#vijpTwVkp2&vlicbAAN8sX1F1bie@kr>rrS823j`(!Um`_Ix?A^Vx5H~k$-0bbP z@4nmi?WRo{am23_CB+r^uLMdV@>kB5W&=5G(2N37I#9Dw0GVq+=2?(&7UY=)8D>F-S&(6pxX)vdcNS!zycRMVnn-Ps%@$;(1$k~k zK3b5E7G$~w`Da0P%6-U23-Vpo2YG2hmOJ-DrdyDW{B1#oTab-%AF|kj?3DYE$FhX{ zL><*F=m7qf8+3sMS!zK?S z1-)cJw^@**7W9|}on}FPTF`YC^cK$FZ$Z~skkuCSjdMTrjRjeazP}>df(*By&n)Oq z3$ogR{33p!QqLkC)r@fP%s1=(*w zr#bgSw^`5u7UaDJnQcLKTaeWjWU>YMY(ZwrW03t8^r{8jU_lo;_d_RI&>48vL<@QV z*RHT2+bzg*N!*A0x1b*^=ni=;bf|lO{|7AS77Ka=_b#)b4=m^a3v%Cr{Kx(_d5q_H z4fK%(o$K5W9c)1t$@-yNEa(sm`oe;)ki>l+<2m+$*Fs-A_j4b5%!2N+pqDJ@9Sgd~ zf}WAYeIDaE=r(yRbiH#w?2PTu!+5WQE$CSbIu&D`U_oa};y#Z-4_na77WB2%=Iwv& zB=gmN8KSAe#plAdc^Nj2B-lJJ!}jqIkLmNg_Q8iQIIs3AwwwX#Pure@{YM!^ySFsx zaZq-4?-eD4c;*=)n!GN=`0%S?%RDTir?i9}^14Wy*F!8{+70%_2(e{LEcAb>IB;OS z)Z2fa`U5qyt$^)v9MOc`pNtJZ&r$}m`uAST`sMy7tJ&KF_up(;WgmC|KcD8IC!Z{D zS^wgGixy9}Xodq7aK3uhoYq;Mk-#QlQeF%fJao z!qbn0uREN0U_e!zAJ{7^Jn_ib^dl1v4aE6dyNu7;x-RS69Y-SOkHPP9-Nt9ZGSABT zAufE(o0fH3ctqCPtRL29ZJo9)T%I2rKP`1lYTC4ES@Ji3rlzgQOkEHk-VxW2jZaO> z%*@sSwSZ3DKclG10MoaWQhJ z5T6Va;3aC&Z-5RmHTJd&R`@b;R`8`^2OP zbw$*qnj#^tf|xq>X%Q7!UnIugFD6fDBBCa~A)=?gERqu*5;4)Qi0H}9#LUEsA~vSB zn3ePh{EIpwc1jB|8@@+;TwReeqmr0Dt$~=GR9&PbKPRS4?F=90O)+Q2^CCI<6)|__ z3nDtEi%6W_Ow5~AL&U{(6Um7!#LQWB#DbKU#H`tM#k6Vfi{52mGC$V%3UXV%4$%;`5d8 zeU^VDRh7 zV$a?UV$c57;_$w8V(&iqLGX$8?f*jTKk%hEaB!12ba=Bk{KHq`ha+FZH`*zVWPJnQ zXqU+P@mu&uyTy+`Z52QL^qusRj{dS&9HXCvSi|vO4~XB69~37}91 z+G|Uk&kKrOS+-ocyYIN8dUYtk%8ysChW~E8=Z>;vZ@lVKb^g{0m7l`VTD9ubs`YZ+ z`n9T7tyb-^d+siGRWY8w^qL3WPoF>h*_Vf;r+-kl{)F`TT|StffGZxjO`b1S=7HAf z=}~nX_eoD5)3hTU>hjMz_ut8J&$^!dHIeVm>?t6#s->FLqE)6#pjZS$nM z|KaxCI?YH=j~zKVeSUh(C-G_NiM@sl>D{gEldAnEJ9OzjH67>D($iDY`0s>XeR}ok z($T*Eg>F51j7dwIoRmI4En@U+Tr_mh=!EEQU23cQYj*4TL0H(3!9zY7GkVk~5u-;B 
z88URl{M4RZYpV0Ld-duyaPW|kqee$WM2roefQyC<9{AA*yX*q~m$Y8U7D7i-e%qoG4S867cUQdCS-Ol)N2xN)C+GBPaegSXWE zO+FYnWXPy7<0GSD;-)7|kDNMb+^7h&(5s<({wBRX=IxPF6XFt*6BCnAMfm85Az=g5 z{g=1+DD2};A|_6aNlctQJ1HeOAvSV+#Hb+y!&<2O8w?B^88Lc7faAO^z@@O zYSeh8UV{epUU|Moji(=d=)r&AAcu#SoFDYR^LQ$!jkdx+$4)7Irif$)%>M5LB#~n2t7DgWv^uwsX!qPqcaQ1@U9kH-*)_X~7WduJNMy z32n7S>x}W}?XoATh_xNt3v2vXVMR?6R&>L9GDPlyd@_;TGi$`TFgiJwOdwcDL~ z9qzrwnihMvY7gxsPJa|}oJv-5!sFJAgsRqz#3#LkV?39X@E>_C`!+oe{Xre9D;9M| zM-{V14ii?x`r;VdG}~{~KpY!A>QZ?vpKWGhRm7g2MZBq+Y-eWDGgiv1TGqUjde;0o z4Xin{U%~H}B+W~yk7Etxxmhz_ux2J#$9>PpekP&c3Gol0AODthkNNa+>(z$hw_xpm zHu`!wj_`jeKIFD->kKPpW=+}Nj%_LWu(kQiNoey`NeB0@bB?e1d?3zE#P3tq&p++5 z`+;$h{kY!>A9GX4_9MdYz!+AMaV1^h4~I9SjYf!ty>4yX5Uvj+4zt`^xS)lkZChtK z$G3i)jJQ&N`TMWG{*+@fd&Y~hAACRK#@-pS{Rt5jFy_x$b5rU-cK$Bw`2FNDD{Vn@ zYhh|DYsIoYPCJb0WTbbHv~j~&=lG$68?B6m@8JGF<*`*OK9GHwgLlWeCyu*6Wcw4x zKPubi^ZxwPZl}(b%Rj)FbVO|LJ?oF(f0f7o`s)w$rHB!z0nW9*{g#DmJIJ^Z z4>HEIVBQ<@J@L6FO{f&I{mGGxIlhj#PIGJTo)u1=Uu}s+J3Y`|Z|kQYcR0tt+&IBn zwc?|@tCR17?W|oh!rHlgo^$+%!&|Hk>qbl3xA$}B_@3{USz9(uk-w>fIc|J+ ztUETkR>=0_qU%^o7k9B%EbWQs9pLcLAAkIYSnj8YZHCL(q;BZ=uRmB{Zk#A-_pU|y zu#6G%`ctR=lE?U-SSRa_kF6K7{b{icB~S5vu30t6`t{gBr_QXbt=6`!v#o>RZ*$8! z@-4=Co{TH%<3IkmUG{a`w%LetrdS&>maLQfIz9fiknK;8Zz8W-^ZCbetU11V(|FXs zS)M%cvx8If_`w6~txaD{g1i`qcQZ!T#X46(E+kHWGi3XS0pPh++q`ejcS~jcY@4y) zUw+x+jOm|$o&-i#BF?+q;C;*DJK6e8vPI00KY#w|JL~Y_O{x#58?l9u?I$O`Wo51z zF8S$;jpHFlqfk$reCJys<92MDYwg;VF0uLD?hI?UCcOLa!E@j5UMjDlJY)=;?UM&6 zi#Dzwi#`m8ZfG5{{Ta#cfX_zBaohamWV`JoYa8C-&g~0i+u!eAB4sb1eed2?inxz) z;5qh>*Ydr8i&*qmTjJ$&@LiB6XU%9Avi(^z+DpFSyF>g{J_GAvteeluHi)zR`!ePG zIJkcU$_*X_3rCOcm&AP@<2l&}T)P+dNPa*a97onkex5g{ zd&u_ZrgX#H4aYta?`{tG7rc(=;PY^N`HXCb^B}(-J8T_4e#H6>#C;y)IbOqSc@OVp z9jxn{uM=e5^XK-+rv2`?CI)<$B*%?1m3&VA;&XAF`K)Y@`tBrb1KJ1N=P{n+HN2Mh z@LtxzcfdMX_x!m%L$*I}&ihhs5I2;mlJEAcl4DO^{U4$Gz3rbw`zLeRK4n3ye*Rz4J|ff3 zcJ@m(#g>T^8X4n1M~6cKje%*pIwdrA>a6~3p)OjC%$IAQcHY;mOXRFrAv!? 
z&E6EJh7YY~O&Hh6nmD1cHECiq*ek89sZnjLn5iA$Q*^fC)CkVNY3Xqf8lr2z%gm_0^&(=Bp=Ft16NTf~r&&lhH=Cz?T2M zBU*3&1b=K7=2J2XelF!4w!`+=Hs@22Wg(h?n70>lI}1U9QbLp~E5u!Q3Q@7*S)y{K z3&el^bFrLHsfJN~?r9;Ot$u;1g&eOJYq;|(^=b=Izm5bi3*@P(Z@mKM#0Xw?#q z`C9_HUT2ASTAw4@wmDz4d*?#YzU?K>989MUCB(ZOON!2&t`hIPd$s7&`C8<4l@i^$ zT#tOP8<7uo6Y{t26{ANA5jLQ*hzQ60%vd2NPZXF#5y;sR&b$ogWb}Lt=V7MDUnA#W zIRCA2IA@~g zO6JdhRnC)ej)e0goEy>eB6?0_aYkF@fOQc|7PoiiN0u+??wKQT=0`9$vgY%?V$I4h zIX|*?^~at$lGUrj+HqpjhLQi>+{w3bInJN# z-ZfXwp?tS{UbgcnhxV@!hxV?`pZS!Zw`D)4a{Q;Fd6kfW_nuh%58T9J{3miIr|V~5 za2dD7#fn{4jHUc#|Hc9QzVxySQK`M1cjc8=mAtxS$*ZosvLrVCk|p-=|J5b0zWOR2 zIp4nFtdgbfy63*~_m#ir5p0z!R(`bdV-+8I_~D1K`$&aH9=WId1NS^~&t2D*w5z}9 z%6sm+r+j%{`^dwUt2|bvQl-jOo~iWAqm>@3RJn3xR`TzPxa7)<)eXh3E`Q&Bl^?16 zaOH=qRC=aLrK*ottM*K_$E#I+rdqWsRVqDJ@xh9fE8O=;`BHZM#jimP6)RP$gi0Sr zmL9gJpRMuibI(2Xt*wffV~Re9{0 zO4qCIh|6xQ^30RBpGaPRGQuXJad%VgGy8cqg7&3Ci8G&FfQwYGQD4i z4juSPQromxkdYDA`1LpH*1k#Ce?#5ZUw^$x98O1eXw#tsF31S)+P!P9blljt4Z8JN zlje;ZG`Ptg{}MO6{%VuvEjivPZMt^tK9e{0?$NC~hIrxd_K_JGiA|d{eZ5g>UH^?u z8#is*ayTx_7|`?mo-^yvC-=QgdHw`|f>Uti*;rp=nSY4a}MFs{g8wTn=4FLhA&ZauoUY2UVO z(`Gl@<9}J1maW>f>Chtu7bnY`QWr)%dUK*Eodbj)7 z1q!uw)c#)AuKjy|IH>=?p<$m!gby7) zY{c+kpMDxPeBhw|pJk+`zSp(Q+s^f+(fJ--`}ZF-AZ+-sVZ%p_8WAxnB78(d_^_~{ zVPS&?MMg&U?%Jkp8GHPTmu}O!YmfeYKO8VHENtYch*9C#K8pw+^J#eauwh{X2KDcO zXMVR$S*QP<-s{%8Z@&Ry1BVR@|1|uwaS;RoCZ|>8l zKe{tAd_4M}m^3G8(L%JaFl9#ajL66d<3@!K2NU(~Z})%k<)wS{W&Lsae47e@~ zoZj=QOYIk2^6vH@f*VGS8#e(j3)QEfKlnE#F)=1;(zu9G!-fw0u>GA^o?~CX_{uA9 zy17i5vSn|%<(6_thQM~)ZTuHMaHwqAGG%VQ>B^F}y5mhHuFP>OajvetKuhJP2|NGM zEq_Jo#}zjTapFR3_X+{O3f`3vyI5j|O_rg4>*Kn_^%=ZR|Mr*u`huK4)9%;fC0Byr z`TpedV?6!i?e$Yj^Qxll%oqVg4zN&Ai0eSNfXe;N#6GYu>=XNz-NqQ%&7e0xLqN#* z6yyJ9VjtKS_KAJ-#PRUk+-}5=9=DVgnTV$yLGHnEA}>&3EV{R-IB}2WSWz9Q%cIV*e2U%A3jB2+M-YF+l@Dhc=Vi0o&4K%&ps`- zB3^4ne5U(-7V5fO<)zqtjz@bkPekXY$h?$?G0#|0<%~>!3^^gBN`ZuP%qvNpR?*30 zah{9M6NBeL?g+<6<*{6ju{&2j$GC9C57n!So#;3slL1?)y0t}C8?0%X8pZK0hB~fA zeLA0pW6XRS=F-^tEz_CbQpK0!QVqE%Pbo@5NwR;>NL0Bg%!gra1@BEl9egI4|AM+z 
zJ`QqVoP3;XWnZRFK1=ciVxqsG=hp}H9?w*_j`;B%tgniha+Tr{nFoZN9K3Vpv~b*# zFotYb<>a8R$a|TYguEH#z|2a10rQP7Vx1JJhBbSpP29&Zp5rwtUxv9h$eH0Y$>-s- z;QjF3pe?o;&s-iphy6am2{Rf}L#gvtN%{n%&ABFiS&gG_HUS%_sYJT+lk+Pbn=7d&aRDq*2NsucJmVC3~Z2j zCCvXIrOtm-&PkgWZT&V$KHGP@)1AXde)!6h%ap92L%xrDt#Ji5-v zZPE>;r?#->%N#1?GogRq?_R7=$&$~QHouvNj$*FQ+?e;ndzlxdw?FUOk>qdiCF{U?d|ka$u3a+}IX+F~xH4CXbNBvj*^=(Kw~a$)eh;r@ zE(Pyhv8<2hR;-7TyvN)#nMZ{_5F_kUyS6!xKl@Ky5ogR-Bad*-&fK_Wjk!7Lsc%_} z(%Q&84f8zrzd-)L;zjSuTps2^ee+ek{}SXV^_00r%$L#ImQ7JI_lbFPY@2SnFWbaU)n*UJuanFV zTDGJc`uM(O_JP+i?}xnMQeU~IP}jk7=Z^WZ4ii6|FJ|r>bIR0nye9j?K6Pl9Gykyv zI{%MyV_|AbnImGh%Un3Vd(sNzTCG^t7r8sj%u6soe*k>-kxc_o4v_f*=0(gKU>#p= ziFOX3LjKI+jLzVz*0OE(LA{Fx=>OR{%76A>VY8VW2V#(UQ2vd%UCi^7`E=+f^LNaR z^ZZ;JoB26uu%!LS2{AW5 z$_M7I5%(GBLwYLm2_dUGwm&z=@n`?}?sy;N$I``J@s4|V^3~V|^GtP)pX%c$o<8uJ z?~xC-VclmojX^$5pq$;ZIaOSxcgRV^eOMo6 z^LSsG2c_$0dHnbfa$K2X#oRUW(=qUaxpD3A7aJ$YJS65KF(-@IF}Gc?CHQWb%cQrT ze%dK9MQpGS> z2lFHJhwq^$`hQuj`me@)fRZi9VO*E_iRAHLk!xvg#N5e~DnHshuJ%vKd?4bSycTTx z4R#)L@0hD6`3pI`%vsy?MWn=&`wRmlFTC3^=l&BT%+Fqpd|KYG`_6YvUQs+sKK)I8 zD}cF{d=3+r>@Rb=n2Si-hxPw%zO{N1{^mL6Ei&hlxn}(KfZmuJ#`BO-mmUE^L?3P$T8owbD@+i>@#gg z+WC};%*Ui2k~y*{W$rG|;Tm2mb7t`z%voh!Du*~pwn1KEn`~QQnsRBl)wxq{`hS_h z^W$U=Dsw#fekt#_emhg~3-Uxcj^AOtnY$`^2Yk*v&i(t=S_k&6Lr$el*yk~xBNliq z@8P|ygWv8T5AvO|jjzxaF>U6AlaD%gDw*T>Q$|r%GJjg~4P*jk1anl`cfR9Y7&G#V zGv*jy_Kp2zKJFpNd)k(VZQ?$U@tn?G=KIm_VmGmZI$1Z{U|UKqO_8#dePN$++kfg3 z>iW&FSNJYvelM`<@cT~Wk7MlEKF9L_#+Lb`vd_q6{o(Lea!mp42TeT2JXc=BYxzFp zyRi9b7wRNG%4a}ZY*X?P`k?djk;j&c@h>jNp1en$>x{kR`FRp|lrNIk<+!8o7+2!% zi0t<_QV!yK4>C8G`MKO@uJVx|zLD4PTKO*IGw2+0$Sh!b$F_NrkKA_<<=`OK{m0mE zf=;Au=ox#wW4>$pD#RmsM)n;#it?WFn>n{!N5ypnKmTkK_i>Epcnz;TjC*)5>tJ21 zlXVl%-ggjWV9_tI#*sqYXn!K#W9ejfEWiB%N{0sa` z>L0GfA#tC_c#gKEd>4EMz6aDnouP8*@eUx*I8*jsh1r&L|u^k{%>agiQTW*{~2;DOy1YHRlKuR^0ZnfaM0p3sMFFEhT;3L59|y3l*9O6i}-mKa)#x#PXAE{ z-=*q5WIk+w!2a)&_#}U*{vSo3Pe{A@6l_s`PwM2!qw*ZDkysAaf7GpXvfKY}g7=^E zk^b!p%wPP_sY7vZ|MmDA{1e#!opMY$wj5(Q{=XlWV=u=a`?()~%|Fz|-tnjWtY25A 
zJaFx{e>)HBsW)-HZvEPDoByZf%>U3yO7?*NAt&kMDE>cc@c&VHjOQp%i4|hzNVfcs z{%igR7B{Smuv)bcR?Ql6Ed;(46u8}fyXeh1GJaQV&FV3b{RVy!C!ozpy~{WhaLu>X{ySi^*SS%+FjMIB({7j0=B#y(`O3Ae&N zdQ{He-Fc_@FtF#&k&B;sT71rD;5;6&(7Dq!z|FOwQdXDFHz<1VMtrlbG^mW#tqZ9v zsGQZk>#a83juO<(Chkj8=Xovf>56+<2kYvBI(7YQi*2&)P8~|hzOYZ2yW<@2rA}{y z?_B(7MG@AhzSz%u{t1M&LI=?&&cWx@cmETej{}@5{yT|%D72gMmqkGSkjyKpf;=rg zIWi7p-ce5ZRCr6fczsdn)8ptLZ{}%iuwTbIMm>>U7$kYEj*9a7~{r9fX$dB>Yd`zC!{7qgb&y)B6KW-ZH91o5Q z$N6H=C7@y;tOyasL0r$KiGQcEh>FJGOyL2|w=Buaz8;F7b53)dG zp0bb6&F}tlEI2m&d&F0Rxc>GA5WhuxvrT1C;;$VQQPCKjaXdiY)qG5zCU29+$?N2K z@;)&@ED#fg)h?iXqny)vp0beNI_BRu=eXPgx&w3%s66Na&_ke#phs+a6lD<=jlr41 z1DbEi!{lZ1GCx80c9Le(nhdu$ZL`?nDZ2{BXXI$f7Aj-2lK=gsCfoKyo0kr{j z0rdrOe1?HWfj$F`1&squ08O+h5@it;jlr411DbEi!{b5xdvWA#@;G^&Jl_vQz4s32 z4G?uA{ey==#0;@h5=1Of2Al)>C(8i#kNsqSX~WzMx(8GlR0H%Hh`8?#`Vho%7z5(? zM1f*JaiD3S1W=+)Nhph`XbjFY9?<+tUM5eIx5?w=b;^@x2#KYuL6?EF48ZO` zW}|O%9*BCM*r&`Q_G$0e1-%XG0irK71~eJOXQw>pcyMg^ep78qL#b&Y_KT=!49-*@ z(0t5N^EY{&JWt*y2I4`K1GEP|2K5BB2Gs{W4WbMnrid-=3lMYvgpK|Pv41hBB#8Yi z4|)pJ5Ja1%4~RN{Jc#y>#yw@@0#L@e=ZXy_ONi}dZx;LSy+>p{^pH6E*kj_?N|nU% zA}Sh#GnNOy!xh2H_uVUgyrrx-aOIU^=XvLeFTnFlY~CjZv>b@VJ~2XDpf9K`s1b-V zfclhn0R8GqK*ZeN9SdOl=YlQ*T>-iYMC{XUX#i>m>IeD+Gzml>XeNlVFYUq$#ip`l z#E)DO_!}&5HW606I>M?}Ram#(Dt@`-67e;# zK--3LfY=~LCWA(SJ^*q4kb0E1G-W_(PzezAEwM)o{$1PIXqf&8{gUfIcY&&aYJ*yX z`hrG+CV{4dW`P!5e!2MO!3V^NdUajAH*fmS_?se4{~>?dq^KW-mmiwrZvasikQ&qz z)}xPz-%FGbJAe)P2($$fK&A|!4NX7#3DDi3QlQI0#2zvDcWjh9ls}h%t_GC@Jp!r) zY6B+qpNW73gKqqo7+s#2jM*f0qrQbwBO?(x3{U>Yz74T|q-Zks#{*1=n3Gj?}8D zo|VsB#2-atP{ac+4`?|+-{9t(#V^22x{aMkl*2&XK}|u=ffx_G8B_wq9Kmxy|I5wP z{nw!3J3y5|^*|j!13+UzjPa-a`)+Zp$?MAADSGGbJ1-iGqOtJvfVPS0AD?rMSPcAd zZXg0Q5Y!P=A4DIUzV@{s+5rFSSRi#jY0v|p=RwUuv{h)cOhNybSA1BUq@Poy z^NV=h&+A3MgY$&81)iuP{=~dkCa^RW#CTvI5Pj{h@GdZ2@5h{74S3>G#Kxe6bF zCShLl2>qDTvH3Yhj1jQpGr!YXSRL94t7Cg%bppw<1NQSs@8W)kI%1#1V|qW|%d@P` z9S!P)eG-r9{h;MJIA73oeuy?g9nX*c94#@w>z@zaq7!~^n&jsaUSpfC{gYB(Rrxqw?hB|-lyAAq+1#ULH`djk8ffx3W( 
zGl%chhH73vH_u4CQ@+2|Qdm9T7uN70!ixAzSdrs|HE9B9qOc}wio`xBO4B6lgQl1? z8NZ{1744yO@cSGqChC09MWA9<%#=$(ms_!tN}w#MXzEo~?9^){#i5iGeVr8-eLd(# zN%1HZl}1_Gii;tYv8k*TA44jqC>Etdw_#t=wAkA%m+rKt#erh&GVU(?HtFBi^f*P+ z>{9=hWxRb}mvQ%6)8p^SXF?mget+rf6Z`0r`5gNB^z*u7pvOg(#+d1G#F)l8#IaW6 z?!F84PSktCyP8_cCiM>0JH@-z?|BN}`Q?%aBrlLBByXHAd4>FHznEpO5P`r5Wz)6b2~sIVQ?G*@xE=)Dn?%e@ zDg*q37o%^nsT@iWcvjJEC>6!W+zxqQ6L22yp}T?oyKxM-2}T|~X>8pKj0L0naQr^V z3%BHcUdXMF?Xliq?doIpGqBHm7TtI3$Kx5*bMjd=8DpWxN0xf*^jO-ZGu9k)j=d(l z6Z0KmpYJV}?@+!^{cf+tyC&hCN8#Pud=L$uFnJ_8iac|H!$(slfv5b~_v4=Uw=u7= zFZtKdW8~Qh5yBccK<7^OIHrl`^}bnV zcigCxZLxl{P3&v=V9K08Ibq9!VDe2K)AE2N#w>>8X2?u?Tpig;IqQ&qANKp1YRhL^ z9w<2gSuo`WTOOFQfO6n!$OA14$S0;O$Ok!~dD`G{@-S_JuJ0+`o*Nlp`vE@!YXd>d z3495p=K&Z4AU287g4$U3`5@*U(D#2B^a`lMdFP2}$|pU4ncedc`|o#Cc9FD?jPY^g zCh(sV?v*^IuFoF#O2$*qYhCZYf2kjA%zO2RtK*f9(LCnUM<(7izdE{I@pNdNuCdKB zn7%f!CUH&75`!jo@3#`-AFz+HRK#QIxGMGU|F%^1vR$_g_n5aojtuh13D+jct!xOe z5hxQJ8$sI#EVXSwnV{bjOUi>_vViiPMB6~|f@2d%IiO^LR}K^dpPg@$vMqD82Q&{G zx=iaisq1(gyx+H1?qmS{fcnUpM~>heV2*QbbwG^2F$TbR+y$V5#y{I&TepH3e_+l} zXRPDdnj7ph{?V*Qd_+>w6dX_Itp4@#}hR*9FBq?ZIGrUh|4$Q<8_W z!8>`8n3s6hSf`C^+rGN=>2$9xZeu(Mt`*L6iQj3s&Y_2}pWkHEZ_AcYvV?L$$sWI4 z;BU$=n{2;C+Yes9!Z(H(OZrDjCTN>V+XslL`0WGAgj~x4uaDrB1tp~2=#>S*ZGw>T z0NSml46ym0GJr%L@6$7PGJrO~y?2Ylz#QXk4M3GZw}Gw%F$P%B_-7kzi)}LJk8_cO zo_R`~%E|Vp-f#DguqGn5C&vTNF0pTqP44x5PPh-I>xuaw{=5hChVIZhMV5-clcYZt%WGhi8Lv2h+q~Fiby;*!gIDru>$X7X_`QI)(cgk4?P{+OmI~%!m%n5uIR0VVgh?u+xRFHWMa-Q#_{qo5O*WcH+{^{ag>3Xkk@7nl`<*9e)#PWjK_jdfwqazH=lYbOx z{chsjr{g`CH~n;tcazt>_PL`&C5D1w+{8Aqsc~uJbUI2BaV+;AMOje9HdwcA%Wl_^ zA-Zp-jPT0`K4YM~%U&*+zJr#nUU@*9OUq*?e&F#HY+2xm8R&QpV}~Ra=XGpDJ8r1? 
zNyZ86n4yvbI!5H!AE*a#}#XvhQYZ^=^GW?os~iy3%q0wZ2v%K`F?KTe?R0_qqQBcR@K=MA(R zV4MIpYKS=l+WQXKwl6V3zqVf=l_QW{{ERZ6x@?RppUr;m46e{0O!SPzQxKa*%9+zd)~F-Z5?L#!(O~oFKRMv zbQ9+Wt|Yc|#&@!ff0iy)!9Iz{^#19V=Dlv+lsCGa>}8K3AEc~spHs;Pa}4Zpab$v$ zkIGiCo&i9FY(bM|onJr8ivMKTxYZ^i$`7mHC1>u2x#)AncH6La!%hkM%j zetqx8^c>t9aeT+l_xtsRPw!yfvG09)KByn>*YRq;&BeT-tAKkK@4@xD*G4z79$bfO zoVjIo7=9eM%#NytI3MbGc6EjBhn5d!>6H`4b4$O(Z!>7QV9Ep?Q!vNd>od5vf&I<` zWP#T%2o^6Oj|87j)GRqsYa9JH=zsbc zj{RTqUm}Wq$*HeHK34dbIv+AlKf9dE3!1ZI>U;WIDy}E7A2N>5cND1WgT?V2Jx{&h z;NFPcdFJE5XW9<+`0v_=XQ}b7d6%WD?>u(9t*=c?xcD}8uEtn!9L`7}ZmU`>gVU2Z zZc`#klK!pB?2bG2@;+}Hx?Prvg7s1N-z=4!;msRqbm!7+XQMJAy5{8$FyxJ;{&!F2(SmhuK{v^IxR4#P-4NhhnZtM3NiAg zO|r)nZGaoEmurIl*ZQCMr>{Z(zv?wtiRo|V&{v-4Gf}qjS#{oo&efxz^k3Qej;=Rw z@7VcuVM%+FC^Ikpg;GMWPFz@Pj8}ml2ow{9N-n4tYe6IOE&?a~BGsD*5e#|NC z0*4y2PFc-3#{KMRX5v#puL*VBt&cK9x24+%B2%Pn(Q-kT?(+pcJLQA&DFS4IC&pmf z36!s1dBAt$&P5tB+mr=H{LteckRKc$LC-0w{DR;%0eOz3<_(l@;Lj-X&Ou6WzjfhHEX@71jm=N@cpteNFg7V(;EO2W08cpvw58R~cr_0O=+vvg&I z*+5!f+iJ6`CYGjQXCrVOAhp#1{ovnaht9XTF(!MTY6mMbg2Azlrb{5$cVOj|_b zJXp!|%=727X`S!tdjt2hmvXf8wcPdE`Cc18v|sPw-mvc-U2o`lo$KQ8l%wZ;n3pj+ zSI28T=IS^T>mhJXY)G7#*x`N%d~3WqjgL2m zp=Cl4`(Qf9o$^4x6WU+e|I)UBlHIN>4b*D<%oCktH!*EciT=5g7W|!PCOAy5?7&c{VdLBL&-RoTwPtmGAP;C1e`PPHTKya z>kZm&uzu=iFiJy41j+@|R?xD5{(~M*Eelu%kq3^iplyTP%L2zHu;qX&3(Poy;xCm` zK;J;+h`BKWJ-?g_IS@29KzrbPxjryg`u_~VfB0yuzsH(;CG&J_&!_MG@qCGU!_IeN z`D(t#&C{WcZ`k?v`czlfyYqFfu6N_e%BR=1z2+IMQvxwhAJx_GCf*-6;$&K9=Mca%&isKt$0#Q`fO$k68}P>mLf8X3 zHo$LFOpG``clQ68g#R(;A$LNpt(3XC($44i7|eCI#yWdP=lkt^6_e3xZXDca6VG!n zFJpNc_l8gJ*o>huAIgV!V|01JyALlx@#V!d@u@KS0?H6H!#FzLGxT*&12XIUYX#v54`aMR~9Jw?D}LnPf5?m_2+Ho$MvOdNMnp~L^!F&842lKS4K^Uc_v{JxH% z?>(`6GcVWF^|a6My&F3ROy%GzeXrl88^1R(A0&on*!J4Kbajl@REB=UCRTQa~(C;xUu-IrEtRKJRaT zo91ubKfBLTRw$X^wI}?t!59132jTnG?txL46srV~cTwY>b^QPui zhlfqR#(n_a71ljC*ZMVEjC(QcVz;K1G7B`rSMr#om+Vsi)@7*UzIx1hxo^mroMnV5 z6S!~M3R?Era={Z{F#QHw9_aB#>5&CaJkdYrsNbh43$%ZrWPxK7*trF+958Ky)0G28 
zUZGzO1eq(4*ueMaBF9~Bwaj515a<8!y`3e9{htx>|FNGH2gdPTJ;%q_wtg^uuVZ_V zqt4vD9gp?f`A$sFmDg^pm$;{%*Sh{uJ64Q$ZeU*9i%QoA!Mu#mhw$M8e0Ll30e-sH zgFYMt;M}XbHO3rFoA`DxP5cIJfIxl_}lWpl?8lPunjzOjrzS(7MOAX`_9@Zy`Bm>!k9~R`N{7a z`0Rn8enAd-L}m^VbRueunv;a&UTKCEk~ShOp1NpKky8EH*hZB9vT!Owi+H_zwYbM@J@T z`@ky;W*RY9B@2{1aAO9pP2iOW+8)qziLP&uy&N#-4{e@xzW1ir9C>q#74Jj;8S4x9 z{nE(sR~0_|hhGVs-q!i1tsfNk{1&ch>ofkU;xBR07(<8dK%I=c+zrC|Z%wiH!fwFz zk}BX{exvSwNz>!>x{-%4=SxD)n50Kw6I3zm$!d;X$rvD%ek{_OX4-TIqud~OSPkIcDqg8wkrMo zGy8Av^Z5*#fqjkzk1+MuLeGY z+4^b@)ztYq4^!u7YMq~y@R+r6!w75hmt(EXn<8u)XKmS}X}q;%GiVcO0`@gc#J;9T z?7Q^UmPkv}qO_KhCp3R4+#8sr&hh#14&KS19?ZKwI&I$ha~hzXN!B-C#TXQe{cPy# zIO`ipxW?Jn$5i>PeSGnvwo>oTO##Lw)j^qeq6}D+_LlYS*YP%a+hzN@55fDG&GXq~ zk&)gSWA~zM@7Q*bFTSA9^BpZ(*vcjRPIssU_BA!fJ}9-Rwa}qAvHu3%MLp#kz(-K_ zfR+WWP2kQYsyKm~L$qxIJ4WEn8|2O}$c-G}_v8yB{>O!13!RK_xKrNQI$!2b7~jux zanJAB>zF=dD%aaWf76dY`pfrLL2cyUMC6pYxpcPPaxkysb++Fc1n(;DZrHoD;Snbz z{!g6vv7owcZP_we#`))_)CKPATJvl&_w})PDfOK5?lIouJp^zCYW-4xa(C~ZWfB{duapg z+qW{$eGFB>&p+=%Kb5Va;}TBnfpWl(Yf>(6-l$(S4bary=8GKZdK zk8*$c@4x?QWu&)}_@^Amo?Z=cKKSuDvtB`*A};rxIn68C_eBfek~nu`7AjWB@zb(^ zGIaggVW(Mp@XI%D_*BJu-`X#5Z!v7DD`&hWxt=^39Ixd*<_OAK$i{y!m?WJRajM^HN^6e*gVg@RynoM>AJ{ z1bYxMX=&qYUGK%cqvN%1f_~II>crW7u{Y|lB^e#0-g+}(}A(HAFAFPPWZ`t{g> zP)+Cg_*Yw|D*Vp{hUdPXe*}ACNk&KOk3W9Pb00%hz`h5@FVq-1@&LBLf_bl7zy5kS zR5iIio^|AF_+;+9t(w=dLg2c}HAqtM}hV#IBjD^>F$ zuFhwC&(`_2t?ztm!-spFo1ZxCF}%0V`SAXOSMe8OfSm89{ZDip3}#S`xe;s5dYtPQ*iSaf1vM+ z7qx@zCGK_n!W=^-4;XvdxITa8fWFx9naoF2Yo?fwtmk;tT<>Yg0eiiH$|>~c5e174 zaGpTV6=*qtxo%)&O4MD25C7wU|LN*Eb$m~Lcf;@F+qS;e_Zs({i{acX$9MkRMgH+|I1_uGFy z```2d)@9~T>^+pI%UnIk_Fovk;CPYDv1}yc8_Sn=!x;V*s+wFM-@R*r)c;PNk)3m> z)55Cm)sn$^@pW6KE zkHG<#3m3FN4DitDb}84_bDoa4Nt@ryN6qn@WFFVYf*m)nYX)E94ahFy?nJKpa=I5X z7n{8GyTmW`~m1hLEzYZ`@ zk9MXP^BV6g{g}7;lRC@5yc2^%++X|R#Abfvu!VZoeS25H{%-<|0Mn2)c_&>@bL7sq zvW_0z8>;C%A7_pc{R|bu;Fzg6rbixN{OBY6fE?buZj1q@!Pk-aXN(u`ocl@+FsF-o zoqC<3-ySgKfIojguN7Bvz`M@Khz&UT#BR<&A}|s?<=#T4|0Bl({|UCd!}qqN&Nt%v 
z%Fd^~WPC%<#=d_&4)MSF%kg=A{UPsX43NAhbx}w?Ppk*xoxH02@4)yQ{cy^T{E1%$ z?>^&b3m3GKxL=ss68kpkeU^G(pJN&7xITxHHoyVQj}+9#9Ao+|T2^{Wdt5oboDbZ& zV@{sC!8Hvlmi2IAJt~L8&Dr+Kftu1MbYcXiU%*^(XU@>}3!tmCy~;deHiPhiRb z*aAtQm?`B8A^z{d7^oa66Z@{8AE@*FxL0}k)e#4%X02G>GtcioR0V$_2Do%_XW+lC zPuBzQ^hpieJ35~H86@8B#NK3V?M-?A#tmaaHJ#_iwplt!4Js{@`bPfUS0aFer zf57V#sQ8t#2ZGB1u5pc-QlXII-;RR@*!w|zeB0Ldti|KJEORy>Ytt4qK@9NM{9u8g zb*x`I9I^j8z^f6%b8P$Ub-cv9jdwQ=r~GO1_KIb_^CgEZXvcAoaXR|?>1l6+-m**X zgHr8Mo-_7?9p|xB-VZ{VirB%1b@>x}57qazYlcC-HulCUAq%t|<@iwsuUOU#YZCto zRZXss??x`kEX4hs93$p%K@Pa<5Q5kP0dYd#d_fRfz$*vzdSjhWtbKw?(2Z?3hcZT9oeJ9GV>PxnHuGm-euE#@WO zIj^SU=#019F*N$(jJ=^uMY(8Un|#S(3-wIjemz~bou1Yj*v&6`TiO!DT|veG*>}cJ zWchyfl6H3Ydno|W}2u&?90Do@jy_sLcc&=%14fIl`sn*h4X zlmh|rK{Gbs+5*Hz;lzJ=_(SSB^_-skK3kCZp3?cWrR*4s{4N}1ti9fzF&SR7ch8bs zzvt7v@X(yj%+}CBOfFYSFDM-8I$&Gk5l%kn!?TQ*P1^ZO4~0rT|9gA)JXHS(LbL0w&xJ?1^JHyK}3 z{y6>GT|4H5YC6xyfBEG*;Iu6;{*I6IF_xNmO{nAk``?kV#e)Y5E{FN((eI_NA>$W} zJqFBSD0x61VAswCc^)gq09P*Wqij9Idfi+@zZ_70fnMjO;{)Cpp;|AdVuXG<5EvT> zmNPH|7>S!&vGC!4;semx_PnSW*VpT<~EAlI`t z;tFbBSAF+D=XCnzKyZ7&kpqrTzLO@iy}HsuleUGzQ2xowC>41N-?wN`|n`jT;JX{RQR1dgMqmUc&KH zq{oz{8f(6jRVzNoml)vpyVGs`&seX{?O^G*2lN^*uK6?yUSj0&-ppd zcNf&gd3xHJGH1^5=aruK+V+}1gX3Mt;1&Tp^tt(aNyfX@Z@=a}hb`2zZ^b+v+gZG* zgO}2MzxDat9LF`Z0hH`I8LH_#AK$t)(THDi4#J*?kn+GDYuHR8%m@wL8q zPp!A7?&rKMzZ+3d+potCOTLh?D4#zs=f!E?XNP$iSJ#+V@iy&`(+AkKE484yj_-Bu zfsN7GS^|tOS%i}Nr%kvQc8aWnrM=I1;h}u^4ri!`KYDbJj8`bx$?>!2DqzoOTZ-dO zA7J;cg`uj+^>M}kKVR7&_7eV5xnB2HC_2Rct`z4%j-&@d8}1rB%7r!fURObF zi~+7(*2nfmX?JS--qc0Prg!wbC+4m(?~lK=mwdc=Q)EGP-Qs#$+UeBsOQ5e!qW(62 z>-|v2^?5Bni2t42=N4Sw5d&O-aaUui=O%19D(xuBgVxah&GC00@@EXUY?=gI+iMRv z-y>y!Uk+#)pko8tFQ7eO*a9*(puZ2L<$&Vr&^ZGN8?%r@7#~|W@t-n7**w|B_Dx&g zzy6+Mtk&dkyz9%njm4PfD#-6rG1f&r^O5_|d_=ij!w7G%@eS-xkx)1`$JC z@0uUqvI_;ojqAguoxgN(7v$CEkuq>Uh_WC9{m-x1d+5H8mNHV#TW~BfUhX`lk_D8< zYgP@+ml)ul?=qy0=UflxdL7D@9B}6fRNeq(fjLj0Wq@8=7=*alkCkmnd zuLPg|dEdOA*7=5?&vjS<-`!AgSG^ujt-ZC^+ap(u^W6ot?c6cfh(#&vySOJGYP_@5 zn0N8c*t^8MjxCX=% 
z8&GRQgXa%1Ht3HJ2C)Tf8NfNiX|Yuc9sZ|GBL3~U5ZL=}jL-kgeErQG`uWPH(zxgN zD(ut#BB^iF7UZ|-nWx7&Z-sf=29>(rz`ZlSu5iz|v>SK##?VY(ntJre4|$JYg~I>A z{cENDzYOd46?HdC?wh}PjXtK2;}~Rwd%yZkd9ZwGj{^OcDf`Ybm*XgHCa+9Yc0pUn zh&EQ%k#9l`L9UN82Do@pdsha)=2Q7zhArUu1@^a99XX)+QTqe52eeP1;#F=Aq3zdt z;sWlshf;uzX>m^$KKw^jh8$FDsRLqsjPLtm`^whW_P)lwUVBeJ3^EvTTFQum{5Jis zn4@E?T;pHG?|}UP%$xISCg$C^nf9Mm2|F70!fwnx7u3c$+Oj3@+qh<_Xt`bL-?|KS z+^xfv4}6BhhrcYSu3N{B?t`z(i?mar+ z9%s(a*z0nbtHtki71YKU;O8p`NIjD~%-ivD8Fw>d?keU+A6ok2DCrk$*<5h*7mSP0 zPN!a{zW%RiiE%Gv2lFC1r;s1{x1rwKAAg)c{PIJIe>I1p#?>PWFn*j5*s^&_sA^8@ z_?FF+?Qf?r#{+Y|a@~<$d!*MOn)ZN>5Blc}b$rl`3wYv#K7W8QFPY;4Y|Mcim>&Oh zp~HXllq#6J_s8~4Utih#9_)L*z2WM7g?+BUrC;_s;!^JxXbf=E#tAb1r*)3&&ucx; z`Encgo;fy+`S%p&ofw?*!x>BCde?%C0e-P@ytLt$FYO83Dq3Oe>tifK9XGF0GNh-B z8!%>GklV%$W2OJ^v7791)^b3_QHSJD4DkCUQvWl@L+2TXk^@d`0JeadE6B+wP&ot6 zx*}5sKnLpY4pB!E|E*dG>&+&@YWhZSYTis(<;saALWpXhe}k?8T>?4>R8ZU5sQ6+K z+qw%>_4?~Yaz&zK9-}0Wr77B+uw#@-n*-kHmdK>$> zIqqJwd@1%p^a+&gD!3RR^DilL1Lr06c&j`m%A}L|@VAwJ{BgU=FP6C;dd-1t3&0*w z>yT}GK+Y3n=MykS7(6cE$N+y1v5k#{_-6_s{+~chgma>~(fNAZfPHD}YwUZ!v7^`J z@O!rf`E7dU=~2&UoulT|H4n1X^XYny&5W~iU&qi~|5?S_-j{lmYtRd7<61qoMLDC0 z*w_1J8SFTZg(Av?eux3){crJvdiF!;JI7qjKWM*6k2Op64k(wI!<*NQF+k2Is2oq7 z(;Z33bxOd~`Cg$Ng(|?wvd}`LS z=evUYj=@pH(N-+$Yh#|}=@MmxzTcD=ePleLAaj`c_n0|W#6QQ=orlzN7A5l$5J%ku z+dZ!vV}SfShiS+wroG45qO%6k?-Sq~#O`-VRBXWX33Pl)SsTRm@b+>FL59lqC9P#gVo z@`b`be0lJki+hcETgOXX9|H4E49@k%$Y+CN`yrpvCF*ZY`ndkh zlE>WNfpUXB!0ufIH-Etx;F?u~6#n&GCCAq%3+N+!y(KnZUjJ8HqGi6Pv*tj`09y_~ z2Dq^Sd+oQ35eByfsJ9dk2igMe8e`*|Lx>3wBZR~M|7@3NgUEf6Fe^vv2-#lEnYYuJqOLzWA;R^4s)WXQJ?L`}0!I=N9vf zt(&;lJ~i#>-qzWio)7h7pYc7CKelhrxiS~ama$sr(?@f^x1qjGTae$TXPzE?f2HrWeXn)BF+Xn1 zt+|-jcxP$)(6&tt*^reF@vBfw&b4~P$12Dfmp+v9>*_c6^*MdqS1z}Iv$Xdq4?bV{ z5x!+waNjXt-x)u3=OB%-wrwri1oWeb-MnrmPyPa%tUKbp#+nYUIg)D+?R8(C*Z}2# z`X)rcTw%~Tf?)H6@PDBXnKL+lPJR3hfWnIZMnUwx!o45+rp}kxmpWgKvy1yy(k@eL zL7U;*^vQXB{UPt?-tf$ti!0aPb1i5~i{I-isEskewQGjkx=8b#&Zp6~eGYowj;HBZoA$*i 
zSHAta0ROfwVu0&1Kau$VeC0sMg43ao?CV+DWXlW64cZmof48`xhF!*V7{8Qx2hLoi zJ>G_`MIYfCd^f%Z@!uqs@Z%w&OAfvdav)qG1qdJ`AFUIw@E(DpBNx)G3t2c6*w}0 z>pkQeWLpk6J^^`9%K?{nTo{w*u7&gVNa{WPu>qrF&A+xJ)_PW&sr^JAXa*SJ@9eJH$3|Jm`QXk^73$e{?M##Ha6av{X{C7&iZ3k!>p#DaR`<~)N%iH z?r$X*GFK1Dm%qoIUEgW{2hT^)@1eJ}%ZS~)Zj1p^#w!`XH6D}!+1mn6Ou%CcDE?J) zK-&Rk{;(|vq%WYbK^<8r@gMU%@UOn3VERE?=esdJ=2gJYcXYlRa7$4Mg1Ue=_pTMvM;M0=V)!JfjOhCrKG4DMi@s+}-|6^W&9c0)&@cA|N zbsnFd+jFr`Ki~EB19iS$hp)b|LtB7r#tLd<43N1xO8-;Wo4GVv*9XJA((@{Q=8wH8 zpPD|wHhlA@pf=_K(oR>H9*WXWlmWqx@pvdwvVd{Gz27e@sIDWXa}szQX^*!aXFc|= z9AGZh)^8H?r5}uG(B{|q-joAM2EZP0YytQM9)AF_0bgv;%^T9TfIn~0D+3qGqWzppD}`;@2lH#Y5W?5S^X@LOB@dpq>iI?CUlLr)as-=`#B zNL$orThBuunlW-6E0b||PyEg6gRfmPB42XYLScyC8lt_-#L(Yfj zoE~#dAMrammzs^e?|xg8*f+nqqt@inCM(Eq)AR355vR%y)j2WRr+4SkgJGWWbvLGF z`qavvrk(JA?R^WlWL0%$MKl^2^H8HkNg`&+e>Htws0L&}?VFO?b((9f$qE_chL!xXN;`xzEpMaNs*KI}KijIC@q$Xn@cd;cwEl7;s(v?HV7Wz;sLD}3iSfw=e#C>*jMcU z_B<7AeEX=E&7c22^0zqvRZwqWBWUt1CVss#XC(}w?j1^-x|DOT(L@_D)aKn$=_ zxNE(4)a3%_@qEX6MA!q^f{*C;g1t#rLIXrD5jeM=L+Sa-W)FZ1FebpY2p)iCnv`(? 
z(*#o8U|BDe<_eGptQH?RkN7`K@&5+3b6E|)VjuNWi1WL#KF%XGKKS+&>*q5#QeU3U zx&zmp)eU_nz<<0B#cxAb&vN7H>-#YeeXWec;p0|9ziNGU%o@V~Tz~#&C#-!n*NJ>* zxy)|Z{j?u=%?%fSRqXBqt$)4j`>((LH0AuMPKkAi?%dx%);wx{fo^YXd#|A&3+e@ku@_D(*h)bghMUOTtf z(f3v31MU;w$M;}8d8sEC_uW3j+4Sf$g}(t`KhM|0M1D`)cl~?~^Rxlr<9Ph88SZJ0 zK4xeD>(AdWF4!FZd-vx)TaoO*g%^BoMfT%v0T;8z!22zZl6WfAAK`nTEym-}2Up@8 z!VY6LgU)M1uc7NV?s5UA3n&+$4Pc(o>w;yS0OJA7v3P^*0rrYH#Qz_Qf4BPR*8&phyzIJbXScWRy)tVSI92>d%!B6_xDVXEg?Yr!c?|v;d5uqvZpUc##hz(y86D4^-Y(7 z-%r@b_XIwGd0n%*Jt(ufpfMW$iFq7@zgJ%0<@WINCEm%O$*J+2>#jL*R&QPVBf--R z&-=@9n$>4F_7nVndZzgc=$Gh|H2=@@PFs-d0P=z>rK4VX<&hfqqaT2MpXz6n3k1%Y z%_DR?pqd|mCh#`#g;?k%q$VKxw_xK`iHDCl`c-p>|HBXcQ~BKZtkC%r=TErDc}cN; zK3AWyKJ2XL`+g2b=*17;6MXJ0H9%-x;G7=wsGF(z`WEK%xY{^2>;dZZRzkmO84=VO zgX05B)2;p0AHpZr>;~TZwO1eC`nM}^|3flw@S6eujIFwM05Je;+)A9on{Pfx@Q?ZF zq2D;p9GJuf+6@4tWh3gD&w5}R&3 zGvNOK!Sl4MWi$5I*PONO0yhHZhn>|rMRKkfdkG&9T>!S|zWcV-PrwxSLIX7Hr(R=R zYcU@1nt+c7Vs60ffaM0v2KYLG$OA&(0QkVX;{WthO!olqNq!%B{kq@x*vH=0N$1P? zzG2_@v)~z=#`|C#@0X#3w9^rN7Y=G!a zNfW>~;QzIA=l_p5^v@(Gf_jJ&`<5@&+@A0x(@)6vt)?gC z_C$w?vA&Se;MRu^`#j6Nbup_0IW%5}$=5SaE_w8rJENb+k+}DmPkb7D9M74x zj&{+72MT_%zNj@pIHtwNCLK%L0h{oJna%^nc|pv@^X`wH4^S%4aeT zfNP7_8RG$G2fQW_xd3v6P7|1r8-O-&?04o6|A+l~z&|-Z{Qh#VFZkE_PsBd^z%id5 zJ)s|vSn=RxfB&r)cu+JzXpHA&njWxEem@EGz`x*Ld}bMg1OJ;hU()*5%f2761?r3p z`(M&{19gFjJJK?{VE1=_PWHpw2(brZzgEKUwmu~07$jR;t7SGm0Bv@qv9P@!kE1&*?#s#zN1AtGv z^711C{tpt&*Os>hc-;T)e@?am_T<6~K0n^FUf|ZzM-FMw% z9w6{~$wgltunv;vBpcy>HrtWx0Y2X>xeM$$-ZSuz^PXy$tsa^20KOOGI9EgilzJ>) zTXsw{N6hymJK%GKNfV$RV08kqe<;QRsYc**@sY=^dfnXl|0557xy1JPoHea4`TS0P zFVys?-;dbG`KI6Zb9fB<)|?nL|zeOo(Kj1#Yewx9B-q19I$9h8Hhi5sL3i<-Dfa}kE{t(=!c$wN8_%mbi9f2X@ z^LxV8VFSPghO7;e^UxJ}LCzgu$L_lGTE6^Dj@QVZpg7-FgJ?PnawfK@!|ba$LaRqw zz6OYX0`N(jGr+k|v;(zxAjAT4{TdfwZXom!G8Tw+LZ%6>mEM8lkKJ?r@PEXsrAESP zp%DA9jvuuWU7epAzvNc5?vj2Vv3~OViSO^D-wWq)U_J!su^D@~Z*mx=N3Y=?b#uTE9z$+!mYjv}*&lD;e5qjn8^!^|mcgF2SKUtF{a-CPQN%^F zyutsgt<~CP)*~~|!n{l312j%SQvz$%uuXA8_(@kedt{k;=Ux+z#&+WahC+5{}20r~= 
z!T%k1EPo$s|GtL)46PhC4E%oMhJzamb^(tS_CdG0P6&DO)&KkHh2JL z4M|LV!m)20zhTeOc(>P##;<#AeeJRPX#Cvgj@FDuqZiBiMOn{~wX>|9WX^@ZX)oAqq^@_N7f?pxFknjJV$>uVzJ1I|6>@w4XGgafQZ&1(66ruiMUfj5g^ zMBJ=;fM6Z@wvB#mkl($12-?A{b$@=2tFAo8Ft7E?L0?rH;Os!`Yxep;(gsrfP+2?F zXCv(T!Znp_0PO?f0onk>1C9J(FAsQ+`pCJ&|B=8yw9i@}599rc-(Lpy5&NT81e(;2 z=F`X-ZN2M8(VtII?4&xGZtTC|;;*T_*b%;y7@Bm%EN^gT@BzhoRT~f;KRXgBg0&fF2|1i}I`1&F1?JjvjvH^Y$Y0?BFE_Lxh z$_*qvpq>}7SOA>(gyY^ce&g%s%>N7i*GWADIe+H&#U@tc{U-J~_m!N#^!xnItVbL* zIvTa6M<3yie(%`F3^pfeE0#+&8xZV3853k&An5{m?NFZ$z;$H41l-bQJ>r`4 zJYYyHfOx?4K(7bbTruc@e=YrO?-TsLdHlvX`#C(H*6(hXv*}ydS=_(!X8^O;EM8(wGIOck4zT7T8gkejb3D!4r;q%N*mM@jmJ!8~Oc7*pK-=VBd7UG!sg`r=ash zE+;X-EN9cN{8heq#nu_P>#iF#o=Z6ti>FvS*^UWwe!+y7Yo zklFyR8J2p0=vR`vp42Y@Eo?sV|2p~XsOJOjmyZ3khn}Cki|+#8i}`&&gA3dr_F&et z>08(9;{KIA1Gh@80h%Dt6SNwEt{#x*2Gf z_t;0?2j3&?I5^{u)&N)b%3rxxgEl$KIsGg5PP?yk-8Cm^jRf;2T6e7dN!EXmbXLR# z$io8vr9S9$g=Kyq)eo080M}892gn1Af7Wt`bsJFT1xycMjUasFiO2uN{Nev7;2$;7 z4bES!>tk+TX6*~VV!WSb?mNFsVK(4w_6=zqP>%)RBj*$Udr15P z?Sopsuj{MT_7MAdO<&6GuVnql8&L$Eg=_sw#q*25F&^>?*R>Ju*8c$m~0 ztgk=eVS4YDEmvp{qSYx|4RU4!ikQIcK+*@bx0+la%Mm1PFz+MC=L-=3bshj;lwtv^ z4=_ET%LBl1StA4-P!lw4oOHt5=g|MZUOqGPAN^_KEJ%fG?gzpUm^>yLdO!q4TmNI9m~B_LZ%BY)ds}e zU~hfUUgMKac<0>V|Cpoql)5OyJg{{w>`NbCr=FiWe+~O-<~}+9`txlrl=X*VCQqvU z!u2yN-D{KdjKBxoAsqGLe7jS63$DEUD8c?QvQ~xTF~VPNc-X#e=#|$=|Bd>6)FEf} z$LK?#9jL?v+3cboF5uz;vjK<+KIeLhhsFZnrT>`K2z7aY^{jX22ka$$cyHMwcTVxY z4lz%TeXH-WUdkHwTlsx&gNc3kc*gsP;|%*2+gZQJfwDj4oin-4H{W)#*2|+-e#_>| zX7WFl`=Y4jyYAZ6ih0blKpzd3*@0D}k6g3de=(E42R}jo!KJeQH<$J!#2y6114T^W z;sMy@q!0Gl0H+JOIswxJzypTb0Mi0|jZoe%VAuOuv0s2%fW*TRBcFWYU(TWbe}nLQ z`<&Lpo9Fqm9^TNu+t&GqaDMaqX(rDBn)96%*G11B{1fzPT%(w)@o=l@#ate6{;Q>4 zpV;?5UnMpH^Ah22A1*h{??jA(^Lp9FL$xo7=Q#EyWp<#43y405oPhQUIL?o2B=LUa z0J>PmUaj;w(El$uFYV>%@6YQ4Brg#6DE4)NWsP9c1JO5h@`>-7L;SxGKAvkFbH8tKn$3d(_GKOymeziXT^Fqfd|)N^g{@)Z?z=b1HTunPjUJ{p z;3nB;3%Rz3!)^PvTO;PtuTxvF1&9k!=f4s&GnV_gQLlmCeZ&4?nzMinF#MaHH9p|` 
z5fKlN3rJ70(*{v9i24D{Cyq;a8@L-5`ZLnHNPdt|^M%fqyuj)?#Q&;e-YBs>`Xh6{4_(xHcqPZLejokUE$nyWeb4zb z?8h1VhJCa7pC4a4qrG*}Uxb|6Wik&C{b^X2%C!od-Mw|wa-p!SzsGD6@P7Dw!~OAc zEEjA7?gKYirMR2r{)v-55V+H(P3Ne;=lR5}MLowj&lN79e4xPvUAUVYME&7J>WbwSN-P*p6-%e{d=;F>f|U>Y0XxTOihSx>h3O}wmmYDZiw-5v4cO1F+yMG?ur`ZeR1DII={d5-g3!Dr++OVNGM>tnF zcUNG^@3xFf85amPAjSmH26HY@iw6+51G~$;LD#`GW%;JOZ;01x(gO0{VfG4ia{}12 zY9I0c@0mONANw8PAAHBp>PXw;(eLB+u{6H-;w4-*nDk#XQ}~F z_t`AkPt=1p=-953<2X{wiD4-xR5<=lWH z)FdFkk8_4|qvt1_qqZGjOyKQ+*#OKg%W?#%V1K@jH{_P>LN!W)3^(M?0m~XKd->1GGdTD=MFR`Eee$e`z&R_HU>=)k< zdd0{Gct7&P@C~-Cj>KwAgPyEh5;4eB(nBvjXFKeDFcNA2tIR#C&7a9|7m^_lEn`%KcF@ z3tqD1b&dRO$9{-C2;AcO>o8mQe5rf7O8qLdM$G6ztYGgM?;GzI>we|l&=t{(fjv&Z zFK8Q~JL)+K=LGlbQZ5kqK$Qzn7oa}a&J*JM#d%(d8*&rCdR`x3Hh}oYG4TMa4bm8w zHo(mdpcX!__&*QYXVCkze!eR9TQNW5eT(_Ke&1sL((eQ3<-A?n??Up$T1yI?W3FwE zarhEq9Qt$A_avTYT$*qXzh7eCVd+W#$<^3wQ)RCu)wY z2FceVSUr;3fnWn*8_f=c^BTBS9upK?AZml22OMHEOr>uC=N&d_xi|1c_Z6)>T_qjE9_?aCQ3AOrJyi z9b@qDv;5x1dce5gPw|eUTgU_Ocm9kcf97M=W5<4!wt@D5HX&&TY99h$;OD$XKC`ZH z9ug<=_CU^Gv;!3`Am^RB0KOB`2Yo!?^MoNkAT}O3+LfrwfR2#rhRRxDvjK?*__@NY zWlwq_;{fCb=M(>bA^87zz`nzO)%RPiRnq!vHT@3zsND`)fA04!=7;8=xP1fr>i1b+ zN}uns5007pe8YXfe&DL{dd+YTzh*u@VqdR+%e5IM0$!XZ7kPKpuh%gS>=MuLw=1*K zYew5}?WqN)DC|KNGX(uWxkA@2u!-rKJt#dcZeUL<4-_ z{(F>D`W_S00zNCefSO>^0yHNO_a0zvP_O}KnHB&|;MCRcn>+u1+$xX%dW}!!_jPuU z#Zujv-(mkk*7xK(f7){+VSi)8@28j^_$T(ocX+-Z^7+Q~MUM_VRQZ04^AXF_=kxjn z?mf>BelPQf@a6L!p_P0sc2ksO0vsfRRzAoTC=lfCH15eGgy^`x2?#X|Z=PSP_?umJi`>u~C zU(fIkoH}en8?d!T<8Z{{OJ97g!gkO$WXBX=G#{%cX7U4cWvyO&$4T{^!M=fA%<6<54>{$Cjs*~2A^t}cmB8V z>1}MU72L~`Yjm20YdSt3f3Hhyh}7ly|K)8?#~zUn;21ws-40mIK@k_k`ecinJRj)T z0K{IMdI7W9Sv(MI0Bk<;qvP??^R2WU$OEsJ8534_mimagIe~=#zBpih@&69-|Cal# z)%cMgrI?@C$3D+xeP8nXsPALUU&X%Ud)itb`F`E+=QFa zy~9Zr_sRcud^_+8A1_CbUE~dxyX-a3NcMoXp~ESRCq$bNd4baoI(8s%p1!ys^dxk( z0j~=P{-X`>dBU6rq`Kj6DWAJ%yX07<{-AeBtswHSSje}Q3-wU8|Darx-~if72z{Dy zfUgahWCP4Eo_5OLivNGiTod-xcQ^(|+g^L@nljPq+;pSYJ}&EHosZ@j+B=ZQnbd<*Ai zY7P#0xFxR}HF*&)#ePz%g+ 
z0yQlFKJv6v{%%h3k2*-s?MA*&ey`;EdiB0)%+Gi~&++Hj5BQJuJBJkUzU7UQ#viU7 zhyCR9%UGYdw|pMBWz_VmT))fnGmal(`OMGv;vE>xG0wdEa*y6_;I-oK9lPM{L1qgQ zZ%KCGv>4me?Evcyi}{d zxSJ`^uLW?HfaC;=e8BwT|DEFhf&Z58XUtE{kDPx}%n$5$^L&PV)bTOL@BO~Re$@C) zGmd`0sq-3xhHE zeXJeo{2uo6tKN^ljVsmi@c%_ypzkH;An|=q#Rq~u!1INCp)S~gTAsi-fYuXbwSzt$ z@Hs-n1GMw#BeePe-#e802kUkq%@N|BBi;@OFO3`kdZ5**PzxBA6a1w3|M$%nXMjaWa+=>z-birb8>41bwc2Dv;j39U|b-x0m}Dt&Tm}6?>FN28%j2y#sP4B z>$Tp@2YhmT`l)k@|MTGgJ^u4LA7X!au8+Lm#r$C}dhhoW_KAPi^{}qbVPE)S)cGXd zPn@6is$IWN-^84rVc%=|f$zr{KV0`E?jy(dzBJ(~$D7Aw4dcLcv7|=;eFB44_P4dy z$9=Q~a&-0};{`!GxKWPA37m}!pFhO~-VQKVK#kDH1c3`=nn0N&w0I!T4>t6Gm=|Cy z5Nd^0FY>W~i~GgjpsqCSH-uOKD`-=J2haxeaR9FafFqxN>Ide||DSN&yTtB=c*n>5 z>ienrK`V87f9&Z?dLPblU++%z{aVwX=lHAneP7q(vER}867P?BJ-Plo_9O1WUt`T) zxTZ6%Z!xjw`L-vx?F()`-*E47m2>nGlL70B?{fmiaXd%+Y|v|iRsP<0Kc1UbVK+Lw zqQ@2#T0uQFn7{=zCTPV2Lp6a`EI{4Q_Yapd1WFE&W{T1VWP1z5dBSB*&}P(HAA8CN z&;}qkIPduH&Gk>r^&#Hsi}^k0_c4E8j?er;^!tka(65`-^ib#XvA)EnWgU;_`yTg! z>(_L>$nyioPq=US`2vpx&JEY+3g*L-K4bJ6ch`AxAC`Lzf5YGT_r>dH974H*w?|!D z5IKR%C8!;UaY4`trJvCFz$7k^v_YRA=++4Y9uRW_g$+xC?H{cYaQJU{$@f1W@2eTV<3^%MKT`=j26 zI1s#l680;8-^cn<-}f4yT)UxuCvjWW(FOk7iSJ8)U&DRwKPd4I%yP3t!eNVWb<=XW)I(eIZUzr~*2SfBA~iG6Z? 
z;6GtM`umW#Yy10zdEhPaci=Cq=)Wi9dIJtyM}`W{k@5qUvr2bhNlIziPAlw1Hg zk&Yb*=Q+j$$p#=6OLYR&X-v1VzN|7YV0xfxf`JE2%n306FEyoR1N=VZ-Ufs{hhsj# zY64QPFgQTq0Z|LIx#DY1+j}nk|N8S2-zoL}uI87ppJIOK{ieJ6I-lhGk+aEr`16?G za(u`EG0zt{zpv+w*th&rjQ2CY&w3uj`VRY^?`Qg+>EIUUL*FZPJ^Fj{=p6U-wTk)R z-zVV|b2+#$(>q@mgx_)G&-NVr4W7%twPW43B%Tmr24@ckrWzpT1*rw3+JJ68ppOH9BhM%P-!0nS zXT<-@-2SNfvBsCVe)#?Ae1DF8!++k>L#@A+=d-yIxz3;R%605Bzt@lbGR}`azl`yz zqpPM?!+pS36Zbh*4d=wQJIcPFigg@ytq(`{9JK|t-zVFIXbZ?of-RV22L^Egi9E_LJ}L=K3P`U5yXEC)W02{=V9B#{ACjTOVJ6{Q-VI z*Z9LV7<^NV^~LW;>{se`BG*^kJAWUv`(eJ`FyHa%5#zviTG;CwzvGye>}PQcm)-CG zCjCr%5snqUP;!x|7rN)n_-<$eP=lGr0>K8zcU;B- zX@-Esg<2n|wLyJafaL>l4VGF!9|vGil5$Y*vEz{p{%l z_lNDQ)cLqs9ufbH`8%__a=nk7ADUm}{V~ty^ZPBGzvTR7tS{H5V!smSXFk8e^#%LH zNk7N;xDS3h`f`tL{7h_zBlfSseY%g$MeZm+^ZxR0;_u0pm|gH3!E=K=PKa7T-~`46 zM8}vE6U4d!i3zBiAf5{OfxM3ZcG&b+trv_uAn5@qFHrOiRr&^F91!XQ{aoS11AJY8 z*8=joplUv#-~e6=1V>(b`iJJw{~P|xxWBK?k9~bL%`al#`TjQVr`B(Je;50r4?Ev) zoL_RvwRk_~_2rt&{62NQ3D_^We&+MZ^K%TrwhTfeK%!K`rq z-12vOEWCEr9=P`;+@Pixs2zxL0qY7vtfO&Hg$rah!0Z5PCBe0L{;h{VzK^I0CLU1B z4ff{*$N}W&`UJW;LByJ712#;_0az1M;{fxD{}Yb~{xKJc_5JYwE#HrPAM1Rm`PqK% z#c-Rpk7AKEL$$ zz=m+;oJX5qHGNJn=Hhna@bu+48it8o9D(o7%J#U!o_Abf@4MRH1Ute15$yr_M6g4V z7ibJp=L9h>h}u9UCh+=TtQ{(NfN}vJ3xJCy8{lIBvjKUHP|XI^;(%lWpar(-gCz!t z`G9U+(08>ikXisZZaW_cF0l6W56_+dKk@kY7Wj8{epBZAI@lM@zt8s*`@WAi>V47g z+x4NozZd)9nudRk_e0-~a#WwwpMd==kJrIH{J+C~#_yevk363HR0h09j1${-v^_5Q z^CgP?&9}*3n6h98{Nu?k;Q3uUl59bX7kExUeKPTy6c?~A5nRCO0>K94+92}-Yhuqt zH9y$o0cPLXx0P%F^gwEY>=l6j4;&!G0nn484RCu3)Ov!OS|H}tRp$hl4lv*N-%oR% z&A4CZcrxy1ou9>1oZa1>;Wq^PE#9AMdk`NE!oKJHg8k_C3+zXYk9luu`_b=17lY4F zm@wbp#eMYkjM1a7FK`)sIWTLuCcf*(4T^o;|HMBZ`=La;V74K*2lD!nC-~SQ*@8?v z3^|1|E=V>2=OV-e9bF*hNs&MCHG>ul82{2(prQ$y9*7!&nie2)$b%kO>lN}d#F!H# z{vMJc9luYXW<1w4zAo3#;`^$%VOvS$!}w|$TJSb0n~q019!hny|w=mg{g-~zeLOWHu&2Ed*w7sxaL zXe~(-)Ea@B4M2T1`-el%P^Jfl8RK0WKrO(or_u(XugvOAx>_LSf>^CeyC$fn10Y6T zyXNobkpBb!$b0&@AHDt5{H?|}#r;0tXZ?N2_Xq98@%~z!PtN;oc2}48Q|lLQnDKsS 
z{m_)b_Y2OCTr_oljr9ZmJ@$QW&ue_u9DeZm0r$DT@5ep;wC!P5Vz%7JEZ@`2_BXp! zaF695%YQEIhyEV0Z9vlf1*Y;eNB+~f5HD5bB%xQ--Fo*?OGrD{>gQIjh2_xv%bg0`zwll=JJAX)ZDy}?_FH)xP1%rIfjYrzT;&9`+d*JZ3DSN zvEzsP+g`2dvivfft{#W}i9q|L+rDzrW`ES?6E+e#`ek_xHZva;wJs z%XvP)zQcdD&L_<5)cRiQ1L^So==Y_zpS(Zj`G|eY>dF1S*ZJw6tS3Hjejn?1{62hs z!ad^Un1k2&KKRmrdvNZA`{d{QFt2-+V!zUIVgFLzce{Js-!G5fuB&DnJU>Y9CBDCC ztH=i!7Zi4YpI>W@Q%s;a0-qm)<5g}{F;=QmA%AGj84{2lg%n;}<|&F^!*zsvin_q$$R zVjn((oL}O}G^49h*H`8I-B{o2d@Ph%f`|irEU?baAw(|);sA>U z5)UYBK;!|=1{7KVwLn}ewRz$(A5hE-sn!IAeBclrfHi@{#u;lqGI##p@Q=Kw;op3} z#Qt&r9@hGGeLr+n%dxcM{@z?adB1Qx`u=*JKk)vJ?{|LR=l6w&PV4s#`{MUw?T%nS z3fZ2j9J}B*g#sxVS(DP8? z0Z9)yH_RY$HlUIlw7fvj14C^~J6v#xUY`JapD-tJyc zf9C%Q#|9^e`aqUTOnJjtL!?|F=mKg7OcUXGn-mLJPC#-4nipv3fymh^pMwoR9N_8$ z3l3mfAhiJN56R*H+JHg}ta1R(2%jPb02erO?L7MbgnxX-dhE|yzuFw{w(s{dJ*fG| zIzRgUsQFn9NS5nwV!s>n$Nt@*N83!U6z^MIPdC43^*rYH6ZYZzL!DlU{g9(e*e^JK zhI@z6D$cJO-?61{U7bC?B|I0$j@^Jg$!tPy2e!mm#JEan2Xc)taDf%H0ptNG4k&sC zvpAs42b8_SJ`S*10h}LfIsp5FOouA9K;{8(%yR*5R^WQm0T}}r{y#d0{2%x~psM-P z_qTPwG}EWA&M*3Y*Wc6h{ZaFS?+<-FLwSG1zVLtg{vyXm|CQJAu|LOf|2^vD5&IJ7 z$J{(~cQu|5%(XG!!TKG7?`hW-*o0&k%oY^(z`qCDBaI7uY~bv`ZNg_HCO~bH+JW%- zQ=TyK0IvtsV*%J)=z*pQvNnMDM_!ONAddsAMtCv@fL5I90j3;$w&srd^po9z3U>z|h7jP+xT-@rZbMa(%rpW(fM^E>7Kp$oPG&&_N`)gFXcf$u54N3l;?TtMBz>w~Nt z@Y*1BP@f~56bnqU0jX~|vjGJMsMQ8mwLs|ADcV=fAWccI|lw+ zaes&VclPY9)cH^4`+dF-GyEoKe$Mw#iur3by&n51-Y@HU^1PnK`i}2MJ%1Y9*KiJe z+roa8eGAXS<9vR{Zj`(LzpGB*V*?kP1Uryw30yo7w82g+Ao-A6MH66dz+wTj0Vx(J zY(NnQFei9@D-Iz3Lrnnp6tUi*Tn8Zjp(&O739=@*$^kelaEcfJG4g!k|L=P7Z`>b# zKkVfZ@gMU2s`+EK4`=z(_gjs&elOJgsrgAg*OGjH(br?zGIM#3v$+53~KhyYXxDRoD$KNMzo_sxVU&Hxz&d2)rv2P7$b#`-oiTCGvAL4z+{M7ma@9)&{CC<;7+3?cv`N97Zf5|^P z+;^~kz2JYTE7*--3p%`@X$KnGKvp+keMf2sXafpOuv;TU9$@uBHc!ac2C+}ba)Q(Y zsRgDuAoO*V`9O&cksFFJfbR|H)CK2#L4!Cz=nDeh#k}x526(jKU*Z78K5Bj({^|QG z_^0lVbF8sHxqqe3pPD~39Da{kzEA9ZJ?=-HPu|n3zCUVy@ck**r@TLl`F)O`^}Md8 zPq82Bcsucah?mt@ChW%;zr=lv>B;B4e=o6qgWzA*T>}@M8<&4A+5#UV1WthW6W*ih 
zhJ9S1W&;HOv9B<406BMz17J_%OyXbzdTNC8d;sbKhG~JE8OZC#bih^7|7#2&b*1n* z?Ofnw4FG&M?HAxUfW*k_&iIEpjQ`*7{C|#r&h%!!)cX8dn!lgtn>2s$GOgJp_m9|5 zzTbL4UEDvx_Y?b3^LM;I#r%nrc4B^sW4kqdmgDQj`sB%i{lND%5AQH#KA#*s;6D0# zkJB2?ZxsC7+8U1dz4fy_*S(I~1O5+ZYob;_JCI@%+OGI{d_2&!0X{bf+lib&(gMr| zkOSDhMKulpu2#kYw?+qFh0}Rpu;roghAmabAk;2yjv0>nii1TfJt!x;y0NaqB=mMCLq%SYjZ-V1Gsu1<^qL(rg_0>K2Uc) z2=jntEs%==WHtyi@^xo?eEg<8=M?|Q0Sw{(m7IT_`wJJN?w8Hh$iAW5oUhpC5UB z^6xp`HwpgJy4(Hi@6UfO?1i%@$rg|g;0Qawyn)$(ZamOu1E2*2J%Cz(*8^E27Yl*@SoC`tGea+qzqheT&cQ*og4jSiwwuCHxmN{pkC__p*8)*WbrFzu4QC=lklK zpXvQI-fw>2^M2P)ORdl6_gdHot`z^l=c>=I;~xIqR*Cgn1p7mm`&+RIi5t)k+!XYL zV1uF^2=6oF0x6EkY6YAg;OhfAdBLa$LJO$p1g%fV=L3ZY6g2_WTma{Vl`%k(2f#HI z_YGklfI47S4+wp#XHKwa+ueQu#sJ{|bL#(3BL1OuP9OWHncfy}#XbB=-A{XbIm;XV z|M)QbjdlLOe!xF@zxV?`$CJ8WJ>NIU_lG=xQPW@7`w;Il=J$L*`F*X&>EIriGwsfJ zzWMqb^Ie>86YOu>Heum89UBpBL1qW=KgAZrxPW$mF@fm9=M1BHZ*Dh zp$D!z^AqDY%^Ck+?eO2K_s?hi4CDUoKEG7!Pu;(t`&+)hi~W%E>(%_2>kGUeIc&%K zeGgyK`vmu~H@Aj;ukTlUe$?|yUvIcqtOqO`=5H134_VGW@b5EwKwH500ydzq15uAG zHGzr^i2a13fkYeNY6PhTWE>#n1E>Qa7VB~V;6K=aqAobg1(Y0Mhr|H&et`ML|3R7d zmumj}jIWgUBlqvc9X|GzBXU)1^7Y+jqCpEN(y`waV>(c%1l z(D_q-Zvys#H^qOzzQ*$f?$ysHyl)rmZ{K#?fR+9x*^34z$oPQSftWX(WCNr=z+wU6 z0WJ>kHh^_OIFHH$oDJyH0c!a`)&p2waF`RK902_Qp(h~A1yTnz4aw>OdSZY$BhY$+ z83Rn02e5ta=hy#x{QG{NI{t^{{kVrG`ZYPpy=9^Yb?Z>S4_F4WZh==+6VQuFK7 z_-1oFyxw2b_hm7^VE{V+5bUd;4Y-e*Uci3N?G3|Syl)o#4_UdL$m{^@O5_9N0@Mde z8vuRK=LU+rfUgmv4Tv1T^$T_IkGi0tI)HJ2EEgcN!qorcyRXIowfP{87@+72=3D@3 zfYagsi>i^$C;s;m-2?cy*q^$8XTEQi_bcc7q2@Q${7lXL8TZ59Gxpa$AIJTv`*k#b z(fwV%zpMGDd|y-Zi`X~cpKJZ5mnWS+=lqDXyFUMB#bS!-;p0m`@9}=SJT7Z{IF>)- zcU}H(k}b&WK!^=YPZT>4^97|1@Ol99gJuJm+srutVxl|_Ff9tYPDe){dfMPZvYk?UTuRHS}=P>@SsP(M74titKWO|O z`&*Hp^FALqj9foqHTijmdEkENinaiM4||Ya+uMPZPk;^ZJb<-Bm` zd;swuYXa)I0IdgLE>ONZtJe-1fYt&-njJ*8DN{Pc=W#{!888`F`0;nss61{;K&yGot?|{+;hn+`onW zO1`h*`<3^Hy1vZ!EB1rVA7lLj`w{ov=hJuBv3}3?E#rIck%ePr!9Ltmvk@_FFgxIF 
zKo$>VHo)Zwvp9e}An1V_2juyH4*rW?Va^Py_XryYuo(fNUjx|pHNmZVKiuQbM^C-~(sX~hZs0$ZOCsmz>-;C>`;+fadVi|%o5=eseqVEY-tPzOC%zwWU%_s~ zJaB){9X%_z4IQqaoPhSl+krw8$ZP<3fX@xOoIoun=r}-43$%P-=n>9xft(?ZJb={$ z4ao%#iUDFC&^SOD1C)8d%VZ9{%>}5;1!4?valH;;{_*en{RZGapYxMqf2;Ww?XT1C zGr8u6+#lb)`hV8^h=0g({`CD--LK~ROWn`c`FXwHxPR{Z!TXDtKl**q`K#C`C$D2a z;of8Z&hfqX*t#bi`8$reeMt5ovjx!(v~7T|7fLw+#sahft}Z}wAaRxuZ9volSr-T$ zAj}P8E`U0K)CFg`zzH!xnFr8%VCDhn|8+J<$^+y*fz!+cnA{IY4d79U|2XR-<^G1o z{=`4IKfa@^*S8h>_x1VqaQ{BtPxsu7{XKr3N6YtHollqd=e0e=K4NXg`Y9d<_H(Y^ z#eKm1y>h(w`#lSfC)<#0fyD~jC0>YjV1NxUO|Yg1690`jpo9OE3-Gf;%NU^B7v$>! zD;fZ5g0&vl=K|)DZKa<}yAW(f z#tC2t^8c5#LDmg07HH^!u1>h41(E|aYJyTd05|~W1Thzg`2dLnWHA7Bz$t5iI3t)E zfXxL#9He^KWX<9_j(?Agh9e;fOy&sRV0d_8a;*L_>xRbAfT+a^?bhVgTYF-(k`LD;xm+zflK( zYe)6t*b`_q07(O63;_R+Rn7#Nwim#B<5~0Q|4)Yh|4QNims0oR`+YIvmvuiYkAKzv zME6hHpU?aC)%qgtXY7xhw)ubJKa2ab+WtE4H{b7Tdh&R`5BnMBORV2_x8R?vum!z}=WLf~?fR+~MbpSU$4@{0S@+9*J+0@5_)pwFY5zs+FZmnw|IGVq%}?Iz zuX#W8hgP*eU_aITY3wi8nD+Sg$=Nu7Y%0M~!y@&Qc_;Bo;&bU@Yv>)h~|2auXT zx0i702M`XRbKBtm!``7f*U{#JsQ(u|D4z|;xj=pHsS_z)M6UTj0N}`vs>;xxcdRH`o58_Fuj$?fFyf zpZG_Q$EjNH9rJ!t2cFOQLETTE_ScI2t=3lGo4W#}b`prsdfKdYoy+EoxRb~Q4{$Jz(m;;Ra0^~VBX#X4L znEzMtuem?T`(NVs`RMWgou1!z&tIwiF9-iw-hWE`*SWuF|9bs1=Lg?^lg;fR{+qtv zVsGT}03Qojjew5g;yclvK;!}aY_MVY*ZJUOKQQZnI`aUCf6D`E9V#?H_5m;lxUs;0)Bte3 z*J}XG|9@)k^8crt{6YE5(En<=|5W!g(f?=o_u7A&_hGX>eck`S+<#g3a)Bu?SWF0`%0BiWiT%cjSK-D@x z*9VYl0N4k>93XQ5$N`2~K$5o}QUh>Fr~y*{ul#?0@!zfe$#egC-M{Fkat>AhXYQZ# zz9jFWxxZH3KkNQr!^5mU`v1W_e^L88PkjpZ^htYpf%^ykFWR4<>pNBKPyCbn3r4ig zPiy_i`-ARJ?jLG=GVDXEkG{W#ePaH;@q-U`*8>mSCw2h~&ohotw*f^Q;OYVxn^+IA z(*a!!z+B+e8bIs`OtXQ^|6iOY`hZdm(ER%Uy{rB|#r_TcAMqdbzb5{R+#m5@$^9w*F$*@-{imKk)Bd_W zKVJK9<^8GsS&#U1-!~VA9zm@CvW8d3@WCP@H91k#E z5O&4K0ptLv4JvB_z*o940P!Dd0XiB$nh{oVfKE>!IY6cX!vA;c096CvY~W@NAmjj5 z17sbr)Bwq~q;tVz{6Dl02>u^yXdjTo|BpQXf8>z;Kgj<}|6k&t{-3%3-2bQCKkNQG zxqqksWx4;pnIGy8lJ-9l|F^Vz{k7&-`c_Q0LQiWk<41CThW|#d&xF`NasQUypZk7` 
z^MQZw^Y0f9P+Q?Sjt9h8Ad3Tp1BkteIsiEUV}ME?zSaL38zOmKHixw0ObGZS?%Nh$^T+S91{te&XkNpS42KYsLU^XD}fTRWFIRU2w zl=#PcEi^#GKQ#c<0famt@h^3d)6@cI8er!Chv6TbFYXVf_(u;=tq;ig|Ecf~4Z!d} zWB(8Q|AH*+$%VbSut(RdZg}kzf z`=I~darpOn|AzKA82?H0BmNV=C&q#M2X#FZj>#6_IkW+$1vEK;s|V=j0l@z&dB7eG za3%3y%>j`AL+1tl>HpaWSnUHCj{mX-VEQ=#OZNYBkAH9~ z2ckA$3jVLPf5USB8vDojzM^kK`=|e}*8PMzzGeJBHU6RZTij1P8|OFdKlIRp<7}xN zz+P;{fEpn2PyY`N6f}Sg|2Suy3p7OyKsgU6s{tVX zOa4C*|L6mn+y~4W;O>3^L*sw=Mb!UHrT;$y{r|&=|Cj#X_5H4d|JV3G)V4?d&z_&i z|7-pq+CNsnk>DS_x6s+{Mc%)Y_X8%A?q@lEANK>-#`lT)@%SOJ0a#`Wj1$lXB>aO% zFc&Zh|9K7o@4bP4$#;}FfDHdQXVcVK>O?Dham zt^pp3fB65eJW}xg^AZRAw`V?cbi~s1zhS*Uk4pYu`uK;8etcCa2` zsQ*WPLODQ*{{{aa)B`vr{|EkmQx<$NG&yKZNvf!KHqi1!)YyY6EACvVnvVK+Yf5?u&|09+E=k-4|{Xffi_C>o*YIz7fYkeE{=c3F z6fXb&s0I3?`TJBOK+O-_-{<}Ma{pmAD78P-Dirm9#6PwFy#62czhM@5ssFoKz{BeQ zEA@Xe1EBQ(3IC}54fVgA0mgj*P&*d$f0Oxtng2)bYWn*Bsq%j(3I_j!tX*UQlfdSz zZ#&69K%F7<#-9`Y|DZ|uf8A?G#^W#D7oyKduE` zW&f|6|DpZAocqiEU+MqDeiZ2c{nzaOLmhr;1CkD~U2)7>pgb0Eb|B;l)AJJcq5Wrh zf7BbZ_K&__>Jlbuf707P{I_y{u#svnHTN%bz_QvO%mC{4|D_p#A^)%a|C|AW+<%FG z*8ZUeIPd>!_5bGmKcWABLjAwte@OqIumAtb_{_DR7{B55qw#L98I6IR+S+6H(fGN~ z9jy`!LiB${}BJ#{10dV)6f4|GXABnESvqy{(tfRHv6xx0Wb$h{;xH_tO1NQz-9x6XaJbE z7xMsuKby_V{(to5xi~=LcINJr9za{*?LoND<@&%O%?fpgnkNKaN*E+@gpPnAT`uyMe3;@ig#Qe|tzCXrmX#+A2z&zl@ z86f8WJqM`g0&*LW&jF%6fRCr2=aG-ev4Q=_`%!;D-`|P*;s0BEfk6B7^Z$nR{xIx4q?w=_a5_p)5QMR z3)IeA9{=~x|14|&d+{&dIc7g&28{OpS9|}k2OxTXYV$vO_5ka}zxjUc{V(C0hHmN z{y)qCfJPSQ|7ZBm=YZAY0O$dw9++%^<^~kw;P$}=#NWa5x$rlWurE3Ps_*a4{XtxV z`KIRkz5f>-gxY@@`)`5$47ODF0tUab*uOUax0?I!Xn)i6{u2Lb{(rd#@Z{c~(*HaB zUmWKD2LHcN;^Sk#^X^{V4_E@$e*49vhXi}ill5<8?TEae{G03sdxLs|AKP`;(RU{4 z6^Liv{jO(?FTCIY?Qx3yU(P|R2Eb+jNe)GG05(@qXRJ<|0anfcDr*2_&MP$ls{v^3 z|AYNMx*8y^GiXgqnE`4XfOPUEl}ftk_W&J(0A*Rn9l7% z()Ru1$>(Q&-^cued_QLX^!fho+@FbkKccs)_UE|2`TlbMpYy`(@7n%9!T-ztf9C%= z`z!SQmAStd``2fG)qDS^+y4i({~z4zdFm^H^VS0Xf8pmx7Xy2;cC_!8f2p^~0iO88 z(VHe|0QfB60AKvVA4^X38y53~7@!~jhw2;^G04;1lrsRe|(K(zsu6No&Z z*A7%LAGp5Z9{U92^<2D}J{GINHb5U~tEbpIk 
ze?#Yf4Z;7U*gw<$OZ>|mA>-}0uN z$Cw9t`K8|s=K%P}g8qkdG&~2uJpj1}5bS{N1GLl}K&%5kTe%}@M-u+S9Ply^C_EH= zbkIHkaTbthfm#dfVgStrrhb4f2Otly+1i$8tMh;|9$=1;$7l-z-s3FKu$KsZzU}+X zJYV7Wr@;QOxF5P-d+$GDA2nB)y$=4ROZWY#)crB`XWtL!ehk$9sr_}ef6V^J-V>F* ze~5qW`**QF@sDd<_x>>a!w;^Lc|Y%Z=QAhze#O7|&s}#NeG8bAwIenE4)DVrhy$QM zJXc_(QBB4#?^OG!E#u0Z9|Icp$d}tQBBR{6}XZwS0fj{%lXh#@^q}nLmjAechjN|5W!oMEi&CkGVfm z`z!H3wLk9tSH*v?_MiMeY`_(leM|h{hsN)G`!goS{EB_?{eSR3M-M&jaid=im;?46 z4mWU+$36Y&qff)XPvQXhEPUSQKlexD_2(ZT=K{789LYGqq#WRq=K!zdK7dUPpwTP|E&AZ_x|>? zzsddK*ACWe{ku}{^VKiEYP{#`A2Z1B12eme?|=5QwdM~o_rEspFYONA&(oj(pGUV3 z;Q;tNh!;@j@XohCP0q!u$7iqGS9F0xWX9*=5?9H>@$w^tXB{c)sPWYj?_Yh@x5XB( z(tTL3z4}s7n<~W}$YqiY!K1H-$SvQ;}-}&j1<69$j{%1&@ z4@>fWo7RbsI7`;qGVA*sS?9_%1de~6==%Z+H;rCMSzxiUxL1A4Y zIsOfzQ*D$bxxOt}mr0KQa#>eMJbRV+wyPz#CF}dr^QY@N@r78|i@tY*=;Aj@o=^7t zyL*%9d!qGkyG3+8F46k73(w!aS+q7;_iPs2-Y!^``G2zRyY~)Rcgm95-}|tj^_$** zzr^zQ-+#AT+hi{f^abLGegHZ4EIcnQz7BN){vXsFMBgWJzvux$9%zkRD_F=8+4X~K z={mWlaBb-|hHFi(Ib3_V2C*kFuF0#fI!3QgT&uWdaqZeQY}fMTmmWGk@7(=G=X;~% z_@ARb9~b}@CM{s<#V;D&Cm2J$|HBRYv;{l;;17=e?Dt>NUk_A%R&W8>fj92?*zt$< z{^jw3U-*;p5r_T7__$;LW_;oadyh{#;e%m)XnfL%AC_LCzt^?;qQ9VMJq7C@$ETk9$?@r@eM;8}PB3=cET?e_LmKe!T9C18kis$1}em>%j54bq9{mTK7d+Uy^mu`0TT=zAWo2 z<8yRTCDxYXUKPo?{}@P&&u~M-?=PY18Yu` zYeB9FyFPG@d`hnwy>@W@;98Pv>Xg-Xec844QM>l!`a4;!L%SwFEZ3=Aqq=a-?ma&K z*uNVeb>ut82Yu<~TZsaOa=)@ zFL}WWMmIzMz-L@sCyR5ixCYWSKz=NB^joE$X3xREdSpHENir|^gTMPbqwhl(fE`$9 z1L+!A;Nj7AF2IF>0r%$@ml}$myX82``e@2h3oGlfd7ek2Yu!9Jx*qryyT~TYIMM_{HM_k zzxO{zWAqV&8^9(m*5Vw@(>3t2KUY6L{`FrQZI;=c-`IKQ(aQuAn5DhA*%^4|V;?*E zuTOpI=w(0tl+oLyj_BR8-n&?fbMOeh2Hx}3Cy(APeD_s5KVkG+a+&Wc>)~3zyRbux zGq5-Vi!-n|1B)}TI0K6_us8#YGq5-Vi!-n|1B)}TI0K6_us8#YGq5-Vi!-n|1B)}T zI0K6_us8#YGq5-Vi!-n|1B)}TI0K6_us8#YGq5-Vi!-n|1B)}TI0KJ>GcaD<7H43> z8TfwvI{p#N z{eoxxc>CYm1+;j)eFkuSv>#oZzs31kTptU3w9mi-9}9f6ua1TPSm0xUj|D#3XJCPk z1wPtW$HIRs@Ug(h0w3)&u)xOxAMLAS;XfAmn16ii)ds*0(ce#-@c(-rAC!N%uKoIh z@*|V-FU!dP{50@2DZex+Kh-0^e;$6H 
z`Vh$e;Ni!E9}54$L(e=MhCdNyrd;;yb1y#@zTnv>AA30L`?D`w_E6}Pk3IR!g3I`? zA3XC=NbrKI{`o5YZ;Fenja%PG(~v2e%KHnZzo4csJ^NH}=)+GuwBTyV3x9lhsy`Zk z`@!>1Jp9~(ho1=jQ~0yg{>f#_o(+G_BOtY&{3r1j-=<^-;~(3sDAunOW!77Ya?<09 za`v5ya_QBIay4vwYmlNWbx9lGJL&p!=@@vA* zGOqDUp?)C@{hN&+c|FH;J=b)7lIi*sToc`wjGsW$J+m+1&-71SEPi|B1~2tV-@mDE z(Eqz2%53FLB?S~ufAjAvH}8#(-n)6_M{mO8u8T~_sp*=S=&H#{jQokB{CIEt@rt&A zfwqd{@q4dRlxug!q!m^-H`f%V#q7KmmfI8&pOIf&oSzXNvFS&O^23!|4ke~#A5Tw= z_|3W>D#{ffy{cTL{OF&RJJvt_$(4%oy*GZZeDX7G&5zD{_TIO?rzjUbef7DYUvb%* z1wXpxw@+TADCY*B@uN5A{QR%VZ%?`RL0IoQcPT%ANVzLq`GfMuTjA+*u3qq~-(G&t z9hcwvy@glLfpZTBoqOeVH!NIu!*y4l7o;puZ2$hFJ8!@JwtrLq?M6no}L`{h@!k#S>N6_>U56wwbxg^uPE$yp>y)MW zD|6slucy7BB;BPQTex7&D;Y5Pt0}i$QhNQ5E3WwEtCrhnhG@$S8A@7{L*{@pJs%EQ}s?TOsCfB(M7J-fC+ z{)>LLaN*A^KT{XFFIG-bY;9Q+4C;bCI?iX8Ux4K_~|F~5xNOK!w+JDIZ)EM&r6NdVHV<_WSz^{yJDKCb4 zz#o18763n)8R|1beP*bC^H86K?0(IU{s)b9D*cRqvyoK!Khs#B?Kl2sFCxTDTAxYl zGiiOQw7vkNgMJu(g0A_Ir;`7#O6y?D|4miF)02Sl@8t81iv{>iKS=8*_4Tz?Wrb)O zyKm>lm+0=ZUvJ$TbvQXar=Ym3qN1$0ASWaF@WH)XzkZ(XKDBD&j)>^^&-hO zqGFRWl3DWbk+jsc>Qsb}>m#;{*+w)soy6sr{fUfuK+mX;zvi!qkxI4b| z-4BAGE4b=M&McRYXN+7-(`d^ZGlNB{WxyMOD{_3DF#I@5=$4%O&-?W%*(E8cyb z_rpCezV^OTAOCVy1gNfGe?-?SzI5s9UVHIgVtnV5FRak@#*ddD()F^hzRJ?|v@N>p zjTfH$9o@b4p~v^?dd7QyIj-voAAbzW+hg_iXC8m(w{-XC4~A#x`i@uMsMGaz@4Y*q zyWh{%k3SOr;4O6b`g;~P>iRp+yfCUeUjOs^y59G6o4zyn{(EkuyFdEvJ$8L0{7-M{ zdh^pyuh#X#$DR5+_x|p;*U;VXFMLwhOBX-5O4rkuEsN6ieededdvE{sO_$T%^R8L0 z>-+EgeYCD`y#M|@UH{8^cf+r5y8fDfqPuf1x%TGY-13WG|L)%Vet*|pfB56Qx83^d zo3Fq22bY`zpBEu-L=Ixqa*AjqBH~{UqcfJhnIAxvsRLrlF<1yKiV@!e*No8S3k9Z)vEhC|&cs`Q`$hYR?7Tt%s$s-N);QGA z)L2tpQSWx;XYUgP_GD(~HMv~vLvG#K(mbK-qm^a#1Krt~5p;j-zFp~A+14fcy_I>cjx7`>UvX&?n+Be+$QdCI-FRk>n$0_+H}4C z_;JuLt<^`4Bp%)(?r(^VYr%j^P3wo}XQV@eW9@odLR{=dasTV+7zBazBguA9O~oi1 zKQgSB9g2xwC+@G=7mY}9I6h6+2a}QtbiFgtrDsG(?OQAEuiA@o-W?N*JI#j=SL=FJ zhVF{qAGvq6xc}9zTttim2Qi|HV`AEMJ*!wBh}^qp*9vj}i*36j_C-cSMn%UQjKY7S zF)4t%JGOmE_s8!2c=Oh6JK+3#*9959A*UpIW_f^OkMfckJA? 
zYqz?a?Hb*=WBax(n>Vh0`scXswB8mHIxo~2a=R7d)jfOOym{`El$-e|TX@U7TU-li zlnpHi`_wzgz-QPq&X$MqVRqO0vMP9SM^FFIC=IjGq5hr@_;FR)rr(22^WsD0bicFL zGt35hJL!IT-2FcH2jnpHyB}+&Z#U65+Q(v;Hng^P^$re?j*X8F5B7Gow>DrH8~0;M zE2oHC{yLx(id%8N?Tbt^uF$`nq{_x_c!V+Y$hDMCCPS!cp(caeD)PR&$QW#Buk;VJ- z3rowZYU&|`rFEpa0r{@Hv@k!4?hY;9ms3!L+Yq6(3B!yvkJi;xHMm`cxd#OQi0s^g z7MH7&MpBFG8-Rdoh5|@xfq3l{qBs+>`{!e8a~SU z>fr1Bg*CN>nHhTogPo}vnHXjHg;k(kT#R(loA1_(Gg5bn``eRK;J|}9x%C)-1qB#O zEoHhZBPDr0!C(edIrW)ab_ko$m-DB zk`m%Ji~H+i;|4MEj;1(3H66Kc{OGV=aTude++P(cEBi5P!F$;pMf-j(dqGY`c? ztrz##>^lm<;^J_pH8BxmsX9w{9Xfbm-&%2h<=%WDqJ)#+d6~w|=JX za{mz}t4l4@&(dv`n4&sqn65@KNqqVUU5%YCug3rPUk#E*BhcgITwF~QVES8otRH4@ z4m_d34$jja*&RDC*16}=)5$`XGtQoOw(AUwav>GO3ydFz-&EoMS5Mr5E&n%Fg+E)~ z#QO%_xL$zY^n(ij9Qm+s{Oz2J@%Nbuf2P8psqkki{Fw@Wrox}8@MkLgnF@cV!v9}i z>;I=!__I*q|Chh7qpF<#O;zI4-;95kHW?QS@SA>6;$J$3-}G;R`1vlqc$g{iXG;8; z5`U(|pDFQYO8l7;f2PErDe-4Y{QrNK_*bDQrXN0uf9+5IM@{0>qJHDw)oI3slou8L z)fK*fySjt^{|zkY9z6Z5v(G*M{0kObbkW6^e)qeVSuSJW9lrG9i!NGl!TIN(d-hrL zuoS=Yx)po&#bEtPc791^Z4<2z7{bf{uJ)$d%94WY^rP`H`}VB59!nuAS8d(BFDBt= zdUio+CDx?(3=EHs4iEHTWqMU9ArpYyx&{v^<)?1kwr4*H1=~LNOt3Rb6vO&%ns&=*U1%M{~U)ZY;#zAn0z`x;ru^p0CuesBLWP>KPau9O&t4 zYpku{EBE7LB6l0uQ`ggyow%gb%v`J&s%yf6h@KuShG@b&=Azup)TFqB5xX|8Bf7n( zuG_R@?}0-JM>AmEikkYSRxEw!#S7b-ijsURk54>wAY#X+uLXAPhHVh|a8g=kP9a_r z*EKY$O}%*aTTxn=lbM!u7}njkaShq5_te$vNC*&4&&tKRi^}R+wYImW600q8v(OQT zq9S%~Ubl*4PhPop({>P!JCc%qoYq*BSEv=e<+QTmczWs)BD;Ol+7$x(e zPba!_KU)6P>h+s<02~vSkd&N~ny#kzrllm~6>aQ+$e5C%f(>6S|A=E}{q5t=R;*dS zY1^)deFtL>AImTHr=PFr!ChD5sIKV__pwqJeh>BU z+`4JY<_)Vp`SAU>PbR{FuLJuDR!_~+-D7%yJ{-5<>s4QU_Q{7Iy!{#hCM>VK{`LnS zZpMNWyQ?Sx@3K|hVJ}Lq#_!={YgT@-{9{00e|ZjJtT16noYp=cWRvKM(eDJ~h?>FLVsQ9&Q=$q#s zeJJF?`#AdUKRgf|OPJPYUwrw`wYYK@zRBKny)}W$;kXy^y1ngB;lU65;cf&e`|ZD5 zw78tm#~*+4ndirGWsiB~m6w;3iff+4%AEAz2Ny5;-5rGe^=)_EJwk}chaP_HT@ruo zbI&}pi=eMPGOp{J@4xr%yKeh6ppJ#N-1;a1K3*CUPUdpI@)&!}oq@*A!SFZnYvSd5 zes}w=w=5*^Pj0-4_}Q2J;r<7Y;-0%dObv6l;A*Jw!Hrnf@Vh&1{pC$J{DiPSy!QGi 
z+_3fE`TM&UapejgWDmMRoP>^Cl!T=USgQ83>#zMG(3KZoe$^LPd9(UAzrFp===k`A zkM3Et=%Iv!`1t4VTp6+VJ>agp>hg<$>^t*;;!z`NM0l{%s)^z}@mo%P;$H`NhpQExhsi z>#q6Xm6t9!`y_@uF~Cllf59c+yYh!uU;C4vUVp<4H)=Ny-f;a-ue(pmjs4)t%f5HX1?SG2rM$tuuY@VCpZ+0uL9pw- zpTZT!eia<@^s~=B_x$rOuou)9wU^k-+Dqz->I>|7_MH9Mr$d5o#R9$3%YxrKUS3&K zPpiwid-?~lf{hmUU=`b7KgHhW#(D(5><@xzDPZYsVYww(tA?d!1{y1Y>~^dL8Xv-C` zdiqH433Z}}R`&JbV<{{u;#w9;N%-QqT0?L$A*DkyASns4YpW};NXUfdYVzfLkYxyX ztX6_Wy2yYec3H6pT~JucMQLh z-95d1YF|%pPdAnf@}H}>FVkz%F90f3^_^9Jx^sGhaiQ;uu3Ipx3o1js0}^!jctuJHI)@*SjXhb%Q>FO z(PQ^KmBHa%4(1r{tr>9Xy1To!wv~e6z(j3%S!u7X4+EZI;Gay-$jZ*iqhRkEu;~^o zj_h_LlMRlSl@ym07v&$%%1D>kv@}r1qDJI02Xfg^Z6S2Z0U|B!X%cq)o+puyGcrpk zyt%sS+j%x~bvE|l4?`^ldAZr(oR*qmpr1%i0ZE>6+!Y0dg%oXF_Nro5>~>LrYssy5 zIL1IdC0VveI&w5Mtra(IV<;)|sfD|zkQKTLc#Hb%4y>_EOFeod=_w)66A6iE(nl@2 zk7wuPP^NS@<*__>5eZeAg@s=H*6LD%NeglM~|OVq;{3=;+u(6#Z>O$wyOC`l&%@nwsXO zG%#3q6sdU_I|p(O79?U>UGt&XnCPejPY977 zi`*A=&<2>j@^E}Y22k#TBkYK~obag$sE0~oq7O#xi{yCQJ&)~;*q2B;XCI0?Tuktk zM3%^tm~-MVtQgTxt*>9TQ8L+&vzr#<_=ZHP9Jbtm_Vr-n;v8 z63BMXqdRx);f^tW;NZb%p>ZsWb;VIP4^%}%ZV7D?!d1?R&Lm|dCOM5 zmUZ{;J#3E^D0TM`hi%=mdDDhfp|=w9+~7w)TMw)N@7T$9T6d{C*^Z&@+qP~Y-1^TR z4ZZ*g&T(UKaA?>CVUEz?;GZEax-P!?_64`QZn*@HacPt?uU{OxAk?+^dfc1%b!gbL z&%f}(i|j@AlKS$jSJ=xcQYL%B{`|9H;7jQ;^n+|X5;Wj(qZ84XR-*EBiPflx$+Tj$ zp|-j*=dYnWUEaPdA5n_XZ5(Y%I|%okeEldwVZop7gifW4o}~yVX!jrij*i$D}DYC)1i5~932|;pb^1exNkc2RJ0|!#3c8?Ak_pS%~Rs`$X#MW+lqi*z|UHa5!BB`Jwzvy|qGC7MbIdSave*V{YbNtakA z&jqQgu0&D5(Y=eFMgD^cAqwxGzWzbWU@xtx2VqYle%`Q1VG(5oBF(|QeYE(#4_~-+ ziqs0iWkrP?-LvRfO2kAOML`>KA9AbO%X<2-N*DAycc>k#y|1mcrKzF5u9~OTf;^7yU-V2a z9v&eJrPhX~=2o?pwP~&W%>=E1olEghnJX#hAZt^qtRl&gxSLzl7S`O?*gz6j)pmDv zmLLl~M;7T>^bC?2k|k&(S=QCJwly_q4ZY26&B&}(6_hNIvX67{*rI2$cxvUzvae>4 zl4WmweVa&@_0SK?gNO5SvNH|*)5kK2JWrN`JXyB(P_lH6BeAm5@sdJdk4bO_)-vbd zflg-2`eJBB)G{)jC1J-GJ)M>Tie(fQ-CYfxMz(C?*|N2;09t`udYVM1q=6()1MVt3 zdy^P8dv#eU9zBsPTk{)`EHjT~q@@@QoHnC&>m0NXRKXanz%Z zC?r+N^73*?%0X9lCX!_}!4o)sV$qY7Tq(WS3-QHM6;N(hj+*0c$CY#RSPp)T=Ot5i 
zeVQafgE(YY`bx>>N=JI7rzH1irkd%dXV1Z|^m6psQA)0HvOz2|Qav>oMx}s{CD5Rg zk}J=Z^{LJ1(nKWJL$OAKCl1COitD8Y?MIT6v922p>e5)6I}c(G79}HBwu1bjn1fFV zk)A+ujpZs=Cng=q1j=1>lpS?f5nd?fPa6QMj${%dGx__7O@Il@CjZ$d6~UUJJFcmC6z zu3IkUiSjzd9T0dypgRDMwa(dpZPyp*&d!hKA|<+JXX%&duH3m`=YB)~uI_%17IIrl zaOtYFVsX6dv>v)1IE}8)?xX9WbLjfqetv!4%=Lev9fbY4&CjVOzV+vl{mA2w4oZhhoV~)Pu*Xe^lyLw^pM{B;m;A{JukAg2$sl;Nl&bj2$ z3odnBa?UKKgwPbZoI|HM|5=#x{~xui2Rn?&f49aNS5jb1`rn%C`?p(b>HlXY{Ws1` z`eS0acV^OmX3~FV(tl>se`eBuX40R}G|;@n%%nff3eHUW&rJHyO#07E`r|2aX3~FV z(!X|Q(tl>se`eBuX3~FV(tl>se`eDEzw?F1f5oIf&Gpj{|Ih3n{rJCO^IhUj|EBtw z*Vqf3ZU;xwe}$l2V$AvnXDh7o75u#KdoKhVbpLs+qI`9qqU^&yKgaPQaL>;bW_?Iu z3sx)a=l`Ox`|}j`^m__>Z@0p>99G!DFoh-IcNXpYue@QsSZT6cbOgc%484hsPe@F{OJ%I{ zO3zR(Q0jlTdiM@2jiF^Xc;yEi7Al#6IG3Py&I>!YVsQ@NDsdXrxk}BPw|8vXBDRe4 zCFW(S0QS=Mjhp0Z9KO_N-+r)+HOr=)qg3C$ZKDrv3ei%Ovz4lIR|6*r?%6Y?4jdT1 zb%O*>RyF}vfl*R%Vn*fc&o>yz9sXwVYC_2Nf7!HNBKf){$sFsVMtq@_4{YI3V-=F2 z_W>*;GwaRSKPQO#BBQ24zP^!Ynr(@V|NU*rX};x$jEI=r(L(vR{~@Hu@YLaxdIWGLSfY!asZw%?;T2(x;PXd_+b01v&%Cqlj3bXb!`sa82Cb1WhB} zOr^XL+5M4X`@zYhGx?}Me!YIZ*XyyCk|U8V%sp=)!QMQMy+i)cA>@?=$}1eX{%b?f zZCFn-+4e$EUnJa#kW&o#q+}qg-dMl(Ymf4tRiYH`j1U@6Y3&Vl3%2{4cg9&~{F7qS zF1Yr$i>xr+VVUo8m_4L(2kY{e1yFH6Jmb;DVTWB#4t zA;A|0&sUrmE)EO6IQUG(dC`*4;7fwPgNp}4gD(x9r#R1892z_;*s=71#Sbi*zvK+X z8E{)za8U5XlEn|bwPsHY7SO7PS-h6O5?PY_NV1mfKB}CqxULHi4h$Y$y7;LL8Q2Z9 zva*_0t2OT0I#$Q(Sp&B1Y+}u<#odYnot*QRgaii!k1q*Xk&9(_rDbXvD`yp~l2vOp ztd`ZOXyI-I(sfHXfVKx8jw&xL!D2Vf56J3O0-mZk&s-W3Yz-b=@<>8?adAn>6hws> zpm}3BA(w_mmzj`ig-*>g+|F>H}-a3Qbrksu}9KFcefr9(*pZ*cYl+x~pr1 zT9T~mrzFA11KZ0AeX(8(5L9hERdM}WSgjmhf6$gZ(m!`*Y3J{VHJTskIX$1lHd>LMpSFmT4!B-Jr<5R8=FX9 z*SXW+Np_3$J~w^pAn2b6FZ)RSR@5b0xb`V07G)9`R+&;yv(3G7d>gb!j_FKjCjAeA zy>k4glm1q6{NrvyA%r4>^pff1Hio7tNKNO3G=$vK4GB(uLOQKy$ekE(7kC=dCmy0M zPTomyRycJ8Q0uvi!wJ6`OLk$6-w5@LG#_lSp9rcaVv9lSB^&PxHKMC|987^#OVu(J ziGWAqX|bvZY?A|_TJLOVY^1PYm;pklQ<_ZW5gv(STC7^(tgJT2?GuB|gG+6YsyRa9 ziwj^fwb)4s$v+NLmh**@v~J%#XYzg%wy?lQGu4zksVAsJa9ZYl=ZyKHM)>i+{UJEy 
ztPox!+#9wmJS_C=P}B(L|2lYCcuM!0KPSm=eJR3khP9(r*o)d<1i zp_hbGjj%L4^wLnO5$+EUofYZ`S@OV=;Q7H+Bis=l8WcJn{J_KStc{309IwW+1T|4j zvKswrpS1IuNhfuXjL2cFrKSy8Q4GoI&Jy@o7Db%EA;I?96Ju+V_e!Qik}`B=GM zR!&){vWitPA2@1_UoHdAvGmdCO0Q9YQawZw@0+PAI4dM9)Ea6Fek`fdToG`JB719; zI|=K$aT%Z!A>pwVKGlPXyG3A6L0#|%PIt-I6(}`4<%?>8{F~BrOd_^+EkH8kSg6=rM$UkEn1?rST1QvtmEzXMcR+g_q8J=>o-Jy=`@q zey)C=;yn4eCjA2a9L0Ik+XMOn{cOc`Tbq87ewN~T#jRhgpQ*S`+M-{oe@AiQ*n#ir z^Awl5UH_hr1g^cKU#27XYJbqbuj3hBo!^5C!YaUJxX z@BQmuajnie^ZN@U1HwJ>UEH*=JpGv&7T&KBMjbmFqX~(Z|^5 zUwrx1S1VSoT(z35?q9QZEvCLOPqh(W5pTtu7TeBt4AK!g^p!GZHR%&9ZNGXzjj~49 z9LvNPdi2qJ0X`lqMzMsduCj`1jyil+*+e@Ww6k^>_FQ1yBRD;#k8jj4G&IbH)e-mT z2peg_yg9zY^C3jhma0P#VzndqoU@UW;0qHP>t`{aD7aW2SNFbzmBX(tXi;hWLX0==k`AjoDN?ORt@RJ&nE^ zVLftxjN)D|0j~ZiZ|}BaTM(B1?ldG22}~BiRxQGU+yEeeO#>)X1WyogTm&M+5sabWGMsUW zu~16UyvK;b=pN z`;sZ3sH58WiKAhY&uH?rm%-FB3j0kON~PtjyvuJe8GYc1#^?qzqBdHTDhOopw4To= z_B{#F8xN?x=mLaenbe0z)RE>{~N(!L?YxkBjq9Q4oqM-5QQv_tr-dhdXsi2VMkk&tHYRz(3 z-k4W?>D1T>UHS|3X(f$|xCq1aH~us&(e>me%aSO@E3-PRthw~f{+ zm|f4z8Pf-(`7l_^WK1V=x;wh4d01t}4qmu~VJ1^eRnydTEyHq5%~X$T*=kNe zu9j!X4=m6MS*zCOZtozKwQj1xRMdd#02^dOY}kT2h>cq&EVcl<>KG#vQOVZYreS|C zYGOeXwbw1YMAU885p6VR+=NxzNHT#XpHxv|svxfpTS4A3Zk!$iz)VM+b(ocncRu5J)g!S5j!4^bU zV6}-ro0F_S@=786346gowuz}sCZtdYoD|^*%^-$45n$8otjX<9Tom?HnF~oY-YY7k zw%SePofNucFM4e!$urQ~At>B?}D_S6+9`szXjm>Ii?j48pIpdrEiie$OzzXIkO!|sc#U^YX|ZK<_K`P5LbOi zi?G}@)>dI{li}2MCnW>fMU<*i7jL6q{rZ`Faj{f$GrAY^46)2;uM6)I@dJg~`lg z-^+!9iWqM61Z4pi)7bF~vmy?9<)ZO#u~}d-kMe#kykkK1j86&yGR#emM!sQOLwr%~ z?xtoDY$aTpiNZ38BZ#XQ%0V_CFgB?&r3ewjy^$`^6GCOUm4v-P+&|oUN1(t84b7oB zU;^8X+K_h4GOJvS$@#&8sBg+Uz6zRYQ;M=6>`%yqCcD#1Sy+SSA*>^o(ZF#ZsK{p` z>Pxx;Y8|zX<86s2pLvrMi7PT6Ap=LOqa)*^qsaYUhcc&0)dSXsg8;^xBbgY?_En8^ zO%pEa@SG7Rb|S^P5p$NLZX@%0p&FobM~pHd1Z6_#!eCJ%STj^A5wZeNBCuQ)C4yFP zav^QziRjR&qN-s%79R98&XB=2BUyPCRYw9~ zB5lGccKzi0gn~3f(c|vv6-W#BLPBcjQI1qwjS%G{gvB6fCaEfi{&Z#B*gQuk54^h$c<=S$OiEsoh&ovUGf@kYy5^V)tKxxSP~+xj z7$YKzQ3c?pEm|jwYnFk)6j&+wPl8L6NmEGLR4ZsFf(y9?+>^XbT9WC!#Jy;SU`lzx 
zDu}y{vYvaS^a)aw`j4o4O9N^^o9uZ#*dI?dqr~%sJxLzn$AZ2XBW66e&Iu$U(+dui zKq=?C^vN>GaY#DI&Jwwha=TN9hmjg$=n6A=#i9vPp*ERGF!?62q@hHMfN!#s)LZt0 z+272)CcRkbEYOz8E`&}o(keZHevj3B%^FaoLtQBtKR9g5QC!SIa%Rj8yIhLQqM{G)iT z;$d&XHZj4UB87L7RXsgMYEe)paRd0Hp?^Lh0f?%~qn`?@fs}G5?BkrKRLd)Xm!vup zG#W5Y1TCiF?ImdCaZekWMauIuW>!Y^u+^Z9Wn|8%z0t%{IENr6vmI4ZbC9|D1D#BY zLNZCsI!MXb0?Bwfe(8;qZqn)Hl{H$$rhyi|#V&32H@;{0?p@dxcL&~E^>5p{b@S#; z8#iuPzkc1s9aqgm|Cd82BXc* zjg8niw6?aUsEaVrMuEn z@$9961g5QeF@PnFQ7*R>q1h0DlPJtLBsWADg6j;TjP)l`&b2V9PP%)Afa(STdVvQL zHp6zTCm)-slNnlDxEfpwvjoy~Mtkh-4&R0K*?lO141Ljdeuib<_3 z+p!EXBW7Dw-h+*_@jEuC#hECA0`;D-0`EOm(S=p$9z%L`z+@-m1ZPBLa3U^(VFgIwbs(<}!heJjCMIw+2u)U0#|~Qz*hB!Lr-Z0 zJ8BPG`ouU&d}DDpY+4u%6OyO~H-VPj7P>Od@zZ!PF0ajzEEkJCfTCCCm~Gm^|s* zw1)uwJ_M)}SWnrvRsjUwRG@=d2++U@3*VlE(GSuSWIuR^lP5Ax*$?PNJ6=Nd@@^2W zC_Ra_SD#=!dCchRiZr4n0g7rewomVh^eR@2r#53Kc*nEog$9ZzZ9Y*E`x<)$z*usU zABzELj0J@EuD;1bdeVTTG0oC79uWpchAAQ#L6L{FDsM=eN&|!himBsxkV8Bg_l!i= zQ{w4Cb3~c43(`hWVIiVsiE+%W*KO**h7(2tYTd^L7qQeB0jzz{Pjpt%;YkT5>!o4* zVmShm$}vtrbe;@E{=%1*6nWFRP#!6X3IcU-n8$MBPr94ZK|Q)pW_E8*!Z6IDl$4eR ztD}J9dDSU$S|KRN&htc)>BezdswXWQ>e)V-%YsnI|E5{0DWxATKR=U7+^#8Li2F zhW>c}W_Ca99d5Q@DpsUuC|(Q&UkBVKxRek!(96iW*sw zm`rD?rG)WQQ!wGJ_2PX_H|uf}W7ak%M;+PSNpiD_b}w%q$y2cHyp=b`6gt{g`ARKK z?pp=Wh;gQ+r1S*#xVve_f;Sx%<}xZdgKwMQTU6jzNTnR5cW2$0@#)ao3rJrw7PS>w z&nO+td}Rrhauk!b8gHn!74wGNQq)j%lA)`)c;f}+QRS%byDLkfwZNLM;8h53Ani@`r4@y)?k0x?4KNYJ1)>J(kb4m0+uTFK z?xKag2}OhpTYsjVh1G{aiU^WiYEHZ)DPr1HgfXj>6(x> z?!I2?XQ{upiN(Z)8C=MXoRs8zn>dLJ*5SKdm=S_Hc>M*@L?hYj!kxTq3=eUCn-My}(~QcPqUZYF6FC*k(?XHS&>bn@Mru&%RzZdBlkE$Xu9SHTl1< zCyM}Oc7F^QYJh1nj}HEp#pH`QLZAhSggIg#70g=uk4cPqFWfAVH$-^jD~~8vP!~bl z9D&TCkm9BY5w;*c+9?+LhX)zE3C!Kr-DIQ4iFq!Lma0x041L3_Z~T_I!x%#d-&m-? 
zQ;3X${^485OerK_!h-xk0+R^RcGNx6cAl(&#TBMhGks1_n}Q6Smn7NOYy+9U#-6$r-JV?^WOw^kcS)+AymVqOEycA`Y6U;H&H99h9M1$fY zaSEW2Qss$LaAu8jeNd7u4RbGB6<-DL)hxzqs*?+~0xe(7(@+suv$ZU|tU895R_QEF zOQmXnQWX}g7*i;ib%jC}s4nWDPLh?ik;yR^$#XU){h$jb#zm3UN$lx>9ks>XJYfY8 zZ-F=v(~_JP%}c>G3?OC_k;m|qOf(5fJucbeom;oNs}r-vf-SJ_W;iJMzQ8hIiL95u zsMRz9#d;7xg*Ilgc&XOq?xZz4hOBt^*p_8wuq!Y7?!9))g3Z=Y)TBb3j zm!`WNr~Hwi0Y(CsL@|eMvXt8Cq$dRdrQ}VqNIZ{`1ekF)^v5KX*H%Q=>Iq!*@a7el zj5P@?0;veR;&DxCM{u9+C)y(K5?qKv-9~a#p)XxQ8pGXUWL!aA3M1fn%V0PJ!BrT9 z!>M+Va50_dYZd7eyrU_ji(-S)hF5xbKYw_T45VwiTindZm{ihuTlh;P8#r1ho8zqz zPk&yO$@;#YB6Oh0>(;?U6abm#;aHEqF$fiwWJgs z?-Gm<09iO#L_aZFs}nAxN=}!l3ke)-nshiXpB^ zl5$*oBMXfV!iiXA9@2QDgHeRs=ut>*37j_gdBZOsKFye+qMR>T%kXC&$AY!1*)|ct zks?qF3lZ}v0yPimGEI2Hv=K_=*rKe5^#nYhP!OSfLMg{lYWh~@au z$8bT}H}N;^H*r{Ki#B%C957)O1@&_SB}aKaVa>)5^W;ft^rUMtQ# z^SQkr_i>^Yoj=L8J9ck)P*u)XYEuO4FsN`M9uyrZxpnKVch16ocr^x!^NNj$!a-i( zMYK3-%ckw0oP(Y2s?9Kinh$K#w)bbzR(Dh2NH!0-O&hmA1RP7LnNo+{yA8IRHg4E( z1?`EKDk(}^nAE^gmYV?F@&bUB9w4_Gj5Zm#e(m|RQ$p zpN=l`ScjOe-}KZZq%e>fxrHO&nS_j`-sL9uS{!F{Zu~otEVatt54fOvaC92=APKsD z<0lqnzEXihY0=weml05t-PdoBP_pQWpd0+4I55x8_$C2-sN$@=!-ox@HH>P4dSn${ z8c*lsnLX1B-7tyjv`}AJ|62w4N4j7N@AtSBI@a6;$w@dr*slwGe84c4k6TaUf>S{a z9~2S9@GSWCbWrmU*t(SufvKT>9*QJpw0$w@eC7#;9JufkT~uCb6VCVEuf6 zLIuvy6%q)tGFJFO%~8c5TQL>XJF+>|b?c|0x(|mPntN*_HnW=ym662^gDGIRkp!oxz;2JGzPhf_;giK8OnsRx1sf}LTvTd|FeKM*@F zm1C0}cPaLQvlo}8Kl324uFwljP_-62tC@IX8-ij-*^xgj!5(FaC z&qUe~&=q!(KYA*wIEqu3ErsZ=uuB1VH#K7NN0`yyF@z|N;`Bdx@lG=1RQ@3){}02z z3u&+ddP?V;n)nQwT;)G(aO!?fce-gcz|`(y?tU`)i3gppIAh*X$EFzi{?hx6qsgxB zOWhAnW7=;VeIs53K?-iHSS@s#b#;*JJibh28P(75Q^+C%tf1{H3U=R_BFG9q*L>yh z4hd{CSxtLo8iAYWPJp{ZX2vmnut4JH9eE7i~QNvkP=tQLFZRZlXj zy`a$SI7X!~@60(=DTLVnc-IXV^GX35jfF1@zwg()QV6w&EepAeR|=sx&{+1 z3y*smE3=hX6>WNGDPhB?Krms|T5RQ1Euoe&M^(}@OR)EvGyG1f5Ijh1SmcAGy2DYC z@!V2iUEx$AaHv;8L6nbXY|2*ZC_nm0aHtj7-%yRv%o=^6kb!X`j z+6>SYeleg9yy-Ugzq!juHAiXspF$w8E1Z&oozPxykP>_2gQN7=Bi{h`b_h}b7I1e3 
zHU|L;53qj5Bmy_buHw>(~Kk4&&{SA9YSt6E6q-us-?If6!}1PO~+D#r}#$uEo}fRZd!YCkFv5vKRVTgX)F(FWFvv z>cL-Js45GT^K?h|wm;6s{P{oW&Yt(r#@zRXx^v=-`Iut9Sa%IQggJ#vb=Q{~W)g5@ zQ{q`D55A|n8qY;_aGCBp8i=Cc`?_nZ1$q5)-SvuwoPD|Ow0&Pi?z}>G?hR0p!LHC9 z?w?p0BJ7pAbJr{jzi>FN3{;hqlppAhzs|+YxpY;0T9C@G?CvY*k8_lN)*biLANkcA z^XFLjm3{U3rvxx%Hg4PZUNR5&`DOI?&zhr&%fmlB?<5Pqv?cub!c#$O7QnWopI&l$ zpmBZV#_!DwGOm*se*er_s=yrm`Q`IH*Ee5r)@?qc@Wwhq(y2FFwr-#HYzZKHJxJq#oZY+eHmJ|{RFj0n zBtdk|+I8#X$r{s9CFJX0uQB*NP?s zpR;`Y&L9DD?De&)i9Zf2VVhb#R6VBR0^lGHBKn$uPChcj=ina$xq9td@5v&5`Wr+! za?L8vpAI(R`itX{#2F}_6JQAWszgp>R6$fASFTzk)Sq-($^p+=R2;b)dn|g*FVq*i zDtdG}6i3KEPlXgmQkjg(k*ii1L^p6rw@o^OB^GC4`6B&HO4j%AC*&)uzFM&gl78(u zuf(4y9k9ibE3t9pWSx8@jWv*8u2{j522ncnX(vBg#W;k8%uh65Uh(BLMCF-NUgQ^F z8APe`d6$|c6-sfMUty2SDJ9*#R~`oz6%Az5OTI|I-e(auVSgg}(wDSz<|L}-L5-n8 z$QOZxsIU}`6O2<+ppXZR?LPoy!;4>hHU&}dKw%LM5`Yu+LRYW-+C2O`{tqZ2U-%rS zsZADDQsrii^h5gfsx&J%7SVkEGdiNpL=y8JV<0X!WbM8!!!gnZ@`d79nly?%VR<_9|uz)r`^>>`pM~WFeOWh^RWnk@G20$C0{P3>^mYhI3j8KiNz&1IEE}->* zPK6acCa_Lm0j+!DZ#XNEL@@Q3FfLb(({*jx3TW-)f2DH;eZ@7q7stA{06O$I7G`*` z%h~cy(Z8IqbQ`?$1GMt7zwqMkKieh#g9=br8@G$@_{e9~a2avN*6k{LR;iq*UMQSZp~Y1^drxIj}bDjn~XbZhntV zfn3>V;9h@K01Y#Vev%4(wtSPFy#IZ9TsGZ3Ao=Soz z>Ppo((aR5e*cWT;Vs41Q>ZPpq5D#!_0VV-e^Z6;5MDehfNE=Af9=xjQU>H2PawH~B z>)iDol>jX;L-D+7oKhzApAxE?&gc^SiK^u8;3OGs7RYh_BqeTwaFVxbJ|TzQX}b25IrwUxP~A|=8>>bz;ZdRpimfGHGEiT?8P0*H{+zM zshq9qp*Io&oEgYU%mlr_F)vUkLKH<;6KCe3Xcs`1XSR*SIPXNQcGq|;Gahm%FNZT1 zP`ns3Xp*_rYN?COzzLET?q;fqx5PO(j^z?g^>mA1O~+z+7*B-6=3%4P+&nKuGiMWW z7)aXDRai*&HCae4Cm|*4(U90&i4)9ym65m+!g+zLz^e`QwY$p3-O{nPOna7Ei*y>D3MFSmkb{>>H1SkRX%0;#y;oW5> z6+Jk?l#F5JJ*AdP!E7LQj8oH09cAUzm1(txZW5iU5CVDBTP!gIT1X1#pi#pO-*VW~rWDkiV9FejZ zqy$I2E=z+QOD9VScYUOt)>>xoBW9>kSeDsyGzB+bdEBY2?y5L{)3ZLxgMfl z$#|~(Iuuz_7hA~w4BFF|n0t{L{L)#F*Jc7bIFO~QK0p zP;*_`Z6=T_WM7)qz=$p(Qm&kS6UG5dYMRA|l9uDj8#iI>A(3gRsbHsi*jch2IoZW7 z6Ua#*mTK{UEO23GPIk3VAs3mJ;sLQ^8t*FC@m2%le^`v?oKCzOK@Os_gw6nkFiNEjW^iB}mVo1w6st*)zZcHb62?4$VikA>b 
zgY@D6!g{fGAptAG@&TUHdC_4=fOiKtN=ZuKBq&Z!G@wS^0Vv*xaH>AgqDfGaK$t+D z*GJX@Rf4LM1dvq4(SDPlo`TOXEXitA(H<;HQ;;>47F+^huE|&_f>ca&7}1p$aD>Gd z@q(8N>#0LbR82_nY)~ZffIm&6;w4Ez6i}2^Cc8=}AW^4*3avzMNl7JlJCP)s?jkH! zG*wIM5f*CWUNa^1(n@)Ki8T+$G5g!cC@0x(n`; zfYM0BoL6g{xl#5KYr7!HYenAkvX6xCnULdQhf%>3cP~-VNh;?qc|#eF<5if^?Gh&o zP|=Ho6SWXCSx9JO)Sm>L2s4%3W&;VN$eOw^i1?IE;)Z zHz6PaW$o$9Sgn%eAf2epB|UG6O7cmVUBEmwugBI@>AR~uOvl6GOvN^86t0Jrcc04Z zSVJ_RL><73QU9#C$-SQO9^Wl9oo1Yw*j>Ar$3mFG^Op!VajH4UAEcLM9dcX%wymvWEeb z4WVRtD}*wYw?rsmv9u}RQDd<@}h9I1yP?mkf)fpC@g&I-?TdSvk$f}MKS@Q|uqO+*hB%IVj zEeI%dRLF`?C<+w8gmls()mj%A1fQ^KfUQM2aN8Ksq=#$i%~0MG|@8OK1xeJH!$cH{Gh{qQAYO~rB(8`inas9qic-@ThoJUK%)!o zSkjPWDWz?vx>mHL!zat(S2!_?m{LHJEPcRZ8KXN`kqgaOsgsnX%2=z?Y65EQVucb*8X(Jvqnw0>ET@+@<&tua)KcRu z18WB34jY0>$7=^i!!**S{Ake9GKS*Qigp22eKl2wh@sHX5Osp4&B&%KXUyP;j-)p! zZqZh)uu-p6R#aLm*&&a_HedK|`)8Zzx%h z$_E@YB;G2Hlhz=17g%EdGS-r%O z`o@PP%diF6RKC?m%EQ3m%rO`35EVggzcI=p)#+VS69uASad^ zhx-CW68ll3SE;gdQbWe)k7@Rho|6KLT8pFjf-7zqLrmxgw-qyl6C@|`6VE5u%%~u! 
zu)hcks%Y^MHww8rF|(PP(LAXv*6Cc1fC5`#0hZxJ3zHZfC~QJQi>4V7vxu2!WC-QJ?wk{gsak&mT0f|Y1RgONmZa2I8%?zD7j#&~tg zvV(BESZ7`?E$E?E+=Q~vSdu&WC_BnhoMaH1P2|j|GIZgAY)%e~>=EsxImCD(dr&i$ z%F;Bn61*(gvvS7h>|QC~h@q7spl=%$vyGOlpzLm};GrhyD1!;PKTB{YCaFnI(n;{K zfR6^9Tv?e}fmyBMlwfW@vr@tol9QTdMJvm(S($7vix}lA(0Jd%1KFW*G6R=Q<@#U~ zOB~G>fEKP}8OH*THPM8F@SYLUOe{&6Bl?9KIKVP zVKp^XOFKEeAB%U4vyMp%;MEu#84T*Pc&@*eq@n?Fu%-p2vt16}f<+9N6ybxO#W_h6 z-pB$j8XCD$Q&QE`pftAAE;zA>G1!^zv5 zCs1SEhvHZqJFLdz8|lPZNkPeM+vxTkwC0OjnMJTjn@}d2MLVf$$kr?_=&%bt$`VgX z8sdwNJgTS>zuM99F#Xo^z6}^>l5rzcV zz92TcX~eD1)ot#9vZO=t2>}VG>$b$b{N!u4-w}DZd{{q2cMKQC#l>6WPt%8OF??BE zWJFBGgg#$)v?e0VXopYJ?IV$UBlbqH2v_cieva~ zx??aAA&nh6O`mApv3t+X^idt_&^r+4SnO%Kt#;e4odquaLfu|^09$9Ev7>m)jtZRU zZnq`EWAF!O=B6r~?Cz-D7Zr{FbtRSIe0Rr0+<^o7DZ0zmh|}Gjc@dHN4Z5>^Pd@z` z&2-?G%k9wLP~OxhyoDs4@r1o>I_Qii)pHsWop@mxo;BhFwD6Ph=-4D|gM-71u)7XU zEVK++hjF+9uPw1@B|oo_ae~<0juRxD23)|f#&OJ!gka%>yB^~)~8Jsmlt6=cehnJpEmtY#S&_K^RXdS`_ z5Jr70iK!l977()z%o-#NNk->jFsi_DQlJX?iJ=9Q0>O~pEYPu3QhL>c(IAwifA)Z# z9zaMWVF;3(Kx7CF$7xvl0>F&kk$9O&3^5a!+5LpT2ZbJ1R!&9j3gC#qK5M_~M?)Z3 zS0E79-Z_2F+PXUW{??Ctc zen5|_x2M^?`t;XAmy8&IZ zaaXn_@#(Gx=9cz=j=5bHeov|J_UYKi${(eM{@G^dGX$W_jeT4{Aww3>sbQhLiF{vODm|19-E4JS3? z`9(I+sslmHf(C&NuEvJOz($%%7b4I-D}QJ_r9mwXFi2_DLHH|%95fE7cQw>E1U9%! 
zN~xJ<@H(kxxUK<*N47LK*4N>L;5qdwR#ec_M?jgo0-siD=yz57dORIEL)OFh^ z_t`oyLalYD=}uP-Jq$X^YkC|=ea@l!np#WkY5J(G5>JVEif*$};+tro2kFyv*LWE| z%5s*~3?b{eCh9RDPSf4PWontL*^Zp&9^rqn_2KEg!;OsR>aND4^l7@ItE8-3N5XT} zSJ%`~!fP$=rG(c9ucw4p-%Sb6QH%ehgf~bDuf3vzGTm@lJN>e^xOl>I@r37YY^EeZ zCmd1C#)f+Q5IieCGARJbPQ!tQbey0>@f^XAYf=#^>0l*7QA$L>^`V5Fk}9&2hasiSK)_}X+mVF%RFISk+sYYI5h!Rd0GXBNB!eGM68xk;N&%GM zf{+I%aGDEE40#k3q6@JCc!HY^Qas4>;6)OV2@r!IxImmV=)^G&wC{+aod5}m1UF0n z$%8x;nWUln8nSl)q_OtT8N@f^yuH;pNlJyZbS7>9lA5)D)*wDJgoRZa+ah;D2yx%q z7ldQBEQl!H`x-w3i;jH`=(YCE?swPI7xo4NUjjkBL0~{jAmn{IsZQ#``+GK;lcOsHok_&K6P0ko&L4cR40lX{G~!;)ATW z0l>iPY<^wUqGaQ$aV;K}QQ15nPfJlX#4>eIW#_fd_)ZI^k<^R zu-!g~S`E>G0gyBpwlCpX*=;D{3rovniXH0s($h%e@>kkmfSliH5&^Pa7fL!bpG7AE6S9$(4hOn z_wG?F^326Muw+)Sx$&X~bJ%x>jNWB3WmYdcvDm)dK*01W8?)>5^23;Q)}QGLlQDx? zY-D40`UV-Zo65Gy=IdatcabbLWt{5nxN@du) zTQ5-|9XJE3m54ZdhkA)bx{!#fJ5KMYqC!-}yP{nQI3q=CpF31wFbXqx+Sqw|xN#p6 z4DAS@II7mxgV*X34OBhTt~&#{fypVS(T=FP?G>7bEDOGjZRDX{5|@IiE)9K~8YH)R zxrEVfmmRm)3wJ3Zm2=wh&B07zc~KupG&>j2O%^Q9-gd{TmVL-ZL`4_m<>j>DtIT7y zmJqEIU0!#`Hal(mSZ!A@f)LcXkLv7Q_F<&XQn6E_*VYj#&a^}p`2-Jig%%a?$W|YZ zQ#D`U>NiA6qug-^@)C=xkQ#|VuOl@3lK&D+6|T7B%i$f_MBVYQ#5_^eOUmh}Q%Hp= zbFy3fE$U{x(HJ5{Re>U>C0=xnuE4vl!0cvxEg{WdX`S>QpzD)*Bs7d>h}-~=a~FpT z1~&Pd;9&D8eDBWQ;^oAvhe6Klu3?a?BrgmKU0!Uu2(Oq58K61_{@%}YlPG#)Bg2Bu zFeI6jk|M{#$i_q-CC(YDQ6sO1qaiJbS3|c>Vxu4Rhw%A3hkeKygny$x$9PTh4@gO^@ZhSV~O;v3OXTrw07*+8u(%I}=foUu4eS1W3+P{$`{ zg3y#79KY5gr;nzH3XZTGJk-EYF3WIA4aH%L<2JWoj0z4G6*{BuIiu6g1W;23c@GqH z(oWu`ho?k}sy;NzLYp#Lo=!DRtq%IY}I!BlCLee~?; z0q`h%A22KACEZb6A-6Uja&Ul~ltUH-%84>djS)qn&~ygfbOx!ix<~N`CnN0ew{Q_%L2Re{}#OaE8Z`VAD}(Oa&vfoUW)0EZb2GIaD=} z;ncuFDI^L*N&r$F*;eejA9LQunUlU!4Q%mvikTB142aVU8NPuWtYcegpLsgk3gnI%|T?}%am;rpeC&k@D2rYQRA%pj}#4>MJ!$0$5kAPz|7}W4$DR z2atq2kpT!cm&60DV+KfOfIXo_mdFeenkOJq?(sUQ;^Owc;z3YHXSl}jSmtWJSk{9G zV_PT`TO24hiVUFnf^^BKD-sP1c47aJk(5MgAgLScHU!UZ6WeG-xC;kQz}# zL`om2M5UvVJ=kyV+jF3+Fuu@$2xKvGG)Dxo4#z~Amg>L{Iq=ZFy}ORt1^xnGzLDqa 
zhrW&cL#p#;IQSr11n(#bBhTT_3jppaL(FwB7e~8TmxGAh@85?SwtKdp4d(@*h@Fe~ zLB{|M+G|RCQb!yWN4gF}A9`dwWcRKe9pRjT47TV|8LYcVy~TgnYPQga>br}5_xA4D zy?f`r-Y{|-etzB3kELhk6wtS!FfSAOry&DnryNCEgH$6y(2c70V&JarsWO}$$kwxZ zGt<-as~bAVJ=Wy)P;X~rc?Kob@Q36~NjuQ3=Cm6vlD8Iz!&yd_kr_-smUrqJ1%1rk z#+*Y(#1X+Am9INuoR-Qvwwwuvk@BFYomSM;7k(go*K9~Vs5GKSC-#ZGcSxt5J9li~ z+7S+C7&{QC=C5cF^&!DY1Yx|zVJ7~H$wS) z_#wJ%w&TjV+?)v;9HeJ)#twfoe6#299xsn=z}(%kY5m6^W9=`(Qc+!OaOCX{)aSQ%Y!};YMHHL+P>vZLsn3NU z3isJr+b}ueDYo2I!#1p6AO5f5(D|+OJFyd+@6w2kAbbpp?Afzb-`b60`RHt|T3dg- zF8sZ4XXfUu)bDz;^XvML*R5MizaQVkzoN1-_ra#0*WmZ-mr%9AWx-%H$4=-m?x5K= zx-n6Q18Vbvc!|@^Pvgc5fyplV=* zXcRgos0{!@Z0t_QGm1J}tM)R~3@OUab@lZ-NcEuqHIhTDokF?12!&!$qB#NG9*7m9 zVweJQJKV3&=g~}SMh2=Ixg!roiF2n(9=F}p9pCL^X-Ez+BLc-T0tgwXTLw5qL8Ov~ z{S8fzIEki&TfUpf9Cabt!c)_*Qc4!B6zvQAxBNgLF9d{IbV4|LB((t;006aW@`D8e zVvZ&>ZzV)r4BYfH3*>!xl}T4p133j&LtIR_fdCvhAlQbdpp=Mm&@LkQ=Ak16C0%Ps?DT_*qz4G|Stl?a2OPif~h|1}>##6s;S zb>5@V1dN7+YyK;~j(BE|Ob{mL3O5?`YXd+FaKjT~uEx>cjw;SxNx0(2K($FFVss$x zcU+MjS2_|p5-}JVAZCoj3$L0FVg7*~13VO34q}UR$_3H!K)cTwLp&7)VHH&I{W424 zBHSWl&fUjjWb5TtQq+iyBaYg+0quyLjs=4XH70KH29qu8oVOvcTpc`O}OM|3PNQ(f<)2% z!b{Z4v%S12M?G(&#UBA6`ozMJ(r65~3~UKB`!LjSn5rF_b^3C+#YcaOE%8j&8<8)< zKpd;X?#Zg_3?up#X!2oPJVYW?2W5vT`JIlego&pLre=sW^NU{9Y3g_{O8-NiP# zv7r%hGz9-cmzpm&TxdY{%Hiy6I3GdTpu54In{7y*G&EeG z)zTr(UmrSssx=(Gm~b)J*wB2fN0NUKy3u?N5mge5V{zE%nWs8|@q?k+aOtKApTLwi z8aV63OsIzHsh)5c`JXk8YuwjdY!DjO!mIPy3lUI-s7x8eE@@K|0!!Pb_%3x<#jUh zGI$O(wh6@*N!nWHtDHzE=rWFG?UcXPsyTO)?064; zq0xZV*w-n;)Kqu6j`p5DY}8Qquj>@t3LPV8Plrd&Y0-%W#-w(@+wKx;#$G>#^ z%j&2-RZBn}Tf*;$?QZ--ZT1ZG(u1B(*rzSEr;$IkIzlFyxDz|40Qlz*qtu& zKfLx_FVtvGxJf3`agDGVB!X0=6~)Kmro3_UhVMp#!vI+?Ac%pBu$ykg-$;aw)yM|( zW79kgkUqL0yKZz1WD$gL8K_wx_0)7fn3G$B)RDuSY#fQ#ng)Y2Qc)O39s+S@^T;&& zV%GrJZt6WvLy=s9UNjWBdkh8H*OkDdk+NdW-qGUX0Qe0syD>C1l>7qp()Po!8q8Ve zDZ#L}AlZbEZm^Ne_VrF26aj*auh#BY#+PMNmOIcH4;qfAxn=l3DAjTYGkhyVXu*iQnuA0AyWmDWw5mlz>kP`sD54wqH3_!{`Lg^ofs-Z zhaA`i`#hrKiS3CX!6d`FxLUx{ogdUb047(54&L8!o0O7ITKktGM8%KFylbez-S%Y= 
z5P`-hMIZ$*>f8g$KEqNe^R0f@&JAh{K#C8)lWiuqhw!*ddA5xn8k+z5zz+#0#}2!6 zadt2)G!9WX{Gz|x=Pvs%lP3|+WM%TPw3rkbO?m!atKNzRwSm?EM&pnM%r}ij9t-B! zI@1(xO=yj88Ndo5Z^a6cakT2&$ks|^mIg+MFsK+uhZE|%!Yv6cgB?>#0yTtV$rgv| zd%{RtJb*!*)5TE^JQ$>;c%(RE)}IZ9o9}DxZn}sQTCKQRf-@Gdu!MqHb-0Ln>!M92 zTQv2xbll)mhkV8&)O)iXSsbcEtk7iO1?Ok*4Bc#2))yHuw(45_ zz0+NQAw7jelzH)THyPKUywQ06!iDoBnoO_`X0WcX>1Ja~w=%7*=a@`BgRkC%XAnll zO1mg=MB_nN=k~Mbange{ArlXWAbe53Xf?L>D%0wkZXeu@lWnlA+C{Um;R+d7OSV#C z(O|zZL*aT%nci_B{%c!5l|GRv}#?$2Xb16f$vt;{vs)03isO{}z=>*Lw(YA3DQ z3*BsB>&f23pd!L*cfr6)m|>j1ht<^(KP3xWhie45SBifd^zYCaWs^_(YEfOc83+Bz z9-s8r+BMkUT31d|u-(AFMGY))H(fm^s$N4WGm&$i;*A`rqwpjLWIP?&sBP1RjogYz zK{-(E_HN_Wdx%Y7^GDUgnEn1?GOJcNvap8b47wwP8-#C83K#oXfQ-++>Q z7m7)7h%-k1o?W{<@?)VKlu`}pJ|`a`2c>`$&yHp4%pbEN`5&)e#JaM44!pTc_jx+} zv8sS-=+7iF&&MpPw`-QOR4Bx{kf3K(I2~*HMUU!}Pe{H46&dIIFMmucEDscMR}PCd zc^=j0uxr<`1|t1Du%3siTJr5ckVd@#P+;x41*|M_CFB>sRjhymrMTmDmnnx2DO82< znb>+8Y&UKsu4~pRw#=0p5`NLUs<^IMJD)W`NZjS>ysB>dKzgmQScvtkaiI$NzhUY? zMe2EX{new2z5wd6Lerx{<*Iqa^QxT*tddU=M;96W3*=QL=zFZJ=Sd;t=vk>K(-kWj8dR?zBDZ(49;B(Ug@}m{+vI-PNWmUZj&wqhEEA_DA0n(4C=`L0Bf%RidS zT)~s#*v!-W1wwGsW$Hj}3ct57G_nBE_nIg5(Q>g|&LO_qfeVCm=TA{b&kL{;vs%7v zIU~h^^ASy>)8B|lW<)-qvx#Yw3Ti{Tqv;j|C%w~$cysA8#@=z5y6fNJD1i4GYsCsI zN0HcBya=92xl7eg0n~m~(8#3Js}i&RD?dWjTl8`La3*tgrteJ3o&UNu&QcNb0|(0M zq<~6KJX7!dfn4&z45cfc7Qa~TOBj9pSX5@}K&d1Bl_rM|y+{sq@sjDp>AvvQGppCp zYItZU@;VL0MnNlF<*t8Z+#;q84qveku2#dMJQpooG!+9uytjpB7{fD4>{uI}{<5Xq z@OI&Xg;RLsz0)gJD_z9$JG40HhKpvn3lz04|05!5{Wt)V+Vyh?ENc<>O6oC zR3F{<=R4HiO?cvp7~%W=y!YSXv7H^OocF=yy8sP%#XS3MCWq>l)7pvUHzQ=Czc+6Z zlg*rEOueC~C{oMRci$s2C984R78`tP>vH1J?VxC4)Z|yZYrh3tuk##~pmwSq>^v;B z2C5(MT!P-B7as2hKLCaAWTp<1pmXQU`JaGxQ$8RDc@0QsT^G!s$H>`pCIWg}y>IS= zLvdT;sh)t+bncwlvt|?Y4yOm`tx?jie-1f!_AD{Wo&anQOOHT1B2Z`g@6Me)d)CZZ z)GSx5y$92vo}pOYPKqvZ+96{~@69FoGiS^kN6gg8Z)M+mD;uP?hRWmIMz|9R$~=;It0=&B-JNNwJwRu|3V;b zn?!$>dMrzuS|>%CJkoibsB}>iI#E+JYS?hMqhW{+P*ICV;!BGaRp4`2N)2%5vDjrr z8J_-JDTuCGlzBSL;qMK9ZT;lis*|28ouYYa`RO=_8KWvUno?Z7ijsJO7J6o-ww^R0 
zipjc9l>XFXanY40oL4li^m-Y_ET_9fO7a&Y-6cW=#X5uz=@rpV^nZt;YU-Y(7u`~$`V!ERVcfvRm3Q#R1+NT zSE>;&y~4`z&cM;11fseTKL=DhiUm2#p=RnnFJjgY7YGG{_RSL{j24T8g~}?t;_eFv zucJ@^&QZ|=Cx9q3E3!Qb-50hlRiB`uYP{TE1d!*r=$b><%PjBs!3e6H;^AtxbWY1G zi$&Ny0I0`F{0WfdQDJb{>a0>4>a3ymIe={8;%aywh=Ep$EBLV-%(GQEr>Le#N4fd| zC=*rd01W0JO3czsj}n7$R+<5>2vHI$E{R2iIxFccq*_Kg{Pj76*^C+=3_z(U07S7F z!y|zFuXU=@h0qx_I-Xjb;n75Wl{Cy%ttd^0&dT+JsrQFY2a?7FB5|U~jAh1eS$~vX zWO*gpHK_WhJ5R`69#23rOrgzO{<1Uh~C{$Q< z%p)txMnOmtj{U`8Y6fnc|>t5FgN6n zbU7m2u9bpUfKFg1y^X^tlS1AT+a5q&!}kSHm1HL_38m$2kSxk`8(na^SAWVu=fN*2T}0~+3zydG^n zNe5ZHE9NdPEGuHPG0=z~O_`AX4+0gDy=-v}BigRhRl)J65bKS>y7Pi#5%Y8@C~Osb zb>$QdEr82W83=)RaX%nUDOOE3*q9PI@gAO@UU#ufDMZ~5s}e-sxhtkdf1vxEh@;ot z1xwx+c;vIU>%OiYw+eG7XpslX-fsWd({nOvgJ^!>g{s#?iFX}%nvinKJv^iDDam&1 zEd$xt?uS$h|8F5JBnr5#qxz$A90xEZBMxud%apuH{^N^y+dOdr$M_3?%6*S&Nesz@ z_HeY|AkuX%=TD!FG|xx*+8$(o$+SMo&491%!Fea`K5u39V{uN)U&=$d8oJIG`OqGY zp$n0l8;+aiOq$F*oM;bJrP9=sfsf zVi(Nu`y#whQ@qN`^RROItg013RZjLH=^T9wip~b9^|0$|QP7yIEiIfc7M2(?f;iWK zI?_Bt5=I`nM;yGlt`mI0g$ykL$l=b7L0Feh)KW<)2B7BY=K%5omz-h^3|LRgJ09Mg zd76ilt*5j7q4LUDfZYSSayf}TZa`G#cr<`Bcm23KLv^8`KR~u}m0<`#(MrB>P#(*{ zJWoC=y@SL+!;OR(06B9f9RP8l%K>J2wnqta<{04KX%J1;u3iv5PI9gU_6$5F_X33SQ9bPF{ZT%aJ1O$} z%+f3m#@#m1ywFjgTzN;a7@ZFW(fdK7l;TE-mFZzaK@nFJUB@6czE}v|{?XavQ`(S& z*$+bJkLWc~Ga?_%Adfi1BPhy4iO~QdpV*6WyifXjP*Wm7{XwoOYC?J`>a3DOobHi@ zHl_bpwhR6DdHvtr4)*`l^X1lGQ>URt7g*V0$h~-7r%Mm!E-ocC=LmMTJNk=3GiJ_M7G%t&KaF{vL8R2gw`koR&8N?pvNarl%9s8$`ZdNx6?dQE z^mj59_{3vua6P-?}ur_t`daxbrz5me&geC0u;6WW!TVRV%RkoLduXNKt>;B z#O2=&9WrDn%OQ2KdhTNweZm0@dG%FBL{w2s-k8wAfWF_p`U)e&up5pXegYp*#mnX^#6=9d1q>w7l)z`ys&FFoZ2RYzUwr9hAe1V!a%$<44>8^T=NH9`R|yeO z#o>`$c z*s|~dDA@kXKm8Ga=U;>jFN;_AGr8-3{P8)alMarSVwe~%hm5|RIq}DCmf@($ zt09vwGr;=!v(Ex@-DTpcG32d#Oz-&lv(EqlAu%joT7m+E55@)m`_oT9^9(V#6~o~9 z-AwL#{13nX!_!RfcId3;^QBDgc>G^}M|1!be_E)&ygcL*Q+xmEcfSSM8Q-H6c;Uqt z-n_@;EC2M{-~9G>JU*)53(w8RS;YtAZvW(|r+)KW8t$mq2l0P?{+aDez4h3WPZIri zzmL#Epg&X0)SLhKHmYZ~pw3&R}Bwn_<07y#0fp{q$##KLO6f8ft|={q+pg?07Kl*7tw< 
z6Ga4PMYW##`9`MoegCn?e)7|w65}2<7Ha)GhiP}e`{<*O{e*{#C+@1Tzy3`l)9iYfK;0+QbkV^qjm>A{4O4iq}&VSN#4T7@hl zgCCW3pa8e5RS{Lyo(s8%iqcjO#!Na;kQV{Dddt!-4+g?e(<1^B#X48Ll@<3qAS-Eq zejXKhRpN+~UMf#W52O!}+*}VQ0*O*tOs3vdI;22~lPL7_?5xtFOCF4wv_CH=SLC|l z2qa2mF?j<~KKpP*-UIR#8O5@+*8{PV_R)o7&EcaL`iexcRa7lK7!w$(Fs$Wjk-1sL zMP%lqlJ@3i_s7V>77rwtv^OU!2BR11xV255fXk+jFkX<3w?r$F#XTO7MIhWa&(@%k zjvBlKGK&f;r3Yjt?a8K!xVU=l(OVYO!AQH(LWJoL5`{9a$pf;I_GFhWMggmFQ*Fb~DGd@HX)dN9GH-I-KfSIZPxxBW9h28{%-oz3>48{DJJV55RtFc6sUR}D1~%K()dYYl&T8p8>!N3i9GThRL0CyU z(oud^i$Jt&D<|`c2Vo}dI7Ss}y%3q(<3WT7;e{AkvdHuRAcU^AYs^E-(6PEQ;{*(M zRL|{cRDL;%u(HxGcmSa!02D%ps9;*pu>S1m(a$uDHd2i=nP+;4X40ph9Y!?(4TDKDnNkUx9n}id!$&ocX|R=+ za>hfolC~T@EDr0=V42eHA%;LiA!Kow2K!Pp)MCEvA!F>}L*kIeqx~r|$Mg_Q5D$W? zW3c|2o+3Z1fYFXBwHaj#B4jN^9;((6fB;EY%XjaHWHbJH;%sPf^pvyxgu-~p`R?;Ra!=N9~m~0(AaJoNL z1C^L-hvmV`9->UzwBNU11I;)jk9K<~X41y}`}P|=P8_lhW`sP%5E1ubB+0Z6?JtG} zk7@&&*cSjz9F+S{c}St84Jcg!q8~)-;QmVy~<2p7ZOv1BK#$D+f>->;3Q7j9a zJ(v*Ke1T_k-2*X`_rd#7K8q_1j!j9g2NOzW?}*fS~ACV5M1u$DG{CcTbaeKgP?TV zd5nD^HJ<#JQ^>afZ$-9U4?(g1H021BmB(bI<-0zDGx#W~3&HtdzeOCAsg;hQj*zto**bRg zjE5?dw;u5wfpfvrk}0l(pyX~na@e8jsq)xu&$v+X7F0h1Q9nv6Vx{D`&ViB%syOTi z6+X+6GRIC6i3YMaL_`5%y@zNeZzf+z$4Kp{m2#xbLo-1;c!>N3B9gD7+zllUju1II z`r*J42|p!D!Z3D%$V>4Zk%vnhTdj22OdgQN1N9^F;AsyLx*R;9u>+$Y79d{s5JSoE zeq8rQKP-=Qdx%!@C$95j95N3dN_YJOl4*jtZ@C_mPS%gMp+!)rEd+_*_>5}#=J-Q2qQU)(wQJY8L=8moiCkZf zLM!$;d6t+zI;Ehc)_Mxv)KTd`6h#2RfaA*}VQE5l#GmGOfcVO;J>T#x#Q zFk+Qht*;Sl^>uP}9RYf`G2qs!<%C$VQmoVf(ZI%9x4MxK*EVbb3wiOw<=S%XBe6nT zDOPDVr_{AlvpYkrveh{+lcie%#FqKkP%(5 zP%IRSbnw!kl3Yj+Ltdc^RnVnj?!5QkpD*T%1)2jAAIQZ`1UdbIf)vhtZ{EE3#rtBu z29O3$a``O+Oae~u9iT5>#lkb)S6YRj1$R$hQNrui%qYS5*Bj$>CHHVp;NAHF_ zHGi?TI58Z`n@ky1F`GadgUt8mEn$VT<}K0|C5C%snf(X4LEJ4m}SnM_5Mvl zrO$d#e=i{%+&TdnRS!(e&}UlHSF+MuX1}Yy8xN%yPehti!L+H`G!iN&?`EYxoIO{c z8xN(YOhlgd?RUgG+GH_Bq2${IglV5MdyYOQJ{+olb0V^#Vv=}UyrTo9Pqn5^J`YIf z#8lkk!{Ww=?`4cex|>qHgww!HpCV`9AeB?6%oH;%+_%HF>E5Q%NI(-%yroUj02Gt0 zx0jlP+c;&0F(ZI);`?KffF3_VOb`<_popEEu$@%8w{YroV|pOmdwnA3G!;x^nDzF! 
z9Ku|lHg%dYEfBWqUK{&{ctd+rjMK)82|7S>!Wkff)o&rYKV_=BgT z6r=4iW5rkvIDMQoe#~Z4srS9N-uAuiM}7CPNOFqT#Oq?T#@w_wt#Pkqvf7j0n&g}0 z58pjGj8dVLMs{J^8*+3#DOB~wL~WvYOHOCHEw;G497zOY&I5^}BjFx$myBdt+M z^f{MhBgY2D`oh6Oe@4!iFxqfpr;m_h>1!?bC65V=fkMmvOgUc{q9uuBYuGGO*<3ne z^y{O2qm6JdIO%yxx{6oD5N)UdPFB*ZYXGr&Cy#tB@R||s=@|OLOO%);UJ7Ile>sI9 zEjTd{7-fV*)z7_vX{0&D^Wp`9T7Q9LVrsoSIYi(kLz1*4J2^>D($N9u z&1Wd5irn%G;sp(G4GPFVe?)?NCk`DRKODY9==Qa?OPAU^(Qxh&R2}PXs;|Y#^ivJD z;76Eu8ft5BeAIW!sy)>KH^P*c@q@qCKxMNV@E}5c=Z>StX+o_4;+&*{WWA@4*XT6^ zYwaF53|8-{Y8*}Vp>d|H;r=-GnX2O_bo}DSFKy%=bisOCeJ84qQ?pE84Z6LbXeG1W zQwM^13zu*}@tp|N$P<_7_qH1RF54@s^=bpZqfPtq%k+B_{cbi;M_e2r1jR(L_ja1A zeXa)P*OGPAIu!MyAxBoT;SSXi#q6u*E}yl~wu>kn~Zj^;%katF`(BvAR%JQNbjm+N!FziE;rHvhKBI6?%ogGJwe9Mf`4> zr;gJ`-Ri3>tMFCCSL#)=q5}lJ+XPlO(Gs*G0D@Ij*-ZnkS5{N(UtTUQ)63$)N*)i= z@0Zl?=DA{Y5!K5QDzr+gqK;&nXR#cx$d;@rF4annvP5J+$q0nZT17GY%1TN~jZ$Bk zze1MMa@jqYGNjxnD$z@PrT#L?nz9+|tUwRd0b#SX5FO_Yrpd}a43RBmOp#{_izz

MI<8RO4mRsizc^#)#{A`2{)%@gT5mxkjszfO0@NNb?#6 zexyQW2}{3NggJv!my@sO`!Gf;vI-DL1h-Q}iii9P3To>4vLL^}C@>I_ z+*^>Fr|0?d0~Cn6L~Ssy4qBo$eWD+OvuQ)W)Cp6dq*h0BZXk+6n* zCdt}NWMh38W+2*zv?oZl`GrOy43#X+LZeYXXeNrip9aY?%LdmFps)3O*kK&>- zn|w$6PJzDwBRey)vW;v5V+FE>{ORw=E%1RPFQy~H=gaox7`d`QX_AA|))-Wio|&a* z`O(+Y%0?~|ik!%$qSsKQC>@DR12i3>GR7CKM z3_nNEkgEj!cvg--2ew2Oq6e^%?gLS#*O16i5f;M-n$?q$c8sE5KE&SSG5W)mw{uYJ z8pE!oph2hsnmi+{#5xH?KVQ8Jgn9+N@Y6cPt*qRZw=KpxiyUq!vnkHKPhY5BUU6>d!k0GNkmgunP&`TLU6OFh})EH!w z_{a8IG9VpQFhfR70-GT~o9S2cBF^XI_N%WZfZyq3Y}prT@m;rMeH9xhG&X>NQpTg6 zC?-r-^>G6>#b0C9o?zqATT2w#O_qpkl)uKNXK!`YFt7H?s%kPrY@$xG^=Lc6BVfH; z3uPTYOW!c70Ti+xFkWPb0>@=F|54ylMGaez>dLAB2*z<)bp-?LUR?I}z=j zt>e{fJi4k+kQtI~rImVRBFvzyVdHUG{cfEvEw3~x2USTnTITs`7$-6uwWZ}+xq%VN zYK6>`iekcSm6hw|J~C+KY`1zbT{0XcFhV-IH&YIcWXtPx)fR@MO&KU(d7x5Olk`vv zU4Mlf3EQQS5kj8NDregzVY@^X_HgoqYoW@O@segOmWeD!8Jj2{q`9^WZ7M*3?IJ_t zFO#LDK(Mn67i253k&~^3=ud!-24%8}zuwJduyzmv%T-LB3y`!zo()6lsS|uT@)?$_ zE=2q50d!iYgf=6VdAtm3k`Ojwxr&O7;(;(wWVd8bIhkKJTx8J_OKfF3Xh8sP$%B2; zi~MBK%1L@(6ZQQrErS$UpN9qoJ`lAMs~CARu!H&1OXINsRp=9LFi{qg{W4*{N=ikk zU4osCb+0JDK!e@!A%`Uk$c)){Fn#ds%7w8alLTX>%-Gc;7;aEt#+0$bAsfex6*9%F z%fb00gGae$>|_!809feMg5;Pb%X45I@D<0o8fC{UvyxT;yO&-%y@U@n*{eKdJ}_UL z1}efxc|98@3!ivSCh@-KYCh~7ObDzQ+8X2rz*e#`3pJ7{$AW+ed#1xG`SXxH%0;#d z3j=%REA->jR7y5W&k5wBakOjCXfdq9f&%PZEHY{u><^o*Z2G&gZx;j#pbQL}V5

wlNQLuuxs3>nM{_IovF-f7E@#+HEJe{q$A;z%ckRe zW=;SmNuEe^jFp*{5iw+e95Nb($I|`j0WwwTwX8!?7HkzLy*Sd7kS<||j0`I?o$Q!} zUN20PS5sl)22g@0V#eah23@kBp5nqXn3FAd^ZRem~rKb1PT+M|B~mybvnq;Kk?-UO7`;Qc`sm;mogvJMZn> zsG04y=4zE!x}&WTdd8ff2MB^eu8caAJf|3mb*}{iV&E`Wnkhr2R_j@ z$#vvy-41R>BvjQbTGIof4#dL1^LyO5bh*Afv~20pg~ve~v|j(%T(@=$ zy~lT|KS1RvP*8rU6|?~#`_`FjS92)zPR0^N`C!QkI@XuCF0fXvI7TYlyFQ>Hs2H_y zE2ZfZ*ZS93tCv?2$G&wRFlEW&_fzpn46G5W?KP`?tF2WZHG(d$Ev2~W60z7?G_MMP z_|^VZVx_fW4Sk>Q)GtJoZ!y*Ln%@GVZ&kudvBFxum0|(Gy!lKK3$6J}?|`Z!EE`y% ze5$MHXhja$}iAcVwXH>GvooitwEE-dxJbCoc0XwU%roC*e-^ocZs+KaVO^Exrv> z{8Imi)&~m@kQzJY;&<_$c-NYRcw&55e25^g_F({%mh+2faPT9~g};KgE*{D7b2shM+$AZA;$W?Temz@ol|ixw`N zzm}9TGp5g(Ls8FJ){OVNK}}d_scFmi_ChncWjd(#EQ-ucT_phsERgdT%r~$o`552V z_aOqJD3K**$anU^6Y$RuU|r@?{LJi_%>-?_oKy@F;;EQ_0+wbzDQ4DALA@+7T}-p4 zPHqH!(0k^)b5^idllIQ^>4<8ErcRkW=N3o<-VM!tckav`&?va+ooUmiPDS9;nzT%U zmN3_tBd6sM$G%0AiGWbyJL7l3TMEpPv**m#XUpldVGFiTn>=OmWP~sg0xSTTCN^7} z?Vn}MoJ%fY=)}ay@4WLi!k3fega!b#nV5O}Ol#Uw(n_X`e_I31d~4$Lo1hMuVNRbh zc?+%5$KxrS`4&QgZ+;+&gjvZ+85m^s&7&Bmm?$R5G27r)CQQ?(%1Nh)U;Ctqh+Iw( zkA|(i8P@%(}?B0*<*zDc$w zczN8|H^z=d^wJuglnoM8rU>Fh198r`$w3TNjT$><%xHuLUmr?j*zcrfSd@G$B`O$%7=}_HJs$+q@-b#Pfr+Q zj(%+>`D50;AtMmR6iMdrVWYc13XG;0C3~0A_h3bdiA9`P_1q# zBM{z0oX;9Qg%#QL!cdAPBCeNwofw=;icgY5mXaciUPL%jd&PYD`6;A`oq;%>e0e|V z(mU>@S6;z|81a&PZY8vdTl;(g4|0W34M$zC`oU1#X zh^^eITQI5Ez|>%;Mg8sIX=@cQyq$G*rvs>c z=mofl{Rwtn0^zh8lPu|#+PcI#9|Dj!c`OwYLaeP4+ZNc-+Ee$RqDUjSULcB&VHHiV zUZ5nR^?|Ip0-A*&B=#!UpHy4{?aSEF(cg<@*V9IY?!e}z_u}yg#yBWG- zE{iU}2;I+DJD^oh88$08rdQOCU%!Oq!gUV(+(}VHipkx=&xw@?K=Rg0HdCyPE*g{t z36Iq}M`BMGm6Tv3CW>T@t$xQzMm#mMio$I8x!@dGr;0v=V9Ux%_yP5lVC!Ol)CSU* z%IHQ02`=Q8aJTSVe3@N}s3J{LZMvEYu_v(#u_?kv4$~RP*Dvrs)iI&qxDCQUR7R4x zaY5263X5>W5Elc^!j{~JpUF1aJ&1Pt@Z)7NFeUejv3n{h#u-FNzed4G9H`5`3EE(F zp8|{5h}|J#j#hpV-5VQ(-wW$8ZESY%EAM@5!ecJ9g9w>r`2_{o#L1j`*bDrOyArh2 z1&FX&dD*S_@xCICSxf9}5VOP%=>}OAR6Vi^R9vtj2O&2rx3CYt58`uzkcS-&VwJfF z+1dCxzOb(VdKKn!YSt{t^XYXnaT61wk=DJ+}`BK6A2p 
z!(?7a&&tZwGp&rAJ^%*f(0M-0y_z<0ILTIGM#3W3$Xcx}BkC$h2B~atGjN z`d-L8nHdQErP4($E!%{8tVoVhfI^ixx^|4d--%F#V|>=xG-}0X9m~3hp9Mwxa6B^8 z%*sqZP6|}gt`7(GQ;TdGhKQpImFFMxACoCH_@>ioNh8C|%s582Oy1^59M0cg1m3nY z5;A1w9a@+66zWr_Abkw09;2+cX@l&mbl28=%OvDu_2L5Ya@`(Ml;~ z)6&fMGc?>ANI6H#5Xz;vqdaq=YO6^aIuZJ}78YAS=R z?!Ar+D6TIrcC%eMS9{XwH-qyj8nT-0XY1?FpmC0U4RpJKiPV{f?8$}G^#khV>23=2 zHL^8Bnua@)hVAS1gV83Skqybk8W=O`bz`@pdxaIgdKyG=+Ga!2M0HN6a)x!hmJP|( zI+XtpAPU({D}Rw{m>Br#BJ2i3Qg`}3ge)o82pZQn(J`>9nM_IDX&+h&q0s;7Iv*g{ z0I?8`#Ft#dxA__%57b#F@jI5xK0}pB^y4_XP9`N01!wHKlLi8m%CgiQk61Ulh5Z0J z5!CRPD_9GwN9GGw0Hy`&2%AJ77G>|MDx2`#msrL7YdN?mZ&o2A29s8P6$F^H(`;0( zkVW%VVJxgft#v{L6`Nttd{vY_f<>t{kV2zvDROiEDytl}4A>f3%`X^YZ8&&eS$2^= z^;jUu`7yf}(MZ_CQo8y~R%|d?()tgH0?iV~c`E7^JEwr^%vNEoD@wPL#7X zgGs@9HP8y5Ek^=LFSkk&`or;Rn0r~JOiE38C8CUM(qK{WGl$fAE6YVWf_No#pLq~W zHXZv1|Ldg`g2Rwutz+1V@v~J7 zvxaCO=0%gRE6e?SZXc^q&-5agH;h9cU^=P~c4k#E8$>d22yT{@mkq?s=`FE)N<@hd zQ9rthJQ!9Shw?GUYlVo%!5|irg~2&|gcwUHvdI?CY$_-&rm$RLNf6{g_#O8PAsq(J zC^8Gl!X<`LMG!X5=6MuRC=XG-sG)Wt(6xckRM2F3zJHrEE>`1V6C2H(oRHDX?BA7Vz4P%F&9aASy zWM%u5rQ;tROIJ*mPJ=b0oxy-ynSB+EV~iZk0!*;7avd4yo{B@;Fmw1BW>$p*&!;1} zN!BfwKJbZ9v%hu2ua0dCl5N9(v61FN5MzQmd(C_kf2Own^tu!9XYNvV8Ey)(+>lidF(u`mzhr_nM!iwPEwRG z6;Ea4{^6P-!WA&)aIwCfU z+8sSfx~ln&Aj4#oC?Tk727Rd{X9*_5RoOi#3Mst(_G^G8gj?Hbwh*Y;D1l(}&fAkF zjZ%oJcNGAU99PbiDFk@SMWoCnjcFBTJCnog2r%)jk>C*@zGQEDH-eanAPP-iAtp>5 zp&)lYBo>fCLZ%~P;s}|DawEwK((*1NNR1dCfQh389{e;sQh*pYE=fTS&tyfJ z1?hpcfTm!95ClO2;TE}mmePYXjY7Z#JO1Gcb7v7WrSF_js*$#&E(J^gd42Q{@bQP6<=UyN3oYzBlfZ>B zZ!qSy*I!kj+wV_{EG;X;QzFc0R3v$YL3T3&Syy=Z=`C=Uh9WJaU#IYg{j%a!`nGyT z^q}8D+(+rmpphd+zNA3=C#fe*!uTOTX~k$1LLvn6g|4fpG=W^q5x?ik1AC zQJ43QbCz;EmfO{86yhhqXv6G30cC_Q%jM(gnK)}O7K=$*ky^u1Cd3*t^pA|Yczfmq z7p2x*2_)FCp(tMRuMD#<;)Gx)ttUNUJQ^=EIdllZF}6ZpEFPm|GLD6^YQzquPF{ZH zIc9iu_h>aoT6tOsXr@#%0iytpO+Y=|XfGW@ZyrgFaAW0$5bmWH|MCpu`fg7h%c~(q zcSQNIRE2xt#Xm66Zp4v6977!4rq&iK;>`6`1dd*M5tU99u(1%Ky6Cz=JGCT`%CJ8_ 
z|9fV5bH@lZ2YP*&8OpHGT)~alq4deW0H}o3>B|-JNyXv+;hKSq}@158`Y*Fd9He|l6ou`IPskQ6eBMc409LRd9uztNI}Ah zXYGGxic^oBA^UPDq+3Adho=IK9(KiI!wY^Y&4u@u8vNjj|$){&WF;?ENF_rLr7FBxj( zy+S}S^cJhBX71?OOX30j?Qj1FV=ruZncr!ek9sr&bXgMcH@|rT$iA@s-YakL%<`F$_M_B&msiz3r8SWnYDv5T1j%@MlZA|?SzXQ*wo_g}h{|=}OH-Mvf<<850 zQO^g9L<`^J@FeJy+LQLv)8J>-rP7kdS&j*Vnp6ns6 zz&`Q#af100ETFVX zN>%Yh@bRDh^d~=ggft%hoCe@El1`6ZJtGD(&ptvv_Sg>r)t>97`M5Y&_%GFz?lQ~A z9)0u&fZqB|1nH{TsV2`e`YBjG`sj~;Owij;;l<>MJM$60?xa;>Aj<~uM?d;Lpj10w zN3CsDid;h=Pb(A!W^A!-+TeX26v{8~mgH7^XlJNujH7T!0YOGadx&5I*Pjh;&*FSt z>ZmA4RM6&8lu%Vp;)H@I;3V=xdHG(L(XO|JE;m<{mSTTKD3Yf!$|Diy4DUDyKl4Fl z5S+|Qje*Gaq7ttpjkYhb5VL_IqqyMhi1{1@J7K#P@kcL22$M^qqe-{Rn`;zGKJY>{PV$AZlE{31GRLXU_v##xjv zKv@^^sRDq8y(q;;QGo@ZP-fO)k3gJ{u}^Rb0EZQ!RDdrQW(G?N3VC0vCPyd@VZd0F zeG8jqnm=i-Afb%RLoou)wmigvH4g~2W)L6jhdP@Jl;G(pR;YB65lYYS!Vsjhk8>F# zXMUs;>bKx-??I1DWQl^s`S~~nq*ehJn!gxzD-S4ltrRaUJ#;KRmYHl*dv8)pvC`7T zF*E%@tjIGsmxP0y^o%v^ZBb^0;-p>>4y@Q!IXLl2n{5}P9Sfx$i$%#`5f~B36^HsU zX=Xo^bqmpAC5@(sLN)+}Qq%hJx=ev=;zaAIKqA#l+wV~xXIJVob8>iJPl_uPqO4w! 
z4&^Ip?@yd)awJYq4W*>UqGYfD2VY$f4q*Z$Qq0tS9)UQDbCZJNwB)#Pj&&lGo?^v< zWNU^4A&pQUCe`Q1p7~->b1(hWtjLSxRX7&g5<*N|7VuXq^ zDH=d~VmR41Im%8DQU~QfFV(ek4~7~9)$$%H{>2mR}ppt>geF3AGTsK)=hO9HqFNh<6`<@fs4$$ zVi?Iwbi|4zkv6z}epve)#;i*@MiF}>)z2EEkj$ab!9!j~X4o!GjX-c;91wBPJhamz zEEw*~PUSGZ9||i`+QHC)gI-+S*VviYvpgQkbwcjbjts$#Q8BsH)Y_-!6Mb+w7RklA5Qf0QO zJT`en#73i*Dc~1=^9Ff9KUM#}uD4{UAtA}#( zVj;4DBL7i@o@b?c)F5}LHU^+vyjV!E5?KPp14Zv(>ehm)LE z4V`Z&@J$k~gLC)*1cLc++cLJ=JHmgD+^ zXD(U>%ICnHh{b47xe7-XT(=E1 zbYSGI{T?g zG>Q0zqdcE$ydISqaO&t81=wLjiJp|u(UiS0C>hMh`3TZR@zGM?X7$6sT_Y~>yfEZo z>PM|uF4pCA$}A~4sK*0D=*ZDMv6AYKp}tdqSB@ZXppKX^m{8a*$Nec%#lW!zg7}VD zyFFa2a4j^i;{JA{=5h?D+Qh%A>vtt^8Gw4 zaIPd3m&}N+d_(PkfLpi21F?@p0OVhyf+3tT58XPFLWP`&fpQ)7{h@vPy@>x)jST*s z&%Zak`2XWO@L-(%okzqYy+8W#kE!T3mGc%)SU-Pk9qM2`7}xpjAN=qi9$}hWk^8YM zMfujl5C8BV6wRsD{qqK<-2LYF|DI?VCVqU63w=NKuRToa`A^^b9@7*t1mcgzhwq1= z>;LhghrS!7{rJ&GA706XzQ6mtotaTA6b7 zKYa6BL~=Mn&7Up*c>Vef0yTj&loZl8KN#2bH{bY%LV>x`=^^ou^KVs< zGw!O;b(+e3yq+{`dRmTx;1>6-xLbfEdgWmb-<14F#Y6UY(NG&2NE2+;@2_xVuz~R1n{g-~3kG_u_7Vud^cjOnBtOad#p=D|s{~ z?gw$VMI7d$Sg{^I!V>hV9{~Jn+>hdHp$pwk93%!=_dW1H++zTQ{0V~w-WNA35RP-p z@tP=+yBjrfb=1h!_sc!C_*->pD4)A$x7e+Lh-d?dEgqto^zoiu4%OHv_uur;LP;O* z-nDDD#zVzEYi~-(L$!!1cImryFqXUX@n`Ew3?;4GwR5LK6no{)8V}JTqS%S6ce{-} za%Z!LWP-EOsVx4@L`uMZClH&=bSWeW!C&Ojg7V%S%Q4fhSVhAsbei`WuiqK~-> zB7S)&i0076pKjiwF_rj|M)$>8i7zG%H-=k7W`2qiaXP35Psg`Y5LJ8nuS6aC<|nvg zNK}2BkH>x~PChYWxPQ3$(H0b-)0~k8<|1~2_E0jZGi36XPsAsNhxny9>r1avO9(S* z`Q}XyF&;9F?PACMaq>wT7aaP=hEGtlP7PI5(7yFok~id?O`qU4qQ-3FKlN=B+wYIF ze9tg(=#-5_)R~w7whzR~?$F!(ER*8lU@B;3CV4jVFT#F%loQ1#^r z_Qbfa#K|8g4<9gG4Ex&9ug1xrjTru&;a`n2^~Z4V;lB*Wd9=SDl05ud!yk+@AADv+ z((rE%|J%5b?~%WZ`0Ma*4F7su@V*~E|5yC~zZ?FIxM2LZp7`U?zm9lt#D9(p8V~&E zA3XZA$G`gcEZ=PYKB#J*o2Xp3&NVW{A-Vsg^OvmZXKvb0^}2P`>()`syVXMrC2yi) z-Wn58>HAQ(hiE2mbV_y;V{890_xnIxL&+QW?%A_f!$(KoFAh-MZuNyw`fS*{M}3jh z=PF7HC2!cX+x?aa{#-?g77+zLIU4>n<$*gfEb*CBUn8+k9;`jBogQ3IeS^ z$6vf}uOEZmKU+2V_51;}_uxaP`0D$O1I9u7P~2c=w3W=CI(Milm+rDQL8Nb8JbzI1 
zeMA)pT`Gps*G}6N;P0ftWS3|rtM8r8RMN_~-#!o#8%o9}Z^upzpE1SU{aUFNT)Puj zk~REc`gi+yO6pTZ#85ImW!vy6q)%V~UlKKA`|E!V{8g^mL7yp*v|S0j82pj?S`x;Y zNh^1Jx(y#8{v5MjZjd#9J*8i&R%ERNTuKT<9ZL?~#Ht&lh zFS3PfBy3qI8lk0@6faU_%f_25k0qB9NNhtPX@enYXvk6$AT3L~#z zYmz4Y+I;yircIN!AtVqs<3*Zz@2S6YXKaS{ulDO7Ez9@KyXSY$z4z&!^Uk~PEROxh zyoxt@t=^2|3 zI$>QOE#t`Np}#VSGL)Nv-GRPFPEYDKJV{zi7X1}5GsOq_-$wnDy`MQd1T98XXev;Z zaYodVK8$M|`P{&uJg8(4NjY>9Ez$c&1A|0ObtT4Zob*EkHp}4pG=*QP5;y4E?aC@0iyH>XD=U_nbI+ z>g1{HQ}TTUl6hO7*XdKz-BUzeDx*|J^);8=r^|doE zTS%(vl0M|NoE6od>^uGB2t4M?&~lk6&f;~S*n8s9XMHbTnJ=ApOP9HxyW-ZiqYwVs zKoE}LC1`pdsG_p1oyQ+~VlX)MkJsi`&Sy#QaXo9%de82=A3pGhcjiFfcLm;?Lf*=9j?Wdy}b-{8gJ;_TTgHx1Jx1OrC$?<$w73o9}*j^;+Sz z9O-SYx(Zj_+T8sIfBeYvbyFpa1gXfAReAm@g1QTIqsFp7a~o=B@=x*KKb8)R9j=@W^A2 zKlSvpc;ucP$REg;-r|}gcVTJe?X6uscb$0f^p_re?5kh@`Zw~wQ6T+}D>iq*lCsLJ zP1|=LIC}i7!~#o*~~{ql(82tM)uxV;13SGH^nNO-{04x^%!tEokuK`zXdhNQa~<%$-Uh@B0%+A_o2hK!}9#KKW(AfVkwl| zW$oU>_ZXcgM8wad-3R4^$|36@a$I~kcYGTaSW7__F|xVy&&c=w>ZkV|&<+^z1_bWf zuRB^%X^o(4C)>q4lun~-dkDDXFAgAgp>02(pZ#G|tEjlfwzC~16O?K?OlLArAp`G1&w*hZTSefSW26(fvZLF2Iv34$? 
z#*RA&@le&F%&l=e*OT{lB7>oA58GmH^)$0)8F=1mwYBqhwq4$#weW3ntJ=oft&T1Y zw-vY6!y2r{Ce~y%x1`W1?lLB#?2;eGxnrk& z=PaEp(9r^~=V_i?faO-DO>MVh@z!=YfnF6S79T;m7@nZ&Hl~>zKV4A!^Zj^zvw`6T&o2;s8jspa?h1VD~ zB&$=wO$LTHsm-}9Y$Myms$@X;7OSQ<#YU}{JqjRZQ)3fvGT`LbtT)fW(~wS;*hYg6 zW{4}tz`T}YU>!Zw2EEbNWYKv9t0ca>idV~G=oH9auX%L*;Tm`YF9(hboJEHmN;L;J zF1RUmE-b^U_o$v(I$)r~kQ9`rssbqkDc35uD|K1*tcbpB zj+gQ>oGcNDZICf2fjBsso2xg|xCWddYm^#iEn8u&DrEw0tu1He1m}Qr!0pw_X8RVU z#@d480=7b&st{kk)*`;D;LA5DRjz8sW@C{!RUjBEl?95psoc*PAE#t2+h{=a=t;Ykb$;gY}_MWbqz-fI44?+LkTV>K%6CA&K<1~DUk_O4^oj9kt#db@PqlWYIH>3cccjUeMU?purl#%F=wrSZi7B9IRC|e%C>V+FrAb%fRLnpHQJyrTA|&>G zsxE<#5L57CLFZqGB2g6xEpEm{Z0@h(Q38#?w zq;1kJ2==HW#^P2io)fo%A$v#*>k&uP9ZSYz_*aYbpc8F2v=zjxKu`|4LN;9HMvWNi zJivJbFF$THIU@_AYCsKUwP=OhuZdXI^wVc9{ zFy5k6p!8sVC@%~xD&tbe;caeQ^=abTM180wAyvF&*a9{j5d}jv8wmY?$UAmw$hr_Uo^gRw=gA{!!`R%di%Bs8ZfVBGrP1;-1Z{C0wNhdN_o z&noB584;2D22yU>Ks8^F;rkYxwvZ&zn0Y}&4qlu=k}Q0O{&5WTM`WZtR1rU&ikU`a zw*3O_(wC^Z2<1>@GiLnn#WsDJa%@LU$7Z}6Pu7H z)VMyy5JgRqedZfe=N;!kxfDiu5NzHl8rQr)`*ikIe8t1Z!1L;Pe$jj>I$;L@dp>^Q z;-$Px$=5HvGCkv%Nd`lq2+D`JKvZKn6Bd2hGiRvgodFjwAvW$bXWNvCuXef9ETTYE zvL@_eysW7NTJawWy+BgbSoVZFo;!s-PnVDl)HTVPFyj;PthhxV?C@0(GN4BFSiuA~ z`$XIwhrB3ZO|&hYu&BZc@F?6tje67;TQ&i4P#_|7>5LUk&nZW%d32AkD(y$i13$!8NdD`gI}ad(H0;HWDTGm) z)`LB)$2!s@_ZWwJDdnH>{jUAU9?ka}-CaK=#0Q5C>xYd4Cq=@GbDfj{x;wdN_hCu~ zV7}IVNIPU4?EV^JUz1<%L~sk+!*&}Tr;x%q~jep168fCt_7#w04=> zA4jaF`R#`N`hK(PZ$UJF+1({~!4hnZqOW0{Y}}{qGxi>&$Yt|yc62HTHZZqS+63c+ z?j~few)S@OZlmj7%5G^6wWY%rj2(O5Ok^dDpEWn_(e@;}?(+M64xcv8#*DjnbnJpn zy3^RvK{?I&agLlm+waaZvtFp~#*}e-#EZ@Plo9)}k1Z;+3 z)(AV!ldShNHgx87njM`TtOLg2PPRkd&f8)6wZgh<;R0zO6roe4kf6>EtwRNhr%^EM z1j=e^B!(7+V3@l)b~$&cI~8DLaN?~pFsvCiqZku5WSw)TwnN#jV1h)2C5drhE@DP7 z9qSUe*CHGOwx0tKrA-Dzy~zOP7$C4RYpt4gd%M}z-lk!hd>brZF$T<~n-ExfTa>m$ zE8;5JS~c7Yy+sCyhT#e%mX`B>YuFeHfX@V(; zzAE~1L4zTlY(l1ck0>1f2XXlOqKM`z6f}-DC!vd82~EKOou)~czlsX)UNia@k!GjDa9cJeUlLmA)ks{0bvSj! 
z%rcF)!$dMCUwqB|ntn0)J(vP8%*;3?oYvu6WlV>{u=k8eG#YV5)UXweY4G^AB4IV` z3_0N|ZB514U4dM*+r~Bg=I09iPPFS|lx5Q)Sh{_%0>M&2n|W>&-{#Dbb0hfV z;!BLfIP}4uvlqWKA}H@pz~L;F zHplq8Y1-}StU2Z{qeMX+UfQlNQASi)dlQs1$Na}A@*PmkIohtt4^Ws7_Nh5R#nsKf ziJsTbQwd&d);Hp4$F>$CW6WKMVMfWB=~rmCCNEA1&n(!1V!yrz7`C{gV5V`7%imjI*hikXIKzjK&4P;Npl8siI##9Na7_v4>X@)Onl0OPLvs!B}J{ z2OEo$jW}gZLwf9SHyqvIVm1xKl6>pX>O!kI5ubp`DV)!8FwrS&`J4&7p~3r|QGq-% z72N`sYStoo@d5!bv1hs1)hsMbMFXbg#Nr7(rU{9$LK90TY(ivIgbEaj`ROTnfYB*r z3ic=_O>#xt@S+Ae!zb{VH4)2=xr8K<|GpIfh8E@&I9=dKJ|JY+zbud+O`?A^VvFeT z$R`=H`~atg+YTpx4Q;Z&0Fs5bIf#MkcSW-C+m;rg0*I^)@}fa99APsQ4!I#e#9c{C zhwjT4@`X1nwBp4Ko81}@*+RKOSX>aEoM0jlgy9%)g6f9Zlfq^NOn(6HKiLoO+BWr%| zCN3RKiovfD&x(o->&pN&D2|ml&ZUv8{w+Y$@uWZsXnA=FuyV`KMa=2VSkjxQ6hnI8 zuLSPjDK1v5IvsO*11AQ?2~~Mw?V7T(<$%tLHRJa1`s=BsAdEG2e=2d7b!|gAJ=aT1 zmI2Ee$#jHGNQPD-wQ5%V&PQ`3rlqS^ECM!-L=5eXwXo{9{|j>2GQ!IB0!H_n#{RHs zQ))RhHhgzx=_$QzNgdZlj3Ht-3LEYCSYd z!Yy1_m<{9{{jJ5mU{&v??>Kh3J@m(6 z8-AU}eKfLQ=+I>Hg@SFyD}F_`S896VHl|jBQ60K0Es=!13Hyu|%gWwLuYo{fZr~&{ zz~JkZvf)x7!HK4oT7XWVWs)%Shz%<>aVyxe_r)DfG14_)R!G83gm;=4h!A3N`PtcS zhl9ARlE{Wkb!SUHNKxq~iTQOkQ0YB1RnxeDhZ!|Fe6e*%^=Et2A82sYAQY|6+lH+jFCyS zqByotM0gckru^K;#*@B*4Bj87D&s~b5muaxN#R>RCMEkg-anJ$XES>XDshHG!nizU zWD*&O_>wP%g`?6~Vsz|GM&A@)niCP_Q6rOxlIrM5%c0ce@@V4RXeJrH2bj~1QQ;kM zE;;&q25%T?I6+B|ERW%k?wpZHr`~WDz>45!H7cKDBgu2$={o|i9`jfk{1R9O9Ur#O zDI-QES;ljZrVR%G5akgzoE*udGr=4m@xh!r%frTV8Qf{)8$@|T`#2Gw{e_1HI;Td~ zhmB`5I^*a@j0^Xw5q&r@IQ&eSSb}-qm{f|$2aTbpy}c?HwF*RdOyN$D`N%LQcv20b zMCQ=9$VbXNx`b8@0KV}H#_}K^FoqvJOvIi%>W#I)lL`IgL3to~cJTfla)LVAZ3od9 z6M5rr(3OG2*@36i^J;?=13v$Yt_r&Hv&P`Vf^T0HNbudjfOf%EKASi*@aO2K^;*T$ z4V`SDDCi`gF;9`lmf_tcQ@3+!nk1j)XN-X_^q`a3w?WF0lr%R5gyh*X4-p0Pbb@4z z43EH9Nf8{CGs%Hx{;&rQO=e%Elq!kJ0cMOO;IB&vwe9LnfSVIN$?XA)VV0M1M@N=GBWlvE52Oag0Ul3|`wtOId} zOP0`lLb+uOfUhGWfE2Du7#*IBP;VJzA`){V!rcj)aUg@5$y&y1*vF};6`k&zQc*a! 
zWb``qF%CBo9PekGbY}*C92|?WsXtK@W8<0R8$<;|%R+Sy^dI#qUbBx>3|=&8-lSRK z^aPQHdOVv!PZ6mvsYf=N{1}x9&592jf~t`6{D^f*H^OlVT=^(;8+7MKRMKrS-aJXM zLc@Uid6g{BiES*XE> zdub*8l~59-2IWx|L#N|%R}!MTCK;qN@y3y&wWeW4)w8_hc=>9-%|)&>_^eSLhIU#q}TBDs+gK_ZfHA3Kh~n z*LQSp^BPhi#dV3AjiqS~^8Yaqd?=@B24EmXB+EDD^l3fCy(Tv|uWc+^c#_x*x z*(<$`>r)+Y@sq=Mlo#2lH!hw#=vh}FE;h_xfBS(u>(&(57)ZF9KDB3S=>nau*3Vyi z?<@Co+_o}DOV* z&RPu{vTM`NPl>JQF-{l5K}g-el~((3sSjUS(T>fz(tZ(Es*E>5Xz{_^ZA z{^!1Oa-V1YLKlW~N~QCcFMlsM@c8Gw?NuxDY_jB%R?WXZ`|3<+@X6D^|Eb#5;Kk6m zV*b6vzq~jZ8hYwW_jKQ}VR5$3CAYL}{+-LOefRuC0L`D>-+b$eJO|Kjsp#69S3i93 zH^2DlU%haCGBW=B6OY_`;P-AVDR5%UEU5@Vp6oB*xiB*s@ee=!wJ+Sg@As-!6}mJI z`76S)&)YN8ld-UG__-&({J`-&O;sgDS$N6FkqQxg_f{z28yi0J^fw>rP@$-D_gNBKijDTUnCWlyuWJY%H>NJ6)nhd!H5U Date: Fri, 18 Oct 2024 13:42:33 -0700 Subject: [PATCH 407/485] Move test_samples_default.csv to data --- test_samples_default.csv => micall/data/test_samples_default.csv | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test_samples_default.csv => micall/data/test_samples_default.csv (100%) diff --git a/test_samples_default.csv b/micall/data/test_samples_default.csv similarity index 100% rename from test_samples_default.csv rename to micall/data/test_samples_default.csv From 56d81ffd4421be6faa01460db952d811bbab63b1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 18 Oct 2024 14:39:40 -0700 Subject: [PATCH 408/485] Move top level files into micall/ subfolder --- .github/workflows/build-and-test.yml | 2 +- .gitignore | 2 +- Singularity | 16 +++++++--------- micall/main.py | 18 +++++++++--------- .../monitor/micall_watcher.py | 4 ++-- micall/tests/test_kive_watcher.py | 2 +- micall/tests/test_release_test_compare.py | 2 +- .../utils/docker_build.py | 0 .../utils/micall_docker.py | 0 micall_kive.py => micall/utils/micall_kive.py | 0 .../utils/micall_kive_resistance.py | 0 .../utils/micall_logging_config.py | 0 .../utils/release_test_compare.py | 2 +- .../utils/release_test_microtest.py | 2 +- .../utils/release_test_publish.py | 0 
.../utils/release_test_setup.py | 0 pyproject.toml | 12 +++++++++++- 17 files changed, 35 insertions(+), 27 deletions(-) rename micall_watcher.py => micall/monitor/micall_watcher.py (98%) rename docker_build.py => micall/utils/docker_build.py (100%) rename micall_docker.py => micall/utils/micall_docker.py (100%) rename micall_kive.py => micall/utils/micall_kive.py (100%) rename micall_kive_resistance.py => micall/utils/micall_kive_resistance.py (100%) rename micall_logging_config.py => micall/utils/micall_logging_config.py (100%) rename release_test_compare.py => micall/utils/release_test_compare.py (99%) rename release_test_microtest.py => micall/utils/release_test_microtest.py (99%) rename release_test_publish.py => micall/utils/release_test_publish.py (100%) rename release_test_setup.py => micall/utils/release_test_setup.py (100%) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index fb75d6fc1..35f4333bd 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -119,7 +119,7 @@ jobs: run: sudo singularity build micall.simg Singularity - name: Singularity test - run: python release_test_microtest.py micall.simg + run: python micall/main.py release_test_microtest micall.simg - name: Upload Singularity Image uses: actions/upload-artifact@v4 diff --git a/.gitignore b/.gitignore index 5e2eb7a2e..039db64ff 100755 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -/micall_logging_override.py +/micall/utils/micall_logging_override.py /micall/monitor/HIV1_COM_2015_genome_DNA.csv /micall/monitor/compare_454_samples.csv /micall/monitor/compare_454_samples.png diff --git a/Singularity b/Singularity index 928a15ef6..bd0396060 100644 --- a/Singularity +++ b/Singularity @@ -35,9 +35,6 @@ From: python:3.8 .git /opt/micall/.git ## MiCall - micall_docker.py /opt/micall/ - micall_kive.py /opt/micall/ - micall_kive_resistance.py /opt/micall/ micall/__init__.py /opt/micall/micall/ micall/project* 
/opt/micall/micall/ @@ -49,6 +46,7 @@ From: python:3.8 micall/monitor /opt/micall/micall/monitor micall/utils /opt/micall/micall/utils + micall/main.py /opt/micall/micall/ README.md /opt/micall/ LICENSE.txt /opt/micall/ pyproject.toml /opt/micall/ @@ -118,19 +116,19 @@ From: python:3.8 echo ===== Installing Python packages ===== >/dev/null # Install dependencies for genetracks/drawsvg apt-get install -q -y libcairo2-dev - # Also trigger matplotlib to build its font cache. - cd /opt + # Install micall main executable. pip install --upgrade pip + python /opt/micall/micall/main.py make_blast_db pip install /opt/micall + # Also trigger matplotlib to build its font cache. python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' - python /opt/micall/micall/blast_db/make_blast_db.py %environment export PATH=/opt/bowtie2:/bin:/usr/local/bin export LANG=en_US.UTF-8 %runscript - python /opt/micall/micall_kive.py "$@" + python /opt/micall/micall/utils/micall_kive.py "$@" %apphelp filter_quality Post-processing of short-read alignments. @@ -156,10 +154,10 @@ From: python:3.8 KIVE_MEMORY 200 %apprun resistance - python /opt/micall/micall_kive_resistance.py "$@" + python /opt/micall/micall/utils/micall_kive_resistance.py "$@" %apprun denovo - python /opt/micall/micall_kive.py --denovo "$@" + python /opt/micall/micall/utils/micall_kive.py --denovo "$@" %applabels denovo KIVE_INPUTS sample_info_csv fastq1 fastq2 bad_cycles_csv diff --git a/micall/main.py b/micall/main.py index 04acd4b4d..d1c86f6a8 100644 --- a/micall/main.py +++ b/micall/main.py @@ -17,15 +17,6 @@ # Run micall/utils/get_list_of_executables.py to get the up-to-date list of these executables. 
# The consistency of this list is verified in micall/tests/test_installation.py EXECUTABLES = [ - "release_test_publish.py", - "micall_kive.py", - "micall_watcher.py", - "release_test_microtest.py", - "docker_build.py", - "release_test_setup.py", - "release_test_compare.py", - "micall_kive_resistance.py", - "micall_docker.py", "micall/main.py", "micall/resistance/genreport.py", "micall/resistance/resistance.py", @@ -48,6 +39,7 @@ "micall/blast_db/make_blast_db.py", "micall/utils/concordance_evaluation.py", "micall/utils/basespace_upload.py", + "micall/utils/release_test_publish.py", "micall/utils/compare_mapping.py", "micall/utils/project_seeds_from_compendium.py", "micall/utils/fasta_to_csv.py", @@ -61,10 +53,12 @@ "micall/utils/fetch_sequences.py", "micall/utils/sam_g2p_simplify.py", "micall/utils/contig_summary.py", + "micall/utils/micall_kive.py", "micall/utils/compare_454_samples.py", "micall/utils/genreport_rerun.py", "micall/utils/remove_dupe_dirs.py", "micall/utils/find_missing_samples.py", + "micall/utils/release_test_microtest.py", "micall/utils/denovo_simplify.py", "micall/utils/sort_sam.py", "micall/utils/sample_fastq.py", @@ -72,21 +66,27 @@ "micall/utils/projects_upload.py", "micall/utils/projects_dump.py", "micall/utils/find_chimera.py", + "micall/utils/docker_build.py", "micall/utils/probe_finder.py", "micall/utils/aln2counts_simplify.py", + "micall/utils/release_test_setup.py", "micall/utils/samples_from_454.py", "micall/utils/amplicon_finder.py", "micall/utils/driver_utils.py", "micall/utils/seed_alignments.py", + "micall/utils/release_test_compare.py", "micall/utils/remap_fastq_simplify.py", "micall/utils/contig_counts.py", + "micall/utils/micall_kive_resistance.py", "micall/utils/ref_aligner.py", "micall/utils/scan_run_folders.py", "micall/utils/contig_blaster.py", + "micall/utils/micall_docker.py", "micall/utils/hcv_reference_tree.py", "micall/utils/sample_project_summary.py", "micall/utils/get_list_of_executables.py", 
"micall/monitor/update_qai.py", + "micall/monitor/micall_watcher.py", "micall/tcr/igblast.py", ] diff --git a/micall_watcher.py b/micall/monitor/micall_watcher.py similarity index 98% rename from micall_watcher.py rename to micall/monitor/micall_watcher.py index 763e6b2e2..5461270b6 100644 --- a/micall_watcher.py +++ b/micall/monitor/micall_watcher.py @@ -9,9 +9,9 @@ from micall.monitor.kive_watcher import find_samples, KiveWatcher, FolderEventType from micall.monitor import update_qai try: - from micall_logging_override import LOGGING + from micall.utils.micall_logging_override import LOGGING except ImportError: - from micall_logging_config import LOGGING + from micall.utils.micall_logging_config import LOGGING POLLING_DELAY = 10 # seconds between scans for new samples or finished runs logger = logging.getLogger(__name__) diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py index 4ec503c34..e402fca5a 100644 --- a/micall/tests/test_kive_watcher.py +++ b/micall/tests/test_kive_watcher.py @@ -23,7 +23,7 @@ trim_run_name, compress_old_versions from micall.monitor.sample_watcher import PipelineType, ALLOWED_GROUPS, FolderWatcher, SampleWatcher from micall.monitor.find_groups import SampleGroup -from micall_watcher import parse_args +from micall.monitor.micall_watcher import parse_args class DummyDataset: diff --git a/micall/tests/test_release_test_compare.py b/micall/tests/test_release_test_compare.py index 823bd03b3..cc6a8608a 100644 --- a/micall/tests/test_release_test_compare.py +++ b/micall/tests/test_release_test_compare.py @@ -4,7 +4,7 @@ import typing -from release_test_compare import compare_sample, SampleFiles, Sample, \ +from micall.utils.release_test_compare import compare_sample, SampleFiles, Sample, \ MiseqRun, Scenarios, ConsensusDistance, group_samples_file, \ group_nucs_file, compare_consensus diff --git a/docker_build.py b/micall/utils/docker_build.py similarity index 100% rename from docker_build.py rename to 
micall/utils/docker_build.py diff --git a/micall_docker.py b/micall/utils/micall_docker.py similarity index 100% rename from micall_docker.py rename to micall/utils/micall_docker.py diff --git a/micall_kive.py b/micall/utils/micall_kive.py similarity index 100% rename from micall_kive.py rename to micall/utils/micall_kive.py diff --git a/micall_kive_resistance.py b/micall/utils/micall_kive_resistance.py similarity index 100% rename from micall_kive_resistance.py rename to micall/utils/micall_kive_resistance.py diff --git a/micall_logging_config.py b/micall/utils/micall_logging_config.py similarity index 100% rename from micall_logging_config.py rename to micall/utils/micall_logging_config.py diff --git a/release_test_compare.py b/micall/utils/release_test_compare.py similarity index 99% rename from release_test_compare.py rename to micall/utils/release_test_compare.py index 197ac857e..eb10bb4dd 100644 --- a/release_test_compare.py +++ b/micall/utils/release_test_compare.py @@ -21,7 +21,7 @@ from micall.utils.primer_tracker import PrimerTracker from micall.utils.report_amino import SeedNucleotide, MAX_CUTOFF from micall.utils.translation import translate -from micall_docker import get_available_memory +from micall.utils.micall_docker import get_available_memory MICALL_VERSION = '7.15' # ^^^^^^ Version of the MiCall release being tested. 
diff --git a/release_test_microtest.py b/micall/utils/release_test_microtest.py similarity index 99% rename from release_test_microtest.py rename to micall/utils/release_test_microtest.py index 157cd13de..0c76d0aa5 100644 --- a/release_test_microtest.py +++ b/micall/utils/release_test_microtest.py @@ -627,7 +627,7 @@ def main(): type=Path, help='Singularity image to run tests in.') args = parser.parse_args() - source_path: Path = Path(__file__).parent / 'micall' / 'tests' / 'microtest' + source_path: Path = Path(__file__).parent.parent / 'tests' / 'microtest' if args.sandbox is None: sandbox_path = source_path shutil.rmtree(source_path / 'scratch', ignore_errors=True) diff --git a/release_test_publish.py b/micall/utils/release_test_publish.py similarity index 100% rename from release_test_publish.py rename to micall/utils/release_test_publish.py diff --git a/release_test_setup.py b/micall/utils/release_test_setup.py similarity index 100% rename from release_test_setup.py rename to micall/utils/release_test_setup.py diff --git a/pyproject.toml b/pyproject.toml index 21d077560..eb8a628e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -186,6 +186,9 @@ per-file-ignores = [ 'micall/utils/sample_sheet_parser.py: E275,W504', 'micall/utils/scan_run_folders.py: W504', 'micall/utils/v3loop_alignment_scores/plot.py: W293', + 'micall/utils/docker_build.py: W504', + 'micall/utils/micall_docker.py: W291,W293,W503,W504', + 'micall/utils/release_test_compare.py: W293,W503,W504,F841', ] [tool.mypy] @@ -212,6 +215,7 @@ exclude = '''(?x) ^micall/monitor/qai_helper[.]py$| ^micall/monitor/sample_watcher[.]py$| ^micall/monitor/update_qai[.]py$| + ^micall/monitor/micall_watcher[.]py$| ^micall/resistance/asi_algorithm[.]py$| ^micall/resistance/genreport[.]py$| ^micall/resistance/pdfreport[.]py$| @@ -259,7 +263,13 @@ exclude = '''(?x) ^micall/utils/scan_run_folders[.]py$| ^micall/utils/seed_alignments[.]py$| ^micall/utils/spring_beads[.]py$| - 
^micall/utils/v3loop_alignment_scores/plot[.]py$ + ^micall/utils/v3loop_alignment_scores/plot[.]py$| + ^micall/utils/release_test_compare[.]py$| + ^micall/utils/release_test_microtest[.]py$| + ^micall/utils/release_test_setup[.]py$| + ^micall/utils/micall_kive[.]py$| + ^micall/utils/micall_kive_resistance[.]py$| + ^micall/utils/micall_docker[.]py$ ''' check_untyped_defs = true From 467bd8f31d6560310238aa2c984de46a3429c820 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 18 Oct 2024 14:32:43 -0700 Subject: [PATCH 409/485] Use main.py in Singularity --- Singularity | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Singularity b/Singularity index bd0396060..bc53151c0 100644 --- a/Singularity +++ b/Singularity @@ -123,12 +123,15 @@ From: python:3.8 # Also trigger matplotlib to build its font cache. python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' + # Cleanup. + rm -rf /opt/micall + %environment export PATH=/opt/bowtie2:/bin:/usr/local/bin export LANG=en_US.UTF-8 %runscript - python /opt/micall/micall/utils/micall_kive.py "$@" + micall micall_kive "$@" %apphelp filter_quality Post-processing of short-read alignments. 
@@ -140,7 +143,7 @@ From: python:3.8 KIVE_MEMORY 200 %apprun filter_quality - PYTHONPATH=/opt/micall python -m micall.core.filter_quality "$@" + micall filter_quality "$@" %apphelp resistance Combine HCV results with HCV-Midi results, and generate resistance @@ -154,10 +157,10 @@ From: python:3.8 KIVE_MEMORY 200 %apprun resistance - python /opt/micall/micall/utils/micall_kive_resistance.py "$@" + micall micall_kive_resistance "$@" %apprun denovo - python /opt/micall/micall/utils/micall_kive.py --denovo "$@" + micall micall_kive --denovo "$@" %applabels denovo KIVE_INPUTS sample_info_csv fastq1 fastq2 bad_cycles_csv From f0cabeced07023f3842598ebb6c51bc17276114a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 17 Oct 2024 08:35:07 -0700 Subject: [PATCH 410/485] Update docs on scripts locations --- CONTRIBUTING.md | 26 +++++++++++++------------- docs/admin.md | 12 ++++++------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c3cf74388..73445487f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -60,20 +60,20 @@ this, you will have to log in to your account using sudo docker login docker.illumina.com -Use the `docker_build.py` script to build a Docker image and push it to +Use the `micall/utils/docker_build.py` script to build a Docker image and push it to BaseSpace. If you add `-t vX.Y`, it will add a tag to the Docker image. If you add `-a `, it will launch the spacedock tool to process samples as a local agent. You can also set the `BASESPACE_AGENT_ID` environment variable so you don't have to supply it every time. You can get the agent id from the Form Builder page on BaseSpace. 
- sudo python3 /media/sf_MiCall/docker_build.py -a abcde12345 + sudo python3 /media/sf_MiCall/micall/utils/docker_build.py -a abcde12345 [bsvm]: https://developer.basespace.illumina.com/docs/content/documentation/native-apps/setup-dev-environment [vbox guest]: https://linuxize.com/post/how-to-install-virtualbox-guest-additions-in-ubuntu/ ### Test data -If you want to run `micall_watcher.py`, you have to set up data folders for raw +If you want to run `micall/monitor/micall_watcher.py`, you have to set up data folders for raw data and for the working folders. You'll also need to set up the QAI project and the MiseqQCReport so you can download QC data and upload results. @@ -96,7 +96,7 @@ and the MiseqQCReport so you can download QC data and upload results. 10. Run the Ruby console for QAI and `LabMiseqRun.import('01-Jan-2000')` for the date of your sample run. 11. Upload the projects to a micall pipelines in QAI, use `micall.utils.projects_upload` to create a new pipeline in QAI -11. Run micall_watcher.py, it does need arguments. Look up the container app ids from Kive, check the Kive server URL and ports as well as QAI server and port +11. Run `micall/monitor/micall_watcher.py`, it does need arguments. Look up the container app ids from Kive, check the Kive server URL and ports as well as QAI server and port [cifs]: https://wiki.ubuntu.com/MountWindowsSharesPermanently @@ -105,7 +105,7 @@ When you don't understand the pipeline's output, it can be helpful to look at the raw reads in a sequence viewer like [Tablet][tablet]. Run the `micall_docker` script on a run folder or a single sample, like this: - python micall_docker.py folder --debug_remap --all_projects --keep_scratch /path/to/run + python micall/utils/micall_docker.py folder --debug_remap --all_projects --keep_scratch /path/to/run The options tell it to write the debug files, use all projects, and save the scratch folder that holds all the debug files. 
Look through the scratch folders @@ -167,13 +167,13 @@ similar steps to setting up a development workstation. Follow these steps: 1. Check that all the issues in the current milestone are closed, and make sure the code works in your development environment. Run all the unit tests as described above, process the microtest data set with - `release_test_microtest.py`. + `micall/utils/release_test_microtest.py`. 2. Check if the kiveapi package needs a new release by looking for new commits. Make sure you tested with the latest version. 3. Determine what version number should be used next. -4. Use the `projects_dump.py` script for the previous version and compare +4. Use the `micall/utils/projects_dump.py` script for the previous version and compare `projects.json` to check that the projects match, or that the differences - were intended. Test the `projects_upload.py` script with your updated project + were intended. Test the `micall/utils/projects_upload.py` script with your updated project files in your local test QAI. 5. Check the history of the HIV and HCV rules files in the `micall/resistance` folder. If they have changed, create a new display file in the `docs` folder @@ -192,13 +192,13 @@ similar steps to setting up a development workstation. Follow these steps: ids of the new apps. 8. Process all the samples from test_samples.csv on the Kive test server, and run the micall_watcher service on a VirtualBox. Use the - `release_test_*.py` scripts to compare the results of the new release with + `micall/utils/release_test_*.py` scripts to compare the results of the new release with the previous version. Also run the internal scripts `miseq_gen_results.rb` and `miseq_compare_results.rb` to look for differences. Get the comparison signed off to begin the release process. 8. Upload the Singularity image to the main Kive server, and record the id of the new apps. -8. Upload the pipeline definitions to QAI, using the `projects_upload.py` +8. 
Upload the pipeline definitions to QAI, using the `micall/utils/projects_upload.py` script. There is no need to create the new pipeline version in QAI beforehand, the script will do this for you - just remember to update the `Order by` field afterwards. @@ -216,7 +216,7 @@ similar steps to setting up a development workstation. Follow these steps: git fetch git checkout tags/vX.Y -10. Look for changes in [`micall_watcher.py`'s `parse_args()` function][parse_args]. +10. Look for changes in [`micall/monitor/micall_watcher.py`'s `parse_args()` function][parse_args]. Either look at the blame annotations at the link above, or review the changes in the new release. If there are new or changed settings, adjust the configuration in `/etc/systemd/system/micall_watcher.service` or @@ -247,7 +247,7 @@ similar steps to setting up a development workstation. Follow these steps: a new Docker image from GitHub. Tag it with the release number. cd /media/sf_micall - sudo python3 docker_build.py -t vX.Y --nopush + sudo python3 micall/utils/docker_build.py -t vX.Y --nopush The script is able to push the docker image to the illumina repo and launch spacedock as well, but that is currently broken because of the old docker version @@ -300,7 +300,7 @@ similar steps to setting up a development workstation. Follow these steps: Zenodo version. [release]: https://help.github.com/categories/85/articles -[parse_args]: https://github.com/cfe-lab/MiCall/blame/master/micall_watcher.py +[parse_args]: https://github.com/cfe-lab/MiCall/blame/master/micall/monitor/micall_watcher.py [Zenodo]: https://doi.org/10.5281/zenodo.2644171 [can't log in]: https://www.docker.com/blog/registry-v1-api-deprecation/ [docker hub]: https://hub.docker.com/orgs/cfelab/members diff --git a/docs/admin.md b/docs/admin.md index 2c5bf6a19..f2bf8fb54 100644 --- a/docs/admin.md +++ b/docs/admin.md @@ -76,7 +76,7 @@ Create a Python 3.6 virtual environment to run MiCall. 
Copy the logging configuration if you want to change any of the settings. - $ cp micall_logging_config.py micall_logging_override.py + $ cp micall/utils/micall_logging_config.py micall/utils/micall_logging_override.py Read the instructions in the file, and edit the override copy. If the default settings are fine, you don't need the override file. @@ -97,9 +97,9 @@ log folder and grant access to the micall user. Test that everything is installed with the right permissions: - $ sudo su -c "/usr/local/share/venv-micall/bin/python micall_watcher.py --help" micall + $ sudo su -c "/usr/local/share/venv-micall/bin/python micall/monitor/micall_watcher.py --help" micall -Look at the options you can give to the `micall_watcher.py` script when you +Look at the options you can give to the `micall/monitor/micall_watcher.py` script when you configure the service file in the next step. Now configure the service using a systemd [service unit] configuration. @@ -110,7 +110,7 @@ Here's an example configuration, in `/etc/systemd/system/micall_watcher.service` [Service] ExecStart=/usr/local/share/venv-micall/bin/python3 \ - /usr/local/share/MiCall/micall_watcher.py + /usr/local/share/MiCall/micall/monitor/micall_watcher.py EnvironmentFile=/etc/micall/micall.conf User=micall @@ -134,7 +134,7 @@ Environment variables go in the configuration file listed in the Make sure you reduce the read permissions on the `.conf` file so other users can't read it. The environment variable names are the same as the command options, but they add a `MICALL_` prefix, if it's not already there. -To list all the available options, run `python3 micall_watcher.py --help`. +To list all the available options, run `python3 micall/monitor/micall_watcher.py --help`. 
Below is the example config: # This is an example of /etc/micall/micall.conf @@ -175,7 +175,7 @@ If you installed it as a service as described above, then it's easy: sudo systemctl restart micall_watcher -Don't launch the `micall_watcher.py` script on its own, or the service will run +Don't launch the `micall/monitor/micall_watcher.py` script on its own, or the service will run won't know that it's running. That can end up running two copies of the watcher process, and it gets confused. From b2628ed237afee01393f7d0351a61ba9d1635096 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 18 Oct 2024 16:34:32 -0700 Subject: [PATCH 411/485] Make blast_db installation compatible with pyproject.toml --- micall/core/denovo.py | 3 +- micall/tests/test_denovo.py | 41 +++++++++--------- micall/utils/contig_summary.py | 23 +++++----- micall/utils/fasta_to_csv.py | 77 +++++++++++++++++++++++++++------- 4 files changed, 98 insertions(+), 46 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 61dca8ec4..13dcbdcee 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -15,10 +15,9 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from micall.utils.fasta_to_csv import run, DEFAULT_DATABASE, genotype +from micall.utils.fasta_to_csv import run, genotype import micall.core.contig_stitcher as stitcher -assert DEFAULT_DATABASE assert genotype is not None IVA = "iva" diff --git a/micall/tests/test_denovo.py b/micall/tests/test_denovo.py index 06001779b..a9d9a7496 100644 --- a/micall/tests/test_denovo.py +++ b/micall/tests/test_denovo.py @@ -4,33 +4,36 @@ from Bio import SeqIO from pytest import fixture, mark -from micall.core.denovo import write_contig_refs, denovo, DEFAULT_DATABASE, genotype +from micall.core.denovo import write_contig_refs, denovo, genotype +from micall.utils.fasta_to_csv import default_database from micall.blast_db.make_blast_db import make_blast_db, DEFAULT_PROJECTS @fixture(scope='session', name='hcv_db') def 
check_hcv_db(): - db_path = Path(DEFAULT_DATABASE) - index_path = db_path.parent / "refs.fasta.nin" - build_needed = not index_path.exists() - if not build_needed: - projects_date = Path(DEFAULT_PROJECTS).stat().st_mtime - index_date = index_path.stat().st_mtime - build_needed = index_date < projects_date - if build_needed: - with open(DEFAULT_PROJECTS) as projects_json, \ - open(DEFAULT_DATABASE, 'w') as refs_fasta: - make_blast_db(projects_json, refs_fasta) - assert index_path.exists() - return db_path + with default_database() as DEFAULT_DATABASE: + db_path = Path(DEFAULT_DATABASE) + index_path = db_path.parent / "refs.fasta.nin" + build_needed = not index_path.exists() + if not build_needed: + projects_date = Path(DEFAULT_PROJECTS).stat().st_mtime + index_date = index_path.stat().st_mtime + build_needed = index_date < projects_date + if build_needed: + with open(DEFAULT_PROJECTS) as projects_json, \ + open(DEFAULT_DATABASE, 'w') as refs_fasta: + make_blast_db(projects_json, refs_fasta) + assert index_path.exists() + return db_path def test_make_blast_db_excludes_hivgha(hcv_db): - fasta_path = Path(DEFAULT_DATABASE) - with fasta_path.open() as f: - for reference in SeqIO.parse(f, 'fasta'): - # Exclude the Ghana project, because they're recombinant. - assert reference.name != 'HIV1-CRF02_AG-GH-AB286855-seed' + with default_database() as DEFAULT_DATABASE: + fasta_path = Path(DEFAULT_DATABASE) + with fasta_path.open() as f: + for reference in SeqIO.parse(f, 'fasta'): + # Exclude the Ghana project, because they're recombinant. 
+ assert reference.name != 'HIV1-CRF02_AG-GH-AB286855-seed' def test_write_contig_refs_two_sequences(tmpdir, hcv_db): diff --git a/micall/utils/contig_summary.py b/micall/utils/contig_summary.py index a798139d7..2df68bdf9 100644 --- a/micall/utils/contig_summary.py +++ b/micall/utils/contig_summary.py @@ -5,7 +5,7 @@ from Bio.Blast.Applications import NcbiblastnCommandline -from micall.core.denovo import DEFAULT_DATABASE +from micall.utils.fasta_to_csv import default_database import matplotlib matplotlib.use('Agg') @@ -61,16 +61,17 @@ def main(): print(sample_dir, contigs_fasta_paths) continue contigs_fasta_path, = contigs_fasta_paths - cline = NcbiblastnCommandline(query=str(contigs_fasta_path), - db=DEFAULT_DATABASE, - outfmt=blast_format, - evalue=0.0001, - gapopen=5, - gapextend=2, - penalty=-3, - reward=1, - max_target_seqs=5000) - stdout, _ = cline(stderr=False) + with default_database() as DEFAULT_DATABASE: + cline = NcbiblastnCommandline(query=str(contigs_fasta_path), + db=DEFAULT_DATABASE, + outfmt=blast_format, + evalue=0.0001, + gapopen=5, + gapextend=2, + penalty=-3, + reward=1, + max_target_seqs=5000) + stdout, _ = cline(stderr=False) plot_contigs(sample_dir, stdout) plot_path = contig_plots_path / (sample_dir.name + '.png') plt.savefig(str(plot_path)) diff --git a/micall/utils/fasta_to_csv.py b/micall/utils/fasta_to_csv.py index d03cc19aa..9357aa750 100644 --- a/micall/utils/fasta_to_csv.py +++ b/micall/utils/fasta_to_csv.py @@ -2,14 +2,16 @@ import logging import os import typing -from typing import Optional, TextIO, Iterable, Dict, cast, Sequence +from typing import Optional, TextIO, Iterable, Dict, cast, Sequence, Iterator from collections import Counter from csv import DictWriter, DictReader from itertools import groupby from operator import itemgetter from pathlib import Path +import contextlib from io import StringIO +import importlib.resources as resources from Bio import SeqIO from Bio.Blast.Applications import NcbiblastnCommandline @@ -18,7 
+20,44 @@ from micall.utils.contig_stitcher_contigs import GenotypedContig -DEFAULT_DATABASE = os.path.normpath((Path(__file__).parent.parent / 'blast_db' / 'refs.fasta').resolve()) +@contextlib.contextmanager +def reference_dir() -> Iterator[Path]: + """ + A context manager handling reference sequences paths packaged with MiCall. + + The complexity of the function arises from the need to maintain compatibility with + multiple python versions due to changes in APIs of the `importlib.resources` package. + + It first tries to fetch the resource using `resources.files` function introduced in + Python 3.9. If it fails, it falls back on `resources.path`. + It further ensures that the obtained resource is returned + as a Path instance regardless of it being a string, Path, or contextlib context-manager instance. + + Note: `resources.path` is set to be deprecated in future Python versions, hence the + intended primary method is using `resources.files`. + + Yields: + Path: A path-like object pointing to the reference directory within 'micall'. + """ + + try: + ret = resources.as_file(resources.files('micall').joinpath('blast_db')) # type: ignore + except AttributeError: + ret = resources.path('micall', 'blast_db') # type: ignore + + if isinstance(ret, str): + yield Path(ret) + elif isinstance(ret, Path): + yield ret + else: + with ret as path: + yield path + + +@contextlib.contextmanager +def default_database() -> Iterator[str]: + with reference_dir() as blast_db: + yield str(blast_db / "refs.fasta") def read_assembled_contigs(group_refs: Dict[str, str], @@ -95,14 +134,14 @@ def write_unstitched_contigs(writer: DictWriter, contig=contig.seq)) -def genotype(fasta: str, db: str = DEFAULT_DATABASE, +def genotype(fasta: str, db: Optional[str] = None, blast_csv: Optional[TextIO] = None, group_refs: Optional[Dict[str, str]] = None) -> Dict[str, typing.Tuple[str, float]]: """Use Blastn to search for the genotype of a set of reference sequences. 
Args: fasta (str): File path of the FASTA file containing the query sequences. - db (str): File path of the database to search for matches. + db (Optional[str]): File path of the database to search for matches. blast_csv (Optional[TextIO]): Open file to write the blast matches to, or None. group_refs (Optional[Dict[str, str]]): Dictionary to fill with the mapping from each contig's reference name to the best matched reference for the whole seed group. @@ -126,16 +165,26 @@ def genotype(fasta: str, db: str = DEFAULT_DATABASE, 'qend', 'sstart', 'send'] - cline = NcbiblastnCommandline(query=fasta, - db=db, - outfmt=f'"10 {" ".join(blast_columns)}"', - evalue=0.0001, - gapopen=5, - gapextend=2, - penalty=-3, - reward=1, - max_target_seqs=5000) - stdout, _ = cline() + + def invoke_blast(db: str) -> str: + cline = NcbiblastnCommandline(query=fasta, + db=db, + outfmt=f'"10 {" ".join(blast_columns)}"', + evalue=0.0001, + gapopen=5, + gapextend=2, + penalty=-3, + reward=1, + max_target_seqs=5000) + stdout, _ = cline() + return stdout + + if db is None: + with default_database() as db: + stdout = invoke_blast(db) + else: + stdout = invoke_blast(db) + samples = {} # {query_name: (subject_name, matched_fraction)} matches = sorted(DictReader(StringIO(stdout), blast_columns), key=lambda row: (row['qaccver'], float(row['score']))) From 98e6651ddc2317dc320c3dfdd7c4c2f6010b668c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 18 Oct 2024 16:35:14 -0700 Subject: [PATCH 412/485] Improve Singularity definition Simplify and reduce file size. 
--- Singularity | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/Singularity b/Singularity index bc53151c0..fa8517951 100644 --- a/Singularity +++ b/Singularity @@ -28,31 +28,10 @@ From: python:3.8 # Unneeded once Singularity creates parent dirs: # https://github.com/singularityware/singularity/issues/1549 mkdir ${SINGULARITY_ROOTFS}/opt/micall - mkdir ${SINGULARITY_ROOTFS}/opt/micall/micall %files - ## Git files - .git /opt/micall/.git - - ## MiCall - micall/__init__.py /opt/micall/micall/ - micall/project* /opt/micall/micall/ - - micall/core /opt/micall/micall/core - micall/data /opt/micall/micall/data - micall/drivers /opt/micall/micall/drivers - micall/g2p /opt/micall/micall/g2p - micall/resistance /opt/micall/micall/resistance - micall/monitor /opt/micall/micall/monitor - micall/utils /opt/micall/micall/utils - - micall/main.py /opt/micall/micall/ - README.md /opt/micall/ - LICENSE.txt /opt/micall/ - pyproject.toml /opt/micall/ - - ## HCV genotyping database - micall/blast_db /opt/micall/micall/blast_db + ## These files will be deleted after the install. + . /opt/micall/ %post echo ===== Installing Prerequisites ===== >/dev/null @@ -65,7 +44,6 @@ From: python:3.8 git -C /opt/micall/ rev-parse HEAD > /etc/micall/git-version git -C /opt/micall/ -c 'core.fileMode=false' describe --tags --dirty 1>&2 > /etc/micall/git-describe || true git -C /opt/micall/ log -n 10 > /etc/micall/git-log - rm -rf /opt/micall/.git echo ===== Installing blast ===== >/dev/null apt-get install -q -y ncbi-blast+ @@ -118,8 +96,8 @@ From: python:3.8 apt-get install -q -y libcairo2-dev # Install micall main executable. pip install --upgrade pip - python /opt/micall/micall/main.py make_blast_db pip install /opt/micall + micall make_blast_db # Also trigger matplotlib to build its font cache. 
python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' From ca9e9d88b59b1ebc9fe2bb5692059bbc0d5434bc Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 10:24:21 -0700 Subject: [PATCH 413/485] Delete dev.dockerfile There is already a .devcontainer definition that makes this file much less useful than it was. --- dev.dockerfile | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 dev.dockerfile diff --git a/dev.dockerfile b/dev.dockerfile deleted file mode 100644 index 31b4b2906..000000000 --- a/dev.dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -# Dockerized version of MiCall development tools. -# To build this image, first build the production image in Dockerfile, probably -# using the docker_build.py script. Use the docker images command to see that -# docker.illumina.com/cfe_lab/micall:latest is the image you want to use, then -# build this image, using a command like this: -# -# docker build -t micall:dev --file dev.dockerfile . -# -# To test out the image, run the test suite, with a command like this: -# -# docker run --rm -it --entrypoint pytest -w /opt/micall \ -# --volume ~/git/micall:/opt/micall micall:dev -# -# That lets you edit the source code on your host system, but run it under -# docker with all the tools installed for you. - -FROM docker.illumina.com/cfe_lab/micall:latest - -## Add the dev packages. 
-COPY pyproject.toml /opt/micall/ -RUN pip install -r .[dev] From a41e952464d69d600ffaa064ed6215b452343595 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 10:25:29 -0700 Subject: [PATCH 414/485] Delete .devcontainer/Dockerfile --- .devcontainer/Dockerfile | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 .devcontainer/Dockerfile diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile deleted file mode 100644 index 8eb70eabf..000000000 --- a/.devcontainer/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM cfelab/micall:v7.15.13 - -# Uninstall some Python packages that don't upgrade correctly. -RUN python -m pip install --upgrade pip && \ - python -m pip uninstall -y python-Levenshtein Levenshtein genetracks pyvdrm - -ENTRYPOINT bash \ No newline at end of file From 768fe6887f0c821a5e02718df5829ad393caa479 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 10:25:59 -0700 Subject: [PATCH 415/485] Use top-level Dockerfile in .devcontainer --- .devcontainer/devcontainer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 561ade051..6ee8240e3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -5,7 +5,7 @@ // More info: https://containers.dev/guide/dockerfile "build": { - "dockerfile": "Dockerfile" + "dockerfile": "../Dockerfile" }, // Features to add to the dev container. More info: https://containers.dev/features. 
From 2df55e7a7238e36ae21f8ca84b390c7a2a4db347 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 10:27:32 -0700 Subject: [PATCH 416/485] Install test dependencies in dockerfile too --- .devcontainer/devcontainer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 6ee8240e3..2f887ae68 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -15,7 +15,7 @@ // "forwardPorts": [], // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "python -m pip install .[dev]", + "postCreateCommand": "python -m pip install .[dev,test]", // Configure tool-specific properties. "customizations": { From 7d7be555f1537491051d66fb326a123f39a2a0a6 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 10:29:22 -0700 Subject: [PATCH 417/485] Improve initialization of devcontainer --- .devcontainer/devcontainer.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 2f887ae68..fbea85a74 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -14,8 +14,8 @@ // Use 'forwardPorts' to make a list of ports inside the container available locally. // "forwardPorts": [], - // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "python -m pip install .[dev,test]", + // Use 'onCreateCommand' to run commands after the container is created, but before anything else starts. + "onCreateCommand": "python -m pip install .[dev,test]", // Configure tool-specific properties. 
"customizations": { From 13cce78c12cc7527f24ba5b05d278382cb0716a1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 11:06:20 -0700 Subject: [PATCH 418/485] Fix dockerfile definition --- Dockerfile | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/Dockerfile b/Dockerfile index dc71d699e..93924b1dd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -86,28 +86,15 @@ RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \ ## Install dependencies for genetracks/drawsvg RUN apt-get install -q -y libcairo2-dev -COPY pyproject.toml /opt/micall/ +COPY . /opt/micall/ -## Python packages, plus trigger matplotlib to build its font cache -WORKDIR /opt -RUN pip install --upgrade pip && \ - pip install .[basespace] && \ - python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' +RUN pip install --upgrade pip +RUN pip install /opt/micall +RUN micall make_blast_db -## MiCall -COPY micall_docker.py micall_kive.py micall_kive_resistance.py version.tx[t] /opt/micall/ -COPY micall/__init__.py micall/project* /opt/micall/micall/ - -COPY micall/blast_db/make_blast_db.py /opt/micall/micall/blast_db/make_blast_db.py -COPY micall/core /opt/micall/micall/core/ -COPY micall/data /opt/micall/micall/data/ -COPY micall/drivers /opt/micall/micall/drivers/ -COPY micall/g2p /opt/micall/micall/g2p/ -COPY micall/resistance /opt/micall/micall/resistance/ -COPY micall/monitor /opt/micall/micall/monitor/ -COPY micall/utils /opt/micall/micall/utils/ - -RUN python /opt/micall/micall/blast_db/make_blast_db.py +## Trigger matplotlib to build its font cache +RUN pip install .[basespace] +RUN python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' WORKDIR /data -ENTRYPOINT ["python", "/opt/micall/micall_docker.py"] +ENTRYPOINT ["micall", "micall_docker"] From 792283c094353262aed61b6a4cb1ca051c74310b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 11:13:38 -0700 Subject: 
[PATCH 419/485] Improve .dockerignore by ignoring cache files --- .dockerignore | 5 ++++- Dockerfile | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.dockerignore b/.dockerignore index cf4867d76..172286671 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,6 +2,9 @@ .idea bin docs +*cache* +.github +Dockerfile micall/tests/working micall/tests/microtest/scratch micall/tests/microtest/micall-results @@ -12,4 +15,4 @@ venv_micall */*.pyc */*/*.pyc */*/*/*.pyc -simgs \ No newline at end of file +simgs diff --git a/Dockerfile b/Dockerfile index 93924b1dd..3644378b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -93,7 +93,7 @@ RUN pip install /opt/micall RUN micall make_blast_db ## Trigger matplotlib to build its font cache -RUN pip install .[basespace] +RUN pip install /opt/micall[basespace] RUN python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' WORKDIR /data From 5c9d3020266a96149fef990cf5f205b6dbae6288 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 11:14:52 -0700 Subject: [PATCH 420/485] Add CI job that builds MiCall docker image --- .github/workflows/build-and-test.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 35f4333bd..39e5fb16c 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -76,6 +76,17 @@ jobs: python -m pip install codecov codecov + docker-test: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: Build docker image + run: docker build --tag micall --file Dockerfile . 
+ - name: Run docker image entrypoint + run: docker run micall --help + - name: Check docker image entrypoint + run: docker run micall --help | grep -i -e 'docker' + singularity-test: runs-on: ubuntu-20.04 steps: From cb9609093dce7aae08eaec21da3db5705c95d472 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 11:16:18 -0700 Subject: [PATCH 421/485] Avoid double micall installation in Dockerfile --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3644378b8..af327bbd0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -89,11 +89,10 @@ RUN apt-get install -q -y libcairo2-dev COPY . /opt/micall/ RUN pip install --upgrade pip -RUN pip install /opt/micall +RUN pip install /opt/micall[basespace] RUN micall make_blast_db ## Trigger matplotlib to build its font cache -RUN pip install /opt/micall[basespace] RUN python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' WORKDIR /data From 208e45f9709cdc1d807c45605f9edf89331a9090 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 11:46:55 -0700 Subject: [PATCH 422/485] Move ruff config into pyproject.toml --- .ruff.toml | 62 --------------------- pyproject.toml | 145 ++++++++++++++++++++----------------------------- 2 files changed, 59 insertions(+), 148 deletions(-) delete mode 100644 .ruff.toml diff --git a/.ruff.toml b/.ruff.toml deleted file mode 100644 index cd7e4b28e..000000000 --- a/.ruff.toml +++ /dev/null @@ -1,62 +0,0 @@ - -[lint.per-file-ignores] - -# These are pre-file ignores that were added for compatibility purposes when ruff was introduced into MiCall. -# TODO: resolve these warnings insteads of ignoring them. 
- -# Syntax error: ignore unexpected token in this file -"micall/utils/make_hcv_genotype_refs.py" = ["E999"] - -# File with unused imports and local variable assignments -"micall/core/plot_simple.py" = ["F401", "F841"] - -# Ignore bare except usage in project configuration -"micall/core/project_config.py" = ["E722"] - -# Unused import in remap module -"micall/core/remap.py" = ["F401"] - -# Do not compare types and bare excepts in pssm_lib module -"micall/g2p/pssm_lib.py" = ["E722", "E721"] - -# F-string errors without placeholders in tests -"micall/tests/svg_differ.py" = ["F541"] - -# Unused import in test_aln2counts_report module -"micall/tests/test_aln2counts_report.py" = ["F401"] - -# Multiple issues in test_consensus_aligner: unused imports, multi-statement line, and unused local variable -"micall/tests/test_consensus_aligner.py" = ["F401", "E701", "F841"] - -# Unused import in test_primer_tracker module -"micall/tests/test_primer_tracker.py" = ["F401"] - -# F-string errors without placeholders in test_remap module -"micall/tests/test_remap.py" = ["F541"] - -# Unused import in test_sample module -"micall/tests/test_sample.py" = ["F401"] - -# Unused imports, undefined name, and type comparison in aln2counts_simplify module -"micall/utils/aln2counts_simplify.py" = ["F401", "F821", "E721"] - -# Unused local variable in compare_mapping module -"micall/utils/compare_mapping.py" = ["F841"] - -# Multiple issues in conseq_compare: unused sys, redefinitions, and unused local variables -"micall/utils/conseq_compare.py" = ["F811", "F401", "F841"] - -# Comparisons to None should use `is` or `is not` in dd module -"micall/utils/dd.py" = ["E711"] - -# Unused imports and local variable in find_reads_in_sam module -"micall/utils/find_reads_in_sam.py" = ["F401", "F841"] - -# F-string errors without placeholders in primer_tracker module -"micall/utils/primer_tracker.py" = ["F541"] - -# F-string errors without placeholders in ref_aligner module -"micall/utils/ref_aligner.py" = 
["F541"] - -# Unused local variable in release_test_compare script -"release_test_compare.py" = ["F841"] diff --git a/pyproject.toml b/pyproject.toml index f8506b10b..ea891b85f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,92 +105,65 @@ markers = [ "iva: test requires IVA (deselect with '-m \"not iva\"')" ] -[tool.flake8] -ignore = ['E226', 'E302', 'E41'] -max-line-length = 120 -per-file-ignores = [ - 'micall/core/aln2counts.py: W504,E225,E126,W293', - 'micall/core/amplicon_finder.py: E126,W504', - 'micall/core/cascade_report.py: W293', - 'micall/core/filter_quality.py: W504', - 'micall/core/plot_contigs.py: W504,E501,F841,E231,E261,E262,E265,E302,E303,E305,E999,W291,W292,F401', - 'micall/core/plot_simple.py: F401,F841,W292', - 'micall/core/project_config.py: E722,E261', - 'micall/core/remap.py: F401', - 'micall/core/sam2aln.py: E128', - 'micall/core/trim_fastqs.py: W504', - 'micall/drivers/run_info.py: E225', - 'micall/g2p/pssm_lib.py: E722', - 'micall/monitor/find_groups.py: W504', - 'micall/monitor/kive_watcher.py: W504,E126,E501,W503', - 'micall/monitor/sample_watcher.py: W503,W504', - 'micall/monitor/update_qai.py: W503,W504', - 'micall/resistance/asi_algorithm.py: W504', - 'micall/resistance/pdfreport.py: E241,W504', - 'micall/resistance/resistance.py: W504', - 'micall/tcr/igblast.py: E265,E231,E502,E128,W291,E305', - 'micall/tests/microtest/make_sample.py: W504', - 'micall/tests/microtest/reverse_and_complement.py: E126', - 'micall/tests/microtest/translate_nuc.py: E126', - 'micall/tests/svg_differ.py: E126,F541', - 'micall/tests/test_align_it_gotoh.py: E261,E262,E225,E303,E501,W292', - 'micall/tests/test_aln2counts_report.py: W504,F401', - 'micall/tests/test_aln2counts_seed_amino.py: W293', - 'micall/tests/test_aln2counts_seed_nucleotide.py: W293', - 'micall/tests/test_cigar_tools.py: E704,W503', - 'micall/tests/test_consensus_aligner.py: F841,E126,W504,E701,F401', - 'micall/tests/test_contig_stitcher.py: W503', - 
'micall/tests/test_error_metrics_parser.py: E231', - 'micall/tests/test_fastq_g2p.py: W293', - 'micall/tests/test_hcv_reference_tree.py: W293', - 'micall/tests/test_hcv_rules_import.py: E241', - 'micall/tests/test_kive_watcher.py: W504,E126,E127,F841', - 'micall/tests/test_plot_contigs.py: E303', - 'micall/tests/test_primer_tracker.py: F401', - 'micall/tests/test_projects_dump.py: W504', - 'micall/tests/test_quality_metrics_parser.py: E231', - 'micall/tests/test_remap.py: F541', - 'micall/tests/test_resistance.py: W504', - 'micall/tests/test_report_amino.py: W292', - 'micall/tests/test_sam2aln.py: E121', - 'micall/tests/test_sample.py: F401', - 'micall/tests/test_sample_sheet_parser.py: E123', - 'micall/tests/test_tile_metrics_parser.py: E231', - 'micall/tests/test_trim_fastqs.py: E121,E123', - 'micall/tests/test_update_qai.py: E127', - 'micall/utils/aln2counts_simplify.py: F401,E261,E126,F821', - 'micall/utils/big_counter.py: W293', - 'micall/utils/cigar_tools.py: E704', - 'micall/utils/compare_454_samples.py: W504,E305', - 'micall/utils/compare_mapping.py: W504,F841', - 'micall/utils/concordance_evaluation.py: E303', - 'micall/utils/consensus_aligner.py: W504,W293,E126,E303,F401,E261,E252', - 'micall/utils/conseq_compare.py: F811,E305,W503,F841,F401', - 'micall/utils/contig_summary.py: W504', - 'micall/utils/coverage_data.py: W504', - 'micall/utils/dd.py: E711,W504,E211,E303', - 'micall/utils/fetch_sequences.py: W504,E126,E303,E265', - 'micall/utils/find_chimera.py: W504', - 'micall/utils/find_reads_in_sam.py: F401,W293,F841', - 'micall/utils/hcv_reference_tree.py: W504', - 'micall/utils/hcv_rules_import.py: W503,W504,E126,E241', - 'micall/utils/make_fda_refs.py: E211,E261,W293,W292', - 'micall/utils/make_hcv_genotype_refs.py: E211,E261,E231,E722,E305,E127,E251,E127', - 'micall/utils/primer_checks.py: E303,F541', - 'micall/utils/primer_tracker.py: F541', - 'micall/utils/projects_dump.py: W504', - 'micall/utils/ref_aligner.py: F541', - 
'micall/utils/reference_distances.py: E305', - 'micall/utils/report_amino.py: W504', - 'micall/utils/sam_g2p_simplify.py: W293,E265,E303,E305', - 'micall/utils/sample_fastq.py: E305', - 'micall/utils/sample_sheet_parser.py: E275,W504', - 'micall/utils/scan_run_folders.py: W504', - 'micall/utils/v3loop_alignment_scores/plot.py: W293', - 'micall/utils/docker_build.py: W504', - 'micall/utils/micall_docker.py: W291,W293,W503,W504', - 'micall/utils/release_test_compare.py: W293,W503,W504,F841', -] +[tool.ruff.lint.per-file-ignores] +# TODO: resolve these warnings insteads of ignoring them. + +# Syntax error: ignore unexpected token in this file +"micall/utils/make_hcv_genotype_refs.py" = ["E999"] + +# File with unused imports and local variable assignments +"micall/core/plot_simple.py" = ["F401", "F841"] + +# Ignore bare except usage in project configuration +"micall/core/project_config.py" = ["E722"] + +# Unused import in remap module +"micall/core/remap.py" = ["F401"] + +# Do not compare types and bare excepts in pssm_lib module +"micall/g2p/pssm_lib.py" = ["E722", "E721"] + +# F-string errors without placeholders in tests +"micall/tests/svg_differ.py" = ["F541"] + +# Unused import in test_aln2counts_report module +"micall/tests/test_aln2counts_report.py" = ["F401"] + +# Multiple issues in test_consensus_aligner: unused imports, multi-statement line, and unused local variable +"micall/tests/test_consensus_aligner.py" = ["F401", "E701", "F841"] + +# Unused import in test_primer_tracker module +"micall/tests/test_primer_tracker.py" = ["F401"] + +# F-string errors without placeholders in test_remap module +"micall/tests/test_remap.py" = ["F541"] + +# Unused import in test_sample module +"micall/tests/test_sample.py" = ["F401"] + +# Unused imports, undefined name, and type comparison in aln2counts_simplify module +"micall/utils/aln2counts_simplify.py" = ["F401", "F821", "E721"] + +# Unused local variable in compare_mapping module +"micall/utils/compare_mapping.py" = 
["F841"] + +# Multiple issues in conseq_compare: unused sys, redefinitions, and unused local variables +"micall/utils/conseq_compare.py" = ["F811", "F401", "F841"] + +# Comparisons to None should use `is` or `is not` in dd module +"micall/utils/dd.py" = ["E711"] + +# Unused imports and local variable in find_reads_in_sam module +"micall/utils/find_reads_in_sam.py" = ["F401", "F841"] + +# F-string errors without placeholders in primer_tracker module +"micall/utils/primer_tracker.py" = ["F541"] + +# F-string errors without placeholders in ref_aligner module +"micall/utils/ref_aligner.py" = ["F541"] + +# Unused local variable in release_test_compare script +"release_test_compare.py" = ["F841"] [tool.mypy] files = ["micall/"] From f491b2f11e3e2edd681ffd153202b11e4f704e81 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 21 Oct 2024 11:49:08 -0700 Subject: [PATCH 423/485] Remove flake8 dependency --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ea891b85f..6e3db0585 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,8 +60,6 @@ test = [ ] dev = [ # Dependencies required for development (linting, type checking, etc.) - "flake8==7.0.0", - "Flake8-pyproject==1.2.3", "ruff==0.7.0", "mypy==1.10.0", "mypy-extensions==1.0.0", From 2539c8766ddf0dbfb9777a6afe6f505faaeb82c1 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 14:07:10 -0700 Subject: [PATCH 424/485] Completely isolatate mappy.Alignment in consensus_aligner.py Also improve the type of the cigar array from List[List[int]] to List[Tuple[int, CigarActions]]. 
--- micall/tests/test_consensus_aligner.py | 76 ++++++------- micall/utils/consensus_aligner.py | 152 +++++++++++++------------ 2 files changed, 119 insertions(+), 109 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 301b25390..4139f0039 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -106,9 +106,9 @@ def test_alignment_repr(): def test_wrap_overrides(): - alignment1 = AlignmentWrapper(r_st=100, r_en=200) + alignment1 = AlignmentWrapper(r_st=100, r_en=200, cigar_str='') alignment2 = AlignmentWrapper.wrap(alignment1, r_en=300, blen=200, cigar=[]) - expected_alignment = AlignmentWrapper(r_st=100, r_en=300, cigar=[]) + expected_alignment = AlignmentWrapper(r_st=100, r_en=300, blen=200, cigar=[], cigar_str='') assert alignment2 == expected_alignment @@ -412,9 +412,9 @@ def test_start_contig_deletion_minimap2(projects): q_st=0, q_en=59, mapq=9, - cigar=[[30, CigarActions.MATCH], - [1, CigarActions.DELETE], - [29, CigarActions.MATCH]], + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.DELETE), + (29, CigarActions.MATCH)], NM=1) aligner = ConsensusAligner(projects) @@ -438,7 +438,7 @@ def test_start_contig_big_deletion_minimap2(projects): q_st=0, q_en=693, mapq=60, - cigar=[[693, CigarActions.MATCH]], + cigar=[(693, CigarActions.MATCH)], NM=25), AlignmentWrapper(ctg='N/A', ctg_len=len(seed_seq), @@ -448,7 +448,7 @@ def test_start_contig_big_deletion_minimap2(projects): q_st=693, q_en=6962, mapq=60, - cigar=[[6269, CigarActions.MATCH]], + cigar=[(6269, CigarActions.MATCH)], NM=256)] aligner = ConsensusAligner(projects) @@ -470,9 +470,9 @@ def test_start_contig_deletion_gotoh(projects): q_st=0, q_en=49, mapq=0, - cigar=[[30, CigarActions.MATCH], - [1, CigarActions.DELETE], - [19, CigarActions.MATCH]], + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.DELETE), + (19, CigarActions.MATCH)], NM=0) aligner = ConsensusAligner(projects) @@ -494,7 +494,7 @@ def 
test_start_contig_matched_deletion_gotoh(projects): q_st=0, q_en=50, mapq=0, - cigar=[[50, CigarActions.MATCH]], + cigar=[(50, CigarActions.MATCH)], NM=0) aligner = ConsensusAligner(projects) @@ -516,9 +516,9 @@ def test_start_contig_insertion_minimap2(projects): q_st=0, q_en=63, mapq=9, - cigar=[[30, CigarActions.MATCH], - [3, CigarActions.INSERT], - [30, CigarActions.MATCH]], + cigar=[(30, CigarActions.MATCH), + (3, CigarActions.INSERT), + (30, CigarActions.MATCH)], NM=3) aligner = ConsensusAligner(projects) @@ -539,9 +539,9 @@ def test_start_contig_insertion_gotoh(projects): q_st=0, q_en=51, mapq=0, - cigar=[[30, CigarActions.MATCH], - [1, CigarActions.INSERT], - [20, CigarActions.MATCH]], + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.INSERT), + (20, CigarActions.MATCH)], NM=0) aligner = ConsensusAligner(projects) @@ -865,7 +865,7 @@ def test_count_coord_concordance(): aligner = ConsensusAligner(projects) aligner.consensus = "AGATTTCGATGATTCAGAAGATAAGCA" aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[[27, CigarActions.MATCH]])] + aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] expected_concordance_list = [1.0]*len(aligner.consensus) @@ -885,7 +885,7 @@ def test_count_coord_concordance_mismatch(): aligner.consensus = "AGATTTCGATGATTCAGAAGATTTGCA" # changed nucs: ^^ aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[[27, CigarActions.MATCH]])] + aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] # At the end of the consensus, the size of the averaging window for the concordance decreases from 20 to 11. 
# The concordance therefore decreases from 18/20 to 9/11 @@ -908,7 +908,7 @@ def test_count_coord_concordance_short_match(): aligner.consensus = "AGATTTCGATGATTCTCTTCTAAACGT" # last match position: ^ aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[[15, CigarActions.MATCH]])] + aligner.alignments = [AlignmentWrapper(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] # We start out with 100% match for the first 6 positions expected_concordance_list = [1.0] * 6 # After that, the averaging window (whose size is still increasing) starts to slide past the match: @@ -936,8 +936,8 @@ def test_count_coord_concordance_two_matches(): aligner = ConsensusAligner(projects) aligner.consensus = "AGATTTCGATGATTCAGAAGATTTGCATTT" aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=12, q_st=0, q_en=12, cigar=[[12, CigarActions.MATCH]]), - AlignmentWrapper(r_st=15, r_en=30, q_st=15, q_en=30, cigar=[[15, CigarActions.MATCH]])] + aligner.alignments = [AlignmentWrapper(r_st=0, r_en=12, q_st=0, q_en=12, cigar=[(12, CigarActions.MATCH)]), + AlignmentWrapper(r_st=15, r_en=30, q_st=15, q_en=30, cigar=[(15, CigarActions.MATCH)])] expected_concordance_list = [1.0] * 3 + [12/13, 12/14, 12/15, 13/16, 14/17, 15/18, 16/19] + [17/20]*11 + \ [16/19, 15/18, 15/17, 15/16] + [1.0]*5 @@ -958,9 +958,9 @@ def test_count_coord_concordance_with_insertion(): aligner.consensus = "AGATTTCGACCCTGATTCAGAAGATTTGCA" # insertion: ^^^ aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[[9, CigarActions.MATCH], - [3, CigarActions.INSERT], - [18, CigarActions.MATCH]])] + aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.INSERT), + (18, CigarActions.MATCH)])] # the window size increases from 10 to 20, while the averaging window slides over the 
insertion expected_concordance_list = [9/10, 9/11, 9/12, 10/13, 11/14, 12/15, 13/16, 14/17, 15/18, 16/19] # for 10 positions in the middle, the insertion is included in the full window size fo 20 @@ -984,9 +984,9 @@ def test_count_coord_concordance_with_deletion(): aligner.consensus = "AGATTTCGATTCAGAAGATTTGCA" # deletion behind this pos: ^ aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[[9, CigarActions.MATCH], - [3, CigarActions.DELETE], - [15, CigarActions.MATCH]])] + aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.DELETE), + (15, CigarActions.MATCH)])] # the deletion does not decrease the concordance expected_concordance_list = [1.0]*len(aligner.consensus) @@ -1005,7 +1005,7 @@ def test_count_seed_region_concordance(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[[27, CigarActions.MATCH]])] + seed_alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1028,7 +1028,7 @@ def test_count_seed_region_concordance_mismatch(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[[30, CigarActions.MATCH]])] + seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1050,7 +1050,7 @@ def test_count_seed_region_concordance_seed_not_aligned(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[[15, 
CigarActions.MATCH]])] + seed_alignments = [AlignmentWrapper(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1072,7 +1072,7 @@ def test_count_seed_region_concordance_larger_match(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[[30, CigarActions.MATCH]])] + seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1095,9 +1095,9 @@ def test_count_seed_region_concordance_insertion(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[[9, CigarActions.MATCH], - [3, CigarActions.INSERT], - [18, CigarActions.MATCH]])] + seed_alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.INSERT), + (18, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1120,9 +1120,9 @@ def test_count_seed_region_concordance_deletion(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=27, cigar=[[9, CigarActions.MATCH], - [3, CigarActions.DELETE], - [18, CigarActions.MATCH]])] + seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=27, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.DELETE), + (18, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 7d182e466..0130e4def 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py 
@@ -1,11 +1,11 @@ -from typing import Dict, List, Optional, Iterable, Set +from typing import Dict, List, Optional, Iterable, Set, Tuple from dataclasses import dataclass, replace from itertools import count from operator import attrgetter import csv import os import logging -from aligntools import CigarActions +from aligntools import CigarActions, Cigar from gotoh import align_it, align_it_aa from mappy import Alignment, Aligner @@ -57,10 +57,31 @@ def map_amino_sequences(from_seq: str, to_seq: str): return seq_map -class AlignmentWrapper(Alignment): +@dataclass +class AlignmentWrapper: + + ctg: str # The reference contig name. + ctg_len: int # The reference contig length. + r_st: int + r_en: int + strand: int # Either forward (1 : int) or reverse (-1 : int) strand. + q_st: int + q_en: int + mapq: int # The map quality. + cigar: List[Tuple[int, CigarActions]] + is_primary: bool + mlen: int # How many matches. + blen: int + NM: int + trans_strand: int + read_num: int + cs: str + MD: str + cigar_str: str + init_fields = ( 'ctg ctg_len r_st r_en strand q_st q_en mapq cigar is_primary mlen ' - 'blen NM trans_strand read_num cs MD').split() + 'blen NM trans_strand read_num cs MD cigar_str').split() @classmethod def wrap(cls, source: Alignment, **overrides): @@ -75,76 +96,54 @@ def wrap(cls, source: Alignment, **overrides): args[i] = value return cls(*args) - # noinspection PyPep8Naming - def __new__(cls, - ctg='', - ctg_len=0, - r_st=0, - r_en=0, - strand=1, - q_st=0, - q_en=0, - mapq=0, - cigar: Iterable[List[int]] = tuple(), - is_primary=True, - mlen=0, - blen=0, - NM=0, - trans_strand=0, - read_num=1, - cs='', - MD=''): - """ Create an instance. 
- - :param ctg: name of the reference sequence the query is mapped to - :param ctg_len: total length of the reference sequence - :param r_st and r_en: start and end positions on the reference - :param strand: +1 if on the forward strand; -1 if on the reverse strand - :param q_st and q_en: start and end positions on the query - :param mapq: mapping quality - :param cigar: CIGAR returned as an array of shape (n_cigar,2). The two - numbers give the length and the operator of each CIGAR operation. - :param is_primary: if the alignment is primary (typically the best and - the first to generate) - :param mlen: length of the matching bases in the alignment, excluding - ambiguous base matches. - :param blen: length of the alignment, including both alignment matches - and gaps but excluding ambiguous bases. - :param NM: number of mismatches, gaps and ambiguous positions in the - alignment - :param trans_strand: transcript strand. +1 if on the forward strand; -1 - if on the reverse strand; 0 if unknown - :param read_num: read number that the alignment corresponds to; 1 for - the first read and 2 for the second read - :param cs: the cs tag. - :param MD: the MD tag as in the SAM format. It is an empty string unless - the MD argument is applied when calling mappy.Aligner.map(). 
- """ + def __init__(self, + ctg='', + ctg_len=0, + r_st=0, + r_en=0, + strand=1, + q_st=0, + q_en=0, + mapq=0, + cigar: Iterable[Tuple[int, CigarActions]] = tuple(), + is_primary=True, + mlen=0, + blen=0, + NM=0, + trans_strand=0, + read_num=1, + cs='', + MD='', + cigar_str=None): + cigar = list(cigar) if not mlen: mlen = min(q_en-q_st, r_en-r_st) if not blen: blen = max(q_en-q_st, r_en-r_st) if not cigar: - cigar = [[max(q_en-q_st, r_en-r_st), CigarActions.MATCH]] - return super().__new__(cls, - ctg, - ctg_len, - r_st, - r_en, - strand, - q_st, - q_en, - mapq, - cigar, - is_primary, - mlen, - blen, - NM, - trans_strand, - read_num-1, - cs, - MD) + cigar = [(max(q_en-q_st, r_en-r_st), CigarActions.MATCH)] + if cigar_str is None: + cigar_str = str(Cigar(cigar)) + + self.ctg = ctg + self.ctg_len = ctg_len + self.r_st = r_st + self.r_en = r_en + self.strand = strand + self.q_st = q_st + self.q_en = q_en + self.mapq = mapq + self.cigar = cigar + self.is_primary = is_primary + self.mlen = mlen + self.blen = blen + self.NM = NM + self.trans_strand = trans_strand + self.read_num = read_num + self.cs = cs + self.MD = MD + self.cigar_str = cigar_str def __eq__(self, other: Alignment): for field_name in self.init_fields: @@ -173,7 +172,7 @@ def __init__(self, self.coordinate_name = self.consensus = self.amino_consensus = '' self.algorithm = '' self.consensus_offset = 0 - self.alignments: List[Alignment] = [] + self.alignments: List[AlignmentWrapper] = [] self.reading_frames: Dict[int, List[SeedAmino]] = {} self.seed_nucs: List[SeedNucleotide] = [] self.amino_alignments: List[AminoAlignment] = [] @@ -276,9 +275,17 @@ def start_contig(self, else: self.algorithm = 'gotoh' self.align_gotoh(coordinate_seq, self.consensus) - self.alignments = [alignment + + self.alignments = [AlignmentWrapper.wrap(alignment) for alignment in self.alignments if alignment.is_primary] + + for alignment in self.alignments: + new = [] + for (size, action) in alignment.cigar: + new.append((size, 
CigarActions(action))) + alignment.cigar = new + self.alignments.sort(key=attrgetter('q_st')) if self.overall_alignments_writer is not None: @@ -326,6 +333,9 @@ def align_gotoh(self, coordinate_seq: str, consensus: str): cigar[-1][0] += 1 else: cigar.append([1, expected_action]) + + typed_cigar: List[Tuple[int, CigarActions]] = [(a, CigarActions(b)) + for [a, b] in cigar] self.alignments.append(AlignmentWrapper( 'N/A', len(coordinate_seq), @@ -333,7 +343,7 @@ def align_gotoh(self, coordinate_seq: str, consensus: str): ref_index, q_st=0, q_en=consensus_index, - cigar=cigar)) + cigar=typed_cigar)) def find_amino_alignments(self, start_pos: int, From d3f061699761f47806986b66c59d46559e41d5d2 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 14:10:24 -0700 Subject: [PATCH 425/485] Remove redundant code in consensus_aligner.py --- micall/utils/consensus_aligner.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 0130e4def..f6b3590e3 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -57,28 +57,7 @@ def map_amino_sequences(from_seq: str, to_seq: str): return seq_map -@dataclass class AlignmentWrapper: - - ctg: str # The reference contig name. - ctg_len: int # The reference contig length. - r_st: int - r_en: int - strand: int # Either forward (1 : int) or reverse (-1 : int) strand. - q_st: int - q_en: int - mapq: int # The map quality. - cigar: List[Tuple[int, CigarActions]] - is_primary: bool - mlen: int # How many matches. 
- blen: int - NM: int - trans_strand: int - read_num: int - cs: str - MD: str - cigar_str: str - init_fields = ( 'ctg ctg_len r_st r_en strand q_st q_en mapq cigar is_primary mlen ' 'blen NM trans_strand read_num cs MD cigar_str').split() From 4f54093bbefcd57cb187bbd3c115e3749ecc6e1d Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 14:29:23 -0700 Subject: [PATCH 426/485] Add comments for AlignmentWrapper class --- micall/utils/consensus_aligner.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index f6b3590e3..720f31c4f 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -58,15 +58,18 @@ def map_amino_sequences(from_seq: str, to_seq: str): class AlignmentWrapper: + """ + Our representation of mappy's Alignment object. + """ + init_fields = ( 'ctg ctg_len r_st r_en strand q_st q_en mapq cigar is_primary mlen ' 'blen NM trans_strand read_num cs MD cigar_str').split() @classmethod def wrap(cls, source: Alignment, **overrides): - """ Wrap an Alignment object to make it easier to compare and display. - - Mostly used when testing. + """ + Wrap mappy's Alignment object to make it easier to work with. 
""" args = [getattr(source, field_name) for field_name in cls.init_fields] From aba56deb2d6f448180c2dcd2e959a8847f8ee249 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 14:31:37 -0700 Subject: [PATCH 427/485] Rename "fasta_to_svg" to "fasta_to_csv" --- micall/drivers/sample.py | 4 ++-- micall/tests/test_fasta_to_csv.py | 32 +++++++++++++++---------------- micall/utils/contig_blaster.py | 4 ++-- micall/utils/fasta_to_csv.py | 4 ++-- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index e909d46ef..c63c3353c 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -20,7 +20,7 @@ from micall.core.denovo import denovo from micall.g2p.fastq_g2p import fastq_g2p, DEFAULT_MIN_COUNT, MIN_VALID, MIN_VALID_PERCENT from micall.utils.driver_utils import makedirs -from micall.utils.fasta_to_csv import fasta_to_svg +from micall.utils.fasta_to_csv import fasta_to_csv from contextlib import contextmanager logger = logging.getLogger(__name__) @@ -429,7 +429,7 @@ def run_denovo(self, excluded_seeds): with open(self.unstitched_contigs_csv, 'w') as unstitched_contigs_csv, \ open(self.merged_contigs_csv, 'r') as merged_contigs_csv, \ open(self.blast_csv, 'w') as blast_csv: - fasta_to_svg(self.unstitched_contigs_fasta, + fasta_to_csv(self.unstitched_contigs_fasta, unstitched_contigs_csv, merged_contigs_csv, blast_csv=blast_csv, diff --git a/micall/tests/test_fasta_to_csv.py b/micall/tests/test_fasta_to_csv.py index 930eefaa7..4ab951b6d 100644 --- a/micall/tests/test_fasta_to_csv.py +++ b/micall/tests/test_fasta_to_csv.py @@ -4,7 +4,7 @@ from Bio import SeqIO import pytest -from micall.utils.fasta_to_csv import default_database, genotype, fasta_to_svg +from micall.utils.fasta_to_csv import default_database, genotype, fasta_to_csv from micall.blast_db.make_blast_db import make_blast_db, DEFAULT_PROJECTS @@ -60,7 +60,7 @@ def test_genotype(tmpdir, hcv_db): assert expected_blast_csv == 
blast_csv.getvalue() -def test_fasta_to_svg_two_sequences(tmpdir, hcv_db): +def test_fasta_to_csv_two_sequences(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -75,12 +75,12 @@ def test_fasta_to_svg_two_sequences(tmpdir, hcv_db): HCV-1a,1.0,HCV-1a,CAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - fasta_to_svg(str(contigs_fasta), contigs_csv) + fasta_to_csv(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_fasta_to_svg_two_groups(tmpdir, hcv_db): +def test_fasta_to_csv_two_groups(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -98,12 +98,12 @@ def test_fasta_to_svg_two_groups(tmpdir, hcv_db): HCV-2b,1.0,HCV-2b,TGCAATGACAGCTTACAGACGGGTTTCCTCGCTTCCTTGTTTTACACCCA """ - fasta_to_svg(str(contigs_fasta), contigs_csv) + fasta_to_csv(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_fasta_to_svg_not_found(tmpdir, hcv_db): +def test_fasta_to_csv_not_found(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -115,12 +115,12 @@ def test_fasta_to_svg_not_found(tmpdir, hcv_db): unknown,0,,CATCACATAGGAGA """ - fasta_to_svg(str(contigs_fasta), contigs_csv) + fasta_to_csv(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_fasta_to_svg_partial_match(tmpdir, hcv_db): +def test_fasta_to_csv_partial_match(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -135,12 +135,12 @@ def test_fasta_to_svg_partial_match(tmpdir, hcv_db): HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - fasta_to_svg(str(contigs_fasta), contigs_csv) + fasta_to_csv(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_fasta_to_svg_reversed_match(tmpdir, hcv_db): +def 
test_fasta_to_csv_reversed_match(tmpdir, hcv_db): """ If BLAST match is reversed, then reverse the contig before reporting. """ contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ @@ -156,12 +156,12 @@ def test_fasta_to_svg_reversed_match(tmpdir, hcv_db): HCV-1a,0.75,HCV-1a,CATCACATAGGAGACAGGGCTCCAGGACTGCACCATGCTCGTGTGTGGCGACGAC """ - fasta_to_svg(str(contigs_fasta), contigs_csv) + fasta_to_csv(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() -def test_fasta_to_svg(tmpdir, hcv_db): +def test_fasta_to_csv(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ >foo @@ -183,13 +183,13 @@ def test_fasta_to_svg(tmpdir, hcv_db): 1,HCV-1a,41,1.0,100,1,41,8187,8227 """ - fasta_to_svg(str(contigs_fasta), contigs_csv, blast_csv=blast_csv) + fasta_to_csv(str(contigs_fasta), contigs_csv, blast_csv=blast_csv) assert expected_contigs_csv == contigs_csv.getvalue() assert expected_blast_csv == blast_csv.getvalue() -def test_fasta_to_svg_none(tmpdir, hcv_db): +def test_fasta_to_csv_none(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / 'contigs.fasta' assert not contigs_fasta.exists() @@ -198,7 +198,7 @@ def test_fasta_to_svg_none(tmpdir, hcv_db): ref,match,group_ref,contig """ - fasta_to_svg(str(contigs_fasta), contigs_csv) + fasta_to_csv(str(contigs_fasta), contigs_csv) assert expected_contigs_csv == contigs_csv.getvalue() @@ -220,7 +220,7 @@ def test_merged_contig(tmpdir, hcv_db): """ with merged_contigs_path.open() as merged_contigs_csv: - fasta_to_svg(str(contigs_fasta), + fasta_to_csv(str(contigs_fasta), contigs_csv, merged_contigs_csv=merged_contigs_csv) diff --git a/micall/utils/contig_blaster.py b/micall/utils/contig_blaster.py index 7158d25ef..ccdbacd0a 100644 --- a/micall/utils/contig_blaster.py +++ b/micall/utils/contig_blaster.py @@ -5,7 +5,7 @@ from operator import itemgetter from tempfile import NamedTemporaryFile -from micall.utils.fasta_to_csv import fasta_to_svg 
+from micall.utils.fasta_to_csv import fasta_to_csv def parse_args(): @@ -44,7 +44,7 @@ def main(): fasta_file.flush() new_contigs_csv = StringIO() blast_csv = StringIO() - fasta_to_svg(fasta_file.name, new_contigs_csv, blast_csv=blast_csv) + fasta_to_csv(fasta_file.name, new_contigs_csv, blast_csv=blast_csv) blast_csv.seek(0) for source_contig_num, contig_rows in groupby(DictReader(blast_csv), itemgetter('contig_num')): diff --git a/micall/utils/fasta_to_csv.py b/micall/utils/fasta_to_csv.py index 60d2f2431..892291a98 100644 --- a/micall/utils/fasta_to_csv.py +++ b/micall/utils/fasta_to_csv.py @@ -245,7 +245,7 @@ def invoke_blast(db: str) -> str: return samples -def fasta_to_svg(contigs_fasta_path: str, +def fasta_to_csv(contigs_fasta_path: str, contigs_csv: TextIO, merged_contigs_csv: Optional[TextIO] = None, blast_csv: Optional[TextIO] = None) -> None: @@ -285,7 +285,7 @@ def main(argv: Sequence[str]): parser.add_argument('--blast_csv', type=argparse.FileType('w'), help="Optional CSV file to write BLAST search results.") args = parser.parse_args(argv) - fasta_to_svg(args.contigs_fasta, args.contigs_csv, args.merged_contigs_csv, args.blast_csv) + fasta_to_csv(args.contigs_fasta, args.contigs_csv, args.merged_contigs_csv, args.blast_csv) if __name__ == "__main__": From d89b2a1feab796cf2aecaa15c6801cc2049c5de0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 14:33:42 -0700 Subject: [PATCH 428/485] Do not import mappy's Alignment class --- micall/utils/consensus_aligner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 720f31c4f..1e92cc59c 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -8,7 +8,7 @@ from aligntools import CigarActions, Cigar from gotoh import align_it, align_it_aa -from mappy import Alignment, Aligner +from mappy import Aligner from micall.core.project_config import ProjectConfig from 
micall.utils.report_amino import SeedAmino, ReportAmino, ReportNucleotide, SeedNucleotide @@ -67,7 +67,7 @@ class AlignmentWrapper: 'blen NM trans_strand read_num cs MD cigar_str').split() @classmethod - def wrap(cls, source: Alignment, **overrides): + def wrap(cls, source: object, **overrides): """ Wrap mappy's Alignment object to make it easier to work with. """ @@ -127,7 +127,7 @@ def __init__(self, self.MD = MD self.cigar_str = cigar_str - def __eq__(self, other: Alignment): + def __eq__(self, other: object): for field_name in self.init_fields: self_value = getattr(self, field_name) other_value = getattr(other, field_name) From 85cb566993ce103c90c766cd7aaa3a40887e07c4 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 14:35:44 -0700 Subject: [PATCH 429/485] Rename "AlignmentWrapper" to "Alignment" --- micall/tests/test_consensus_aligner.py | 302 ++++++++++++------------- micall/utils/consensus_aligner.py | 10 +- 2 files changed, 156 insertions(+), 156 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 4139f0039..f0fe300d9 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -5,7 +5,7 @@ from pytest import approx from micall.core.aln2counts import SeedAmino, ReportAmino -from micall.utils.consensus_aligner import ConsensusAligner, AlignmentWrapper, AminoAlignment +from micall.utils.consensus_aligner import ConsensusAligner, Alignment, AminoAlignment from aligntools import CigarActions from micall.core.project_config import ProjectConfig @@ -29,15 +29,15 @@ def mutate(x): def assert_alignments(aligner: ConsensusAligner, - *expected_alignments: AlignmentWrapper): + *expected_alignments: Alignment): __tracebackhide__ = True - wrapped_alignments = tuple(AlignmentWrapper.wrap(alignment) + wrapped_alignments = tuple(Alignment.wrap(alignment) for alignment in aligner.alignments) if repr(wrapped_alignments) != repr(expected_alignments): assert 
wrapped_alignments == expected_alignments for i, (wrapped_alignment, expected_alignment) in enumerate( zip(wrapped_alignments, expected_alignments)): - for field_name in AlignmentWrapper.init_fields: + for field_name in Alignment.init_fields: wrapped = (i, field_name, getattr(wrapped_alignment, field_name)) expected = (i, field_name, getattr(expected_alignment, field_name)) assert wrapped == expected @@ -100,15 +100,15 @@ def test_create_reading_frames(): def test_alignment_repr(): - alignment = AlignmentWrapper('R1', 0, 1001, 1100, 1, 1, 100) + alignment = Alignment('R1', 0, 1001, 1100, 1, 1, 100) - assert repr(alignment) == "AlignmentWrapper('R1', 0, 1001, 1100, 1, 1, 100)" + assert repr(alignment) == "Alignment('R1', 0, 1001, 1100, 1, 1, 100)" def test_wrap_overrides(): - alignment1 = AlignmentWrapper(r_st=100, r_en=200, cigar_str='') - alignment2 = AlignmentWrapper.wrap(alignment1, r_en=300, blen=200, cigar=[]) - expected_alignment = AlignmentWrapper(r_st=100, r_en=300, blen=200, cigar=[], cigar_str='') + alignment1 = Alignment(r_st=100, r_en=200, cigar_str='') + alignment2 = Alignment.wrap(alignment1, r_en=300, blen=200, cigar=[]) + expected_alignment = Alignment(r_st=100, r_en=300, blen=200, cigar=[], cigar_str='') assert alignment2 == expected_alignment @@ -117,13 +117,13 @@ def test_start_contig(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[1000:2000] - expected_alignment = AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=1000, - r_en=2000, - q_st=0, - q_en=1000, - mapq=60) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=1000, + r_en=2000, + q_st=0, + q_en=1000, + mapq=60) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -137,27 +137,27 @@ def test_start_contig_multiple_sections(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[6000:6500] + seed_seq[3000:3500] + 
seed_seq[1000:2000] - expected_alignments = [AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=6000, - r_en=6500, - q_st=0, - q_en=500, - mapq=60), - AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=3000, - r_en=3500, - q_st=500, - q_en=1000, - mapq=60), - AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=1000, - r_en=2000, - q_st=1000, - q_en=2000, - mapq=60)] + expected_alignments = [Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=6000, + r_en=6500, + q_st=0, + q_en=500, + mapq=60), + Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=3000, + r_en=3500, + q_st=500, + q_en=1000, + mapq=60), + Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=1000, + r_en=2000, + q_st=1000, + q_en=2000, + mapq=60)] aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -387,12 +387,12 @@ def test_start_contig_short_consensus(projects): start = 1560 end = 1617 consensus = seed_seq[start:end] - expected_alignment = AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=start, - r_en=end, - q_st=0, - q_en=end-start) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=start, + r_en=end, + q_st=0, + q_en=end-start) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -405,17 +405,17 @@ def test_start_contig_deletion_minimap2(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + seed_seq[2031:2060] - expected_alignment = AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2060, - q_st=0, - q_en=59, - mapq=9, - cigar=[(30, CigarActions.MATCH), - (1, CigarActions.DELETE), - (29, CigarActions.MATCH)], - NM=1) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2060, + q_st=0, + q_en=59, + mapq=9, + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.DELETE), + (29, CigarActions.MATCH)], + NM=1) aligner = ConsensusAligner(projects) 
aligner.start_contig(seed_name, consensus) @@ -430,26 +430,26 @@ def test_start_contig_big_deletion_minimap2(projects): seed_seq = mutate_sequence(seq=seed_seq, rate=0.04) consensus = seed_seq[290:983] + seed_seq[3000:9269] - expected_alignment = [AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - mlen=668, - r_st=290, - r_en=983, - q_st=0, - q_en=693, - mapq=60, - cigar=[(693, CigarActions.MATCH)], - NM=25), - AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - mlen=6013, - r_st=3000, - r_en=9269, - q_st=693, - q_en=6962, - mapq=60, - cigar=[(6269, CigarActions.MATCH)], - NM=256)] + expected_alignment = [Alignment(ctg='N/A', + ctg_len=len(seed_seq), + mlen=668, + r_st=290, + r_en=983, + q_st=0, + q_en=693, + mapq=60, + cigar=[(693, CigarActions.MATCH)], + NM=25), + Alignment(ctg='N/A', + ctg_len=len(seed_seq), + mlen=6013, + r_st=3000, + r_en=9269, + q_st=693, + q_en=6962, + mapq=60, + cigar=[(6269, CigarActions.MATCH)], + NM=256)] aligner = ConsensusAligner(projects) @@ -463,17 +463,17 @@ def test_start_contig_deletion_gotoh(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + seed_seq[2031:2050] - expected_alignment = AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2050, - q_st=0, - q_en=49, - mapq=0, - cigar=[(30, CigarActions.MATCH), - (1, CigarActions.DELETE), - (19, CigarActions.MATCH)], - NM=0) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2050, + q_st=0, + q_en=49, + mapq=0, + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.DELETE), + (19, CigarActions.MATCH)], + NM=0) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -487,15 +487,15 @@ def test_start_contig_matched_deletion_gotoh(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + '-' + seed_seq[2031:2050] - expected_alignment = AlignmentWrapper(ctg='N/A', - 
ctg_len=len(seed_seq), - r_st=2000, - r_en=2050, - q_st=0, - q_en=50, - mapq=0, - cigar=[(50, CigarActions.MATCH)], - NM=0) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2050, + q_st=0, + q_en=50, + mapq=0, + cigar=[(50, CigarActions.MATCH)], + NM=0) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -509,17 +509,17 @@ def test_start_contig_insertion_minimap2(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + 'ACT' + seed_seq[2030:2060] - expected_alignment = AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2060, - q_st=0, - q_en=63, - mapq=9, - cigar=[(30, CigarActions.MATCH), - (3, CigarActions.INSERT), - (30, CigarActions.MATCH)], - NM=3) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2060, + q_st=0, + q_en=63, + mapq=9, + cigar=[(30, CigarActions.MATCH), + (3, CigarActions.INSERT), + (30, CigarActions.MATCH)], + NM=3) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -532,17 +532,17 @@ def test_start_contig_insertion_gotoh(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + 'T' + seed_seq[2030:2050] - expected_alignment = AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2050, - q_st=0, - q_en=51, - mapq=0, - cigar=[(30, CigarActions.MATCH), - (1, CigarActions.INSERT), - (20, CigarActions.MATCH)], - NM=0) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2050, + q_st=0, + q_en=51, + mapq=0, + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.INSERT), + (20, CigarActions.MATCH)], + NM=0) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -556,13 +556,13 @@ def test_start_contig_with_only_primary_matches(projects): seed_name = 'HIV1-B-FR-K03455-seed' seed_seq = 
projects.getReference(seed_name) consensus = seed_seq[:500] - expected_alignment = AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=0, - r_en=500, - q_st=0, - q_en=500, - mapq=60) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=0, + r_en=500, + q_st=0, + q_en=500, + mapq=60) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -577,13 +577,13 @@ def test_start_contig_reading_frames(projects): reading_frames = create_reading_frames(expected_consensus) seed_name = 'HCV-6t' seed_seq = projects.getReference(seed_name) - expected_alignment = AlignmentWrapper(ctg='N/A', - ctg_len=len(seed_seq), - r_st=4798, - r_en=4807, - q_st=0, - q_en=9, - mapq=0) + expected_alignment = Alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=4798, + r_en=4807, + q_st=0, + q_en=9, + mapq=0) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, reading_frames=reading_frames) @@ -865,7 +865,7 @@ def test_count_coord_concordance(): aligner = ConsensusAligner(projects) aligner.consensus = "AGATTTCGATGATTCAGAAGATAAGCA" aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] + aligner.alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] expected_concordance_list = [1.0]*len(aligner.consensus) @@ -885,7 +885,7 @@ def test_count_coord_concordance_mismatch(): aligner.consensus = "AGATTTCGATGATTCAGAAGATTTGCA" # changed nucs: ^^ aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] + aligner.alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] # At the end of the consensus, the size of the averaging window for the concordance decreases from 20 to 11. 
# The concordance therefore decreases from 18/20 to 9/11 @@ -908,7 +908,7 @@ def test_count_coord_concordance_short_match(): aligner.consensus = "AGATTTCGATGATTCTCTTCTAAACGT" # last match position: ^ aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] + aligner.alignments = [Alignment(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] # We start out with 100% match for the first 6 positions expected_concordance_list = [1.0] * 6 # After that, the averaging window (whose size is still increasing) starts to slide past the match: @@ -936,8 +936,8 @@ def test_count_coord_concordance_two_matches(): aligner = ConsensusAligner(projects) aligner.consensus = "AGATTTCGATGATTCAGAAGATTTGCATTT" aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=12, q_st=0, q_en=12, cigar=[(12, CigarActions.MATCH)]), - AlignmentWrapper(r_st=15, r_en=30, q_st=15, q_en=30, cigar=[(15, CigarActions.MATCH)])] + aligner.alignments = [Alignment(r_st=0, r_en=12, q_st=0, q_en=12, cigar=[(12, CigarActions.MATCH)]), + Alignment(r_st=15, r_en=30, q_st=15, q_en=30, cigar=[(15, CigarActions.MATCH)])] expected_concordance_list = [1.0] * 3 + [12/13, 12/14, 12/15, 13/16, 14/17, 15/18, 16/19] + [17/20]*11 + \ [16/19, 15/18, 15/17, 15/16] + [1.0]*5 @@ -958,9 +958,9 @@ def test_count_coord_concordance_with_insertion(): aligner.consensus = "AGATTTCGACCCTGATTCAGAAGATTTGCA" # insertion: ^^^ aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), - (3, CigarActions.INSERT), - (18, CigarActions.MATCH)])] + aligner.alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.INSERT), + (18, CigarActions.MATCH)])] # the window size increases from 10 to 20, while the averaging window slides over the insertion 
expected_concordance_list = [9/10, 9/11, 9/12, 10/13, 11/14, 12/15, 13/16, 14/17, 15/18, 16/19] # for 10 positions in the middle, the insertion is included in the full window size fo 20 @@ -984,9 +984,9 @@ def test_count_coord_concordance_with_deletion(): aligner.consensus = "AGATTTCGATTCAGAAGATTTGCA" # deletion behind this pos: ^ aligner.coordinate_name = 'test-region' - aligner.alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), - (3, CigarActions.DELETE), - (15, CigarActions.MATCH)])] + aligner.alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.DELETE), + (15, CigarActions.MATCH)])] # the deletion does not decrease the concordance expected_concordance_list = [1.0]*len(aligner.consensus) @@ -1005,7 +1005,7 @@ def test_count_seed_region_concordance(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] + seed_alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1028,7 +1028,7 @@ def test_count_seed_region_concordance_mismatch(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] + seed_alignments = [Alignment(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1050,7 +1050,7 @@ def test_count_seed_region_concordance_seed_not_aligned(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] + seed_alignments 
= [Alignment(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1072,7 +1072,7 @@ def test_count_seed_region_concordance_larger_match(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] + seed_alignments = [Alignment(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1095,9 +1095,9 @@ def test_count_seed_region_concordance_insertion(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), - (3, CigarActions.INSERT), - (18, CigarActions.MATCH)])] + seed_alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.INSERT), + (18, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1120,9 +1120,9 @@ def test_count_seed_region_concordance_deletion(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [AlignmentWrapper(r_st=0, r_en=30, q_st=0, q_en=27, cigar=[(9, CigarActions.MATCH), - (3, CigarActions.DELETE), - (18, CigarActions.MATCH)])] + seed_alignments = [Alignment(r_st=0, r_en=30, q_st=0, q_en=27, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.DELETE), + (18, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 1e92cc59c..7850d159f 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -57,7 +57,7 @@ def map_amino_sequences(from_seq: str, to_seq: 
str): return seq_map -class AlignmentWrapper: +class Alignment: """ Our representation of mappy's Alignment object. """ @@ -136,7 +136,7 @@ def __eq__(self, other: object): return True def __repr__(self): - return (f'AlignmentWrapper({self.ctg!r}, {self.ctg_len}, ' + return (f'Alignment({self.ctg!r}, {self.ctg_len}, ' f'{self.r_st}, {self.r_en}, {self.strand}, ' f'{self.q_st}, {self.q_en})') @@ -154,7 +154,7 @@ def __init__(self, self.coordinate_name = self.consensus = self.amino_consensus = '' self.algorithm = '' self.consensus_offset = 0 - self.alignments: List[AlignmentWrapper] = [] + self.alignments: List[Alignment] = [] self.reading_frames: Dict[int, List[SeedAmino]] = {} self.seed_nucs: List[SeedNucleotide] = [] self.amino_alignments: List[AminoAlignment] = [] @@ -258,7 +258,7 @@ def start_contig(self, self.algorithm = 'gotoh' self.align_gotoh(coordinate_seq, self.consensus) - self.alignments = [AlignmentWrapper.wrap(alignment) + self.alignments = [Alignment.wrap(alignment) for alignment in self.alignments if alignment.is_primary] @@ -318,7 +318,7 @@ def align_gotoh(self, coordinate_seq: str, consensus: str): typed_cigar: List[Tuple[int, CigarActions]] = [(a, CigarActions(b)) for [a, b] in cigar] - self.alignments.append(AlignmentWrapper( + self.alignments.append(Alignment( 'N/A', len(coordinate_seq), ref_start, From 46f94267579b52c08c7a87151854c13c32bec022 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 14:45:25 -0700 Subject: [PATCH 430/485] Remove unused fields from Alignment This will make synthetic manipulations involving alignments easier. 
--- micall/tests/test_consensus_aligner.py | 27 +++++++++----------------- micall/utils/consensus_aligner.py | 22 +-------------------- 2 files changed, 10 insertions(+), 39 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index f0fe300d9..0d48a166b 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -107,8 +107,8 @@ def test_alignment_repr(): def test_wrap_overrides(): alignment1 = Alignment(r_st=100, r_en=200, cigar_str='') - alignment2 = Alignment.wrap(alignment1, r_en=300, blen=200, cigar=[]) - expected_alignment = Alignment(r_st=100, r_en=300, blen=200, cigar=[], cigar_str='') + alignment2 = Alignment.wrap(alignment1, r_en=300, cigar=[]) + expected_alignment = Alignment(r_st=100, r_en=300, cigar=[], cigar_str='') assert alignment2 == expected_alignment @@ -414,8 +414,7 @@ def test_start_contig_deletion_minimap2(projects): mapq=9, cigar=[(30, CigarActions.MATCH), (1, CigarActions.DELETE), - (29, CigarActions.MATCH)], - NM=1) + (29, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -432,24 +431,20 @@ def test_start_contig_big_deletion_minimap2(projects): expected_alignment = [Alignment(ctg='N/A', ctg_len=len(seed_seq), - mlen=668, r_st=290, r_en=983, q_st=0, q_en=693, mapq=60, - cigar=[(693, CigarActions.MATCH)], - NM=25), + cigar=[(693, CigarActions.MATCH)]), Alignment(ctg='N/A', ctg_len=len(seed_seq), - mlen=6013, r_st=3000, r_en=9269, q_st=693, q_en=6962, mapq=60, - cigar=[(6269, CigarActions.MATCH)], - NM=256)] + cigar=[(6269, CigarActions.MATCH)])] aligner = ConsensusAligner(projects) @@ -472,8 +467,7 @@ def test_start_contig_deletion_gotoh(projects): mapq=0, cigar=[(30, CigarActions.MATCH), (1, CigarActions.DELETE), - (19, CigarActions.MATCH)], - NM=0) + (19, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -494,8 +488,7 @@ def 
test_start_contig_matched_deletion_gotoh(projects): q_st=0, q_en=50, mapq=0, - cigar=[(50, CigarActions.MATCH)], - NM=0) + cigar=[(50, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -518,8 +511,7 @@ def test_start_contig_insertion_minimap2(projects): mapq=9, cigar=[(30, CigarActions.MATCH), (3, CigarActions.INSERT), - (30, CigarActions.MATCH)], - NM=3) + (30, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -541,8 +533,7 @@ def test_start_contig_insertion_gotoh(projects): mapq=0, cigar=[(30, CigarActions.MATCH), (1, CigarActions.INSERT), - (20, CigarActions.MATCH)], - NM=0) + (20, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 7850d159f..0eb122110 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -62,9 +62,7 @@ class Alignment: Our representation of mappy's Alignment object. 
""" - init_fields = ( - 'ctg ctg_len r_st r_en strand q_st q_en mapq cigar is_primary mlen ' - 'blen NM trans_strand read_num cs MD cigar_str').split() + init_fields = ('ctg ctg_len r_st r_en strand q_st q_en mapq cigar is_primary cigar_str').split() @classmethod def wrap(cls, source: object, **overrides): @@ -89,20 +87,9 @@ def __init__(self, mapq=0, cigar: Iterable[Tuple[int, CigarActions]] = tuple(), is_primary=True, - mlen=0, - blen=0, - NM=0, - trans_strand=0, - read_num=1, - cs='', - MD='', cigar_str=None): cigar = list(cigar) - if not mlen: - mlen = min(q_en-q_st, r_en-r_st) - if not blen: - blen = max(q_en-q_st, r_en-r_st) if not cigar: cigar = [(max(q_en-q_st, r_en-r_st), CigarActions.MATCH)] if cigar_str is None: @@ -118,13 +105,6 @@ def __init__(self, self.mapq = mapq self.cigar = cigar self.is_primary = is_primary - self.mlen = mlen - self.blen = blen - self.NM = NM - self.trans_strand = trans_strand - self.read_num = read_num - self.cs = cs - self.MD = MD self.cigar_str = cigar_str def __eq__(self, other: object): From 0df9f60f4dd34f992d022bfa6103b2e5350c832e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 14:51:21 -0700 Subject: [PATCH 431/485] Remove is_primary attribute from Alignment --- micall/utils/consensus_aligner.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 0eb122110..14ee8406e 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -62,7 +62,7 @@ class Alignment: Our representation of mappy's Alignment object. 
""" - init_fields = ('ctg ctg_len r_st r_en strand q_st q_en mapq cigar is_primary cigar_str').split() + init_fields = ('ctg ctg_len r_st r_en strand q_st q_en mapq cigar cigar_str').split() @classmethod def wrap(cls, source: object, **overrides): @@ -86,7 +86,6 @@ def __init__(self, q_en=0, mapq=0, cigar: Iterable[Tuple[int, CigarActions]] = tuple(), - is_primary=True, cigar_str=None): cigar = list(cigar) @@ -104,7 +103,6 @@ def __init__(self, self.q_en = q_en self.mapq = mapq self.cigar = cigar - self.is_primary = is_primary self.cigar_str = cigar_str def __eq__(self, other: object): @@ -231,23 +229,21 @@ def start_contig(self, except KeyError: coordinate_seq = self.projects.getReference(coordinate_name) aligner = Aligner(seq=coordinate_seq, preset='map-ont') - self.alignments = list(aligner.map(self.consensus)) - if self.alignments or 10_000 < len(self.consensus): + mappy_alignments = list(aligner.map(self.consensus)) + if mappy_alignments or 10_000 < len(self.consensus): self.algorithm = 'minimap2' + self.alignments = [Alignment.wrap(alignment) + for alignment in mappy_alignments + if alignment.is_primary] + for alignment in self.alignments: + new = [] + for (size, action) in alignment.cigar: + new.append((size, CigarActions(action))) + alignment.cigar = new else: self.algorithm = 'gotoh' self.align_gotoh(coordinate_seq, self.consensus) - self.alignments = [Alignment.wrap(alignment) - for alignment in self.alignments - if alignment.is_primary] - - for alignment in self.alignments: - new = [] - for (size, action) in alignment.cigar: - new.append((size, CigarActions(action))) - alignment.cigar = new - self.alignments.sort(key=attrgetter('q_st')) if self.overall_alignments_writer is not None: From 79e8410821f227bd7a7871dc0314a249255e909c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 15:07:17 -0700 Subject: [PATCH 432/485] Add conversions between Alignment and CigarHit --- micall/utils/consensus_aligner.py | 19 ++++++++++++++++++- 1 file 
changed, 18 insertions(+), 1 deletion(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 14ee8406e..b364e2528 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -5,7 +5,7 @@ import csv import os import logging -from aligntools import CigarActions, Cigar +from aligntools import CigarActions, Cigar, CigarHit from gotoh import align_it, align_it_aa from mappy import Aligner @@ -105,6 +105,23 @@ def __init__(self, self.cigar = cigar self.cigar_str = cigar_str + def to_cigar_hit(self) -> CigarHit: + return CigarHit(Cigar(self.cigar), + r_st=self.r_st, r_ei=self.r_en - 1, + q_st=self.q_st, q_ei=self.q_en - 1) + + @staticmethod + def from_cigar_hit(hit: CigarHit, ctg='', ctg_len=0, strand=1, mapq=0) -> 'Alignment': + return Alignment(ctg=ctg, + ctg_len=ctg_len, + r_st=hit.r_st, r_en=hit.r_ei + 1, + strand=strand, + q_st=hit.q_st, q_en=hit.q_ei + 1, + mapq=mapq, + cigar=hit.cigar._data, + cigar_str=str(hit.cigar), + ) + def __eq__(self, other: object): for field_name in self.init_fields: self_value = getattr(self, field_name) From 8fab55bdb23fcb7583226d22c1e2e4bf2a4fa8db Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 15:22:33 -0700 Subject: [PATCH 433/485] Remove init_fields attribute from Alignment --- micall/tests/test_consensus_aligner.py | 14 ++----- micall/utils/consensus_aligner.py | 53 +++++++++++++------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 0d48a166b..d8c9cef6b 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -31,13 +31,15 @@ def mutate(x): def assert_alignments(aligner: ConsensusAligner, *expected_alignments: Alignment): __tracebackhide__ = True - wrapped_alignments = tuple(Alignment.wrap(alignment) + wrapped_alignments = tuple(Alignment.coerce(alignment) for alignment in aligner.alignments) if 
repr(wrapped_alignments) != repr(expected_alignments): assert wrapped_alignments == expected_alignments for i, (wrapped_alignment, expected_alignment) in enumerate( zip(wrapped_alignments, expected_alignments)): - for field_name in Alignment.init_fields: + for field_name in dir(expected_alignment): + if callable(getattr(expected_alignment, field_name)) or field_name.startswith('_'): + continue wrapped = (i, field_name, getattr(wrapped_alignment, field_name)) expected = (i, field_name, getattr(expected_alignment, field_name)) assert wrapped == expected @@ -105,14 +107,6 @@ def test_alignment_repr(): assert repr(alignment) == "Alignment('R1', 0, 1001, 1100, 1, 1, 100)" -def test_wrap_overrides(): - alignment1 = Alignment(r_st=100, r_en=200, cigar_str='') - alignment2 = Alignment.wrap(alignment1, r_en=300, cigar=[]) - expected_alignment = Alignment(r_st=100, r_en=300, cigar=[], cigar_str='') - - assert alignment2 == expected_alignment - - def test_start_contig(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index b364e2528..e05c7e499 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -8,7 +8,7 @@ from aligntools import CigarActions, Cigar, CigarHit from gotoh import align_it, align_it_aa -from mappy import Aligner +import mappy from micall.core.project_config import ProjectConfig from micall.utils.report_amino import SeedAmino, ReportAmino, ReportNucleotide, SeedNucleotide @@ -62,20 +62,6 @@ class Alignment: Our representation of mappy's Alignment object. """ - init_fields = ('ctg ctg_len r_st r_en strand q_st q_en mapq cigar cigar_str').split() - - @classmethod - def wrap(cls, source: object, **overrides): - """ - Wrap mappy's Alignment object to make it easier to work with. 
- """ - args = [getattr(source, field_name) - for field_name in cls.init_fields] - for name, value in overrides.items(): - i = cls.init_fields.index(name) - args[i] = value - return cls(*args) - def __init__(self, ctg='', ctg_len=0, @@ -105,6 +91,23 @@ def __init__(self, self.cigar = cigar self.cigar_str = cigar_str + @staticmethod + def coerce(obj: object) -> 'Alignment': + if isinstance(obj, Alignment): + return obj + elif isinstance(obj, mappy.Alignment): + return Alignment(ctg=obj.ctg, + ctg_len=obj.ctg_len, + r_st=obj.r_st, r_en=obj.r_en, + strand=obj.strand, + q_st=obj.q_st, q_en=obj.q_en, + mapq=obj.mapq, + cigar=obj.cigar, + cigar_str=obj.cigar_str, + ) + else: + raise TypeError(f"Cannot coerce from {obj!r}.") + def to_cigar_hit(self) -> CigarHit: return CigarHit(Cigar(self.cigar), r_st=self.r_st, r_ei=self.r_en - 1, @@ -123,12 +126,10 @@ def from_cigar_hit(hit: CigarHit, ctg='', ctg_len=0, strand=1, mapq=0) -> 'Align ) def __eq__(self, other: object): - for field_name in self.init_fields: - self_value = getattr(self, field_name) - other_value = getattr(other, field_name) - if self_value != other_value: - return False - return True + # Filter out private attributes (those starting with an underscore) + self_public_attrs = {k: v for k, v in self.__dict__.items() if not k.startswith('_')} + other_public_attrs = {k: v for k, v in other.__dict__.items() if not k.startswith('_')} + return self_public_attrs == other_public_attrs def __repr__(self): return (f'Alignment({self.ctg!r}, {self.ctg_len}, ' @@ -245,11 +246,11 @@ def start_contig(self, coordinate_seq = self.projects.getGenotypeReference(coordinate_name) except KeyError: coordinate_seq = self.projects.getReference(coordinate_name) - aligner = Aligner(seq=coordinate_seq, preset='map-ont') - mappy_alignments = list(aligner.map(self.consensus)) + aligner = mappy.Aligner(seq=coordinate_seq, preset='map-ont') + mappy_alignments: List[mappy.Alignment] = list(aligner.map(self.consensus)) if mappy_alignments or 
10_000 < len(self.consensus): self.algorithm = 'minimap2' - self.alignments = [Alignment.wrap(alignment) + self.alignments = [Alignment.coerce(alignment) for alignment in mappy_alignments if alignment.is_primary] for alignment in self.alignments: @@ -881,8 +882,8 @@ def seed_concordance(self, seed_name, projects, seed_coordinates, excluded_regio if self.seed_concordance_writer is None: return seed_ref = self.projects.getReference(seed_name) - seed_aligner = Aligner(seq=seed_ref, preset='map-ont') - seed_alignments = list(seed_aligner.map(self.consensus)) + seed_aligner = mappy.Aligner(seq=seed_ref, preset='map-ont') + seed_alignments: List[mappy.Alignment] = list(seed_aligner.map(self.consensus)) regions = projects.getCoordinateReferences(seed_name) for region in regions: From 6f52f8040f1b550274f7898c8da11f6337104529 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 15:28:37 -0700 Subject: [PATCH 434/485] Streamline coersion from mappy.Alignment to Alignment --- micall/utils/consensus_aligner.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index e05c7e499..a55fdf16e 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -96,13 +96,17 @@ def coerce(obj: object) -> 'Alignment': if isinstance(obj, Alignment): return obj elif isinstance(obj, mappy.Alignment): + cigar: List[Tuple[int, CigarActions]] = [] + for (size, action) in obj.cigar: + cigar.append((size, CigarActions(action))) + return Alignment(ctg=obj.ctg, ctg_len=obj.ctg_len, r_st=obj.r_st, r_en=obj.r_en, strand=obj.strand, q_st=obj.q_st, q_en=obj.q_en, mapq=obj.mapq, - cigar=obj.cigar, + cigar=cigar, cigar_str=obj.cigar_str, ) else: @@ -253,11 +257,6 @@ def start_contig(self, self.alignments = [Alignment.coerce(alignment) for alignment in mappy_alignments if alignment.is_primary] - for alignment in self.alignments: - new = [] - for (size, action) in 
alignment.cigar: - new.append((size, CigarActions(action))) - alignment.cigar = new else: self.algorithm = 'gotoh' self.align_gotoh(coordinate_seq, self.consensus) From 92efaff407434c2e29c6cfbbc2f4c15ced60e39f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 16:19:18 -0700 Subject: [PATCH 435/485] Move Alignment definition into its own file --- micall/utils/alignment.py | 88 ++++++++++++++++++++++++++++++ micall/utils/consensus_aligner.py | 89 ++----------------------------- 2 files changed, 91 insertions(+), 86 deletions(-) create mode 100644 micall/utils/alignment.py diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py new file mode 100644 index 000000000..7b6aa1833 --- /dev/null +++ b/micall/utils/alignment.py @@ -0,0 +1,88 @@ +from typing import Iterable, Tuple, List + +from aligntools import CigarActions, Cigar, CigarHit +import mappy + + +class Alignment: + """ + Our representation of mappy's Alignment object. + """ + + def __init__(self, + ctg='', + ctg_len=0, + r_st=0, + r_en=0, + strand=1, + q_st=0, + q_en=0, + mapq=0, + cigar: Iterable[Tuple[int, CigarActions]] = tuple(), + cigar_str=None): + + cigar = list(cigar) + if not cigar: + cigar = [(max(q_en-q_st, r_en-r_st), CigarActions.MATCH)] + if cigar_str is None: + cigar_str = str(Cigar(cigar)) + + self.ctg = ctg + self.ctg_len = ctg_len + self.r_st = r_st + self.r_en = r_en + self.strand = strand + self.q_st = q_st + self.q_en = q_en + self.mapq = mapq + self.cigar = cigar + self.cigar_str = cigar_str + + @staticmethod + def coerce(obj: object) -> 'Alignment': + if isinstance(obj, Alignment): + return obj + elif isinstance(obj, mappy.Alignment): + cigar: List[Tuple[int, CigarActions]] = [] + for (size, action) in obj.cigar: + cigar.append((size, CigarActions(action))) + + return Alignment(ctg=obj.ctg, + ctg_len=obj.ctg_len, + r_st=obj.r_st, r_en=obj.r_en, + strand=obj.strand, + q_st=obj.q_st, q_en=obj.q_en, + mapq=obj.mapq, + cigar=cigar, + cigar_str=obj.cigar_str, + ) + 
else: + raise TypeError(f"Cannot coerce from {obj!r}.") + + def to_cigar_hit(self) -> CigarHit: + return CigarHit(Cigar(self.cigar), + r_st=self.r_st, r_ei=self.r_en - 1, + q_st=self.q_st, q_ei=self.q_en - 1) + + @staticmethod + def from_cigar_hit(hit: CigarHit, ctg='', ctg_len=0, strand=1, mapq=0) -> 'Alignment': + return Alignment(ctg=ctg, + ctg_len=ctg_len, + r_st=hit.r_st, r_en=hit.r_ei + 1, + strand=strand, + q_st=hit.q_st, q_en=hit.q_ei + 1, + mapq=mapq, + cigar=hit.cigar._data, + cigar_str=str(hit.cigar), + ) + + def __eq__(self, other: object): + # Filter out private attributes (those starting with an underscore) + self_public_attrs = {k: v for k, v in self.__dict__.items() if not k.startswith('_')} + other_public_attrs = {k: v for k, v in other.__dict__.items() if not k.startswith('_')} + return self_public_attrs == other_public_attrs + + def __repr__(self): + return (f'Alignment({self.ctg!r}, {self.ctg_len}, ' + f'{self.r_st}, {self.r_en}, {self.strand}, ' + f'{self.q_st}, {self.q_en})') diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index a55fdf16e..7e833c087 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -1,11 +1,11 @@ -from typing import Dict, List, Optional, Iterable, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple from dataclasses import dataclass, replace from itertools import count from operator import attrgetter import csv import os import logging -from aligntools import CigarActions, Cigar, CigarHit +from aligntools import CigarActions from gotoh import align_it, align_it_aa import mappy @@ -13,6 +13,7 @@ from micall.core.project_config import ProjectConfig from micall.utils.report_amino import SeedAmino, ReportAmino, ReportNucleotide, SeedNucleotide from micall.utils.translation import translate +from micall.utils.alignment import Alignment logger = logging.getLogger(__name__) @@ -57,90 +58,6 @@ def map_amino_sequences(from_seq: str, to_seq: str): 
return seq_map -class Alignment: - """ - Our representation of mappy's Alignment object. - """ - - def __init__(self, - ctg='', - ctg_len=0, - r_st=0, - r_en=0, - strand=1, - q_st=0, - q_en=0, - mapq=0, - cigar: Iterable[Tuple[int, CigarActions]] = tuple(), - cigar_str=None): - - cigar = list(cigar) - if not cigar: - cigar = [(max(q_en-q_st, r_en-r_st), CigarActions.MATCH)] - if cigar_str is None: - cigar_str = str(Cigar(cigar)) - - self.ctg = ctg - self.ctg_len = ctg_len - self.r_st = r_st - self.r_en = r_en - self.strand = strand - self.q_st = q_st - self.q_en = q_en - self.mapq = mapq - self.cigar = cigar - self.cigar_str = cigar_str - - @staticmethod - def coerce(obj: object) -> 'Alignment': - if isinstance(obj, Alignment): - return obj - elif isinstance(obj, mappy.Alignment): - cigar: List[Tuple[int, CigarActions]] = [] - for (size, action) in obj.cigar: - cigar.append((size, CigarActions(action))) - - return Alignment(ctg=obj.ctg, - ctg_len=obj.ctg_len, - r_st=obj.r_st, r_en=obj.r_en, - strand=obj.strand, - q_st=obj.q_st, q_en=obj.q_en, - mapq=obj.mapq, - cigar=cigar, - cigar_str=obj.cigar_str, - ) - else: - raise TypeError(f"Cannot coerce from {obj!r}.") - - def to_cigar_hit(self) -> CigarHit: - return CigarHit(Cigar(self.cigar), - r_st=self.r_st, r_ei=self.r_en - 1, - q_st=self.q_st, q_ei=self.q_en - 1) - - @staticmethod - def from_cigar_hit(hit: CigarHit, ctg='', ctg_len=0, strand=1, mapq=0) -> 'Alignment': - return Alignment(ctg=ctg, - ctg_len=ctg_len, - r_st=hit.r_st, r_en=hit.r_ei + 1, - strand=strand, - q_st=hit.q_st, q_en=hit.q_ei + 1, - mapq=mapq, - cigar=hit.cigar._data, - cigar_str=str(hit.cigar), - ) - - def __eq__(self, other: object): - # Filter out private attributes (those starting with an underscore) - self_public_attrs = {k: v for k, v in self.__dict__.items() if not k.startswith('_')} - other_public_attrs = {k: v for k, v in other.__dict__.items() if not k.startswith('_')} - return self_public_attrs == other_public_attrs - - def 
__repr__(self): - return (f'Alignment({self.ctg!r}, {self.ctg_len}, ' - f'{self.r_st}, {self.r_en}, {self.strand}, ' - f'{self.q_st}, {self.q_en})') - - class ConsensusAligner: def __init__(self, projects: ProjectConfig, From 31a673d372e8218ca4f4f1d1fa16cb6dc6ed4e81 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 16:29:17 -0700 Subject: [PATCH 436/485] Turn Alignment into a dataclass --- micall/tests/test_consensus_aligner.py | 253 ++++++++++++++----------- micall/utils/alignment.py | 42 ++-- micall/utils/consensus_aligner.py | 15 +- 3 files changed, 164 insertions(+), 146 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index d8c9cef6b..778a23c4f 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -1,12 +1,13 @@ import math import typing +from typing import Iterable, Tuple import random from io import StringIO from pytest import approx from micall.core.aln2counts import SeedAmino, ReportAmino from micall.utils.consensus_aligner import ConsensusAligner, Alignment, AminoAlignment -from aligntools import CigarActions +from aligntools import CigarActions, Cigar from micall.core.project_config import ProjectConfig # noinspection PyUnresolvedReferences @@ -88,6 +89,36 @@ def create_reading_frames(consensus: str) -> typing.Dict[int, return reading_frames +def make_alignment( + ctg='', + ctg_len=0, + r_st=0, + r_en=0, + strand=1, + q_st=0, + q_en=0, + mapq=0, + cigar: Iterable[Tuple[int, CigarActions]] = tuple(), + cigar_str=None) -> Alignment: + + cigar = list(cigar) + if not cigar: + cigar = [(max(q_en-q_st, r_en-r_st), CigarActions.MATCH)] + if cigar_str is None: + cigar_str = str(Cigar(cigar)) + + return Alignment(ctg=ctg, + ctg_len=ctg_len, + r_st=r_st, + r_en=r_en, + strand=strand, + q_st=q_st, + q_en=q_en, + mapq=mapq, + cigar=cigar, + cigar_str=cigar_str) + + def test_create_reading_frames(): reading_frames = 
create_reading_frames('AAACCCTTTGGG') @@ -102,7 +133,7 @@ def test_create_reading_frames(): def test_alignment_repr(): - alignment = Alignment('R1', 0, 1001, 1100, 1, 1, 100) + alignment = make_alignment('R1', 0, 1001, 1100, 1, 1, 100) assert repr(alignment) == "Alignment('R1', 0, 1001, 1100, 1, 1, 100)" @@ -111,7 +142,7 @@ def test_start_contig(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[1000:2000] - expected_alignment = Alignment(ctg='N/A', + expected_alignment = make_alignment(ctg='N/A', ctg_len=len(seed_seq), r_st=1000, r_en=2000, @@ -131,21 +162,21 @@ def test_start_contig_multiple_sections(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[6000:6500] + seed_seq[3000:3500] + seed_seq[1000:2000] - expected_alignments = [Alignment(ctg='N/A', + expected_alignments = [make_alignment(ctg='N/A', ctg_len=len(seed_seq), r_st=6000, r_en=6500, q_st=0, q_en=500, mapq=60), - Alignment(ctg='N/A', + make_alignment(ctg='N/A', ctg_len=len(seed_seq), r_st=3000, r_en=3500, q_st=500, q_en=1000, mapq=60), - Alignment(ctg='N/A', + make_alignment(ctg='N/A', ctg_len=len(seed_seq), r_st=1000, r_en=2000, @@ -381,12 +412,12 @@ def test_start_contig_short_consensus(projects): start = 1560 end = 1617 consensus = seed_seq[start:end] - expected_alignment = Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=start, - r_en=end, - q_st=0, - q_en=end-start) + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=start, + r_en=end, + q_st=0, + q_en=end-start) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -399,16 +430,16 @@ def test_start_contig_deletion_minimap2(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + seed_seq[2031:2060] - expected_alignment = Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2060, - q_st=0, - q_en=59, - 
mapq=9, - cigar=[(30, CigarActions.MATCH), - (1, CigarActions.DELETE), - (29, CigarActions.MATCH)]) + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2060, + q_st=0, + q_en=59, + mapq=9, + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.DELETE), + (29, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -423,22 +454,22 @@ def test_start_contig_big_deletion_minimap2(projects): seed_seq = mutate_sequence(seq=seed_seq, rate=0.04) consensus = seed_seq[290:983] + seed_seq[3000:9269] - expected_alignment = [Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=290, - r_en=983, - q_st=0, - q_en=693, - mapq=60, - cigar=[(693, CigarActions.MATCH)]), - Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=3000, - r_en=9269, - q_st=693, - q_en=6962, - mapq=60, - cigar=[(6269, CigarActions.MATCH)])] + expected_alignment = [make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=290, + r_en=983, + q_st=0, + q_en=693, + mapq=60, + cigar=[(693, CigarActions.MATCH)]), + make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=3000, + r_en=9269, + q_st=693, + q_en=6962, + mapq=60, + cigar=[(6269, CigarActions.MATCH)])] aligner = ConsensusAligner(projects) @@ -452,16 +483,16 @@ def test_start_contig_deletion_gotoh(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + seed_seq[2031:2050] - expected_alignment = Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2050, - q_st=0, - q_en=49, - mapq=0, - cigar=[(30, CigarActions.MATCH), - (1, CigarActions.DELETE), - (19, CigarActions.MATCH)]) + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2050, + q_st=0, + q_en=49, + mapq=0, + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.DELETE), + (19, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -475,14 +506,14 @@ 
def test_start_contig_matched_deletion_gotoh(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + '-' + seed_seq[2031:2050] - expected_alignment = Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2050, - q_st=0, - q_en=50, - mapq=0, - cigar=[(50, CigarActions.MATCH)]) + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2050, + q_st=0, + q_en=50, + mapq=0, + cigar=[(50, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -496,16 +527,16 @@ def test_start_contig_insertion_minimap2(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + 'ACT' + seed_seq[2030:2060] - expected_alignment = Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2060, - q_st=0, - q_en=63, - mapq=9, - cigar=[(30, CigarActions.MATCH), - (3, CigarActions.INSERT), - (30, CigarActions.MATCH)]) + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2060, + q_st=0, + q_en=63, + mapq=9, + cigar=[(30, CigarActions.MATCH), + (3, CigarActions.INSERT), + (30, CigarActions.MATCH)]) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -518,16 +549,16 @@ def test_start_contig_insertion_gotoh(projects): seed_name = 'SARS-CoV-2-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[2000:2030] + 'T' + seed_seq[2030:2050] - expected_alignment = Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2050, - q_st=0, - q_en=51, - mapq=0, - cigar=[(30, CigarActions.MATCH), - (1, CigarActions.INSERT), - (20, CigarActions.MATCH)]) + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=2000, + r_en=2050, + q_st=0, + q_en=51, + mapq=0, + cigar=[(30, CigarActions.MATCH), + (1, CigarActions.INSERT), + (20, CigarActions.MATCH)]) aligner = 
ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -541,13 +572,13 @@ def test_start_contig_with_only_primary_matches(projects): seed_name = 'HIV1-B-FR-K03455-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[:500] - expected_alignment = Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=0, - r_en=500, - q_st=0, - q_en=500, - mapq=60) + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=0, + r_en=500, + q_st=0, + q_en=500, + mapq=60) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, consensus) @@ -562,13 +593,13 @@ def test_start_contig_reading_frames(projects): reading_frames = create_reading_frames(expected_consensus) seed_name = 'HCV-6t' seed_seq = projects.getReference(seed_name) - expected_alignment = Alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=4798, - r_en=4807, - q_st=0, - q_en=9, - mapq=0) + expected_alignment = make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=4798, + r_en=4807, + q_st=0, + q_en=9, + mapq=0) aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, reading_frames=reading_frames) @@ -850,7 +881,7 @@ def test_count_coord_concordance(): aligner = ConsensusAligner(projects) aligner.consensus = "AGATTTCGATGATTCAGAAGATAAGCA" aligner.coordinate_name = 'test-region' - aligner.alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] + aligner.alignments = [make_alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] expected_concordance_list = [1.0]*len(aligner.consensus) @@ -870,7 +901,7 @@ def test_count_coord_concordance_mismatch(): aligner.consensus = "AGATTTCGATGATTCAGAAGATTTGCA" # changed nucs: ^^ aligner.coordinate_name = 'test-region' - aligner.alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] + aligner.alignments = [make_alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] # At the end of the 
consensus, the size of the averaging window for the concordance decreases from 20 to 11. # The concordance therefore decreases from 18/20 to 9/11 @@ -893,7 +924,7 @@ def test_count_coord_concordance_short_match(): aligner.consensus = "AGATTTCGATGATTCTCTTCTAAACGT" # last match position: ^ aligner.coordinate_name = 'test-region' - aligner.alignments = [Alignment(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] + aligner.alignments = [make_alignment(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] # We start out with 100% match for the first 6 positions expected_concordance_list = [1.0] * 6 # After that, the averaging window (whose size is still increasing) starts to slide past the match: @@ -921,8 +952,8 @@ def test_count_coord_concordance_two_matches(): aligner = ConsensusAligner(projects) aligner.consensus = "AGATTTCGATGATTCAGAAGATTTGCATTT" aligner.coordinate_name = 'test-region' - aligner.alignments = [Alignment(r_st=0, r_en=12, q_st=0, q_en=12, cigar=[(12, CigarActions.MATCH)]), - Alignment(r_st=15, r_en=30, q_st=15, q_en=30, cigar=[(15, CigarActions.MATCH)])] + aligner.alignments = [make_alignment(r_st=0, r_en=12, q_st=0, q_en=12, cigar=[(12, CigarActions.MATCH)]), + make_alignment(r_st=15, r_en=30, q_st=15, q_en=30, cigar=[(15, CigarActions.MATCH)])] expected_concordance_list = [1.0] * 3 + [12/13, 12/14, 12/15, 13/16, 14/17, 15/18, 16/19] + [17/20]*11 + \ [16/19, 15/18, 15/17, 15/16] + [1.0]*5 @@ -943,9 +974,9 @@ def test_count_coord_concordance_with_insertion(): aligner.consensus = "AGATTTCGACCCTGATTCAGAAGATTTGCA" # insertion: ^^^ aligner.coordinate_name = 'test-region' - aligner.alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), - (3, CigarActions.INSERT), - (18, CigarActions.MATCH)])] + aligner.alignments = [make_alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.INSERT), + (18, CigarActions.MATCH)])] # the window size increases from 10 
to 20, while the averaging window slides over the insertion expected_concordance_list = [9/10, 9/11, 9/12, 10/13, 11/14, 12/15, 13/16, 14/17, 15/18, 16/19] # for 10 positions in the middle, the insertion is included in the full window size fo 20 @@ -969,9 +1000,9 @@ def test_count_coord_concordance_with_deletion(): aligner.consensus = "AGATTTCGATTCAGAAGATTTGCA" # deletion behind this pos: ^ aligner.coordinate_name = 'test-region' - aligner.alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), - (3, CigarActions.DELETE), - (15, CigarActions.MATCH)])] + aligner.alignments = [make_alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.DELETE), + (15, CigarActions.MATCH)])] # the deletion does not decrease the concordance expected_concordance_list = [1.0]*len(aligner.consensus) @@ -990,7 +1021,7 @@ def test_count_seed_region_concordance(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCA" region = 'test-region' - seed_alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] + seed_alignments = [make_alignment(r_st=0, r_en=27, q_st=0, q_en=27, cigar=[(27, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1013,7 +1044,7 @@ def test_count_seed_region_concordance_mismatch(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [Alignment(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] + seed_alignments = [make_alignment(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1035,7 +1066,7 @@ def test_count_seed_region_concordance_seed_not_aligned(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [Alignment(r_st=0, r_en=15, q_st=0, q_en=15, 
cigar=[(15, CigarActions.MATCH)])] + seed_alignments = [make_alignment(r_st=0, r_en=15, q_st=0, q_en=15, cigar=[(15, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1057,7 +1088,7 @@ def test_count_seed_region_concordance_larger_match(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [Alignment(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] + seed_alignments = [make_alignment(r_st=0, r_en=30, q_st=0, q_en=30, cigar=[(30, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1080,9 +1111,9 @@ def test_count_seed_region_concordance_insertion(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCA" region = 'test-region' - seed_alignments = [Alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), - (3, CigarActions.INSERT), - (18, CigarActions.MATCH)])] + seed_alignments = [make_alignment(r_st=0, r_en=27, q_st=0, q_en=30, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.INSERT), + (18, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered @@ -1105,9 +1136,9 @@ def test_count_seed_region_concordance_deletion(projects): seed_name = 'test-seed' seed_ref = "AGATTTCGATGATTCAGAAGATTTGCATGA" region = 'test-region' - seed_alignments = [Alignment(r_st=0, r_en=30, q_st=0, q_en=27, cigar=[(9, CigarActions.MATCH), - (3, CigarActions.DELETE), - (18, CigarActions.MATCH)])] + seed_alignments = [make_alignment(r_st=0, r_en=30, q_st=0, q_en=27, cigar=[(9, CigarActions.MATCH), + (3, CigarActions.DELETE), + (18, CigarActions.MATCH)])] expected_file = """\ seed_name,contig,region,pct_concordance,pct_covered diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index 7b6aa1833..8eb59fb4d 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -1,42 +1,26 @@ -from typing import Iterable, 
Tuple, List +from typing import Tuple, List, Sequence +from dataclasses import dataclass from aligntools import CigarActions, Cigar, CigarHit import mappy +@dataclass(frozen=True) class Alignment: """ Our representation of mappy's Alignment object. """ - def __init__(self, - ctg='', - ctg_len=0, - r_st=0, - r_en=0, - strand=1, - q_st=0, - q_en=0, - mapq=0, - cigar: Iterable[Tuple[int, CigarActions]] = tuple(), - cigar_str=None): - - cigar = list(cigar) - if not cigar: - cigar = [(max(q_en-q_st, r_en-r_st), CigarActions.MATCH)] - if cigar_str is None: - cigar_str = str(Cigar(cigar)) - - self.ctg = ctg - self.ctg_len = ctg_len - self.r_st = r_st - self.r_en = r_en - self.strand = strand - self.q_st = q_st - self.q_en = q_en - self.mapq = mapq - self.cigar = cigar - self.cigar_str = cigar_str + ctg: str + ctg_len: int + r_st: int + r_en: int + strand: int + q_st: int + q_en: int + mapq: int + cigar: Sequence[Tuple[int, CigarActions]] + cigar_str: str @staticmethod def coerce(obj: object) -> 'Alignment': diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 7e833c087..384b59171 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -5,7 +5,7 @@ import csv import os import logging -from aligntools import CigarActions +from aligntools import CigarActions, Cigar from gotoh import align_it, align_it_aa import mappy @@ -229,13 +229,16 @@ def align_gotoh(self, coordinate_seq: str, consensus: str): typed_cigar: List[Tuple[int, CigarActions]] = [(a, CigarActions(b)) for [a, b] in cigar] self.alignments.append(Alignment( - 'N/A', - len(coordinate_seq), - ref_start, - ref_index, + ctg='N/A', + ctg_len=len(coordinate_seq), + r_st=ref_start, + r_en=ref_index, + strand=1, q_st=0, q_en=consensus_index, - cigar=typed_cigar)) + cigar=typed_cigar, + mapq=0, + cigar_str=str(Cigar(typed_cigar)))) def find_amino_alignments(self, start_pos: int, From 34f10d721e8f0f7ceac41780aaa82e4ea912d755 Mon Sep 17 00:00:00 2001 From: 
Vitaliy Mysak Date: Tue, 22 Oct 2024 16:32:22 -0700 Subject: [PATCH 437/485] Remove redundant methods in Alignment class --- micall/tests/test_consensus_aligner.py | 2 +- micall/utils/alignment.py | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 778a23c4f..a53e9ad98 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -135,7 +135,7 @@ def test_create_reading_frames(): def test_alignment_repr(): alignment = make_alignment('R1', 0, 1001, 1100, 1, 1, 100) - assert repr(alignment) == "Alignment('R1', 0, 1001, 1100, 1, 1, 100)" + assert repr(alignment) == "Alignment(ctg='R1', ctg_len=0, r_st=1001, r_en=1100, strand=1, q_st=1, q_en=100, mapq=0, cigar=[(99, )], cigar_str='99M')" def test_start_contig(projects): diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index 8eb59fb4d..cc0892de5 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -59,14 +59,3 @@ def from_cigar_hit(hit: CigarHit, ctg='', ctg_len=0, strand=1, mapq=0) -> 'Align cigar=hit.cigar._data, cigar_str=str(hit.cigar), ) - - def __eq__(self, other: object): - # Filter out private attributes (those starting with an underscore) - self_public_attrs = {k: v for k, v in self.__dict__.items() if not k.startswith('_')} - other_public_attrs = {k: v for k, v in other.__dict__.items() if not k.startswith('_')} - return self_public_attrs == other_public_attrs - - def __repr__(self): - return (f'Alignment({self.ctg!r}, {self.ctg_len}, ' - f'{self.r_st}, {self.r_en}, {self.strand}, ' - f'{self.q_st}, {self.q_en})') From 2dac980ee7cfe1c41a7d97d3e8ec988160088649 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 16:33:44 -0700 Subject: [PATCH 438/485] Remove unjustified defaults in Alignment definition --- micall/utils/alignment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/micall/utils/alignment.py b/micall/utils/alignment.py index cc0892de5..3c477bc39 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -49,7 +49,7 @@ def to_cigar_hit(self) -> CigarHit: q_st=self.q_st, q_ei=self.q_en - 1) @staticmethod - def from_cigar_hit(hit: CigarHit, ctg='', ctg_len=0, strand=1, mapq=0) -> 'Alignment': + def from_cigar_hit(hit: CigarHit, ctg: str, ctg_len: int, strand: int, mapq: int) -> 'Alignment': return Alignment(ctg=ctg, ctg_len=ctg_len, r_st=hit.r_st, r_en=hit.r_ei + 1, From 44d9e50fb93278fd0cc04fdd81339e0c28e0aae5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 17:01:15 -0700 Subject: [PATCH 439/485] Simplify align_gotoh implementation --- micall/utils/alignment.py | 2 +- micall/utils/consensus_aligner.py | 47 ++++++++----------------------- 2 files changed, 13 insertions(+), 36 deletions(-) diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index 3c477bc39..dbdfbd052 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -56,6 +56,6 @@ def from_cigar_hit(hit: CigarHit, ctg: str, ctg_len: int, strand: int, mapq: int strand=strand, q_st=hit.q_st, q_en=hit.q_ei + 1, mapq=mapq, - cigar=hit.cigar._data, + cigar=list(hit.cigar._data), cigar_str=str(hit.cigar), ) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 384b59171..f8668b672 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -1,11 +1,11 @@ -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set from dataclasses import dataclass, replace from itertools import count from operator import attrgetter import csv import os import logging -from aligntools import CigarActions, Cigar +from aligntools import CigarActions, Cigar, CigarHit from gotoh import align_it, align_it_aa import mappy @@ -196,6 +196,8 @@ def align_gotoh(self, coordinate_seq: str, consensus: str): gap_open_penalty = 15 
gap_extend_penalty = 3 use_terminal_gap_penalty = 1 + assert '&' not in consensus, "Consensus contains forbidden character '&'" + consensus = ''.join('&' if x == '-' else x for x in consensus) aligned_coordinate, aligned_consensus, score = align_it( coordinate_seq, consensus, @@ -203,42 +205,17 @@ def align_gotoh(self, coordinate_seq: str, consensus: str): gap_extend_penalty, use_terminal_gap_penalty) if min(len(coordinate_seq), len(consensus)) < score: - ref_start = len(aligned_consensus) - len(aligned_consensus.lstrip('-')) - aligned_consensus: str = aligned_consensus[ref_start:] # type: ignore[no-redef] - aligned_coordinate: str = aligned_coordinate[ref_start:] # type: ignore[no-redef] - aligned_consensus = aligned_consensus.rstrip('-') - ref_index = ref_start - consensus_index = 0 - cigar: List[List[int]] = [] - for ref_nuc, nuc in zip(aligned_coordinate, aligned_consensus): - expected_nuc = consensus[consensus_index] - ref_index += 1 - consensus_index += 1 - expected_action = CigarActions.MATCH - if nuc == '-' and nuc != expected_nuc: - expected_action = CigarActions.DELETE - consensus_index -= 1 - if ref_nuc == '-': - expected_action = CigarActions.INSERT - ref_index -= 1 - if cigar and cigar[-1][1] == expected_action: - cigar[-1][0] += 1 - else: - cigar.append([1, expected_action]) - - typed_cigar: List[Tuple[int, CigarActions]] = [(a, CigarActions(b)) - for [a, b] in cigar] - self.alignments.append(Alignment( + cigar = Cigar.from_msa(aligned_coordinate, aligned_consensus) + hit = CigarHit(cigar, + q_st=0, q_ei=len(consensus)-1, + r_st=0, r_ei=len(coordinate_seq)-1) + hit = hit.lstrip_query().lstrip_reference().rstrip_query().rstrip_reference() + self.alignments.append(Alignment.from_cigar_hit( + hit, ctg='N/A', ctg_len=len(coordinate_seq), - r_st=ref_start, - r_en=ref_index, strand=1, - q_st=0, - q_en=consensus_index, - cigar=typed_cigar, - mapq=0, - cigar_str=str(Cigar(typed_cigar)))) + mapq=0)) def find_amino_alignments(self, start_pos: int, From 
67ee7c309f0d1b41e0d0c5ecf4b3e2f371894f64 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 22 Oct 2024 17:04:46 -0700 Subject: [PATCH 440/485] Make align_gotoh a static method --- micall/utils/consensus_aligner.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index f8668b672..7bef033fb 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -176,7 +176,9 @@ def start_contig(self, if alignment.is_primary] else: self.algorithm = 'gotoh' - self.align_gotoh(coordinate_seq, self.consensus) + gotoh_alignment = ConsensusAligner.align_gotoh(coordinate_seq, self.consensus) + if gotoh_alignment: + self.alignments = [gotoh_alignment] self.alignments.sort(key=attrgetter('q_st')) @@ -192,7 +194,8 @@ def start_contig(self, "cigar_str": alignment.cigar_str} self.overall_alignments_writer.writerow(row) - def align_gotoh(self, coordinate_seq: str, consensus: str): + @staticmethod + def align_gotoh(coordinate_seq: str, consensus: str) -> Optional[Alignment]: gap_open_penalty = 15 gap_extend_penalty = 3 use_terminal_gap_penalty = 1 @@ -204,18 +207,21 @@ def align_gotoh(self, coordinate_seq: str, consensus: str): gap_open_penalty, gap_extend_penalty, use_terminal_gap_penalty) + if min(len(coordinate_seq), len(consensus)) < score: cigar = Cigar.from_msa(aligned_coordinate, aligned_consensus) hit = CigarHit(cigar, q_st=0, q_ei=len(consensus)-1, r_st=0, r_ei=len(coordinate_seq)-1) hit = hit.lstrip_query().lstrip_reference().rstrip_query().rstrip_reference() - self.alignments.append(Alignment.from_cigar_hit( + return Alignment.from_cigar_hit( hit, ctg='N/A', ctg_len=len(coordinate_seq), strand=1, - mapq=0)) + mapq=0) + else: + return None def find_amino_alignments(self, start_pos: int, From dffb6cedd60f84dfa64c12e9e1d323f1e4e94cf0 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 23 Oct 2024 09:14:08 -0700 Subject: [PATCH 441/485] Move 
base alignment operations into alignment.py --- micall/utils/alignment.py | 54 ++++++++++++++++++++++++++++++- micall/utils/consensus_aligner.py | 50 +++------------------------- 2 files changed, 57 insertions(+), 47 deletions(-) diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index dbdfbd052..c2278971b 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -1,7 +1,10 @@ -from typing import Tuple, List, Sequence +from typing import Tuple, List, Sequence, Optional from dataclasses import dataclass +from operator import attrgetter from aligntools import CigarActions, Cigar, CigarHit +from gotoh import align_it +from mappy import Aligner import mappy @@ -59,3 +62,52 @@ def from_cigar_hit(hit: CigarHit, ctg: str, ctg_len: int, strand: int, mapq: int cigar=list(hit.cigar._data), cigar_str=str(hit.cigar), ) + + +def align_gotoh(coordinate_seq: str, consensus: str) -> Optional[Alignment]: + gap_open_penalty = 15 + gap_extend_penalty = 3 + use_terminal_gap_penalty = 1 + assert '&' not in consensus, "Consensus contains forbidden character '&'" + consensus = ''.join('&' if x == '-' else x for x in consensus) + aligned_coordinate, aligned_consensus, score = align_it( + coordinate_seq, + consensus, + gap_open_penalty, + gap_extend_penalty, + use_terminal_gap_penalty) + + if min(len(coordinate_seq), len(consensus)) < score: + cigar = Cigar.from_msa(aligned_coordinate, aligned_consensus) + hit = CigarHit(cigar, + q_st=0, q_ei=len(consensus)-1, + r_st=0, r_ei=len(coordinate_seq)-1) + hit = hit.lstrip_query().lstrip_reference().rstrip_query().rstrip_reference() + return Alignment.from_cigar_hit( + hit, + ctg='N/A', + ctg_len=len(coordinate_seq), + strand=1, + mapq=0) + else: + return None + + +def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment], str]: + aligner = Aligner(seq=coordinate_seq, preset='map-ont') + mappy_alignments: List[mappy.Alignment] = list(aligner.map(consensus)) + if mappy_alignments or 10_000 < 
len(consensus): + algorithm = 'minimap2' + alignments = [Alignment.coerce(alignment) + for alignment in mappy_alignments + if alignment.is_primary] + else: + algorithm = 'gotoh' + gotoh_alignment = align_gotoh(coordinate_seq, consensus) + if gotoh_alignment: + alignments = [gotoh_alignment] + else: + alignments = [] + + alignments.sort(key=attrgetter('q_st')) + return (alignments, algorithm) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 7bef033fb..b17d05f59 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -1,19 +1,18 @@ from typing import Dict, List, Optional, Set from dataclasses import dataclass, replace from itertools import count -from operator import attrgetter import csv import os import logging -from aligntools import CigarActions, Cigar, CigarHit +from aligntools import CigarActions -from gotoh import align_it, align_it_aa +from gotoh import align_it_aa import mappy from micall.core.project_config import ProjectConfig from micall.utils.report_amino import SeedAmino, ReportAmino, ReportNucleotide, SeedNucleotide from micall.utils.translation import translate -from micall.utils.alignment import Alignment +from micall.utils.alignment import Alignment, align_consensus logger = logging.getLogger(__name__) @@ -167,20 +166,8 @@ def start_contig(self, coordinate_seq = self.projects.getGenotypeReference(coordinate_name) except KeyError: coordinate_seq = self.projects.getReference(coordinate_name) - aligner = mappy.Aligner(seq=coordinate_seq, preset='map-ont') - mappy_alignments: List[mappy.Alignment] = list(aligner.map(self.consensus)) - if mappy_alignments or 10_000 < len(self.consensus): - self.algorithm = 'minimap2' - self.alignments = [Alignment.coerce(alignment) - for alignment in mappy_alignments - if alignment.is_primary] - else: - self.algorithm = 'gotoh' - gotoh_alignment = ConsensusAligner.align_gotoh(coordinate_seq, self.consensus) - if gotoh_alignment: - self.alignments = 
[gotoh_alignment] - self.alignments.sort(key=attrgetter('q_st')) + self.alignments, self.algorithm = align_consensus(coordinate_seq, self.consensus) if self.overall_alignments_writer is not None: for alignment in self.alignments: @@ -194,35 +181,6 @@ def start_contig(self, "cigar_str": alignment.cigar_str} self.overall_alignments_writer.writerow(row) - @staticmethod - def align_gotoh(coordinate_seq: str, consensus: str) -> Optional[Alignment]: - gap_open_penalty = 15 - gap_extend_penalty = 3 - use_terminal_gap_penalty = 1 - assert '&' not in consensus, "Consensus contains forbidden character '&'" - consensus = ''.join('&' if x == '-' else x for x in consensus) - aligned_coordinate, aligned_consensus, score = align_it( - coordinate_seq, - consensus, - gap_open_penalty, - gap_extend_penalty, - use_terminal_gap_penalty) - - if min(len(coordinate_seq), len(consensus)) < score: - cigar = Cigar.from_msa(aligned_coordinate, aligned_consensus) - hit = CigarHit(cigar, - q_st=0, q_ei=len(consensus)-1, - r_st=0, r_ei=len(coordinate_seq)-1) - hit = hit.lstrip_query().lstrip_reference().rstrip_query().rstrip_reference() - return Alignment.from_cigar_hit( - hit, - ctg='N/A', - ctg_len=len(coordinate_seq), - strand=1, - mapq=0) - else: - return None - def find_amino_alignments(self, start_pos: int, end_pos: int, From 518327d5927110621fa9d08c3409763ce0bd015f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 23 Oct 2024 09:33:06 -0700 Subject: [PATCH 442/485] Use Alignment class in contig stitcher --- micall/core/contig_stitcher.py | 5 +-- micall/tests/test_contig_stitcher.py | 60 ++++++++++++++-------------- micall/tests/utils.py | 35 +++++++++------- 3 files changed, 53 insertions(+), 47 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 7cf9c3418..238b0193c 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -4,7 +4,6 @@ import os from dataclasses import replace from math import ceil -from mappy 
import Aligner from functools import reduce from itertools import tee, islice, chain from gotoh import align_it @@ -19,6 +18,7 @@ from micall.core.plot_contigs import plot_stitcher_coverage from micall.utils.contig_stitcher_context import context, StitcherContext from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig +from micall.utils.alignment import align_consensus import micall.utils.contig_stitcher_events as events @@ -169,8 +169,7 @@ def init_hit(x) -> Tuple[CigarHit, Literal["forward", "reverse"]]: min(x.q_st, x.q_en - 1), max(x.q_st, x.q_en - 1)) return cigar, "forward" if x.strand == 1 else "reverse" - aligner = Aligner(seq=contig.ref_seq, preset='map-ont') - alignments = list(aligner.map(contig.seq)) + alignments, _algo = align_consensus(contig.ref_seq, contig.seq) hits_array = [init_hit(x) for x in alignments] for i, (hit, strand) in enumerate(hits_array): diff --git a/micall/tests/test_contig_stitcher.py b/micall/tests/test_contig_stitcher.py index e1fad2487..a3f8c5393 100644 --- a/micall/tests/test_contig_stitcher.py +++ b/micall/tests/test_contig_stitcher.py @@ -3,7 +3,7 @@ import logging import os import pytest -from typing import Iterable, List, Tuple +from typing import Tuple, List from aligntools import CigarActions, CigarHit, Cigar @@ -22,7 +22,7 @@ rstrip, ) from micall.core.plot_contigs import plot_stitcher_coverage -from micall.tests.utils import MockAligner, fixed_random_seed +from micall.tests.utils import mock_align_consensus, MockAlignment, fixed_random_seed from micall.tests.test_fasta_to_csv import check_hcv_db, DEFAULT_DATABASE # activates the fixture from micall.tests.test_remap import load_projects # activates the "projects" fixture @@ -39,7 +39,7 @@ @pytest.fixture() def exact_aligner(monkeypatch): - monkeypatch.setattr("micall.core.contig_stitcher.Aligner", MockAligner) + monkeypatch.setattr("micall.core.contig_stitcher.align_consensus", mock_align_consensus) @pytest.fixture @@ -1376,26 +1376,26 @@ def 
test_forward_and_reverse_match(projects, visualizer): def test_overlaping_in_reference_space(projects, visualizer, monkeypatch): # Scenario: Single contig is aligned in two parts that overlap in reference space. - @dataclass - class MockMappyHit: - strand: int - q_st: int - q_en: int - r_st: int - r_en: int - cigar: List[Tuple[int, CigarActions]] - - @dataclass - class MockAligner: - seq: str - preset: str - - def map(self, seq: str) -> Iterable[MockMappyHit]: - return [MockMappyHit(1, 100, 300, 200, 400, [(200, CigarActions.MATCH)]), - MockMappyHit(1, 300, 500, 300, 500, [(200, CigarActions.MATCH)]), - ] - - monkeypatch.setattr("micall.core.contig_stitcher.Aligner", MockAligner) + def mock_align(reference_seq: str, consensus: str) -> Tuple[List[MockAlignment], str]: + alignments = [MockAlignment(ctg="N/A", ctg_len=0, + strand=1, mapq=60, + is_primary=True, + q_st=100, q_en=300, + r_st=200, r_en=400, + cigar=[(200, CigarActions.MATCH)], + cigar_str="200M"), + MockAlignment(ctg="N/A", ctg_len=0, + strand=1, mapq=60, + is_primary=True, + q_st=300, q_en=500, + r_st=300, r_en=500, + cigar=[(200, CigarActions.MATCH)], + cigar_str="200M"), + ] + algorithm = 'mock' + return (alignments, algorithm) + + monkeypatch.setattr("micall.core.contig_stitcher.align_consensus", mock_align) ref = 'A' * 700 seq = 'C' * 600 @@ -1626,21 +1626,21 @@ def test_merge_intervals(intervals, expected): assert merge_intervals(intervals) == expected +@dataclass +class TestMockAlignment: + r_st: int + r_ei: int + + class MockAlignedContig: def __init__(self, ref_name, group_ref, r_st, r_ei, name="contig"): self.ref_name = ref_name self.group_ref = group_ref - self.alignment = MockAlignment(r_st, r_ei) + self.alignment = TestMockAlignment(r_st, r_ei) self.name = name self.id = id(self) -@dataclass -class MockAlignment: - r_st: int - r_ei: int - - # Simple function to create mock AlignedContig objects for testing, including ref_name. 
def create_mock_aligned_contig(ref_name, r_st, r_ei, name="contig"): return MockAlignedContig(ref_name, ref_name, r_st, r_ei, name) diff --git a/micall/tests/utils.py b/micall/tests/utils.py index 39eda8940..9b76cf0a2 100644 --- a/micall/tests/utils.py +++ b/micall/tests/utils.py @@ -1,7 +1,10 @@ -from dataclasses import dataclass from contextlib import contextmanager +from dataclasses import dataclass import random from aligntools import CigarActions +from typing import Tuple, List, Iterator + +from micall.utils.alignment import Alignment def find_all_occurrences(s, substring): @@ -14,16 +17,9 @@ def find_all_occurrences(s, substring): start += len(substring) -@dataclass -class MockAlignment: - strand: int # +1 if on the forward strand; -1 if on the reverse strand - mapq: int - cigar: list - cigar_str: str - q_st: int - q_en: int - r_st: int - r_en: int +@dataclass(frozen=True) +class MockAlignment(Alignment): + is_primary: bool class MockAligner: @@ -37,7 +33,7 @@ def __init__(self, seq, *args, **kwargs): self.max_matches = 5 self.min_length = 3 - def map(self, seq): + def map(self, seq) -> Iterator[Alignment]: max_matches = self.max_matches returned = set() for length in range(len(seq), self.min_length - 1, -1): @@ -50,17 +46,28 @@ def map(self, seq): r_en = r_st + len(substring) q_st = start q_en = end - cigar = [[q_en - q_st, CigarActions.MATCH]] + cigar = [(q_en - q_st, CigarActions.MATCH)] cigar_str = f'{(q_en - q_st)}M' if (q_st, q_en, r_st, r_en) not in returned: returned.add((q_st, q_en, r_st, r_en)) - yield MockAlignment(strand, mapq, cigar, cigar_str, q_st, q_en, r_st, r_en) + yield MockAlignment(ctg='N/A', ctg_len=0, + strand=strand, mapq=mapq, + cigar=cigar, cigar_str=cigar_str, + q_st=q_st, q_en=q_en, + r_st=r_st, r_en=r_en, + is_primary=True) max_matches -= 1 if max_matches < 1: return +def mock_align_consensus(reference_seq: str, consensus: str) -> Tuple[List[Alignment], str]: + alignments = list(MockAligner(reference_seq).map(consensus)) + 
algorithm = 'mock' + return (alignments, algorithm) + + @contextmanager def fixed_random_seed(seed): original_state = random.getstate() From 8b5d44be6ce13636f3936a10b2a7ea5d3db81c86 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 23 Oct 2024 09:53:13 -0700 Subject: [PATCH 443/485] Simplify Alignment handling in contig stitcher --- micall/core/contig_stitcher.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 238b0193c..31094ee76 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -12,13 +12,13 @@ import logging from fractions import Fraction from operator import itemgetter -from aligntools import Cigar, connect_cigar_hits, CigarHit +from aligntools import connect_cigar_hits, CigarHit from micall.core.project_config import ProjectConfig from micall.core.plot_contigs import plot_stitcher_coverage from micall.utils.contig_stitcher_context import context, StitcherContext from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig -from micall.utils.alignment import align_consensus +from micall.utils.alignment import Alignment, align_consensus import micall.utils.contig_stitcher_events as events @@ -163,10 +163,8 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: yield contig return - def init_hit(x) -> Tuple[CigarHit, Literal["forward", "reverse"]]: - cigar = CigarHit(Cigar(x.cigar), - min(x.r_st, x.r_en - 1), max(x.r_st, x.r_en - 1), - min(x.q_st, x.q_en - 1), max(x.q_st, x.q_en - 1)) + def init_hit(x: Alignment) -> Tuple[CigarHit, Literal["forward", "reverse"]]: + cigar = x.to_cigar_hit() return cigar, "forward" if x.strand == 1 else "reverse" alignments, _algo = align_consensus(contig.ref_seq, contig.seq) From 1da4480125cda0dc8313484b51be86e0867db6a6 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 23 Oct 2024 16:02:31 -0700 Subject: [PATCH 444/485] Connect non-overlapping
alignments in consensus aligner --- micall/tests/test_aln2counts_report.py | 68 +++++++++++--------------- micall/tests/test_consensus_aligner.py | 32 +++--------- micall/utils/alignment.py | 34 ++++++++++++- 3 files changed, 68 insertions(+), 66 deletions(-) diff --git a/micall/tests/test_aln2counts_report.py b/micall/tests/test_aln2counts_report.py index e1c5c5a69..4260a38ef 100644 --- a/micall/tests/test_aln2counts_report.py +++ b/micall/tests/test_aln2counts_report.py @@ -2117,26 +2117,17 @@ def test_minimap_overlap(default_sequence_report, projects): # A,C,G,T expected_text = """\ -HIV1-B-FR-K03455-seed,INT,15,51,262,4491,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,52,263,4492,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,53,264,4493,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,54,265,4494,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,55,266,4495,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,56,267,4496,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,57,268,4497,0,9,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,58,269,4498,0,9,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,59,270,4499,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,INT,15,60,271,4500,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,61,452,3001,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,62,453,3002,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,63,454,3003,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,64,455,3004,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,65,456,3005,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,66,457,3006,0,0,0,9,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,67,458,3007,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,68,459,3008,0,0,9,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,69,460,3009,9,0,0,0,0,0,0,0,0,9 -HIV1-B-FR-K03455-seed,RT,15,70,461,3010,9,0,0,0,0,0,0,0,0,9""" +seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,genome.pos,A,C,G,T,N,del,ins,clip,v3_overlap,coverage +HIV1-B-FR-K03455-seed,INT,15,1,212,4441,9,0,0,0,0,0,0,0,0,9 
+HIV1-B-FR-K03455-seed,INT,15,2,213,4442,9,0,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,3,214,4443,0,0,9,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,4,215,4444,0,0,0,9,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,5,216,4445,0,0,0,9,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,6,217,4446,9,0,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,7,218,4447,0,0,0,9,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,8,219,4448,0,9,0,0,0,0,0,0,0,9 +HIV1-B-FR-K03455-seed,INT,15,9,220,4449,0,9,0,0,0,0,0,0,0,9""" + report_file = StringIO() default_sequence_report.write_nuc_header(report_file) default_sequence_report.read(aligned_reads) @@ -2144,11 +2135,11 @@ def test_minimap_overlap(default_sequence_report, projects): report = report_file.getvalue() report_lines = report.splitlines() - expected_size = 121 + expected_size = 61 if len(report_lines) != expected_size: assert (len(report_lines), report) == (expected_size, '') - key_lines = report_lines[51:71] + key_lines = report_lines[0:10] key_report = '\n'.join(key_lines) assert key_report == expected_text @@ -2156,24 +2147,26 @@ def test_minimap_overlap(default_sequence_report, projects): def test_minimap_overlap_at_start(default_sequence_report, projects): """ Actual overlaps cause blank query position. Check consensus offset. - In this example, the start of PR appears twice, so the consensus index - gets blanked. Make sure that the PR consensus has the correct offset and - doesn't crash. + In this example, the start of PR (first 6 nuleotides) appears twice, + so the shorter alignment is discarded. Make sure that the PR consensus + has the correct offset and doesn't crash. 
""" seed_name = 'HIV1-B-FR-K03455-seed' seed_seq = projects.getReference(seed_name) - read_seq = seed_seq[2252:2400] + seed_seq[2000:2258] + first_part = seed_seq[2252:2400] + second_part = seed_seq[2000:2258] + _read_seq = first_part + second_part # refname,qcut,rank,count,offset,seq aligned_reads = prepare_reads(f"""\ -HIV1-B-FR-K03455-seed,15,0,9,0,{read_seq} +HIV1-B-FR-K03455-seed,15,0,9,0,{second_part} """) expected_text = f"""\ seed,region,q-cutoff,consensus-percent-cutoff,seed-offset,region-offset,sequence -HIV1-B-FR-K03455-seed,,15,MAX,0,,{read_seq} -HIV1-B-FR-K03455-seed,HIV1B-gag,15,MAX,0,1463,{read_seq} -HIV1-B-FR-K03455-seed,PR,15,MAX,0,0,{read_seq} +HIV1-B-FR-K03455-seed,,15,MAX,0,,{second_part} +HIV1-B-FR-K03455-seed,HIV1B-gag,15,MAX,0,1211,{second_part} +HIV1-B-FR-K03455-seed,PR,15,MAX,252,0,{second_part[-6:]} """ report_file = StringIO() default_sequence_report.write_consensus_all_header(report_file) @@ -2290,7 +2283,7 @@ def test_contig_coverage_report_huge_gap(default_sequence_report): """ A gap so big that Gotoh can't bridge it, but minimap2 can. """ ref = default_sequence_report.projects.getReference('HIV1-B-FR-K03455-seed') seq = ref[100:150] + ref[1000:1050] - expected_positions = list(range(101, 151)) + list(range(1001, 1051)) + expected_positions = list(range(101, 1051)) remap_conseq_csv = StringIO(f"""\ region,sequence HIV1-B-FR-K03455-seed,{seq} @@ -2555,7 +2548,7 @@ def test_write_sequence_coverage_counts_without_coverage(projects, hxb2_name = 'HIV1-B-FR-K03455-seed' ref = projects.getReference(hxb2_name) seq = ref[100:150] + ref[1000:1050] - expected_positions = list(range(101, 151)) + list(range(1001, 1051)) + expected_positions = list(range(101, 1051)) report_file = StringIO() sequence_report.projects = projects @@ -2789,15 +2782,13 @@ def test_write_sequence_coverage_counts_with_unaligned_middle(projects, sequence_report): """ The middle 100 bases are from a different reference. 
- They get reported with query positions, but no reference positions. + They get reported with query positions, but with deletions at reference positions. """ hxb2_name = 'HIV1-B-FR-K03455-seed' ref = projects.getReference(hxb2_name) hcv_ref = projects.getReference('HCV-1a') seq = ref[:100] + hcv_ref[1000:1100] + ref[1000:1100] - expected_ref_positions = (list(range(1, 101)) + - list(range(501, 601)) + - list(range(1001, 1101))) + expected_ref_positions = list(range(1, 1101)) expected_query_positions = list(range(1, 301)) report_file = StringIO() @@ -2832,7 +2823,7 @@ def test_write_sequence_coverage_counts_with_double_mapped_edges( hxb2_name = 'HIV1-B-FR-K03455-seed' ref = projects.getReference(hxb2_name) seq = ref[2858:2908] + ref[8187:8237] - expected_ref_positions = (list(range(2859, 2916)) + list(range(8196, 8238))) + expected_ref_positions = list(range(2859, 2908 + (8237-8187))) expected_query_positions = list(range(1, len(seq)+1)) report_file = StringIO() @@ -2861,8 +2852,7 @@ def test_write_sequence_coverage_minimap_hits(projects, sequence_report): seq = ref[1000:1100] + ref[2000:2100] expected_minimap_hits = """\ contig,ref_name,start,end,ref_start,ref_end -1-my-contig,HIV1-B-FR-K03455-seed,1,100,1001,1100 -1-my-contig,HIV1-B-FR-K03455-seed,101,200,2001,2100 +1-my-contig,HIV1-B-FR-K03455-seed,1,200,1001,2100 """ report_file = StringIO() sequence_report.projects = projects diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index a53e9ad98..bad8a48dc 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -195,26 +195,19 @@ def test_start_contig_overlapping_sections(projects): In this example, positions 1-60 of the read map to pos 4441-4500 of the reference. Positions 55-120 of the read map to pos 2995-3060 of the ref. - Since positions 55-60 are in both alignments, remove them from the second - one. + This way, positions 55-60 are in both alignments. 
We expect our aligner + to drop the alignment with lesser quality + (quality is calculated from `mapq`, `query_length`, and others) """ seed_name = 'HIV1-B-FR-K03455-seed' seed_seq = projects.getReference(seed_name) consensus = seed_seq[4440:4500] + seed_seq[3000:3060] reading_frames = create_reading_frames(consensus) int_ref = projects.getReference('INT') - rt_ref = projects.getReference('RT') aligner = ConsensusAligner(projects) aligner.start_contig(seed_name, reading_frames=reading_frames) - rt_aminos: typing.List[ReportAmino] = [] - rt_nucleotides: typing.List[ReportNucleotide] = [] - aligner.report_region(2550, - 4229, - rt_nucleotides, - rt_aminos, - amino_ref=rt_ref) int_aminos: typing.List[ReportAmino] = [] int_nucleotides: typing.List[ReportNucleotide] = [] aligner.report_region(4230, @@ -228,11 +221,6 @@ def test_start_contig_overlapping_sections(projects): list(range(3001, 3061)), 4230, 5096) - assert_consensus_nuc_indexes(rt_aminos, - list(range(4441, 4501)) + - list(range(3001, 3061)), - 2550, - 4229) # noinspection DuplicatedCode @@ -457,19 +445,13 @@ def test_start_contig_big_deletion_minimap2(projects): expected_alignment = [make_alignment(ctg='N/A', ctg_len=len(seed_seq), r_st=290, - r_en=983, - q_st=0, - q_en=693, - mapq=60, - cigar=[(693, CigarActions.MATCH)]), - make_alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=3000, r_en=9269, - q_st=693, + q_st=0, q_en=6962, mapq=60, - cigar=[(6269, CigarActions.MATCH)])] + cigar=[(693, CigarActions.MATCH), + (2017, CigarActions.DELETE), + (6269, CigarActions.MATCH)])] aligner = ConsensusAligner(projects) diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index c2278971b..6f766ab60 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -1,8 +1,9 @@ -from typing import Tuple, List, Sequence, Optional +from typing import Tuple, List, Sequence, Optional, Iterable, Iterator from dataclasses import dataclass from operator import attrgetter +from itertools import groupby 
-from aligntools import CigarActions, Cigar, CigarHit +from aligntools import CigarActions, Cigar, CigarHit, connect_cigar_hits from gotoh import align_it from mappy import Aligner import mappy @@ -93,6 +94,26 @@ def align_gotoh(coordinate_seq: str, consensus: str) -> Optional[Alignment]: return None +def alignment_quality(alignment: Alignment) -> Tuple[int, ...]: + cigar = Cigar(alignment.cigar) + mlen = sum(1 for action in cigar.iterate_operations() + if action == CigarActions.MATCH) + return (alignment.mapq * cigar.query_length, mlen, cigar.query_length) + + +def connect_alignments(alignments: Iterable[Alignment]) -> Iterator[Alignment]: + stranded = groupby(alignments, key=lambda x: (x.strand, x.ctg, x.ctg_len)) + for (strand, ctg, ctg_len), group_iter in stranded: + group = list(group_iter) + hits = list(map(Alignment.to_cigar_hit, group)) + connected_hits = connect_cigar_hits(hits) + mapq = min(x.mapq for x in group) + for hit in connected_hits: + yield Alignment.from_cigar_hit(hit, + ctg=ctg, ctg_len=ctg_len, + strand=strand, mapq=mapq) + + def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment], str]: aligner = Aligner(seq=coordinate_seq, preset='map-ont') mappy_alignments: List[mappy.Alignment] = list(aligner.map(consensus)) @@ -101,6 +122,15 @@ def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment alignments = [Alignment.coerce(alignment) for alignment in mappy_alignments if alignment.is_primary] + + # Following code will connect non-overlapping alignments + # that mappy outputs sometimes. + # It will also drop overlapping (in query coords) alignments. + # We are sorting the alignments before connect in order + # to drop the lowest quality contigs in case they overlap with + # higher quality alignments. 
+ alignments.sort(key=alignment_quality) + alignments = list(connect_alignments(reversed(alignments))) else: algorithm = 'gotoh' gotoh_alignment = align_gotoh(coordinate_seq, consensus) From 56e7afcfdfd11c843835c94502b8a4d1230bc1c8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 24 Oct 2024 11:12:31 -0700 Subject: [PATCH 445/485] Split alignments with big deletions in align_consensus This change tries to revert the behaviour of the aligner to one that is closer to that of the original mappy-based one. The original mappy aligner was reporting multiple alignments if they were far enough apart from each other. I am not sure how much of that original behaviour is intentional, or was it just a limitation of mappy. In case we don't want to split the alignments like that, we simply need to revert this commit. --- micall/tests/test_aln2counts_report.py | 12 +++++--- micall/tests/test_consensus_aligner.py | 36 ++++++---------------- micall/utils/alignment.py | 42 ++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/micall/tests/test_aln2counts_report.py b/micall/tests/test_aln2counts_report.py index 4260a38ef..a99604c85 100644 --- a/micall/tests/test_aln2counts_report.py +++ b/micall/tests/test_aln2counts_report.py @@ -2283,7 +2283,7 @@ def test_contig_coverage_report_huge_gap(default_sequence_report): """ A gap so big that Gotoh can't bridge it, but minimap2 can. 
""" ref = default_sequence_report.projects.getReference('HIV1-B-FR-K03455-seed') seq = ref[100:150] + ref[1000:1050] - expected_positions = list(range(101, 1051)) + expected_positions = list(range(101, 151)) + list(range(1001, 1051)) remap_conseq_csv = StringIO(f"""\ region,sequence HIV1-B-FR-K03455-seed,{seq} @@ -2548,7 +2548,7 @@ def test_write_sequence_coverage_counts_without_coverage(projects, hxb2_name = 'HIV1-B-FR-K03455-seed' ref = projects.getReference(hxb2_name) seq = ref[100:150] + ref[1000:1050] - expected_positions = list(range(101, 1051)) + expected_positions = list(range(101, 151)) + list(range(1001, 1051)) report_file = StringIO() sequence_report.projects = projects @@ -2782,13 +2782,14 @@ def test_write_sequence_coverage_counts_with_unaligned_middle(projects, sequence_report): """ The middle 100 bases are from a different reference. - They get reported with query positions, but with deletions at reference positions. + They get reported with query positions, but no reference positions. 
""" hxb2_name = 'HIV1-B-FR-K03455-seed' ref = projects.getReference(hxb2_name) hcv_ref = projects.getReference('HCV-1a') seq = ref[:100] + hcv_ref[1000:1100] + ref[1000:1100] - expected_ref_positions = list(range(1, 1101)) + expected_ref_positions = (list(range(1, 101)) + + list(range(1001, 1101))) expected_query_positions = list(range(1, 301)) report_file = StringIO() @@ -2852,7 +2853,8 @@ def test_write_sequence_coverage_minimap_hits(projects, sequence_report): seq = ref[1000:1100] + ref[2000:2100] expected_minimap_hits = """\ contig,ref_name,start,end,ref_start,ref_end -1-my-contig,HIV1-B-FR-K03455-seed,1,200,1001,2100 +1-my-contig,HIV1-B-FR-K03455-seed,1,100,1001,1100 +1-my-contig,HIV1-B-FR-K03455-seed,101,200,2001,2100 """ report_file = StringIO() sequence_report.projects = projects diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index bad8a48dc..129d5bee6 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -414,28 +414,6 @@ def test_start_contig_short_consensus(projects): assert aligner.algorithm == 'gotoh' -def test_start_contig_deletion_minimap2(projects): - seed_name = 'SARS-CoV-2-seed' - seed_seq = projects.getReference(seed_name) - consensus = seed_seq[2000:2030] + seed_seq[2031:2060] - expected_alignment = make_alignment(ctg='N/A', - ctg_len=len(seed_seq), - r_st=2000, - r_en=2060, - q_st=0, - q_en=59, - mapq=9, - cigar=[(30, CigarActions.MATCH), - (1, CigarActions.DELETE), - (29, CigarActions.MATCH)]) - aligner = ConsensusAligner(projects) - - aligner.start_contig(seed_name, consensus) - - assert_alignments(aligner, expected_alignment) - assert aligner.algorithm == 'minimap2' - - def test_start_contig_big_deletion_minimap2(projects): seed_name = 'HCV-1a' seed_seq = projects.getReference(seed_name) @@ -445,13 +423,19 @@ def test_start_contig_big_deletion_minimap2(projects): expected_alignment = [make_alignment(ctg='N/A', ctg_len=len(seed_seq), r_st=290, - 
r_en=9269, + r_en=983, q_st=0, + q_en=693, + mapq=60, + cigar=[(693, CigarActions.MATCH)]), + make_alignment(ctg='N/A', + ctg_len=len(seed_seq), + r_st=3000, + r_en=9269, + q_st=693, q_en=6962, mapq=60, - cigar=[(693, CigarActions.MATCH), - (2017, CigarActions.DELETE), - (6269, CigarActions.MATCH)])] + cigar=[(6269, CigarActions.MATCH)])] aligner = ConsensusAligner(projects) diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index 6f766ab60..cccf10c31 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -9,6 +9,14 @@ import mappy +# +# Alignments with deletions larger than MAX_GAP_SIZE +# will be split around those deletions into multiple +# separate alignments. +# +MAX_GAP_SIZE = 600 # TODO: make this smaller? + + @dataclass(frozen=True) class Alignment: """ @@ -114,6 +122,39 @@ def connect_alignments(alignments: Iterable[Alignment]) -> Iterator[Alignment]: strand=strand, mapq=mapq) +def collect_big_gaps_cut_points(alignment: Alignment) -> Iterator[float]: + hit = alignment.to_cigar_hit() + for deletion in hit.deletions(): + if deletion.ref_length > MAX_GAP_SIZE: + midpoint = deletion.r_st + deletion.ref_length / 2 + yield int(midpoint) + hit.epsilon + + +def cut_hit_into_multiple_parts(hit: CigarHit, cut_points: Iterable[float]) -> Iterator[CigarHit]: + for cut_point in cut_points: + left, right = hit.cut_reference(cut_point) + left = left.rstrip_reference() + right = right.lstrip_reference() + yield left + hit = right + yield hit + + +def split_around_big_gaps(alignments: Iterable[Alignment]) -> Iterator[Alignment]: + for alignment in alignments: + cut_points = list(collect_big_gaps_cut_points(alignment)) + if cut_points: + hit = alignment.to_cigar_hit() + for part in cut_hit_into_multiple_parts(hit, cut_points): + yield Alignment.from_cigar_hit(part, + ctg=alignment.ctg, + ctg_len=alignment.ctg_len, + strand=alignment.strand, + mapq=alignment.mapq) + else: + yield alignment + + def align_consensus(coordinate_seq: str, 
consensus: str) -> Tuple[List[Alignment], str]: aligner = Aligner(seq=coordinate_seq, preset='map-ont') mappy_alignments: List[mappy.Alignment] = list(aligner.map(consensus)) @@ -139,5 +180,6 @@ def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment else: alignments = [] + alignments = list(split_around_big_gaps(alignments)) alignments.sort(key=attrgetter('q_st')) return (alignments, algorithm) From 089ef6deb272d502843d193cf9d46861d33e8850 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 24 Oct 2024 11:44:04 -0700 Subject: [PATCH 446/485] Update minimap version to the most recent one Since minimap's base algorithm changed between versions, this update requires setting the bw and bw_long options to retain the original behaviour. See lh3/minimap2#1124 for more details. Also, the quality calculation was adjusted in minimap. This is not a significant change for us because that quality metric is not actually used anywhere. --- micall/tests/test_consensus_aligner.py | 2 +- micall/utils/alignment.py | 2 +- micall/utils/consensus_aligner.py | 2 +- micall/utils/ref_aligner.py | 2 +- pyproject.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index 129d5bee6..719cdc36e 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -499,7 +499,7 @@ def test_start_contig_insertion_minimap2(projects): r_en=2060, q_st=0, q_en=63, - mapq=9, + mapq=8, cigar=[(30, CigarActions.MATCH), (3, CigarActions.INSERT), (30, CigarActions.MATCH)]) diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index cccf10c31..13ee94d3b 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -156,7 +156,7 @@ def split_around_big_gaps(alignments: Iterable[Alignment]) -> Iterator[Alignment def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment], str]: - aligner =
Aligner(seq=coordinate_seq, preset='map-ont') + aligner = Aligner(seq=coordinate_seq, bw=500, bw_long=500, preset='map-ont') mappy_alignments: List[mappy.Alignment] = list(aligner.map(consensus)) if mappy_alignments or 10_000 < len(consensus): algorithm = 'minimap2' diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index b17d05f59..60915a081 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -742,7 +742,7 @@ def seed_concordance(self, seed_name, projects, seed_coordinates, excluded_regio if self.seed_concordance_writer is None: return seed_ref = self.projects.getReference(seed_name) - seed_aligner = mappy.Aligner(seq=seed_ref, preset='map-ont') + seed_aligner = mappy.Aligner(seq=seed_ref, bw=500, bw_long=500, preset='map-ont') seed_alignments: List[mappy.Alignment] = list(seed_aligner.map(self.consensus)) regions = projects.getCoordinateReferences(seed_name) diff --git a/micall/utils/ref_aligner.py b/micall/utils/ref_aligner.py index 41b99e6d0..67601cd91 100644 --- a/micall/utils/ref_aligner.py +++ b/micall/utils/ref_aligner.py @@ -44,7 +44,7 @@ def __init__(self, group_seeds.append(seed) self.seeds = group_seeds self.aligner_preset = aligner_preset - self.aligner = Aligner(seq=self.main_reference, preset=aligner_preset) + self.aligner = Aligner(seq=self.main_reference, bw=500, bw_long=500, preset=aligner_preset) self.num_warning_seeds = 0 self.verbose = verbose self.warning_seeds = {'insertions': [], 'deletions': [], 'frameshift': [], 'coverage': [], 'alignment': []} diff --git a/pyproject.toml b/pyproject.toml index 6e3db0585..ad4e963aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "reportlab==4.2.0", "pysam==0.22.1", "genetracks @ git+https://github.com/cfe-lab/genetracks.git@v0.4", - "mappy==2.17", + "mappy==2.28", "drawsvg==2.3.0", "cairosvg==2.7.1", "aligntools", From de2b85a5c7a875f12f15988ab675a6275aa6eba3 Mon Sep 17 00:00:00 2001 From: Vitaliy 
Mysak Date: Thu, 24 Oct 2024 14:47:41 -0700 Subject: [PATCH 447/485] Do not use deprecated universal lines In python 3.11, "universal lines" feature [has been deprecated][update]. This feature was implemented as "U" flag that could be passed to: 1) `open(filename, "rU")` type of constructors, 2) `argparse.FileType("rU")` type of constructors. The flag is no longer supported. As mentioned in [the update][update], "universal lines" became the default behaviour. So the flag can be ommited. This commit removes all the usages of the flag. [update]: https://docs.python.org/3/whatsnew/3.11.html#porting-to-python-3-11 --- micall/core/coverage_plots.py | 2 +- micall/core/filter_quality.py | 2 +- micall/core/remap.py | 6 +++--- micall/core/sam2aln.py | 2 +- micall/g2p/fastq_g2p.py | 4 ++-- micall/monitor/update_qai.py | 2 +- micall/utils/find_chimera.py | 2 +- micall/utils/make_fda_refs.py | 6 +++--- micall/utils/make_hcv_genotype_refs.py | 2 +- micall/utils/project_seeds_from_compendium.py | 2 +- micall/utils/reference_distances.py | 2 +- micall/utils/remap_fastq_simplify.py | 12 ++++++------ micall/utils/sam_g2p_simplify.py | 4 ++-- micall/utils/sample_fastq.py | 4 ++-- 14 files changed, 26 insertions(+), 26 deletions(-) diff --git a/micall/core/coverage_plots.py b/micall/core/coverage_plots.py index c225d36dc..4d8c5fc74 100644 --- a/micall/core/coverage_plots.py +++ b/micall/core/coverage_plots.py @@ -283,7 +283,7 @@ def make_tar_path(tar_path): def parse_args(): parser = argparse.ArgumentParser(description='Generate coverage plots from MiCall outputs.') - parser.add_argument('amino_csv', type=argparse.FileType('rU'), + parser.add_argument('amino_csv', type=argparse.FileType('r'), help=' CSV containing amino acid frequency outputs.') parser.add_argument('coverage_scores_csv', type=argparse.FileType('w'), help=' CSV coverage scores.') diff --git a/micall/core/filter_quality.py b/micall/core/filter_quality.py index 3bc080610..e4a4deb25 100755 --- 
a/micall/core/filter_quality.py +++ b/micall/core/filter_quality.py @@ -13,7 +13,7 @@ def parse_args(): description='Post-processing of short-read alignments.') parser.add_argument('quality_csv', - type=argparse.FileType('rU'), + type=argparse.FileType('r'), help='QC error rate data, grouped by tile') parser.add_argument('bad_cycles_csv', type=argparse.FileType('w'), diff --git a/micall/core/remap.py b/micall/core/remap.py index 5940eda50..268689bc5 100644 --- a/micall/core/remap.py +++ b/micall/core/remap.py @@ -631,7 +631,7 @@ def remap(fastq1: str, stderr, callback) new_counts.update(split_counts) - with open(samfile, 'rU') as f: + with open(samfile, 'r') as f: for fields in splitter.walk(f): remap_writer.writerow(dict(zip(SAM_FIELDS, fields))) @@ -771,7 +771,7 @@ def map_to_contigs(fastq1, stderr, callback) new_counts.update(split_counts) - with open(samfile, 'rU') as f: + with open(samfile, 'r') as f: for fields in splitter.walk(f): write_remap_row(remap_writer, fields) @@ -1199,7 +1199,7 @@ def main(): parser.add_argument('fastq1', help=' FASTQ containing forward reads') parser.add_argument('fastq2', help=' FASTQ containing reverse reads') parser.add_argument('contigs_csv', - type=argparse.FileType('rU'), + type=argparse.FileType('r'), help=' CSV containing assembled contigs') parser.add_argument('remap_csv', type=argparse.FileType('w'), diff --git a/micall/core/sam2aln.py b/micall/core/sam2aln.py index 8312f79b3..81da1bdab 100755 --- a/micall/core/sam2aln.py +++ b/micall/core/sam2aln.py @@ -26,7 +26,7 @@ def parse_args(): parser = argparse.ArgumentParser( description='Conversion of SAM data into aligned format.') parser.add_argument('remap_csv', - type=argparse.FileType('rU'), + type=argparse.FileType('r'), help=' SAM output of bowtie2 in CSV format') parser.add_argument('aligned_csv', type=argparse.FileType('w'), diff --git a/micall/g2p/fastq_g2p.py b/micall/g2p/fastq_g2p.py index 9235b3a48..c89276dbc 100755 --- a/micall/g2p/fastq_g2p.py +++ 
b/micall/g2p/fastq_g2p.py @@ -36,9 +36,9 @@ def parse_args(): parser = argparse.ArgumentParser(description='Calculate g2p scores from amino acid sequences.') - parser.add_argument('fastq1', type=argparse.FileType('rU'), + parser.add_argument('fastq1', type=argparse.FileType('r'), help=' FASTQ file containing read 1 reads') - parser.add_argument('fastq2', type=argparse.FileType('rU'), + parser.add_argument('fastq2', type=argparse.FileType('r'), help=' FASTQ file containing read 2 reads') parser.add_argument('g2p_csv', type=argparse.FileType('w'), help=' CSV containing g2p predictions.') diff --git a/micall/monitor/update_qai.py b/micall/monitor/update_qai.py index 3926388ba..28cbe6466 100644 --- a/micall/monitor/update_qai.py +++ b/micall/monitor/update_qai.py @@ -373,7 +373,7 @@ def find_pipeline_id(session, pipeline_version): def load_ok_sample_regions(result_folder): ok_sample_regions = set() coverage_file = os.path.join(result_folder, 'coverage_scores.csv') - with open(coverage_file, "rU") as f: + with open(coverage_file, "r") as f: reader = csv.DictReader(f) for row in reader: if row['on.score'] == '4': diff --git a/micall/utils/find_chimera.py b/micall/utils/find_chimera.py index 3df477e36..3b636aa71 100644 --- a/micall/utils/find_chimera.py +++ b/micall/utils/find_chimera.py @@ -100,7 +100,7 @@ def process_file(sample_name, projects, args): nuc_counts = defaultdict(dict) # {(source, dest): {pos: (agree, disagree)}} nucleotide_path = os.path.join(args.results, 'nuc.csv') - with open(nucleotide_path, 'rU') as nuc_csv: + with open(nucleotide_path, 'r') as nuc_csv: reader = DictReader(nuc_csv) sample_rows = (row for row in reader if row['sample'] == sample_name) diff --git a/micall/utils/make_fda_refs.py b/micall/utils/make_fda_refs.py index e2155a908..8ee471ce9 100644 --- a/micall/utils/make_fda_refs.py +++ b/micall/utils/make_fda_refs.py @@ -10,11 +10,11 @@ hyphy = HyPhy._THyPhy (os.getcwd(), 1) # instance of HyPhy hyphyAlign.change_settings(hyphy) # default 
settings -handle = open('fda_hcv_polyprotein.fa', 'rU') +handle = open('fda_hcv_polyprotein.fa', 'r') fasta = convert_fasta(handle) handle.close() -handle = open('/Users/art/git/MiseqPipeline/projects.json', 'rU') +handle = open('/Users/art/git/MiseqPipeline/projects.json', 'r') proj = json.load(handle) handle.close() @@ -35,4 +35,4 @@ outfile.close() - \ No newline at end of file + diff --git a/micall/utils/make_hcv_genotype_refs.py b/micall/utils/make_hcv_genotype_refs.py index ec04d9adb..7a04cc969 100644 --- a/micall/utils/make_hcv_genotype_refs.py +++ b/micall/utils/make_hcv_genotype_refs.py @@ -44,7 +44,7 @@ def convert_fasta (lines): noTerminalPenalty = 1) -with open('HCV_REF_2012_genome.fasta', 'rU') as handle: +with open('HCV_REF_2012_genome.fasta', 'r') as handle: genomes = convert_fasta(handle) # keep one per genotype projects = ProjectConfig.loadDefault() diff --git a/micall/utils/project_seeds_from_compendium.py b/micall/utils/project_seeds_from_compendium.py index 6b3acdaa0..fd5b759a7 100644 --- a/micall/utils/project_seeds_from_compendium.py +++ b/micall/utils/project_seeds_from_compendium.py @@ -82,7 +82,7 @@ def main(): clean_count = 0 dirty_count = 0 recombinant_names = [] - with open(filename, 'rU') as f: + with open(filename, 'r') as f: reader = csv.reader(f) for description, seed_seq in reader: seed_seq = seed_seq.replace('-', '') diff --git a/micall/utils/reference_distances.py b/micall/utils/reference_distances.py index 375ee3fa5..1c5a3783d 100644 --- a/micall/utils/reference_distances.py +++ b/micall/utils/reference_distances.py @@ -56,7 +56,7 @@ def calculate_distance(source, destination): def plot_distances(projects_filename): - with open(projects_filename, 'rU') as f: + with open(projects_filename, 'r') as f: config = json.load(f) populate_key_references(config['regions']) groups = defaultdict(list) diff --git a/micall/utils/remap_fastq_simplify.py b/micall/utils/remap_fastq_simplify.py index d0c8a6af4..7eac913e7 100644 --- 
a/micall/utils/remap_fastq_simplify.py +++ b/micall/utils/remap_fastq_simplify.py @@ -60,7 +60,7 @@ def filter_fastqs(self, filename1): devnull, devnull, devnull) - with open(remap_filename, 'rU') as remap_csv: + with open(remap_filename, 'r') as remap_csv: print('Filtering.') reader = DictReader(remap_csv) mapped_qnames = {row['qname'] @@ -72,7 +72,7 @@ def filter_fastqs(self, filename1): return filter_name1 def filter_reads(self, filename, filter_name, qnames): - with open(filename, 'rU') as fin, open(filter_name, 'w') as fout: + with open(filename, 'r') as fin, open(filter_name, 'w') as fout: for read in zip(fin, fin, fin, fin): qname = read[0].split()[0][1:] if qname in qnames: @@ -92,12 +92,12 @@ def _test(self, read_indexes, debug_file_prefix=None): trimmed_filename2 = os.path.join(workdir, 'rerun.trimmed2.fastq') prelim_censored_filename = os.path.join(workdir, 'rerun_censored.prelim.csv') prelim_trimmed_filename = os.path.join(workdir, 'rerun_trimmed.prelim.csv') - with open(self.bad_cycles_filename, 'rU') as bad_cycles: + with open(self.bad_cycles_filename, 'r') as bad_cycles: bad_cycles = list(csv.DictReader(bad_cycles)) - with open(simple_filename1, 'rU') as simple1, \ + with open(simple_filename1, 'r') as simple1, \ open(censored_filename1, 'w') as censored1: censor(simple1, bad_cycles, censored1, use_gzip=False) - with open(simple_filename2, 'rU') as simple2, \ + with open(simple_filename2, 'r') as simple2, \ open(censored_filename2, 'w') as censored2: censor(simple2, bad_cycles, censored2, use_gzip=False) with open(prelim_censored_filename, 'w+') as prelim_censored_csv, \ @@ -205,7 +205,7 @@ def read_fastq(filename, reads): @param filename: the FASTQ file to open @param reads: defaultdict({qname: [line1, line2, line3, line4, line1, line2, line3, line4]} """ - with open(filename, 'rU') as f: + with open(filename, 'r') as f: for line1, line2, line3, line4 in zip(f, f, f, f): qname = line1.split()[0] lines = reads[qname] diff --git 
a/micall/utils/sam_g2p_simplify.py b/micall/utils/sam_g2p_simplify.py index e573e355d..637db00bc 100644 --- a/micall/utils/sam_g2p_simplify.py +++ b/micall/utils/sam_g2p_simplify.py @@ -34,7 +34,7 @@ def test(remap_lines, temp_prefix, pssm, ruby_script, delete_results=True): try: check_call([ruby_script, remap_file.name, nuc_filename, ruby_out_filename], cwd=ruby_path) - with open(nuc_filename, 'rU') as nuc_csv, \ + with open(nuc_filename, 'r') as nuc_csv, \ open(python_out_filename, 'wb') as g2p_csv: # TODO: update this to the new arguments. @@ -83,7 +83,7 @@ def ddmin(remap_lines, temp_prefix, pssm, ruby_script): return header + remap_lines def compare_conseqs(txtfilename, ruby_script, pssm): - with open(txtfilename, 'rU') as remap_file: + with open(txtfilename, 'r') as remap_file: remap_lines = remap_file.readlines() simple_prefix = os.path.splitext(txtfilename)[0] + '_simple' if test(remap_lines, simple_prefix, pssm, ruby_script) != 'PASS': diff --git a/micall/utils/sample_fastq.py b/micall/utils/sample_fastq.py index f1df838b2..35d2aa56e 100755 --- a/micall/utils/sample_fastq.py +++ b/micall/utils/sample_fastq.py @@ -8,10 +8,10 @@ def parse_args(): parser = argparse.ArgumentParser( description="Randomly sample reads from FASTQ files for quick processing.") parser.add_argument('fastq1', - type=argparse.FileType('rU'), + type=argparse.FileType('r'), help='original FASTQ file of forward reads') parser.add_argument('fastq2', - type=argparse.FileType('rU'), + type=argparse.FileType('r'), help='original FASTQ file of reverse reads') parser.add_argument('short_fastq1', type=argparse.FileType('w'), From 72a46eaf86401527e2a2cc16eda945dbd04f23af Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 24 Oct 2024 14:57:26 -0700 Subject: [PATCH 448/485] Update python version to 3.11 The most recent version on Ubuntu and Debian is `3.12`, but it does not work with MiCall yet. 
--- .github/workflows/build-and-test.yml | 4 ++-- CONTRIBUTING.md | 2 +- Dockerfile | 2 +- Singularity | 2 +- pyproject.toml | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 5e7c7740b..189e1723d 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -8,7 +8,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.11 - run: mkdir -p ~/bin - run: echo ~/bin >> $GITHUB_PATH @@ -95,7 +95,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.11 - uses: actions/setup-go@v2 with: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 73445487f..8d3e81efc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -45,7 +45,7 @@ Check that Python is already installed. python --version -We have tested with Python 3.8. +We have tested with Python `3.11`. ### BaseSpace Set up the [native apps virtual machine][bsvm], and configure a shared folder diff --git a/Dockerfile b/Dockerfile index af327bbd0..f2f9cbf96 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ # If you omit the `--target` tag altogether, `docker build` will build # the development image. -FROM python:3.8 +FROM python:3.11 MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/MiCall diff --git a/Singularity b/Singularity index fa8517951..41ec83e93 100644 --- a/Singularity +++ b/Singularity @@ -1,6 +1,6 @@ # Generate the Singularity container to run MiCall on Kive. 
Bootstrap: docker -From: python:3.8 +From: python:3.11 %help MiCall maps all the reads from a sample against a set of reference diff --git a/pyproject.toml b/pyproject.toml index ad4e963aa..11215689c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,12 +8,12 @@ authors = [ license = {text = "AGPL-3.0"} license-files = { paths = ["LICENSE.txt"] } readme = "README.md" -requires-python = ">=3.8,<3.9" +requires-python = ">=3.11,<3.12" classifiers = [ "License :: OSI Approved :: GNU Affero General Public License v3", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.11", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", "Intended Audience :: Science/Research", From beaf35d4232b56497b295ddbe246318e783fe110 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 24 Oct 2024 15:35:07 -0700 Subject: [PATCH 449/485] Optimize docker commands order --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index af327bbd0..9ba16134e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -85,10 +85,10 @@ RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \ ## Install dependencies for genetracks/drawsvg RUN apt-get install -q -y libcairo2-dev +RUN pip install --upgrade pip COPY . 
/opt/micall/ -RUN pip install --upgrade pip RUN pip install /opt/micall[basespace] RUN micall make_blast_db From f939791e02241e3a711e1986f1b4414b4f8e7a27 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 25 Oct 2024 13:25:05 -0700 Subject: [PATCH 450/485] Specify aligntools version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6e3db0585..8e42bf72f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "mappy==2.17", "drawsvg==2.3.0", "cairosvg==2.7.1", - "aligntools", + "aligntools==1.0.8", ] [project.optional-dependencies] From 8fb0a334b31be972a76f346512b523280a8594f4 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 25 Oct 2024 13:40:54 -0700 Subject: [PATCH 451/485] Update aligntools --- micall/core/contig_stitcher.py | 32 ++++++++++++++++---------------- pyproject.toml | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 31094ee76..723ff10ec 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -12,13 +12,13 @@ import logging from fractions import Fraction from operator import itemgetter -from aligntools import connect_cigar_hits, CigarHit +from aligntools import CigarHit, connect_nonoverlapping_cigar_hits, drop_overlapping_cigar_hits from micall.core.project_config import ProjectConfig from micall.core.plot_contigs import plot_stitcher_coverage from micall.utils.contig_stitcher_context import context, StitcherContext from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig -from micall.utils.alignment import Alignment, align_consensus +from micall.utils.alignment import align_consensus import micall.utils.contig_stitcher_events as events @@ -163,41 +163,41 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: yield contig return - def init_hit(x: Alignment) -> Tuple[CigarHit, 
Literal["forward", "reverse"]]: - cigar = x.to_cigar_hit() - return cigar, "forward" if x.strand == 1 else "reverse" - alignments, _algo = align_consensus(contig.ref_seq, contig.seq) - hits_array = [init_hit(x) for x in alignments] + hits = [x.to_cigar_hit() for x in alignments] + strands: List[Literal["forward", "reverse"]] = ["forward" if x.strand == 1 else "reverse" for x in alignments] - for i, (hit, strand) in enumerate(hits_array): + for i, (hit, strand) in enumerate(zip(hits, strands)): log(events.InitialHit(contig, i, hit, strand)) - if not hits_array: + if not hits: log(events.ZeroHits(contig)) yield contig return - if len(set(strand for hit, strand in hits_array)) > 1: + if len(set(strands)) > 1: log(events.StrandConflict(contig)) yield contig return - strand = hits_array[0][1] + strand = strands[0] if strand == "reverse": rc = str(Seq.Seq(contig.seq).reverse_complement()) original_contig = contig new_contig = replace(contig, seq=rc) contig = new_contig - hits_array = [(replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1), strand) - for hit, strand in hits_array] + hits = [replace(hit, q_st=len(rc)-hit.q_ei-1, q_ei=len(rc)-hit.q_st-1) for hit in hits] log(events.ReverseComplement(original_contig, new_contig)) - for i, (hit, strand) in enumerate(hits_array): + for i, (hit, strand) in enumerate(zip(hits, strands)): log(events.InitialHit(contig, i, hit, strand)) - connected = connect_cigar_hits([hit for hit, strand in hits_array]) if hits_array else [] - log(events.HitNumber(contig, hits_array, connected)) + def quality(x: CigarHit): + return x.ref_length + + filtered = list(drop_overlapping_cigar_hits(hits, quality)) + connected = list(connect_nonoverlapping_cigar_hits(filtered)) + log(events.HitNumber(contig, list(zip(hits, strands)), connected)) for i, single_hit in enumerate(connected): query = replace(contig, name=None) diff --git a/pyproject.toml b/pyproject.toml index 8e42bf72f..b2a817763 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-40,7 +40,7 @@ dependencies = [ "mappy==2.17", "drawsvg==2.3.0", "cairosvg==2.7.1", - "aligntools==1.0.8", + "aligntools==1.1.1", ] [project.optional-dependencies] From 022f8e1f88411beb10725798ac81b23649a7910e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 25 Oct 2024 16:33:03 -0700 Subject: [PATCH 452/485] Remove code for removal of overlapping regions in Aligner --- micall/utils/alignment.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index 9a39a1628..ce1769b37 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -102,13 +102,6 @@ def align_gotoh(coordinate_seq: str, consensus: str) -> Optional[Alignment]: return None -def alignment_quality(alignment: Alignment) -> Tuple[int, ...]: - cigar = Cigar(alignment.cigar) - mlen = sum(1 for action in cigar.iterate_operations() - if action == CigarActions.MATCH) - return (alignment.mapq * cigar.query_length, mlen, cigar.query_length) - - def connect_alignments(alignments: Iterable[Alignment]) -> Iterator[Alignment]: stranded = groupby(alignments, key=lambda x: (x.strand, x.ctg, x.ctg_len)) for (strand, ctg, ctg_len), group_iter in stranded: @@ -166,11 +159,6 @@ def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment # Following code will connect non-overlapping alignments # that mappy outputs sometimes. - # It will also drop overlapping (in query coords) alignments. - # We are sorting the alignments before connect in order - # to drop the lowest quality contigs in case they overlap with - # higher quality alignments. 
- alignments.sort(key=alignment_quality) alignments = list(connect_alignments(reversed(alignments))) else: algorithm = 'gotoh' From 543ccea7680e5edace3215581a928150384f3ed5 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 25 Oct 2024 16:38:13 -0700 Subject: [PATCH 453/485] Move all alignment procedures to ConsensusAligner --- micall/core/contig_stitcher.py | 2 +- micall/utils/alignment.py | 116 +--------------------------- micall/utils/consensus_aligner.py | 121 ++++++++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 120 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 723ff10ec..93c7c2c63 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -18,7 +18,7 @@ from micall.core.plot_contigs import plot_stitcher_coverage from micall.utils.contig_stitcher_context import context, StitcherContext from micall.utils.contig_stitcher_contigs import GenotypedContig, AlignedContig -from micall.utils.alignment import align_consensus +from micall.utils.consensus_aligner import align_consensus import micall.utils.contig_stitcher_events as events diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index ce1769b37..dbdfbd052 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -1,22 +1,10 @@ -from typing import Tuple, List, Sequence, Optional, Iterable, Iterator +from typing import Tuple, List, Sequence from dataclasses import dataclass -from operator import attrgetter -from itertools import groupby -from aligntools import CigarActions, Cigar, CigarHit, connect_nonoverlapping_cigar_hits -from gotoh import align_it -from mappy import Aligner +from aligntools import CigarActions, Cigar, CigarHit import mappy -# -# Alignments with deletions larger than MAX_GAP_SIZE -# will be split around those deletions into multiple -# separate alignments. -# -MAX_GAP_SIZE = 600 # TODO: make this smaller? 
- - @dataclass(frozen=True) class Alignment: """ @@ -71,103 +59,3 @@ def from_cigar_hit(hit: CigarHit, ctg: str, ctg_len: int, strand: int, mapq: int cigar=list(hit.cigar._data), cigar_str=str(hit.cigar), ) - - -def align_gotoh(coordinate_seq: str, consensus: str) -> Optional[Alignment]: - gap_open_penalty = 15 - gap_extend_penalty = 3 - use_terminal_gap_penalty = 1 - assert '&' not in consensus, "Consensus contains forbidden character '&'" - consensus = ''.join('&' if x == '-' else x for x in consensus) - aligned_coordinate, aligned_consensus, score = align_it( - coordinate_seq, - consensus, - gap_open_penalty, - gap_extend_penalty, - use_terminal_gap_penalty) - - if min(len(coordinate_seq), len(consensus)) < score: - cigar = Cigar.from_msa(aligned_coordinate, aligned_consensus) - hit = CigarHit(cigar, - q_st=0, q_ei=len(consensus)-1, - r_st=0, r_ei=len(coordinate_seq)-1) - hit = hit.lstrip_query().lstrip_reference().rstrip_query().rstrip_reference() - return Alignment.from_cigar_hit( - hit, - ctg='N/A', - ctg_len=len(coordinate_seq), - strand=1, - mapq=0) - else: - return None - - -def connect_alignments(alignments: Iterable[Alignment]) -> Iterator[Alignment]: - stranded = groupby(alignments, key=lambda x: (x.strand, x.ctg, x.ctg_len)) - for (strand, ctg, ctg_len), group_iter in stranded: - group = list(group_iter) - hits = list(map(Alignment.to_cigar_hit, group)) - connected_hits = connect_nonoverlapping_cigar_hits(hits) - mapq = min(x.mapq for x in group) - for hit in connected_hits: - yield Alignment.from_cigar_hit(hit, - ctg=ctg, ctg_len=ctg_len, - strand=strand, mapq=mapq) - - -def collect_big_gaps_cut_points(alignment: Alignment) -> Iterator[float]: - hit = alignment.to_cigar_hit() - for deletion in hit.deletions(): - if deletion.ref_length > MAX_GAP_SIZE: - midpoint = deletion.r_st + deletion.ref_length / 2 - yield int(midpoint) + hit.epsilon - - -def cut_hit_into_multiple_parts(hit: CigarHit, cut_points: Iterable[float]) -> Iterator[CigarHit]: - for 
cut_point in cut_points: - left, right = hit.cut_reference(cut_point) - left = left.rstrip_reference() - right = right.lstrip_reference() - yield left - hit = right - yield hit - - -def split_around_big_gaps(alignments: Iterable[Alignment]) -> Iterator[Alignment]: - for alignment in alignments: - cut_points = list(collect_big_gaps_cut_points(alignment)) - if cut_points: - hit = alignment.to_cigar_hit() - for part in cut_hit_into_multiple_parts(hit, cut_points): - yield Alignment.from_cigar_hit(part, - ctg=alignment.ctg, - ctg_len=alignment.ctg_len, - strand=alignment.strand, - mapq=alignment.mapq) - else: - yield alignment - - -def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment], str]: - aligner = Aligner(seq=coordinate_seq, bw=500, bw_long=500, preset='map-ont') - mappy_alignments: List[mappy.Alignment] = list(aligner.map(consensus)) - if mappy_alignments or 10_000 < len(consensus): - algorithm = 'minimap2' - alignments = [Alignment.coerce(alignment) - for alignment in mappy_alignments - if alignment.is_primary] - - # Following code will connect non-overlapping alignments - # that mappy outputs sometimes. 
- alignments = list(connect_alignments(reversed(alignments))) - else: - algorithm = 'gotoh' - gotoh_alignment = align_gotoh(coordinate_seq, consensus) - if gotoh_alignment: - alignments = [gotoh_alignment] - else: - alignments = [] - - alignments = list(split_around_big_gaps(alignments)) - alignments.sort(key=attrgetter('q_st')) - return (alignments, algorithm) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 60915a081..5274a5c99 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -1,18 +1,20 @@ -from typing import Dict, List, Optional, Set +from typing import Dict, List, Optional, Set, Iterator, Iterable, Tuple from dataclasses import dataclass, replace -from itertools import count +from itertools import count, groupby +from operator import attrgetter import csv import os import logging -from aligntools import CigarActions +from aligntools import CigarActions, Cigar, CigarHit, connect_nonoverlapping_cigar_hits -from gotoh import align_it_aa +from gotoh import align_it, align_it_aa +from mappy import Aligner import mappy from micall.core.project_config import ProjectConfig from micall.utils.report_amino import SeedAmino, ReportAmino, ReportNucleotide, SeedNucleotide from micall.utils.translation import translate -from micall.utils.alignment import Alignment, align_consensus +from micall.utils.alignment import Alignment logger = logging.getLogger(__name__) @@ -24,6 +26,115 @@ MAXIMUM_AMINO_GAP = 10 +# +# Alignments with deletions larger than MAX_GAP_SIZE +# will be split around those deletions into multiple +# separate alignments. +# +MAX_GAP_SIZE = 600 # TODO: make this smaller? 
+ + + +def align_gotoh(coordinate_seq: str, consensus: str) -> Optional[Alignment]: + gap_open_penalty = 15 + gap_extend_penalty = 3 + use_terminal_gap_penalty = 1 + assert '&' not in consensus, "Consensus contains forbidden character '&'" + consensus = ''.join('&' if x == '-' else x for x in consensus) + aligned_coordinate, aligned_consensus, score = align_it( + coordinate_seq, + consensus, + gap_open_penalty, + gap_extend_penalty, + use_terminal_gap_penalty) + + if min(len(coordinate_seq), len(consensus)) < score: + cigar = Cigar.from_msa(aligned_coordinate, aligned_consensus) + hit = CigarHit(cigar, + q_st=0, q_ei=len(consensus)-1, + r_st=0, r_ei=len(coordinate_seq)-1) + hit = hit.lstrip_query().lstrip_reference().rstrip_query().rstrip_reference() + return Alignment.from_cigar_hit( + hit, + ctg='N/A', + ctg_len=len(coordinate_seq), + strand=1, + mapq=0) + else: + return None + + +def connect_alignments(alignments: Iterable[Alignment]) -> Iterator[Alignment]: + stranded = groupby(alignments, key=lambda x: (x.strand, x.ctg, x.ctg_len)) + for (strand, ctg, ctg_len), group_iter in stranded: + group = list(group_iter) + hits = list(map(Alignment.to_cigar_hit, group)) + connected_hits = connect_nonoverlapping_cigar_hits(hits) + mapq = min(x.mapq for x in group) + for hit in connected_hits: + yield Alignment.from_cigar_hit(hit, + ctg=ctg, ctg_len=ctg_len, + strand=strand, mapq=mapq) + + +def collect_big_gaps_cut_points(alignment: Alignment) -> Iterator[float]: + hit = alignment.to_cigar_hit() + for deletion in hit.deletions(): + if deletion.ref_length > MAX_GAP_SIZE: + midpoint = deletion.r_st + deletion.ref_length / 2 + yield int(midpoint) + hit.epsilon + + +def cut_hit_into_multiple_parts(hit: CigarHit, cut_points: Iterable[float]) -> Iterator[CigarHit]: + for cut_point in cut_points: + left, right = hit.cut_reference(cut_point) + left = left.rstrip_reference() + right = right.lstrip_reference() + yield left + hit = right + yield hit + + +def 
split_around_big_gaps(alignments: Iterable[Alignment]) -> Iterator[Alignment]: + for alignment in alignments: + cut_points = list(collect_big_gaps_cut_points(alignment)) + if cut_points: + hit = alignment.to_cigar_hit() + for part in cut_hit_into_multiple_parts(hit, cut_points): + yield Alignment.from_cigar_hit(part, + ctg=alignment.ctg, + ctg_len=alignment.ctg_len, + strand=alignment.strand, + mapq=alignment.mapq) + else: + yield alignment + + +def align_consensus(coordinate_seq: str, consensus: str) -> Tuple[List[Alignment], str]: + aligner = Aligner(seq=coordinate_seq, bw=500, bw_long=500, preset='map-ont') + mappy_alignments: List[mappy.Alignment] = list(aligner.map(consensus)) + if mappy_alignments or 10_000 < len(consensus): + algorithm = 'minimap2' + alignments = [Alignment.coerce(alignment) + for alignment in mappy_alignments + if alignment.is_primary] + + # Following code will connect non-overlapping alignments + # that mappy outputs sometimes. + alignments = list(connect_alignments(reversed(alignments))) + else: + algorithm = 'gotoh' + gotoh_alignment = align_gotoh(coordinate_seq, consensus) + if gotoh_alignment: + alignments = [gotoh_alignment] + else: + alignments = [] + + alignments = list(split_around_big_gaps(alignments)) + alignments.sort(key=attrgetter('q_st')) + return (alignments, algorithm) + + def align_aminos(reference: str, query: str, gap_open: int = 40, From fe1e5a46ba17f4983ddb333a833a25f2862d5179 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Fri, 25 Oct 2024 17:09:05 -0700 Subject: [PATCH 454/485] Update core python dependencies These packages need to be updated all together because they depend on each other. 
--- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1e34aa032..4d43dba1d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,9 +28,9 @@ dependencies = [ "biopython==1.83", "gotoh @ git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh", "pyvdrm @ git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2", - "numpy==1.24.3", - "scipy==1.10.1", - "matplotlib==3.7.3", + "numpy==2.1.2", + "scipy==1.14.1", + "matplotlib==3.9.2", "cutadapt==4.8", "python-Levenshtein==0.25.1", "PyYAML==6.0.1", @@ -48,7 +48,7 @@ test = [ # Dependencies required for running the test suite "pytest==8.2.2", "coverage==7.5.3", - "pandas==2.0.2", + "pandas==2.2.3", "seaborn==0.13.2", "ete3", # For reading HCV rules from an Excel file. From 8db1046b088ec2e33c9c60e989b6585e8769df1e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 26 Oct 2024 00:36:30 +0000 Subject: [PATCH 455/485] Bump openpyxl from 3.1.3 to 3.1.5 Bumps [openpyxl](https://openpyxl.readthedocs.io) from 3.1.3 to 3.1.5. --- updated-dependencies: - dependency-name: openpyxl dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4d43dba1d..f56c0b7c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ test = [ "seaborn==0.13.2", "ete3", # For reading HCV rules from an Excel file. - "openpyxl==3.1.3", + "openpyxl==3.1.5", # Also includes watcher's dependencies. 
"kiveapi @ git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api", From e9f8780c88a72834d369f2d53983733a39843e2d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 26 Oct 2024 00:36:39 +0000 Subject: [PATCH 456/485] Bump cutadapt from 4.8 to 4.9 Bumps [cutadapt](https://github.com/marcelm/cutadapt) from 4.8 to 4.9. - [Release notes](https://github.com/marcelm/cutadapt/releases) - [Changelog](https://github.com/marcelm/cutadapt/blob/main/CHANGES.rst) - [Commits](https://github.com/marcelm/cutadapt/compare/v4.8...v4.9) --- updated-dependencies: - dependency-name: cutadapt dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4d43dba1d..d70f102f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "numpy==2.1.2", "scipy==1.14.1", "matplotlib==3.9.2", - "cutadapt==4.8", + "cutadapt==4.9", "python-Levenshtein==0.25.1", "PyYAML==6.0.1", "reportlab==4.2.0", From ae606e0d69b9901a1e7c44d6a7831d0b2c048116 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 26 Oct 2024 00:36:41 +0000 Subject: [PATCH 457/485] Bump drawsvg from 2.3.0 to 2.4.0 Bumps [drawsvg](https://github.com/cduck/drawsvg) from 2.3.0 to 2.4.0. - [Commits](https://github.com/cduck/drawsvg/compare/2.3.0...2.4.0) --- updated-dependencies: - dependency-name: drawsvg dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4d43dba1d..1c734f3d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "pysam==0.22.1", "genetracks @ git+https://github.com/cfe-lab/genetracks.git@v0.4", "mappy==2.28", - "drawsvg==2.3.0", + "drawsvg==2.4.0", "cairosvg==2.7.1", "aligntools==1.1.1", ] From 724882130bbe50ff4e349c5b5b801235825262f9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 26 Oct 2024 00:36:49 +0000 Subject: [PATCH 458/485] Bump coverage from 7.5.3 to 7.6.4 Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.3 to 7.6.4. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.3...7.6.4) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4d43dba1d..c9f4ab97b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ test = [ # Dependencies required for running the test suite "pytest==8.2.2", - "coverage==7.5.3", + "coverage==7.6.4", "pandas==2.2.3", "seaborn==0.13.2", "ete3", From b357a477af9bc270bbd68556140ac94c9313b598 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 07:29:24 +0000 Subject: [PATCH 459/485] Bump ruff from 0.7.0 to 0.7.1 Bumps [ruff](https://github.com/astral-sh/ruff) from 0.7.0 to 0.7.1. 
- [Release notes](https://github.com/astral-sh/ruff/releases) - [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md) - [Commits](https://github.com/astral-sh/ruff/compare/0.7.0...0.7.1) --- updated-dependencies: - dependency-name: ruff dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ac726e6c..a1bcf87f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ test = [ ] dev = [ # Dependencies required for development (linting, type checking, etc.) - "ruff==0.7.0", + "ruff==0.7.1", "mypy==1.10.0", "mypy-extensions==1.0.0", "gprof2dot==2024.6.6", From 2ea356949ad369d56181798c597516a3e2c1e70f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 07:29:29 +0000 Subject: [PATCH 460/485] Bump biopython from 1.83 to 1.84 Bumps [biopython](https://github.com/biopython/biopython) from 1.83 to 1.84. - [Changelog](https://github.com/biopython/biopython/blob/master/NEWS.rst) - [Commits](https://github.com/biopython/biopython/commits) --- updated-dependencies: - dependency-name: biopython dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ac726e6c..a64755aba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ # Requirements for running the bare pipeline under Kive. 
"pytz==2024.1", "iva @ git+https://github.com/cfe-lab/iva.git@v1.1.1", - "biopython==1.83", + "biopython==1.84", "gotoh @ git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh", "pyvdrm @ git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2", "numpy==2.1.2", From e27ff7ecad33e85d9a0ccfb24bbe523874afe4d5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 07:29:38 +0000 Subject: [PATCH 461/485] Bump python-levenshtein from 0.25.1 to 0.26.1 Bumps [python-levenshtein](https://github.com/rapidfuzz/python-Levenshtein) from 0.25.1 to 0.26.1. - [Release notes](https://github.com/rapidfuzz/python-Levenshtein/releases) - [Changelog](https://github.com/rapidfuzz/python-Levenshtein/blob/main/HISTORY.md) - [Commits](https://github.com/rapidfuzz/python-Levenshtein/compare/v0.25.1...v0.26.1) --- updated-dependencies: - dependency-name: python-levenshtein dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ac726e6c..6cc82c8eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "scipy==1.14.1", "matplotlib==3.9.2", "cutadapt==4.9", - "python-Levenshtein==0.25.1", + "python-Levenshtein==0.26.1", "PyYAML==6.0.1", "reportlab==4.2.0", "pysam==0.22.1", From fde49e512025491a60e7a719bf16e00d80b92066 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 07:29:55 +0000 Subject: [PATCH 462/485] Bump pytest from 8.2.2 to 8.3.3 Bumps [pytest](https://github.com/pytest-dev/pytest) from 8.2.2 to 8.3.3. 
- [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.2.2...8.3.3) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6ac726e6c..af19b314b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ [project.optional-dependencies] test = [ # Dependencies required for running the test suite - "pytest==8.2.2", + "pytest==8.3.3", "coverage==7.6.4", "pandas==2.2.3", "seaborn==0.13.2", From 866724e36ad9f86829f41fda73ccb647c7d6fd10 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:26:09 +0000 Subject: [PATCH 463/485] Bump rexml from 3.3.7 to 3.3.9 in /docs Bumps [rexml](https://github.com/ruby/rexml) from 3.3.7 to 3.3.9. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.3.7...v3.3.9) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- docs/Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 12f102f9d..036f1aa2f 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -240,7 +240,7 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.3.7) + rexml (3.3.9) rouge (3.30.0) rubyzip (2.3.2) safe_yaml (1.0.5) From 2d829c6836f71e9de05b7f5d5af9d9cc1a132bda Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:26:26 +0000 Subject: [PATCH 464/485] Bump pytz from 2024.1 to 2024.2 Bumps [pytz](https://github.com/stub42/pytz) from 2024.1 to 2024.2. - [Release notes](https://github.com/stub42/pytz/releases) - [Commits](https://github.com/stub42/pytz/compare/release_2024.1...release_2024.2) --- updated-dependencies: - dependency-name: pytz dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 27349fc58..556e2f3f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ dependencies = [ # Requirements for running the bare pipeline under Kive. - "pytz==2024.1", + "pytz==2024.2", "iva @ git+https://github.com/cfe-lab/iva.git@v1.1.1", "biopython==1.84", "gotoh @ git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh", From 4528550fdb3b1ec90a1fec6b9f8f3f1c27172388 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:08:05 +0000 Subject: [PATCH 465/485] Bump reportlab from 4.2.0 to 4.2.5 Bumps [reportlab](https://www.reportlab.com/) from 4.2.0 to 4.2.5. --- updated-dependencies: - dependency-name: reportlab dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 782e9e934..5816ebec2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "cutadapt==4.9", "python-Levenshtein==0.26.1", "PyYAML==6.0.1", - "reportlab==4.2.0", + "reportlab==4.2.5", "pysam==0.22.1", "genetracks @ git+https://github.com/cfe-lab/genetracks.git@v0.4", "mappy==2.28", From ff657dd7d5e24ea5b16cde6971fe5d651d91a7d8 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 10:21:54 -0700 Subject: [PATCH 466/485] Fix type error in denovo.py --- micall/core/denovo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 3614575a8..ca8352484 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -79,8 +79,8 @@ def denovo(fastq1_path: str, with open(contigs_fasta_path, 'a'): pass - with open(contigs_fasta_path) as reader: - copyfileobj(reader, fasta) + with open(contigs_fasta_path, 'rt') as reader: + copyfileobj(reader, fasta) # type: ignore os.chdir(start_dir) duration = datetime.now() - start_time From 3c31a5524f3f71607077dc2b3e3e352f41b275a9 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 10:24:18 -0700 Subject: [PATCH 467/485] Bump mypy to 1.13.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 782e9e934..27e4058da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ test = [ dev = [ # Dependencies required for development (linting, type checking, etc.) 
"ruff==0.7.1", - "mypy==1.10.0", + "mypy==1.13.0", "mypy-extensions==1.0.0", "gprof2dot==2024.6.6", ] From b654eb45d6be244940c350e8adb7160fe112b73a Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 10:36:40 -0700 Subject: [PATCH 468/485] Remove unnecessary type: ignore in denovo.py --- micall/core/denovo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/micall/core/denovo.py b/micall/core/denovo.py index ca8352484..5e32f6acb 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -1,7 +1,7 @@ import argparse import logging import os -from typing import Optional, TextIO +from typing import Optional, TextIO, cast, BinaryIO from csv import DictReader from datetime import datetime from glob import glob @@ -79,8 +79,8 @@ def denovo(fastq1_path: str, with open(contigs_fasta_path, 'a'): pass - with open(contigs_fasta_path, 'rt') as reader: - copyfileobj(reader, fasta) # type: ignore + with open(contigs_fasta_path) as reader: + copyfileobj(cast(BinaryIO, reader), fasta) os.chdir(start_dir) duration = datetime.now() - start_time From b512b966af2baf420a2ede87be74b7d33fc3f8ab Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:48:10 +0000 Subject: [PATCH 469/485] Bump pyyaml from 6.0.1 to 6.0.2 Bumps [pyyaml](https://github.com/yaml/pyyaml) from 6.0.1 to 6.0.2. - [Release notes](https://github.com/yaml/pyyaml/releases) - [Changelog](https://github.com/yaml/pyyaml/blob/main/CHANGES) - [Commits](https://github.com/yaml/pyyaml/compare/6.0.1...6.0.2) --- updated-dependencies: - dependency-name: pyyaml dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 923912c84..ef0dae104 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "matplotlib==3.9.2", "cutadapt==4.9", "python-Levenshtein==0.26.1", - "PyYAML==6.0.1", + "PyYAML==6.0.2", "reportlab==4.2.5", "pysam==0.22.1", "genetracks @ git+https://github.com/cfe-lab/genetracks.git@v0.4", From af78cbd9c71f86c25732dadd57fc5d1b60d1ef48 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 11:27:49 -0700 Subject: [PATCH 470/485] Improve implementation of Alignment class --- micall/utils/alignment.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/micall/utils/alignment.py b/micall/utils/alignment.py index dbdfbd052..876b4077d 100644 --- a/micall/utils/alignment.py +++ b/micall/utils/alignment.py @@ -1,4 +1,4 @@ -from typing import Tuple, List, Sequence +from typing import Tuple, Sequence, Union, NoReturn from dataclasses import dataclass from aligntools import CigarActions, Cigar, CigarHit @@ -23,14 +23,12 @@ class Alignment: cigar_str: str @staticmethod - def coerce(obj: object) -> 'Alignment': + def coerce(obj: Union['Alignment', mappy.Alignment]) -> 'Alignment': if isinstance(obj, Alignment): return obj elif isinstance(obj, mappy.Alignment): - cigar: List[Tuple[int, CigarActions]] = [] - for (size, action) in obj.cigar: - cigar.append((size, CigarActions(action))) - + cigar = [(size, CigarActions(action)) + for (size, action) in obj.cigar] return Alignment(ctg=obj.ctg, ctg_len=obj.ctg_len, r_st=obj.r_st, r_en=obj.r_en, @@ -41,6 +39,7 @@ def coerce(obj: object) -> 'Alignment': cigar_str=obj.cigar_str, ) else: + _: NoReturn = obj raise TypeError(f"Cannot coerce from {obj!r}.") def to_cigar_hit(self) -> CigarHit: From 5d90205006c9299434d4f555fbcc62fb48c65ba4 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 11:32:59 -0700 Subject: 
[PATCH 471/485] Bump aligntools version --- micall/tests/test_consensus_aligner.py | 2 +- micall/utils/consensus_aligner.py | 1 + pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index dda982c25..0857a1f9b 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -135,7 +135,7 @@ def test_create_reading_frames(): def test_alignment_repr(): alignment = make_alignment('R1', 0, 1001, 1100, 1, 1, 100) - assert repr(alignment) == "Alignment(ctg='R1', ctg_len=0, r_st=1001, r_en=1100, strand=1, q_st=1, q_en=100, mapq=0, cigar=[(99, )], cigar_str='99M')" + assert repr(alignment) == "Alignment(ctg='R1', ctg_len=0, r_st=1001, r_en=1100, strand=1, q_st=1, q_en=100, mapq=0, cigar=[(99, CigarActions.MATCH)], cigar_str='99M')" def test_start_contig(projects): diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 5274a5c99..35fbe4cee 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -50,6 +50,7 @@ def align_gotoh(coordinate_seq: str, consensus: str) -> Optional[Alignment]: if min(len(coordinate_seq), len(consensus)) < score: cigar = Cigar.from_msa(aligned_coordinate, aligned_consensus) + cigar = cigar.relax() # turn '=' and 'X' into 'M'. 
hit = CigarHit(cigar, q_st=0, q_ei=len(consensus)-1, r_st=0, r_ei=len(coordinate_seq)-1) diff --git a/pyproject.toml b/pyproject.toml index ef0dae104..e1c62b120 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "mappy==2.28", "drawsvg==2.4.0", "cairosvg==2.7.1", - "aligntools==1.1.1", + "aligntools==1.2.0", ] [project.optional-dependencies] From 30214d66302f345e4a283f72b0e8b743559df271 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 11:35:55 -0700 Subject: [PATCH 472/485] Improve quality calculations in contig stitcher --- micall/core/contig_stitcher.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index 93c7c2c63..fb0a61991 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -12,7 +12,7 @@ import logging from fractions import Fraction from operator import itemgetter -from aligntools import CigarHit, connect_nonoverlapping_cigar_hits, drop_overlapping_cigar_hits +from aligntools import CigarHit, connect_nonoverlapping_cigar_hits, drop_overlapping_cigar_hits, CigarActions from micall.core.project_config import ProjectConfig from micall.core.plot_contigs import plot_stitcher_coverage @@ -193,7 +193,9 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: log(events.InitialHit(contig, i, hit, strand)) def quality(x: CigarHit): - return x.ref_length + mlen = sum(1 for x in x.cigar.relax().iterate_operations() + if x == CigarActions.MATCH) + return 0.9 * mlen + 0.1 * x.ref_length filtered = list(drop_overlapping_cigar_hits(hits, quality)) connected = list(connect_nonoverlapping_cigar_hits(filtered)) From 10627f7326619b528b5e97710396508c9035301c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 13:19:25 -0700 Subject: [PATCH 473/485] Further improve quality calculations in contig stitcher --- micall/core/contig_stitcher.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/micall/core/contig_stitcher.py b/micall/core/contig_stitcher.py index fb0a61991..5292a27f5 100644 --- a/micall/core/contig_stitcher.py +++ b/micall/core/contig_stitcher.py @@ -195,7 +195,7 @@ def align_to_reference(contig: GenotypedContig) -> Iterable[GenotypedContig]: def quality(x: CigarHit): mlen = sum(1 for x in x.cigar.relax().iterate_operations() if x == CigarActions.MATCH) - return 0.9 * mlen + 0.1 * x.ref_length + return (mlen, x.ref_length) filtered = list(drop_overlapping_cigar_hits(hits, quality)) connected = list(connect_nonoverlapping_cigar_hits(filtered)) From 3bfdee02f304bbddee86c793f8d26f158eddc102 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 14:43:11 -0700 Subject: [PATCH 474/485] Install codecov as dev dependency --- .github/workflows/build-and-test.yml | 1 - pyproject.toml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 189e1723d..198702d9a 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -73,7 +73,6 @@ jobs: - name: Report code coverage run: | rm -rf micall/tests/microtest/scratch - python -m pip install codecov codecov docker-test: diff --git a/pyproject.toml b/pyproject.toml index e1c62b120..dbcf47020 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ dev = [ "mypy==1.13.0", "mypy-extensions==1.0.0", "gprof2dot==2024.6.6", + "codecov==2.1.13", # For reporting the code coverage. 
] watcher = [ # Requirements for running the MISEQ_MONITOR.py script From 7866287a52749f19aaae84b4ec9091480b33ee7f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 14:45:21 -0700 Subject: [PATCH 475/485] Split denovo dependencies in pyproject.toml --- .devcontainer/devcontainer.json | 2 +- .github/workflows/build-and-test.yml | 4 +--- Dockerfile | 2 +- Singularity | 2 +- pyproject.toml | 5 ++++- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index fbea85a74..4d7fc8ea5 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -15,7 +15,7 @@ // "forwardPorts": [], // Use 'onCreateCommand' to run commands after the container is created, but before anything else starts. - "onCreateCommand": "python -m pip install .[dev,test]", + "onCreateCommand": "python -m pip install .[denovo,watcher,dev,test]", // Configure tool-specific properties. "customizations": { diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 198702d9a..c2d870fe0 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -53,11 +53,9 @@ jobs: echo "/opt/bowtie2" >> $GITHUB_PATH - name: Install MiCall's Python dependencies - # Have to clean up alignment build files. Otherwise, they'll get reused - # in Singularity build with the wrong GLIBC libraries. run: | python -m pip install --upgrade pip - pip install .[dev,test] + pip install .[denovo,watcher,dev,test] - name: Run type checker run: | diff --git a/Dockerfile b/Dockerfile index 75bbf4383..6c65f2eb2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -89,7 +89,7 @@ RUN pip install --upgrade pip COPY . 
/opt/micall/ -RUN pip install /opt/micall[basespace] +RUN pip install /opt/micall[denovo,basespace] RUN micall make_blast_db ## Trigger matplotlib to build its font cache diff --git a/Singularity b/Singularity index 41ec83e93..ef346e276 100644 --- a/Singularity +++ b/Singularity @@ -96,7 +96,7 @@ From: python:3.11 apt-get install -q -y libcairo2-dev # Install micall main executable. pip install --upgrade pip - pip install /opt/micall + pip install /opt/micall[denovo] micall make_blast_db # Also trigger matplotlib to build its font cache. python -c 'import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot' diff --git a/pyproject.toml b/pyproject.toml index dbcf47020..e335ac6d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ classifiers = [ dependencies = [ # Requirements for running the bare pipeline under Kive. "pytz==2024.2", - "iva @ git+https://github.com/cfe-lab/iva.git@v1.1.1", "biopython==1.84", "gotoh @ git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh", "pyvdrm @ git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2", @@ -66,6 +65,10 @@ dev = [ "gprof2dot==2024.6.6", "codecov==2.1.13", # For reporting the code coverage. ] +denovo = [ + # Requirements for running De-Novo pipeline (only problematic ones). + "iva @ git+https://github.com/cfe-lab/iva.git@v1.1.1", +] watcher = [ # Requirements for running the MISEQ_MONITOR.py script "kiveapi @ git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api", From 5f17f3afd78ef4f68dad5d62864768a75bdbdc6c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 15:03:01 -0700 Subject: [PATCH 476/485] Change default log file path to `/tmp/micall.log` This can help avoid permissions issues during testing. 
--- micall/utils/micall_logging_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/micall/utils/micall_logging_config.py b/micall/utils/micall_logging_config.py index 0e67e7897..fce41308f 100644 --- a/micall/utils/micall_logging_config.py +++ b/micall/utils/micall_logging_config.py @@ -13,7 +13,7 @@ # Production server probably needs /var/log/micall/micall.log # Don't forget to create the folder and change owner to micall. -LOG_FILE = 'micall.log' +LOG_FILE = '/tmp/micall.log' LOGGING = { # This is the default logger. Probably want to switch console to mail. From ae487ed4dd69d5d3518be10dccda028cbca3d245 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 15:41:48 -0700 Subject: [PATCH 477/485] Update documentation on micall execution Now since MiCall has a main executable `micall`, some installation instructions are simpler. --- docs/admin.md | 145 +++++++++++++++++++++++++--------------- docs/design/stitcher.md | 2 +- 2 files changed, 92 insertions(+), 55 deletions(-) diff --git a/docs/admin.md b/docs/admin.md index f2bf8fb54..92637ddc8 100644 --- a/docs/admin.md +++ b/docs/admin.md @@ -61,64 +61,88 @@ If a MiCall sample finishes processing and the number of active samples dips bel that limit, MiCall Watcher looks at its list of Samples That Need Reprocessing and starts the next one, moving it from that list to Samples In Progress. +### Installing base packages ### + +MiCall is written in python, thus we need the following packages: + +```shell +apt-get install -y python3 python3-venv git # on Ubuntu & Debian. +``` + ### Installing MiCall Watcher ### + Install the MiCall source code in a shared location: - $ cd /usr/local/share - $ sudo git clone https://github.com/cfe-lab/MiCall.git +```shell +cd /usr/local/share +sudo git clone https://github.com/cfe-lab/MiCall.git +``` + +Create a Python virtual environment to run MiCall. 
+ +```shell +sudo python3 -m venv venv-micall +``` -Create a Python 3.6 virtual environment to run MiCall. +Configure micall logging, and then install micall package: - $ cd /usr/local/share - $ sudo python3.6 -m venv venv-micall - $ cd MiCall - $ sudo ../venv-micall/bin/pip install .[watcher] +```shell Copy the logging configuration if you want to change any of the settings. - $ cp micall/utils/micall_logging_config.py micall/utils/micall_logging_override.py +```shell +sudo cp micall/utils/micall_logging_config.py micall/utils/micall_logging_override.py +sudo emacs micall/utils/micall_logging_override.py +sudo venv-micall/bin/pip install ./MiCall[watcher] +``` Read the instructions in the file, and edit the override copy. If the default settings are fine, you don't need the override file. -It should be run as a service, under its own user account, so first create the -new user: - - $ sudo useradd --system micall - -A system user won't show up in the list of accounts to log in, and it won't get -a home folder. - Depending on how you configured the logging, you'll probably need to create a log folder and grant access to the micall user. - $ sudo mkdir /var/log/micall - $ sudo chown micall:micall /var/log/micall +```shell +sudo mkdir /var/log/micall +sudo chown micall:micall /var/log/micall +``` + +MiCall watcher should be run as a service, under its own user account, +so first create the new user: + +```shell +sudo useradd --system micall +sudo su micall # switch to micall account. +. venv-micall/bin/activate # activate the virtual environment. +``` Test that everything is installed with the right permissions: - $ sudo su -c "/usr/local/share/venv-micall/bin/python micall/monitor/micall_watcher.py --help" micall +```shell +micall micall_watcher --help +``` -Look at the options you can give to the `micall/monitor/micall_watcher.py` script when you +Look at the options you can give to the `micall_watcher` script when you configure the service file in the next step. 
Now configure the service using a systemd [service unit] configuration. Here's an example configuration, in `/etc/systemd/system/micall_watcher.service`: - [Unit] - Description=micall_watcher +```toml +[Unit] +Description=micall_watcher - [Service] - ExecStart=/usr/local/share/venv-micall/bin/python3 \ - /usr/local/share/MiCall/micall/monitor/micall_watcher.py - EnvironmentFile=/etc/micall/micall.conf - User=micall +[Service] +ExecStart=/usr/local/share/venv-micall/bin/micall micall_watcher +EnvironmentFile=/etc/micall/micall.conf +User=micall - # Allow the process to log its exit. - KillSignal=SIGINT +# Allow the process to log its exit. +KillSignal=SIGINT - [Install] - WantedBy=multi-user.target +[Install] +WantedBy=multi-user.target +``` Micall watcher accepts multiple settings which can be passed directly as command line arguments, or as environment variables. @@ -127,31 +151,36 @@ because the command line is visible to all users. Environment variables go in the configuration file listed in the `EnvironmentFile=` setting. In this example, it's `/etc/micall/micall.conf` - $ sudo mkdir /etc/micall - $ sudo emacs -nw /etc/micall/micall.conf - $ sudo chmod 600 /etc/micall/micall.conf +```shell +exit # logout from "micall" account. +sudo mkdir /etc/micall +sudo emacs /etc/micall/micall.conf +sudo chmod 600 /etc/micall/micall.conf +``` Make sure you reduce the read permissions on the `.conf` file so other users can't read it. The environment variable names are the same as the command options, but they add a `MICALL_` prefix, if it's not already there. -To list all the available options, run `python3 micall/monitor/micall_watcher.py --help`. +To list all the available options, run `micall micall_watcher --help`. 
Below is the example config: - # This is an example of /etc/micall/micall.conf - # You can add comment lines that start with # - MICALL_KIVE_SERVER=https://kive.example.com - MICALL_KIVE_USER=kiveusername - MICALL_KIVE_PASSWORD=kivepassword +```shell +# This is an example of /etc/micall/micall.conf +# You can add comment lines that start with # +MICALL_KIVE_SERVER=https://kive.example.com +MICALL_KIVE_USER=kiveusername +MICALL_KIVE_PASSWORD=kivepassword - MICALL_QAI_SERVER=https://qai.example.com - MICALL_QAI_USER=qaiuser - MICALL_QAI_PASSWORD=qaipassword +MICALL_QAI_SERVER=https://qai.example.com +MICALL_QAI_USER=qaiuser +MICALL_QAI_PASSWORD=qaipassword - MICALL_RAW_DATA=/data/raw +MICALL_RAW_DATA=/data/raw - MICALL_MAIN_PIPELINE_ID=100 - MICALL_FILTER_QUALITY_PIPELINE_ID=101 - MICALL_RESISTANCE_PIPELINE_ID=102 +MICALL_MAIN_PIPELINE_ID=100 +MICALL_FILTER_QUALITY_PIPELINE_ID=101 +MICALL_RESISTANCE_PIPELINE_ID=102 +``` Don't put the environment variables directly in the `.service` file, because its contents are visible to all users with `systemctl show micall_watcher`. @@ -160,10 +189,12 @@ Once you write the configuration file, you have to enable and start the service. From then on, it will start automatically when the server boots up. - $ sudo systemctl daemon-reload - $ sudo systemctl enable micall_watcher - $ sudo systemctl start micall_watcher - $ sudo systemctl status micall_watcher +```shell +sudo systemctl daemon-reload +sudo systemctl enable micall_watcher +sudo systemctl start micall_watcher +sudo systemctl status micall_watcher +``` If the service fails to start, look for detailed messages in the log file, in `/var/log/syslog`, or in `/var/log/messages`. 
@@ -173,7 +204,9 @@ If the service fails to start, look for detailed messages in the log file, in ### Restarting the MiCall Watcher ### If you installed it as a service as described above, then it's easy: - sudo systemctl restart micall_watcher +```shell +sudo systemctl restart micall_watcher +``` Don't launch the `micall/monitor/micall_watcher.py` script on its own, or the service won't know that it's running. That can end up running two copies of the watcher @@ -221,9 +254,13 @@ in a run under `Data/Intensities/BaseCalls/L001/*/*.bcl`. You can see how much space they take within a run folder: - find -name "*.bcl" -print0 | du -ch --files0-from - +```shell +find -name "*.bcl" -print0 | du -ch --files0-from - +``` We usually keep the last year's worth of BCL files around, so to delete all the BCL files from before May 2022, we ran this command in the runs folder: - find */Data/Intensities/BaseCalls/L001 -name "*.bcl" -not -newer 220527_M04401_0226_000000000-K5YRD/SampleSheet.csv -print -delete +```shell +find */Data/Intensities/BaseCalls/L001 -name "*.bcl" -not -newer 220527_M04401_0226_000000000-K5YRD/SampleSheet.csv -print -delete +``` diff --git a/docs/design/stitcher.md b/docs/design/stitcher.md index 265cad691..52e14e930 100644 --- a/docs/design/stitcher.md +++ b/docs/design/stitcher.md @@ -27,7 +27,7 @@ stitcher module can be run as a CLI script, separately from the rest of the pipeline. 
The following command runs the Stitcher: ```sh -PYTHONPATH="/path/to/micall/repository" python3 -m micall.core.contig_stitcher --help +micall contig_stitcher --help ``` From 2b20429bd33dfa58e6deca7be69c82fe9b00814e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Mon, 28 Oct 2024 15:43:10 -0700 Subject: [PATCH 478/485] Remove python version check in pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e335ac6d8..cf471b0b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ authors = [ license = {text = "AGPL-3.0"} license-files = { paths = ["LICENSE.txt"] } readme = "README.md" -requires-python = ">=3.11,<3.12" classifiers = [ "License :: OSI Approved :: GNU Affero General Public License v3", "Programming Language :: Python", From 768a29e96e6e983e9eecc970ffa2bbf14fde8d29 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Oct 2024 07:33:14 +0000 Subject: [PATCH 479/485] Bump aligntools from 1.2.0 to 1.2.1 Bumps [aligntools](https://github.com/cfe-lab/aligntools) from 1.2.0 to 1.2.1. - [Release notes](https://github.com/cfe-lab/aligntools/releases) - [Commits](https://github.com/cfe-lab/aligntools/compare/v1.2.0...v1.2.1) --- updated-dependencies: - dependency-name: aligntools dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cf471b0b3..c1f097493 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "mappy==2.28", "drawsvg==2.4.0", "cairosvg==2.7.1", - "aligntools==1.2.0", + "aligntools==1.2.1", ] [project.optional-dependencies] From 0c4606b7a320af4e86d0eeb0468b76bfde36611b Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 29 Oct 2024 14:06:03 -0700 Subject: [PATCH 480/485] Update CONTRIBUTING.md basespace instructions --- CONTRIBUTING.md | 140 ++++++++++++++++++++++++++++-------------------- 1 file changed, 81 insertions(+), 59 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d3e81efc..93957ac5f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -41,36 +41,26 @@ in our [milestones]. [milestones]: https://github.com/cfe-lab/MiCall/milestones ### Python + Check that Python is already installed. - python --version +```shell +python --version +``` We have tested with Python `3.11`. ### BaseSpace -Set up the [native apps virtual machine][bsvm], and configure a shared folder -called MiCall that points to the source code. To get the shared folder working, -you'll probably need to [update the VBox guest additions][vbox guest] and add the -basespace user to the `vboxsf` group. Then, run - - sudo mount -t vboxsf MiCall /media/sf_MiCall -Make sure you have a developer account on illumina.com. The first time you run -this, you will have to log in to your account using - - sudo docker login docker.illumina.com +Set up the [native apps virtual machine][bsvm]. Make sure you have a +developer account on illumina.com. The first time you run this, you +will have to log in to your account using -Use the `micall/utils/docker_build.py` script to build a Docker image and push it to -BaseSpace. If you add `-t vX.Y`, it will add a tag to the Docker image. 
If you -add `-a <agent id>`, it will launch the spacedock tool to process samples as a -local agent. You can also set the `BASESPACE_AGENT_ID` environment variable so -you don't have to supply it every time. You can get the agent id from the Form -Builder page on BaseSpace. - - sudo python3 /media/sf_MiCall/micall/utils/docker_build.py -a abcde12345 +```shell +sudo docker login docker.illumina.com +``` [bsvm]: https://developer.basespace.illumina.com/docs/content/documentation/native-apps/setup-dev-environment -[vbox guest]: https://linuxize.com/post/how-to-install-virtualbox-guest-additions-in-ubuntu/ ### Test data If you want to run `micall/monitor/micall_watcher.py`, you have to set up data folders for raw @@ -105,7 +95,9 @@ When you don't understand the pipeline's output, it can be helpful to look at the raw reads in a sequence viewer like [Tablet][tablet]. Run the `micall_docker` script on a run folder or a single sample, like this: - python micall/utils/micall_docker.py folder --debug_remap --all_projects --keep_scratch /path/to/run +```shell +python micall/utils/micall_docker.py folder --debug_remap --all_projects --keep_scratch /path/to/run +``` The options tell it to write the debug files, use all projects, and save the scratch folder that holds all the debug files. Look through the scratch folders @@ -131,18 +123,24 @@ out the web site locally before publishing it. 1. Install Ruby 2.6, preferably with [Ruby Version Manager]. - rvm install 2.6 - rvm use 2.6 + ```shell + rvm install 2.6 + rvm use 2.6 + ``` 2. Install the gems for the web site. - cd MiCall/docs - gem install bundler - bundle install + ```shell + cd MiCall/docs + gem install bundler + bundle install + ``` 3. Serve the web site. - bundle exec jekyll serve + ```shell + bundle exec jekyll serve + ``` What changes might you want to make? 
The web site is based on the [Bulma Clean Theme], so read through the documentation there to see if it @@ -182,9 +180,9 @@ similar steps to setting up a development workstation. Follow these steps: 5. Check the history of the `micall.alignment` folder. If it has changed since the last release, then update the version number in `setup.py`. 5. Update the change notes in the Singularity file, and commit those changes. -6. [Create a release][release] on Github. Use "vX.Y" as the tag, where X.Y +6. [Create a release][release] on Github. Use "vX.Y.Z" as the tag, where X.Y matches the version you used in QAI. If you have to redo - a release, you can create additional releases with tags vX.Y.1, vX.Y.2, and + a release, you can create additional releases with tags vX.Y.Z.1, vX.Y.Z.2, and so on. Mark the release as pre-release until you finish deploying it. 7. Rebuild the Singularity image, and upload it to your local Kive server. Process the microtest data. @@ -205,16 +203,20 @@ similar steps to setting up a development workstation. Follow these steps: 8. Stop the micall_watcher service on the main Kive server after you check that it's not processing any important runs. - ssh user@server - tail /var/log/micall/micall.log - sudo systemctl stop micall_watcher + ```shell + ssh user@server + tail /var/log/micall/micall.log + sudo systemctl stop micall_watcher + ``` 9. Get the code from Github into the server's environment. - ssh user@server - cd /usr/local/share/MiCall - git fetch - git checkout tags/vX.Y + ```shell + ssh user@server + cd /usr/local/share/MiCall + git fetch + git checkout tags/vX.Y.Z + ``` 10. Look for changes in [`micall/monitor/micall_watcher.py`'s `parse_args()` function][parse_args]. Either look at the blame annotations at the link above, or review the @@ -224,30 +226,45 @@ similar steps to setting up a development workstation. Follow these steps: 11. Update the container app ids and pipeline version number in `/etc/systemd/system/micall_watcher.service`. 
If you change the configuration, reload it: - sudo systemctl daemon-reload + ```shell + sudo systemctl daemon-reload + ``` 12. Check that the kiveapi package is the same version you tested with. If not, do a Kive release first. - cd /usr/local/share/Kive - /usr/local/share/venv-micall/bin/pip show kiveapi - cat api/setup.py + ```shell + cd /usr/local/share/Kive + /usr/local/share/venv-micall/bin/pip show kiveapi + cat api/setup.py + ``` 13. Start the micall_watcher service, and tail the log to see that it begins processing all the runs with the new version of the pipeline. - sudo systemctl start micall_watcher - sudo systemctl status micall_watcher - tail -f /var/log/micall/micall.log + ```shell + sudo systemctl start micall_watcher + sudo systemctl status micall_watcher + tail -f /var/log/micall/micall.log + ``` If the log doesn't help, look in `/var/log/messages` on CentOS or `/var/log/syslog` on Ubuntu. -14. Launch the basespace virtual machine (see BaseSpace section above), and build - a new Docker image from GitHub. Tag it with the release number. +14. Launch the basespace virtual machine (see BaseSpace section above) and copy + MiCall source files into it. The easiest way to copy is via scp: + + ```shell + scp -r -P 2222 /path/to/micall/on/host basespace@localhost:MiCall + # (password is "basespace") + ``` + + Then log in to the virtual machine and build the docker image: - cd /media/sf_micall - sudo python3 micall/utils/docker_build.py -t vX.Y --nopush + ```shell + ssh basespace@localhost -p2222 # (password is "basespace") + sudo python3 MiCall/micall/utils/docker_build.py -t vX.Y.Z --nopush + ``` The script is able to push the docker image to the illumina repo and launch spacedock as well, but that is currently broken because of the old docker version @@ -264,25 +281,30 @@ similar steps to setting up a development workstation. 
Follow these steps: is denied), try `docker logout` and `docker login` again, and make sure you are on the owner team of cfelab on [docker hub]. - ssh basespace@localhost -p2222 - sudo su - cd /media/sf_micall - sudo docker save cfelab/micall:vX.Y >micall-vX.Y.tar - exit (twice) - sudo docker load <micall-vX.Y.tar + ```shell + ssh basespace@localhost -p2222 # password is "basespace" + sudo su + cd /media/sf_micall + sudo docker save cfelab/micall:vX.Y.Z > micall-vX.Y.Z.tar + exit # Exit the root shell. + exit # Exit the virtual machine. + sudo docker load < micall-vX.Y.Z.tar + sudo docker login docker.illumina.com + sudo docker tag docker.illumina.com/cfe_lab/micall:vX.Y.Z cfelab/micall:vX.Y.Z + sudo docker push docker.illumina.com/cfe_lab/micall:vX.Y.Z + rm micall-vX.Y.Z.tar + ``` 16. Duplicate the MiCall form in the revisions section of the form builder, then - edit the `callbacks.js` in the form builder itself, and add the `:vX.Y` tag to the + edit the `callbacks.js` in the form builder itself, and add the `:vX.Y.Z` tag to the `containerImageId` field. In My Apps, create a new version of the App with the new version number. Record the new agent ID (click the arrow on the bottom right of the form builder). 17. Launch the spacedock version by running this in your basespace VM: - sudo spacedock -a [agent ID] -m https://mission.basespace.illumina.com + ```shell + sudo spacedock -a [agent ID] -m https://mission.basespace.illumina.com + ``` 18. Check that the new MiCall version works as expected by processing some of the microtests in BaseSpace. From b36113bf7f20955daf1911cd05f1aaee2af16300 Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Tue, 29 Oct 2024 14:31:46 -0700 Subject: [PATCH 481/485] Cleanup whitespace in CONTRIBUTING.md --- CONTRIBUTING.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 93957ac5f..35b1ed174 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -253,7 +253,7 @@ similar steps to setting up a development workstation. Follow these steps: 14. 
Launch the basespace virtual machine (see BaseSpace section above) and copy MiCall source files into it. The easiest way to copy is via scp: - + ```shell scp -P 2222 /path/to/micall/on/host basespace@localhost:MiCall # (password is "basespace") @@ -266,8 +266,8 @@ similar steps to setting up a development workstation. Follow these steps: sudo python3 MiCall/micall/utils/docker_build.py -t vX.Y.Z --nopush ``` - The script is able to push the docker image to the illumina repo and launch - spacedock as well, but that is currently broken because of the old docker version + The script is able to push the docker image to the illumina repo and launch + spacedock as well, but that is currently broken because of the old docker version in the VM. If this is ever updated, or we build our own VM, you won't have to do these steps manually anymore and can remove the `--nopush`. @@ -275,7 +275,7 @@ similar steps to setting up a development workstation. Follow these steps: Unfortunately, the old version of docker that comes with the basespace virtual machine [can't log in] to docker hub or illumina, so you'll have to save it to a tar file and - load that into your host system's version of docker. Before pushing it anywhere, + load that into your host system's version of docker. Before pushing it anywhere, check that the docker image works by running the microtests. If the docker push fails with mysterious error messages (access to the resource is denied), try `docker logout` and `docker login` again, and make sure you are @@ -297,8 +297,8 @@ similar steps to setting up a development workstation. Follow these steps: 16. Duplicate the MiCall form in the revisions section of the form builder, then edit the `callbacks.js` in the form builder itself, and add the `:vX.Y.Z` tag to the - `containerImageId` field. In My Apps, create a new version of the App with the new - version number. Record the new agent ID (click the arrow on the bottom right of the + `containerImageId` field. 
In My Apps, create a new version of the App with the new + version number. Record the new agent ID (click the arrow on the bottom right of the form builder). 17. Launch the spacedock version by running this in your basespace VM: From 84cd63b1a6c9e33024a72d550c35f641ba47366c Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 31 Oct 2024 15:35:58 -0700 Subject: [PATCH 482/485] Fix type error in micall_docker.py --- micall/utils/micall_docker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/micall/utils/micall_docker.py b/micall/utils/micall_docker.py index ce44356d4..c13383fd1 100644 --- a/micall/utils/micall_docker.py +++ b/micall/utils/micall_docker.py @@ -591,6 +591,7 @@ def process_run(run_info, args): run_summary = summarize_run(run_info) def runner(func, inputs): + inputs = list(inputs) if args.max_active > 1 and len(inputs) > 1: with ProcessPoolExecutor(max_workers=args.max_active) as pool: list(pool.map(func, inputs)) From 46b86f90876a3a5dd2a8833899684173534be27f Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Thu, 31 Oct 2024 16:10:42 -0700 Subject: [PATCH 483/485] Fix documentation error in CONTRIBUTING.md --- CONTRIBUTING.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 35b1ed174..7c190d791 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -284,7 +284,6 @@ similar steps to setting up a development workstation. Follow these steps: ```shell ssh basespace@localhost -p2222 # password is "basespace" sudo su - cd /media/sf_micall sudo docker save cfelab/micall:vX.Y.Z > micall-vX.Y.Z.tar exit # Exit the root shell. exit # Exit the virtual machine. From 646d9967f7f5b2772579c152d20ec7eb38712231 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 07:35:03 +0000 Subject: [PATCH 484/485] Bump ruff from 0.7.1 to 0.7.2 Bumps [ruff](https://github.com/astral-sh/ruff) from 0.7.1 to 0.7.2. 
- [Release notes](https://github.com/astral-sh/ruff/releases) - [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md) - [Commits](https://github.com/astral-sh/ruff/compare/0.7.1...0.7.2) --- updated-dependencies: - dependency-name: ruff dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c1f097493..7df208bc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ test = [ ] dev = [ # Dependencies required for development (linting, type checking, etc.) - "ruff==0.7.1", + "ruff==0.7.2", "mypy==1.13.0", "mypy-extensions==1.0.0", "gprof2dot==2024.6.6", From 9285d643f98e5a21fe65843cec1ef736256170a1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 07:35:17 +0000 Subject: [PATCH 485/485] Bump numpy from 2.1.2 to 2.1.3 Bumps [numpy](https://github.com/numpy/numpy) from 2.1.2 to 2.1.3. - [Release notes](https://github.com/numpy/numpy/releases) - [Changelog](https://github.com/numpy/numpy/blob/main/doc/RELEASE_WALKTHROUGH.rst) - [Commits](https://github.com/numpy/numpy/compare/v2.1.2...v2.1.3) --- updated-dependencies: - dependency-name: numpy dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c1f097493..63990cc6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "biopython==1.84", "gotoh @ git+https://github.com/cfe-lab/gotoh.git@v0.3.0#egg=gotoh&subdirectory=alignment/gotoh", "pyvdrm @ git+https://github.com/cfe-lab/pyvdrm.git@v0.3.2", - "numpy==2.1.2", + "numpy==2.1.3", "scipy==1.14.1", "matplotlib==3.9.2", "cutadapt==4.9",