From e229317c8ee7a34366ff240b69ba82ab31bacc5e Mon Sep 17 00:00:00 2001 From: Vitaliy Mysak Date: Wed, 6 Nov 2024 14:16:42 -0800 Subject: [PATCH] Replace IVA by Haploflow This change does not handle `merged_contigs_csv`. --- .github/workflows/build-and-test.yml | 29 +++++---------- Dockerfile | 37 +++++-------------- README.md | 2 +- Singularity | 35 ++++-------------- micall/core/denovo.py | 53 ++++++++++++++++------------ pyproject.toml | 5 +-- 6 files changed, 58 insertions(+), 103 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index c2d870fe0..30fb68015 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -15,27 +15,16 @@ jobs: - name: Run apt update run: sudo apt-get update - - name: Install IVA assembler dependencies + - name: Install Haploflow run: | - sudo apt-get install -qq zlib1g-dev libncurses5-dev libncursesw5-dev mummer ncbi-blast+ - cd ~/bin - wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc - wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc_dump - # Server doesn't support HTTPS, so check for changed files. - echo "\ - db1935884aec2d23d4d623ff85eb4eae8d7a946c9ee0c33ea1818215c40d3099 kmc - 34a97db2dab5fdae0276d2589c940142813e9cd87ae10e5e2dd37ed3545b4436 kmc_dump" | sha256sum --check - chmod +x kmc kmc_dump - wget -q https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 - tar -xf samtools-1.3.1.tar.bz2 --no-same-owner --bzip2 - cd samtools-1.3.1 - ./configure --prefix=$HOME - make - make install - cd ~ - wget -q https://downloads.sourceforge.net/project/smalt/smalt-0.7.6-bin.tar.gz - tar -xzf smalt-0.7.6-bin.tar.gz - ln -s ~/smalt-0.7.6-bin/smalt_x86_64 ~/bin/smalt + sudo apt-get update + sudo apt-get install -y build-essential git ronn + cd /opt/ + git clone https://github.com/hzi-bifo/Haploflow + cd Haploflow + git checkout 9a5a0ff6c3a0435e723e41f98fe82ec2ad19cf50 + sh build.sh + sudo ln -s /opt/Haploflow/build/haploflow ~/bin/haploflow - name: Install Rust and merge-mates run: | diff --git a/Dockerfile b/Dockerfile index 6c65f2eb2..b2edb1e77 100644 --- a/Dockerfile +++ b/Dockerfile @@ -54,34 +54,15 @@ RUN wget -q -O bowtie2.zip https://github.com/BenLangmead/bowtie2/releases/downl ENV PATH $PATH:/opt/bowtie2 -## Installing IVA dependencies -RUN apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev && \ - cd /bin && \ - wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc && \ - wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc_dump && \ - chmod +x kmc kmc_dump && \ - cd /opt && \ - wget -q https://sourceforge.net/projects/mummer/files/mummer/3.23/MUMmer3.23.tar.gz && \ - tar -xzf MUMmer3.23.tar.gz --no-same-owner && \ - cd MUMmer3.23 && \ - make --quiet install && \ - rm -r docs src ../MUMmer3.23.tar.gz && \ - ln -s /opt/MUMmer3.23/nucmer \ - /opt/MUMmer3.23/delta-filter \ - /opt/MUMmer3.23/show-coords \ - /bin && \ - cd /opt && \ - wget -q https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 && \ - tar -xf samtools-1.3.1.tar.bz2 --no-same-owner --bzip2 && \ - cd samtools-1.3.1 && \ - ./configure --quiet --prefix=/ && \ - make --quiet && \ - make --quiet install && \ - cd /opt && \ - rm -rf samtools-1.3.1* && \ - wget -q http://downloads.sourceforge.net/project/smalt/smalt-0.7.6-bin.tar.gz && \ - tar -xzf smalt-0.7.6-bin.tar.gz --no-same-owner && \ - ln -s /opt/smalt-0.7.6-bin/smalt_x86_64 /bin/smalt +## Install Haploflow +RUN apt-get update && \ + apt-get install -y build-essential sudo git ronn cmake && \ + cd /opt/ && \ + git clone https://github.com/hzi-bifo/Haploflow && \ + cd Haploflow && \ + git checkout 9a5a0ff6c3a0435e723e41f98fe82ec2ad19cf50 && \ + yes | sh build.sh && \ + ln -s /opt/Haploflow/build/haploflow /bin/haploflow ## Install dependencies for genetracks/drawsvg RUN apt-get install -q -y libcairo2-dev diff --git a/README.md b/README.md index a928002f5..2482dcc1e 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ Requests is distributed under the Apache 2.0 license. Python 3 is distributed under the [Python 3 license][python]. -Bowtie2, IVA, and Python-Levenshtein are distributed under the GNU General +Bowtie2, Haploflow, and Python-Levenshtein are distributed under the GNU General Public License (GPL). Matplotlib is distributed under the [Matplotlib license][matplotlib]. diff --git a/Singularity b/Singularity index ef346e276..6db037015 100644 --- a/Singularity +++ b/Singularity @@ -62,34 +62,13 @@ From: python:3.11 ln -s /opt/bowtie2-2.2.8/ /opt/bowtie2 rm bowtie2.zip - echo ===== Installing IVA dependencies ===== >/dev/null - apt-get install -q -y zlib1g-dev libncurses5-dev libncursesw5-dev - cd /bin - wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc - wget -q http://sun.aei.polsl.pl/kmc/download-2.1.1/linux/kmc_dump - chmod +x kmc kmc_dump - cd /opt - wget -q https://sourceforge.net/projects/mummer/files/mummer/3.23/MUMmer3.23.tar.gz - tar -xzf MUMmer3.23.tar.gz --no-same-owner - cd MUMmer3.23 - make --quiet install - rm -r docs src ../MUMmer3.23.tar.gz - ln -s /opt/MUMmer3.23/nucmer \ - /opt/MUMmer3.23/delta-filter \ - /opt/MUMmer3.23/show-coords \ - /bin - cd /opt - wget -q https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 - tar -xf samtools-1.3.1.tar.bz2 --no-same-owner --bzip2 - cd samtools-1.3.1 - ./configure --quiet --prefix=/ - make --quiet - make --quiet install - cd /opt - rm -rf samtools-1.3.1* - wget -q http://downloads.sourceforge.net/project/smalt/smalt-0.7.6-bin.tar.gz - tar -xzf smalt-0.7.6-bin.tar.gz --no-same-owner - ln -s /opt/smalt-0.7.6-bin/smalt_x86_64 /bin/smalt + echo ===== Installing Haploflow ===== >/dev/null + apt-get install -q -y libboost-all-dev build-essential sudo git ronn cmake + cd /opt/ + git clone https://github.com/hzi-bifo/Haploflow + cd Haploflow + git checkout 9a5a0ff6c3a0435e723e41f98fe82ec2ad19cf50 + yes | sh build.sh echo ===== Installing Python packages ===== >/dev/null # Install dependencies for genetracks/drawsvg diff --git a/micall/core/denovo.py b/micall/core/denovo.py index 5e32f6acb..693b41bbd 100644 --- a/micall/core/denovo.py +++ b/micall/core/denovo.py @@ -2,7 +2,6 @@ import logging import os from typing import Optional, TextIO, cast, BinaryIO -from csv import DictReader from datetime import datetime from glob import glob from shutil import rmtree, copyfileobj @@ -10,12 +9,8 @@ import subprocess from tempfile import mkdtemp -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord - -IVA = "iva" +HAPLOFLOW = "haploflow" logger = logging.getLogger(__name__) @@ -40,6 +35,10 @@ def denovo(fastq1_path: str, amplicon reads """ + if merged_contigs_csv is not None: + # TODO: implement this. + logger.error("Haploflow implementation does not support contig extensions yet.") + old_tmp_dirs = glob(os.path.join(work_dir, 'assembly_*')) for old_tmp_dir in old_tmp_dirs: rmtree(old_tmp_dir, ignore_errors=True) @@ -55,26 +54,36 @@ def denovo(fastq1_path: str, '--interleave', '-o', joined_path], check=True) - iva_out_path = os.path.join(tmp_dir, 'iva_out') - contigs_fasta_path = os.path.join(iva_out_path, 'contigs.fasta') - iva_args = [IVA, '--fr', joined_path, '-t', '2'] - if merged_contigs_csv is not None: - seeds_fasta_path = os.path.join(tmp_dir, 'seeds.fasta') - with open(seeds_fasta_path, 'w') as seeds_fasta: - SeqIO.write((SeqRecord(Seq(row['contig']), f'seed-{i}', '', '') - for i, row in enumerate(DictReader(merged_contigs_csv))), - seeds_fasta, - 'fasta') - seeds_size = seeds_fasta.tell() - if seeds_size > 0: - iva_args.extend(['--contigs', seeds_fasta_path, '--make_new_seeds']) - iva_args.append(iva_out_path) + + haplo_args = {'long': 0, + 'filter': 500, + 'thres': -1, + 'strict': 5, + 'error': 0.02, + 'kmer': 41, + 'merge': False, + 'scaffold': False, + 'patch': False, + 'ref': None, + 'RP': False, + } + assembly_out_path = os.path.join(tmp_dir, 'haplo_out') + contigs_fasta_path = os.path.join(assembly_out_path, 'contigs.fa') + haplo_cmd = [HAPLOFLOW, + '--read-file', joined_path, + '--out', assembly_out_path, + '--k', str(haplo_args['kmer']), + '--error-rate', str(haplo_args['error']), + '--strict', str(haplo_args['strict']), + '--filter', str(haplo_args['filter']), + '--thres', str(haplo_args['thres']), + '--long', str(haplo_args['long'])] try: - subprocess.run(iva_args, check=True, stdout=PIPE, stderr=STDOUT) + subprocess.run(haplo_cmd, check=True, stdout=PIPE, stderr=STDOUT) except CalledProcessError as ex: output = ex.output and ex.output.decode('UTF8') if output != 'Failed to make first seed. Cannot continue\n': - logger.warning('iva failed to assemble.', exc_info=True) + logger.warning('Haploflow failed to assemble.', exc_info=True) logger.warning(output) with open(contigs_fasta_path, 'a'): pass diff --git a/pyproject.toml b/pyproject.toml index a6e44d0c3..4324e3eb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,10 +64,7 @@ dev = [ "gprof2dot==2024.6.6", "codecov==2.1.13", # For reporting the code coverage. ] -denovo = [ - # Requirements for running De-Novo pipeline (only problematic ones). - "iva @ git+https://github.com/cfe-lab/iva.git@v1.1.1", -] +denovo = [] watcher = [ # Requirements for running the MISEQ_MONITOR.py script "kiveapi @ git+https://github.com/cfe-lab/Kive.git@v0.15#egg=kiveapi&subdirectory=api",