diff --git a/README.md b/README.md index b0fd415..bb2f149 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Besides python packages dependencies, Meteor requires: - python 3.10+ - [bowtie2 2.3.5+](https://github.com/BenLangmead/bowtie2) - [freebayes 1.3.6+](https://github.com/freebayes/freebayes) +- [mafft 7.407+](https://mafft.cbrc.jp/alignment/software/) ## Installation diff --git a/conda_recipe/meta.yaml b/conda_recipe/meta.yaml index cc5b73d..8ece951 100644 --- a/conda_recipe/meta.yaml +++ b/conda_recipe/meta.yaml @@ -39,11 +39,13 @@ requirements: - biom-format - py-bgzip - freebayes >=1.3.6 + - mafft >=7.407 test: commands: - meteor -h - bowtie2 -h - freebayes -h + - mafft -h imports: - pysam - pandas diff --git a/meteor/phylogeny.py b/meteor/phylogeny.py index 7cb75c6..2bd59aa 100644 --- a/meteor/phylogeny.py +++ b/meteor/phylogeny.py @@ -13,6 +13,9 @@ """Effective phylogeny""" import re import logging +import sys +from subprocess import run, Popen, PIPE +from packaging.version import parse # import sys from dataclasses import dataclass, field @@ -28,7 +31,7 @@ from collections import OrderedDict from datetime import datetime from typing import Iterable, Tuple -from cogent3 import load_unaligned_seqs, make_aligned_seqs +from cogent3 import load_aligned_seqs from cogent3.evolve.distance import EstimateDistances from cogent3.evolve.models import GTR from cogent3.cluster.UPGMA import upgma @@ -98,7 +101,7 @@ def set_tree_config(self): config = { "meteor_version": self.meteor.version, "phylogeny": { - "phylogeny_tool": "cogent3", + "" "phylogeny_tool": "cogent3", "phylogeny_date": datetime.now().strftime("%Y-%m-%d"), "tree_files": ",".join([tree.name for tree in self.tree_files]), }, @@ -122,48 +125,80 @@ def process_msp_file( msp_file.name.replace(".fasta", ""), ) tree_file = tree_dir / f"{msp_file.stem}.tree" + ali_file = tree_dir / f"{msp_file.stem}_aligned.fasta" + self.tree_files: list[Path] = [] - with NamedTemporaryFile(mode="wt", dir=tmp_dir, suffix=".fasta") as temp_clean: - # Clean sites - logging.info("Clean sites for %s", msp_file.name) - _, info_sites = self.clean_sites(msp_file, temp_clean) - - if info_sites < self.min_info_sites: - logging.info( - "Only %d informative sites (< %d threshold) left after cleaning, skipping %s.", - info_sites, - self.min_info_sites, - msp_file.name.replace(".fasta", ""), - ) - return tree_file, False # Return False to indicate skipping - - # Perform alignments and UPGMA - logging.info("Running UPGMA and Distance Estimation") - aligned_seqs = make_aligned_seqs( - load_unaligned_seqs(temp_clean.name, moltype="dna"), - moltype="dna", - array_align=True, - ) - d = EstimateDistances(aligned_seqs, submodel=GTR()) - d.run(show_progress=False) - - # Create UPGMA Tree - mycluster = upgma(d.get_pairwise_distances()) - mycluster = mycluster.unrooted_deepcopy() + with NamedTemporaryFile( + suffix=".fasta", dir=tmp_dir, delete=True + ) as temp_ali_file: + # Start alignment + with Popen( + [ + "mafft", + "--thread", + str(2), + "--quiet", + str(msp_file.resolve()), + ], + stdout=temp_ali_file, # Redirect stdout to the temp file + stderr=PIPE, + ) as align_exec: + _, error = align_exec.communicate() + if align_exec.returncode != 0: + raise RuntimeError(f"MAFFT failed with error: {error}") + logging.info("Clean sites for %s", msp_file.name) + with ali_file.open("w") as aligned_seq: + _, info_sites = self.clean_sites( + Path(temp_ali_file.name), aligned_seq + ) - with tree_file.open("w") as f: - f.write( - self.remove_edge_labels(mycluster.get_newick(with_distances=True)) - ) + if info_sites < self.min_info_sites: + logging.info( + "Only %d informative sites (< %d threshold) left after cleaning, skipping %s.", + info_sites, + self.min_info_sites, + msp_file.name.replace(".fasta", ""), + ) + return tree_file, False # Return False to indicate skipping + cleaned_alignment = load_aligned_seqs(ali_file, moltype="dna") + d = EstimateDistances(cleaned_alignment, submodel=GTR()) + d.run(show_progress=False) + + # Create UPGMA Tree + mycluster = upgma(d.get_pairwise_distances()) + mycluster = mycluster.unrooted_deepcopy() + + with tree_file.open("w") as f: + f.write( + self.remove_edge_labels( + mycluster.get_newick(with_distances=True) + ) + ) + # Perform alignments and UPGMA + logging.info("Align sequences") - return tree_file, tree_file.exists() + return tree_file, tree_file.exists() def execute(self) -> None: logging.info("Launch phylogeny analysis") start = perf_counter() - - self.tree_files: list[Path] = [] msp_count = len(self.msp_file_list) + # Check the mafft version + mafft_exec = run(["mafft", "--version"], check=False, capture_output=True) + if mafft_exec.returncode != 0: + logging.error( + "Checking mafft version failed:\n%s", + mafft_exec.stderr.decode("utf-8"), + ) + sys.exit(1) + mafft_version = str(mafft_exec.stderr.decode("utf-8")).split(" ")[0][1:] + if parse(mafft_version) < self.meteor.MIN_MAFFT_VERSION: + logging.error( + "The mafft version %s is outdated for meteor. Please update mafft to >= %s.", + mafft_version, + self.meteor.MIN_MAFFT_VERSION, + ) + sys.exit(1) # Using ProcessPoolExecutor to parallelize the MSP file processing with ProcessPoolExecutor(max_workers=self.meteor.threads) as executor: futures = { diff --git a/meteor/session.py b/meteor/session.py index 45c56f3..4947a1a 100644 --- a/meteor/session.py +++ b/meteor/session.py @@ -30,6 +30,7 @@ class Component: MIN_BOWTIE2_VERSION: ClassVar[Version] = Version("2.3.5") MIN_FREEBAYES_VERSION: ClassVar[Version] = Version("1.3.6") + MIN_MAFFT_VERSION: ClassVar[Version] = Version("7.407") DEFAULT_GAP_CHAR: ClassVar[str] = "N" DEFAULT_CORE_SIZE: ClassVar[int] = 100 diff --git a/pyproject.toml b/pyproject.toml index 193b408..100e446 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "meteor" -version = "2.0.16" +version = "2.0.17" description = "Meteor - A plateform for quantitative metagenomic profiling of complex ecosystems" authors = ["Amine Ghozlane ", "Florence Thirion ", "Florian Plaza-OƱate "] license = "GPL-3.0-or-later"