diff --git a/paper/paper.bib b/paper/paper.bib index eb23a5f..3455a0d 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -171,15 +171,20 @@ @article{hyatt2010prodigal } - -@article{metagWGS_inprep, - title={MetagWGS, a complete workflow to analyse metagenomic data (from Illumina reads or PacBio HiFi reads)}, - author={Mainguy, Jean and Vienne, Maïna and Fourquet, Joanna and Darbot, Vincent and Noirot, Céline and Castinel, Adrien and Combes, Sylvie and Gaspin, Christine and Milan, Denis and Donnadieu, Cécile and Iampietro, Carole and Bouchez, Olivier and Pascal, Géraldine and Hoede, Claire}, - journal={Journal}, - year={in preparation} - +@article {metagWGS, + author = {Mainguy, Jean and Vienne, Ma{\"i}na and Fourquet, Joanna and Darbot, Vincent and Noirot, C{\'e}line and Castinel, Adrien and Combes, Sylvie and Gaspin, Christine and Milan, Denis and Donnadieu, C{\'e}cile and Iampietro, Carole and Bouchez, Olivier and Pascal, G{\'e}raldine and Hoede, Claire}, + title = {{metagWGS}, a comprehensive workflow to analyze metagenomic data using {Illumina} or {PacBio} {HiFi} reads}, + elocation-id = {2024.09.13.612854}, + year = {2024}, + doi = {10.1101/2024.09.13.612854}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Background: To study communities of micro-organisms taxonomically and functionally, metagenomic analyses are now often used. If there is no reference gene catalogue, a de novo approach is required. Because genomes are easier to interpret than contigs, the recovery of metagenome-assembled genomes (MAGs) by binning of contigs from metagenomic data has recently become a common task for microbial studies. However, during this process, there is a significant loss of information between the assembly and the binning of contigs. This is why it is important to produce taxonomic and functional matrices for all contigs and not just those included in correct bins. 
In addition, PacBio HiFi reads (long and of good quality) are now a possible, albeit more expensive, alternative to short Illumina reads. We therefore developed a workflow that is easy to install with dependencies fixed using singularity images and easy to use on a computing cluster, that is capable of analyzing either short or long reads, and that should allow analysis at the contig and/or bin level, depending on the user{\textquoteright}s choice. Following is a presentation of metagWGS, a fully automated workflow for metagenomic data analysis. It uses a new tool for refining bins (called Binette) that we will demonstrate is more efficient than competing tools. Methods: metagWGS is a Nextflow workflow distributed with two singularity images and complete documentation to facilitate its installation and use. Because the main original features of metagWGS concern binning (short and long reads) and the analysis of HiFi reads, we compared metagWGS with the MAG construction workflow proposed by PacBio to a public dataset used by PacBio to promote its workflow. Results: metagWGS differs from existing workflows by (i) offering flexible approaches for the assembly; (ii) supporting short reads (Illumina) or PacBio HiFi reads; (iii) combining multiple binning algorithms with a new bin refinement tool, referred to as Binette, to achieve high-quality genome bins; and (iv) providing taxonomic and functional annotation for all genes, all contigs built and bins. 
metagWGS produces more medium (708) and high-quality (255) bins on 11 public metagenomic samples from human gut data than the PacBio HiFi dedicated workflow, referred to as the HiFi-MAGS-pipeline (659 medium quality bins and 231 high quality bins), primarily due to the better performance of Binette. Competing Interest Statement: The authors have declared no competing interest.}, + URL = {https://www.biorxiv.org/content/early/2024/09/18/2024.09.13.612854}, + eprint = {https://www.biorxiv.org/content/early/2024/09/18/2024.09.13.612854.full.pdf}, + journal = {bioRxiv} } + @article{gruning2018bioconda, title={Bioconda: sustainable and comprehensive software distribution for the life sciences}, author={Gr{\"u}ning, Bj{\"o}rn and Dale, Ryan and Sj{\"o}din, Andreas and Chapman, Brad A and Rowe, Jillian and Tomkins-Tinch, Christopher H and Valieris, Renan and K{\"o}ster, Johannes and Bioconda Team}, diff --git a/paper/paper.md b/paper/paper.md index 93376ff..8ae3620 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -45,7 +45,7 @@ Binette is a Python reimplementation and enhanced version of the bin refinement Bin completeness and contamination are assessed using CheckM2 [@chklovski2023checkm2]. Bins are scored using the following scoring function: $completeness - weight * contamination$, with the default weight set to 2. These scored bins are then sorted, facilitating the selection of a final new set of non-redundant bins (\autoref{fig:overview}.B). The ability to score bins is based on CheckM2 rather than CheckM1, which is what the metaWRAP pipeline uses. CheckM2 uses a novel approach to evaluate bin quality based on machine learning techniques. This approach improves speed and also provides better results than CheckM1. Binette initiates CheckM2 processing by running its initial steps once for all contigs within the input bins. These initial steps involve gene prediction using Prodigal and alignment against the CheckM2 database using Diamond [@buchfink2015diamond]. 
Binette uses Pyrodigal [@larralde2022pyrodigal], a Python module that uses Cython to provide bindings to Prodigal [@hyatt2010prodigal]. The intermediate Checkm2 results are then used to assess the quality of individual bins, eliminating redundant calculations and speeding up the refinement process. -Binette serves as the bin refinement tool within the [metagWGS](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs) metagenomic analysis pipeline [@metagWGS_inprep], providing a robust and faster alternative to the bin refinement module of the metaWRAP pipeline as well as other similar bin refinement tools. +Binette serves as the bin refinement tool within the [metagWGS](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs) metagenomic analysis pipeline [@metagWGS], providing a robust and faster alternative to the bin refinement module of the metaWRAP pipeline as well as other similar bin refinement tools. # Availability