diff --git a/.github/workflows/build_draft_pdf.yml b/.github/workflows/build_draft_pdf.yml index 98a0806..69a1689 100644 --- a/.github/workflows/build_draft_pdf.yml +++ b/.github/workflows/build_draft_pdf.yml @@ -17,7 +17,7 @@ jobs: # This should be the path to the paper within your repo. paper-path: paper/paper.md - name: Upload - uses: actions/upload-artifact@v1 + uses: actions/upload-artifact@v4 with: name: paper # This is the output path where Pandoc will write the compiled diff --git a/binette/__init__.py b/binette/__init__.py index 34c1db3..77139f6 100644 --- a/binette/__init__.py +++ b/binette/__init__.py @@ -1 +1 @@ -__version__ = '1.0.2' \ No newline at end of file +__version__ = '1.0.3' \ No newline at end of file diff --git a/paper/paper.bib b/paper/paper.bib index eb23a5f..3455a0d 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -171,15 +171,20 @@ @article{hyatt2010prodigal } - -@article{metagWGS_inprep, - title={MetagWGS, a complete workflow to analyse metagenomic data (from Illumina reads or PacBio HiFi reads)}, - author={Mainguy, Jean and Vienne, Maïna and Fourquet, Joanna and Darbot, Vincent and Noirot, Céline and Castinel, Adrien and Combes, Sylvie and Gaspin, Christine and Milan, Denis and Donnadieu, Cécile and Iampietro, Carole and Bouchez, Olivier and Pascal, Géraldine and Hoede, Claire}, - journal={Journal}, - year={in preparation} - +@article {metagWGS, + author = {Mainguy, Jean and Vienne, Ma{\"i}na and Fourquet, Joanna and Darbot, Vincent and Noirot, C{\'e}line and Castinel, Adrien and Combes, Sylvie and Gaspin, Christine and Milan, Denis and Donnadieu, C{\'e}cile and Iampietro, Carole and Bouchez, Olivier and Pascal, G{\'e}raldine and Hoede, Claire}, + title = {metagWGS, a comprehensive workflow to analyze metagenomic data using Illumina or PacBio HiFi reads}, + elocation-id = {2024.09.13.612854}, + year = {2024}, + doi = {10.1101/2024.09.13.612854}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Background: To study 
communities of micro-organisms taxonomically and functionally, metagenomic analyses are now often used. If there is no reference gene catalogue, a de novo approach is required. Because genomes are easier to interpret than contigs, the recovery of metagenome-assembled genomes (MAGs) by binning of contigs from metagenomic data has recently become a common task for microbial studies. However, during this process, there is a significant loss of information between the assembly and the binning of contigs. This is why it is important to produce taxonomic and functional matrices for all contigs and not just those included in correct bins. In addition, PacBio HiFi reads (long and of good quality) are now a possible, albeit more expensive, alternative to short Illumina reads. We therefore developed a workflow that is easy to install with dependencies fixed using singularity images and easy to use on a computing cluster, that is capable of analyzing either short or long reads, and that should allow analysis at the contig and/or bin level, depending on the user{\textquoteright}s choice. Following is a presentation of metagWGS, a fully automated workflow for metagenomic data analysis. It uses a new tool for refining bins (called Binette) that we will demonstrate is more efficient than competing tools. Methods: metagWGS is a Nextflow workflow distributed with two singularity images and complete documentation to facilitate its installation and use. Because the main original features of metagWGS concern binning (short and long reads) and the analysis of HiFi reads, we compared metagWGS with the MAG construction workflow proposed by PacBio to a public dataset used by PacBio to promote its workflow. 
Results: metagWGS differs from existing workflows by (i) offering flexible approaches for the assembly; (ii) supporting short reads (Illumina) or PacBio HiFi reads; (iii) combining multiple binning algorithms with a new bin refinement tool, referred to as Binette, to achieve high-quality genome bins; and (iv) providing taxonomic and functional annotation for all genes, all contigs built and bins. metagWGS produces more medium (708) and high-quality (255) bins on 11 public metagenomic samples from human gut data than the PacBio HiFi dedicated workflow, referred to as the HiFi-MAGS-pipeline (659 medium quality bins and 231 high quality bins), primarily due to the better performance of Binette. Competing Interest Statement: The authors have declared no competing interest.}, + URL = {https://www.biorxiv.org/content/early/2024/09/18/2024.09.13.612854}, + eprint = {https://www.biorxiv.org/content/early/2024/09/18/2024.09.13.612854.full.pdf}, + journal = {bioRxiv} } + @article{gruning2018bioconda, title={Bioconda: sustainable and comprehensive software distribution for the life sciences}, author={Gr{\"u}ning, Bj{\"o}rn and Dale, Ryan and Sj{\"o}din, Andreas and Chapman, Brad A and Rowe, Jillian and Tomkins-Tinch, Christopher H and Valieris, Renan and K{\"o}ster, Johannes and Bioconda Team}, diff --git a/paper/paper.md b/paper/paper.md index 3213cde..8ffba95 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -10,7 +10,7 @@ tags: authors: - name: Jean Mainguy orcid: 0009-0006-9160-9744 - affiliation: "1, 2" + affiliation: "1, 2, 3" - name: Claire Hoede orcid: 0000-0001-5054-7731 affiliation: "1, 2" @@ -20,6 +20,8 @@ affiliations: index: 1 - name: Université de Toulouse, INRAE, UR 875 MIAT, 31326, Castanet-Tolosan, France index: 2 + - name: LABGeM, Génomique Métabolique, Genoscope, Institut François Jacob, CEA, CNRS, Univ Evry, Université Paris-Saclay, Evry, France + index: 3 date: 30 november 2023 bibliography: paper.bib --- @@ -41,9 +43,9 @@ Binette is a Python 
reimplementation and enhanced version of the bin refinement ![**Overview of Binette Steps**. **(A) Intermediate Bin Creation Example**: Bins are represented as square shapes, each containing colored lines representing the contigs they contain. Creation of intermediate bins involves the initial bins sharing at least one contig. Set operations are applied to the contigs within the bins to generate these intermediate bins. **(B) Binette Workflow Overview**: Input bins serve as the basis for generating intermediate bins. Each bin undergoes a scoring process utilizing quality metrics provided by CheckM2. Subsequently, the bins are sorted based on their scores, and a selection process is executed to retain non-redundant bins.\label{fig:overview}](./binette_overview.pdf) -Bin completeness and contamination are assessed using CheckM2 [@chklovski2023checkm2]. Bins are scored using the following scoring function: $completeness - weight * contamination$, with the default weight set to 2. These scored bins are then sorted, facilitating the selection of a final new set of non-redundant bins (\autoref{fig:overview}.B). The ability to score bins is based on CheckM2 rather than CheckM1, which is what the metaWRAP pipeline uses. CheckM2 uses a novel approach to evaluate bin quality based on machine learning techniques. This approach improves speed and also provides better results than CheckM1. Binette initiates CheckM2 processing by running its initial steps once for all contigs within the input bins. These initial steps involve gene prediction using Prodigal and alignment against the CheckM2 database using Diamond [@buchfink2015diamond]. Binette uses Pyrodigal [@larralde2022pyrodigal], a Python module that uses Cython to provide bindings to Prodigal [@hyatt2010prodigal]. The intermediate Checkm2 results are then used to assess the quality of individual bins, eliminating redundant calculations and speeding up the refinement process. 
+Bin completeness and contamination are assessed using CheckM2 [@chklovski2023checkm2]. Bins are scored using the following scoring function: $completeness - weight * contamination$, with the default weight set to 2. These scored bins are then sorted, facilitating the selection of a final new set of non-redundant bins (\autoref{fig:overview}.B). The ability to score bins is based on CheckM2 rather than CheckM1, which is what the metaWRAP pipeline uses. CheckM2 uses a novel approach to evaluate bin quality based on machine learning techniques. This approach improves speed and also provides better results than CheckM1. Binette initiates CheckM2 processing by running its initial steps once for all contigs within the input bins. These initial steps involve gene prediction using Prodigal and alignment against the CheckM2 database using Diamond [@buchfink2015diamond]. Binette uses Pyrodigal [@larralde2022pyrodigal], a Python module that uses Cython to provide bindings to Prodigal [@hyatt2010prodigal]. The intermediate CheckM2 results are then used to assess the quality of individual bins, eliminating redundant calculations and speeding up the refinement process. -Binette serves as the bin refinement tool within the [metagWGS](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs) metagenomic analysis pipeline [@metagWGS_inprep], providing a robust and faster alternative to the bin refinement module of the metaWRAP pipeline as well as other similar bin refinement tools. +Binette serves as the bin refinement tool within the [metagWGS](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs) metagenomic analysis pipeline [@metagWGS], providing a robust and faster alternative to the bin refinement module of the metaWRAP pipeline as well as other similar bin refinement tools. # Availability