diff --git a/.editorconfig b/.editorconfig index b78de6e6..b6b31907 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js,cff}] +[*.{md,yml,yaml,html,css,scss,js}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 6e0ea2a2..2ac5f2d9 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -116,4 +116,3 @@ To get started: Devcontainer specs: - [DevContainer config](.devcontainer/devcontainer.json) -- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 491a2a47..146d5516 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -42,9 +42,9 @@ body: attributes: label: System information description: | - * Nextflow version _(eg. 22.10.1)_ + * Nextflow version _(eg. 23.04.0)_ * Hardware _(eg. HPC, Desktop, Cloud)_ * Executor _(eg. slurm, local, awsbatch)_ - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_ + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ * OS _(eg. CentOS Linux, macOS, Linux Mint)_ * Version of nf-core/mag _(eg. 1.1, 1.5, 1.8.2)_ diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 2b1d3b46..46acbff1 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -15,7 +15,8 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/mag/ - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! -- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/mag/tree/master/.github/CONTRIBUTING.md)- [ ] If necessary, also make a PR on the nf-core/mag _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/mag/tree/master/.github/CONTRIBUTING.md) +- [ ] If necessary, also make a PR on the nf-core/mag _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. 
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 19051df4..c53ab09d 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,18 +14,23 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v3 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/mag/work-${{ github.sha }} parameters: | { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/mag/results-${{ github.sha }}" } - profiles: test_full,aws_tower + profiles: test_full + - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 7a6ff7ef..be11af11 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -12,18 +12,22 @@ jobs: steps: # Launch workflow using Tower CLI tool action - name: Launch workflow via tower - uses: nf-core/tower-action@v3 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/mag/work-${{ github.sha }} parameters: | { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/mag/results-test-${{ github.sha }}" } - profiles: test,aws_tower + profiles: test + - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index e08454a2..0d205d33 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -13,7 +13,7 @@ jobs: - name: Check PRs if: github.repository == 'nf-core/mag' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/mag ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/mag ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b95b00ef..3aaa6f3e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: strategy: matrix: NXF_VER: - - "22.10.1" + - "23.04.0" - "latest-everything" steps: - name: Free some space @@ -61,6 +61,7 @@ jobs: test_ancient_dna, test_adapterremoval, test_binrefinement, + test_virus_identification, ] steps: - name: Free some space diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml new file mode 100644 index 00000000..694e90ec --- /dev/null +++ b/.github/workflows/clean-up.yml @@ -0,0 +1,24 @@ +name: "Close user-tagged issues and PRs" +on: + schedule: + - cron: "0 0 * * 0" # Once a week + +jobs: + clean-up: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v7 + with: + stale-issue-message: "This issue has been 
tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." + stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." + close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity." + days-before-stale: 30 + days-before-close: 20 + days-before-pr-close: -1 + any-of-labels: "awaiting-changes,awaiting-feedback" + exempt-issue-labels: "WIP" + exempt-pr-labels: "WIP" + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 858d622e..888cb4bc 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -78,7 +78,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: "3.7" + python-version: "3.8" architecture: "x64" - name: Install dependencies diff --git a/.gitpod.yml b/.gitpod.yml index 85d95ecc..25488dcc 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,4 +1,9 @@ image: nfcore/gitpod:latest +tasks: + - name: Update Nextflow and setup pre-commit + command: | + pre-commit install --install-hooks + nextflow self-update vscode: extensions: # based on nf-core.nf-core-extensionpack diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..0c31cdb9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,5 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v2.7.1" + hooks: + - id: prettier diff --git a/CHANGELOG.md b/CHANGELOG.md index 0358ddff..0ca54895 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,56 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 2.4.0 - 2023-09-25 + +### `Added` + +- [#497](https://github.com/nf-core/mag/pull/497) - Adds support for pointing at a local db for krona, using the parameter `--krona_db` (by @willros). +- [#395](https://github.com/nf-core/mag/pull/395) - Adds support for fast domain-level classification of bins using Tiara, to allow bins to be separated into eukaryotic and prokaryotic-specific processes. 
+- [#422](https://github.com/nf-core/mag/pull/422) - Adds support for normalization of read depth with BBNorm (added by @erikrikarddaniel and @fabianegli) +- [#439](https://github.com/nf-core/mag/pull/439) - Adds ability to enter the pipeline at the binning stage by providing a CSV of pre-computed assemblies (by @prototaxites) +- [#459](https://github.com/nf-core/mag/pull/459) - Adds ability to skip the damage correction step in the ancient DNA workflow and just run pyDamage (by @jfy133) +- [#364](https://github.com/nf-core/mag/pull/364) - Adds geNomad nf-core modules for identifying viruses in assemblies (by @PhilPalmer and @CarsonJM) +- [#481](https://github.com/nf-core/mag/pull/481) - Adds MetaEuk for annotation of eukaryotic MAGs, and MMSeqs2 to enable downloading databases for MetaEuk (by @prototaxites) +- [#437](https://github.com/nf-core/mag/pull/429) - `--gtdb_db` also now supports directory input of a pre-uncompressed GTDB archive directory (reported by @alneberg, fix by @jfy133) +- [#494](https://github.com/nf-core/mag/pull/494) - Adds support for saving the BAM files from Bowtie2 mapping of input reads back to assembly (fix by @jfy133) + +### `Changed` + +- [#428](https://github.com/nf-core/mag/pull/428) [#467](https://github.com/nf-core/mag/pull/467) - Update to nf-core 2.8, 2.9 `TEMPLATE` (by @jfy133) +- [#429](https://github.com/nf-core/mag/pull/429) - Replaced hardcoded CheckM database auto-download URL with a parameter (reported by @erikrikarddaniel, fix by @jfy133) +- [#441](https://github.com/nf-core/mag/pull/441) - Deactivated CONCOCT in AWS 'full test' due to very long runtime (fix by @jfy133). +- [#442](https://github.com/nf-core/mag/pull/442) - Remove warning when BUSCO finds no genes in bins, as this can be expected in some datasets (reported by @Lumimar, fix by @jfy133). 
+- [#444](https://github.com/nf-core/mag/pull/444) - Moved BUSCO bash code to script (by @jfy133) +- [#477](https://github.com/nf-core/mag/pull/477) - `--gtdb` parameter is split into `--skip_gtdbtk` and `--gtdb_db` to allow finer control over GTDB database retrieval (fix by @jfy133) +- [#500](https://github.com/nf-core/mag/pull/500) - Temporarily disabled downstream processing of both refined and raw bins due to a bug (by @jfy133) + +### `Fixed` + +- [#496](https://github.com/nf-core/mag/pull/496) - Fix help text for parameters `--bowtie2_mode`, `spades_options` and `megahit_options` (by @willros) +- [#400](https://github.com/nf-core/mag/pull/400) - Fix duplicated Zenodo badge in README (by @jfy133) +- [#406](https://github.com/nf-core/mag/pull/406) - Fix CheckM database always downloading, regardless of whether CheckM is selected (by @jfy133) +- [#419](https://github.com/nf-core/mag/pull/419) - Fix bug with busco_clean parameter, where it was always activated (by @prototaxites) +- [#426](https://github.com/nf-core/mag/pull/426) - Fixed typo in help text for parameters `--host_genome` and `--host_fasta` (by @tillenglert) +- [#434](https://github.com/nf-core/mag/pull/434) - Fix location of samplesheet for AWS full tests (reported by @Lfulcrum, fix by @jfy133) +- [#438](https://github.com/nf-core/mag/pull/438) - Fixed version inconsistency between conda and containers for GTDBTK_CLASSIFYWF (by @jfy133) +- [#439](https://github.com/nf-core/mag/pull/445) - Fix bug in assembly input (by @prototaxites) +- [#447](https://github.com/nf-core/mag/pull/447) - Remove `default: None` from parameter schema (by @drpatelh) +- [#449](https://github.com/nf-core/mag/pull/447) - Fix results file overwriting in Ancient DNA workflow (reported by @alexhbnr, fix by @jfy133) +- [#470](https://github.com/nf-core/mag/pull/470) - Fix binning preparation running even when binning was requested to be skipped (reported by @prototaxites, fix by @jfy133) +- [#480](https://github.com/nf-core/mag/pull/480) - Improved `-resume` reliability through better meta map preservation (reported by @prototaxites, fix by @jfy133) +- [#493](https://github.com/nf-core/mag/pull/493) - Update `METABAT2` nf-core module so that it reduces the number of unnecessary file moves, enabling virtual filesystems (fix by @adamrtalbot) +- [#500](https://github.com/nf-core/mag/pull/500) - Fix MaxBin2 bins not being saved in the results directory properly (reported by @Perugolate, fix by @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| -------- | ---------------- | ----------- | +| BCFtools | 1.16 | 1.17 | +| SAMtools | 1.16.1 | 1.17 | +| fastp | 0.23.2 | 0.23.4 | +| MultiQC | 1.14 | 1.15 | ## v2.3.2 - [2023-06-23] ### `Fixed` @@ -36,6 +86,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#373](https://github.com/nf-core/mag/pull/373) - Removed parameter `--enable_conda`. Updated local modules to new conda syntax and updated nf-core modules (by @skrakau) - [#385](https://github.com/nf-core/mag/pull/385) - CAT also now runs on unbinned contigs as well as binned contigs (added by @jfy133) - [#399](https://github.com/nf-core/mag/pull/399/files) - Removed undocumented BUSCO_PLOT process (previously generated `*.busco_figure.png` plots unsuitable for metagenomics) (by @skrakau). 
+- [#416](https://github.com/nf-core/mag/pull/416) - Use GTDBTK_CLASSIFYWF nf-core module instead of local module (added by @alxndrdiaz) ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 09d165b3..846609b5 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -14,6 +14,8 @@ > Schubert, M., Lindgreen, S., and Orlando, L. 2016. "AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging." BMC Research Notes 9 (February): 88. doi: 10.1186/s13104-016-1900-2 +- [BBnorm/BBTools](http://sourceforge.net/projects/bbmap/) + - [BCFtools](https://doi.org/10.1093/gigascience/giab008) > Danecek, Petr, et al. "Twelve years of SAMtools and BCFtools." Gigascience 10.2 (2021): giab008. doi: 10.1093/gigascience/giab008 @@ -52,12 +54,18 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. Available online https://www.bioinformatics.babraham.ac.uk/projects/fastqc/. + - [Filtlong](https://github.com/rrwick/Filtlong) - [Freebayes](https://arxiv.org/abs/1207.3907) > Garrison E, Marth G. Haplotype-based variant detection from short-read sequencing. arXiv preprint arXiv:1207.3907 [q-bio.GN] 2012 +- [geNomad](https://doi.org/10.1101/2023.03.05.531206) + + > Camargo, A. P., et al. (2023). You can move, but you can’t hide: identification of mobile genetic elements with geNomad. bioRxiv preprint. doi: https://doi.org/10.1101/2023.03.05.531206 + - [GTDB-Tk](https://doi.org/10.1093/bioinformatics/btz848) > Chaumeil, P. A., Mussig, A. J., Hugenholtz, P., & Parks, D. H. (2020). GTDB-Tk: a toolkit to classify genomes with the Genome Taxonomy Database. Bioinformatics , 36(6), 1925–1927. doi: 10.1093/bioinformatics/btz848. @@ -86,6 +94,14 @@ > Kang, D. D., Li, F., Kirton, E., Thomas, A., Egan, R., An, H., & Wang, Z. (2019). MetaBAT 2: an adaptive binning algorithm for robust and efficient genome reconstruction from metagenome assemblies. PeerJ, 7, e7359. doi: 10.7717/peerj.7359. +- [MetaEuk](https://doi.org/10.1186/s40168-020-00808-x) + +> Levy Karin, E., Mirdita, M. & Söding, J. MetaEuk—sensitive, high-throughput gene discovery, and annotation for large-scale eukaryotic metagenomics. Microbiome 8, 48 (2020). https://doi.org/10.1186/s40168-020-00808-x + +- [MMseqs2](https://www.nature.com/articles/nbt.3988) + +> Steinegger, M., Söding, J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol 35, 1026–1028 (2017). https://doi.org/10.1038/nbt.3988 + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. @@ -116,10 +132,16 @@ > Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., … 1000 Genome Project Data Processing Subgroup. (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics , 25(16), 2078–2079. doi: 10.1093/bioinformatics/btp352. +- [Seqtk](https://github.com/lh3/seqtk) + - [SPAdes](https://doi.org/10.1101/gr.213959.116) > Nurk, S., Meleshko, D., Korobeynikov, A., & Pevzner, P. A. (2017). metaSPAdes: a new versatile metagenomic assembler. Genome research, 27(5), 824-834. doi: 10.1101/gr.213959.116. +- [Tiara](https://doi.org/10.1093/bioinformatics/btab672) + + > Karlicki, M., Antonowicz, S., Karnkowska, A., 2022. 
Tiara: deep learning-based classification system for eukaryotic sequences. Bioinformatics 38, 344–350. doi: 10.1093/bioinformatics/btab672 + ## Data - [Full-size test data](https://doi.org/10.1038/s41587-019-0191-2) @@ -141,5 +163,8 @@ - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) + > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. + - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. diff --git a/README.md b/README.md index c851487d..3ed797e8 100644 --- a/README.md +++ b/README.md @@ -2,16 +2,16 @@ [![GitHub Actions CI Status](https://github.com/nf-core/mag/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/mag/actions?query=workflow%3A%22nf-core+CI%22) [![GitHub Actions Linting Status](https://github.com/nf-core/mag/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/mag/actions?query=workflow%3A%22nf-core+linting%22) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mag/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3589527-1073c8)](https://doi.org/10.5281/zenodo.3589527) +[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mag/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3589527-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3589527) [![Cite Publication](https://img.shields.io/badge/Cite%20Us!-Cite%20Publication-orange)](https://doi.org/10.1093/nargab/lqac007) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/mag) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23mag-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/mag)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23mag-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/mag)[![Follow on 
Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction @@ -21,51 +21,35 @@ nf-core/mag workflow overview

-The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/mag/results). - ## Pipeline summary -By default, the pipeline currently performs the following: it supports both short and long reads, quality trims the reads and adapters with [fastp](https://github.com/OpenGene/fastp) and [Porechop](https://github.com/rrwick/Porechop), and performs basic QC with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). +By default, the pipeline currently performs the following: it supports both short and long reads, quality trims the reads and adapters with [fastp](https://github.com/OpenGene/fastp) and [Porechop](https://github.com/rrwick/Porechop), performs basic QC with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), and merges multiple sequencing runs. + The pipeline then: - assigns taxonomy to reads using [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) and/or [Kraken2](https://github.com/DerrickWood/kraken2/wiki) - performs assembly using [MEGAHIT](https://github.com/voutcn/megahit) and [SPAdes](http://cab.spbu.ru/software/spades/), and checks their quality using [Quast](http://quast.sourceforge.net/quast) - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html) -- predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal) +- predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and annotates bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://github.com/soedinglab/metaeuk) - performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/). 
+- Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool) -- assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) +- assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara) Furthermore, the pipeline creates various reports in the results directory specified, including a [MultiQC](https://multiqc.info/) report summarizing some of the findings and software versions. -## Quick Start - -1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.1`) - -2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_. - -3. Download the pipeline and test it on a minimal dataset with a single command: - - ```bash - nextflow run nf-core/mag -profile test,YOURPROFILE --outdir - ``` +## Usage - Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string. +> **Note** +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how +> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) +> with `-profile test` before running the workflow on actual data. - > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`. - > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs. 
- > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. - -4. Start running your own analysis! - - ```bash - nextflow run nf-core/mag -profile --input '*_R{1,2}.fastq.gz' --outdir - ``` +```bash +nextflow run nf-core/mag -profile --input '*_R{1,2}.fastq.gz' --outdir +``` - or +or ```bash nextflow run nf-core/mag -profile --input samplesheet.csv --outdir @@ -73,9 +57,18 @@ nextflow run nf-core/mag -profile **Warning:** +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those +> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). + +For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/mag/usage) and the [parameter documentation](https://nf-co.re/mag/parameters). + +## Pipeline output -The nf-core/mag pipeline comes with documentation about the pipeline [usage](https://nf-co.re/mag/usage), [parameters](https://nf-co.re/mag/parameters) and [output](https://nf-co.re/mag/output). Detailed information about how to specify the input can be found under [input specifications](https://nf-co.re/mag/usage#input_specifications). +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/mag/results) tab on the nf-core website pipeline page. +For more details about the output files and reports, please refer to the +[output documentation](https://nf-co.re/mag/output). ### Group-wise co-assembly and co-abundance computation @@ -100,6 +93,7 @@ We thank the following people for their extensive assistance in the development - [Maxime Garcia](https://github.com/MaxUlysse) - [Michael L Heuer](https://github.com/heuermh) - [Alex Hübner](https://github.com/alexhbnr) +- [Jim Downie](https://github.com/prototaxites) ## Contributions and Support diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 1b18c63e..d3fe4abe 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,18 +3,22 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/mag Methods Description" section_href: "https://github.com/nf-core/mag" plot_type: "html" -## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

-

Data was processed using nf-core/mag v${workflow.manifest.version} (${doi_text}; Krakau et al., 2022) of the nf-core collection of workflows (Ewels et al., 2020).

+

Data was processed using nf-core/mag v${workflow.manifest.version} (${doi_text}; Krakau et al., 2022) of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:

${workflow.commandLine}
+

${tool_citations}

References

    -
  • Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820
  • -
  • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x
  • +
  • Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. doi: 10.1038/nbt.3820
  • +
  • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x
  • Krakau, S., Straub, D., Gourlé, H., Gabernet, G., & Nahnsen, S. (2022). nf-core/mag: a best-practice pipeline for metagenome hybrid assembly and binning. NAR Genomics and Bioinformatics, 4(1). https://doi.org/10.1093/nargab/lqac007
  • +
  • Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7
  • +
  • da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192
  • + ${tool_bibliography}
Notes:
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 97e1f26e..4909cb1a 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/mag + This report has been generated by the nf-core/mag analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-mag-methods-description": order: -1000 @@ -14,22 +14,78 @@ export_plots: true data_format: "yaml" +run_modules: + - fastqc + - fastp + - adapterRemoval + - custom_content + - bowtie2 + - busco + - quast + - kraken + - prokka + +## Module order top_modules: - "fastqc": name: "FastQC: raw reads" path_filters_exclude: - "*trimmed*" - "fastp" - - "adapterRemoval": - - custom_content + - "adapterRemoval" - "fastqc": name: "FastQC: after preprocessing" info: "After trimming and, if requested, contamination removal." path_filters: - "*trimmed*" + - "bowtie2": + name: "Bowtie2: PhiX removal" + info: "Mapping statistics of reads mapped against PhiX and subsequently removed." + path_filters: + - "*_phix_removed.bowtie2.log" + - "bowtie2": + name: "Bowtie2: host removal" + info: "Mapping statistics of reads mapped against host genome and subsequently removed." + path_filters: + - "*_host_removed.bowtie2.log" + - "bowtie2": + name: "Bowtie2: assembly" + info: "Mapping statistics of reads mapped against assemblies." + path_filters_exclude: + - "*_host_removed.bowtie2.log" + - "*_phix_removed.bowtie2.log" + - "kraken": + name: "Kraken2" + anchor: "Kraken2" + target: "Kraken2" + doi: "10.1101/gr.210641.116" + path_filters: + - "*.kraken2_report.txt" + - "kraken": + name: "Centrifuge" + anchor: "centrifuge" + target: "Centrifuge" + doi: "10.1101/gr.210641.116" + info: "is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index. Note: Figure title" + extra: "ℹ️: plot title will say Kraken2 due to Centrifuge producing the same output format as Kraken. If activated, see the actual Kraken2 results in the section above." + path_filters: + - "*.centrifuge_kreport.txt" + - "porechop" + - "bcftools" + - custom_content - "busco": info: "assesses genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs. In case BUSCO's automated lineage selection was used, only generic results for the selected domain are shown and only for genome bins and kept, unbinned contigs for which the BUSCO analysis was successfull, i.e. not for contigs for which no BUSCO genes could be found. Bins for which a specific virus lineage was selected are also not shown." - - "quast" + - "quast": + name: "QUAST: assembly" + info: "Assembly statistics of raw assemblies." + path_filters: + - "*rawassemblies.tsv" + - "quast": + name: "QUAST: bins" + info: "Assembly statistics of binned assemblies." 
+ path_filters_exclude: + - "*rawassemblies.tsv" + - "prokka" custom_data: host_removal: @@ -42,12 +98,145 @@ custom_data: title: "Bowtie 2: reads mapped against host reference" ylab: "# Reads" +## Sample name cleaning sp: host_removal: fn: "host_removal_metrics.tsv" adapterRemoval: - fn: "*_ar2_*.log" + fn: "*_ar2.settings" + kraken: + fn_re: ".*[kraken2|centrifuge].*report.txt" + quast: + fn_re: "report.*.tsv" +## File name cleaning extra_fn_clean_exts: - ".bowtie2" - "_ar2" + - "host_removed" + - "phix_removed" + - "centrifuge_kreport" + - "_fastp" + +## Prettification +custom_logo: "nf-core-mag_logo_light.png" +custom_logo_url: https://github.com/nf-core/mag/ +custom_logo_title: "nf-core/mag" + +## Tool specific configuration +prokka_fn_snames: True + +## General Stats customisation +table_columns_visible: + "FastQC: raw reads": + avg_sequence_length: True + "FastQC: after preprocessing": + avg_sequence_length: True + "fastp": + pct_duplication: False + after_filtering_q30_rate: False + after_filtering_q30_bases: False + filtering_result_passed_filter_reads: 3300 + after_filtering_gc_content: False + pct_surviving: True + pct_adapter: True + "Bowtie2: assembly": False + "Kraken2": False + "Centrifuge": False + "QUAST: assembly": + N75: True + L50: True + L75: True + "Largest contig": True + "Total length": True + N50: True + "QUAST: bins": + N75: True + L50: True + L75: True + "Largest contig": True + "Total length": True + N50: True + "Prokka": False + +table_columns_placement: + "FastQC: raw reads": + percent_duplicates: 1000 + percent_gc: 1100 + avg_sequence_length: 1200 + median_sequence_length: 1300 + total_sequences: 1400 + percent_fails: 1500 + "FastQC: after preprocessing": + percent_duplicates: 2000 + percent_gc: 2100 + avg_sequence_length: 2200 + median_sequence_length: 2300 + total_sequences: 2400 + percent_fails: 2500 + "fastp": + pct_duplication: 3000 + after_filtering_q30_rate: 3100 + after_filtering_q30_bases: 3200 + filtering_result_passed_filter_reads: 3300 + after_filtering_gc_content: 3400 + pct_surviving: 3500 + pct_adapter: 3600 + "Adapter Removal": + percent_aligned: 4000 + aligned_total: 4100 + percent_discarded: 4200 + "Bowtie2: PhiX removal": + overall_alignment_rate: 5000 + "Bowtie2: host removal": + overall_alignment_rate: 6000 + "Bowtie2: assembly": + overall_alignment_rate: 7000 + "Kraken2": + "% root": 8000 + "% Top 5": 8100 + "% Unclassified": 8200 + "Centrifuge": + "% root": 9000 + "% Top 5": 9100 + "% Unclassified": 9200 + "QUAST: assembly": + "N50": 10000 + "Total length": 11000 + "QUAST: bins": + "N50": 10000 + "Total length": 11000 + Prokka: + contigs: 20000 + bases: 21000 + CDS: 22000 + organism: 23000 + +table_columns_name: + "FastQC: raw reads": + percent_duplicates: "% Dups (raw)" + percent_gc: "% GC (raw)" + avg_sequence_length: "Avg. length (raw)" + median_sequence_length: "Median length (raw)" + total_sequences: "M Seqs (raw)" + percent_fails: "% Fails (raw)" + "FastQC: after preprocessing": + percent_duplicates: "% Dups (processed)" + percent_gc: "% GC (processed)" + avg_sequence_length: "Avg. 
length (processed)" + median_sequence_length: "Median length (processed)" + total_sequences: "M Seqs (processed)" + percent_fails: "% Fails (processed)" + "Bowtie2: PhiX removal": + overall_alignment_rate: "% Aligned (PhiX)" + "Bowtie2: host removal": + overall_alignment_rate: "% Aligned (Host)" + "Bowtie2: assembly": + overall_alignment_rate: "% Aligned (Assem.)" + +custom_table_header_config: + general_stats_table: + "Total length": + hidden: True + N50: + hidden: True diff --git a/assets/nf-core-mag_logo_light.png b/assets/nf-core-mag_logo_light.png index 26d7ed5d..64276cbe 100644 Binary files a/assets/nf-core-mag_logo_light.png and b/assets/nf-core-mag_logo_light.png differ diff --git a/assets/slackreport.json b/assets/slackreport.json index 043d02f2..bc7d3f0c 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_name": "nf-core/mag v${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/bin/domain_classification.R b/bin/domain_classification.R new file mode 100755 index 00000000..eb64b312 --- /dev/null +++ b/bin/domain_classification.R @@ -0,0 +1,156 @@ +#!/usr/bin/env Rscript + +# Written by Jim Downie and released under the MIT license. +# See git repository (https://github.com/nf-core/mag) for full license text. + +library(optparse) +library(tidyverse) + +parser <- OptionParser() +parser <- add_option(parser, c("-t", "--classification_file"), + action = "store", + type = "character", + metavar = "character", + help = "The out.txt tsv file of per-contig classifications from Tiara.") +parser <- add_option(parser, c("-s", "--contig_to_bin"), + action = "store", + type = "character", + metavar = "character", + help = "A tsv file with two columns, bin and contig, listing the contig membership for each bin.") +parser <- add_option(parser, c("-j", "--join_prokaryotes"), + action = "store_true", + type = "logical", + default = TRUE, + metavar = "logical", + help = "Use an general prokaryote classification instead of separating Archaea and Bacteria.") +parser <- add_option(parser, c("-a", "--assembler"), + action = "store", + type = "character", + metavar = "character", + help = "Assembler used to assemble the contigs. 
'MEGAHIT' or 'SPAdes' only.") +parser <- add_option(parser, c("-o", "--output_prefix"), + action = "store", + type = "character", + metavar = "character", + help = "Prefix for the output classification table name.") +args <- parse_args(parser) + +## optparse doesn't have a required flag so exit if we don't get given a file +if(is.null(args$classification_file)) { + stop("Tiara classification file not provided.") +} +if(is.null(args$contig_to_bin)) { + stop("Contig to bin file not provided.") +} +if(is.null(args$assembler)) { + stop("Assembler not provided.") +} +if(!(args$assembler %in% c("MEGAHIT", "SPAdes"))) { + stop("Invalid assembler provided.") +} + +find_classification <- function(probabilities, join_prokaryotes = TRUE) { + if(join_prokaryotes) { + classifications <- c("prokarya", "eukarya", "organelle", "unknown") + } else { + classifications <- c("archaea", "bacteria", "eukarya", "organelle", "unknown") + } + return(classifications[which.max(probabilities)]) +} + +classify_bins <- function(tiara, contig2bin, join_prokaryotes, assembler){ + ## MEGAHIT produces contigs with spaces in the name + ## Depending on the binner, everything after the first space is sometimes dropped + ## Make sure that we drop everything after a possible space before doing anything else to allow merging + if(assembler == "MEGAHIT"){ + tiara$sequence_id <- word(tiara$sequence_id) + contig2bin$sequence_id <- word(contig2bin$sequence_id) + } + if(join_prokaryotes) { + n_classifications <- 4 + } else { + n_classifications <- 5 + } + + ## combination of left_join and filter collectively eliminate unclassified contigs + tiara <- tiara |> + left_join(contig2bin) |> + filter(!is.na(BinID)) |> + select(sequence_id, + BinID, + Archaea = arc, + Bacteria = bac, + Eukarya = euk, + Organelle = org, + Unknown = unk1) + + if(join_prokaryotes) { + tiara <- tiara |> + mutate(Prokarya = Archaea + Bacteria) |> + select(sequence_id, BinID, Prokarya, Eukarya, Organelle, Unknown) + } + + ## Identify the columns to softmax + prob_columns <- 2:(2 + n_classifications - 1) + + ## Calculate softmax probabilites based on summed bin probabilities for each category + softmax_probabilities <- tiara |> + group_by(BinID) |> + summarise(across(all_of(prob_columns), sum), .groups = "drop") |> + rowwise() |> + mutate(denominator = sum(exp(c_across(all_of(prob_columns))))) |> + mutate(across(all_of(prob_columns), \(x) exp(x)/denominator), + classification = find_classification(c_across(all_of(prob_columns)), + join_prokaryotes = join_prokaryotes)) |> + select(-denominator) + + ## A bin may have no classified contigs if all contigs are below the minimum + ## Tiara length threshold + all_bins <- unique(contig2bin$BinID) + unclassified_bins <- all_bins[!(all_bins %in% softmax_probabilities$BinID)] + + ## Assign these as unclassified + if(length(unclassified_bins) > 0) { + if(join_prokaryotes == TRUE){ + unclassified_bins_tbl <- tibble( + BinID = unclassified_bins, + Prokarya = NA, + Eukarya = NA, + Organelle = NA, + Unknown = NA, + classification = "unknown" + ) + } else { + unclassified_bins_tbl <- tibble( + BinID = unclassified_bins, + Bacteria = NA, + Archaea = NA, + Eukarya = NA, + Organelle = NA, + Unknown = NA, + classification = "unknown" + ) + } + softmax_probabilities <- bind_rows(softmax_probabilities, unclassified_bins_tbl) + } + + return(softmax_probabilities) +} + +classifications <- read_tsv(args$classification_file, na = c("NA", "n/a")) +contig_to_bin <- read_tsv(args$contig_to_bin, col_names = c("sequence_id", "BinID")) + +results 
<- classify_bins(tiara = classifications, + contig2bin = contig_to_bin, + join_prokaryotes = args$join_prokaryotes, + assembler = args$assembler) + +## Keep just the classifications so we can loop over more easily +results_basic <- select(results, BinID, classification) + +## write outputs +write_tsv(results, paste0(args$output_prefix, ".binclassification.tsv")) +write_tsv(results_basic, "bin2classification.tsv", col_names = FALSE) + +## write out package versions +packageVersion("tidyverse") |> as.character() |> writeLines("tidyverse_version.txt") diff --git a/bin/run_busco.sh b/bin/run_busco.sh new file mode 100755 index 00000000..9e022e87 --- /dev/null +++ b/bin/run_busco.sh @@ -0,0 +1,158 @@ +#! /usr/bin/env bash + +p=$1 +cp_augustus_config=$2 +db=$3 +bin=$4 +task_cpus=$5 +lineage_dataset_provided=$6 +busco_clean=$7 + +# ensure augustus has write access to config directory +if [ ${cp_augustus_config} = "Y" ]; then + cp -r /usr/local/config/ augustus_config/ + export AUGUSTUS_CONFIG_PATH=augustus_config +fi + +# place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it) +if [ ${lineage_dataset_provided} = "Y" ]; then + mkdir dataset + mv ${db} dataset/ +fi + +# set nullgob: if pattern matches no files, expand to a null string rather than to itself +shopt -s nullglob + +# only used for saving busco downloads +most_spec_db="NA" + +if busco ${p} \ + --mode genome \ + --in ${bin} \ + --cpu ${task_cpus} \ + --out "BUSCO" >${bin}_busco.log 2>${bin}_busco.err; then + + # get name of used specific lineage dataset + summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) + if [ ${#summaries[@]} -ne 1 ]; then + echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." + exit 1 + fi + [[ $summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]] + db_name_spec="${BASH_REMATCH[1]}" + most_spec_db=${db_name_spec} + echo "Used specific lineage dataset: ${db_name_spec}" + + if [ ${lineage_dataset_provided} = "Y" ]; then + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + + # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting + # report bin as failed to allow consistent warnings within the pipeline for both settings + if egrep -q $'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log; then + echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log." + echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt" + fi + else + # auto lineage selection + if { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && + egrep -q $'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log; } || + { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && + egrep -q $'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log && + egrep -q $'INFO:\tUsing local lineages directory ' ${bin}_busco.log; }; then + # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately + # and cause different log messages + echo "Domain and specific lineage could be selected by BUSCO." 
+ cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + + db_name_gen="" + summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt) + if [ ${#summaries_gen[@]} -lt 1 ]; then + echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + db_name_gen=${db_name_spec} + else + [[ $summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]] + db_name_gen="${BASH_REMATCH[1]}" + echo "Used generic lineage dataset: ${db_name_gen}" + cp BUSCO/short_summary.generic.${db_name_gen}.BUSCO.txt short_summary.domain.${db_name_gen}.${bin}.txt + fi + + for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz + break + done + for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz + break + done + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log; then + echo "Domain could be selected by BUSCO, but no more specific lineage." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log; then + echo "Domain could be selected by BUSCO, but no more specific lineage." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tRunning virus detection pipeline' ${bin}_busco.log; then + # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*? + echo "Domain could not be selected by BUSCO, but virus dataset was selected." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + else + echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." >&2 + exit 1 + fi + fi + + for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_spec}.faa.gz + break + done + for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_spec}.fna.gz + break + done + +elif egrep -q $'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err; then + echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err." + echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt" + +elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'ERROR:\tPlacements failed' ${bin}_busco.err; then + echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset." 
+ echo -e "${bin}\tPlacements failed" >"${bin}_busco.failed_bin.txt" + + message=$(egrep $'INFO:\t\\S+ selected' ${bin}_busco.log) + [[ $message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]] + db_name_gen="${BASH_REMATCH[1]}" + most_spec_db=${db_name_gen} + echo "Used generic lineage dataset: ${db_name_gen}" + cp BUSCO/auto_lineage/run_${db_name_gen}/short_summary.txt short_summary.domain.${db_name_gen}.${bin}.txt + + for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz + break + done + for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz + break + done + +else + echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2 + exit 1 +fi + +# additionally output genes predicted with Prodigal (GFF3) +if [ -f BUSCO/logs/prodigal_out.log ]; then + mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff" +fi + +# output value of most_spec_db +echo ${most_spec_db} > info_most_spec_db.txt + +# if needed delete temporary BUSCO files +if [ ${busco_clean} = "Y" ]; then + find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \; + find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \; + find . -depth -type d -name "run_*" -execdir rm -rf "{}" + +fi diff --git a/bin/split_fasta.py b/bin/split_fasta.py index c9149f25..87cb9dfa 100755 --- a/bin/split_fasta.py +++ b/bin/split_fasta.py @@ -45,10 +45,10 @@ ) # contigs to retain and pool elif length >= min_length_to_retain_contig: - pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name)) + pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description="")) # remaining sequences else: - remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name)) + remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description="")) else: with open(input_file) as f: fasta_sequences = SeqIO.parse(f, "fasta") @@ -64,10 +64,10 @@ ) # contigs to retain and pool elif length >= min_length_to_retain_contig: - pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name)) + pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description="")) # remaining sequences else: - remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name)) + remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description="")) # Sort sequences above threshold by length df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True) @@ -77,10 +77,10 @@ for index, row in df_above_threshold.iterrows(): if index + 1 <= max_sequences: print("write " + out_base + "." + str(index + 1) + ".fa") - out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"]) + out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="") SeqIO.write(out, out_base + "." 
+ str(index + 1) + ".fa", "fasta") else: - pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"])) + pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")) print("write " + out_base + ".pooled.fa") SeqIO.write(pooled, out_base + ".pooled.fa", "fasta") diff --git a/conf/base.config b/conf/base.config index ca06b1a6..7dec9e28 100644 --- a/conf/base.config +++ b/conf/base.config @@ -14,7 +14,7 @@ process { memory = { check_max( 7.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } maxRetries = 3 maxErrors = '-1' @@ -117,7 +117,7 @@ process { memory = { check_max (40.GB * task.attempt, 'memory' ) } time = { check_max (12.h * task.attempt, 'time' ) } } - withName: GTDBTK_CLASSIFY { + withName: GTDBTK_CLASSIFYWF { cpus = { check_max (10 * task.attempt, 'cpus' ) } memory = { check_max (128.GB * task.attempt, 'memory' ) } time = { check_max (12.h * task.attempt, 'time' ) } diff --git a/conf/igenomes.config b/conf/igenomes.config index 7a1b3ac6..3f114377 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -36,6 +36,14 @@ params { macs_gsize = "2.7e9" blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" } + 'CHM13' { + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/" + bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/" + gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/CHM13/Annotation/Genes/genes.gtf" + gff = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" + mito_name = "chrM" + } 'GRCm38' { fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" diff --git a/conf/modules.config b/conf/modules.config index e61906cb..08167030 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -27,6 +27,8 @@ process { mode: params.publish_dir_mode, pattern: "*.html" ] + ext.prefix = { "${meta.id}_run${meta.run}_raw" } + tag = { "${meta.id}_run${meta.run}_raw" } } withName: FASTP { @@ -50,6 +52,8 @@ process { enabled: params.save_clipped_reads ] ] + ext.prefix = { "${meta.id}_run${meta.run}_fastp" } + tag = { "${meta.id}_run${meta.run}" } } withName: ADAPTERREMOVAL_PE { @@ -72,7 +76,8 @@ process { enabled: params.save_clipped_reads ] ] - ext.prefix = { "${meta.id}_ar2" } + ext.prefix = { "${meta.id}_run${meta.run}_ar2" } + tag = { "${meta.id}_run${meta.run}" } } withName: ADAPTERREMOVAL_SE { @@ -87,11 +92,12 @@ process { mode: params.publish_dir_mode, pattern: "*.{settings}" ] - ext.prefix = { "${meta.id}_ar2" } + ext.prefix = { "${meta.id}_run${meta.run}_ar2" } + tag = { "${meta.id}_run${meta.run}" } } withName: BOWTIE2_PHIX_REMOVAL_ALIGN { - ext.prefix = { "${meta.id}.phix_removed" } + ext.prefix = { "${meta.id}_run${meta.run}_phix_removed" } publishDir = [ [ path: { "${params.outdir}/QC_shortreads/remove_phix" }, @@ -105,12 +111,13 @@ process { enabled: params.save_phixremoved_reads ] ] + tag = { "${meta.id}_run${meta.run}" } } withName: BOWTIE2_HOST_REMOVAL_ALIGN { ext.args = params.host_removal_verysensitive ? 
"--very-sensitive" : "--sensitive" ext.args2 = params.host_removal_save_ids ? "--host_removal_save_ids" : '' - ext.prefix = { "${meta.id}.host_removed" } + ext.prefix = { "${meta.id}_run${meta.run}_host_removed" } publishDir = [ [ path: { "${params.outdir}/QC_shortreads/remove_host" }, @@ -124,16 +131,40 @@ process { enabled: params.save_hostremoved_reads ] ] + tag = { "${meta.id}_run${meta.run}" } } withName: FASTQC_TRIMMED { ext.args = '--quiet' - ext.prefix = { "${meta.id}.trimmed" } + ext.prefix = { "${meta.id}_run${meta.run}_trimmed" } publishDir = [ path: { "${params.outdir}/QC_shortreads/fastqc" }, mode: params.publish_dir_mode, pattern: "*.html" ] + tag = { "${meta.id}_run${meta.run}" } + } + + withName: BBMAP_BBNORM { + ext.args = [ + params.bbnorm_target ? "target=${params.bbnorm_target}" : '', + params.bbnorm_min ? "min=${params.bbnorm_min}" : '', + ].join(' ').trim() + publishDir = [ + [ + path : { "${params.outdir}/bbmap/bbnorm/logs" }, + enabled: params.save_bbnorm_reads, + mode : params.publish_dir_mode, + pattern: "*.log" + ], + [ + path : { "${params.outdir}/bbmap/bbnorm/"}, + mode : 'copy', + enabled: params.save_bbnorm_reads, + mode : params.publish_dir_mode, + pattern: "*.fastq.gz" + ] + ] } withName: PORECHOP { @@ -143,6 +174,7 @@ process { pattern: "*_porechop.fastq", enabled: params.save_porechop_reads ] + ext.prefix = { "${meta.id}_run${meta.run}_trimmed" } } withName: FILTLONG { @@ -152,6 +184,7 @@ process { pattern: "*_lr_filtlong.fastq.gz", enabled: params.save_filtlong_reads ] + ext.prefix = { "${meta.id}_run${meta.run}_lengthfiltered" } } withName: NANOLYSE { @@ -168,6 +201,7 @@ process { enabled: params.save_lambdaremoved_reads ] ] + ext.prefix = { "${meta.id}_run${meta.run}_lambdafiltered" } } withName: NANOPLOT_RAW { @@ -252,29 +286,42 @@ process { ] } - withName: BOWTIE2_ASSEMBLY_ALIGN { - ext.args = params.bowtie2_mode ? params.bowtie2_mode : params.ancient_dna ? '--very-sensitive -N 1' : '' + withName: GENOMAD_ENDTOEND { + ext.args = [ + "--cleanup", + "--min-score ${params.genomad_min_score}", + "--splits ${params.genomad_splits}", + ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/Assembly/${assembly_meta.assembler}/QC/${assembly_meta.id}" }, + path: { "${params.outdir}/VirusIdentification/geNomad/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*.log" + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: 'MAG_DEPTHS_PLOT|MAG_DEPTHS_SUMMARY|MAG_DEPTHS_PLOT_REFINED' { + withName: BOWTIE2_ASSEMBLY_ALIGN { + ext.args = params.bowtie2_mode ? params.bowtie2_mode : params.ancient_dna ? 
'--very-sensitive -N 1' : '' + ext.prefix = { "${meta.id}.assembly" } publishDir = [ - path: { "${params.outdir}/GenomeBinning/depths/bins" }, - mode: params.publish_dir_mode, - pattern: "*.{png,tsv}" + [ + path: { "${params.outdir}/Assembly/${assembly_meta.assembler}/QC/${assembly_meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/Assembly/${assembly_meta.assembler}/QC/${assembly_meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{bam,bai}", + enabled: params.save_assembly_mapped_reads + ], ] } - withName: 'MAG_DEPTHS_SUMMARY_REFINED' { - ext.prefix = "bin_refined_depths_summary" + withName: 'MAG_DEPTHS_PLOT|MAG_DEPTHS_SUMMARY' { publishDir = [ path: { "${params.outdir}/GenomeBinning/depths/bins" }, mode: params.publish_dir_mode, - pattern: "*.{tsv}" + pattern: "*.{png,tsv}" ] } @@ -409,7 +456,7 @@ process { ] } - withName: GTDBTK_CLASSIFY { + withName: GTDBTK_CLASSIFYWF { ext.args = "--extension fa" publishDir = [ path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" }, @@ -430,7 +477,7 @@ process { withName: PROKKA { ext.args = "--metagenome" publishDir = [ - path: { "${params.outdir}/Prokka/${meta.assembler}" }, + path: { "${params.outdir}/Annotation/Prokka/${meta.assembler}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -448,7 +495,7 @@ process { withName: PRODIGAL { ext.args = "-p meta" publishDir = [ - path: { "${params.outdir}/Prodigal/${meta.assembler}/${meta.id}" }, + path: { "${params.outdir}/Annotation/Prodigal/${meta.assembler}/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -572,7 +619,7 @@ process { [ path: { "${params.outdir}/GenomeBinning/MaxBin2/bins/" }, mode: params.publish_dir_mode, - pattern: '*/*.fa.gz' + pattern: '*.fa.gz' ], ] } @@ -622,6 +669,10 @@ process { ext.prefix = { "${meta.assembler}-MaxBin2-${meta.id}" } } + withName: DASTOOL_FASTATOCONTIG2BIN_TIARA { + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}" } + } + withName: DASTOOL_DASTOOL { publishDir = [ [ @@ -650,6 +701,57 @@ process { ] } + withName: TIARA_TIARA { + publishDir = [ + [ + path: { "${params.outdir}/Taxonomy/Tiara" }, + mode: params.publish_dir_mode, + pattern: { "${meta.assembler}-${meta.id}.tiara.{txt}" } + ], + [ + path: { "${params.outdir}/Taxonomy/Tiara/log" }, + mode: params.publish_dir_mode, + pattern: { "log_${meta.assembler}-${meta.id}.tiara.{txt}" } + ] + ] + ext.args = { "--min_len ${params.tiara_min_length} --probabilities" } + ext.prefix = { "${meta.assembler}-${meta.id}.tiara" } + } + + withName: TIARA_CLASSIFY { + ext.args = { "--join_prokaryotes --assembler ${meta.assembler}" } + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.bin}-${meta.id}" } + } + + withName: TIARA_SUMMARY { + publishDir = [ + path: { "${params.outdir}/Taxonomy/" }, + mode: params.publish_dir_mode, + pattern: "tiara_summary.tsv" + ] + ext.prefix = "tiara_summary" + } + + withName: MMSEQS_DATABASES { + ext.prefix = { "${params.metaeuk_mmseqs_db.replaceAll("/", "-")}" } + publishDir = [ + path: { "${params.outdir}/Annotation/mmseqs_db/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: params.save_mmseqs_db + ] + } + + withName: METAEUK_EASYPREDICT { + ext.args = "" + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/Annotation/MetaEuk/${meta.assembler}/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index 21a099a6..348b95d5 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv' centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" skip_krona = true @@ -28,6 +28,6 @@ params { max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" busco_clean = true - gtdb = false + skip_gtdbtk = true skip_concoct = true } diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index b4621c81..92d51aec 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -11,8 +11,8 @@ */ params { - config_profile_name = 'Test profile for running with AdapterRemoval' - config_profile_description = 'Minimal test dataset to check pipeline function with AdapterRemoval data' + config_profile_name = 'Test profile for running with AdapterRemoval and domain classification' + config_profile_description = 'Minimal test dataset to check pipeline function with AdapterRemoval data and domain classification.' 
// Limit resources so that this can run on GitHub Actions max_cpus = 2 @@ -20,14 +20,16 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.euk.csv' centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" + metaeuk_db = "https://github.com/nf-core/test-datasets/raw/modules/data/proteomics/database/yeast_UPS.fasta" skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true clip_tool = 'adapterremoval' skip_concoct = true + bin_domain_classification = true } diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config index dcb8f7c9..325362fc 100644 --- a/conf/test_ancient_dna.config +++ b/conf/test_ancient_dna.config @@ -27,7 +27,7 @@ params { min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true ancient_dna = true binning_map_mode = 'own' skip_spades = false diff --git a/conf/test_no_clipping.config b/conf/test_bbnorm.config similarity index 78% rename from conf/test_no_clipping.config rename to conf/test_bbnorm.config index a4f1881e..5f481adf 100644 --- a/conf/test_no_clipping.config +++ b/conf/test_bbnorm.config @@ -11,8 +11,8 @@ */ params { - config_profile_name = 'Test profile for skipping all short read preprocessing' - config_profile_description = 'Minimal test dataset to check pipeline function when all short read preprocessing is skipped.' 
+ config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions max_cpus = 2 @@ -21,14 +21,20 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv' + keep_phix = true + skip_clipping = true + skip_prokka = true + skip_prodigal = true + skip_quast = true + skip_binning = true centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" - skip_clipping = true - keep_phix = true skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false - skip_concoct = true + busco_clean = true + skip_gtdbtk = true + bbnorm = true + coassemble_group = true } diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config index ddf44ceb..85dda8db 100644 --- a/conf/test_binrefinement.config +++ b/conf/test_binrefinement.config @@ -21,14 +21,17 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv' + assembly_input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/assembly_samplesheet.csv' centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true refine_bins_dastool = true refine_bins_dastool_threshold = 0 - postbinning_input = 'both' + // TODO not using 'both' until #489 merged + postbinning_input = 'refined_bins_only' + busco_clean = true } diff --git a/conf/test_busco_auto.config b/conf/test_busco_auto.config index 9480575c..6479012f 100644 --- a/conf/test_busco_auto.config +++ b/conf/test_busco_auto.config @@ -24,7 +24,7 @@ params { skip_spades = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - gtdb = false + skip_gtdbtk = true skip_prokka = true skip_prodigal = true skip_quast = true diff --git a/conf/test_full.config b/conf/test_full.config index 039fff67..4917332e 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,8 +10,6 @@ ---------------------------------------------------------------------------------------- */ -cleanup = true - params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' @@ -24,7 +22,7 @@ params { centrifuge_db = "s3://ngi-igenomes/test-data/mag/p_compressed+h+v.tar.gz" kraken2_db = "s3://ngi-igenomes/test-data/mag/minikraken_8GB_202003.tgz" cat_db = "s3://ngi-igenomes/test-data/mag/CAT_prepare_20210107.tar.gz" - gtdb = "s3://ngi-igenomes/test-data/mag/gtdbtk_r202_data.tar.gz" + gtdb_db = "s3://ngi-igenomes/test-data/mag/gtdbtk_r202_data.tar.gz" // reproducibility options for assembly spades_fix_cpus = 10 diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config index f91ef48c..b3487c6b 100644 --- a/conf/test_host_rm.config +++ b/conf/test_host_rm.config @@ -25,6 +25,6 @@ params { min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = 
"https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true skip_concoct = true } diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config index 8cf5e525..bc22d3d2 100644 --- a/conf/test_hybrid.config +++ b/conf/test_hybrid.config @@ -24,6 +24,6 @@ params { min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true skip_concoct = true } diff --git a/conf/test_hybrid_host_rm.config b/conf/test_hybrid_host_rm.config index 8a37b813..7a0e4a15 100644 --- a/conf/test_hybrid_host_rm.config +++ b/conf/test_hybrid_host_rm.config @@ -26,4 +26,5 @@ params { max_unbinned_contigs = 2 skip_binqc = true skip_concoct = true + skip_gtdbtk = true } diff --git a/conf/test_nothing.config b/conf/test_nothing.config new file mode 100644 index 00000000..53df219f --- /dev/null +++ b/conf/test_nothing.config @@ -0,0 +1,43 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Runs input data but skipping all possible steps to allow for a fast testing + profile for input checks etc. + + Use as follows: + nextflow run nf-core/mag -profile test_nothing, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv' + centrifuge_db = null + kraken2_db = null + skip_krona = true + skip_clipping = true + skip_adapter_trimming = true + skip_spades = true + skip_spadeshybrid = true + skip_megahit = true + skip_quast = true + skip_prodigal = true + skip_binning = true + skip_metabat2 = true + skip_maxbin2 = true + skip_concoct = true + skip_prokka = true + skip_binqc = true + skip_gtdbtk = true + skip_concoct = true +} diff --git a/conf/test_virus_identification.config b/conf/test_virus_identification.config new file mode 100644 index 00000000..e15fab7d --- /dev/null +++ b/conf/test_virus_identification.config @@ -0,0 +1,42 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. 
+
+    Use as follows:
+        nextflow run nf-core/mag -profile test_virus_identification, --outdir 
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile for running virus_identification'
+    config_profile_description = 'Minimal test dataset to check pipeline function virus identification'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                    = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
+    run_virus_identification = true
+    genomad_splits           = 7
+
+    // For computational efficiency
+    reads_minlength       = 150
+    coassemble_group      = true
+    skip_gtdbtk           = true
+    skip_binning          = true
+    skip_prokka           = true
+    skip_spades           = true
+    skip_spadeshybrid     = true
+    skip_quast            = true
+    skip_prodigal         = true
+    skip_krona            = true
+    skip_adapter_trimming = true
+    skip_metabat2         = true
+    skip_maxbin2          = true
+    skip_busco            = true
+}
diff --git a/docs/images/mag_workflow.png b/docs/images/mag_workflow.png
index f476287a..d4cda1a0 100644
Binary files a/docs/images/mag_workflow.png and b/docs/images/mag_workflow.png differ
diff --git a/docs/images/mag_workflow.svg b/docs/images/mag_workflow.svg
index f847cec3..cf9dfc1f 100644
--- a/docs/images/mag_workflow.svg
+++ b/docs/images/mag_workflow.svg
[SVG markup diff not reproducible here (tags lost in extraction). Recoverable content of the updated workflow diagram: new "Domain classification" boxes for Tiara and MetaEuk, a new "Virus identification" box for geNomad, the bin "Evaluation" tools (BUSCO, CheckM, GUNC, QUAST) regrouped under "Bin post-processing", and the version label bumped from v2.3.0 to v2.4.0.]
diff --git a/docs/output.md b/docs/output.md
index 301dc6ad..31b86883 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -12,8 +12,10 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [Quality control](#quality-control) of input reads - trimming and contaminant removal
 - [Taxonomic classification of trimmed reads](#taxonomic-classification-of-trimmed-reads)
+- [Digital sequencing normalisation](#digital-normalization-with-BBnorm)
+- [Assembly](#assembly) of trimmed reads
 - [Protein-coding gene prediction](#gene-prediction) of assemblies
+- [Virus identification](#virus-identification-in-assemblies) of assemblies
 - [Binning and binning refinement](#binning-and-binning-refinement) of assembled contigs
 - [Taxonomic classification of binned genomes](#taxonomic-classification-of-binned-genomes)
 - [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes)
@@ -129,6 +131,20 @@ NanoPlot is used to calculate various metrics and plots about the quality and le
+## Digital normalization with BBnorm
+
+If the pipeline is called with the `--bbnorm` option, it will normalize the sequencing depth of libraries prior to assembly by removing reads to 1) reduce the coverage of very abundant kmers and 2) delete very rare kmers (see the `--bbnorm_target` and `--bbnorm_min` parameters).
+When called in conjunction with `--coassemble_group`, BBnorm will operate on interleaved (merged) FastQ files, producing only a single output file.
+If the `--save_bbnorm_reads` parameter is set, the resulting FastQ files are saved together with log output.
+
+
+Output files + +- `bbmap/bbnorm/[sample]\*.fastq.gz` +- `bbmap/bbnorm/log/[sample].bbnorm.log` + +
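A minimal sketch of how the normalisation options documented above might be combined on the command line; the parameter names come from this changeset, while the target/min values and file names are purely illustrative:

```bash
# Hedged example: normalise read depth with BBnorm before assembly
nextflow run nf-core/mag -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --bbnorm \
    --bbnorm_target 100 \
    --bbnorm_min 5 \
    --save_bbnorm_reads
```

As noted above, adding `--coassemble_group` would make BBnorm run on the interleaved, group-merged reads and emit a single normalised FastQ file per group.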
+ ## Taxonomic classification of trimmed reads ### Kraken @@ -177,6 +193,7 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - `MEGAHIT-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `MEGAHIT-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. @@ -195,6 +212,7 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - `SPAdes-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - `SPAdes-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `SPAdes-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. @@ -213,6 +231,7 @@ SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) soft - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - `SPAdesHybrid-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - `SPAdesHybrid-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `SPAdesHybrid-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. @@ -246,11 +265,40 @@ Protein-coding genes are predicted for each assembly.
Output files -- `Prodigal/` - - `[sample/group].gff`: Gene Coordinates in GFF format - - `[sample/group].faa`: The protein translation file consists of all the proteins from all the sequences in multiple FASTA format. - - `[sample/group].fna`: Nucleotide sequences of the predicted proteins using the DNA alphabet, not mRNA (so you will see 'T' in the output and not 'U'). - - `[sample/group]_all.txt`: Information about start positions of genes. +- `Annotation/Prodigal/` + - `[assembler]-[sample/group].gff.gz`: Gene Coordinates in GFF format + - `[assembler]-[sample/group].faa.gz`: The protein translation file consists of all the proteins from all the sequences in multiple FASTA format. + - `[assembler]-[sample/group].fna.gz`: Nucleotide sequences of the predicted proteins using the DNA alphabet, not mRNA (so you will see 'T' in the output and not 'U'). + - `[assembler]-[sample/group]_all.txt.gz`: Information about start positions of genes. + +
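Since the Prodigal results are now published gzip-compressed under `Annotation/Prodigal/`, they can be inspected without unpacking them first; the exact directory layout and file prefix below are assumptions for illustration only:

```bash
# Hedged example: peek at the gzipped Prodigal protein predictions for one assembly
zcat results/Annotation/Prodigal/MEGAHIT/sample1/MEGAHIT-sample1.faa.gz | head
```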
+
+## Virus identification in assemblies
+
+### geNomad
+
+[geNomad](https://github.com/apcamargo/genomad) identifies viruses and plasmids in sequencing data (isolates, metagenomes, and metatranscriptomes).
+
+
+Output files + +- `VirusIdentification/geNomad/[assembler]-[sample/group]*/` + - `[assembler]-[sample/group]*_annotate` + - `[assembler]-[sample/group]*_taxonomy.tsv`: Taxonomic assignment data + - `[assembler]-[sample/group]*_aggregated_classification` + - `[assembler]-[sample/group]*_aggregated_classification.tsv`: Sequence classification in tabular format + - `[assembler]-[sample/group]*_find_proviruses` + - `[assembler]-[sample/group]*_provirus.tsv`: Characteristics of proviruses identified by geNomad + - `[assembler]-[sample/group]*_summary` + - `[assembler]-[sample/group]*_virus_summary.tsv`: Virus classification summary file in tabular format + - `[assembler]-[sample/group]*_plasmid_summary.tsv`: Plasmid classification summary file in tabular format + - `[assembler]-[sample/group]*_viruses_genes.tsv`: Virus gene annotation data in tabular format + - `[assembler]-[sample/group]*_plasmids_genes.tsv`: Plasmid gene annotation data in tabular format + - `[assembler]-[sample/group]*_viruses.fna`: Virus nucleotide sequences in FASTA format + - `[assembler]-[sample/group]*_plasmids.fna`: Plasmid nucleotide sequences in FASTA format + - `[assembler]-[sample/group]*_viruses_proteins.faa`: Virus protein sequences in FASTA format + - `[assembler]-[sample/group]*_plasmids_proteins.faa`: Plasmid protein sequences in FASTA format + - `[assembler]-[sample/group]*.log`: Plain text log file detailing the steps executed by geNomad (annotate, find-proviruses, marker-classification, nn-classification, aggregated-classification and summary)
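A minimal sketch of switching this step on, using the virus-identification parameters that appear elsewhere in this changeset (`--run_virus_identification`, `--genomad_min_score`, `--genomad_splits`); the score and split values shown are illustrative, not recommended defaults:

```bash
# Hedged example: enable geNomad-based virus identification
nextflow run nf-core/mag -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --run_virus_identification \
    --genomad_min_score 0.7 \
    --genomad_splits 1
```

Increasing `--genomad_splits` splits the geNomad search into smaller chunks, trading runtime for a lower memory footprint, which is presumably why the CI test profile in this changeset sets it to 7.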
@@ -377,6 +425,22 @@ By default, only the raw bins (and unbinned contigs) from the actual binning met ⚠️ Due to ability to perform downstream QC of both raw and refined bins in parallel (via `--postbinning_input)`, bin names in DAS Tools's `*_allBins.eval` file will include `Refined`. However for this particular file, they _actually_ refer to the 'raw' input bins. The pipeline renames the input files prior to running DASTool to ensure they can be disambiguated from the original bin files in the downstream QC steps. +### Tiara + +Tiara is a contig classifier that identifies the domain (prokarya, eukarya) of contigs within an assembly. This is used in this pipeline to rapidly and with few resources identify the most likely domain classification of each bin or unbin based on its contig identities. + +
+Output files + +- `Taxonomy/Tiara/` + - `[assembler]-[sample/group].tiara.txt` - Tiara output classifications (with probabilities) for all contigs within the specified sample/group assembly + - `log/log_[assembler]-[sample/group].txt` - log file detailing the parameters used by the Tiara model for contig classification. +- `GenomeBinning/tiara_summary.tsv` - Summary of Tiara domain classification for all bins. + +
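Tiara-based domain classification is switched on with `--bin_domain_classification`, as the AdapterRemoval test profile in this changeset does; a hedged sketch follows, with an illustrative contig-length cut-off for `--tiara_min_length`:

```bash
# Hedged example: classify the domain of each bin with Tiara
nextflow run nf-core/mag -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --bin_domain_classification \
    --tiara_min_length 3000
```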
+ +Typically, you would use `tiara_summary.tsv` as the primary file to see which bins or unbins have been classified to which domains at a glance, whereas `[assembler]-[sample/group].tiara.txt` provides classifications for each contig. + ### Bin sequencing depth For each bin or refined bin the median sequencing depth is computed based on the corresponding contig depths. @@ -565,19 +629,34 @@ Whole genome annotation is the process of identifying features of interest in a
Output files -- `Prokka/[assembler]/[bin]/` - - `[bin].gff`: annotation in GFF3 format, containing both sequences and annotations - - `[bin].gbk`: annotation in GenBank format, containing both sequences and annotations - - `[bin].fna`: nucleotide FASTA file of the input contig sequences - - `[bin].faa`: protein FASTA file of the translated CDS sequences - - `[bin].ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA) - - `[bin].sqn`: an ASN1 format "Sequin" file for submission to Genbank - - `[bin].fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file - - `[bin].tbl`: feature Table file, used by "tbl2asn" to create the .sqn file - - `[bin].err`: unacceptable annotations - the NCBI discrepancy report. - - `[bin].log`: contains all the output that Prokka produced during its run - - `[bin].txt`: statistics relating to the annotated features found - - `[bin].tsv`: tab-separated file of all features (locus_tag, ftype, len_bp, gene, EC_number, COG, product) +- `Annotation/Prokka/[assembler]/[bin]/` + - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format, containing both sequences and annotations + - `[assembler]-[binner]-[bin].gbk`: annotation in GenBank format, containing both sequences and annotations + - `[assembler]-[binner]-[bin].fna`: nucleotide FASTA file of the input contig sequences + - `[assembler]-[binner]-[bin].faa`: protein FASTA file of the translated CDS sequences + - `[assembler]-[binner]-[bin].ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA) + - `[assembler]-[binner]-[bin].sqn`: an ASN1 format "Sequin" file for submission to Genbank + - `[assembler]-[binner]-[bin].fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file + - `[assembler]-[binner]-[bin].tbl`: feature Table file, used by "tbl2asn" to create the .sqn file + - `[assembler]-[binner]-[bin].err`: unacceptable annotations - the NCBI discrepancy report. + - `[assembler]-[binner]-[bin].log`: contains all the output that Prokka produced during its run + - `[assembler]-[binner]-[bin].txt`: statistics relating to the annotated features found + - `[assembler]-[binner]-[bin].tsv`: tab-separated file of all features (locus_tag, ftype, len_bp, gene, EC_number, COG, product) + +
+ +### MetaEuk + +In cases where eukaryotic genomes are recovered in binning, [MetaEuk](https://github.com/soedinglab/metaeuk) is also available to annotate eukaryotic genomes quickly with standards-compliant output files. + +
+Output files + +- `Annotation/MetaEuk/[assembler]/[bin]` + - `[assembler]-[binner]-[bin].fas`: fasta file of protein sequences identified by MetaEuk + - `[assembler]-[binner]-[bin].codon.fas`: fasta file of nucleotide sequences corresponding to the protein sequences fasta + - `[assembler]-[binner]-[bin].headersMap.tsv`: tab-separated table containing the information from each header in the fasta files + - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format
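MetaEuk annotation of putative eukaryotic bins can be pointed at a protein FASTA via `--metaeuk_db` (as the AdapterRemoval test profile in this changeset does) or at a named database via `--metaeuk_mmseqs_db`; the file name below is purely illustrative:

```bash
# Hedged example: annotate eukaryotic bins with MetaEuk against a local protein FASTA
nextflow run nf-core/mag -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --bin_domain_classification \
    --metaeuk_db proteins.fasta
```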
@@ -610,7 +689,7 @@ Optional, only running when parameter `-profile ancient_dna` is specified. ### `variant_calling` -Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus on the contig sequence. To avoid this situation, the consensus is re-called with a variant calling software using the reads aligned back to the contigs +Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus on the contig sequence. To avoid this situation, the consensus is optionally re-called with a variant calling software using the reads aligned back to the contigs when `--run_ancient_damagecorrection` is supplied.
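A hedged sketch of the ancient DNA route described above, combining `--ancient_dna` (used by the ancient DNA test profile in this changeset) with the optional damage-corrected consensus re-calling:

```bash
# Hedged example: ancient DNA mode with damage-corrected consensus re-calling
nextflow run nf-core/mag -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --ancient_dna \
    --run_ancient_damagecorrection
```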
 Output files
@@ -640,6 +719,21 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc
 Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .
+The general stats table at the top of the report will by default only display the most relevant pre- and post-processing statistics prior to assembly, i.e., FastQC, fastp/Adapter removal, and Bowtie2 PhiX and host removal mapping results.
+
+Note that the FastQC raw and processed columns are right next to each other for improved visual comparability; however, the processed columns represent the input reads _after_ fastp/Adapter Removal processing (the dedicated columns of which come directly after the two FastQC sets of columns). Hover your cursor over each column name to see which tool the column is derived from.
+
+Summary tool-specific plots and tables of the following tools are currently displayed (if activated):
+
+- FastQC (pre- and post-trimming)
+- fastp
+- Adapter Removal
+- bowtie2
+- BUSCO
+- QUAST
+- Kraken2 / Centrifuge
+- PROKKA
+
 ### Pipeline information
diff --git a/docs/usage.md b/docs/usage.md
index ea80671f..c991434c 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -6,7 +6,7 @@ ## Input specifications
-The input data can be passed to nf-core/mag in two possible ways using the `--input` parameter.
+The input data can be passed to nf-core/mag in three possible ways, either using the `--input` or `--assembly_input` parameters.
 ### Direct FASTQ input (short reads only)
@@ -27,12 +27,13 @@ Please note the following additional requirements:
 - When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs
 - To run single-end data you must additionally specify `--single_end`
 - If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`
+- Sample name and run combinations must be unique
 ### Samplesheet input file
-Alternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata.
+Alternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata. Furthermore, when a `run` column is present, the pipeline will also perform run- or lane-wise concatenation, for cases where you may have a sample or library sequenced with the same sequencing configuration across multiple runs. The optional run merging happens after short read QC (adapter clipping, host/PhiX removal etc.), and prior to normalisation, taxonomic profiling, and assembly.
-This CSV file should contain the following columns:
+At a minimum, the CSV file should contain the following columns:
 `sample,group,short_reads_1,short_reads_2,long_reads`
@@ -53,12 +54,22 @@ sample1,0,data/sample1.fastq.gz,,
 sample2,0,data/sample2.fastq.gz,,
 ```
+or, to additionally perform run merging of two runs of sample1:
+
+```bash
+sample,run,group,short_reads_1,short_reads_2,long_reads
+sample1,1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
+sample1,2,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
+sample2,0,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz,data/sample2.fastq.gz
+sample3,1,0,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz,
+```
+
 Please note the following requirements:
-- 5 comma-seperated columns
+- a minimum of 5 comma-separated columns
 - Valid file extension: `.csv`
-- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads`
-- Sample IDs must be unique
+- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads` (where `run` can be optionally added)
+- Run IDs must be unique within a multi-run sample. A sample with multiple runs will be automatically concatenated.
 - FastQ files must be compressed (`.fastq.gz`, `.fq.gz`)
 - `long_reads` can only be provided in combination with paired-end short read data
 - Within one samplesheet either only single-end or only paired-end reads can be specified
@@ -66,6 +77,47 @@ Again, by default, the group information is only used to compute co-abundances for the binning step, but not for group-wise co-assembly (see the parameter docs for [`--coassemble_group`](https://nf-co.re/mag/parameters#coassemble_group) and [`--binning_map_mode`](https://nf-co.re/mag/parameters#binning_map_mode) for more information about how this group information can be used).
+### Supplying pre-computed assemblies
+
+It is also possible to run nf-core/mag on pre-computed assemblies, by supplying a CSV file to the parameter `--assembly_input` in addition to the raw reads supplied to `--input`. Supplying assembly input skips all read pre-processing and assembly, jumping straight to the binning stage of the pipeline.
+
+The assembly CSV file should contain the following columns:
+
+`id,group,assembler,fasta`
+
+Where `id` is the ID of the assembly, `group` is the assembly/binning group (see the samplesheet information section for more details), `assembler` is the assembler used to produce the assembly (one of `MEGAHIT`, `SPAdes`, or `SPAdesHybrid`), and `fasta` is the path to the assembly fasta file. Input fasta files can be compressed or uncompressed, but compressed assemblies will be automatically uncompressed for use within the pipeline. The exact information required for each supplied assembly depends on whether the assemblies provided are single assemblies or group-wise co-assemblies. For the following example `--input` CSV:
+
+```bash
+sample,group,short_reads_1,short_reads_2,long_reads
+sample1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,
+sample2,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz,
+sample3,1,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz,
+```
+
+If the assemblies are single assemblies, then the `id` and `group` columns should match those supplied in the `--input` read CSV files for each read set:
+
+```bash
+id,group,assembler,fasta
+sample1,0,MEGAHIT,MEGAHIT-sample1.contigs.fa.gz
+sample1,0,SPAdes,SPAdes-sample1.fasta.gz
+sample2,0,MEGAHIT,MEGAHIT-sample2.contigs.fa.gz
+sample2,0,SPAdes,SPAdes-sample2.contigs.fasta.gz
+sample3,1,MEGAHIT,MEGAHIT-sample3.contigs.fa.gz
+sample3,1,SPAdes,SPAdes-sample3.contigs.fasta.gz
+```
+
+If the assemblies are co-assemblies, the parameter `--coassemble_group` should additionally be specified. In this case, the `id` column should uniquely identify the assembly, while `group` should match those specified in the `--input` CSV file:
+
+```bash
+id,group,assembler,fasta
+group-0,0,MEGAHIT,MEGAHIT-group-0.contigs.fa.gz
+group-0,0,SPAdes,SPAdes-group-0.contigs.fasta.gz
+group-1,1,MEGAHIT,MEGAHIT-group-1.contigs.fa.gz
+group-1,1,SPAdes,SPAdes-group-1.contigs.fasta.gz
+```
+
+When supplying pre-computed assemblies, reads **must** also be provided in the CSV input format to `--input`, and should be the reads used to build the assemblies, i.e., adapter-removed, run-merged, etc. Preprocessing steps will not be run on raw reads when pre-computed assemblies are supplied. As long reads are only used for assembly, any long read fastq files listed in the reads CSV are ignored.
+
 ## Running the pipeline
 The typical command for running the pipeline is as follows:
@@ -85,7 +137,29 @@ work # Directory containing the nextflow working files
 # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```
-See the [nf-core/mag website documentation](https://nf-co.re/mag/usage#usage) for more information about pipeline specific parameters.
+If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file.
+
+Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `.
+
+> ⚠️ Do not use `-c ` to specify parameters as this will result in errors.
Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+
+The above pipeline run specified with a params file in yaml format:
+
+```bash
+nextflow run nf-core/mag -profile docker -params-file params.yaml
+```
+
+with `params.yaml` containing:
+
+```yaml
+input: './samplesheet.csv'
+outdir: './results/'
+<...>
+```
+
+You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
+
+See the [nf-core/mag website documentation](https://nf-co.re/mag/parameters) for more information about pipeline specific parameters.
 ### Updating the pipeline
@@ -103,6 +177,10 @@ First, go to the [nf-core/mag releases page](https://github.com/nf-core/mag/rele
 This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.
+To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
+
+> 💡 If you wish to share such a profile (e.g. to upload as supplementary material for academic publications), make sure NOT to include cluster-specific paths to files, nor institution-specific profiles.
+
 Additionally, to enable also reproducible results from the individual assembly tools this pipeline provides extra parameters. SPAdes is designed to be deterministic for a given number of threads. To generate reproducible results set the number of cpus with `--spades_fix_cpus` or `--spadeshybrid_fix_cpus`. This will overwrite the number of cpus specified in the `base.config` file and additionally ensure that it is not increased in case of retries for individual samples. MEGAHIT only generates reproducible results when run single-threaded. You can fix this by using the prameter `--megahit_fix_cpu_1`. In both cases, do not specify the number of cpus for these processes in additional custom config files, this would result in an error.
@@ -112,6 +190,8 @@ To allow also reproducible bin QC with BUSCO, run BUSCO providing already downlo
 For the taxonomic bin classification with [CAT](https://github.com/dutilh/CAT), when running the pipeline with `--cat_db_generate` the parameter `--save_cat_db` can be used to also save the generated database to allow reproducibility in future runs. Note that when specifying a pre-built database with `--cat_db`, currently the database can not be saved.
+When it comes to visualizing taxonomic data using [Krona](https://github.com/marbl/Krona), you have the option to provide a taxonomy file, such as `taxonomy.tab`, using the `--krona_db` parameter. If you don't supply a taxonomy file, Krona is designed to automatically download the required taxonomy data for visualization.
+
 The taxonomic classification of bins with GTDB-Tk is not guaranteed to be reproducible, since the placement of bins in the reference tree is non-deterministic. However, the authors of the GTDB-Tk article examined the reproducibility on a set of 100 genomes across 50 trials and did not observe any difference (see [https://doi.org/10.1093/bioinformatics/btz848](https://doi.org/10.1093/bioinformatics/btz848)).
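For example, the assembler reproducibility parameters discussed above might be combined as follows (the CPU counts mirror the full-size test profile and are illustrative):

```bash
# Hedged example: pin assembler CPU usage so assemblies are reproducible
nextflow run nf-core/mag -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --spades_fix_cpus 10 \
    --spadeshybrid_fix_cpus 10 \
    --megahit_fix_cpu_1
```

As the text above stresses, the CPU count for these processes should then not be overridden again in a custom config file.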
## Core Nextflow arguments @@ -122,7 +202,7 @@ The taxonomic classification of bins with GTDB-Tk is not guaranteed to be reprod Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. @@ -146,8 +226,10 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) +- `apptainer` + - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `conda` - - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. ### `-resume` @@ -163,58 +245,19 @@ Specify the path to a specific config file (this is a core Nextflow command). Se Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. -For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue: - -```bash -[62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) -Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' - -Caused by: - Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) +To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. -Command executed: - STAR \ - --genomeDir star \ - --readFilesIn WT_REP1_trimmed.fq.gz \ - --runThreadN 2 \ - --outFileNamePrefix WT_REP1. 
\ - +### Custom Containers -Command exit status: - 137 +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. -Command output: - (empty) +To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. -Command error: - .command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1. -Work dir: - /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb +### Custom Tool Arguments -Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` -``` +A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. -#### For beginners - -A first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. Therefor you can try to increase the resource for the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. Therefore you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB, you than can try to run your pipeline again with `--max_memory 200GB -resume` to skip all process, that were already calculated. If you can not increase the resource of the complete pipeline, you can try to adapt the resource for a single process as mentioned below. - -#### Advanced option on process level - -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). -We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`. -If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). -The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. -The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. 
-Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. -The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. - -```nextflow -process { - withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' { - memory = 100.GB - } -} -``` +To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. Note, do not change number of CPUs with custom config files for the processes `spades`, `spadeshybrid` or `megahit` when specifying the parameters `--spades_fix_cpus`, `--spadeshybrid_fix_cpus` and `--megahit_fix_cpu_1` respectively. diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy deleted file mode 100755 index 33cd4f6e..00000000 --- a/lib/NfcoreSchema.groovy +++ /dev/null @@ -1,528 +0,0 @@ -// -// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. -// - -import org.everit.json.schema.Schema -import org.everit.json.schema.loader.SchemaLoader -import org.everit.json.schema.ValidationException -import org.json.JSONObject -import org.json.JSONTokener -import org.json.JSONArray -import groovy.json.JsonSlurper -import groovy.json.JsonBuilder - -class NfcoreSchema { - - // - // Resolve Schema path relative to main workflow directory - // - public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') { - return "${workflow.projectDir}/${schema_filename}" - } - - // - // Function to loop over all parameters defined in schema and check - // whether the given parameters adhere to the specifications - // - /* groovylint-disable-next-line UnusedPrivateMethodParameter */ - public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { - def has_error = false - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Check for nextflow core params and unexpected params - def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text - def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') - def nf_params = [ - // Options for base `nextflow` command - 'bg', - 'c', - 'C', - 'config', - 'd', - 'D', - 'dockerize', - 'h', - 'log', - 'q', - 'quiet', - 'syslog', - 'v', - - // Options for `nextflow run` command - 'ansi', - 'ansi-log', - 'bg', - 'bucket-dir', - 'c', - 'cache', - 'config', - 'dsl2', - 'dump-channels', - 'dump-hashes', - 'E', - 'entry', - 'latest', - 'lib', - 'main-script', - 'N', - 'name', - 'offline', - 'params-file', - 'pi', - 'plugins', - 'poll-interval', - 'pool-size', - 'profile', - 'ps', - 'qs', - 'queue-size', - 'r', - 'resume', - 'revision', - 'stdin', - 'stub', - 'stub-run', - 'test', - 'w', - 'with-charliecloud', - 'with-conda', - 'with-dag', - 'with-docker', - 'with-mpi', - 'with-notification', - 'with-podman', - 'with-report', - 'with-singularity', - 'with-timeline', - 'with-tower', - 'with-trace', - 'with-weblog', - 'without-docker', - 'without-podman', - 'work-dir' - ] - def unexpectedParams = [] - - // Collect expected parameters from the schema - def 
expectedParams = [] - def enums = [:] - for (group in schemaParams) { - for (p in group.value['properties']) { - expectedParams.push(p.key) - if (group.value['properties'][p.key].containsKey('enum')) { - enums[p.key] = group.value['properties'][p.key]['enum'] - } - } - } - - for (specifiedParam in params.keySet()) { - // nextflow params - if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" - has_error = true - } - // unexpected params - def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' - def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } - def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() - def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase)) - if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) { - // Temporarily remove camelCase/camel-case params #1035 - def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} - if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ - unexpectedParams.push(specifiedParam) - } - } - } - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Validate parameters against the schema - InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() - JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) - - // Remove anything that's in params.schema_ignore_params - raw_schema = removeIgnoredParams(raw_schema, params) - - Schema schema = SchemaLoader.load(raw_schema) - - // Clean the parameters - def cleanedParams = cleanParameters(params) - - // Convert to JSONObject - def jsonParams = new JsonBuilder(cleanedParams) - JSONObject params_json = new JSONObject(jsonParams.toString()) - - // Validate - try { - schema.validate(params_json) - } catch (ValidationException e) { - println '' - log.error 'ERROR: Validation of pipeline parameters failed!' 
- JSONObject exceptionJSON = e.toJSON() - printExceptions(exceptionJSON, params_json, log, enums) - println '' - has_error = true - } - - // Check for unexpected parameters - if (unexpectedParams.size() > 0) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - println '' - def warn_msg = 'Found unexpected parameters:' - for (unexpectedParam in unexpectedParams) { - warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" - } - log.warn warn_msg - log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" - println '' - } - - if (has_error) { - System.exit(1) - } - } - - // - // Beautify parameters for --help - // - public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - Integer num_hidden = 0 - String output = '' - output += 'Typical pipeline command:\n\n' - output += " ${colors.cyan}${command}${colors.reset}\n\n" - Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - Integer max_chars = paramsMaxChars(params_map) + 1 - Integer desc_indent = max_chars + 14 - Integer dec_linewidth = 160 - desc_indent - for (group in params_map.keySet()) { - Integer num_params = 0 - String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (group_params.get(param).hidden && !params.show_hidden_params) { - num_hidden += 1 - continue; - } - def type = '[' + group_params.get(param).type + ']' - def description = group_params.get(param).description - def defaultValue = group_params.get(param).default != null ? " [default: " + group_params.get(param).default.toString() + "]" : '' - def description_default = description + colors.dim + defaultValue + colors.reset - // Wrap long description texts - // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap - if (description_default.length() > dec_linewidth){ - List olines = [] - String oline = "" // " " * indent - description_default.split(" ").each() { wrd -> - if ((oline.size() + wrd.size()) <= dec_linewidth) { - oline += wrd + " " - } else { - olines += oline - oline = wrd + " " - } - } - olines += oline - description_default = olines.join("\n" + " " * desc_indent) - } - group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' - num_params += 1 - } - group_output += '\n' - if (num_params > 0){ - output += group_output - } - } - if (num_hidden > 0){ - output += colors.dim + "!! 
Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset - } - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Groovy Map summarising parameters/workflow options used by the pipeline - // - public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') { - // Get a selection of core Nextflow workflow options - def Map workflow_summary = [:] - if (workflow.revision) { - workflow_summary['revision'] = workflow.revision - } - workflow_summary['runName'] = workflow.runName - if (workflow.containerEngine) { - workflow_summary['containerEngine'] = workflow.containerEngine - } - if (workflow.container) { - workflow_summary['container'] = workflow.container - } - workflow_summary['launchDir'] = workflow.launchDir - workflow_summary['workDir'] = workflow.workDir - workflow_summary['projectDir'] = workflow.projectDir - workflow_summary['userName'] = workflow.userName - workflow_summary['profile'] = workflow.profile - workflow_summary['configFiles'] = workflow.configFiles.join(', ') - - // Get pipeline parameters defined in JSON Schema - def Map params_summary = [:] - def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - for (group in params_map.keySet()) { - def sub_params = new LinkedHashMap() - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (params.containsKey(param)) { - def params_value = params.get(param) - def schema_value = group_params.get(param).default - def param_type = group_params.get(param).type - if (schema_value != null) { - if (param_type == 'string') { - if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { - def sub_string = schema_value.replace('\$projectDir', '') - sub_string = sub_string.replace('\${projectDir}', '') - if (params_value.contains(sub_string)) { - schema_value = params_value - } - } - if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { - def sub_string = schema_value.replace('\$params.outdir', '') - sub_string = sub_string.replace('\${params.outdir}', '') - if ("${params.outdir}${sub_string}" == params_value) { - schema_value = params_value - } - } - } - } - - // We have a default in the schema, and this isn't it - if (schema_value != null && params_value != schema_value) { - sub_params.put(param, params_value) - } - // No default in the schema, and this isn't empty - else if (schema_value == null && params_value != "" && params_value != null && params_value != false) { - sub_params.put(param, params_value) - } - } - } - params_summary.put(group, sub_params) - } - return [ 'Core Nextflow options' : workflow_summary ] << params_summary - } - - // - // Beautify parameters for summary and return as string - // - public static String paramsSummaryLog(workflow, params) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - String output = '' - def params_map = paramsSummaryMap(workflow, params) - def max_chars = paramsMaxChars(params_map) - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - if (group_params) { - output += colors.bold + group + colors.reset + '\n' - for (param in group_params.keySet()) { - output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' - } - output += '\n' - } - } - output 
+= "!! Only displaying parameters that differ from the pipeline defaults !!\n" - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Loop over nested exceptions and print the causingException - // - private static void printExceptions(ex_json, params_json, log, enums, limit=5) { - def causingExceptions = ex_json['causingExceptions'] - if (causingExceptions.length() == 0) { - def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ - // Missing required param - if (m.matches()) { - log.error "* Missing required parameter: --${m[0][1]}" - } - // Other base-level error - else if (ex_json['pointerToViolation'] == '#') { - log.error "* ${ex_json['message']}" - } - // Error with specific param - else { - def param = ex_json['pointerToViolation'] - ~/^#\// - def param_val = params_json[param].toString() - if (enums.containsKey(param)) { - def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices" - if (enums[param].size() > limit) { - log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... )" - } else { - log.error "${error_msg}: ${enums[param].join(', ')})" - } - } else { - log.error "* --${param}: ${ex_json['message']} (${param_val})" - } - } - } - for (ex in causingExceptions) { - printExceptions(ex, params_json, log, enums) - } - } - - // - // Remove an element from a JSONArray - // - private static JSONArray removeElement(json_array, element) { - def list = [] - int len = json_array.length() - for (int i=0;i - if(raw_schema.keySet().contains('definitions')){ - raw_schema.definitions.each { definition -> - for (key in definition.keySet()){ - if (definition[key].get("properties").keySet().contains(ignore_param)){ - // Remove the param to ignore - definition[key].get("properties").remove(ignore_param) - // If the param was required, change this - if (definition[key].has("required")) { - def cleaned_required = removeElement(definition[key].required, ignore_param) - definition[key].put("required", cleaned_required) - } - } - } - } - } - if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) { - raw_schema.get("properties").remove(ignore_param) - } - if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) { - def cleaned_required = removeElement(raw_schema.required, ignore_param) - raw_schema.put("required", cleaned_required) - } - } - return raw_schema - } - - // - // Clean and check parameters relative to Nextflow native classes - // - private static Map cleanParameters(params) { - def new_params = params.getClass().newInstance(params) - for (p in params) { - // remove anything evaluating to false - if (!p['value']) { - new_params.remove(p.key) - } - // Cast MemoryUnit to String - if (p['value'].getClass() == nextflow.util.MemoryUnit) { - new_params.replace(p.key, p['value'].toString()) - } - // Cast Duration to String - if (p['value'].getClass() == nextflow.util.Duration) { - new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) - } - // Cast LinkedHashMap to String - if (p['value'].getClass() == LinkedHashMap) { - new_params.replace(p.key, p['value'].toString()) - } - } - return new_params - } - - // - // This function tries to read a JSON params file - // - private static LinkedHashMap paramsLoad(String json_schema) { - def params_map = new LinkedHashMap() - try { - params_map = paramsRead(json_schema) - } catch (Exception e) { - println "Could not 
read parameters settings from JSON. $e" - params_map = new LinkedHashMap() - } - return params_map - } - - // - // Method to actually read in JSON file using Groovy. - // Group (as Key), values are all parameters - // - Parameter1 as Key, Description as Value - // - Parameter2 as Key, Description as Value - // .... - // Group - // - - private static LinkedHashMap paramsRead(String json_schema) throws Exception { - def json = new File(json_schema).text - def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') - def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') - /* Tree looks like this in nf-core schema - * definitions <- this is what the first get('definitions') gets us - group 1 - title - description - properties - parameter 1 - type - description - parameter 2 - type - description - group 2 - title - description - properties - parameter 1 - type - description - * properties <- parameters can also be ungrouped, outside of definitions - parameter 1 - type - description - */ - - // Grouped params - def params_map = new LinkedHashMap() - schema_definitions.each { key, val -> - def Map group = schema_definitions."$key".properties // Gets the property object of the group - def title = schema_definitions."$key".title - def sub_params = new LinkedHashMap() - group.each { innerkey, value -> - sub_params.put(innerkey, value) - } - params_map.put(title, sub_params) - } - - // Ungrouped params - def ungrouped_params = new LinkedHashMap() - schema_properties.each { innerkey, value -> - ungrouped_params.put(innerkey, value) - } - params_map.put("Other parameters", ungrouped_params) - - return params_map - } - - // - // Get maximum number of characters across all parameter names - // - private static Integer paramsMaxChars(params_map) { - Integer max_chars = 0 - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (param.size() > max_chars) { - max_chars = param.size() - } - } - } - return max_chars - } -} diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 2f9a1e42..28ad471d 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -58,9 +58,7 @@ class NfcoreTemplate { // Set up the e-mail variables def subject = "[$workflow.manifest.name] Successful: $workflow.runName" - if (busco_failed_bins.size() > 0) { - subject = "[$workflow.manifest.name] Partially successful: For ${busco_failed_bins.size()} bin(s) the BUSCO analysis failed because no genes where found or placements failed: $workflow.runName" - } + if (!workflow.success) { subject = "[$workflow.manifest.name] FAILED: $workflow.runName" } @@ -132,7 +130,7 @@ class NfcoreTemplate { def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) @@ -231,32 +229,6 @@ class NfcoreTemplate { // public static void summary(workflow, params, log, busco_failed_bins = [:]) { Map colors = logColours(params.monochrome_logs) - - if (busco_failed_bins.size() > 0) { - def failed_bins_no_genes = '' - def failed_bins_placements_failed = '' - def count_no_genes = 0 - def count_placements_failed = 0 - for (bin in busco_failed_bins) { - if (bin.value == "No genes"){ - count_no_genes += 1 - failed_bins_no_genes += " ${bin.key}\n" - } - if (bin.value == "Placements failed"){ - count_placements_failed += 1 - failed_bins_placements_failed += " ${bin.key}\n" - } - } - if (params.busco_reference) - log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${busco_failed_bins.size()} bin(s) BUSCO did not find any matching genes:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-" - else { - if (count_no_genes > 0) - log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_no_genes} bin(s) the BUSCO analysis failed because no BUSCO genes could be found:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-" - if (count_placements_failed > 0) - log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_placements_failed} bin(s) the BUSCO analysis using automated lineage selection failed due to failed placements:\n${failed_bins_placements_failed}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information. Results for selected domain are still used.${colors.reset}-" - } - } - if (workflow.success) { if (workflow.stats.ignoredCount == 0) { log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" diff --git a/lib/WorkflowMag.groovy b/lib/WorkflowMag.groovy index de2d769d..51822e4e 100755 --- a/lib/WorkflowMag.groovy +++ b/lib/WorkflowMag.groovy @@ -2,6 +2,7 @@ // This file holds several functions specific to the workflow/mag.nf in the nf-core/mag pipeline // +import nextflow.Nextflow import groovy.text.SimpleTemplateEngine class WorkflowMag { @@ -9,25 +10,25 @@ class WorkflowMag { // // Check and validate parameters // + public static void initialise(params, log, hybrid) { // Check if binning mapping mode is valid if (!['all', 'group', 'own'].contains(params.binning_map_mode)) { - log.error "Invalid parameter '--binning_map_mode ${params.binning_map_mode}'. Valid values are 'all', 'group' or 'own'." - System.exit(1) + Nextflow.error("Invalid parameter '--binning_map_mode ${params.binning_map_mode}'. Valid values are 'all', 'group' or 'own'.") } if (params.coassemble_group && params.binning_map_mode == 'own') { - log.error "Invalid combination of parameter '--binning_map_mode own' and parameter '--coassemble_group'. Select either 'all' or 'group' mapping mode when performing group-wise co-assembly." 
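A recurring change in the WorkflowMag.groovy hunk here swaps the two-step log.error followed by System.exit(1) for a single raised error. A minimal Groovy sketch of the new pattern, reusing the co-assembly check above (message abbreviated):

import nextflow.Nextflow

// One call both reports the message and aborts the run with a non-zero exit status
if (params.coassemble_group && params.binning_map_mode == 'own') {
    Nextflow.error("Invalid combination of '--binning_map_mode own' and '--coassemble_group'.")
}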
- System.exit(1) + Nextflow.error("Invalid combination of parameter '--binning_map_mode own' and parameter '--coassemble_group'. Select either 'all' or 'group' mapping mode when performing group-wise co-assembly.") + } + if (params.ancient_dna && params.binning_map_mode != 'own') { + Nextflow.error("Invalid combination of parameter '--binning_map_mode' and parameter '--ancient_dna'. Ancient DNA mode can only be executed with --binning_map_mode own. You supplied: --binning_map_mode ${params.binning_map_mode}") } // Check if specified cpus for SPAdes are available if ( params.spades_fix_cpus > params.max_cpus ) { - log.error "Invalid parameter '--spades_fix_cpus ${params.spades_fix_cpus}', max cpus are '${params.max_cpus}'." - System.exit(1) + Nextflow.error("Invalid parameter '--spades_fix_cpus ${params.spades_fix_cpus}', max cpus are '${params.max_cpus}'.") } if ( params.spadeshybrid_fix_cpus > params.max_cpus ) { - log.error "Invalid parameter '--spadeshybrid_fix_cpus ${params.spadeshybrid_fix_cpus}', max cpus are '${params.max_cpus}'." - System.exit(1) + Nextflow.error("Invalid parameter '--spadeshybrid_fix_cpus ${params.spadeshybrid_fix_cpus}', max cpus are '${params.max_cpus}'.") } // Check if settings concerning reproducibility of used tools are consistent and print warning if not if (params.megahit_fix_cpu_1 || params.spades_fix_cpus != -1 || params.spadeshybrid_fix_cpus != -1) { @@ -52,8 +53,7 @@ class WorkflowMag { // Check if parameters for host contamination removal are valid if ( params.host_fasta && params.host_genome) { - log.error 'Both host fasta reference and iGenomes genome are specified to remove host contamination! Invalid combination, please specify either --host_fasta or --host_genome.' - System.exit(1) + Nextflow.error('Both host fasta reference and iGenomes genome are specified to remove host contamination! Invalid combination, please specify either --host_fasta or --host_genome.') } if ( hybrid && (params.host_fasta || params.host_genome) ) { log.warn 'Host read removal is only applied to short reads. Long reads might be filtered indirectly by Filtlong, which is set to use read qualities estimated based on k-mer matches to the short, already filtered reads.' @@ -63,25 +63,21 @@ class WorkflowMag { } if ( params.host_genome ) { if (!params.genomes) { - log.error 'No config file containing genomes provided!' - System.exit(1) + Nextflow.error('No config file containing genomes provided!') } // Check if host genome exists in the config file if (!params.genomes.containsKey(params.host_genome)) { - log.error '=============================================================================\n' + + Nextflow.error('=============================================================================\n' + " Host genome '${params.host_genome}' not found in any config files provided to the pipeline.\n" + ' Currently, the available genome keys are:\n' + " ${params.genomes.keySet().join(', ')}\n" + - '===================================================================================' - System.exit(1) + '===================================================================================') } if ( !params.genomes[params.host_genome].fasta ) { - log.error "No fasta file specified for the host genome ${params.host_genome}!" - System.exit(1) + Nextflow.error("No fasta file specified for the host genome ${params.host_genome}!") } if ( !params.genomes[params.host_genome].bowtie2 ) { - log.error "No Bowtie 2 index file specified for the host genome ${params.host_genome}!" 
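The host-genome guards directly above expect the key passed via --host_genome to resolve to both a FASTA file and a Bowtie 2 index. A minimal custom genomes config entry satisfying these checks could look like this sketch (genome key and paths are placeholders; in practice the bundled iGenomes config provides these values):

params {
    genomes {
        'GRCh38' {
            fasta   = '/path/to/host/GRCh38.fa'            // checked via params.genomes[params.host_genome].fasta
            bowtie2 = '/path/to/host/GRCh38/Bowtie2Index/'  // checked via params.genomes[params.host_genome].bowtie2
        }
    }
}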
- System.exit(1) + Nextflow.error("No Bowtie 2 index file specified for the host genome ${params.host_genome}!") } } @@ -93,56 +89,54 @@ class WorkflowMag { // Check more than one binner is run for bin refinement (required DAS by Tool) // If the number of run binners (i.e., number of not-skipped) is more than one, otherwise throw an error if ( params.refine_bins_dastool && !([ params.skip_metabat2, params.skip_maxbin2, params.skip_concoct ].count(false) > 1) ) { - log.error 'Bin refinement with --refine_bins_dastool requires at least two binners to be running (not skipped). Check input.' - System.exit(1) + Nextflow.error('Bin refinement with --refine_bins_dastool requires at least two binners to be running (not skipped). Check input.') } // Check that bin refinement is actually turned on if any of the refined bins are requested for downstream if (!params.refine_bins_dastool && params.postbinning_input != 'raw_bins_only') { - log.error 'The parameter '--postbinning_input ${ params.postbinning_input }' for downstream steps can only be specified if bin refinement is activated with --refine_bins_dastool! Check input.' - System.exit(1) + Nextflow.error("The parameter '--postbinning_input ${ params.postbinning_input }' for downstream steps can only be specified if bin refinement is activated with --refine_bins_dastool! Check input.") } // Check if BUSCO parameters combinations are valid if (params.skip_binqc && params.binqc_tool == 'checkm') { - log.error 'Both --skip_binqc and --binqc_tool \'checkm\' are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool.' - System.exit(1) + Nextflow.error('Both --skip_binqc and --binqc_tool \'checkm\' are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool.') } if (params.skip_binqc) { if (params.busco_reference) { - log.error 'Both --skip_binqc and --busco_reference are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_reference.' - System.exit(1) + Nextflow.error('Both --skip_binqc and --busco_reference are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_reference.') } if (params.busco_download_path) { - log.error 'Both --skip_binqc and --busco_download_path are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_download_path.' - System.exit(1) + Nextflow.error('Both --skip_binqc and --busco_download_path are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_download_path.') } if (params.busco_auto_lineage_prok) { - log.error 'Both --skip_binqc and --busco_auto_lineage_prok are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_auto_lineage_prok.' - System.exit(1) + Nextflow.error('Both --skip_binqc and --busco_auto_lineage_prok are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_auto_lineage_prok.') } } if (params.busco_reference && params.busco_download_path) { - log.error 'Both --busco_reference and --busco_download_path are specified! Invalid combination, please specify either --busco_reference or --busco_download_path.' - System.exit(1) + Nextflow.error('Both --busco_reference and --busco_download_path are specified! 
Invalid combination, please specify either --busco_reference or --busco_download_path.') } if (params.busco_auto_lineage_prok && params.busco_reference) { - log.error 'Both --busco_auto_lineage_prok and --busco_reference are specified! Invalid combination, please specify either --busco_auto_lineage_prok or --busco_reference.' - System.exit(1) + Nextflow.error('Both --busco_auto_lineage_prok and --busco_reference are specified! Invalid combination, please specify either --busco_auto_lineage_prok or --busco_reference.') } - if (params.skip_binqc && params.gtdb) { - log.warn '--skip_binqc and --gtdb are specified! GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.' + if (params.skip_binqc && !params.skip_gtdbtk) { + log.warn '--skip_binqc is specified, but --skip_gtdbtk is explicitly set to run! GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.' } // Check if CAT parameters are valid if (params.cat_db && params.cat_db_generate) { - log.error 'Invalid combination of parameters --cat_db and --cat_db_generate is specified! Please specify either --cat_db or --cat_db_generate.' - System.exit(1) + Nextflow.error('Invalid combination of parameters --cat_db and --cat_db_generate is specified! Please specify either --cat_db or --cat_db_generate.') } if (params.save_cat_db && !params.cat_db_generate) { - log.error 'Invalid parameter combination: parameter --save_cat_db specified, but not --cat_db_generate! Note also that the parameter --save_cat_db does not work in combination with --cat_db.' - System.exit(1) + Nextflow.error('Invalid parameter combination: parameter --save_cat_db specified, but not --cat_db_generate! Note also that the parameter --save_cat_db does not work in combination with --cat_db.') + } + + // Check MetaEuk db parameters + if (params.metaeuk_mmseqs_db && params.metaeuk_db) { + Nextflow.error('Invalid parameter combination: both --metaeuk_mmseqs_db and --metaeuk_db are specified! Please specify either --metaeuk_mmseqs_db or --metaeuk_db.') + } + if (params.save_mmseqs_db && !params.metaeuk_mmseqs_db) { + Nextflow.error('Invalid parameter combination: --save_mmseqs_db supplied but no database has been requested for download with --metaeuk_mmseqs_db!') } } @@ -173,14 +167,56 @@ class WorkflowMag { return yaml_file_text } - public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + // + // Generate methods description for MultiQC + // + + public static String toolCitationText(params) { + + // TODO Optionally add in-text citation tools to this list. + // Can use ternary operators to dynamically construct based on conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() + + return citation_text + } + + public static String toolBibliographyText(params) { + + // TODO Optionally add bibliographic entries to this list. + // Can use ternary operators to dynamically construct based on conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def reference_text = [ + "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/.</li>", + "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19), 3047–3048. doi: 10.1093/bioinformatics/btw354</li>" ].join(' ').trim() + + return reference_text + } + + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) { // Convert to a named map so it can be used with the familiar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = run_workflow.toMap() meta['manifest_map'] = run_workflow.manifest.toMap() - meta['doi_text'] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : '' - meta['nodoi_text'] = meta.manifest_map.doi ? '' : '<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.</li>' + // Pipeline DOI + meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" + meta["nodoi_text"] = meta.manifest_map.doi ? "" : "<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.</li>
  • " + + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + // TODO Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + //meta["tool_bibliography"] = toolBibliographyText(params) + def methods_text = mqc_methods_yaml.text @@ -188,17 +224,19 @@ class WorkflowMag { def description_html = engine.createTemplate(methods_text).make(meta) return description_html - }// + } + + // // Exit pipeline if incorrect --genome key provided // private static void genomeExistsError(params, log) { if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - log.error '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n' + + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + - ' Currently, the available genome keys are:\n' + - " ${params.genomes.keySet().join(', ')}\n" + - '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' - System.exit(1) + " Currently, the available genome keys are:\n" + + " ${params.genomes.keySet().join(", ")}\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.error(error_string) } } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index afd8c0f3..f90a0676 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the main.nf workflow in the nf-core/mag pipeline // +import nextflow.Nextflow + class WorkflowMain { // @@ -19,40 +21,10 @@ class WorkflowMain { " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // - // Generate help string - // - public static String help(workflow, params, log) { - def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv -profile docker" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Generate parameter summary log string - // - public static String paramsSummaryLog(workflow, params, log) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } - // // Validate parameters and print summary to screen // public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params, log) - System.exit(0) - } // Print workflow version and exit on --version if (params.version) { @@ -61,14 +33,6 @@ class WorkflowMain { System.exit(0) } - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params, log) - - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) - } - // Check that a -profile or Nextflow config has been provided to run the pipeline 
NfcoreTemplate.checkConfigProvided(workflow, log) @@ -82,8 +46,7 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'" - System.exit(1) + Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } } // diff --git a/main.nf b/main.nf index e28c4f87..0888d885 100644 --- a/main.nf +++ b/main.nf @@ -4,7 +4,6 @@ nf-core/mag ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Github : https://github.com/nf-core/mag - Website: https://nf-co.re/mag Slack : https://nfcore.slack.com/channels/mag ---------------------------------------------------------------------------------------- @@ -18,6 +17,22 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + WorkflowMain.initialise(workflow, params, log) /* diff --git a/modules.json b/modules.json index ede87e6b..73a43c4d 100644 --- a/modules.json +++ b/modules.json @@ -7,158 +7,203 @@ "nf-core": { "adapterremoval": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "aria2": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"], "patch": "modules/nf-core/aria2/aria2.diff" }, + "bbmap/bbnorm": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] + }, "bcftools/consensus": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "bcftools/index": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "bcftools/view": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", "installed_by": ["modules"] }, "checkm/lineagewf": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "checkm/qa": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "concoct/concoct": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "c684a6edba2c8516a0d2cfeafb12489b99d2fffb", "installed_by": ["fasta_binning_concoct"] }, "concoct/concoctcoveragetable": { "branch": "master", - 
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "36aa9d3ce6561d9bd5c240bcf82fe109af543c0d", "installed_by": ["fasta_binning_concoct"] }, "concoct/cutupfasta": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "36aa9d3ce6561d9bd5c240bcf82fe109af543c0d", "installed_by": ["fasta_binning_concoct"] }, "concoct/extractfastabins": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "36aa9d3ce6561d9bd5c240bcf82fe109af543c0d", "installed_by": ["fasta_binning_concoct"] }, "concoct/mergecutupclustering": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "36aa9d3ce6561d9bd5c240bcf82fe109af543c0d", "installed_by": ["fasta_binning_concoct"] }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", "installed_by": ["modules"] }, "dastool/dastool": { "branch": "master", - "git_sha": "8ffb8ec8e49aafe43240b652147d70d56f150b3c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "dastool/fastatocontig2bin": { "branch": "master", - "git_sha": "8ffb8ec8e49aafe43240b652147d70d56f150b3c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "fastp": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "d497a4868ace3302016ea8ed4b395072d5e833cd", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "9a4517e720bc812e95b56d23d15a1653b6db4f53", "installed_by": ["modules"] }, "freebayes": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "genomad/download": { + "branch": "master", + "git_sha": "ca813f3f73adedf3547a5a677e992d9d43a71870", + "installed_by": ["modules"] + }, + "genomad/endtoend": { + "branch": "master", + "git_sha": "ca813f3f73adedf3547a5a677e992d9d43a71870", + "installed_by": ["modules"] + }, + "gtdbtk/classifywf": { + "branch": "master", + "git_sha": "c67eaf89682a12966f60008a8fa30f5dd29239df", "installed_by": ["modules"] }, "gunc/downloaddb": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "gunc/mergecheckm": { "branch": "master", - "git_sha": "93f8308f6c1ef35b6b8cd264cefd22853fc51526", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "gunc/run": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "gunzip": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57", "installed_by": ["modules"] }, "maxbin2": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "metabat2/jgisummarizebamcontigdepths": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "metabat2/metabat2": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": 
"d2e220fdec3aa2f4482c70017df4cdf8a4c94f27", + "installed_by": ["modules"] + }, + "metaeuk/easypredict": { + "branch": "master", + "git_sha": "30d06da5bd7ae67be32758bf512cd75a4325d386", + "installed_by": ["modules"] + }, + "mmseqs/databases": { + "branch": "master", + "git_sha": "699e078133f580548aeb43114f93ac29928c6143", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80", "installed_by": ["modules"] }, "prodigal": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["modules"] }, "prokka": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "pydamage/analyze": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "pydamage/filter": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", + "installed_by": ["modules"] + }, + "seqtk/mergepe": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "tiara/tiara": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] } } @@ -167,7 +212,7 @@ "nf-core": { "fasta_binning_concoct": { "branch": "master", - "git_sha": "038be4287743ec3afd8fdf7eb06039d7407bb631", + "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", "installed_by": ["subworkflows"] } } diff --git a/modules/local/adjust_maxbin2_ext.nf b/modules/local/adjust_maxbin2_ext.nf index 99a7555c..4d7fecd0 100644 --- a/modules/local/adjust_maxbin2_ext.nf +++ b/modules/local/adjust_maxbin2_ext.nf @@ -6,7 +6,7 @@ process ADJUST_MAXBIN2_EXT { conda "bioconda::multiqc=1.12" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" + 'biocontainers/multiqc:1.12--pyhdfd78af_0' }" input: tuple val(meta), path(bins) diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf index 8d5e5c25..4503502f 100644 --- a/modules/local/bin_summary.nf +++ b/modules/local/bin_summary.nf @@ -3,7 +3,7 @@ process BIN_SUMMARY { conda "conda-forge::pandas=1.4.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : - 'quay.io/biocontainers/pandas:1.4.3' }" + 'biocontainers/pandas:1.4.3' }" input: path(bin_depths) diff --git a/modules/local/bowtie2_assembly_align.nf b/modules/local/bowtie2_assembly_align.nf index 17f71ad2..951dfb8d 100644 --- a/modules/local/bowtie2_assembly_align.nf +++ b/modules/local/bowtie2_assembly_align.nf @@ -4,7 +4,7 @@ process BOWTIE2_ASSEMBLY_ALIGN { conda "bioconda::bowtie2=2.4.2 bioconda::samtools=1.11 conda-forge::pigz=2.3.4" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0' : - 'quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0' }" + 'biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0' }" input: tuple val(assembly_meta), path(assembly), path(index), val(reads_meta), path(reads) diff --git a/modules/local/bowtie2_assembly_build.nf b/modules/local/bowtie2_assembly_build.nf index b3e3332c..1f305f70 100644 --- a/modules/local/bowtie2_assembly_build.nf +++ b/modules/local/bowtie2_assembly_build.nf @@ -4,7 +4,7 @@ process BOWTIE2_ASSEMBLY_BUILD { conda "bioconda::bowtie2=2.4.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.2--py38h1c8e9b9_1' : - 'quay.io/biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" + 'biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" input: tuple val(meta), path(assembly) diff --git a/modules/local/bowtie2_removal_align.nf b/modules/local/bowtie2_removal_align.nf index 924dc343..03cb9b25 100644 --- a/modules/local/bowtie2_removal_align.nf +++ b/modules/local/bowtie2_removal_align.nf @@ -7,7 +7,7 @@ process BOWTIE2_REMOVAL_ALIGN { conda "bioconda::bowtie2=2.4.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.2--py38h1c8e9b9_1' : - 'quay.io/biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" + 'biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" input: tuple val(meta), path(reads) diff --git a/modules/local/bowtie2_removal_build.nf b/modules/local/bowtie2_removal_build.nf index 70b2c4e7..ba152611 100644 --- a/modules/local/bowtie2_removal_build.nf +++ b/modules/local/bowtie2_removal_build.nf @@ -4,7 +4,7 @@ process BOWTIE2_REMOVAL_BUILD { conda "bioconda::bowtie2=2.4.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.2--py38h1c8e9b9_1' : - 'quay.io/biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" + 'biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" input: path fasta diff --git a/modules/local/busco.nf b/modules/local/busco.nf index b6f89fd1..58e79efc 100644 --- a/modules/local/busco.nf +++ b/modules/local/busco.nf @@ -4,7 +4,7 @@ process BUSCO { conda "bioconda::busco=5.4.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/busco:5.4.3--pyhdfd78af_0': - 'quay.io/biocontainers/busco:5.4.3--pyhdfd78af_0' }" + 'biocontainers/busco:5.4.3--pyhdfd78af_0' }" input: tuple val(meta), path(bin) @@ -38,151 +38,8 @@ process BUSCO { p += " --offline --download_path ${download_folder}" } """ - # ensure augustus has write access to config directory - if [ ${cp_augustus_config} = "Y" ] ; then - cp -r /usr/local/config/ augustus_config/ - export AUGUSTUS_CONFIG_PATH=augustus_config - fi - - # place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it) - if [ ${lineage_dataset_provided} = "Y" ] ; then - mkdir dataset - mv ${db} dataset/ - fi - - # set nullgob: if pattern matches no files, expand to a null string rather than to itself - shopt -s nullglob - - # only used for saving busco downloads - most_spec_db="NA" - - if busco ${p} \ - --mode genome \ - --in ${bin} \ - --cpu "${task.cpus}" \ - --out "BUSCO" > ${bin}_busco.log 2> ${bin}_busco.err; then - - # get name of used specific lineage dataset - summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) - if [ \${#summaries[@]} -ne 1 ]; then - echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." - exit 1 - fi - [[ \$summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]]; - db_name_spec="\${BASH_REMATCH[1]}" - most_spec_db=\${db_name_spec} - echo "Used specific lineage dataset: \${db_name_spec}" - - if [ ${lineage_dataset_provided} = "Y" ]; then - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt - - # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting - # report bin as failed to allow consistent warnings within the pipeline for both settings - if egrep -q \$'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log ; then - echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log." - echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" - fi - else - # auto lineage selection - if { egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log \ - && egrep -q \$'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log ; } || \ - { egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log \ - && egrep -q \$'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log \ - && egrep -q \$'INFO:\tUsing local lineages directory ' ${bin}_busco.log ; }; then - # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately - # and cause different log messages - echo "Domain and specific lineage could be selected by BUSCO." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt - - db_name_gen="" - summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt) - if [ \${#summaries_gen[@]} -lt 1 ]; then - echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same." 
- cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt - db_name_gen=\${db_name_spec} - else - [[ \$summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]]; - db_name_gen="\${BASH_REMATCH[1]}" - echo "Used generic lineage dataset: \${db_name_gen}" - cp BUSCO/short_summary.generic.\${db_name_gen}.BUSCO.txt short_summary.domain.\${db_name_gen}.${bin}.txt - fi - - for f in BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_gen}.faa.gz - break - done - for f in BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_gen}.fna.gz - break - done - - elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log ; then - echo "Domain could be selected by BUSCO, but no more specific lineage." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt - - elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log ; then - echo "Domain could be selected by BUSCO, but no more specific lineage." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt - - elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tRunning virus detection pipeline' ${bin}_busco.log ; then - # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*? - echo "Domain could not be selected by BUSCO, but virus dataset was selected." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt - else - echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." >&2 - exit 1 - fi - fi - - for f in BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_spec}.faa.gz - break - done - for f in BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_spec}.fna.gz - break - done - - elif egrep -q \$'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err ; then - echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err." - echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" - - elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'ERROR:\tPlacements failed' ${bin}_busco.err ; then - echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset." 
- echo -e "${bin}\tPlacements failed" > "${bin}_busco.failed_bin.txt" - - message=\$(egrep \$'INFO:\t\\S+ selected' ${bin}_busco.log) - [[ \$message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]]; - db_name_gen="\${BASH_REMATCH[1]}" - most_spec_db=\${db_name_gen} - echo "Used generic lineage dataset: \${db_name_gen}" - cp BUSCO/auto_lineage/run_\${db_name_gen}/short_summary.txt short_summary.domain.\${db_name_gen}.${bin}.txt - - for f in BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_gen}.faa.gz - break - done - for f in BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_gen}.fna.gz - break - done - - else - echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2 - exit 1 - fi - - # additionally output genes predicted with Prodigal (GFF3) - if [ -f BUSCO/logs/prodigal_out.log ]; then - mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff" - fi - - # if needed delete temporary BUSCO files - if [ ${busco_clean} ]; then - find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \\; - find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \\; - find . -depth -type d -name "run_*" -execdir rm -rf "{}" + - fi + run_busco.sh "${p}" "${cp_augustus_config}" "${db}" "${bin}" ${task.cpus} "${lineage_dataset_provided}" "${busco_clean}" + most_spec_db=\$( versions.yml "${task.process}": diff --git a/modules/local/busco_db_preparation.nf b/modules/local/busco_db_preparation.nf index ad77423d..cddc130d 100644 --- a/modules/local/busco_db_preparation.nf +++ b/modules/local/busco_db_preparation.nf @@ -4,7 +4,7 @@ process BUSCO_DB_PREPARATION { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path database diff --git a/modules/local/busco_save_download.nf b/modules/local/busco_save_download.nf index 5d3dab2e..74bcffbf 100644 --- a/modules/local/busco_save_download.nf +++ b/modules/local/busco_save_download.nf @@ -5,7 +5,7 @@ process BUSCO_SAVE_DOWNLOAD { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path(busco_downloads) diff --git a/modules/local/busco_summary.nf b/modules/local/busco_summary.nf index c4023276..799196d7 100644 --- a/modules/local/busco_summary.nf +++ b/modules/local/busco_summary.nf @@ -3,7 +3,7 @@ process BUSCO_SUMMARY { conda "conda-forge::pandas=1.4.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : - 'quay.io/biocontainers/pandas:1.4.3' }" + 'biocontainers/pandas:1.4.3' }" input: path(summaries_domain) diff --git a/modules/local/cat.nf b/modules/local/cat.nf index 8bf77cb0..48af75c0 100644 --- a/modules/local/cat.nf +++ b/modules/local/cat.nf @@ -4,7 +4,7 @@ process CAT { conda "bioconda::cat=4.6 bioconda::diamond=2.0.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' : - 'quay.io/biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }" + 'biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }" input: tuple val(meta), path("bins/*") diff --git a/modules/local/cat_db.nf b/modules/local/cat_db.nf index ea3b55c8..dac96bb0 100644 --- a/modules/local/cat_db.nf +++ b/modules/local/cat_db.nf @@ -4,7 +4,7 @@ process CAT_DB { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path(database) diff --git a/modules/local/cat_db_generate.nf b/modules/local/cat_db_generate.nf index 7f389aad..eaf6c1b4 100644 --- a/modules/local/cat_db_generate.nf +++ b/modules/local/cat_db_generate.nf @@ -3,7 +3,7 @@ process CAT_DB_GENERATE { conda "bioconda::cat=4.6 bioconda::diamond=2.0.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' : - 'quay.io/biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }" + 'biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }" output: tuple env(DB_NAME), path("database/*"), path("taxonomy/*"), emit: db diff --git a/modules/local/cat_summary.nf b/modules/local/cat_summary.nf index f0c6174d..8bd2d815 100644 --- a/modules/local/cat_summary.nf +++ b/modules/local/cat_summary.nf @@ -4,7 +4,7 @@ process CAT_SUMMARY { conda "bioconda::bioawk=1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : - 'quay.io/biocontainers/bioawk:1.0--hed695b0_5' }" + 'biocontainers/bioawk:1.0--hed695b0_5' }" input: path(cat_summaries) diff --git a/modules/local/centrifuge.nf b/modules/local/centrifuge.nf index 78501e0f..c6618417 100644 --- a/modules/local/centrifuge.nf +++ b/modules/local/centrifuge.nf @@ -4,7 +4,7 @@ process CENTRIFUGE { conda "bioconda::centrifuge=1.0.4_beta" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--he513fc3_5' : - 'quay.io/biocontainers/centrifuge:1.0.4_beta--he513fc3_5' }" + 'biocontainers/centrifuge:1.0.4_beta--he513fc3_5' }" input: tuple val(meta), path(reads) @@ -13,18 +13,20 @@ process CENTRIFUGE { output: tuple val("centrifuge"), val(meta), path("results.krona"), emit: results_for_krona path "report.txt" , emit: report - path "kreport.txt" , emit: kreport + tuple val(meta), path("*kreport.txt") , emit: kreport path "versions.yml" , emit: versions script: def input = meta.single_end ? "-U \"${reads}\"" : "-1 \"${reads[0]}\" -2 \"${reads[1]}\"" + prefix = task.ext.prefix ?: "${meta.id}" + """ centrifuge -x "${db_name}" \ -p ${task.cpus} \ --report-file report.txt \ -S results.txt \ $input - centrifuge-kreport -x "${db_name}" results.txt > kreport.txt + centrifuge-kreport -x "${db_name}" results.txt > ${prefix}.centrifuge_kreport.txt cat results.txt | cut -f 1,3 > results.krona cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/centrifuge_db_preparation.nf b/modules/local/centrifuge_db_preparation.nf index b6cb8ea5..5bd76c69 100644 --- a/modules/local/centrifuge_db_preparation.nf +++ b/modules/local/centrifuge_db_preparation.nf @@ -3,7 +3,7 @@ process CENTRIFUGE_DB_PREPARATION { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path db diff --git a/modules/local/combine_tsv.nf b/modules/local/combine_tsv.nf index 782915d9..5e62be27 100644 --- a/modules/local/combine_tsv.nf +++ b/modules/local/combine_tsv.nf @@ -4,7 +4,7 @@ process COMBINE_TSV { conda "bioconda::bioawk=1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : - 'quay.io/biocontainers/bioawk:1.0--hed695b0_5' }" + 'biocontainers/bioawk:1.0--hed695b0_5' }" input: path(bin_summaries) diff --git a/modules/local/convert_depths.nf b/modules/local/convert_depths.nf index 240da1b6..f61e0c29 100644 --- a/modules/local/convert_depths.nf +++ b/modules/local/convert_depths.nf @@ -4,7 +4,7 @@ process CONVERT_DEPTHS { conda "bioconda::bioawk=1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : - 'quay.io/biocontainers/bioawk:1.0--hed695b0_5' }" + 'biocontainers/bioawk:1.0--hed695b0_5' }" input: tuple val(meta), path(fasta), path(depth) diff --git a/modules/local/filtlong.nf b/modules/local/filtlong.nf index 6024b561..5410c1cb 100644 --- a/modules/local/filtlong.nf +++ b/modules/local/filtlong.nf @@ -4,7 +4,7 @@ process FILTLONG { conda "bioconda::filtlong=0.2.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/filtlong:0.2.0--he513fc3_3' : - 'quay.io/biocontainers/filtlong:0.2.0--he513fc3_3' }" + 'biocontainers/filtlong:0.2.0--he513fc3_3' }" input: tuple val(meta), path(long_reads), path(short_reads_1), path(short_reads_2) diff --git a/modules/local/gtdbtk_classify.nf b/modules/local/gtdbtk_classify.nf deleted file mode 100644 index c8cdb1bf..00000000 --- a/modules/local/gtdbtk_classify.nf +++ /dev/null @@ -1,53 +0,0 @@ -process GTDBTK_CLASSIFY { - tag "${meta.assembler}-${meta.binner}-${meta.id}" - - conda "bioconda::gtdbtk=1.5.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gtdbtk:1.5.0--pyhdfd78af_0' : - 'quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0' }" - - input: - tuple val(meta), path("bins/*") - tuple val(db_name), path("database/*") - - output: - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.*.summary.tsv" , emit: summary - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.*.classify.tree.gz" , emit: tree - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.*.markers_summary.tsv", emit: markers - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.*.msa.fasta.gz" , emit: msa - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.*.user_msa.fasta" , emit: user_msa - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.*.filtered.tsv" , emit: filtered - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.log" , emit: log - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.warnings.log" , emit: warnings - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.failed_genomes.tsv" , emit: failed - path "versions.yml" , emit: versions - - script: - def args = task.ext.args ?: '' - def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : "" - """ - export GTDBTK_DATA_PATH="\${PWD}/database" - if [ ${pplacer_scratch} != "" ] ; then - mkdir pplacer_tmp - fi - - gtdbtk classify_wf $args \ - --genome_dir bins \ - --prefix "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}" \ - --out_dir "\${PWD}" \ - --cpus ${task.cpus} \ - --pplacer_cpus ${params.gtdbtk_pplacer_cpus} \ - ${pplacer_scratch} \ - --min_perc_aa ${params.gtdbtk_min_perc_aa} \ - --min_af ${params.gtdbtk_min_af} - - gzip "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}".*.classify.tree "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}".*.msa.fasta - mv gtdbtk.log "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.log" - mv gtdbtk.warnings.log "gtdbtk.${meta.assembler}-${meta.binner}-${meta.id}.warnings.log" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gtdbtk: \$(gtdbtk --version | sed -n 1p | sed "s/gtdbtk: version //; s/ Copyright.*//") - END_VERSIONS - """ -} diff --git a/modules/local/gtdbtk_db_preparation.nf b/modules/local/gtdbtk_db_preparation.nf index 4382e96d..3be79c96 100644 --- a/modules/local/gtdbtk_db_preparation.nf +++ b/modules/local/gtdbtk_db_preparation.nf @@ -4,13 +4,13 @@ process GTDBTK_DB_PREPARATION { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path(database) output: - tuple val("${database.toString().replace(".tar.gz", "")}"), path("database/*") + tuple val("${database.toString().replace(".tar.gz", "")}"), path("database/*"), emit: db script: """ diff --git a/modules/local/gtdbtk_summary.nf b/modules/local/gtdbtk_summary.nf index cb79bbec..52c0a40d 100644 --- a/modules/local/gtdbtk_summary.nf +++ b/modules/local/gtdbtk_summary.nf @@ -4,7 +4,7 @@ process GTDBTK_SUMMARY { conda "conda-forge::pandas=1.4.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : - 'quay.io/biocontainers/pandas:1.4.3' }" + 'biocontainers/pandas:1.4.3' }" input: path(qc_discarded_bins) diff --git a/modules/local/kraken2.nf b/modules/local/kraken2.nf index 8d0b593b..b67118a9 100644 --- a/modules/local/kraken2.nf +++ b/modules/local/kraken2.nf @@ -4,7 +4,7 @@ process KRAKEN2 { conda "bioconda::kraken2=2.0.8_beta" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/kraken2:2.0.8_beta--pl526hc9558a2_2' : - 'quay.io/biocontainers/kraken2:2.0.8_beta--pl526hc9558a2_2' }" + 'biocontainers/kraken2:2.0.8_beta--pl526hc9558a2_2' }" input: tuple val(meta), path(reads) @@ -12,17 +12,19 @@ process KRAKEN2 { output: tuple val("kraken2"), val(meta), path("results.krona"), emit: results_for_krona - path "kraken2_report.txt" , emit: report + tuple val(meta), path("*kraken2_report.txt") , emit: report path "versions.yml" , emit: versions script: def input = meta.single_end ? "\"${reads}\"" : "--paired \"${reads[0]}\" \"${reads[1]}\"" + prefix = task.ext.prefix ?: "${meta.id}" + """ kraken2 \ --report-zero-counts \ --threads ${task.cpus} \ --db database \ - --report kraken2_report.txt \ + --report ${prefix}.kraken2_report.txt \ $input \ > kraken2.kraken cat kraken2.kraken | cut -f 2,3 > results.krona diff --git a/modules/local/kraken2_db_preparation.nf b/modules/local/kraken2_db_preparation.nf index 6db472ab..5ae68b7f 100644 --- a/modules/local/kraken2_db_preparation.nf +++ b/modules/local/kraken2_db_preparation.nf @@ -3,7 +3,7 @@ process KRAKEN2_DB_PREPARATION { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path db diff --git a/modules/local/krona.nf b/modules/local/krona.nf index 0f11d60f..827cbc4a 100644 --- a/modules/local/krona.nf +++ b/modules/local/krona.nf @@ -4,19 +4,21 @@ process KRONA { conda "bioconda::krona=2.7.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/krona:2.7.1--pl526_5' : - 'quay.io/biocontainers/krona:2.7.1--pl526_5' }" + 'biocontainers/krona:2.7.1--pl526_5' }" input: tuple val(meta), path(report) - path "taxonomy/taxonomy.tab" + path(taxonomy_file), stageAs: 'taxonomy.tab' output: - path "*.html" , emit: html - path "versions.yml" , emit: versions + tuple val(meta), path("*.html") , emit: html + path "versions.yml" , emit: versions script: """ - ktImportTaxonomy "$report" -tax taxonomy + TAXONOMY=\$(find -L . 
-name '*.tab' -exec dirname {} \\;) + + ktImportTaxonomy ${report} -tax \$TAXONOMY/ cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/krona_db.nf b/modules/local/krona_db.nf index 1f0539b1..0b1f4125 100644 --- a/modules/local/krona_db.nf +++ b/modules/local/krona_db.nf @@ -3,7 +3,7 @@ process KRONA_DB { conda "bioconda::krona=2.7.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/krona:2.7.1--pl526_5' : - 'quay.io/biocontainers/krona:2.7.1--pl526_5' }" + 'biocontainers/krona:2.7.1--pl526_5' }" output: path("taxonomy/taxonomy.tab"), emit: db diff --git a/modules/local/mag_depths.nf b/modules/local/mag_depths.nf index f55941f5..2ee63523 100644 --- a/modules/local/mag_depths.nf +++ b/modules/local/mag_depths.nf @@ -5,7 +5,7 @@ process MAG_DEPTHS { conda "bioconda::metabat2=2.15 conda-forge::python=3.6.7 conda-forge::biopython=1.74 conda-forge::pandas=1.1.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' : - 'quay.io/biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }" + 'biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }" input: tuple val(meta), path(bins), path(contig_depths) diff --git a/modules/local/mag_depths_plot.nf b/modules/local/mag_depths_plot.nf index e4a02177..150d96fa 100644 --- a/modules/local/mag_depths_plot.nf +++ b/modules/local/mag_depths_plot.nf @@ -4,21 +4,21 @@ process MAG_DEPTHS_PLOT { conda "conda-forge::python=3.9 conda-forge::pandas=1.3.0 anaconda::seaborn=0.11.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-d14219255233ee6cacc427e28a7caf8ee42e8c91:0a22c7568e4a509925048454dad9ab37fa8fe776-0' : - 'quay.io/biocontainers/mulled-v2-d14219255233ee6cacc427e28a7caf8ee42e8c91:0a22c7568e4a509925048454dad9ab37fa8fe776-0' }" + 'biocontainers/mulled-v2-d14219255233ee6cacc427e28a7caf8ee42e8c91:0a22c7568e4a509925048454dad9ab37fa8fe776-0' }" input: tuple val(meta), path(depths) path(sample_groups) output: - tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}-binDepths.heatmap.png"), emit: heatmap + tuple val(meta), path("${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-binDepths.heatmap.png"), emit: heatmap path "versions.yml" , emit: versions script: """ plot_mag_depths.py --bin_depths ${depths} \ --groups ${sample_groups} \ - --out "${meta.assembler}-${meta.binner}-${meta.id}-binDepths.heatmap.png" + --out "${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-binDepths.heatmap.png" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/mag_depths_summary.nf b/modules/local/mag_depths_summary.nf index 62c476b8..1be7becc 100644 --- a/modules/local/mag_depths_summary.nf +++ b/modules/local/mag_depths_summary.nf @@ -3,7 +3,7 @@ process MAG_DEPTHS_SUMMARY { conda "conda-forge::pandas=1.4.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : - 'quay.io/biocontainers/pandas:1.4.3' }" + 'biocontainers/pandas:1.4.3' }" input: path(mag_depths) diff --git a/modules/local/megahit.nf b/modules/local/megahit.nf index 9cea679d..5be5d01a 100644 --- a/modules/local/megahit.nf +++ b/modules/local/megahit.nf @@ -4,7 +4,7 @@ process MEGAHIT { conda "bioconda::megahit=1.2.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/megahit:1.2.9--h2e03b76_1' : - 'quay.io/biocontainers/megahit:1.2.9--h2e03b76_1' }" + 'biocontainers/megahit:1.2.9--h2e03b76_1' }" input: tuple val(meta), path(reads1), path(reads2) @@ -17,7 +17,7 @@ process MEGAHIT { script: def args = task.ext.args ?: '' - def input = params.single_end ? "-r \"" + reads1.join(",") + "\"" : "-1 \"" + reads1.join(",") + "\" -2 \"" + reads2.join(",") + "\"" + def input = meta.single_end ? "-r \"" + reads1.join(",") + "\"" : "-1 \"" + reads1.join(",") + "\" -2 \"" + reads2.join(",") + "\"" mem = task.memory.toBytes() if ( !params.megahit_fix_cpu_1 || task.cpus == 1 ) """ diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf deleted file mode 100644 index f1c3fa0d..00000000 --- a/modules/local/multiqc.nf +++ /dev/null @@ -1,53 +0,0 @@ -process MULTIQC { - label 'process_medium' - - conda "bioconda::multiqc=1.12" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" - - input: - path multiqc_files - path mqc_custom_config - path 'fastqc_raw/*' - path 'fastqc_trimmed/*' - path host_removal - path 'quast*/*' - path 'bowtie2log/*' - path short_summary - path additional - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - script: - def args = task.ext.args ?: '' - custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' - read_type = params.single_end ? "--single_end" : '' - if ( params.host_fasta || params.host_genome ) { - """ - # get multiqc parsed data for bowtie2 - multiqc -f $custom_config_file *.bowtie2.log - multiqc_to_custom_tsv.py ${read_type} - # run multiqc using custom content file instead of original bowtie2 log files - multiqc -f $custom_config_file --ignore "*.bowtie2.log" . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - } else { - """ - multiqc -f $args . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - } -} diff --git a/modules/local/nanolyse.nf b/modules/local/nanolyse.nf index de4c1e69..4cd46d4f 100644 --- a/modules/local/nanolyse.nf +++ b/modules/local/nanolyse.nf @@ -4,7 +4,7 @@ process NANOLYSE { conda "bioconda::nanolyse=1.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/nanolyse:1.1.0--py36_1' : - 'quay.io/biocontainers/nanolyse:1.1.0--py36_1' }" + 'biocontainers/nanolyse:1.1.0--py36_1' }" input: tuple val(meta), path(reads) diff --git a/modules/local/nanoplot.nf b/modules/local/nanoplot.nf index 16e95868..d3c347aa 100644 --- a/modules/local/nanoplot.nf +++ b/modules/local/nanoplot.nf @@ -4,7 +4,7 @@ process NANOPLOT { conda "bioconda::nanoplot=1.26.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/nanoplot:1.26.3--py_0' : - 'quay.io/biocontainers/nanoplot:1.26.3--py_0' }" + 'biocontainers/nanoplot:1.26.3--py_0' }" input: tuple val(meta), path(reads) diff --git a/modules/local/pool_paired_reads.nf b/modules/local/pool_paired_reads.nf index aaa46c4b..9e73028e 100644 --- a/modules/local/pool_paired_reads.nf +++ b/modules/local/pool_paired_reads.nf @@ -4,7 +4,7 @@ process POOL_PAIRED_READS { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: tuple val(meta), path(reads1), path(reads2) diff --git a/modules/local/pool_single_reads.nf b/modules/local/pool_single_reads.nf index 6764d798..3ab6cc7c 100644 --- a/modules/local/pool_single_reads.nf +++ b/modules/local/pool_single_reads.nf @@ -4,7 +4,7 @@ process POOL_SINGLE_READS { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: tuple val(meta), path(reads) diff --git a/modules/local/porechop.nf b/modules/local/porechop.nf index d397284d..91576887 100644 --- a/modules/local/porechop.nf +++ b/modules/local/porechop.nf @@ -4,7 +4,7 @@ process PORECHOP { conda "bioconda::porechop=0.2.3_seqan2.1.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/porechop:0.2.3_seqan2.1.1--py36h2d50403_3' : - 'quay.io/biocontainers/porechop:0.2.3_seqan2.1.1--py36h2d50403_3' }" + 'biocontainers/porechop:0.2.3_seqan2.1.1--py36h2d50403_3' }" input: tuple val(meta), path(reads) diff --git a/modules/local/quast.nf b/modules/local/quast.nf index 374de1d9..4b68f412 100644 --- a/modules/local/quast.nf +++ b/modules/local/quast.nf @@ -4,18 +4,20 @@ process QUAST { conda "bioconda::quast=5.0.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/quast:5.0.2--py37pl526hb5aa323_2' : - 'quay.io/biocontainers/quast:5.0.2--py37pl526hb5aa323_2' }" + 'biocontainers/quast:5.0.2--py37pl526hb5aa323_2' }" input: tuple val(meta), path(assembly) output: - path "QUAST/*" , emit: qc - path "versions.yml", emit: versions + path "QUAST/*" , emit: qc + path "QUAST/report_rawassemblies.tsv", emit: report + path "versions.yml" , emit: versions script: """ metaquast.py --threads "${task.cpus}" --rna-finding --max-ref-number 0 -l "${meta.assembler}-${meta.id}" "${assembly}" -o "QUAST" + cp QUAST/report.tsv QUAST/report_rawassemblies.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/quast_bins.nf b/modules/local/quast_bins.nf index 8cbb0cc5..b8015ad5 100644 --- a/modules/local/quast_bins.nf +++ b/modules/local/quast_bins.nf @@ -4,13 +4,13 @@ process QUAST_BINS { conda "bioconda::quast=5.0.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/quast:5.0.2--py37pl526hb5aa323_2' : - 'quay.io/biocontainers/quast:5.0.2--py37pl526hb5aa323_2' }" + 'biocontainers/quast:5.0.2--py37pl526hb5aa323_2' }" input: tuple val(meta), path(bins) output: - path "QUAST/*", type: 'dir' + path "QUAST/*", type: 'dir' , emit: dir path "QUAST/*-quast_summary.tsv", emit: quast_bin_summaries path "versions.yml" , emit: versions @@ -20,10 +20,10 @@ process QUAST_BINS { IFS=', ' read -r -a bins <<< \"\$BINS\" for bin in \"\${bins[@]}\"; do metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}" - if ! [ -f "QUAST/${meta.assembler}-${meta.binner}-${meta.id}-quast_summary.tsv" ]; then - cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${meta.assembler}-${meta.binner}-${meta.id}-quast_summary.tsv" + if ! [ -f "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" ]; then + cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" else - tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${meta.assembler}-${meta.binner}-${meta.id}-quast_summary.tsv" + tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" fi done diff --git a/modules/local/quast_bins_summary.nf b/modules/local/quast_bins_summary.nf index c2cb3f2f..8b1734df 100644 --- a/modules/local/quast_bins_summary.nf +++ b/modules/local/quast_bins_summary.nf @@ -3,7 +3,7 @@ process QUAST_BINS_SUMMARY { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path(summaries) diff --git a/modules/local/rename_postdastool.nf b/modules/local/rename_postdastool.nf index 99cd9ad2..7d5a325e 100644 --- a/modules/local/rename_postdastool.nf +++ b/modules/local/rename_postdastool.nf @@ -6,7 +6,7 @@ process RENAME_POSTDASTOOL { conda "bioconda::multiqc=1.12" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
         'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' :
-        'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }"
+        'biocontainers/multiqc:1.12--pyhdfd78af_0' }"
 
     input:
     tuple val(meta), path(bins)
diff --git a/modules/local/rename_predastool.nf b/modules/local/rename_predastool.nf
index 9c73abfd..cc3bab18 100644
--- a/modules/local/rename_predastool.nf
+++ b/modules/local/rename_predastool.nf
@@ -6,7 +6,7 @@ process RENAME_PREDASTOOL {
     conda "bioconda::multiqc=1.12"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' :
-        'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }"
+        'biocontainers/multiqc:1.12--pyhdfd78af_0' }"
 
     input:
     tuple val(meta), path(bins)
diff --git a/modules/local/spades.nf b/modules/local/spades.nf
index 1a74fec5..9ef7ec77 100644
--- a/modules/local/spades.nf
+++ b/modules/local/spades.nf
@@ -4,7 +4,7 @@ process SPADES {
     conda "bioconda::spades=3.15.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/spades:3.15.3--h95f258a_0' :
-        'quay.io/biocontainers/spades:3.15.3--h95f258a_0' }"
+        'biocontainers/spades:3.15.3--h95f258a_0' }"
 
     input:
     tuple val(meta), path(reads)
@@ -20,14 +20,17 @@ process SPADES {
     script:
     def args = task.ext.args ?: ''
     maxmem = task.memory.toGiga()
+    // The -s option is not supported for metaspades. Each time this is called with `meta.single_end` it's because
+    // read depth was normalized with BBNorm, which actually outputs pairs, but in an interleaved file.
+    def readstr = meta.single_end ? "--12 ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}"
+
     if ( params.spades_fix_cpus == -1 || task.cpus == params.spades_fix_cpus )
         """
         metaspades.py \
             $args \
             --threads "${task.cpus}" \
             --memory $maxmem \
-            --pe1-1 ${reads[0]} \
-            --pe1-2 ${reads[1]} \
+            ${readstr} \
             -o spades
         mv spades/assembly_graph_with_scaffolds.gfa SPAdes-${meta.id}_graph.gfa
         mv spades/scaffolds.fasta SPAdes-${meta.id}_scaffolds.fasta
diff --git a/modules/local/spadeshybrid.nf b/modules/local/spadeshybrid.nf
index b0957bb1..13578a69 100644
--- a/modules/local/spadeshybrid.nf
+++ b/modules/local/spadeshybrid.nf
@@ -4,7 +4,7 @@ process SPADESHYBRID {
     conda "bioconda::spades=3.15.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/spades:3.15.3--h95f258a_0' :
-        'quay.io/biocontainers/spades:3.15.3--h95f258a_0' }"
+        'biocontainers/spades:3.15.3--h95f258a_0' }"
 
     input:
     tuple val(meta), path(long_reads), path(short_reads)
diff --git a/modules/local/split_fasta.nf b/modules/local/split_fasta.nf
index 3a42ba78..4ea3b757 100644
--- a/modules/local/split_fasta.nf
+++ b/modules/local/split_fasta.nf
@@ -6,7 +6,7 @@ process SPLIT_FASTA {
     conda "bioconda::metabat2=2.15 conda-forge::python=3.6.7 conda-forge::biopython=1.74 conda-forge::pandas=1.1.5"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' :
-        'quay.io/biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }"
+        'biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }"
 
     input:
     tuple val(meta), path(unbinned)
diff --git a/modules/local/tiara_classify.nf b/modules/local/tiara_classify.nf
new file mode 100644
index 00000000..8fde5241
--- /dev/null
+++ b/modules/local/tiara_classify.nf
@@ -0,0 +1,50 @@
+process TIARA_CLASSIFY {
+    tag "${meta.id}"
+    label "process_single"
+
+    conda "conda-forge::r-tidyverse=1.3.1 conda-forge::r-optparse=1.7.3"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0' :
+        'biocontainers/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0' }"
+
+    input:
+    tuple val(meta), path(classification), path(contig2bin), path(bins)
+
+    output:
+    tuple val(meta), path("eukarya/*.fa"), emit: eukarya_bins, optional: true
+    tuple val(meta), path("prokarya/*.fa"), emit: prokarya_bins, optional: true
+    tuple val(meta), path("bacteria/*.fa"), emit: bacteria_bins, optional: true
+    tuple val(meta), path("archaea/*.fa"), emit: archaea_bins, optional: true
+    tuple val(meta), path("organelle/*.fa"), emit: organelle_bins, optional: true
+    tuple val(meta), path("unknown/*.fa"), emit: unknown_bins, optional: true
+    tuple val(meta), path("*.binclassification.tsv"), emit: bin_classifications
+    path 'versions.yml', emit: versions
+
+    script:
+    def args = task.ext.args ?: ""
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    domain_classification.R \
+        --classification_file ${classification} \
+        --contig_to_bin ${contig2bin} \
+        ${args} \
+        --output_prefix ${prefix}
+
+    mkdir eukarya
+    mkdir prokarya
+    mkdir bacteria
+    mkdir archaea
+    mkdir organelle
+    mkdir unknown
+
+    while IFS=\$"\t" read bin domain; do
+        find -L . -name "\${bin}*" -exec mv {} \${domain}/ \\;
+    done < bin2classification.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        r-base: \$(R --version | head -n 1 | grep -Eo '[0-9.]+ ')
+        r-tidyverse: \$(cat tidyverse_version.txt)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/adapterremoval/main.nf b/modules/nf-core/adapterremoval/main.nf
index 324b4b06..29aac1c0 100644
--- a/modules/nf-core/adapterremoval/main.nf
+++ b/modules/nf-core/adapterremoval/main.nf
@@ -5,7 +5,7 @@ process ADAPTERREMOVAL {
     conda "bioconda::adapterremoval=2.3.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/adapterremoval:2.3.2--hb7ba0dd_0' : - 'quay.io/biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }" + 'biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/aria2/aria2.diff b/modules/nf-core/aria2/aria2.diff index 5d9b47f3..789fdb44 100644 --- a/modules/nf-core/aria2/aria2.diff +++ b/modules/nf-core/aria2/aria2.diff @@ -1,15 +1,6 @@ Changes in module 'nf-core/aria2' --- modules/nf-core/aria2/main.nf +++ modules/nf-core/aria2/main.nf -@@ -3,7 +3,7 @@ - tag "$source_url" - label 'process_single' - -- conda "conda-forge::aria2=1.36.0" -+ conda "conda-forge::aria2=1.36.0 conda-forge::tar" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/aria2:1.36.0' : - 'quay.io/biocontainers/aria2:1.36.0' }" @@ -12,7 +12,7 @@ val source_url diff --git a/modules/nf-core/aria2/main.nf b/modules/nf-core/aria2/main.nf index 0dcd7423..b6091dad 100644 --- a/modules/nf-core/aria2/main.nf +++ b/modules/nf-core/aria2/main.nf @@ -3,10 +3,10 @@ process ARIA2 { tag "$source_url" label 'process_single' - conda "conda-forge::aria2=1.36.0 conda-forge::tar" + conda "conda-forge::aria2=1.36.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/aria2:1.36.0' : - 'quay.io/biocontainers/aria2:1.36.0' }" + 'biocontainers/aria2:1.36.0' }" input: val source_url diff --git a/modules/nf-core/aria2/meta.yml b/modules/nf-core/aria2/meta.yml index 031b319e..64c2a524 100644 --- a/modules/nf-core/aria2/meta.yml +++ b/modules/nf-core/aria2/meta.yml @@ -5,10 +5,9 @@ keywords: tools: - "aria2": description: "aria2 is a lightweight multi-protocol & multi-source, cross platform download utility operated in command-line. It supports HTTP/HTTPS, FTP, SFTP, BitTorrent and Metalink." - homepage: "None" - documentation: "None" + tool_dev_url: "https://github.com/aria2/aria2/" - doi: "" + licence: "['GPL v2']" input: diff --git a/modules/nf-core/bbmap/bbnorm/main.nf b/modules/nf-core/bbmap/bbnorm/main.nf new file mode 100644 index 00000000..9974bfb4 --- /dev/null +++ b/modules/nf-core/bbmap/bbnorm/main.nf @@ -0,0 +1,42 @@ +process BBMAP_BBNORM { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bbmap=39.01 pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-008daec56b7aaf3f162d7866758142b9f889d690:e8a286b2e789c091bac0a57302cdc78aa0112353-0': + 'biocontainers/mulled-v2-008daec56b7aaf3f162d7866758142b9f889d690:e8a286b2e789c091bac0a57302cdc78aa0112353-0' }" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("*.fastq.gz"), emit: fastq + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + input = meta.single_end ? "in=${fastq.join(',')}" : "in=${fastq[0]} in2=${fastq[1]}" + output = meta.single_end ? 
"out=${prefix}.fastq.gz" : "out1=${prefix}_1.nm.fastq.gz out2=${prefix}_2.nm.fastq.gz" + + """ + bbnorm.sh \\ + $input \\ + $output \\ + $args \\ + threads=$task.cpus \\ + -Xmx${task.memory.toGiga()}g \\ + &> ${prefix}.bbnorm.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/bbnorm/meta.yml b/modules/nf-core/bbmap/bbnorm/meta.yml new file mode 100644 index 00000000..6c81bb41 --- /dev/null +++ b/modules/nf-core/bbmap/bbnorm/meta.yml @@ -0,0 +1,42 @@ +name: bbmap_bbnorm +description: BBNorm is designed to normalize coverage by down-sampling reads over high-depth areas of a genome, to result in a flat coverage distribution. +keywords: + - normalization + - assembly + - coverage +tools: + - bbmap: + description: "BBMap is a short read aligner, as well as various other bioinformatic tools." + homepage: "https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/" + documentation: "https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/" + tool_dev_url: "https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/" + licence: "BBMap - Bushnell B. - sourceforge.net/projects/bbmap/" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: fastq file + pattern: "*.{fastq,fq}(.gz)?" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: fastq file + pattern: "*.{fastq, fq}.gz" + +authors: + - "@danilodileo" diff --git a/modules/nf-core/bcftools/consensus/main.nf b/modules/nf-core/bcftools/consensus/main.nf index a32d94b1..2c5e8607 100644 --- a/modules/nf-core/bcftools/consensus/main.nf +++ b/modules/nf-core/bcftools/consensus/main.nf @@ -2,10 +2,10 @@ process BCFTOOLS_CONSENSUS { tag "$meta.id" label 'process_medium' - conda "bioconda::bcftools=1.16" + conda "bioconda::bcftools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bcftools:1.16--hfe4b78e_1': - 'quay.io/biocontainers/bcftools:1.16--hfe4b78e_1' }" + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" input: tuple val(meta), path(vcf), path(tbi), path(fasta) diff --git a/modules/nf-core/bcftools/index/main.nf b/modules/nf-core/bcftools/index/main.nf index f1c897cd..43360aab 100644 --- a/modules/nf-core/bcftools/index/main.nf +++ b/modules/nf-core/bcftools/index/main.nf @@ -2,10 +2,10 @@ process BCFTOOLS_INDEX { tag "$meta.id" label 'process_low' - conda "bioconda::bcftools=1.16" + conda "bioconda::bcftools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bcftools:1.16--hfe4b78e_1': - 'quay.io/biocontainers/bcftools:1.16--hfe4b78e_1' }" + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" input: tuple val(meta), path(vcf) diff --git a/modules/nf-core/bcftools/view/main.nf b/modules/nf-core/bcftools/view/main.nf index 04ced9c9..86f807d3 100644 --- a/modules/nf-core/bcftools/view/main.nf +++ b/modules/nf-core/bcftools/view/main.nf @@ -2,10 +2,10 @@ process BCFTOOLS_VIEW { tag "$meta.id" label 'process_medium' - conda "bioconda::bcftools=1.16" + conda "bioconda::bcftools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bcftools:1.16--hfe4b78e_1': - 'quay.io/biocontainers/bcftools:1.16--hfe4b78e_1' }" + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" input: tuple val(meta), path(vcf), path(index) diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 00000000..5021e6fc --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 +1,80 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 00000000..8a39e309 --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,40 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - cat + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/checkm/lineagewf/main.nf b/modules/nf-core/checkm/lineagewf/main.nf index 85d5f7f9..d8674ddc 100644 --- a/modules/nf-core/checkm/lineagewf/main.nf +++ b/modules/nf-core/checkm/lineagewf/main.nf @@ -5,7 +5,7 @@ process CHECKM_LINEAGEWF { conda "bioconda::checkm-genome=1.2.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : - 'quay.io/biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" + 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" input: tuple val(meta), path(fasta, stageAs: "input_bins/*") diff --git a/modules/nf-core/checkm/qa/main.nf b/modules/nf-core/checkm/qa/main.nf index e62eb5ef..b0c0e69a 100644 --- a/modules/nf-core/checkm/qa/main.nf +++ b/modules/nf-core/checkm/qa/main.nf @@ -5,7 +5,7 @@ process CHECKM_QA { conda "bioconda::checkm-genome=1.2.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : - 'quay.io/biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" + 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" input: tuple val(meta), path(analysis_dir), path(marker_file), path(coverage_file) diff --git a/modules/nf-core/concoct/concoct/main.nf b/modules/nf-core/concoct/concoct/main.nf index 563f28fb..536d195d 100644 --- a/modules/nf-core/concoct/concoct/main.nf +++ b/modules/nf-core/concoct/concoct/main.nf @@ -1,12 +1,12 @@ process CONCOCT_CONCOCT { tag "$meta.id" - label 'process_low' + label 'process_high' conda "bioconda::concoct=1.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py38h7be5676_2': - 'quay.io/biocontainers/concoct:1.1.0--py38h7be5676_2' }" + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py311h245ed52_4': + 'biocontainers/concoct:1.1.0--py311h245ed52_4' }" input: tuple val(meta), path(coverage_file), path(fasta) diff --git a/modules/nf-core/concoct/concoctcoveragetable/main.nf b/modules/nf-core/concoct/concoctcoveragetable/main.nf index 5be484ae..21f0f218 100644 --- a/modules/nf-core/concoct/concoctcoveragetable/main.nf +++ b/modules/nf-core/concoct/concoctcoveragetable/main.nf @@ -5,8 +5,8 @@ process CONCOCT_CONCOCTCOVERAGETABLE { conda "bioconda::concoct=1.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py38h7be5676_2': - 'quay.io/biocontainers/concoct:1.1.0--py38h7be5676_2' }" + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py311h245ed52_4': + 'biocontainers/concoct:1.1.0--py311h245ed52_4' }" input: tuple val(meta), path(bed), path(bamfiles), path(baifiles) diff --git a/modules/nf-core/concoct/cutupfasta/main.nf b/modules/nf-core/concoct/cutupfasta/main.nf index a765d91c..dba2fc58 100644 --- a/modules/nf-core/concoct/cutupfasta/main.nf +++ b/modules/nf-core/concoct/cutupfasta/main.nf @@ -5,8 +5,8 @@ process CONCOCT_CUTUPFASTA { conda "bioconda::concoct=1.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py38h7be5676_2': - 'quay.io/biocontainers/concoct:1.1.0--py38h7be5676_2' }" + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py311h245ed52_4': + 'biocontainers/concoct:1.1.0--py311h245ed52_4' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/concoct/extractfastabins/main.nf b/modules/nf-core/concoct/extractfastabins/main.nf index da6dd91b..102ea934 100644 --- a/modules/nf-core/concoct/extractfastabins/main.nf +++ b/modules/nf-core/concoct/extractfastabins/main.nf @@ -4,8 +4,8 @@ process CONCOCT_EXTRACTFASTABINS { conda "bioconda::concoct=1.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py38h7be5676_2': - 'quay.io/biocontainers/concoct:1.1.0--py38h7be5676_2' }" + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py311h245ed52_4': + 'biocontainers/concoct:1.1.0--py311h245ed52_4' }" input: tuple val(meta), path(original_fasta), path(csv) diff --git a/modules/nf-core/concoct/mergecutupclustering/main.nf b/modules/nf-core/concoct/mergecutupclustering/main.nf index dac6a006..d95efde3 100644 --- a/modules/nf-core/concoct/mergecutupclustering/main.nf +++ b/modules/nf-core/concoct/mergecutupclustering/main.nf @@ -4,8 +4,8 @@ process CONCOCT_MERGECUTUPCLUSTERING { conda "bioconda::concoct=1.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py38h7be5676_2': - 'quay.io/biocontainers/concoct:1.1.0--py38h7be5676_2' }" + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py311h245ed52_4': + 'biocontainers/concoct:1.1.0--py311h245ed52_4' }" input: tuple val(meta), path(clustering_csv) diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df21765..c9d014b1 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 60b546a0..c32657de 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,7 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index e55b8d43..da033408 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -4,11 +4,10 @@ """Provide functions to merge multiple versions.yml files.""" +import yaml import platform from textwrap import dedent -import yaml - def _make_versions_html(versions): """Generate a tabular HTML output of all versions for MultiQC.""" diff --git a/modules/nf-core/dastool/dastool/main.nf b/modules/nf-core/dastool/dastool/main.nf index cfa36ec8..8440edc7 100644 --- a/modules/nf-core/dastool/dastool/main.nf +++ b/modules/nf-core/dastool/dastool/main.nf @@ -5,7 +5,7 @@ process DASTOOL_DASTOOL { conda "bioconda::das_tool=1.1.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/das_tool:1.1.6--r42hdfd78af_0' : - 'quay.io/biocontainers/das_tool:1.1.6--r42hdfd78af_0' }" + 'biocontainers/das_tool:1.1.6--r42hdfd78af_0' }" input: tuple val(meta), path(contigs), path(bins) diff --git a/modules/nf-core/dastool/dastool/meta.yml b/modules/nf-core/dastool/dastool/meta.yml index 0889ca47..1d4ffa8b 100644 --- a/modules/nf-core/dastool/dastool/meta.yml +++ b/modules/nf-core/dastool/dastool/meta.yml @@ -71,6 +71,7 @@ output: description: Quality and completeness estimates of input bin sets pattern: "*.eval" - bins: + type: file description: Final refined bins in fasta format pattern: "*.fa" - pdfs: diff --git a/modules/nf-core/dastool/fastatocontig2bin/main.nf b/modules/nf-core/dastool/fastatocontig2bin/main.nf index 5e024d10..f4f77c0f 100644 --- a/modules/nf-core/dastool/fastatocontig2bin/main.nf +++ b/modules/nf-core/dastool/fastatocontig2bin/main.nf @@ -5,7 +5,7 @@ process DASTOOL_FASTATOCONTIG2BIN { conda "bioconda::das_tool=1.1.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/das_tool:1.1.6--r42hdfd78af_0' : - 'quay.io/biocontainers/das_tool:1.1.6--r42hdfd78af_0' }" + 'biocontainers/das_tool:1.1.6--r42hdfd78af_0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index e1ed9288..831b7f12 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -2,10 +2,10 @@ process FASTP { tag "$meta.id" label 'process_medium' - conda "bioconda::fastp=0.23.2" + conda "bioconda::fastp=0.23.4" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastp:0.23.2--h79da9fb_0' : - 'quay.io/biocontainers/fastp:0.23.2--h79da9fb_0' }" + 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : + 'biocontainers/fastp:0.23.4--h5f740d0_0' }" input: tuple val(meta), path(reads) @@ -58,7 +58,6 @@ process FASTP { [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz fastp \\ - --stdout \\ --in1 ${prefix}.fastq.gz \\ --out1 ${prefix}.fastp.fastq.gz \\ --thread $task.cpus \\ diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml index 6f6fad74..197ea7ca 100644 --- a/modules/nf-core/fastp/meta.yml +++ b/modules/nf-core/fastp/meta.yml @@ -9,7 +9,7 @@ tools: description: | A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. documentation: https://github.com/OpenGene/fastp - doi: https://doi.org/10.1093/bioinformatics/bty560 + doi: 10.1093/bioinformatics/bty560 licence: ["MIT"] input: - meta: diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 9ae58381..249f9064 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -5,7 +5,7 @@ process FASTQC { conda "bioconda::fastqc=0.11.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'quay.io/biocontainers/fastqc:0.11.9--0' }" + 'biocontainers/fastqc:0.11.9--0' }" input: tuple val(meta), path(reads) @@ -29,7 +29,11 @@ process FASTQC { printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name done - fastqc $args --threads $task.cpus $renamed_files + + fastqc \\ + $args \\ + --threads $task.cpus \\ + $renamed_files cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test new file mode 100644 index 00000000..3961de60 --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process FASTQC" + script "modules/nf-core/fastqc/main.nf" + process "FASTQC" + tag "fastqc" + + test("Single-Read") { + + when { + process { + """ + input[0] = [ + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assert process.success + assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" + assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") + assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" + } + + } + +} diff --git a/modules/nf-core/freebayes/main.nf b/modules/nf-core/freebayes/main.nf index 35ced9f1..1466f085 100644 --- a/modules/nf-core/freebayes/main.nf +++ b/modules/nf-core/freebayes/main.nf @@ -5,7 +5,7 @@ process FREEBAYES { conda "bioconda::freebayes=1.3.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/freebayes:1.3.6--hbfe0e7f_2' : - 'quay.io/biocontainers/freebayes:1.3.6--hbfe0e7f_2' }" + 'biocontainers/freebayes:1.3.6--hbfe0e7f_2' }" input: tuple val(meta), path(input_1), path(input_1_index), path(input_2), path(input_2_index), path(target_bed) diff --git a/modules/nf-core/freebayes/meta.yml b/modules/nf-core/freebayes/meta.yml index cbbd297e..17d83cba 100644 --- a/modules/nf-core/freebayes/meta.yml +++ b/modules/nf-core/freebayes/meta.yml @@ -15,7 +15,7 @@ tools: homepage: https://github.com/freebayes/freebayes documentation: https://github.com/freebayes/freebayes tool_dev_url: https://github.com/freebayes/freebayes - doi: "arXiv:1207.3907" + doi: "10.48550/arXiv.1207.3907" licence: ["MIT"] input: diff --git a/modules/nf-core/genomad/download/main.nf b/modules/nf-core/genomad/download/main.nf new file mode 100644 index 00000000..a2ac6ecb --- /dev/null +++ b/modules/nf-core/genomad/download/main.nf @@ -0,0 +1,72 @@ +process GENOMAD_DOWNLOAD { + label 'process_single' + + conda "bioconda::genomad=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/genomad:1.5.2--pyhdfd78af_0': + 'biocontainers/genomad:1.5.2--pyhdfd78af_0' }" + + output: + path "genomad_db/" , emit: genomad_db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + genomad \\ + download-database . 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genomad: \$(echo \$(genomad --version 2>&1) | sed 's/^.*geNomad, version //; s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + mkdir genomad_db + touch genomad_db/genomad_db + touch genomad_db/genomad_db.dbtype + touch genomad_db/genomad_db.index + touch genomad_db/genomad_db.lookup + touch genomad_db/genomad_db.source + touch genomad_db/genomad_db_h + touch genomad_db/genomad_db_h.dbtype + touch genomad_db/genomad_db_h.index + touch genomad_db/genomad_db_mapping + touch genomad_db/genomad_db_taxonomy + touch genomad_db/genomad_integrase_db + touch genomad_db/genomad_integrase_db.dbtype + touch genomad_db/genomad_integrase_db.index + touch genomad_db/genomad_integrase_db.lookup + touch genomad_db/genomad_integrase_db.source + touch genomad_db/genomad_integrase_db_h + touch genomad_db/genomad_integrase_db_h.dbtype + touch genomad_db/genomad_integrase_db_h.index + touch genomad_db/genomad_marker_metadata.tsv + touch genomad_db/genomad_mini_db + touch genomad_db/genomad_mini_db.dbtype + touch genomad_db/genomad_mini_db.index + touch genomad_db/genomad_mini_db.lookup + touch genomad_db/genomad_mini_db.source + touch genomad_db/genomad_mini_db_h + touch genomad_db/genomad_mini_db_h.dbtype + touch genomad_db/genomad_mini_db_h.index + touch genomad_db/genomad_mini_db_mapping + touch genomad_db/genomad_mini_db_taxonomy + touch genomad_db/mini_set_ids + touch genomad_db/names.dmp + touch genomad_db/nodes.dmp + touch genomad_db/plasmid_hallmark_annotation.txt + touch genomad_db/version.txt + touch genomad_db/virus_hallmark_annotation.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genomad: \$(echo \$(genomad --version 2>&1) | sed 's/^.*geNomad, version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/genomad/download/meta.yml b/modules/nf-core/genomad/download/meta.yml new file mode 100644 index 00000000..dee0428c --- /dev/null +++ b/modules/nf-core/genomad/download/meta.yml @@ -0,0 +1,31 @@ +name: "genomad_download" +description: Download geNomad databases and related files +keywords: + - metagenomics + - genomad + - database + - download + - phage + - virus + - plasmid +tools: + - "genomad": + description: "Identification of mobile genetic elements" + homepage: https://portal.nersc.gov/genomad/ + documentation: https://portal.nersc.gov/genomad/ + tool_dev_url: https://github.com/apcamargo/genomad/ + doi: 10.1101/2023.03.05.531206 + licence: "['Lawrence Berkeley National Labs BSD variant license']" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - genomad_db: + type: directory + description: Directory containing downloaded data with directory being named "genomad_db" + pattern: "genomad_db" + +authors: + - "@CarsonJM" diff --git a/modules/nf-core/genomad/endtoend/main.nf b/modules/nf-core/genomad/endtoend/main.nf new file mode 100644 index 00000000..48276578 --- /dev/null +++ b/modules/nf-core/genomad/endtoend/main.nf @@ -0,0 +1,82 @@ +process GENOMAD_ENDTOEND { + tag "$meta.id" + label 'process_high' + + conda "bioconda::genomad=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/genomad:1.5.2--pyhdfd78af_0': + 'biocontainers/genomad:1.5.2--pyhdfd78af_0' }" + + input: + tuple val(meta) , path(fasta) + path genomad_db + + output: + tuple val(meta), path("*_aggregated_classification/*_aggregated_classification.tsv") , emit: aggregated_classification + tuple val(meta), path("*_annotate/*_taxonomy.tsv") , emit: taxonomy + tuple val(meta), path("*_find_proviruses/*_provirus.tsv") , emit: provirus + tuple val(meta), path("*_score_calibration/*_compositions.tsv") , emit: compositions , optional: true + tuple val(meta), path("*_score_calibration/*_calibrated_aggregated_classification.tsv") , emit: calibrated_classification , optional: true + tuple val(meta), path("*_summary/*_plasmid.fna") , emit: plasmid_fasta + tuple val(meta), path("*_summary/*_plasmid_genes.tsv") , emit: plasmid_genes + tuple val(meta), path("*_summary/*_plasmid_proteins.faa") , emit: plasmid_proteins + tuple val(meta), path("*_summary/*_plasmid_summary.tsv") , emit: plasmid_summary + tuple val(meta), path("*_summary/*_virus.fna") , emit: virus_fasta + tuple val(meta), path("*_summary/*_virus_genes.tsv") , emit: virus_genes + tuple val(meta), path("*_summary/*_virus_proteins.faa") , emit: virus_proteins + tuple val(meta), path("*_summary/*_virus_summary.tsv") , emit: virus_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + genomad \\ + end-to-end \\ + $fasta \\ + ./ \\ + $genomad_db \\ + --threads $task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genomad: \$(echo \$(genomad --version 2>&1) | sed 's/^.*geNomad, version //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def filename = "${fasta}"[0..<"${fasta}".lastIndexOf('.')] + """ + mkdir ${filename}_aggregated_classification + touch ${filename}_aggregated_classification/${filename}_aggregated_classification.tsv + mkdir ${filename}_annotate + touch ${filename}_annotate/${filename}_taxonomy.tsv + mkdir ${filename}_find_proviruses + touch ${filename}_find_proviruses/${filename}_provirus.tsv + mkdir ${filename}_marker_classification + mkdir ${filename}_nn_classification + mkdir ${filename}_score_calibration + touch ${filename}_score_calibration/${filename}_calibrated_aggregated_classification.tsv + touch ${filename}_score_calibration/${filename}_compositions.tsv + mkdir ${filename}_summary + touch ${filename}_summary/${filename}_plasmid.fna + touch ${filename}_summary/${filename}_plasmid_genes.tsv + touch ${filename}_summary/${filename}_plasmid_proteins.faa + touch ${filename}_summary/${filename}_plasmid_summary.tsv + touch ${filename}_summary/${filename}_virus.fna + touch ${filename}_summary/${filename}_virus_genes.tsv + touch ${filename}_summary/${filename}_virus_proteins.faa + touch ${filename}_summary/${filename}_virus_summary.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genomad: \$(echo \$(genomad --version 2>&1) | sed 's/^.*geNomad, version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/genomad/endtoend/meta.yml b/modules/nf-core/genomad/endtoend/meta.yml new file mode 100644 index 00000000..b5a6f61f --- /dev/null +++ b/modules/nf-core/genomad/endtoend/meta.yml @@ -0,0 +1,103 @@ +name: "genomad_endtoend" + +description: Identify mobile genetic elements present in genomic assemblies +keywords: + - 
metagenomics + - genomad + - database + - download + - phage + - virus + - plasmid + +tools: + - "genomad": + description: "Identification of mobile genetic elements" + homepage: https://portal.nersc.gov/genomad/ + documentation: https://portal.nersc.gov/genomad/ + tool_dev_url: https://github.com/apcamargo/genomad/ + doi: 10.1101/2023.03.05.531206 + licence: "['Lawrence Berkeley National Labs BSD variant license']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file containing contigs/scaffolds/chromosomes + pattern: "*.{fasta,fna,fa}" + - genomad_db: + type: directory + description: Directory pointing to geNomad database + - score_calibration: + type: boolean + description: true/false value to indicate if score calibration should be enabled + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - aggregated_classification: + type: file + description: Combined classification scores for each contig/scaffold/chromosome + pattern: "*_aggregated_classification.tsv" + - taxonomy: + type: file + description: Detailed output of geNomad's marker gene taxonomy analysis + pattern: "*_taxonomy.tsv" + - provirus: + type: file + description: Detailed output of each provirus identified by geNomad's find_proviruses module + pattern: "*_provirus.tsv" + - compositions: + type: file + description: OPTIONAL - Predicted sample composition when `--enable-score-calibration` is used + pattern: "*_compositions.tsv" + - calibrated_classification: + type: file + description: OPTIONAL - Classification scores that have been adjusted based on sample composition when `--enable-score-calibration` is used` + pattern: "*_calibrated_aggregated_classification.tsv" + - plasmid_fasta: + type: file + description: FASTA file containing predicted plasmid sequences + pattern: "*_plasmid.fna" + - plasmid_genes: + type: file + description: TSV file containing predicted plasmid genes and their annotations + pattern: "*_plasmid_genes.tsv" + - plasmid_proteins: + type: file + description: FASTA file containing predicted plasmid protein sequences + pattern: "*_plasmid_proteins.faa" + - plasmid_summary: + type: file + description: TSV file containing a summary of geNomad's plasmid predictions + pattern: "*_plasmid_summary.tsv" + - virus_fasta: + type: file + description: FASTA file containing predicted virus sequences + pattern: "*_virus.fna" + - virus_genes: + type: file + description: TSV file containing predicted virus genes and their annotations + pattern: "*_virus_genes.tsv" + - virus_proteins: + type: file + description: FASTA file containing predicted virus protein sequences + pattern: "*_virus_proteins.faa" + - virus_summary: + type: file + description: TSV file containing a summary of geNomad's virus predictions + pattern: "*_virus_summary.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@CarsonJM" diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf new file mode 100644 index 00000000..0b6b76cc --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -0,0 +1,80 @@ +process GTDBTK_CLASSIFYWF { + tag "${meta.assembler}-${meta.id}" + label 'process_medium' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. 
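+    // Illustrative usage sketch (an editorial assumption, not part of the upstream module): the
+    // second input is a [ db_name, db_files ] tuple that is staged into `database/`. Assuming an
+    // already-extracted GTDB-Tk reference directory and a hypothetical [ meta, bins ] channel
+    // `ch_bins`, the process might be invoked as:
+    //   ch_gtdb_db = Channel.value( [ 'gtdb_r214', file('/path/to/gtdbtk_r214_data/*') ] )
+    //   GTDBTK_CLASSIFYWF ( ch_bins, ch_gtdb_db )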
+ conda "bioconda::gtdbtk=2.1.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gtdbtk:2.1.1--pyhdfd78af_1' : + 'biocontainers/gtdbtk:2.1.1--pyhdfd78af_1' }" + + input: + tuple val(meta), path("bins/*") + tuple val(db_name), path("database/*") + + output: + path "gtdbtk.${meta.assembler}-${meta.id}.*.summary.tsv" , emit: summary + path "gtdbtk.${meta.assembler}-${meta.id}.*.classify.tree.gz" , emit: tree + path "gtdbtk.${meta.assembler}-${meta.id}.*.markers_summary.tsv", emit: markers + path "gtdbtk.${meta.assembler}-${meta.id}.*.msa.fasta.gz" , emit: msa + path "gtdbtk.${meta.assembler}-${meta.id}.*.user_msa.fasta" , emit: user_msa + path "gtdbtk.${meta.assembler}-${meta.id}.*.filtered.tsv" , emit: filtered + path "gtdbtk.${meta.assembler}-${meta.id}.log" , emit: log + path "gtdbtk.${meta.assembler}-${meta.id}.warnings.log" , emit: warnings + path "gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv" , emit: failed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : "" + + """ + export GTDBTK_DATA_PATH="\${PWD}/database" + if [ ${pplacer_scratch} != "" ] ; then + mkdir pplacer_tmp + fi + + gtdbtk classify_wf \\ + $args \\ + --genome_dir bins \\ + --prefix "gtdbtk.${meta.assembler}-${meta.id}" \\ + --out_dir "\${PWD}" \\ + --cpus $task.cpus \\ + --pplacer_cpus $params.gtdbtk_pplacer_cpus \\ + $pplacer_scratch \\ + --min_perc_aa $params.gtdbtk_min_perc_aa \\ + --min_af $params.gtdbtk_min_af + + gzip "gtdbtk.${meta.assembler}-${meta.id}".*.classify.tree "gtdbtk.${meta.assembler}-${meta.id}".*.msa.fasta + mv gtdbtk.log "gtdbtk.${meta.assembler}-${meta.id}.log" + mv gtdbtk.warnings.log "gtdbtk.${meta.assembler}-${meta.id}.warnings.log" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gtdbtk: \$(echo \$(gtdbtk --version -v 2>&1) | sed "s/gtdbtk: version //; s/ Copyright.*//") + END_VERSIONS + """ + + stub: + def VERSION = '2.1.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + + """ + touch gtdbtk.${meta.assembler}-${meta.id}.stub.summary.tsv + touch gtdbtk.${meta.assembler}-${meta.id}.stub.classify.tree.gz + touch gtdbtk.${meta.assembler}-${meta.id}.stub.markers_summary.tsv + touch gtdbtk.${meta.assembler}-${meta.id}.stub.msa.fasta.gz + touch gtdbtk.${meta.assembler}-${meta.id}.stub.user_msa.fasta + touch gtdbtk.${meta.assembler}-${meta.id}.stub.filtered.tsv + touch gtdbtk.${meta.assembler}-${meta.id}.log + touch gtdbtk.${meta.assembler}-${meta.id}.warnings.log + touch gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gtdbtk: \$(echo "$VERSION") + END_VERSIONS + """ +} diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml new file mode 100644 index 00000000..4e7ec5f1 --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/meta.yml @@ -0,0 +1,83 @@ +name: gtdbtk_classifywf +description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. 
+keywords: + - GTDB taxonomy + - taxonomic classification + - metagenomics + - classification + - genome taxonomy database + - bacteria + - archaea +tools: + - gtdbtk: + description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. + homepage: https://ecogenomics.github.io/GTDBTk/ + documentation: https://ecogenomics.github.io/GTDBTk/ + tool_dev_url: https://github.com/Ecogenomics/GTDBTk + doi: "10.1093/bioinformatics/btz848" + licence: ["GNU General Public v3 (GPL v3)"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + - bins: + type: file + description: The binned fasta files from the assembler + pattern: "*.{fasta,fa}" + - database: + type: file + description: The local copy of the taxonomic database used by GTDB-tk (unzipped copy) + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - summary: + type: file + description: A TSV summary file for the classification + pattern: "*.{summary.tsv}" + - tree: + type: file + description: NJ or UPGMA tree in Newick format produced from a multiple sequence alignment + pattern: "*.{classify.tree.gz}" + - markers: + type: file + description: A TSV summary file lineage markers used for the classification. + pattern: "*.{markers_summary.tsv}" + - msa: + type: file + description: Multiple sequence alignments file. + pattern: "*.{msa.fasta.gz}" + - user_msa: + type: file + description: Multiple sequence alignments file for the user-provided files. + pattern: "*.{user_msa.fasta.gz}" + - filtered: + type: file + description: A list of genomes with an insufficient number of amino acids in MSA.. + pattern: "*.{filtered.tsv}" + - log: + type: file + description: GTDB-tk log file + pattern: "*.{log}" + - warnings: + type: file + description: GTDB-tk warnings log file + pattern: "*.{warnings.log}" + - failed: + type: file + description: A TSV summary of the genomes which GTDB-tk failed to classify. + pattern: "*.{failed_genomes.tsv}" +authors: + - "@skrakau" + - "@abhi18av" diff --git a/modules/nf-core/gunc/downloaddb/main.nf b/modules/nf-core/gunc/downloaddb/main.nf index 1e77a4c6..a080d8f2 100644 --- a/modules/nf-core/gunc/downloaddb/main.nf +++ b/modules/nf-core/gunc/downloaddb/main.nf @@ -5,7 +5,7 @@ process GUNC_DOWNLOADDB { conda "bioconda::gunc=1.0.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : - 'quay.io/biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + 'biocontainers/gunc:1.0.5--pyhdfd78af_0' }" input: val db_name diff --git a/modules/nf-core/gunc/mergecheckm/main.nf b/modules/nf-core/gunc/mergecheckm/main.nf index a5c46aca..b6399f22 100644 --- a/modules/nf-core/gunc/mergecheckm/main.nf +++ b/modules/nf-core/gunc/mergecheckm/main.nf @@ -5,7 +5,7 @@ process GUNC_MERGECHECKM { conda "bioconda::gunc=1.0.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : - 'quay.io/biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + 'biocontainers/gunc:1.0.5--pyhdfd78af_0' }" input: tuple val(meta), path(gunc_file), path(checkm_file) diff --git a/modules/nf-core/gunc/run/main.nf b/modules/nf-core/gunc/run/main.nf index 07511c51..2f1167fa 100644 --- a/modules/nf-core/gunc/run/main.nf +++ b/modules/nf-core/gunc/run/main.nf @@ -5,7 +5,7 @@ process GUNC_RUN { conda "bioconda::gunc=1.0.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : - 'quay.io/biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + 'biocontainers/gunc:1.0.5--pyhdfd78af_0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf index d906034c..73bf08cd 100644 --- a/modules/nf-core/gunzip/main.nf +++ b/modules/nf-core/gunzip/main.nf @@ -5,7 +5,7 @@ process GUNZIP { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: tuple val(meta), path(archive) @@ -21,10 +21,14 @@ process GUNZIP { def args = task.ext.args ?: '' gunzip = archive.toString() - '.gz' """ - gunzip \\ - -f \\ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ $args \\ - $archive + $archive \\ + > $gunzip cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml index 4d2ebc84..4cdcdf4c 100644 --- a/modules/nf-core/gunzip/meta.yml +++ b/modules/nf-core/gunzip/meta.yml @@ -3,31 +3,32 @@ description: Compresses and decompresses files. keywords: - gunzip - compression + - decompression tools: - gunzip: - description: | - gzip is a file format and a software application used for file compression and decompression. - documentation: https://www.gnu.org/software/gzip/manual/gzip.html - licence: ["GPL-3.0-or-later"] + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] input: - meta: - type: map - description: | - Optional groovy Map containing meta information - e.g. [ id:'test', single_end:false ] + type: map + description: | + Optional groovy Map containing meta information + e.g. 
[ id:'test', single_end:false ] - archive: - type: file - description: File to be compressed/uncompressed - pattern: "*.*" + type: file + description: File to be compressed/uncompressed + pattern: "*.*" output: - gunzip: - type: file - description: Compressed/uncompressed file - pattern: "*.*" + type: file + description: Compressed/uncompressed file + pattern: "*.*" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@joseespinosa" - "@drpatelh" diff --git a/modules/nf-core/maxbin2/main.nf b/modules/nf-core/maxbin2/main.nf index 3df1bc5f..d5f49344 100644 --- a/modules/nf-core/maxbin2/main.nf +++ b/modules/nf-core/maxbin2/main.nf @@ -5,7 +5,7 @@ process MAXBIN2 { conda "bioconda::maxbin2=2.2.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/maxbin2:2.2.7--he1b5a44_2' : - 'quay.io/biocontainers/maxbin2:2.2.7--he1b5a44_2' }" + 'biocontainers/maxbin2:2.2.7--he1b5a44_2' }" input: tuple val(meta), path(contigs), path(reads), path(abund) diff --git a/modules/nf-core/metabat2/jgisummarizebamcontigdepths/main.nf b/modules/nf-core/metabat2/jgisummarizebamcontigdepths/main.nf index 7b161a89..7804ea01 100644 --- a/modules/nf-core/metabat2/jgisummarizebamcontigdepths/main.nf +++ b/modules/nf-core/metabat2/jgisummarizebamcontigdepths/main.nf @@ -5,7 +5,7 @@ process METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS { conda "bioconda::metabat2=2.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/metabat2:2.15--h986a166_1' : - 'quay.io/biocontainers/metabat2:2.15--h986a166_1' }" + 'biocontainers/metabat2:2.15--h986a166_1' }" input: tuple val(meta), path(bam), path(bai) diff --git a/modules/nf-core/metabat2/metabat2/main.nf b/modules/nf-core/metabat2/metabat2/main.nf index 99393869..7cbee678 100644 --- a/modules/nf-core/metabat2/metabat2/main.nf +++ b/modules/nf-core/metabat2/metabat2/main.nf @@ -5,27 +5,27 @@ process METABAT2_METABAT2 { conda "bioconda::metabat2=2.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/metabat2:2.15--h986a166_1' : - 'quay.io/biocontainers/metabat2:2.15--h986a166_1' }" + 'biocontainers/metabat2:2.15--h986a166_1' }" input: tuple val(meta), path(fasta), path(depth) output: - tuple val(meta), path("*.tooShort.fa.gz") , optional:true , emit: tooshort - tuple val(meta), path("*.lowDepth.fa.gz") , optional:true , emit: lowdepth - tuple val(meta), path("*.unbinned.fa.gz") , optional:true , emit: unbinned - tuple val(meta), path("*.tsv.gz") , optional:true , emit: membership - tuple val(meta), path("bins/*.fa.gz") , optional:true , emit: fasta - path "versions.yml" , emit: versions + tuple val(meta), path("*.tooShort.fa.gz") , optional:true, emit: tooshort + tuple val(meta), path("*.lowDepth.fa.gz") , optional:true, emit: lowdepth + tuple val(meta), path("*.unbinned.fa.gz") , optional:true, emit: unbinned + tuple val(meta), path("*.tsv.gz") , optional:true, emit: membership + tuple val(meta), path("*[!lowDepth|tooShort|unbinned].fa.gz"), optional:true, emit: fasta + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def decompress_depth = depth ? "gzip -d -f $depth" : "" - def depth_file = depth ? "-a ${depth.baseName}" : "" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def decompress_depth = depth ? "gzip -d -f $depth" : "" + def depth_file = depth ? "-a ${depth.baseName}" : "" """ $decompress_depth @@ -35,14 +35,10 @@ process METABAT2_METABAT2 { $depth_file \\ -t $task.cpus \\ --saveCls \\ - -o metabat2/${prefix} + -o ${prefix} - mv metabat2/${prefix} ${prefix}.tsv - mv metabat2 bins - - gzip ${prefix}.tsv - find ./bins/ -name "*.fa" -type f | xargs -t -n 1 bgzip -@ ${task.cpus} - find ./bins/ -name "*[lowDepth,tooShort,unbinned].fa.gz" -type f -exec mv {} . \\; + gzip -cn ${prefix} > ${prefix}.tsv.gz + find . -name "*.fa" -type f | xargs -t -n 1 bgzip -@ ${task.cpus} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/metabat2/metabat2/meta.yml b/modules/nf-core/metabat2/metabat2/meta.yml index 04b8df4f..37f80fdf 100644 --- a/modules/nf-core/metabat2/metabat2/meta.yml +++ b/modules/nf-core/metabat2/metabat2/meta.yml @@ -1,4 +1,5 @@ name: metabat2_metabat2 +description: Metagenome binning of contigs keywords: - sort - binning diff --git a/modules/nf-core/metaeuk/easypredict/main.nf b/modules/nf-core/metaeuk/easypredict/main.nf new file mode 100644 index 00000000..5caf38f9 --- /dev/null +++ b/modules/nf-core/metaeuk/easypredict/main.nf @@ -0,0 +1,62 @@ +process METAEUK_EASYPREDICT { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::metaeuk=6.a5d39d9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/metaeuk:6.a5d39d9--pl5321hf1761c0_2': + 'biocontainers/metaeuk:6.a5d39d9--pl5321hf1761c0_2' }" + + input: + tuple val(meta), path(fasta) + path(database) + + output: + tuple val(meta), path("${prefix}.fas") , emit: faa + tuple val(meta), path("${prefix}.codon.fas"), emit: codon + tuple val(meta), path("*.tsv") , emit: tsv + tuple val(meta), path("*.gff") , emit: gff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + if [ -d ${database} ]; then + ## if supplying an mmseqs database as a directory, metaeuk requires the basename of the database + DBBASE=`find ${database}/ -name "*.version" -exec sh -c 'file=\$(basename {}); echo \${file%%.*}' \\;` + DB=`echo "${database}/\${DBBASE}"` + else + DB=${database} + fi + + metaeuk easy-predict \\ + ${fasta} \\ + \${DB} \\ + ${prefix} \\ + tmp/ \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaeuk: \$(metaeuk | grep 'Version' | sed 's/metaeuk Version: //') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.fas + touch ${prefix}.codon.fas + touch ${prefix}.headersMap.tsv + touch ${prefix}.gff + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaeuk: \$(metaeuk | grep 'Version' | sed 's/metaeuk Version: //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/metaeuk/easypredict/meta.yml b/modules/nf-core/metaeuk/easypredict/meta.yml new file mode 100644 index 00000000..6fe44d0b --- /dev/null +++ b/modules/nf-core/metaeuk/easypredict/meta.yml @@ -0,0 +1,67 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "metaeuk_easypredict" +description: Annotation of eukaryotic metagenomes using MetaEuk +keywords: + - genomics + - annotation + - fasta +tools: + - "metaeuk": + description: "MetaEuk - sensitive, high-throughput gene discovery and annotation for large-scale eukaryotic metagenomics" + homepage: https://github.com/soedinglab/metaeuk + documentation: https://github.com/soedinglab/metaeuk + tool_dev_url: https://github.com/soedinglab/metaeuk + doi: "10.1186/s40168-020-00808-x" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + + - fasta: + type: file + description: Nucleotide FASTA file for annotation + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + + - database: + type: file + description: Either a fasta file containing protein sequences, or a directory containing an mmseqs2-formatted protein database + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ]
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+  - faa:
+      type: file
+      description: Protein FASTA file containing the exons from the input FASTA file
+      pattern: "*.{fas}"
+
+  - codon:
+      type: file
+      description: Nucleotide FASTA file of protein-coding sequences
+      pattern: "*.{codon.fas}"
+
+  - tsv:
+      type: file
+      description: TSV file containing locations of each protein coding sequence in the input fasta
+      pattern: "*.headersMap.{tsv}"
+
+  - gff:
+      type: file
+      description: Annotation file in GFF format
+      pattern: "*.{gff}"
+
+authors:
+  - "@prototaxites"
diff --git a/modules/nf-core/mmseqs/databases/main.nf b/modules/nf-core/mmseqs/databases/main.nf
new file mode 100644
index 00000000..a23693c4
--- /dev/null
+++ b/modules/nf-core/mmseqs/databases/main.nf
@@ -0,0 +1,62 @@
+process MMSEQS_DATABASES {
+    tag "${database}"
+    label 'process_medium'
+
+    conda "bioconda::mmseqs2=14.7e284"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mmseqs2:14.7e284--pl5321h6a68c12_2':
+        'biocontainers/mmseqs2:14.7e284--pl5321h6a68c12_2' }"
+
+    input:
+    val database
+
+    output:
+    path "${prefix}/"   , emit: database
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: 'mmseqs_database'
+    """
+    mkdir ${prefix}/
+
+    mmseqs databases \\
+        ${database} \\
+        ${prefix}/database \\
+        tmp/ \\
+        --threads ${task.cpus} \\
+        --compressed 1 \\
+        ${args}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+    END_VERSIONS
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: 'mmseqs_database'
+    """
+    mkdir ${prefix}/
+
+    touch ${prefix}/database
+    touch ${prefix}/database.dbtype
+    touch ${prefix}/database_h
+    touch ${prefix}/database_h.dbtype
+    touch ${prefix}/database_h.index
+    touch ${prefix}/database.index
+    touch ${prefix}/database.lookup
+    touch ${prefix}/database_mapping
+    touch ${prefix}/database.source
+    touch ${prefix}/database_taxonomy
+    touch ${prefix}/database.version
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/mmseqs/databases/meta.yml b/modules/nf-core/mmseqs/databases/meta.yml
new file mode 100644
index 00000000..edd093bd
--- /dev/null
+++ b/modules/nf-core/mmseqs/databases/meta.yml
@@ -0,0 +1,34 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "mmseqs_databases"
+description: Download an mmseqs-formatted database
+keywords:
+  - database
+  - indexing
+  - clustering
+  - searching
+tools:
+  - "mmseqs":
+      description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite"
+      homepage: "https://github.com/soedinglab/MMseqs2"
+      documentation: "https://mmseqs.com/latest/userguide.pdf"
+      tool_dev_url: "https://github.com/soedinglab/MMseqs2"
+      doi: "10.1093/bioinformatics/btw006"
+      licence: "['GPL v3']"
+
+input:
+  - database:
+      type: string
+      description: Database available through the mmseqs2 databases interface - see https://github.com/soedinglab/MMseqs2/wiki#downloading-databases for details
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - database:
+ type: directory + description: Directory containing processed mmseqs database + +authors: + - "@prototaxites" diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 68f66bea..65d7dd0d 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index ebc29b27..f93b5ee5 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,3 +1,4 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: MultiQC description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: @@ -37,7 +38,7 @@ output: description: MultiQC report file pattern: "multiqc_report.html" - data: - type: dir + type: directory description: MultiQC data dir pattern: "multiqc_data" - plots: diff --git a/modules/nf-core/prodigal/main.nf b/modules/nf-core/prodigal/main.nf index e5c28984..8cf87a6d 100644 --- a/modules/nf-core/prodigal/main.nf +++ b/modules/nf-core/prodigal/main.nf @@ -5,18 +5,18 @@ process PRODIGAL { conda "bioconda::prodigal=2.6.3 conda-forge::pigz=2.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' : - 'quay.io/biocontainers/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' }" + 'biocontainers/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' }" input: tuple val(meta), path(genome) val(output_format) output: - tuple val(meta), path("${prefix}.${output_format}"), emit: gene_annotations - tuple val(meta), path("${prefix}.fna"), emit: nucleotide_fasta - tuple val(meta), path("${prefix}.faa"), emit: amino_acid_fasta - tuple val(meta), path("${prefix}_all.txt"), emit: all_gene_annotations - path "versions.yml", emit: versions + tuple val(meta), path("${prefix}.${output_format}.gz"), emit: gene_annotations + tuple val(meta), path("${prefix}.fna.gz"), emit: nucleotide_fasta + tuple val(meta), path("${prefix}.faa.gz"), emit: amino_acid_fasta + tuple val(meta), path("${prefix}_all.txt.gz"), emit: all_gene_annotations + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -33,6 +33,8 @@ process PRODIGAL { -a "${prefix}.faa" \\ -s "${prefix}_all.txt" + pigz -nm ${prefix}* + cat <<-END_VERSIONS > versions.yml "${task.process}": prodigal: \$(prodigal -v 2>&1 | sed -n 's/Prodigal V\\(.*\\):.*/\\1/p') diff --git a/modules/nf-core/prodigal/meta.yml b/modules/nf-core/prodigal/meta.yml index 8cb3d12e..30747a90 100644 --- a/modules/nf-core/prodigal/meta.yml +++ b/modules/nf-core/prodigal/meta.yml @@ -1,7 +1,9 @@ name: prodigal description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program keywords: - - sort + - prokaryotes + - gene finding + - microbial tools: - prodigal: description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program diff --git a/modules/nf-core/prokka/main.nf b/modules/nf-core/prokka/main.nf index 048d373f..60fbe232 100644 --- a/modules/nf-core/prokka/main.nf +++ b/modules/nf-core/prokka/main.nf @@ -5,7 +5,7 @@ process PROKKA { conda "bioconda::prokka=1.14.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/prokka%3A1.14.6--pl5321hdfd78af_4' : - 'quay.io/biocontainers/prokka:1.14.6--pl5321hdfd78af_4' }" + 'biocontainers/prokka:1.14.6--pl5321hdfd78af_4' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/pydamage/analyze/main.nf b/modules/nf-core/pydamage/analyze/main.nf index 5d6fc817..03cbe62a 100644 --- a/modules/nf-core/pydamage/analyze/main.nf +++ b/modules/nf-core/pydamage/analyze/main.nf @@ -5,7 +5,7 @@ process PYDAMAGE_ANALYZE { conda "bioconda::pydamage=0.70" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/pydamage:0.70--pyhdfd78af_0' : - 'quay.io/biocontainers/pydamage:0.70--pyhdfd78af_0' }" + 'biocontainers/pydamage:0.70--pyhdfd78af_0' }" input: tuple val(meta), path(bam), path(bai) diff --git a/modules/nf-core/pydamage/filter/main.nf b/modules/nf-core/pydamage/filter/main.nf index 71f11d8f..59d6e4b9 100644 --- a/modules/nf-core/pydamage/filter/main.nf +++ b/modules/nf-core/pydamage/filter/main.nf @@ -5,7 +5,7 @@ process PYDAMAGE_FILTER { conda "bioconda::pydamage=0.70" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/pydamage:0.70--pyhdfd78af_0' : - 'quay.io/biocontainers/pydamage:0.70--pyhdfd78af_0' }" + 'biocontainers/pydamage:0.70--pyhdfd78af_0' }" input: tuple val(meta), path(csv) diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf index ce6580d2..59ed3088 100644 --- a/modules/nf-core/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -2,18 +2,20 @@ process SAMTOOLS_FAIDX { tag "$fasta" label 'process_single' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(fasta) + tuple val(meta2), path(fai) output: - tuple val(meta), path ("*.fai"), emit: fai - tuple val(meta), path ("*.gzi"), emit: gzi, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -23,8 +25,8 @@ process SAMTOOLS_FAIDX { """ samtools \\ faidx \\ - $args \\ - $fasta + $fasta \\ + $args cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -33,8 +35,12 @@ process SAMTOOLS_FAIDX { """ stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? "touch ${match[0][1]}" : '' """ + ${fastacmd} touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml index fe2fe9a1..957b25e5 100644 --- a/modules/nf-core/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -3,6 +3,7 @@ description: Index FASTA file keywords: - index - fasta + - faidx tools: - samtools: description: | @@ -17,12 +18,21 @@ input: - meta: type: map description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] + Groovy Map containing reference information + e.g. [ id:'test' ] - fasta: type: file description: FASTA file pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" output: - meta: type: map diff --git a/modules/nf-core/seqtk/mergepe/main.nf b/modules/nf-core/seqtk/mergepe/main.nf new file mode 100644 index 00000000..6a4362e5 --- /dev/null +++ b/modules/nf-core/seqtk/mergepe/main.nf @@ -0,0 +1,46 @@ +process SEQTK_MERGEPE { + tag "$meta.id" + label 'process_single' + + conda "bioconda::seqtk=1.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.3--h5bf99c6_3' : + 'biocontainers/seqtk:1.3--h5bf99c6_3' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + ln -s ${reads} ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + seqtk \\ + mergepe \\ + $args \\ + ${reads} \\ + | gzip -n >> ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/seqtk/mergepe/meta.yml b/modules/nf-core/seqtk/mergepe/meta.yml new file mode 100644 index 00000000..8248ee09 --- /dev/null +++ b/modules/nf-core/seqtk/mergepe/meta.yml @@ -0,0 +1,40 @@ +name: seqtk_mergepe +description: Interleave pair-end reads from FastQ files +keywords: + - interleave +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. Seqtk mergepe command merges pair-end reads into one interleaved file. + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files of size 1 and 2 for single-end and paired-end data,respectively. + pattern: "*.{fastq.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: If single-end reads, the output is the same as the input, 1 FastQ file for each read. If pair-end reads, the read pairs will be interleaved and output as 1 FastQ file for each read pair. + pattern: "*.{fastq.gz}" + +authors: + - "@emnilsson" diff --git a/modules/nf-core/tiara/tiara/main.nf b/modules/nf-core/tiara/tiara/main.nf new file mode 100644 index 00000000..ec28032d --- /dev/null +++ b/modules/nf-core/tiara/tiara/main.nf @@ -0,0 +1,63 @@ +process TIARA_TIARA { + tag "$meta.id" + label 'process_medium' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda "conda-forge::tiara=1.0.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/tiara:1.0.3' : + 'biocontainers/tiara:1.0.3' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("${prefix}.{txt,txt.gz}") , emit: classifications + tuple val(meta), path("log_*.{txt,txt.gz}") , emit: log + tuple val(meta), path("*.{fasta,fasta.gz}") , emit: fasta, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.3' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + tiara -i ${fasta} \ + -o ${prefix}.txt \ + --threads ${task.cpus} \ + ${args} + + ## fix gzip flag weirdness and ensure consistent .fasta filename output + ## check if fasta files are being output + if echo "${args}" | grep -qE "tf|to-fasta"; then + ## check if we've asked for gzip output, then rename files consistently + if echo "${args}" | grep -q "gz"; then + find . -name "*_${fasta}*" -exec sh -c 'file=`basename {}`; mv "\$file" "\${file%%_*}_${prefix}.fasta.gz"' \\; + else + find . -name "*_${fasta}*" -exec sh -c 'file=`basename {}`; mv "\$file" "\${file%%_*}_${prefix}.fasta"' \\; + fi + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tiara: ${VERSION} + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.3' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + touch ${prefix}.out.txt + touch log_${prefix}.out.txt + touch bacteria_${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tiara: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/tiara/tiara/meta.yml b/modules/nf-core/tiara/tiara/meta.yml new file mode 100644 index 00000000..687bb63e --- /dev/null +++ b/modules/nf-core/tiara/tiara/meta.yml @@ -0,0 +1,52 @@ +name: "tiara_tiara" +description: Domain-level classification of contigs to bacterial, archaeal, eukaryotic, or organelle +keywords: + - contigs + - metagenomics + - classify +tools: + - "tiara": + description: "Deep-learning-based approach for identification of eukaryotic sequences in the metagenomic data powered by PyTorch." + homepage: "https://ibe-uw.github.io/tiara/" + documentation: https://ibe-uw.github.io/tiara/" + tool_dev_url: "https://github.com/ibe-uw/tiara" + doi: "10.1093/bioinformatics/btab672" + licence: "MIT" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file of assembled contigs. + pattern: "*.{fa,fa.gz,fasta,fasta.gz,fna,fna.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classifications: + type: file + description: TSV file containing per-contig classification probabilities and overall classifications. Gzipped if flag --gz is set. + pattern: "*.{txt,txt.gz}" + - log: + type: file + description: Log file containing tiara model parameters. Gzipped if flag --gz is set. 
+ pattern: "log_*.{txt,txt.gz}" + - fasta: + type: file + description: | + (optional) - fasta files for each domain category specified in command flag `-tf`, containing classified contigs + pattern: "*.{fasta,fasta.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@prototaxites" diff --git a/nextflow.config b/nextflow.config index 0d0fa8a5..0ac8d964 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,6 +12,7 @@ params { // Input options input = null single_end = false + assembly_input = null // short read preprocessing options skip_clipping = false @@ -34,10 +35,15 @@ params { host_removal_verysensitive = false host_removal_save_ids = false save_hostremoved_reads = false + bbnorm = false + bbnorm_target = 100 + bbnorm_min = 5 + save_bbnorm_reads = false // binning options bowtie2_mode = null binning_map_mode = 'group' + save_assembly_mapped_reads = false skip_binning = false min_contig_size = 1500 min_length_unbinned_contigs = 1000000 @@ -54,25 +60,34 @@ params { skip_quast = false skip_prodigal = false + // virus identification options + run_virus_identification = false + genomad_db = null + genomad_min_score = 0.7 + genomad_splits = 1 + // ancient DNA assembly validation options ancient_dna = false + pydamage_accuracy = 0.5 + skip_ancient_damagecorrection = false freebayes_ploidy = 1 freebayes_min_basequality = 20 freebayes_minallelefreq = 0.33 bcftools_view_high_variant_quality = 30 bcftools_view_medium_variant_quality = 20 bcftools_view_minimal_allelesupport = 3 - pydamage_accuracy = 0.5 // taxonomy options centrifuge_db = null kraken2_db = null skip_krona = false + krona_db = null cat_db = null cat_db_generate = false cat_official_taxonomy = false save_cat_db = false - gtdb = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz" + skip_gtdbtk = false + gtdb_db = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz" gtdbtk_min_completeness = 50.0 gtdbtk_max_contamination = 10.0 gtdbtk_min_perc_aa = 10 @@ -96,6 +111,9 @@ params { skip_metabat2 = false skip_maxbin2 = false skip_concoct = false + bin_domain_classification = false + bin_domain_classification_tool = 'tiara' + tiara_min_length = 3000 refine_bins_dastool = false refine_bins_dastool_threshold = 0.5 postbinning_input = 'raw_bins_only' @@ -108,6 +126,7 @@ params { busco_auto_lineage_prok = false save_busco_reference = false busco_clean = false + checkm_download_url = "https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz" checkm_db = null save_checkm_data = false run_gunc = false @@ -121,6 +140,12 @@ params { spadeshybrid_fix_cpus = -1 metabat_rng_seed = 1 + // Annotation options + skip_metaeuk = false + metaeuk_mmseqs_db = null + metaeuk_db = null + save_mmseqs_db = false + // References igenomes_base = 's3://ngi-igenomes/igenomes' igenomes_ignore = false @@ -134,7 +159,6 @@ params { // Boilerplate options outdir = null - tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -143,19 +167,14 @@ params { hook_url = null help = false version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes' - // Config options - custom_config_version = 'master' - custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_description = null - config_profile_contact = 
null - config_profile_url = null - config_profile_name = null - + config_profile_name = null + config_profile_description = null + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + config_profile_contact = null + config_profile_url = null // Max resource options // Defaults only, expecting to be overwritten @@ -163,6 +182,13 @@ params { max_cpus = 16 max_time = '240.h' + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes' + validationShowHiddenParams = false + validate_params = true + } // Load base.config by default for all pipelines @@ -178,14 +204,17 @@ try { // Load nf-core/mag custom profiles from different institutions. // Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs! try { - includeConfig "${params.custom_config_base}/pipeline/mag.config" + includeConfig "${params.custom_config_base}/pipeline/mag.config" } catch (Exception e) { - System.err.println("WARNING: Could not load nf-core/config/mag profiles: ${params.custom_config_base}/pipeline/mag.config") + System.err.println("WARNING: Could not load nf-core/config/mag profiles: ${params.custom_config_base}/pipeline/mag.config") } - profiles { - debug { process.beforeScript = 'echo $HOSTNAME' } + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false + } conda { conda.enabled = true docker.enabled = false @@ -193,6 +222,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } mamba { conda.enabled = true @@ -202,14 +232,17 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } docker { docker.enabled = true docker.userEmulation = true + conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } arm { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' @@ -217,31 +250,48 @@ profiles { singularity { singularity.enabled = true singularity.autoMounts = true + conda.enabled = false docker.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } podman { podman.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } shifter { shifter.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false charliecloud.enabled = false + apptainer.enabled = false } charliecloud { charliecloud.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false + apptainer.enabled = false + } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } gitpod { executor.name = 'local' @@ -256,10 +306,26 @@ profiles { test_busco_auto { includeConfig 'conf/test_busco_auto.config' } test_ancient_dna { includeConfig 'conf/test_ancient_dna.config' } test_adapterremoval { includeConfig 'conf/test_adapterremoval.config' } + test_binning_entry { includeConfig 'conf/test_binning_entry.config' } 
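+    // Example invocation (illustrative only): any of these test profiles can be combined with a
+    // container profile and run locally, e.g.
+    //   nextflow run nf-core/mag -profile test_binning_entry,docker --outdir <OUTDIR>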
test_binrefinement { includeConfig 'conf/test_binrefinement.config' } test_no_clipping { includeConfig 'conf/test_no_clipping.config' } + test_bbnorm { includeConfig 'conf/test_bbnorm.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } + test_virus_identification { includeConfig 'conf/test_virus_identification.config' } } +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' + +// Nextflow plugins +plugins { + id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} // Load igenomes.config if required if (!params.igenomes_ignore) { @@ -267,8 +333,6 @@ if (!params.igenomes_ignore) { } else { params.genomes = [:] } - - // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. @@ -286,19 +350,19 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } manifest { @@ -307,8 +371,8 @@ manifest { homePage = 'https://github.com/nf-core/mag' description = """Assembly, binning and annotation of metagenomes""" mainScript = 'main.nf' - nextflowVersion = '!>=22.10.1' - version = '2.3.2' + nextflowVersion = '!>=23.04.0' + version = '2.4.0' doi = '10.1093/nargab/lqac007' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 1a6c2d45..5dbd2a26 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -14,8 +14,10 @@ "properties": { "input": { "type": "string", - "mimetype": "text/csv", "format": "file-path", + "exists": true, + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", "description": "Input FastQ files or CSV samplesheet file containing information about the samples in the experiment.", "help_text": "Use this to specify the location of your input FastQ files. For example:\n\n```bash\n--input 'path/to/data/sample_*_{1,2}.fastq.gz'\n``` \n\nAlternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file with 5 columns and the following header: sample,group,short_reads_1,short_reads_2,long_reads. 
See [usage docs](https://nf-co.re/mag/usage#input-specifications).",
                    "fa_icon": "fas fa-file-csv"
@@ -26,6 +28,14 @@
                    "fa_icon": "fas fa-align-center",
                    "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--input`. For example:\n\n```bash\n--single_end --input '*.fastq'\n```\n\nIt is not possible to run a mixture of single-end and paired-end files in one run."
                },
+                "assembly_input": {
+                    "type": "string",
+                    "mimetype": "text/csv",
+                    "format": "file-path",
+                    "description": "Additional input CSV samplesheet containing information about pre-computed assemblies. When set, both read pre-processing and assembly are skipped and the pipeline begins at the binning stage.",
+                    "help_text": "If you have pre-computed assemblies from another source, it is possible to jump straight to the binning stage of the pipeline by supplying these assemblies in a CSV file. This CSV file should have three columns and the following header: `id,group,assembler,fasta`. Short reads must still be supplied to `--input` in CSV format. See [usage docs](https://nf-co.re/mag/usage#input-specifications) for further details.",
+                    "fa_icon": "fas fa-file-csv"
+                },
                "outdir": {
                    "type": "string",
                    "format": "directory-path",
@@ -146,7 +156,7 @@
                    "description": "Maximum amount of time that can be requested for any single job.",
                    "default": "240.h",
                    "fa_icon": "far fa-clock",
-                    "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$",
+                    "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$",
                    "hidden": true,
                    "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`"
                }
@@ -217,6 +227,7 @@
                },
                "multiqc_config": {
                    "type": "string",
+                    "format": "file-path",
                    "description": "Custom config file to supply to MultiQC.",
                    "fa_icon": "fas fa-cog",
                    "hidden": true
@@ -232,13 +243,6 @@
                    "description": "Custom MultiQC yaml file containing HTML including a methods description.",
                    "fa_icon": "fas fa-cog"
                },
-                "tracedir": {
-                    "type": "string",
-                    "description": "Directory to keep pipeline Nextflow logs and reports.",
-                    "default": "${params.outdir}/pipeline_info",
-                    "fa_icon": "fas fa-cogs",
-                    "hidden": true
-                },
                "validate_params": {
                    "type": "boolean",
                    "description": "Boolean whether to validate parameters against the schema at runtime",
@@ -246,12 +250,26 @@
                    "fa_icon": "fas fa-check-square",
                    "hidden": true
                },
-                "show_hidden_params": {
+                "validationShowHiddenParams": {
                    "type": "boolean",
                    "fa_icon": "far fa-eye-slash",
                    "description": "Show all params when using `--help`",
                    "hidden": true,
                    "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters."
+                },
+                "validationFailUnrecognisedParams": {
+                    "type": "boolean",
+                    "fa_icon": "far fa-check-circle",
+                    "description": "Validation of parameters fails when an unrecognised parameter is found.",
+                    "hidden": true,
+                    "help_text": "By default, when an unrecognised parameter is found, it returns a warning."
+                },
+                "validationLenientMode": {
+                    "type": "boolean",
+                    "fa_icon": "far fa-check-circle",
+                    "description": "Validation of parameters in lenient mode.",
+                    "hidden": true,
+                    "help_text": "Allows string values that are parseable as numbers or booleans.
For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } }, @@ -346,13 +364,13 @@ }, "host_genome": { "type": "string", - "help_text": "This parameter is mutually exclusive with `--host_genome`. Host read removal is done with Bowtie2. \nBoth the iGenomes FASTA file as well as corresponding, already pre-built Bowtie 2 index files will be used.", + "help_text": "This parameter is mutually exclusive with `--host_fasta`. Host read removal is done with Bowtie2. \nBoth the iGenomes FASTA file as well as corresponding, already pre-built Bowtie 2 index files will be used.", "description": "Name of iGenomes reference for host contamination removal." }, "host_fasta": { "type": "string", "description": "Fasta reference file for host contamination removal.", - "help_text": "This parameter is mutually exclusive with `--host_fasta`. The reference can be masked. Host read removal is done with Bowtie2." + "help_text": "This parameter is mutually exclusive with `--host_genome`. The reference can be masked. Host read removal is done with Bowtie2." }, "host_removal_verysensitive": { "type": "boolean", @@ -383,6 +401,24 @@ "save_phixremoved_reads": { "type": "boolean", "description": "Specify to save input FASTQ files with phiX reads removed to --outdir." + }, + "bbnorm": { + "type": "boolean", + "description": "Run BBnorm to normalize sequence depth." + }, + "bbnorm_target": { + "type": "integer", + "default": 100, + "description": "Set BBnorm target maximum depth to this number." + }, + "bbnorm_min": { + "type": "integer", + "default": 5, + "description": "Set BBnorm minimum depth to this number." + }, + "save_bbnorm_reads": { + "type": "boolean", + "description": "Save normalized read files to output directory." } } }, @@ -452,6 +488,11 @@ "description": "Database for taxonomic binning with kraken2.", "help_text": "The database file must be a compressed tar archive that contains at least the three files `hash.k2d`, `opts.k2d` and `taxo.k2d`. E.g. ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz." }, + "krona_db": { + "type": "string", + "description": "Database for taxonomic binning with krona", + "help_text": "Path to `taxonomy.tab` file for Krona, instead of downloading the default file. Point at the `.tab` file." + }, "skip_krona": { "type": "boolean", "description": "Skip creating a krona plot for taxonomic binning." @@ -475,15 +516,18 @@ "type": "boolean", "description": "Only return official taxonomic ranks (Kingdom, Phylum, etc.) when running CAT." }, - "gtdb": { + "skip_gtdbtk": { + "type": "boolean", + "description": "Skip the running of GTDB, as well as the automatic download of the database" + }, + "gtdb_db": { "type": "string", - "default": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", - "description": "GTDB database for taxonomic classification of bins with GTDB-tk.", - "help_text": "For information which GTDB reference databases are compatible with the used GTDB-tk version see https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data." + "description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. 
If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.", + "default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz" }, "gtdbtk_min_completeness": { "type": "number", - "default": 50, + "default": 50.0, "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", "minimum": 0.01, @@ -491,7 +535,7 @@ }, "gtdbtk_max_contamination": { "type": "number", - "default": 10, + "default": 10.0, "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", "minimum": 0, @@ -499,7 +543,7 @@ }, "gtdbtk_min_perc_aa": { "type": "number", - "default": 10, + "default": 10.0, "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", "minimum": 0, "maximum": 100 @@ -513,7 +557,7 @@ }, "gtdbtk_pplacer_cpus": { "type": "number", - "default": 1, + "default": 1.0, "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." }, @@ -522,6 +566,11 @@ "default": true, "description": "Reduce GTDB-Tk memory consumption by running pplacer in a setting writing to disk.", "help_text": "Will be slower. Set to `false` to turn this off." + }, + "genomad_db": { + "type": "string", + "description": "Database for virus classification with geNomad", + "help_text": "Must be a directory containing the uncompressed contents from https://portal.nersc.gov/genomad/__data__/genomad_db_v1.1.tar.gz" } } }, @@ -538,12 +587,12 @@ "spades_options": { "type": "string", "description": "Additional custom options for SPAdes.", - "help_text": "An example is adjusting k-mers (\"-k 21,33,55,77\") or adding [advanced options](https://github.com/ablab/spades#advanced-options). But not -t, -m, -o or --out-prefix, because these are already in use." + "help_text": "An example is adjusting k-mers (\"-k 21,33,55,77\") or adding [advanced options](https://github.com/ablab/spades#advanced-options). But not -t, -m, -o or --out-prefix, because these are already in use. Must be used like this: --spades_options=\"-k 21,33,55,77\")" }, "megahit_options": { "type": "string", "description": "Additional custom options for MEGAHIT.", - "help_text": "An example is adjusting presets (e.g. \"--presets meta-large\"), k-mers (e.g. \"-k 21,33,55,77\") or adding other [advanced options](https://github.com/voutcn/megahit#advanced-usage). For example, increase the minimum k-mer in the event of an error message such as \"Too many vertices in the unitig graph, you may increase the kmer size to remove tons of erroneous kmers.\" in the MEGAHIT log file. But not --threads, --memory, -o or input read files, because these are already in use." + "help_text": "An example is adjusting presets (e.g. \"--presets meta-large\"), k-mers (e.g. \"-k 21,33,55,77\") or adding other [advanced options](https://github.com/voutcn/megahit#advanced-usage). 
For example, increase the minimum k-mer in the event of an error message such as \"Too many vertices in the unitig graph, you may increase the kmer size to remove tons of erroneous kmers.\" in the MEGAHIT log file. But not --threads, --memory, -o or input read files, because these are already in use. Must be used like this: --megahit_options=\"--presets meta-large\"" }, "skip_spades": { "type": "boolean", @@ -563,8 +612,8 @@ } } }, - "gene_prediction_options": { - "title": "Gene prediction options", + "gene_prediction_and_annotation_options": { + "title": "Gene prediction and annotation options", "type": "object", "description": "", "default": "", @@ -572,6 +621,49 @@ "skip_prodigal": { "type": "boolean", "description": "Skip Prodigal gene prediction" + }, + "skip_prokka": { + "type": "boolean", + "description": "Skip Prokka genome annotation." + }, + "skip_metaeuk": { + "type": "boolean", + "description": "Skip MetaEuk gene prediction and annotation" + }, + "metaeuk_mmseqs_db": { + "type": "string", + "description": "A string containing the name of one of the databases listed in the [mmseqs2 documentation](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). This database will be downloaded and formatted for eukaryotic genome annotation. Incompatible with --metaeuk_db.", + "help_text": "mmseqs2 lists a large number of databases, not all of which are appropriate for use with MetaEuk. MetaEuk requires protein inputs, so you should select one of the Aminoacid or Profile options." + }, + "metaeuk_db": { + "type": "string", + "description": "Path to either a local fasta file of protein sequences, or to a directory containing an mmseqs2-formatted database, for annotation of eukaryotic genomes.", + "help_text": "One option would be the databases from the MetaEuk publication (https://wwwuser.gwdg.de/~compbiol/metaeuk/), however it should be noted that these are focused on marine eukaryotes." + }, + "save_mmseqs_db": { + "type": "boolean", + "description": "Save the downloaded mmseqs2 database specified in `--metaeuk_mmseqs_db`." + } } }, + "virus_identification_options": { + "title": "Virus identification options", + "type": "object", + "default": "", + "properties": { + "run_virus_identification": { + "type": "boolean", + "description": "Run virus identification." + }, + "genomad_min_score": { + "type": "number", + "default": 0.7, + "description": "Minimum geNomad score for a sequence to be considered viral" + }, + "genomad_splits": { + "type": "integer", + "default": 1, + "description": "Number of groups that geNomad's MMSeqs2 database should be split into (higher values reduce memory requirements)" } } }, @@ -624,11 +716,28 @@ "bowtie2_mode": { "type": "string", "description": "Bowtie2 alignment mode", - "help_text": "Bowtie2 alignment mode options, for example: `--very-fast` , `--very-sensitive-local -N 1` , ..." + "help_text": "Bowtie2 alignment mode options, for example: `--very-fast` , `--very-sensitive-local -N 1` , ... Must be used like this: --bowtie2_mode=\"--very-sensitive\"" }, - "skip_prokka": { + "save_assembly_mapped_reads": { "type": "boolean", - "description": "Skip Prokka genome annotation." + "description": "Save the output of mapping raw reads back to assembled contigs", + "help_text": "Specify to save the BAM and BAI files generated when mapping input reads back to the assembled contigs (performed in preparation for binning and contig depth estimations)." 
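The `--spades_options`, `--megahit_options` and `--bowtie2_mode` help texts in the hunks above all rely on the same convention: the pass-through tool arguments must be supplied as a single quoted string. As a minimal sketch (not part of this changeset), the same values could also be kept in a small custom config supplied with `-c`, avoiding shell-escaping on the command line; the parameter names come from the schema above and the values simply repeat the help-text examples.

```nextflow
// Hypothetical extra_args.config, supplied with `-c extra_args.config`.
// Parameter names are from the schema hunks above; values repeat the help-text examples.
params {
    spades_options  = "-k 21,33,55,77"        // passed through to SPAdes
    megahit_options = "--presets meta-large"  // passed through to MEGAHIT
    bowtie2_mode    = "--very-sensitive"      // passed through to Bowtie2 during binning preparation
}
```

On the command line, the equivalent is the quoted `--spades_options="-k 21,33,55,77"` form spelled out in the help texts.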
+ }, + "bin_domain_classification": { + "type": "boolean", + "description": "Enable domain-level (prokaryote or eukaryote) classification of bins using Tiara. Processes which are domain-specific will then only receive bins matching the domain requirement.", + "help_text": "Enable this if it is likely that your metagenome samples contain a mixture of eukaryotic and prokaryotic genomes. This will ensure that prokaryote-only steps only receive putatively prokaryotic genomes, and vice-versa. Additionally, may improve the performance of DAS Tool by ensuring it only receives prokaryotic genomes." + }, + "bin_domain_classification_tool": { + "type": "string", + "default": "tiara", + "description": "Specify which tool to use for domain classification of bins. Currently only 'tiara' is implemented.", + "hidden": true + }, + "tiara_min_length": { + "type": "integer", + "default": 3000, + "description": "Minimum contig length for Tiara to use for domain classification. For accurate classification, should be longer than 3000 bp." } } }, @@ -672,9 +781,15 @@ "description": "Enable clean-up of temporary files created during BUSCO runs.", "help_text": "By default, BUSCO creates a large number of intermediate files every run. This may cause problems on some clusters which have file number limits in place, particularly with large numbers of bins. Enabling this option cleans these files, reducing the total file count of the work directory." }, + "checkm_download_url": { + "type": "string", + "default": "https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz", + "hidden": true, + "description": "URL pointing to checkM database for auto download, if local path not supplied.", + "help_text": "You can use this parameter to point to an online copy of the checkM database TAR archive that the pipeline will use for auto download if a local path is not supplied to `--checkm_db`." + }, "checkm_db": { "type": "string", - "default": "None", "description": "Path to local folder containing already downloaded and uncompressed CheckM database.", "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm_data`. You should move this directory to somewhere else on your machine (and supply it back to the pipeline in future runs with `--checkm_db`)."
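Several of the new parameters above (`--checkm_db` and `--checkm_download_url`, and further up `--gtdb_db` and `--genomad_db`) accept pre-staged local databases instead of triggering an automatic download. Below is a minimal sketch of how such an offline setup might look; only the parameter names are taken from this schema, and all paths are placeholders for locally mirrored copies.

```nextflow
// Hypothetical databases.config for offline or cluster use (all paths are placeholders).
params {
    checkm_db  = "/refs/checkm_data_2015_01_16"  // uncompressed CheckM data directory
    gtdb_db    = "/refs/gtdbtk_r214_data"        // directory or .tar.gz archive, per --gtdb_db
    genomad_db = "/refs/genomad_db"              // uncompressed geNomad database directory
}
```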
}, @@ -697,8 +812,8 @@ "type": "string", "default": "raw_bins_only", "description": "Specify which binning output is sent for downstream annotation, taxonomic classification, bin quality control etc.", - "help_text": "`raw_bins_only`: only bins (and unbinned contigs) from the binners.\n`refined_bins_only`: only bins (and unbinned contigs) from the bin refinement step .\n`both`: bins and unbinned contigs from both the binning and bin refinement steps.", - "enum": ["raw_bins_only", "refined_bins_only", "both"] + "help_text": "`raw_bins_only`: only bins (and unbinned contigs) from the binners.\n`refined_bins_only`: only bins (and unbinned contigs) from the bin refinement step .\n\n ~~`both`: bins and unbinned contigs from both the binning and bin refinement steps.~~ The `both` option is disabled in v2.4 due to a bug that will be fixed in a later release.", + "enum": ["raw_bins_only", "refined_bins_only"] }, "run_gunc": { "type": "boolean", @@ -706,7 +821,6 @@ }, "gunc_db": { "type": "string", - "default": "None", "description": "Specify a path to a pre-downloaded GUNC dmnd database file" }, "gunc_database_type": { @@ -732,6 +846,15 @@ "type": "boolean", "description": "Turn on/off the ancient DNA subworkflow" }, + "pydamage_accuracy": { + "type": "number", + "default": 0.5, + "description": "PyDamage accuracy threshold" + }, + "skip_ancient_damagecorrection": { + "type": "boolean", + "description": "Deactivate damage correction of ancient contigs using variant and consensus calling" + }, "freebayes_ploidy": { "type": "integer", "default": 1, @@ -761,11 +884,6 @@ "type": "integer", "default": 3, "description": "minimum number of bases supporting the alternative allele" - }, - "pydamage_accuracy": { - "type": "number", - "default": 0.5, - "description": "PyDamage accuracy threshold" } } } @@ -802,7 +920,10 @@ "$ref": "#/definitions/assembly_options" }, { - "$ref": "#/definitions/gene_prediction_options" + "$ref": "#/definitions/gene_prediction_and_annotation_options" + }, + { + "$ref": "#/definitions/virus_identification_options" }, { "$ref": "#/definitions/binning_options" diff --git a/subworkflows/local/ancient_dna.nf b/subworkflows/local/ancient_dna.nf index 442a8c1d..a8188a69 100644 --- a/subworkflows/local/ancient_dna.nf +++ b/subworkflows/local/ancient_dna.nf @@ -10,31 +10,50 @@ workflow ANCIENT_DNA_ASSEMBLY_VALIDATION { take: input //channel: [val(meta), path(contigs), path(bam), path(bam_index)] main: + ch_versions = Channel.empty() + PYDAMAGE_ANALYZE(input.map {item -> [item[0], item[2], item[3]]}) PYDAMAGE_FILTER(PYDAMAGE_ANALYZE.out.csv) - FAIDX(input.map { item -> [ item[0], item[1] ] }) - freebayes_input = input.join(FAIDX.out.fai) // [val(meta), path(contigs), path(bam), path(bam_index), path(fai)] - FREEBAYES (freebayes_input.map { item -> [item[0], item[2], item[3], [], [], []] }, - freebayes_input.map { item -> item[1] }, - freebayes_input.map { item -> item[4] }, - [], - [], - [] ) - - BCFTOOLS_INDEX_PRE(FREEBAYES.out.vcf) - BCFTOOLS_VIEW(FREEBAYES.out.vcf.join(BCFTOOLS_INDEX_PRE.out.tbi), [], [], []) - BCFTOOLS_INDEX_POST(BCFTOOLS_VIEW.out.vcf) - BCFTOOLS_CONSENSUS(BCFTOOLS_VIEW.out.vcf - .join(BCFTOOLS_INDEX_POST.out.tbi) - .join(input.map { item -> [ item[0], item[1] ] })) + ch_versions = ch_versions.mix(PYDAMAGE_ANALYZE.out.versions.first()) + + if ( params.skip_ancient_damagecorrection ) { + ch_corrected_contigs = Channel.empty() + } + + if ( !params.skip_ancient_damagecorrection ) { + FAIDX(input.map { item -> [ item[0], item[1] ] }, [[],[]] ) + freebayes_input = 
input.join(FAIDX.out.fai) // [val(meta), path(contigs), path(bam), path(bam_index), path(fai)] + .multiMap{ + meta, contigs, bam, bai, fai -> + reads: [ meta, bam, bai, [], [], [] ] + fasta: [ contigs ] + fai: [ fai ] + } + FREEBAYES ( freebayes_input.reads.dump(tag: 'reads'), + freebayes_input.fasta.dump(tag: 'fasta'), + freebayes_input.fai.dump(tag: 'fai'), + [], + [], + [] ) + + BCFTOOLS_INDEX_PRE(FREEBAYES.out.vcf) + BCFTOOLS_VIEW(FREEBAYES.out.vcf.join(BCFTOOLS_INDEX_PRE.out.tbi), [], [], []) + BCFTOOLS_INDEX_POST(BCFTOOLS_VIEW.out.vcf) + BCFTOOLS_CONSENSUS(BCFTOOLS_VIEW.out.vcf + .join(BCFTOOLS_INDEX_POST.out.tbi) + .join(input.map { item -> [ item[0], item[1] ] })) + + ch_corrected_contigs = BCFTOOLS_CONSENSUS.out.fasta + + ch_versions = ch_versions.mix(FAIDX.out.versions.first()) + ch_versions = ch_versions.mix(FREEBAYES.out.versions.first()) + ch_versions = ch_versions.mix(BCFTOOLS_CONSENSUS.out.versions.first()) + } + + - ch_versions = Channel.empty() - ch_versions = PYDAMAGE_ANALYZE.out.versions.first() - ch_versions = ch_versions.mix(FAIDX.out.versions.first()) - ch_versions = ch_versions.mix(FREEBAYES.out.versions.first()) - ch_versions = ch_versions.mix(BCFTOOLS_CONSENSUS.out.versions.first()) emit: - contigs_recalled = BCFTOOLS_CONSENSUS.out.fasta // channel: [ val(meta), path(fasta) ] + contigs_recalled = ch_corrected_contigs // channel: [ val(meta), path(fasta) ] pydamage_results = PYDAMAGE_ANALYZE.out.csv // channel: [ val(meta), path(csv) ] pydamage_filtered_results = PYDAMAGE_FILTER.out.csv // channel: [ val(meta), path(csv) ] versions = ch_versions // channel: [ versions.yml ] diff --git a/subworkflows/local/binning.nf b/subworkflows/local/binning.nf index 4a7c9afc..a07ca416 100644 --- a/subworkflows/local/binning.nf +++ b/subworkflows/local/binning.nf @@ -2,28 +2,17 @@ * Binning with MetaBAT2 and MaxBin2 */ -include { METABAT2_METABAT2 } from '../../modules/nf-core/metabat2/metabat2/main' -include { METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS } from '../../modules/nf-core/metabat2/jgisummarizebamcontigdepths/main' -include { MAXBIN2 } from '../../modules/nf-core/maxbin2/main' -include { GUNZIP as GUNZIP_BINS } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_UNBINS } from '../../modules/nf-core/gunzip/main' +include { METABAT2_METABAT2 } from '../../modules/nf-core/metabat2/metabat2/main' +include { METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS } from '../../modules/nf-core/metabat2/jgisummarizebamcontigdepths/main' +include { MAXBIN2 } from '../../modules/nf-core/maxbin2/main' +include { GUNZIP as GUNZIP_BINS } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_UNBINS } from '../../modules/nf-core/gunzip/main' include { CONVERT_DEPTHS } from '../../modules/local/convert_depths' include { ADJUST_MAXBIN2_EXT } from '../../modules/local/adjust_maxbin2_ext' include { SPLIT_FASTA } from '../../modules/local/split_fasta' -include { MAG_DEPTHS } from '../../modules/local/mag_depths' -include { MAG_DEPTHS_PLOT } from '../../modules/local/mag_depths_plot' -include { MAG_DEPTHS_SUMMARY } from '../../modules/local/mag_depths_summary' include { FASTA_BINNING_CONCOCT } from '../../subworkflows/nf-core/fasta_binning_concoct/main' -/* - * Get number of columns in file (first line) - */ -def getColNo(filename) { - lines = file(filename).readLines() - return lines[0].split('\t').size() -} - workflow BINNING { take: assemblies // channel: [ val(meta), path(assembly), path(bams), path(bais) ] @@ -36,17 +25,14 @@ workflow BINNING { // generate coverage depths for 
each contig ch_summarizedepth_input = assemblies .map { meta, assembly, bams, bais -> - def meta_new = meta.clone() - [ meta_new, bams, bais ] + [ meta, bams, bais ] } METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS ( ch_summarizedepth_input ) ch_metabat_depths = METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth .map { meta, depths -> - def meta_new = meta.clone() - meta_new['binner'] = 'MetaBAT2' - + def meta_new = meta + [binner: 'MetaBAT2'] [ meta_new, depths ] } @@ -55,9 +41,7 @@ workflow BINNING { // combine depths back with assemblies ch_metabat2_input = assemblies .map { meta, assembly, bams, bais -> - def meta_new = meta.clone() - meta_new['binner'] = 'MetaBAT2' - + def meta_new = meta + [binner: 'MetaBAT2'] [ meta_new, assembly, bams, bais ] } .join( ch_metabat_depths, by: 0 ) @@ -70,9 +54,7 @@ workflow BINNING { CONVERT_DEPTHS ( ch_metabat2_input ) ch_maxbin2_input = CONVERT_DEPTHS.out.output .map { meta, assembly, reads, depth -> - def meta_new = meta.clone() - meta_new['binner'] = 'MaxBin2' - + def meta_new = meta + [binner: 'MaxBin2'] [ meta_new, assembly, reads, depth ] } ch_versions = ch_versions.mix(CONVERT_DEPTHS.out.versions.first()) @@ -80,8 +62,10 @@ workflow BINNING { // main bins for decompressing for MAG_DEPTHS ch_final_bins_for_gunzip = Channel.empty() + // final gzipped bins ch_binning_results_gzipped_final = Channel.empty() + // run binning if ( !params.skip_metabat2 ) { METABAT2_METABAT2 ( ch_metabat2_input ) @@ -101,9 +85,7 @@ workflow BINNING { ch_concoct_input = assemblies .map { meta, bins, bams, bais -> - def meta_new = meta.clone() - meta_new['binner'] = 'CONCOCT' - + def meta_new = meta + [binner: 'CONCOCT'] [ meta_new, bins, bams, bais ] } .multiMap { @@ -136,63 +118,20 @@ workflow BINNING { GUNZIP_BINS ( ch_final_bins_for_gunzip ) ch_binning_results_gunzipped = GUNZIP_BINS.out.gunzip - ch_versions = ch_versions.mix(GUNZIP_BINS.out.versions.first()) + .groupTuple(by: 0) GUNZIP_UNBINS ( ch_split_fasta_results_transposed ) ch_splitfasta_results_gunzipped = GUNZIP_UNBINS.out.gunzip - ch_versions = ch_versions.mix(GUNZIP_UNBINS.out.versions.first()) - - // Compute bin depths for different samples (according to `binning_map_mode`) - // Have to remove binner meta before joining with according depths files, - // as required for MAG_DEPTHS, but we can add 'binner' - // info again based on file name and finally group by - // 'assembler', 'id', 'binner' - ch_depth_input = ch_binning_results_gunzipped - .mix(ch_splitfasta_results_gunzipped ) - .map { meta, bin -> - def meta_new = meta.clone() - meta_new.remove('binner') - [ meta_new, bin ] - } - .groupTuple (by: 0 ) - .join( METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth, by: 0 ) - .transpose() - .map { meta, bin, contig_depths_file -> - def meta_new = meta.clone() - meta_new['binner'] = bin.name.split("-")[1] - [ meta_new, bin, contig_depths_file ] - } - .groupTuple (by: [0,2] ) - - MAG_DEPTHS ( ch_depth_input ) - ch_versions = ch_versions.mix(MAG_DEPTHS.out.versions) - - // Plot bin depths heatmap for each assembly and mapped samples (according to `binning_map_mode`) - // create file containing group information for all samples - ch_sample_groups = reads - .collectFile(name:'sample_groups.tsv'){ meta, reads -> meta.id + '\t' + meta.group + '\n' } + .groupTuple(by: 0) - // Filter MAG depth files: use only those for plotting that contain depths for > 2 samples - ch_mag_depths_plot = MAG_DEPTHS.out.depths - .map { meta, bin_depths_file -> - if (getColNo(bin_depths_file) > 2) [ meta, bin_depths_file ] - } - - MAG_DEPTHS_PLOT ( 
ch_mag_depths_plot, ch_sample_groups.collect() ) - MAG_DEPTHS_SUMMARY ( MAG_DEPTHS.out.depths.map{it[1]}.collect() ) - ch_versions = ch_versions.mix( MAG_DEPTHS_PLOT.out.versions ) - ch_versions = ch_versions.mix( MAG_DEPTHS_SUMMARY.out.versions ) - - // Group final binned contigs per sample for final output - ch_binning_results_gunzipped_final = ch_binning_results_gunzipped.groupTuple(by: 0) - ch_binning_results_gzipped_final = ch_binning_results_gzipped_final.groupTuple(by: 0) + ch_versions = ch_versions.mix(GUNZIP_BINS.out.versions.first()) + ch_versions = ch_versions.mix(GUNZIP_UNBINS.out.versions.first()) emit: - bins = ch_binning_results_gunzipped_final + bins = ch_binning_results_gunzipped bins_gz = ch_binning_results_gzipped_final - unbinned = ch_splitfasta_results_gunzipped.groupTuple() + unbinned = ch_splitfasta_results_gunzipped unbinned_gz = SPLIT_FASTA.out.unbinned - depths_summary = MAG_DEPTHS_SUMMARY.out.summary metabat2depths = METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth versions = ch_versions } diff --git a/subworkflows/local/binning_preparation.nf b/subworkflows/local/binning_preparation.nf index cff4621a..60f63a26 100644 --- a/subworkflows/local/binning_preparation.nf +++ b/subworkflows/local/binning_preparation.nf @@ -26,13 +26,16 @@ workflow BINNING_PREPARATION { .map { meta, assembly, index -> [ meta.group, meta, assembly, index ] } .combine(ch_reads_bowtie2, by: 0) .map { group, assembly_meta, assembly, index, reads_meta, reads -> [ assembly_meta, assembly, index, reads_meta, reads ] } + } else { + // i.e. --binning_map_mode 'own' // combine assemblies (not co-assembled) with reads from own sample ch_reads_bowtie2 = reads.map{ meta, reads -> [ meta.id, meta, reads ] } ch_bowtie2_input = BOWTIE2_ASSEMBLY_BUILD.out.assembly_index .map { meta, assembly, index -> [ meta.id, meta, assembly, index ] } .combine(ch_reads_bowtie2, by: 0) .map { id, assembly_meta, assembly, index, reads_meta, reads -> [ assembly_meta, assembly, index, reads_meta, reads ] } + } BOWTIE2_ASSEMBLY_ALIGN ( ch_bowtie2_input ) @@ -42,7 +45,7 @@ workflow BINNING_PREPARATION { .map { meta, assembly, bams, bais -> [ meta, assembly.sort()[0], bams, bais ] } // multiple symlinks to the same assembly -> use first of sorted list emit: - bowtie2_assembly_multiqc = BOWTIE2_ASSEMBLY_ALIGN.out.log.map { assembly_meta, reads_meta, log -> if (assembly_meta.id == reads_meta.id) {return [ log ]} } + bowtie2_assembly_multiqc = BOWTIE2_ASSEMBLY_ALIGN.out.log.map { assembly_meta, reads_meta, log -> [ log ] } bowtie2_version = BOWTIE2_ASSEMBLY_ALIGN.out.versions grouped_mappings = ch_grouped_mappings } diff --git a/subworkflows/local/binning_refinement.nf b/subworkflows/local/binning_refinement.nf index 09eb72a9..eea8c76a 100644 --- a/subworkflows/local/binning_refinement.nf +++ b/subworkflows/local/binning_refinement.nf @@ -8,29 +8,33 @@ include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_CONCOCT } from include { DASTOOL_DASTOOL } from '../../modules/nf-core/dastool/dastool/main.nf' include { RENAME_PREDASTOOL } from '../../modules/local/rename_predastool' include { RENAME_POSTDASTOOL } from '../../modules/local/rename_postdastool' -include { MAG_DEPTHS as MAG_DEPTHS_REFINED } from '../../modules/local/mag_depths' -include { MAG_DEPTHS_PLOT as MAG_DEPTHS_PLOT_REFINED } from '../../modules/local/mag_depths_plot' -include { MAG_DEPTHS_SUMMARY as MAG_DEPTHS_SUMMARY_REFINED } from '../../modules/local/mag_depths_summary' /* * Get number of columns in file (first line) */ -def getColNo(filename) { - lines = 
file(filename).readLines() - return lines[0].split('\t').size() -} workflow BINNING_REFINEMENT { take: ch_contigs_for_dastool // channel: [ val(meta), path(contigs) ] bins // channel: [ val(meta), path(bins) ] - depths - reads main: ch_versions = Channel.empty() - ch_bins_for_fastatocontig2bin = RENAME_PREDASTOOL(bins).renamed_bins + // remove domain information, will add it back later + // everything here is either unclassified or a prokaryote + ch_bins = bins + .map { meta, bins -> + def meta_new = meta - meta.subMap('domain') + [meta_new, bins] + } + .groupTuple() + .map { + meta, bins -> [meta, bins.flatten()] + } + + // prepare bins + ch_bins_for_fastatocontig2bin = RENAME_PREDASTOOL(ch_bins).renamed_bins .branch { metabat2: it[0]['binner'] == 'MetaBAT2' maxbin2: it[0]['binner'] == 'MaxBin2' @@ -51,8 +55,7 @@ workflow BINNING_REFINEMENT { .mix(DASTOOL_FASTATOCONTIG2BIN_CONCOCT.out.fastatocontig2bin) .map { meta, fastatocontig2bin -> - def meta_new = meta.clone() - meta_new.remove('binner') + def meta_new = meta - meta.subMap('binner') [ meta_new, fastatocontig2bin ] } .groupTuple(by: 0) @@ -77,63 +80,30 @@ workflow BINNING_REFINEMENT { .map { meta, bin -> if (bin.name != "unbinned.fa") { - def meta_new = meta.clone() - meta_new['binner'] = 'DASTool' + def meta_new = meta + [binner: 'DASTool'] [ meta_new, bin ] } } .groupTuple() + .map { + meta, bins -> + def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified' + def meta_new = meta + [domain: domain_class] + [ meta_new, bins ] + } ch_input_for_renamedastool = DASTOOL_DASTOOL.out.bins .map { meta, bins -> - def meta_new = meta.clone() - meta_new['binner'] = 'DASTool' + def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified' + def meta_new = meta + [binner: 'DASTool', domain: domain_class] [ meta_new, bins ] } RENAME_POSTDASTOOL ( ch_input_for_renamedastool ) - // We have to strip the meta to be able to combine with the original - // depths file to run MAG_DEPTH - ch_input_for_magdepth = ch_dastool_bins_newmeta - .mix( RENAME_POSTDASTOOL.out.refined_unbins ) - .map { - meta, refinedbins -> - def meta_new = meta.clone() - meta_new.remove('binner') - [ meta_new, refinedbins ] - } - .transpose() - .groupTuple (by: 0 ) - .join( depths, by: 0 ) - .map { - meta, bins, contig_depths_file -> - def meta_new = meta.clone() - meta_new['binner'] = 'DASTool' - [ meta_new, bins, contig_depths_file ] - } - - MAG_DEPTHS_REFINED ( ch_input_for_magdepth ) - - // Plot bin depths heatmap for each assembly and mapped samples (according to `binning_map_mode`) - // create file containing group information for all samples - ch_sample_groups = reads - .collectFile(name:'sample_groups.tsv'){ meta, reads -> meta.id + '\t' + meta.group + '\n' } - - // Filter MAG depth files: use only those for plotting that contain depths for > 2 samples - ch_mag_depths_plot_refined = MAG_DEPTHS_REFINED.out.depths - .map { meta, bin_depths_file -> - if (getColNo(bin_depths_file) > 2) [ meta, bin_depths_file ] - } - - MAG_DEPTHS_PLOT_REFINED ( ch_mag_depths_plot_refined, ch_sample_groups.collect() ) - MAG_DEPTHS_SUMMARY_REFINED ( MAG_DEPTHS_REFINED.out.depths.map{it[1]}.collect() ) - emit: refined_bins = ch_dastool_bins_newmeta refined_unbins = RENAME_POSTDASTOOL.out.refined_unbins - refined_depths = MAG_DEPTHS_REFINED.out.depths - refined_depths_summary = MAG_DEPTHS_SUMMARY_REFINED.out.summary versions = ch_versions } diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf index 2d2079bc..6165be47 100644 --- 
a/subworkflows/local/busco_qc.nf +++ b/subworkflows/local/busco_qc.nf @@ -40,6 +40,6 @@ workflow BUSCO_QC { emit: summary = BUSCO_SUMMARY.out.summary failed_bin = BUSCO.out.failed_bin.map{it[1]} - multiqc = BUSCO.out.summary_domain.map{it[1]} + multiqc = BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map{it[1]} versions = BUSCO.out.versions } diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf new file mode 100644 index 00000000..81c93c6f --- /dev/null +++ b/subworkflows/local/depths.nf @@ -0,0 +1,61 @@ +include { MAG_DEPTHS } from '../../modules/local/mag_depths' +include { MAG_DEPTHS_PLOT } from '../../modules/local/mag_depths_plot' +include { MAG_DEPTHS_SUMMARY } from '../../modules/local/mag_depths_summary' + +/* + * Get number of columns in file (first line) + */ +def getColNo(filename) { + lines = file(filename).readLines() + return lines[0].split('\t').size() +} + +workflow DEPTHS { + take: + bins_unbins //channel: val(meta), [ path(bins) ] + depths //channel: val(meta), path(depths) + reads //channel: val(meta), path(reads) + + main: + ch_versions = Channel.empty() + + // Compute bin depths for different samples (according to `binning_map_mode`) + // Create a new meta joining key first, but copy meta so that + // we retain the information about binners and domain classification + ch_depth_input = bins_unbins + .map { meta, bins -> + def meta_join = meta - meta.subMap('binner','domain') + [ meta_join, meta, bins ] + } + .combine( depths, by: 0 ) + .map { meta_join, meta, bins, contig_depths_file -> + def meta_new = meta - meta.subMap('domain') + [ meta_new, bins, contig_depths_file ] + } + .transpose() + .groupTuple(by: [0,2]) + + + MAG_DEPTHS ( ch_depth_input ) + ch_versions = ch_versions.mix(MAG_DEPTHS.out.versions) + + // Plot bin depths heatmap for each assembly and mapped samples (according to `binning_map_mode`) + // create file containing group information for all samples + ch_sample_groups = reads + .collectFile(name:'sample_groups.tsv'){ meta, reads -> meta.id + '\t' + meta.group + '\n' } + + // Filter MAG depth files: use only those for plotting that contain depths for > 2 samples + ch_mag_depths_plot = MAG_DEPTHS.out.depths + .map { meta, bin_depths_file -> + if (getColNo(bin_depths_file) > 2) [ meta, bin_depths_file ] + } + + MAG_DEPTHS_PLOT ( ch_mag_depths_plot, ch_sample_groups.collect() ) + MAG_DEPTHS_SUMMARY ( MAG_DEPTHS.out.depths.map{it[1]}.collect() ) + ch_versions = ch_versions.mix( MAG_DEPTHS_PLOT.out.versions ) + ch_versions = ch_versions.mix( MAG_DEPTHS_SUMMARY.out.versions ) + + emit: + depths_summary = MAG_DEPTHS_SUMMARY.out.summary + versions = ch_versions +} diff --git a/subworkflows/local/domain_classification.nf b/subworkflows/local/domain_classification.nf new file mode 100644 index 00000000..38291888 --- /dev/null +++ b/subworkflows/local/domain_classification.nf @@ -0,0 +1,28 @@ +/* +* Domain classification with Tiara +*/ + +include { TIARA } from '../../subworkflows/local/tiara' + +workflow DOMAIN_CLASSIFICATION { + take: + assemblies // tuple val(meta), path(assembly) + bins // tuple val(meta), path( [ bins ] ) + unbins // tuple val(meta), path( [ unbins ] ) + + main: + ch_versions = Channel.empty() + + if ( params.bin_domain_classification_tool == "tiara") { + TIARA (assemblies, bins, unbins) + } + + ch_classified_bins = TIARA.out.classified_bins + ch_classified_unbins = TIARA.out.classified_unbins + ch_versions = ch_versions.mix(TIARA.out.versions) + + emit: + classified_bins = ch_classified_bins + classified_unbins 
= ch_classified_unbins + versions = ch_versions +} diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index a6852cb8..21823962 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -3,7 +3,7 @@ */ include { GTDBTK_DB_PREPARATION } from '../../modules/local/gtdbtk_db_preparation' -include { GTDBTK_CLASSIFY } from '../../modules/local/gtdbtk_classify' +include { GTDBTK_CLASSIFYWF } from '../../modules/nf-core/gtdbtk/classifywf/main' include { GTDBTK_SUMMARY } from '../../modules/local/gtdbtk_summary' workflow GTDBTK { @@ -59,20 +59,34 @@ workflow GTDBTK { return [it[0], it[1]] } - GTDBTK_DB_PREPARATION ( gtdb ) - GTDBTK_CLASSIFY ( + if ( gtdb.extension == 'gz' ) { + // Expects to be tar.gz! + ch_db_for_gtdbtk = GTDBTK_DB_PREPARATION ( gtdb ).db + } else if ( gtdb.isDirectory() ) { + // Make up meta id to match expected channel cardinality for GTDBTK + ch_db_for_gtdbtk = Channel + .of(gtdb) + .map{ + [ it.toString().split('/').last(), it ] + } + .collect() + } else { + error("Unsupported object given to --gtdb, database must be supplied as either a directory or a .tar.gz file!") + } + + GTDBTK_CLASSIFYWF ( ch_filtered_bins.passed.groupTuple(), - GTDBTK_DB_PREPARATION.out + ch_db_for_gtdbtk ) GTDBTK_SUMMARY ( ch_filtered_bins.discarded.map{it[1]}.collect().ifEmpty([]), - GTDBTK_CLASSIFY.out.summary.collect().ifEmpty([]), - GTDBTK_CLASSIFY.out.filtered.collect().ifEmpty([]), - GTDBTK_CLASSIFY.out.failed.collect().ifEmpty([]) + GTDBTK_CLASSIFYWF.out.summary.collect().ifEmpty([]), + GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([]), + GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([]) ) emit: summary = GTDBTK_SUMMARY.out.summary - versions = GTDBTK_CLASSIFY.out.versions + versions = GTDBTK_CLASSIFYWF.out.versions } diff --git a/subworkflows/local/gunc_qc.nf b/subworkflows/local/gunc_qc.nf index 82d621bb..912b9425 100644 --- a/subworkflows/local/gunc_qc.nf +++ b/subworkflows/local/gunc_qc.nf @@ -4,7 +4,7 @@ include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' -include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm' +include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' workflow GUNC_QC { take: diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 53a5103c..09e66a41 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -14,27 +14,30 @@ workflow INPUT_CHECK { .from(file(params.input)) .splitCsv(header: true) .map { row -> - if (row.size() == 5) { + if (row.size() >= 5) { def id = row.sample + def run = row.run def group = row.group def sr1 = row.short_reads_1 ? file(row.short_reads_1, checkIfExists: true) : false def sr2 = row.short_reads_2 ? file(row.short_reads_2, checkIfExists: true) : false def lr = row.long_reads ? file(row.long_reads, checkIfExists: true) : false // Check if given combination is valid + if (run != null && run == "") exit 1, "ERROR: Please check input samplesheet -> Column 'run' contains an empty field. Either remove column 'run' or fill each field with a value." if (!sr1) exit 1, "Invalid input samplesheet: short_reads_1 can not be empty." if (!sr2 && lr) exit 1, "Invalid input samplesheet: invalid combination of single-end short reads and long reads provided! SPAdes does not support single-end data and thus hybrid assembly cannot be performed." 
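The `input_check.nf` hunk that starts here (and continues just below) introduces an optional `run` column in the read samplesheet and folds it into the meta map with a `"0"` fallback. The following is a toy, self-contained sketch of that mapping; the row values and file names are invented, and the real subworkflow additionally performs the per-field validation shown in this hunk.

```nextflow
// Standalone sketch of the run-aware meta map built from one samplesheet row.
// Row contents are made up; only the mapping logic mirrors the hunk shown here.
workflow {
    Channel
        .of( [ sample: 'sampleA', run: '1', group: '0',
               short_reads_1: 'sampleA_run1_R1.fastq.gz',
               short_reads_2: 'sampleA_run1_R2.fastq.gz' ] )
        .map { row ->
            def meta = [:]
            meta.id         = row.sample
            meta.run        = row.run == null ? "0" : row.run  // default when no 'run' column is supplied
            meta.group      = row.group
            meta.single_end = false
            [ meta, [ row.short_reads_1, row.short_reads_2 ] ]
        }
        .view()
}
```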
if (!sr2 && !params.single_end) exit 1, "Invalid input samplesheet: single-end short reads provided, but command line parameter `--single_end` is false. Note that either only single-end or only paired-end reads must be provided." if (sr2 && params.single_end) exit 1, "Invalid input samplesheet: paired-end short reads provided, but command line parameter `--single_end` is true. Note that either only single-end or only paired-end reads must be provided." - return [ id, group, sr1, sr2, lr ] + return [ id, run, group, sr1, sr2, lr ] } else { - exit 1, "Input samplesheet contains row with ${row.size()} column(s). Expects 5." + exit 1, "Input samplesheet contains row with ${row.size()} column(s). Expects at least 5." } } // separate short and long reads ch_raw_short_reads = ch_input_rows - .map { id, group, sr1, sr2, lr -> + .map { id, run, group, sr1, sr2, lr -> def meta = [:] meta.id = id + meta.run = run == null ? "0" : run meta.group = group meta.single_end = params.single_end if (params.single_end) @@ -43,10 +46,11 @@ return [ meta, [ sr1, sr2 ] ] } ch_raw_long_reads = ch_input_rows - .map { id, group, sr1, sr2, lr -> + .map { id, run, group, sr1, sr2, lr -> if (lr) { def meta = [:] meta.id = id + meta.run = run == null ? "0" : run meta.group = group return [ meta, lr ] } @@ -58,6 +62,7 @@ .map { row -> def meta = [:] meta.id = row[0] + meta.run = 0 meta.group = 0 meta.single_end = params.single_end return [ meta, row[1] ] @@ -66,13 +71,73 @@ ch_raw_long_reads = Channel.empty() } - // Ensure sample IDs are unique + if (params.assembly_input) { + // check if we have supplied reads as a CSV file + if(!hasExtension(params.input, "csv")) { exit 1, "ERROR: when supplying assemblies with --assembly_input, reads must be supplied using a CSV!" } + + ch_input_assembly_rows = Channel + .from(file(params.assembly_input)) + .splitCsv(header: true) + .map { row -> + if (row.size() == 4) { + def id = row.id + def group = row.group + def assembler = row.assembler ?: false + def assembly = row.fasta ? file(row.fasta, checkIfExists: true) : false + // Check if given combination is valid + if (!assembly) exit 1, "Invalid input assembly samplesheet: fasta can not be empty." + if (!assembler) exit 1, "Invalid input assembly samplesheet: assembler can not be empty." + return [ id, group, assembler, assembly ] + } else { + exit 1, "Input assembly samplesheet contains row with ${row.size()} column(s). Expects 4." + } + } + + // build meta map + ch_input_assemblies = ch_input_assembly_rows + .map { id, group, assembler, fasta -> + def meta = [:] + meta.id = params.coassemble_group? "group-$group" : id + meta.group = group + meta.assembler = assembler + return [ meta, [ fasta ] ] + } + } else { + ch_input_assembly_rows = Channel.empty() + ch_input_assemblies = Channel.empty() + } + + // Ensure run IDs are unique within samples (also prevents duplicated sample names) + + // Note: do not need to check for PE/SE mismatch, as checks above do not allow mixing ch_input_rows - .map { id, group, sr1, sr2, lr -> id } - .toList() - .map { ids -> if( ids.size() != ids.unique().size() ) {exit 1, "ERROR: input samplesheet contains duplicated sample IDs!" } } + .groupTuple(by: 0) + .map { id, run, group, sr1, sr2, lr -> if( run.size() != run.unique().size() ) { { error("ERROR: input samplesheet contains duplicated sample or run IDs (within a sample)! 
Check samplesheet for sample id: ${id}") } } } + + // If assembly csv file supplied, additionally ensure groups are all represented between reads and assemblies + if (params.assembly_input) { + ch_read_ids = ch_input_rows + .map { id, run, group, sr1, sr2, lr -> params.coassemble_group ? group : id } + .unique() + .toList() + .sort() + + ch_assembly_ids = ch_input_assembly_rows + .map { id, group, assembler, assembly -> params.coassemble_group ? group : id } + .unique() + .toList() + .sort() + + ch_read_ids.cross(ch_assembly_ids) + .map { ids1, ids2 -> + if (ids1.sort() != ids2.sort()) { + exit 1, "ERROR: supplied IDs in read and assembly CSV files do not match!" + } + } + } emit: - raw_short_reads = ch_raw_short_reads - raw_long_reads = ch_raw_long_reads + raw_short_reads = ch_raw_short_reads + raw_long_reads = ch_raw_long_reads + input_assemblies = ch_input_assemblies } diff --git a/subworkflows/local/tiara.nf b/subworkflows/local/tiara.nf new file mode 100644 index 00000000..ab274cc8 --- /dev/null +++ b/subworkflows/local/tiara.nf @@ -0,0 +1,128 @@ +include { TIARA_TIARA } from '../../modules/nf-core/tiara/tiara/main' +include { TIARA_CLASSIFY } from '../../modules/local/tiara_classify' +include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_TIARA } from '../../modules/nf-core/dastool/fastatocontig2bin/main' +include { COMBINE_TSV as TIARA_SUMMARY } from '../../modules/local/combine_tsv' + +workflow TIARA { + take: + assemblies // tuple val(meta), path(assembly) + bins // tuple val(meta), path( [ bins ] ) + unbins // tuple val(meta), path( [ unbins ] ) + + main: + ch_versions = Channel.empty() + + bins = bins + .map { meta, bins -> + def meta_new = meta + [bin: 'bins'] + meta_new.bin = 'bins' + [meta_new, bins] + } + + unbins = unbins + .map { meta, unbins -> + def meta_new = meta + [bin: 'unbins'] + [meta_new, unbins] + } + + ch_tiara_input = bins.mix(unbins) + + TIARA_TIARA ( assemblies ) + ch_versions = ch_versions.mix(TIARA_TIARA.out.versions.first()) + + // Need contig2bin file for each bin group + DASTOOL_FASTATOCONTIG2BIN_TIARA ( ch_tiara_input , 'fa') + ch_versions = ch_versions.mix(DASTOOL_FASTATOCONTIG2BIN_TIARA.out.versions.first()) + + // Need to per-assembly Tiara classifications to their bins + // Have to remove binner information from the meta map to do this + ch_contigs_to_bin_tiara = DASTOOL_FASTATOCONTIG2BIN_TIARA.out.fastatocontig2bin + .combine(ch_tiara_input, by: 0) + .map { meta, contig2bin, bins -> + def meta_join = meta - meta.subMap('binner', 'bin') + [ meta_join, meta, contig2bin, bins ] + } + + ch_tiara_classify_input = ch_contigs_to_bin_tiara + .combine( TIARA_TIARA.out.classifications, by: 0) + .map { meta_join, meta, contig2bin, bins, classifications -> + [ meta, classifications, contig2bin, bins ] + } + + TIARA_CLASSIFY( ch_tiara_classify_input ) + ch_versions = ch_versions.mix(TIARA_CLASSIFY.out.versions.first()) + + ch_eukarya_bins = TIARA_CLASSIFY.out.eukarya_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'eukarya'] + [meta_new, bins] + } + + ch_prokarya_bins = TIARA_CLASSIFY.out.prokarya_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'prokarya'] + [meta_new, bins] + } + + ch_bacteria_bins = TIARA_CLASSIFY.out.bacteria_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'bacteria'] + [meta_new, bins] + } + + ch_archaea_bins = TIARA_CLASSIFY.out.archaea_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'archaea'] + [meta_new, bins] + } + + ch_organelle_bins = TIARA_CLASSIFY.out.organelle_bins + 
.map { meta, bins -> + def meta_new = meta + [domain: 'organelle'] + [meta_new, bins] + } + + ch_unknown_bins = TIARA_CLASSIFY.out.unknown_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'unknown'] + [meta_new, bins] + } + + ch_classified_bins_unbins = ch_eukarya_bins + .mix(ch_prokarya_bins) + .mix(ch_bacteria_bins) + .mix(ch_archaea_bins) + .mix(ch_organelle_bins) + .mix(ch_unknown_bins) + + ch_classified_bins = ch_classified_bins_unbins + .filter { meta, bins -> + meta.bin == "bins" + } + .map { meta, bins -> + def meta_new = meta - meta.subMap('bin') + [meta_new, bins] + } + + ch_classified_unbins = ch_classified_bins_unbins + .filter { meta, bins -> + meta.bin == "unbins" + } + .map { meta, bins -> + def meta_new = meta - meta.subMap('bin') + [meta_new, bins] + } + + ch_bin_classifications = TIARA_CLASSIFY.out.bin_classifications + .map { meta, classification -> + [ classification ] + } + .collect() + + TIARA_SUMMARY(ch_bin_classifications) + + emit: + classified_bins = ch_classified_bins + classified_unbins = ch_classified_unbins + versions = ch_versions +} diff --git a/subworkflows/local/virus_identification.nf b/subworkflows/local/virus_identification.nf new file mode 100644 index 00000000..4a3a2dac --- /dev/null +++ b/subworkflows/local/virus_identification.nf @@ -0,0 +1,30 @@ +/* + * geNomad: Identification of mobile genetic elements + */ + +include { GENOMAD_DOWNLOAD } from '../../modules/nf-core/genomad/download/main' +include { GENOMAD_ENDTOEND } from '../../modules/nf-core/genomad/endtoend/main' + +workflow VIRUS_IDENTIFICATION { + take: + ch_assemblies // [ [ meta] , fasta ], input scaffolds (mandatory) + ch_genomad_db // [ db ], presupplied geNomad database (optional) + + main: + ch_versions = Channel.empty() + + if ( params.genomad_db ) { + ch_db_for_genomad = ch_genomad_db + } else { + ch_db_for_genomad = GENOMAD_DOWNLOAD( ).genomad_db + ch_versions.mix( GENOMAD_DOWNLOAD.out.versions ) + } + + ch_identified_viruses = GENOMAD_ENDTOEND ( ch_assemblies, ch_db_for_genomad ).virus_fasta + ch_versions.mix( GENOMAD_ENDTOEND.out.versions ) + + emit: + identified_viruses = ch_identified_viruses + versions = ch_versions + +} diff --git a/subworkflows/nf-core/fasta_binning_concoct/meta.yml b/subworkflows/nf-core/fasta_binning_concoct/meta.yml index 5450dc49..5c5b4cb4 100644 --- a/subworkflows/nf-core/fasta_binning_concoct/meta.yml +++ b/subworkflows/nf-core/fasta_binning_concoct/meta.yml @@ -1,3 +1,4 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json name: "fasta_binning_concoct" description: Runs the CONCOCT workflow of contig binning keywords: @@ -5,7 +6,7 @@ keywords: - binning - metagenomics - contigs -modules: +components: - concoct/cutupfasta - concoct/concoctcoveragetable - concoct/concoct diff --git a/tower.yml b/tower.yml new file mode 100644 index 00000000..787aedfe --- /dev/null +++ b/tower.yml @@ -0,0 +1,5 @@ +reports: + multiqc_report.html: + display: "MultiQC HTML report" + samplesheet.csv: + display: "Auto-created samplesheet with collated metadata and FASTQ paths" diff --git a/workflows/mag.nf b/workflows/mag.nf index 95960665..825549a0 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -1,10 +1,14 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE INPUTS + PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def summary_params = 
NfcoreSchema.paramsSummaryMap(workflow, params) +include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' + +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) // Check already if long reads are provided def hasExtension(it, extension) { @@ -16,36 +20,30 @@ if(hasExtension(params.input, "csv")){ .from(file(params.input)) .splitCsv(header: true) .map { row -> - if (row.size() == 5) { - if (row.long_reads) hybrid = true - } else { - log.error "Input samplesheet contains row with ${row.size()} column(s). Expects 5." - System.exit(1) - } + if (row.long_reads) hybrid = true } } +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation + // Validate input parameters WorkflowMag.initialise(params, log, hybrid) // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.phix_reference, params.host_fasta, params.centrifuge_db, params.kraken2_db, params.cat_db, params.gtdb, params.lambda_reference, params.busco_reference ] +def checkPathParamList = [ params.input, params.multiqc_config, params.phix_reference, params.host_fasta, params.centrifuge_db, params.kraken2_db, params.cat_db, params.krona_db, params.gtdb_db, params.lambda_reference, params.busco_reference ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } -// Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() -// Currently not used as using local version of MultiQC -//ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.fromPath("$projectDir/assets/nf-core-mag_logo_light.png") +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -77,6 +75,7 @@ include { POOL_SINGLE_READS as POOL_LONG_READS } from '../modules include { MEGAHIT } from '../modules/local/megahit' include { SPADES } from '../modules/local/spades' include { SPADESHYBRID } from '../modules/local/spadeshybrid' +include { GUNZIP as GUNZIP_ASSEMBLIES } from '../modules/nf-core/gunzip' include { QUAST } from '../modules/local/quast' include { QUAST_BINS } from '../modules/local/quast_bins' include { QUAST_BINS_SUMMARY } from '../modules/local/quast_bins_summary' @@ -86,20 +85,22 @@ include { CAT } from '../modules include { CAT_SUMMARY } from "../modules/local/cat_summary" include { BIN_SUMMARY } from '../modules/local/bin_summary' include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modules/local/combine_tsv' -include { MULTIQC } from '../modules/local/multiqc' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' -include { BINNING } from '../subworkflows/local/binning' -include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' -include { BUSCO_QC } from '../subworkflows/local/busco_qc' -include { CHECKM_QC } from '../subworkflows/local/checkm_qc' -include { GUNC_QC } from '../subworkflows/local/gunc_qc' -include { GTDBTK } from '../subworkflows/local/gtdbtk' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' +include { BINNING } from '../subworkflows/local/binning' +include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' +include { BUSCO_QC } from '../subworkflows/local/busco_qc' +include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' +include { CHECKM_QC } from '../subworkflows/local/checkm_qc' +include { GUNC_QC } from '../subworkflows/local/gunc_qc' +include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' +include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' +include { DEPTHS } from '../subworkflows/local/depths' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -113,12 +114,18 @@ include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_ include { ARIA2 as ARIA2_UNTAR } from '../modules/nf-core/aria2/main' include { FASTQC as FASTQC_RAW } from '../modules/nf-core/fastqc/main' include { FASTQC as FASTQC_TRIMMED } from '../modules/nf-core/fastqc/main' +include { SEQTK_MERGEPE } from '../modules/nf-core/seqtk/mergepe/main' +include { BBMAP_BBNORM } from '../modules/nf-core/bbmap/bbnorm/main' include { FASTP } from '../modules/nf-core/fastp/main' include { ADAPTERREMOVAL as ADAPTERREMOVAL_PE } from '../modules/nf-core/adapterremoval/main' include { ADAPTERREMOVAL as ADAPTERREMOVAL_SE } from '../modules/nf-core/adapterremoval/main' +include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' include { PRODIGAL } from '../modules/nf-core/prodigal/main' include { PROKKA } from '../modules/nf-core/prokka/main' +include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' +include { 
METAEUK_EASYPREDICT } from '../modules/nf-core/metaeuk/easypredict/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' //////////////////////////////////////////////////// /* -- Create channel for reference databases -- */ @@ -182,6 +189,13 @@ if(params.cat_db){ ch_cat_db_file = Channel.empty() } +if(params.krona_db){ + ch_krona_db_file = Channel + .value(file( "${params.krona_db}" )) +} else { + ch_krona_db_file = Channel.empty() +} + if(!params.keep_phix) { ch_phix_db_file = Channel .value(file( "${params.phix_reference}" )) @@ -192,12 +206,25 @@ if (!params.keep_lambda) { .value(file( "${params.lambda_reference}" )) } -gtdb = params.skip_binqc ? false : params.gtdb +if (params.genomad_db){ + ch_genomad_db = file(params.genomad_db, checkIfExists: true) +} else { + ch_genomad_db = Channel.empty() +} + +gtdb = ( params.skip_binqc || params.skip_gtdbtk ) ? false : params.gtdb_db + if (gtdb) { - ch_gtdb = Channel - .value(file( "${gtdb}" )) + gtdb = file( "${gtdb}", checkIfExists: true) } else { - ch_gtdb = Channel.empty() + gtdb = [] +} + +if(params.metaeuk_db && !params.skip_metaeuk) { + ch_metaeuk_db = Channel. + value(file("${params.metaeuk_db}", checkIfExists: true)) +} else { + ch_metaeuk_db = Channel.empty() } /* @@ -216,18 +243,26 @@ workflow MAG { // Get checkM database if not supplied - if ( !params.checkm_db ) { - ARIA2_UNTAR ("https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz") + if ( !params.skip_binqc && params.binqc_tool == 'checkm' && !params.checkm_db ) { + ARIA2_UNTAR (params.checkm_download_url) ch_checkm_db = ARIA2_UNTAR.out.downloaded_file - } + } + // Get mmseqs db for MetaEuk if requested + if (!params.skip_metaeuk && params.metaeuk_mmseqs_db) { + MMSEQS_DATABASES(params.metaeuk_mmseqs_db) + ch_metaeuk_db = MMSEQS_DATABASES.out.database + ch_versions = ch_versions.mix(MMSEQS_DATABASES.out.versions) + } // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // + INPUT_CHECK () - ch_raw_short_reads = INPUT_CHECK.out.raw_short_reads - ch_raw_long_reads = INPUT_CHECK.out.raw_long_reads + ch_raw_short_reads = INPUT_CHECK.out.raw_short_reads + ch_raw_long_reads = INPUT_CHECK.out.raw_long_reads + ch_input_assemblies = INPUT_CHECK.out.input_assemblies /* ================================================================================ @@ -239,74 +274,133 @@ workflow MAG { ch_raw_short_reads ) ch_versions = ch_versions.mix(FASTQC_RAW.out.versions.first()) - if ( !params.skip_clipping ) { - if ( params.clip_tool == 'fastp' ) { - ch_clipmerge_out = FASTP ( - ch_raw_short_reads, - [], - params.fastp_save_trimmed_fail, - [] + + ch_bowtie2_removal_host_multiqc = Channel.empty() + if ( !params.assembly_input ) { + if ( !params.skip_clipping ) { + if ( params.clip_tool == 'fastp' ) { + ch_clipmerge_out = FASTP ( + ch_raw_short_reads, + [], + params.fastp_save_trimmed_fail, + [] + ) + ch_short_reads_prepped = FASTP.out.reads + ch_versions = ch_versions.mix(FASTP.out.versions.first()) + + } else if ( params.clip_tool == 'adapterremoval' ) { + + // due to strange output file scheme in AR2, have to manually separate + // SE/PE to allow correct pulling of reads after. 
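The AdapterRemoval2 comment that closes this hunk is implemented immediately below with a `branch` on `meta.single_end`. For reference, here is a standalone toy version of that same pattern; the metas and file names are invented and it is not pipeline code.

```nextflow
// Minimal sketch of splitting a reads channel into single-end and paired-end branches.
workflow {
    ch_reads = Channel.of(
        [ [ id: 's1', single_end: true  ], [ 's1.fastq.gz' ] ],
        [ [ id: 's2', single_end: false ], [ 's2_R1.fastq.gz', 's2_R2.fastq.gz' ] ]
    )

    ch_branched = ch_reads.branch {
        single: it[0]['single_end']
        paired: !it[0]['single_end']
    }

    ch_branched.single.view { "single-end: ${it[0].id}" }
    ch_branched.paired.view { "paired-end: ${it[0].id}" }
}
```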
+ ch_adapterremoval_in = ch_raw_short_reads + .branch { + single: it[0]['single_end'] + paired: !it[0]['single_end'] + } + + ADAPTERREMOVAL_PE ( ch_adapterremoval_in.paired, [] ) + ADAPTERREMOVAL_SE ( ch_adapterremoval_in.single, [] ) + + ch_short_reads_prepped = Channel.empty() + ch_short_reads_prepped = ch_short_reads_prepped.mix(ADAPTERREMOVAL_SE.out.singles_truncated, ADAPTERREMOVAL_PE.out.paired_truncated) + + ch_versions = ch_versions.mix(ADAPTERREMOVAL_PE.out.versions.first(), ADAPTERREMOVAL_SE.out.versions.first()) + } + } else { + ch_short_reads_prepped = ch_raw_short_reads + } + + if (params.host_fasta){ + BOWTIE2_HOST_REMOVAL_BUILD ( + ch_host_fasta + ) + ch_host_bowtie2index = BOWTIE2_HOST_REMOVAL_BUILD.out.index + } + ch_bowtie2_removal_host_multiqc = Channel.empty() + if (params.host_fasta || params.host_genome){ + BOWTIE2_HOST_REMOVAL_ALIGN ( + ch_short_reads_prepped, + ch_host_bowtie2index ) - ch_short_reads = FASTP.out.reads - ch_versions = ch_versions.mix(FASTP.out.versions.first()) + ch_short_reads_hostremoved = BOWTIE2_HOST_REMOVAL_ALIGN.out.reads + ch_bowtie2_removal_host_multiqc = BOWTIE2_HOST_REMOVAL_ALIGN.out.log + ch_versions = ch_versions.mix(BOWTIE2_HOST_REMOVAL_ALIGN.out.versions.first()) + } else { + ch_short_reads_hostremoved = ch_short_reads_prepped + } - } else if ( params.clip_tool == 'adapterremoval' ) { + if(!params.keep_phix) { + BOWTIE2_PHIX_REMOVAL_BUILD ( + ch_phix_db_file + ) + BOWTIE2_PHIX_REMOVAL_ALIGN ( + ch_short_reads_hostremoved, + BOWTIE2_PHIX_REMOVAL_BUILD.out.index + ) + ch_short_reads_phixremoved = BOWTIE2_PHIX_REMOVAL_ALIGN.out.reads + ch_versions = ch_versions.mix(BOWTIE2_PHIX_REMOVAL_ALIGN.out.versions.first()) + } else { + ch_short_reads_phixremoved = ch_short_reads_hostremoved + } - // due to strange output file scheme in AR2, have to manually separate - // SE/PE to allow correct pulling of reads after. - ch_adapterremoval_in = ch_raw_short_reads - .branch { - single: it[0]['single_end'] - paired: !it[0]['single_end'] - } + if (!(params.keep_phix && params.skip_clipping && !(params.host_genome || params.host_fasta))) { + FASTQC_TRIMMED ( + ch_short_reads_phixremoved + ) + ch_versions = ch_versions.mix(FASTQC_TRIMMED.out.versions) + } - ADAPTERREMOVAL_PE ( ch_adapterremoval_in.paired, [] ) - ADAPTERREMOVAL_SE ( ch_adapterremoval_in.single, [] ) + // Run/Lane merging - ch_short_reads = Channel.empty() - ch_short_reads = ch_short_reads.mix(ADAPTERREMOVAL_SE.out.singles_truncated, ADAPTERREMOVAL_PE.out.paired_truncated) + ch_short_reads_forcat = ch_short_reads_phixremoved + .map { + meta, reads -> + def meta_new = meta - meta.subMap('run') + [ meta_new, reads ] + } + .groupTuple() + .branch { + meta, reads -> + cat: ( meta.single_end && reads.size() == 1 ) || ( !meta.single_end && reads.size() >= 2 ) + skip_cat: true // Can skip merging if only single lanes + } - ch_versions = ch_versions.mix(ADAPTERREMOVAL_PE.out.versions.first(), ADAPTERREMOVAL_SE.out.versions.first()) + CAT_FASTQ ( ch_short_reads_forcat.cat.map { meta, reads -> [ meta, reads.flatten() ]} ) + ch_short_reads = Channel.empty() + ch_short_reads = CAT_FASTQ.out.reads.mix( ch_short_reads_forcat.skip_cat ).map { meta, reads -> [ meta, reads.flatten() ]} + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first()) + + if ( params.bbnorm ) { + if ( params.coassemble_group ) { + // Interleave pairs, to be able to treat them as single ends when calling bbnorm. This prepares + // for dropping the single_end parameter, but keeps assembly modules as they are, i.e. 
not + // accepting a mix of single end and pairs. + SEQTK_MERGEPE ( + ch_short_reads.filter { ! it[0].single_end } + ) + ch_versions = ch_versions.mix(SEQTK_MERGEPE.out.versions.first()) + // Combine the interleaved pairs with any single end libraries. Set the meta.single_end to true (used by the bbnorm module). + ch_bbnorm = SEQTK_MERGEPE.out.reads + .mix(ch_short_reads.filter { it[0].single_end }) + .map { [ [ id: sprintf("group%s", it[0].group), group: it[0].group, single_end: true ], it[1] ] } + .groupTuple() + } else { + ch_bbnorm = ch_short_reads + } + BBMAP_BBNORM ( ch_bbnorm ) + ch_versions = ch_versions.mix(BBMAP_BBNORM.out.versions) + ch_short_reads_assembly = BBMAP_BBNORM.out.fastq + } else { + ch_short_reads_assembly = ch_short_reads } } else { ch_short_reads = ch_raw_short_reads - } - - if (params.host_fasta){ - BOWTIE2_HOST_REMOVAL_BUILD ( - ch_host_fasta - ) - ch_host_bowtie2index = BOWTIE2_HOST_REMOVAL_BUILD.out.index - } - ch_bowtie2_removal_host_multiqc = Channel.empty() - if (params.host_fasta || params.host_genome){ - BOWTIE2_HOST_REMOVAL_ALIGN ( - ch_short_reads, - ch_host_bowtie2index - ) - ch_short_reads = BOWTIE2_HOST_REMOVAL_ALIGN.out.reads - ch_bowtie2_removal_host_multiqc = BOWTIE2_HOST_REMOVAL_ALIGN.out.log - ch_versions = ch_versions.mix(BOWTIE2_HOST_REMOVAL_ALIGN.out.versions.first()) - } - - if(!params.keep_phix) { - BOWTIE2_PHIX_REMOVAL_BUILD ( - ch_phix_db_file - ) - BOWTIE2_PHIX_REMOVAL_ALIGN ( - ch_short_reads, - BOWTIE2_PHIX_REMOVAL_BUILD.out.index - ) - ch_short_reads = BOWTIE2_PHIX_REMOVAL_ALIGN.out.reads - ch_versions = ch_versions.mix(BOWTIE2_PHIX_REMOVAL_ALIGN.out.versions.first()) - } - - if (!(params.keep_phix && params.skip_clipping && !(params.host_genome || params.host_fasta))) { - FASTQC_TRIMMED ( - ch_short_reads - ) - ch_versions = ch_versions.mix(FASTQC_TRIMMED.out.versions) + .map { + meta, reads -> + def meta_new = meta - meta.subMap('run') + [ meta_new, reads ] + } } /* @@ -320,41 +414,49 @@ workflow MAG { ch_versions = ch_versions.mix(NANOPLOT_RAW.out.versions.first()) ch_long_reads = ch_raw_long_reads - if (!params.skip_adapter_trimming) { - PORECHOP ( - ch_raw_long_reads - ) - ch_long_reads = PORECHOP.out.reads - ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) - } + .map { + meta, reads -> + def meta_new = meta - meta.subMap('run') + [ meta_new, reads ] + } - if (!params.keep_lambda) { - NANOLYSE ( - ch_long_reads, - ch_nanolyse_db - ) - ch_long_reads = NANOLYSE.out.reads - ch_versions = ch_versions.mix(NANOLYSE.out.versions.first()) - } + if ( !params.assembly_input ) { + if (!params.skip_adapter_trimming) { + PORECHOP ( + ch_raw_long_reads + ) + ch_long_reads = PORECHOP.out.reads + ch_versions = ch_versions.mix(PORECHOP.out.versions.first()) + } + + if (!params.keep_lambda) { + NANOLYSE ( + ch_long_reads, + ch_nanolyse_db + ) + ch_long_reads = NANOLYSE.out.reads + ch_versions = ch_versions.mix(NANOLYSE.out.versions.first()) + } - // join long and short reads by sample name - ch_short_reads_tmp = ch_short_reads - .map { meta, sr -> [ meta.id, meta, sr ] } + // join long and short reads by sample name + ch_short_reads_tmp = ch_short_reads + .map { meta, sr -> [ meta.id, meta, sr ] } - ch_short_and_long_reads = ch_long_reads - .map { meta, lr -> [ meta.id, meta, lr ] } - .join(ch_short_reads_tmp, by: 0) - .map { id, meta_lr, lr, meta_sr, sr -> [ meta_lr, lr, sr[0], sr[1] ] } // should not occur for single-end, since SPAdes (hybrid) does not support single-end + ch_short_and_long_reads = ch_long_reads + .map { meta, lr -> [ 
meta.id, meta, lr ] } + .join(ch_short_reads_tmp, by: 0) + .map { id, meta_lr, lr, meta_sr, sr -> [ meta_lr, lr, sr[0], sr[1] ] } // should not occur for single-end, since SPAdes (hybrid) does not support single-end - FILTLONG ( - ch_short_and_long_reads - ) - ch_long_reads = FILTLONG.out.reads - ch_versions = ch_versions.mix(FILTLONG.out.versions.first()) + FILTLONG ( + ch_short_and_long_reads + ) + ch_long_reads = FILTLONG.out.reads + ch_versions = ch_versions.mix(FILTLONG.out.versions.first()) - NANOPLOT_FILTERED ( - ch_long_reads - ) + NANOPLOT_FILTERED ( + ch_long_reads + ) + } /* ================================================================================ @@ -362,6 +464,7 @@ workflow MAG { ================================================================================ */ CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file ) + CENTRIFUGE ( ch_short_reads, CENTRIFUGE_DB_PREPARATION.out.db @@ -378,16 +481,20 @@ workflow MAG { ch_versions = ch_versions.mix(KRAKEN2.out.versions.first()) if (( params.centrifuge_db || params.kraken2_db ) && !params.skip_krona){ - KRONA_DB () + if (params.krona_db){ + ch_krona_db = ch_krona_db_file + } else { + KRONA_DB () + ch_krona_db = KRONA_DB.out.db + } ch_tax_classifications = CENTRIFUGE.out.results_for_krona.mix(KRAKEN2.out.results_for_krona) . map { classifier, meta, report -> - def meta_new = meta.clone() - meta_new.classifier = classifier + def meta_new = meta + [classifier: classifier] [ meta_new, report ] } KRONA ( ch_tax_classifications, - KRONA_DB.out.db.collect() + ch_krona_db ) ch_versions = ch_versions.mix(KRONA.out.versions.first()) } @@ -398,109 +505,136 @@ workflow MAG { ================================================================================ */ - // Co-assembly: prepare grouping for MEGAHIT and for pooling for SPAdes - if (params.coassemble_group) { - // short reads - // group and set group as new id - ch_short_reads_grouped = ch_short_reads - .map { meta, reads -> [ meta.group, meta, reads ] } - .groupTuple(by: 0) - .map { group, metas, reads -> + if ( !params.assembly_input ) { + // Co-assembly: prepare grouping for MEGAHIT and for pooling for SPAdes + if (params.coassemble_group) { + // short reads + // group and set group as new id + ch_short_reads_grouped = ch_short_reads_assembly + .map { meta, reads -> [ meta.group, meta, reads ] } + .groupTuple(by: 0) + .map { group, metas, reads -> + def assemble_as_single = params.single_end || ( params.bbnorm && params.coassemble_group ) + def meta = [:] + meta.id = "group-$group" + meta.group = group + meta.single_end = assemble_as_single + if ( assemble_as_single ) [ meta, reads.collect { it }, [] ] + else [ meta, reads.collect { it[0] }, reads.collect { it[1] } ] + } + // long reads + // group and set group as new id + ch_long_reads_grouped = ch_long_reads + .map { meta, reads -> [ meta.group, meta, reads ] } + .groupTuple(by: 0) + .map { group, metas, reads -> def meta = [:] meta.id = "group-$group" meta.group = group - meta.single_end = params.single_end - if (!params.single_end) [ meta, reads.collect { it[0] }, reads.collect { it[1] } ] - else [ meta, reads.collect { it }, [] ] - } - // long reads - // group and set group as new id - ch_long_reads_grouped = ch_long_reads - .map { meta, reads -> [ meta.group, meta, reads ] } - .groupTuple(by: 0) - .map { group, metas, reads -> - def meta = [:] - meta.id = "group-$group" - meta.group = group - [ meta, reads.collect { it } ] - } - } else { - ch_short_reads_grouped = ch_short_reads - .map { meta, reads -> - if 
(!params.single_end){ [ meta, [reads[0]], [reads[1]] ] } - else [ meta, [reads], [] ] } - } + [ meta, reads.collect { it } ] + } + } else { + ch_short_reads_grouped = ch_short_reads_assembly + .filter { it[0].single_end } + .map { meta, reads -> [ meta, [ reads ], [] ] } + .mix ( + ch_short_reads_assembly + .filter { ! it[0].single_end } + .map { meta, reads -> [ meta, [ reads[0] ], [ reads[1] ] ] } + ) + ch_long_reads_grouped = ch_long_reads + } - ch_assemblies = Channel.empty() - if (!params.skip_megahit){ - MEGAHIT ( ch_short_reads_grouped ) - ch_megahit_assemblies = MEGAHIT.out.assembly - .map { meta, assembly -> - def meta_new = meta.clone() - meta_new.assembler = "MEGAHIT" - [ meta_new, assembly ] - } - ch_assemblies = ch_assemblies.mix(ch_megahit_assemblies) - ch_versions = ch_versions.mix(MEGAHIT.out.versions.first()) - } + ch_assemblies = Channel.empty() + if (!params.skip_megahit){ + MEGAHIT ( ch_short_reads_grouped ) + ch_megahit_assemblies = MEGAHIT.out.assembly + .map { meta, assembly -> + def meta_new = meta + [assembler: 'MEGAHIT'] + [ meta_new, assembly ] + } + ch_assemblies = ch_assemblies.mix(ch_megahit_assemblies) + ch_versions = ch_versions.mix(MEGAHIT.out.versions.first()) + } - // Co-assembly: pool reads for SPAdes - if (params.coassemble_group) { - // short reads - if (!params.single_end && (!params.skip_spades || !params.skip_spadeshybrid)){ - if (params.single_end){ - POOL_SHORT_SINGLE_READS ( ch_short_reads_grouped ) - ch_short_reads_spades = POOL_SHORT_SINGLE_READS.out.reads + // Co-assembly: pool reads for SPAdes + if ( ! params.skip_spades || ! params.skip_spadeshybrid ){ + if ( params.coassemble_group ) { + if ( params.bbnorm ) { + ch_short_reads_spades = ch_short_reads_grouped.map { [ it[0], it[1] ] } + } else { + POOL_SHORT_SINGLE_READS ( + ch_short_reads_grouped + .filter { it[0].single_end } + ) + POOL_PAIRED_READS ( + ch_short_reads_grouped + .filter { ! 
it[0].single_end } + ) + ch_short_reads_spades = POOL_SHORT_SINGLE_READS.out.reads + .mix(POOL_PAIRED_READS.out.reads) + } + } else { + ch_short_reads_spades = ch_short_reads_assembly + } + // long reads + if (!params.single_end && !params.skip_spadeshybrid){ + POOL_LONG_READS ( ch_long_reads_grouped ) + ch_long_reads_spades = POOL_LONG_READS.out.reads } else { - POOL_PAIRED_READS ( ch_short_reads_grouped ) - ch_short_reads_spades = POOL_PAIRED_READS.out.reads + ch_long_reads_spades = Channel.empty() } + } else { + ch_short_reads_spades = Channel.empty() + ch_long_reads_spades = Channel.empty() + } + + if (!params.single_end && !params.skip_spades){ + SPADES ( ch_short_reads_spades ) + ch_spades_assemblies = SPADES.out.assembly + .map { meta, assembly -> + def meta_new = meta + [assembler: 'SPAdes'] + [ meta_new, assembly ] + } + ch_assemblies = ch_assemblies.mix(ch_spades_assemblies) + ch_versions = ch_versions.mix(SPADES.out.versions.first()) } - // long reads + if (!params.single_end && !params.skip_spadeshybrid){ - POOL_LONG_READS ( ch_long_reads_grouped ) - ch_long_reads_spades = POOL_LONG_READS.out.reads + ch_short_reads_spades_tmp = ch_short_reads_spades + .map { meta, reads -> [ meta.id, meta, reads ] } + + ch_reads_spadeshybrid = ch_long_reads_spades + .map { meta, reads -> [ meta.id, meta, reads ] } + .combine(ch_short_reads_spades_tmp, by: 0) + .map { id, meta_long, long_reads, meta_short, short_reads -> [ meta_short, long_reads, short_reads ] } + + SPADESHYBRID ( ch_reads_spadeshybrid ) + ch_spadeshybrid_assemblies = SPADESHYBRID.out.assembly + .map { meta, assembly -> + def meta_new = meta + [assembler: "SPAdesHybrid"] + [ meta_new, assembly ] + } + ch_assemblies = ch_assemblies.mix(ch_spadeshybrid_assemblies) + ch_versions = ch_versions.mix(SPADESHYBRID.out.versions.first()) } } else { - ch_short_reads_spades = ch_short_reads - ch_long_reads_spades = ch_long_reads - .map { meta, reads -> [ meta, [reads] ] } - } - - if (!params.single_end && !params.skip_spades){ - SPADES ( ch_short_reads_spades ) - ch_spades_assemblies = SPADES.out.assembly - .map { meta, assembly -> - def meta_new = meta.clone() - meta_new.assembler = "SPAdes" - [ meta_new, assembly ] + ch_assemblies_split = ch_input_assemblies + .branch { meta, assembly -> + gzipped: assembly[0].getExtension() == "gz" + ungzip: true } - ch_assemblies = ch_assemblies.mix(ch_spades_assemblies) - ch_versions = ch_versions.mix(SPADES.out.versions.first()) - } - if (!params.single_end && !params.skip_spadeshybrid){ - ch_short_reads_spades_tmp = ch_short_reads_spades - .map { meta, reads -> [ meta.id, meta, reads ] } - ch_reads_spadeshybrid = ch_long_reads_spades - .map { meta, reads -> [ meta.id, meta, reads ] } - .combine(ch_short_reads_spades_tmp, by: 0) - .map { id, meta_long, long_reads, meta_short, short_reads -> [ meta_short, long_reads, short_reads ] } - SPADESHYBRID ( ch_reads_spadeshybrid ) - ch_spadeshybrid_assemblies = SPADESHYBRID.out.assembly - .map { meta, assembly -> - def meta_new = meta.clone() - meta_new.assembler = "SPAdesHybrid" - [ meta_new, assembly ] - } - ch_assemblies = ch_assemblies.mix(ch_spadeshybrid_assemblies) - ch_versions = ch_versions.mix(SPADESHYBRID.out.versions.first()) + GUNZIP_ASSEMBLIES(ch_assemblies_split.gzipped) + ch_versions = ch_versions.mix(GUNZIP_ASSEMBLIES.out.versions) + + ch_assemblies = Channel.empty() + ch_assemblies = ch_assemblies.mix(ch_assemblies_split.ungzip, GUNZIP_ASSEMBLIES.out.gunzip) } ch_quast_multiqc = Channel.empty() if (!params.skip_quast){ QUAST ( ch_assemblies ) 
- ch_quast_multiqc = QUAST.out.qc ch_versions = ch_versions.mix(QUAST.out.versions.first()) } @@ -520,22 +654,31 @@ workflow MAG { /* ================================================================================ - Binning preparation + Virus identification ================================================================================ */ + if (params.run_virus_identification){ + VIRUS_IDENTIFICATION(ch_assemblies, ch_genomad_db) + ch_versions = ch_versions.mix(VIRUS_IDENTIFICATION.out.versions.first()) + } + + /* + ================================================================================ + Binning preparation + ================================================================================ + */ - ch_bowtie2_assembly_multiqc = Channel.empty() ch_busco_summary = Channel.empty() ch_checkm_summary = Channel.empty() - ch_busco_multiqc = Channel.empty() - - - BINNING_PREPARATION ( - ch_assemblies, - ch_short_reads - ) + if ( !params.skip_binning || params.ancient_dna ) { + BINNING_PREPARATION ( + ch_assemblies, + ch_short_reads + ) + ch_versions = ch_versions.mix(BINNING_PREPARATION.out.bowtie2_version.first()) + } /* ================================================================================ @@ -556,11 +699,12 @@ workflow MAG { if (!params.skip_binning){ - if (params.ancient_dna) { + // Make sure if running aDNA subworkflow to use the damage-corrected contigs for higher accuracy + if (params.ancient_dna && !params.skip_ancient_damagecorrection) { BINNING ( BINNING_PREPARATION.out.grouped_mappings .join(ANCIENT_DNA_ASSEMBLY_VALIDATION.out.contigs_recalled) - .map{ it -> [ it[0], it[4], it[2], it[3] ] }, // [meta, contigs_recalled, bam, bais] + .map { it -> [ it[0], it[4], it[2], it[3] ] }, // [meta, contigs_recalled, bam, bais] ch_short_reads ) } else { @@ -569,17 +713,51 @@ workflow MAG { ch_short_reads ) } - - ch_bowtie2_assembly_multiqc = BINNING_PREPARATION.out.bowtie2_assembly_multiqc - ch_versions = ch_versions.mix(BINNING_PREPARATION.out.bowtie2_version.first()) ch_versions = ch_versions.mix(BINNING.out.versions) + if ( params.bin_domain_classification ) { + + // Make sure if running aDNA subworkflow to use the damage-corrected contigs for higher accuracy + if (params.ancient_dna && !params.skip_ancient_damagecorrection) { + ch_assemblies_for_domainclassification = ANCIENT_DNA_ASSEMBLY_VALIDATION.out.contigs_recalled + } else { + ch_assemblies_for_domainclassification = ch_assemblies + } + + DOMAIN_CLASSIFICATION ( ch_assemblies_for_domainclassification, BINNING.out.bins, BINNING.out.unbinned ) + ch_binning_results_bins = DOMAIN_CLASSIFICATION.out.classified_bins + ch_binning_results_unbins = DOMAIN_CLASSIFICATION.out.classified_unbins + ch_versions = ch_versions.mix(DOMAIN_CLASSIFICATION.out.versions) + + + } else { + ch_binning_results_bins = BINNING.out.bins + .map { meta, bins -> + def meta_new = meta + [domain: 'unclassified'] + [meta_new, bins] + } + ch_binning_results_unbins = BINNING.out.unbinned + .map { meta, bins -> + def meta_new = meta + [domain: 'unclassified'] + [meta_new, bins] + } + } + /* * DAS Tool: binning refinement */ // If any two of the binners are both skipped at once, do not run because DAS_Tool needs at least one if ( params.refine_bins_dastool ) { + ch_prokarya_bins_dastool = ch_binning_results_bins + .filter { meta, bins -> + meta.domain != "eukarya" + } + + ch_eukarya_bins_dastool = ch_binning_results_bins + .filter { meta, bins -> + meta.domain == "eukarya" + } if (params.ancient_dna) { ch_contigs_for_binrefinement = 
ANCIENT_DNA_ASSEMBLY_VALIDATION.out.contigs_recalled @@ -588,48 +766,50 @@ workflow MAG { .map{ meta, contigs, bam, bai -> [ meta, contigs ] } } - BINNING_REFINEMENT ( ch_contigs_for_binrefinement, BINNING.out.bins, BINNING.out.metabat2depths, ch_short_reads ) + BINNING_REFINEMENT ( ch_contigs_for_binrefinement, ch_prokarya_bins_dastool ) + ch_refined_bins = ch_eukarya_bins_dastool.mix(BINNING_REFINEMENT.out.refined_bins) + ch_refined_unbins = BINNING_REFINEMENT.out.refined_unbins ch_versions = ch_versions.mix(BINNING_REFINEMENT.out.versions) if ( params.postbinning_input == 'raw_bins_only' ) { - ch_input_for_postbinning_bins = BINNING.out.bins - ch_input_for_postbinning_bins_unbins = BINNING.out.bins.mix(BINNING.out.unbinned) - ch_input_for_binsummary = BINNING.out.depths_summary + ch_input_for_postbinning_bins = ch_binning_results_bins + ch_input_for_postbinning_bins_unbins = ch_binning_results_bins.mix(ch_binning_results_unbins) } else if ( params.postbinning_input == 'refined_bins_only' ) { - ch_input_for_postbinning_bins = BINNING_REFINEMENT.out.refined_bins - ch_input_for_postbinning_bins_unbins = BINNING_REFINEMENT.out.refined_bins.mix(BINNING_REFINEMENT.out.refined_unbins) - ch_input_for_binsummary = BINNING_REFINEMENT.out.refined_depths_summary - } else if (params.postbinning_input == 'both') { - ch_input_for_postbinning_bins = BINNING.out.bins.mix(BINNING_REFINEMENT.out.refined_bins) - ch_input_for_postbinning_bins_unbins = BINNING.out.bins.mix(BINNING.out.unbinned,BINNING_REFINEMENT.out.refined_bins,BINNING_REFINEMENT.out.refined_unbins) - ch_combinedepthtsvs_for_binsummary = BINNING.out.depths_summary.mix(BINNING_REFINEMENT.out.refined_depths_summary) - ch_input_for_binsummary = COMBINE_SUMMARY_TSV ( ch_combinedepthtsvs_for_binsummary.collect() ).combined + ch_input_for_postbinning_bins = ch_refined_bins + ch_input_for_postbinning_bins_unbins = ch_refined_bins.mix(ch_refined_unbins) + // TODO REACTIVATE ONCE PR #489 IS READY! 
+ // TODO RE-ADD BOTH TO SCHEMA ONCE RE-ADDING + // } else if ( params.postbinning_input == 'both' ) { + // ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins) + // ch_input_for_postbinning_bins = ch_all_bins + // ch_input_for_postbinning_bins_unbins = ch_all_bins.mix(ch_binning_results_unbins).mix(ch_refined_unbins) } - } else { - ch_input_for_postbinning_bins = BINNING.out.bins - ch_input_for_postbinning_bins_unbins = BINNING.out.bins.mix(BINNING.out.unbinned) - ch_input_for_binsummary = BINNING.out.depths_summary + ch_input_for_postbinning_bins = ch_binning_results_bins + ch_input_for_postbinning_bins_unbins = ch_binning_results_bins.mix(ch_binning_results_unbins) } + DEPTHS ( ch_input_for_postbinning_bins_unbins, BINNING.out.metabat2depths, ch_short_reads ) + ch_input_for_binsummary = DEPTHS.out.depths_summary + ch_versions = ch_versions.mix(DEPTHS.out.versions) + /* * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, and/or GUNC */ - // Results in: [ [meta], path_to_bin.fa ] ch_input_bins_for_qc = ch_input_for_postbinning_bins_unbins.transpose() if (!params.skip_binqc && params.binqc_tool == 'busco'){ /* * BUSCO subworkflow: Quantitative measures for the assessment of genome assembly */ + BUSCO_QC ( ch_busco_db_file, ch_busco_download_folder, ch_input_bins_for_qc ) ch_busco_summary = BUSCO_QC.out.summary - ch_busco_multiqc = BUSCO_QC.out.multiqc ch_versions = ch_versions.mix(BUSCO_QC.out.versions.first()) // process information if BUSCO analysis failed for individual bins due to no matching genes BUSCO_QC.out @@ -642,21 +822,30 @@ workflow MAG { /* * CheckM subworkflow: Quantitative measures for the assessment of genome assembly */ + + ch_input_bins_for_checkm = ch_input_bins_for_qc + .filter { meta, bins -> + meta.domain != "eukarya" + } + CHECKM_QC ( - ch_input_bins_for_qc.groupTuple(), + ch_input_bins_for_checkm.groupTuple(), ch_checkm_db ) ch_checkm_summary = CHECKM_QC.out.summary - // TODO custom output parsing? Add to MultiQC? 
             ch_versions = ch_versions.mix(CHECKM_QC.out.versions)
         }

         if ( params.run_gunc && params.binqc_tool == 'checkm' ) {
-            GUNC_QC ( ch_input_bins_for_qc, ch_gunc_db, CHECKM_QC.out.checkm_tsv )
+            GUNC_QC ( ch_input_bins_for_checkm, ch_gunc_db, CHECKM_QC.out.checkm_tsv )
             ch_versions = ch_versions.mix( GUNC_QC.out.versions )
         } else if ( params.run_gunc ) {
+            ch_input_bins_for_gunc = ch_input_for_postbinning_bins_unbins
+                .filter { meta, bins ->
+                    meta.domain != "eukarya"
+                }
-            GUNC_QC ( ch_input_bins_for_qc, ch_gunc_db, [] )
+            GUNC_QC ( ch_input_bins_for_gunc, ch_gunc_db, [] )
             ch_versions = ch_versions.mix( GUNC_QC.out.versions )
         }
@@ -665,11 +854,12 @@ workflow MAG {
         if (!params.skip_quast){
             ch_input_for_quast_bins = ch_input_for_postbinning_bins_unbins
                 .groupTuple()
-                .map{
+                .map {
                     meta, reads ->
                         def new_reads = reads.flatten()
                         [meta, new_reads]
                 }
+
             QUAST_BINS ( ch_input_for_quast_bins )
             ch_versions = ch_versions.mix(QUAST_BINS.out.versions.first())
             QUAST_BINS_SUMMARY ( QUAST_BINS.out.quast_bin_summaries.collect() )
@@ -700,19 +890,31 @@ workflow MAG {
         /*
          * GTDB-tk: taxonomic classifications using GTDB reference
          */
-        ch_gtdbtk_summary = Channel.empty()
-        if ( gtdb ){
-            GTDBTK (
-                ch_input_for_postbinning_bins_unbins,
-                ch_busco_summary,
-                ch_checkm_summary,
-                ch_gtdb
-            )
-            ch_versions = ch_versions.mix(GTDBTK.out.versions.first())
-            ch_gtdbtk_summary = GTDBTK.out.summary
+
+        if ( !params.skip_gtdbtk ) {
+
+            ch_gtdbtk_summary = Channel.empty()
+            if ( gtdb ){
+
+                ch_gtdb_bins = ch_input_for_postbinning_bins_unbins
+                    .filter { meta, bins ->
+                        meta.domain != "eukarya"
+                    }
+
+                GTDBTK (
+                    ch_gtdb_bins,
+                    ch_busco_summary,
+                    ch_checkm_summary,
+                    gtdb
+                )
+                ch_versions = ch_versions.mix(GTDBTK.out.versions.first())
+                ch_gtdbtk_summary = GTDBTK.out.summary
+            }
+        } else {
+            ch_gtdbtk_summary = Channel.empty()
         }

-        if ( ( !params.skip_binqc ) || !params.skip_quast || gtdb){
+        if ( ( !params.skip_binqc ) || !params.skip_quast || !params.skip_gtdbtk){
             BIN_SUMMARY (
                 ch_input_for_binsummary,
                 ch_busco_summary.ifEmpty([]),
@@ -725,14 +927,17 @@ workflow MAG {
         /*
          * Prokka: Genome annotation
          */
-        ch_bins_for_prokka = ch_input_for_postbinning_bins_unbins.transpose()
+
+        if (!params.skip_prokka){
+            ch_bins_for_prokka = ch_input_for_postbinning_bins_unbins.transpose()
             .map { meta, bin ->
-                def meta_new = meta.clone()
-                meta_new.id = bin.getBaseName()
+                def meta_new = meta + [id: bin.getBaseName()]
                 [ meta_new, bin ]
             }
+            .filter { meta, bin ->
+                meta.domain != "eukarya"
+            }

-        if (!params.skip_prokka){
             PROKKA (
                 ch_bins_for_prokka,
                 [],
@@ -740,6 +945,20 @@ workflow MAG {
             )
             ch_versions = ch_versions.mix(PROKKA.out.versions.first())
         }
+
+        if (!params.skip_metaeuk && (params.metaeuk_db || params.metaeuk_mmseqs_db)) {
+            ch_bins_for_metaeuk = ch_input_for_postbinning_bins_unbins.transpose()
+                .filter { meta, bin ->
+                    meta.domain in ["eukarya", "unclassified"]
+                }
+                .map { meta, bin ->
+                    def meta_new = meta + [id: bin.getBaseName()]
+                    [ meta_new, bin ]
+                }
+
+            METAEUK_EASYPREDICT (ch_bins_for_metaeuk, ch_metaeuk_db)
+            ch_versions = ch_versions.mix(METAEUK_EASYPREDICT.out.versions)
+        }
     }

     CUSTOM_DUMPSOFTWAREVERSIONS (
@@ -752,57 +971,71 @@ workflow MAG {

     workflow_summary    = WorkflowMag.paramsSummaryMultiqc(workflow, summary_params)
     ch_workflow_summary = Channel.value(workflow_summary)

-    // Currently not used due to local MultiQC module
-    //methods_description    = WorkflowMag.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description)
-    //ch_methods_description = Channel.value(methods_description)
+    methods_description    = WorkflowMag.methodsDescriptionText(workflow, 
ch_multiqc_custom_methods_description, params) + ch_methods_description = Channel.value(methods_description) ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - /* //This is the template input with the nf-core module + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_RAW.out.zip.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMMED.out.zip.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_bowtie2_removal_host_multiqc.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_quast_multiqc.collect().ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_bowtie2_assembly_multiqc.collect().ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_busco_multiqc.collect().ifEmpty([])) - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - */ + if (!params.assembly_input) { + + if ( !params.skip_clipping && params.clip_tool == 'adapterremoval' ) { + ch_multiqc_files = ch_multiqc_files.mix(ADAPTERREMOVAL_PE.out.settings.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ADAPTERREMOVAL_SE.out.settings.collect{it[1]}.ifEmpty([])) + + } else if ( !params.skip_clipping && params.clip_tool == 'fastp' ) { + ch_multiqc_files = ch_multiqc_files.mix(FASTP.out.json.collect{it[1]}.ifEmpty([])) + } + + if (!(params.keep_phix && params.skip_clipping && !(params.host_genome || params.host_fasta))) { + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMMED.out.zip.collect{it[1]}.ifEmpty([])) + } + + if ( params.host_fasta || params.host_genome ) { + ch_multiqc_files = ch_multiqc_files.mix(BOWTIE2_HOST_REMOVAL_ALIGN.out.log.collect{it[1]}.ifEmpty([])) + } + + if(!params.keep_phix) { + ch_multiqc_files = ch_multiqc_files.mix(BOWTIE2_PHIX_REMOVAL_ALIGN.out.log.collect{it[1]}.ifEmpty([])) + } + + } + + ch_multiqc_files = ch_multiqc_files.mix(CENTRIFUGE.out.kreport.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2.out.report.collect{it[1]}.ifEmpty([])) - ch_multiqc_readprep = Channel.empty() + if (!params.skip_quast){ + ch_multiqc_files = ch_multiqc_files.mix(QUAST.out.report.collect().ifEmpty([])) - if (!params.skip_clipping) { - if ( params.clip_tool == "fastp") { - ch_multiqc_readprep = ch_multiqc_readprep.mix(FASTP.out.json.collect{it[1]}.ifEmpty([])) - } else if ( params.clip_tool == "adapterremoval" ) { - ch_multiqc_readprep = ch_multiqc_readprep.mix(ADAPTERREMOVAL_PE.out.settings.collect{it[1]}.ifEmpty([]), ADAPTERREMOVAL_SE.out.settings.collect{it[1]}.ifEmpty([])) + if ( !params.skip_binning ) { + ch_multiqc_files = ch_multiqc_files.mix(QUAST_BINS.out.dir.collect().ifEmpty([])) } } - ch_fastqc_trimmed_multiqc = Channel.empty() - if (!(params.keep_phix && params.skip_clipping && !(params.host_genome || params.host_fasta))) { - ch_fastqc_trimmed_multiqc = FASTQC_TRIMMED.out.zip.collect{it[1]}.ifEmpty([]) + if ( !params.skip_binning || params.ancient_dna ) { + ch_multiqc_files = 
ch_multiqc_files.mix(BINNING_PREPARATION.out.bowtie2_assembly_multiqc.collect().ifEmpty([])) + } + + if (!params.skip_binning && !params.skip_prokka){ + ch_multiqc_files = ch_multiqc_files.mix(PROKKA.out.txt.collect{it[1]}.ifEmpty([])) + } + + if (!params.skip_binning && !params.skip_binqc && params.binqc_tool == 'busco'){ + ch_multiqc_files = ch_multiqc_files.mix(BUSCO_QC.out.multiqc.collect().ifEmpty([])) } + MULTIQC ( ch_multiqc_files.collect(), - ch_multiqc_custom_config.collect().ifEmpty([]), - FASTQC_RAW.out.zip.collect{it[1]}.ifEmpty([]), - ch_fastqc_trimmed_multiqc.collect().ifEmpty([]), - ch_bowtie2_removal_host_multiqc.collect{it[1]}.ifEmpty([]), - ch_quast_multiqc.collect().ifEmpty([]), - ch_bowtie2_assembly_multiqc.collect().ifEmpty([]), - ch_busco_multiqc.collect().ifEmpty([]), - ch_multiqc_readprep.collect().ifEmpty([]) + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() ) + multiqc_report = MULTIQC.out.report.toList() }
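The refactor above repeatedly replaces meta.clone() followed by field assignment with Groovy's copy-on-write map operators (meta - meta.subMap('run'), meta + [assembler: 'MEGAHIT'], and so on). A minimal, self-contained Groovy sketch of that idiom, with purely illustrative map contents that are not taken from the pipeline:

    // Drop a key without mutating the original map: Map.minus() removes the
    // entries returned by subMap() and yields a brand-new map.
    def meta     = [id: 'sample1', run: 2, group: '0', single_end: false]
    def meta_new = meta - meta.subMap('run')

    // Add or override a key, again producing a new map via Map.plus().
    def meta_tagged = meta_new + [assembler: 'MEGAHIT']

    assert meta.run == 2                       // the source map is unchanged
    assert !meta_new.containsKey('run')
    assert meta_tagged.assembler == 'MEGAHIT'

Because plus() and minus() return fresh maps, the original meta stays untouched even when the same map instance is shared by several channel elements, which is presumably why the diff favours this form over clone() plus in-place assignment.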