diff --git a/.editorconfig b/.editorconfig
index b78de6e6..b6b31907 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -8,7 +8,7 @@ trim_trailing_whitespace = true
indent_size = 4
indent_style = space
-[*.{md,yml,yaml,html,css,scss,js,cff}]
+[*.{md,yml,yaml,html,css,scss,js}]
indent_size = 2
# These files are edited and tested upstream in nf-core/modules
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 6e0ea2a2..2ac5f2d9 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -116,4 +116,3 @@ To get started:
Devcontainer specs:
- [DevContainer config](.devcontainer/devcontainer.json)
-- [Dockerfile](.devcontainer/Dockerfile)
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index 491a2a47..146d5516 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -42,9 +42,9 @@ body:
attributes:
label: System information
description: |
- * Nextflow version _(eg. 22.10.1)_
+ * Nextflow version _(eg. 23.04.0)_
* Hardware _(eg. HPC, Desktop, Cloud)_
* Executor _(eg. slurm, local, awsbatch)_
- * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_
+ * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_
* OS _(eg. CentOS Linux, macOS, Linux Mint)_
* Version of nf-core/mag _(eg. 1.1, 1.5, 1.8.2)_
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 2b1d3b46..46acbff1 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -15,7 +15,8 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/mag/
- [ ] This comment contains a description of changes (with reason).
- [ ] If you've fixed a bug or added code that should be tested, add tests!
-- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/mag/tree/master/.github/CONTRIBUTING.md)- [ ] If necessary, also make a PR on the nf-core/mag _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository.
+- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/mag/tree/master/.github/CONTRIBUTING.md)
+- [ ] If necessary, also make a PR on the nf-core/mag _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository.
- [ ] Make sure your code lints (`nf-core lint`).
- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `).
- [ ] Usage Documentation in `docs/usage.md` is updated.
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml
index 19051df4..c53ab09d 100644
--- a/.github/workflows/awsfulltest.yml
+++ b/.github/workflows/awsfulltest.yml
@@ -14,18 +14,23 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Launch workflow via tower
- uses: nf-core/tower-action@v3
+ uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+ revision: ${{ github.sha }}
workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/mag/work-${{ github.sha }}
parameters: |
{
+ "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}",
"outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/mag/results-${{ github.sha }}"
}
- profiles: test_full,aws_tower
+ profiles: test_full
+
- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
- path: tower_action_*.log
+ path: |
+ tower_action_*.log
+ tower_action_*.json
diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml
index 7a6ff7ef..be11af11 100644
--- a/.github/workflows/awstest.yml
+++ b/.github/workflows/awstest.yml
@@ -12,18 +12,22 @@ jobs:
steps:
# Launch workflow using Tower CLI tool action
- name: Launch workflow via tower
- uses: nf-core/tower-action@v3
+ uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+ revision: ${{ github.sha }}
workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/mag/work-${{ github.sha }}
parameters: |
{
"outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/mag/results-test-${{ github.sha }}"
}
- profiles: test,aws_tower
+ profiles: test
+
- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
- path: tower_action_*.log
+ path: |
+ tower_action_*.log
+ tower_action_*.json
diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml
index e08454a2..0d205d33 100644
--- a/.github/workflows/branch.yml
+++ b/.github/workflows/branch.yml
@@ -13,7 +13,7 @@ jobs:
- name: Check PRs
if: github.repository == 'nf-core/mag'
run: |
- { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/mag ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]]
+ { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/mag ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]]
# If the above check failed, post a comment on the PR explaining the failure
# NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b95b00ef..3aaa6f3e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,7 +24,7 @@ jobs:
strategy:
matrix:
NXF_VER:
- - "22.10.1"
+ - "23.04.0"
- "latest-everything"
steps:
- name: Free some space
@@ -61,6 +61,7 @@ jobs:
test_ancient_dna,
test_adapterremoval,
test_binrefinement,
+ test_virus_identification,
]
steps:
- name: Free some space
diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml
new file mode 100644
index 00000000..694e90ec
--- /dev/null
+++ b/.github/workflows/clean-up.yml
@@ -0,0 +1,24 @@
+name: "Close user-tagged issues and PRs"
+on:
+ schedule:
+ - cron: "0 0 * * 0" # Once a week
+
+jobs:
+ clean-up:
+ runs-on: ubuntu-latest
+ permissions:
+ issues: write
+ pull-requests: write
+ steps:
+ - uses: actions/stale@v7
+ with:
+ stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days."
+ stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful."
+ close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity."
+ days-before-stale: 30
+ days-before-close: 20
+ days-before-pr-close: -1
+ any-of-labels: "awaiting-changes,awaiting-feedback"
+ exempt-issue-labels: "WIP"
+ exempt-pr-labels: "WIP"
+ repo-token: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 858d622e..888cb4bc 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -78,7 +78,7 @@ jobs:
- uses: actions/setup-python@v4
with:
- python-version: "3.7"
+ python-version: "3.8"
architecture: "x64"
- name: Install dependencies
diff --git a/.gitpod.yml b/.gitpod.yml
index 85d95ecc..25488dcc 100644
--- a/.gitpod.yml
+++ b/.gitpod.yml
@@ -1,4 +1,9 @@
image: nfcore/gitpod:latest
+tasks:
+ - name: Update Nextflow and setup pre-commit
+ command: |
+ pre-commit install --install-hooks
+ nextflow self-update
vscode:
extensions: # based on nf-core.nf-core-extensionpack
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..0c31cdb9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,5 @@
+repos:
+ - repo: https://github.com/pre-commit/mirrors-prettier
+ rev: "v2.7.1"
+ hooks:
+ - id: prettier
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0358ddff..0ca54895 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,56 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## 2.4.0 - 2023-09-25
+
+### `Added`
+
+- [#497](https://github.com/nf-core/mag/pull/497) - Adds support for pointing at a local db for krona, using the parameter `--krona_db` (by @willros).
+- [#395](https://github.com/nf-core/mag/pull/395) - Adds support for fast domain-level classification of bins using Tiara, to allow bins to be separated into eukaryotic and prokaryotic-specific processes.
+- [#422](https://github.com/nf-core/mag/pull/422) - Adds support for normalization of read depth with BBNorm (added by @erikrikarddaniel and @fabianegli)
+- [#439](https://github.com/nf-core/mag/pull/439) - Adds ability to enter the pipeline at the binning stage by providing a CSV of pre-computed assemblies (by @prototaxites)
+- [#459](https://github.com/nf-core/mag/pull/459) - Adds ability to skip damage correction step in the ancient DNA workflow and just run pyDamage (by @jfy133)
+- [#364](https://github.com/nf-core/mag/pull/364) - Adds geNomad nf-core modules for identifying viruses in assemblies (by @PhilPalmer and @CarsonJM)
+- [#481](https://github.com/nf-core/mag/pull/481) - Adds MetaEuk for annotation of eukaryotic MAGs, and MMSeqs2 to enable downloading databases for MetaEuk (by @prototaxites)
+- [#437](https://github.com/nf-core/mag/pull/429) - `--gtdb_db` also now supports directory input of a pre-uncompressed GTDB archive directory (reported by @alneberg, fix by @jfy133)
+- [#494](https://github.com/nf-core/mag/pull/494) - Adds support for saving the BAM files from Bowtie2 mapping of input reads back to assembly (fix by @jfy133)
+
+### `Changed`
+
+- [#428](https://github.com/nf-core/mag/pull/428) [#467](https://github.com/nf-core/mag/pull/467) - Update to nf-core 2.8, 2.9 `TEMPLATE` (by @jfy133)
+- [#429](https://github.com/nf-core/mag/pull/429) - Replaced the hardcoded CheckM database auto-download URL with a parameter (reported by @erikrikarddaniel, fix by @jfy133)
+- [#441](https://github.com/nf-core/mag/pull/441) - Deactivated CONCOCT in AWS 'full test' due to very long runtime (fix by @jfy133).
+- [#442](https://github.com/nf-core/mag/pull/442) - Remove warning when BUSCO finds no genes in bins, as this can be expected in some datasets (reported by @Lumimar, fix by @jfy133).
+- [#444](https://github.com/nf-core/mag/pull/444) - Moved BUSCO bash code to script (by @jfy133)
+- [#477](https://github.com/nf-core/mag/pull/477) - `--gtdb` parameter is split into `--skip_gtdbtk` and `--gtdb_db` to allow finer control over GTDB database retrieval (fix by @jfy133)
+- [#500](https://github.com/nf-core/mag/pull/500) - Temporarily disabled downstream processing of both refined and raw bins due to bug (by @jfy133)
+
+### `Fixed`
+
+- [#496](https://github.com/nf-core/mag/pull/496) - Fix help text for parameters `--bowtie2_mode`, `spades_options` and `megahit_options` (by @willros)
+- [#400](https://github.com/nf-core/mag/pull/400) - Fix duplicated Zenodo badge in README (by @jfy133)
+- [#406](https://github.com/nf-core/mag/pull/406) - Fix CheckM database always downloading, regardless of whether CheckM is selected (by @jfy133)
+- [#419](https://github.com/nf-core/mag/pull/419) - Fix bug with busco_clean parameter, where it is always activated (by @prototaxites)
+- [#426](https://github.com/nf-core/mag/pull/426) - Fixed typo in help text for parameters `--host_genome` and `--host_fasta` (by @tillenglert)
+- [#434](https://github.com/nf-core/mag/pull/434) - Fix location of samplesheet for AWS full tests (reported by @Lfulcrum, fix by @jfy133)
+- [#438](https://github.com/nf-core/mag/pull/438) - Fixed version inconsistency between conda and containers for GTDBTK_CLASSIFYWF (by @jfy133)
+- [#439](https://github.com/nf-core/mag/pull/445) - Fix bug in assembly input (by @prototaxites)
+- [#447](https://github.com/nf-core/mag/pull/447) - Remove `default: None` from parameter schema (by @drpatelh)
+- [#449](https://github.com/nf-core/mag/pull/447) - Fix results file overwriting in Ancient DNA workflow (reported by @alexhbnr, fix by @jfy133)
+- [#470](https://github.com/nf-core/mag/pull/470) - Fix binning preparation running even when binning was requested to be skipped (reported by @prototaxites, fix by @jfy133)
+- [#480](https://github.com/nf-core/mag/pull/480) - Improved `-resume` reliability through better meta map preservation (reported by @prototaxites, fix by @jfy133)
+- [#493](https://github.com/nf-core/mag/pull/493) - Update `METABAT2` nf-core module so that it reduces the number of unnecessary file moves, enabling virtual filesystems (fix by @adamrtalbot)
+- [#500](https://github.com/nf-core/mag/pull/500) - Fix MaxBin2 bins not being saved in the results directory properly (reported by @Perugolate, fix by @jfy133)
+
+### `Dependencies`
+
+| Tool | Previous version | New version |
+| -------- | ---------------- | ----------- |
+| BCFtools | 1.16 | 1.17 |
+| SAMtools | 1.16.1 | 1.17 |
+| fastp | 0.23.2 | 0.23.4 |
+| MultiQC | 1.14 | 1.15 |
+
## v2.3.2 - [2023-06-23]
### `Fixed`
@@ -36,6 +86,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#373](https://github.com/nf-core/mag/pull/373) - Removed parameter `--enable_conda`. Updated local modules to new conda syntax and updated nf-core modules (by @skrakau)
- [#385](https://github.com/nf-core/mag/pull/385) - CAT also now runs on unbinned contigs as well as binned contigs (added by @jfy133)
- [#399](https://github.com/nf-core/mag/pull/399/files) - Removed undocumented BUSCO_PLOT process (previously generated `*.busco_figure.png` plots unsuitable for metagenomics) (by @skrakau).
+- [#416](https://github.com/nf-core/mag/pull/416) - Use GTDBTK_CLASSIFYWF nf-core module instead of local module (added by @alxndrdiaz)
### `Fixed`
diff --git a/CITATIONS.md b/CITATIONS.md
index 09d165b3..846609b5 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -14,6 +14,8 @@
> Schubert, M., Lindgreen, S., and Orlando, L. 2016. "AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging." BMC Research Notes 9 (February): 88. doi: 10.1186/s13104-016-1900-2
+- [BBnorm/BBTools](http://sourceforge.net/projects/bbmap/)
+
- [BCFtools](https://doi.org/10.1093/gigascience/giab008)
> Danecek, Petr, et al. "Twelve years of SAMtools and BCFtools." Gigascience 10.2 (2021): giab008. doi: 10.1093/gigascience/giab008
@@ -52,12 +54,18 @@
- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+ > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. Available online https://www.bioinformatics.babraham.ac.uk/projects/fastqc/.
+
- [Filtlong](https://github.com/rrwick/Filtlong)
- [Freebayes](https://arxiv.org/abs/1207.3907)
> Garrison E, Marth G. Haplotype-based variant detection from short-read sequencing. arXiv preprint arXiv:1207.3907 [q-bio.GN] 2012
+- [geNomad](https://doi.org/10.1101/2023.03.05.531206)
+
+ > Camargo, A. P., et al. (2023). You can move, but you can’t hide: identification of mobile genetic elements with geNomad. bioRxiv preprint. doi: https://doi.org/10.1101/2023.03.05.531206
+
- [GTDB-Tk](https://doi.org/10.1093/bioinformatics/btz848)
> Chaumeil, P. A., Mussig, A. J., Hugenholtz, P., & Parks, D. H. (2020). GTDB-Tk: a toolkit to classify genomes with the Genome Taxonomy Database. Bioinformatics , 36(6), 1925–1927. doi: 10.1093/bioinformatics/btz848.
@@ -86,6 +94,14 @@
> Kang, D. D., Li, F., Kirton, E., Thomas, A., Egan, R., An, H., & Wang, Z. (2019). MetaBAT 2: an adaptive binning algorithm for robust and efficient genome reconstruction from metagenome assemblies. PeerJ, 7, e7359. doi: 10.7717/peerj.7359.
+- [MetaEuk](https://doi.org/10.1186/s40168-020-00808-x)
+
+  > Levy Karin, E., Mirdita, M. & Söding, J. MetaEuk—sensitive, high-throughput gene discovery, and annotation for large-scale eukaryotic metagenomics. Microbiome 8, 48 (2020). https://doi.org/10.1186/s40168-020-00808-x
+
+- [MMseqs2](https://www.nature.com/articles/nbt.3988)
+
+  > Steinegger, M., Söding, J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol 35, 1026–1028 (2017). https://doi.org/10.1038/nbt.3988
+
- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
@@ -116,10 +132,16 @@
> Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., … 1000 Genome Project Data Processing Subgroup. (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics , 25(16), 2078–2079. doi: 10.1093/bioinformatics/btp352.
+- [Seqtk](https://github.com/lh3/seqtk)
+
- [SPAdes](https://doi.org/10.1101/gr.213959.116)
> Nurk, S., Meleshko, D., Korobeynikov, A., & Pevzner, P. A. (2017). metaSPAdes: a new versatile metagenomic assembler. Genome research, 27(5), 824-834. doi: 10.1101/gr.213959.116.
+- [Tiara](https://doi.org/10.1093/bioinformatics/btab672)
+
+ > Karlicki, M., Antonowicz, S., Karnkowska, A., 2022. Tiara: deep learning-based classification system for eukaryotic sequences. Bioinformatics 38, 344–350. doi: 10.1093/bioinformatics/btab672
+
## Data
- [Full-size test data](https://doi.org/10.1038/s41587-019-0191-2)
@@ -141,5 +163,8 @@
- [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241)
+ > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241.
+
- [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
+
> Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675.
diff --git a/README.md b/README.md
index c851487d..3ed797e8 100644
--- a/README.md
+++ b/README.md
@@ -2,16 +2,16 @@
[![GitHub Actions CI Status](https://github.com/nf-core/mag/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/mag/actions?query=workflow%3A%22nf-core+CI%22)
[![GitHub Actions Linting Status](https://github.com/nf-core/mag/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/mag/actions?query=workflow%3A%22nf-core+linting%22)
-[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mag/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3589527-1073c8)](https://doi.org/10.5281/zenodo.3589527)
+[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mag/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3589527-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3589527)
[![Cite Publication](https://img.shields.io/badge/Cite%20Us!-Cite%20Publication-orange)](https://doi.org/10.1093/nargab/lqac007)
-[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
+[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/mag)
-[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23mag-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/mag)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)
+[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23mag-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/mag)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)
## Introduction
@@ -21,51 +21,35 @@
-The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
-
-On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/mag/results).
-
## Pipeline summary
-By default, the pipeline currently performs the following: it supports both short and long reads, quality trims the reads and adapters with [fastp](https://github.com/OpenGene/fastp) and [Porechop](https://github.com/rrwick/Porechop), and performs basic QC with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
+By default, the pipeline currently performs the following: it supports both short and long reads, quality trims the reads and adapters with [fastp](https://github.com/OpenGene/fastp) and [Porechop](https://github.com/rrwick/Porechop), performs basic QC with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), and merges multiple sequencing runs.
+
The pipeline then:
- assigns taxonomy to reads using [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) and/or [Kraken2](https://github.com/DerrickWood/kraken2/wiki)
- performs assembly using [MEGAHIT](https://github.com/voutcn/megahit) and [SPAdes](http://cab.spbu.ru/software/spades/), and checks their quality using [Quast](http://quast.sourceforge.net/quast)
- (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html)
-- predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal)
+- predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://github.com/soedinglab/metaeuk)
- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/).
+- performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes)
- optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool)
-- assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT)
+- assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or eukaryotes with [Tiara](https://github.com/ibe-uw/tiara)
Furthermore, the pipeline creates various reports in the results directory specified, including a [MultiQC](https://multiqc.info/) report summarizing some of the findings and software versions.
-## Quick Start
-
-1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.1`)
-
-2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_.
-
-3. Download the pipeline and test it on a minimal dataset with a single command:
-
- ```bash
- nextflow run nf-core/mag -profile test,YOURPROFILE --outdir
- ```
+## Usage
- Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
+> **Note**
+> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how
+> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline)
+> with `-profile test` before running the workflow on actual data.
- > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`.
- > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
- > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
- > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.
-
-4. Start running your own analysis!
-
- ```bash
- nextflow run nf-core/mag -profile --input '*_R{1,2}.fastq.gz' --outdir
- ```
+```bash
+nextflow run nf-core/mag -profile --input '*_R{1,2}.fastq.gz' --outdir
+```
- or
+or
```bash
nextflow run nf-core/mag -profile --input samplesheet.csv --outdir
@@ -73,9 +57,18 @@ nextflow run nf-core/mag -profile 
+> **Warning:**
+> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
+> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
+> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
+
+For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/mag/usage) and the [parameter documentation](https://nf-co.re/mag/parameters).
+
+## Pipeline output
-The nf-core/mag pipeline comes with documentation about the pipeline [usage](https://nf-co.re/mag/usage), [parameters](https://nf-co.re/mag/parameters) and [output](https://nf-co.re/mag/output). Detailed information about how to specify the input can be found under [input specifications](https://nf-co.re/mag/usage#input_specifications).
+To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/mag/results) tab on the nf-core website pipeline page.
+For more details about the output files and reports, please refer to the
+[output documentation](https://nf-co.re/mag/output).
### Group-wise co-assembly and co-abundance computation
@@ -100,6 +93,7 @@ We thank the following people for their extensive assistance in the development
- [Maxime Garcia](https://github.com/MaxUlysse)
- [Michael L Heuer](https://github.com/heuermh)
- [Alex Hübner](https://github.com/alexhbnr)
+- [Jim Downie](https://github.com/prototaxites)
## Contributions and Support
diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml
index 1b18c63e..d3fe4abe 100644
--- a/assets/methods_description_template.yml
+++ b/assets/methods_description_template.yml
@@ -3,18 +3,22 @@ description: "Suggested text and references to use when describing pipeline usag
section_name: "nf-core/mag Methods Description"
section_href: "https://github.com/nf-core/mag"
plot_type: "html"
-## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline
+## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline
## You inject any metadata in the Nextflow '${workflow}' object
data: |
Methods
-
Data was processed using nf-core/mag v${workflow.manifest.version} (${doi_text}; Krakau et al., 2022) of the nf-core collection of workflows (Ewels et al., 2020).
The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:
${workflow.commandLine}
+
${tool_citations}
References
-
Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820
-
Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x
+
Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. doi: 10.1038/nbt.3820
+
Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x
Krakau, S., Straub, D., Gourlé, H., Gabernet, G., & Nahnsen, S. (2022). nf-core/mag: a best-practice pipeline for metagenome hybrid assembly and binning. NAR Genomics and Bioinformatics, 4(1). https://doi.org/10.1038/s41587-020-0439-x
+
Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7
+
da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192
+ ${tool_bibliography}
Notes:
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 97e1f26e..4909cb1a 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,7 +1,7 @@
report_comment: >
- This report has been generated by the nf-core/mag
+ This report has been generated by the nf-core/mag
analysis pipeline. For information about how to interpret these results, please see the
- documentation.
+ documentation.
report_section_order:
"nf-core-mag-methods-description":
order: -1000
@@ -14,22 +14,78 @@ export_plots: true
data_format: "yaml"
+run_modules:
+ - fastqc
+ - fastp
+ - adapterRemoval
+ - custom_content
+ - bowtie2
+ - busco
+ - quast
+ - kraken
+ - prokka
+
+## Module order
top_modules:
- "fastqc":
name: "FastQC: raw reads"
path_filters_exclude:
- "*trimmed*"
- "fastp"
- - "adapterRemoval":
- - custom_content
+ - "adapterRemoval"
- "fastqc":
name: "FastQC: after preprocessing"
info: "After trimming and, if requested, contamination removal."
path_filters:
- "*trimmed*"
+ - "bowtie2":
+ name: "Bowtie2: PhiX removal"
+ info: "Mapping statistics of reads mapped against PhiX and subsequently removed."
+ path_filters:
+ - "*_phix_removed.bowtie2.log"
+ - "bowtie2":
+ name: "Bowtie2: host removal"
+ info: "Mapping statistics of reads mapped against host genome and subsequently removed."
+ path_filters:
+ - "*_host_removed.bowtie2.log"
+ - "bowtie2":
+ name: "Bowtie2: assembly"
+ info: "Mapping statistics of reads mapped against assemblies."
+ path_filters_exclude:
+ - "*_host_removed.bowtie2.log"
+ - "*_phix_removed.bowtie2.log"
+ - "kraken":
+ name: "Kraken2"
+ anchor: "Kraken2"
+ target: "Kraken2"
+ doi: "10.1101/gr.210641.116"
+ path_filters:
+ - "*.kraken2_report.txt"
+ - "kraken":
+ name: "Centrifuge"
+ anchor: "centrifuge"
+ target: "Centrifuge"
+ doi: "10.1101/gr.210641.116"
+ info: "is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index. Note: Figure title"
+ extra: "ℹ️: plot title will say Kraken2 due to Centrifuge producing the same output format as Kraken. If activated, see the actual Kraken2 results in the section above."
+ path_filters:
+ - "*.centrifuge_kreport.txt"
+ - "porechop"
+ - "bcftools"
+ - custom_content
- "busco":
info: "assesses genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs. In case BUSCO's automated lineage selection was used, only generic results for the selected domain are shown and only for genome bins and kept, unbinned contigs for which the BUSCO analysis was successfull, i.e. not for contigs for which no BUSCO genes could be found. Bins for which a specific virus lineage was selected are also not shown."
- - "quast"
+ - "quast":
+ name: "QUAST: assembly"
+ info: "Assembly statistics of raw assemblies."
+ path_filters:
+ - "*rawassemblies.tsv"
+ - "quast":
+ name: "QUAST: bins"
+ info: "Assembly statistics of binned assemblies."
+ path_filters_exclude:
+ - "*rawassemblies.tsv"
+ - "prokka"
custom_data:
host_removal:
@@ -42,12 +98,145 @@ custom_data:
title: "Bowtie 2: reads mapped against host reference"
ylab: "# Reads"
+## Module search patterns
sp:
host_removal:
fn: "host_removal_metrics.tsv"
adapterRemoval:
- fn: "*_ar2_*.log"
+ fn: "*_ar2.settings"
+ kraken:
+    fn_re: ".*(kraken2|centrifuge).*report.txt"
+ quast:
+ fn_re: "report.*.tsv"
+## File name cleaning
extra_fn_clean_exts:
- ".bowtie2"
- "_ar2"
+ - "host_removed"
+ - "phix_removed"
+ - "centrifuge_kreport"
+ - "_fastp"
+
+## Prettification
+custom_logo: "nf-core-mag_logo_light.png"
+custom_logo_url: https://github.com/nf-core/mag/
+custom_logo_title: "nf-core/mag"
+
+## Tool specific configuration
+prokka_fn_snames: True
+
+## General Stats customisation
+table_columns_visible:
+ "FastQC: raw reads":
+ avg_sequence_length: True
+ "FastQC: after preprocessing":
+ avg_sequence_length: True
+ "fastp":
+ pct_duplication: False
+ after_filtering_q30_rate: False
+ after_filtering_q30_bases: False
+    filtering_result_passed_filter_reads: True
+ after_filtering_gc_content: False
+ pct_surviving: True
+ pct_adapter: True
+ "Bowtie2: assembly": False
+ "Kraken2": False
+ "Centrifuge": False
+ "QUAST: assembly":
+ N75: True
+ L50: True
+ L75: True
+ "Largest contig": True
+ "Total length": True
+ N50: True
+ "QUAST: bins":
+ N75: True
+ L50: True
+ L75: True
+ "Largest contig": True
+ "Total length": True
+ N50: True
+ "Prokka": False
+
+table_columns_placement:
+ "FastQC: raw reads":
+ percent_duplicates: 1000
+ percent_gc: 1100
+ avg_sequence_length: 1200
+ median_sequence_length: 1300
+ total_sequences: 1400
+ percent_fails: 1500
+ "FastQC: after preprocessing":
+ percent_duplicates: 2000
+ percent_gc: 2100
+ avg_sequence_length: 2200
+ median_sequence_length: 2300
+ total_sequences: 2400
+ percent_fails: 2500
+ "fastp":
+ pct_duplication: 3000
+ after_filtering_q30_rate: 3100
+ after_filtering_q30_bases: 3200
+ filtering_result_passed_filter_reads: 3300
+ after_filtering_gc_content: 3400
+ pct_surviving: 3500
+ pct_adapter: 3600
+ "Adapter Removal":
+ percent_aligned: 4000
+ aligned_total: 4100
+ percent_discarded: 4200
+ "Bowtie2: PhiX removal":
+ overall_alignment_rate: 5000
+ "Bowtie2: host removal":
+ overall_alignment_rate: 6000
+ "Bowtie2: assembly":
+ overall_alignment_rate: 7000
+ "Kraken2":
+ "% root": 8000
+ "% Top 5": 8100
+ "% Unclassified": 8200
+ "Centrifuge":
+ "% root": 9000
+ "% Top 5": 9100
+ "% Unclassified": 9200
+ "QUAST: assembly":
+ "N50": 10000
+ "Total length": 11000
+ "QUAST: bins":
+ "N50": 10000
+ "Total length": 11000
+ Prokka:
+ contigs: 20000
+ bases: 21000
+ CDS: 22000
+ organism: 23000
+
+table_columns_name:
+ "FastQC: raw reads":
+ percent_duplicates: "% Dups (raw)"
+ percent_gc: "% GC (raw)"
+ avg_sequence_length: "Avg. length (raw)"
+ median_sequence_length: "Median length (raw)"
+ total_sequences: "M Seqs (raw)"
+ percent_fails: "% Fails (raw)"
+ "FastQC: after preprocessing":
+ percent_duplicates: "% Dups (processed)"
+ percent_gc: "% GC (processed)"
+ avg_sequence_length: "Avg. length (processed)"
+ median_sequence_length: "Median length (processed)"
+ total_sequences: "M Seqs (processed)"
+ percent_fails: "% Fails (processed)"
+ "Bowtie2: PhiX removal":
+ overall_alignment_rate: "% Aligned (PhiX)"
+ "Bowtie2: host removal":
+ overall_alignment_rate: "% Aligned (Host)"
+ "Bowtie2: assembly":
+ overall_alignment_rate: "% Aligned (Assem.)"
+
+custom_table_header_config:
+ general_stats_table:
+ "Total length":
+ hidden: True
+ N50:
+ hidden: True
diff --git a/assets/nf-core-mag_logo_light.png b/assets/nf-core-mag_logo_light.png
index 26d7ed5d..64276cbe 100644
Binary files a/assets/nf-core-mag_logo_light.png and b/assets/nf-core-mag_logo_light.png differ
diff --git a/assets/slackreport.json b/assets/slackreport.json
index 043d02f2..bc7d3f0c 100644
--- a/assets/slackreport.json
+++ b/assets/slackreport.json
@@ -3,7 +3,7 @@
{
"fallback": "Plain-text summary of the attachment.",
"color": "<% if (success) { %>good<% } else { %>danger<%} %>",
- "author_name": "sanger-tol/readmapping v${version} - ${runName}",
+ "author_name": "nf-core/mag v${version} - ${runName}",
"author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico",
"text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>",
"fields": [
diff --git a/bin/domain_classification.R b/bin/domain_classification.R
new file mode 100755
index 00000000..eb64b312
--- /dev/null
+++ b/bin/domain_classification.R
@@ -0,0 +1,156 @@
+#!/usr/bin/env Rscript
+
+# Written by Jim Downie and released under the MIT license.
+# See git repository (https://github.com/nf-core/mag) for full license text.
+
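+## Example invocation (flag names as defined in the option parser below; file names are illustrative):
+##   domain_classification.R --classification_file tiara.out.txt --contig_to_bin contig2bin.tsv --assembler MEGAHIT --output_prefix sample1
+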
+library(optparse)
+library(tidyverse)
+
+parser <- OptionParser()
+parser <- add_option(parser, c("-t", "--classification_file"),
+ action = "store",
+ type = "character",
+ metavar = "character",
+ help = "The out.txt tsv file of per-contig classifications from Tiara.")
+parser <- add_option(parser, c("-s", "--contig_to_bin"),
+ action = "store",
+ type = "character",
+ metavar = "character",
+ help = "A tsv file with two columns, bin and contig, listing the contig membership for each bin.")
+parser <- add_option(parser, c("-j", "--join_prokaryotes"),
+ action = "store_true",
+ type = "logical",
+ default = TRUE,
+ metavar = "logical",
+    help = "Use a general prokaryote classification instead of separating Archaea and Bacteria.")
+parser <- add_option(parser, c("-a", "--assembler"),
+ action = "store",
+ type = "character",
+ metavar = "character",
+ help = "Assembler used to assemble the contigs. 'MEGAHIT' or 'SPAdes' only.")
+parser <- add_option(parser, c("-o", "--output_prefix"),
+ action = "store",
+ type = "character",
+ metavar = "character",
+ help = "Prefix for the output classification table name.")
+args <- parse_args(parser)
+
+## optparse doesn't have a required flag so exit if we don't get given a file
+if(is.null(args$classification_file)) {
+ stop("Tiara classification file not provided.")
+}
+if(is.null(args$contig_to_bin)) {
+ stop("Contig to bin file not provided.")
+}
+if(is.null(args$assembler)) {
+ stop("Assembler not provided.")
+}
+if(!(args$assembler %in% c("MEGAHIT", "SPAdes"))) {
+ stop("Invalid assembler provided.")
+}
+
+find_classification <- function(probabilities, join_prokaryotes = TRUE) {
+ if(join_prokaryotes) {
+ classifications <- c("prokarya", "eukarya", "organelle", "unknown")
+ } else {
+ classifications <- c("archaea", "bacteria", "eukarya", "organelle", "unknown")
+ }
+ return(classifications[which.max(probabilities)])
+}
+
+classify_bins <- function(tiara, contig2bin, join_prokaryotes, assembler){
+ ## MEGAHIT produces contigs with spaces in the name
+ ## Depending on the binner, everything after the first space is sometimes dropped
+ ## Make sure that we drop everything after a possible space before doing anything else to allow merging
+ if(assembler == "MEGAHIT"){
+ tiara$sequence_id <- word(tiara$sequence_id)
+ contig2bin$sequence_id <- word(contig2bin$sequence_id)
+ }
+ if(join_prokaryotes) {
+ n_classifications <- 4
+ } else {
+ n_classifications <- 5
+ }
+
+ ## combination of left_join and filter collectively eliminate unclassified contigs
+ tiara <- tiara |>
+ left_join(contig2bin) |>
+ filter(!is.na(BinID)) |>
+ select(sequence_id,
+ BinID,
+ Archaea = arc,
+ Bacteria = bac,
+ Eukarya = euk,
+ Organelle = org,
+ Unknown = unk1)
+
+ if(join_prokaryotes) {
+ tiara <- tiara |>
+ mutate(Prokarya = Archaea + Bacteria) |>
+ select(sequence_id, BinID, Prokarya, Eukarya, Organelle, Unknown)
+ }
+
+ ## Identify the columns to softmax
+ prob_columns <- 2:(2 + n_classifications - 1)
+
+    ## Calculate softmax probabilities based on summed bin probabilities for each category
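+    ## i.e. for each bin, p_k = exp(s_k) / sum_j exp(s_j), where s_k is the summed per-contig
+    ## Tiara probability for category k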
+ softmax_probabilities <- tiara |>
+ group_by(BinID) |>
+ summarise(across(all_of(prob_columns), sum), .groups = "drop") |>
+ rowwise() |>
+ mutate(denominator = sum(exp(c_across(all_of(prob_columns))))) |>
+ mutate(across(all_of(prob_columns), \(x) exp(x)/denominator),
+ classification = find_classification(c_across(all_of(prob_columns)),
+ join_prokaryotes = join_prokaryotes)) |>
+ select(-denominator)
+
+ ## A bin may have no classified contigs if all contigs are below the minimum
+ ## Tiara length threshold
+ all_bins <- unique(contig2bin$BinID)
+ unclassified_bins <- all_bins[!(all_bins %in% softmax_probabilities$BinID)]
+
+ ## Assign these as unclassified
+ if(length(unclassified_bins) > 0) {
+ if(join_prokaryotes == TRUE){
+ unclassified_bins_tbl <- tibble(
+ BinID = unclassified_bins,
+ Prokarya = NA,
+ Eukarya = NA,
+ Organelle = NA,
+ Unknown = NA,
+ classification = "unknown"
+ )
+ } else {
+ unclassified_bins_tbl <- tibble(
+ BinID = unclassified_bins,
+ Bacteria = NA,
+ Archaea = NA,
+ Eukarya = NA,
+ Organelle = NA,
+ Unknown = NA,
+ classification = "unknown"
+ )
+ }
+ softmax_probabilities <- bind_rows(softmax_probabilities, unclassified_bins_tbl)
+ }
+
+ return(softmax_probabilities)
+}
+
+classifications <- read_tsv(args$classification_file, na = c("NA", "n/a"))
+contig_to_bin <- read_tsv(args$contig_to_bin, col_names = c("sequence_id", "BinID"))
+
+results <- classify_bins(tiara = classifications,
+ contig2bin = contig_to_bin,
+ join_prokaryotes = args$join_prokaryotes,
+ assembler = args$assembler)
+
+## Keep just the classifications so we can loop over more easily
+results_basic <- select(results, BinID, classification)
+
+## write outputs
+write_tsv(results, paste0(args$output_prefix, ".binclassification.tsv"))
+write_tsv(results_basic, "bin2classification.tsv", col_names = FALSE)
+
+## write out package versions
+packageVersion("tidyverse") |> as.character() |> writeLines("tidyverse_version.txt")
diff --git a/bin/run_busco.sh b/bin/run_busco.sh
new file mode 100755
index 00000000..9e022e87
--- /dev/null
+++ b/bin/run_busco.sh
@@ -0,0 +1,158 @@
+#! /usr/bin/env bash
+
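+# Positional arguments (mirroring the variable assignments below):
+#   $1  extra BUSCO arguments (lineage selection), passed straight through to the busco call
+#   $2  whether to copy the Augustus config into the working directory (Y/N)
+#   $3  path to a local BUSCO lineage dataset
+#   $4  bin FASTA file to assess
+#   $5  number of CPUs for BUSCO
+#   $6  whether a lineage dataset was provided (Y/N)
+#   $7  whether to clean up temporary BUSCO files afterwards (Y/N)
+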
+p=$1
+cp_augustus_config=$2
+db=$3
+bin=$4
+task_cpus=$5
+lineage_dataset_provided=$6
+busco_clean=$7
+
+# ensure augustus has write access to config directory
+if [ ${cp_augustus_config} = "Y" ]; then
+ cp -r /usr/local/config/ augustus_config/
+ export AUGUSTUS_CONFIG_PATH=augustus_config
+fi
+
+# place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it)
+if [ ${lineage_dataset_provided} = "Y" ]; then
+ mkdir dataset
+ mv ${db} dataset/
+fi
+
+# set nullgob: if pattern matches no files, expand to a null string rather than to itself
+shopt -s nullglob
+
+# only used for saving busco downloads
+most_spec_db="NA"
+
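+# run BUSCO; on success, parse the log to work out which lineage dataset was actually used
+# so that the matching summary files and single-copy BUSCO sequences can be copied out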
+if busco ${p} \
+ --mode genome \
+ --in ${bin} \
+ --cpu ${task_cpus} \
+ --out "BUSCO" >${bin}_busco.log 2>${bin}_busco.err; then
+
+ # get name of used specific lineage dataset
+ summaries=(BUSCO/short_summary.specific.*.BUSCO.txt)
+ if [ ${#summaries[@]} -ne 1 ]; then
+ echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one."
+ exit 1
+ fi
+ [[ $summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]]
+ db_name_spec="${BASH_REMATCH[1]}"
+ most_spec_db=${db_name_spec}
+ echo "Used specific lineage dataset: ${db_name_spec}"
+
+ if [ ${lineage_dataset_provided} = "Y" ]; then
+ cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt
+
+ # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting
+ # report bin as failed to allow consistent warnings within the pipeline for both settings
+ if egrep -q $'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log; then
+ echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log."
+ echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt"
+ fi
+ else
+ # auto lineage selection
+ if { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log &&
+ egrep -q $'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log; } ||
+ { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log &&
+ egrep -q $'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log &&
+ egrep -q $'INFO:\tUsing local lineages directory ' ${bin}_busco.log; }; then
+ # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately
+ # and cause different log messages
+ echo "Domain and specific lineage could be selected by BUSCO."
+ cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt
+
+ db_name_gen=""
+ summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt)
+ if [ ${#summaries_gen[@]} -lt 1 ]; then
+ echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same."
+ cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt
+ db_name_gen=${db_name_spec}
+ else
+ [[ $summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]]
+ db_name_gen="${BASH_REMATCH[1]}"
+ echo "Used generic lineage dataset: ${db_name_gen}"
+ cp BUSCO/short_summary.generic.${db_name_gen}.BUSCO.txt short_summary.domain.${db_name_gen}.${bin}.txt
+ fi
+
+ for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do
+ cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz
+ break
+ done
+ for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do
+ cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz
+ break
+ done
+
+ elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log; then
+ echo "Domain could be selected by BUSCO, but no more specific lineage."
+ cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt
+
+ elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log; then
+ echo "Domain could be selected by BUSCO, but no more specific lineage."
+ cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt
+
+ elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tRunning virus detection pipeline' ${bin}_busco.log; then
+ # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*?
+ echo "Domain could not be selected by BUSCO, but virus dataset was selected."
+ cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt
+ else
+        echo "ERROR: An unexpected case occurred! See ${bin}_busco.log." >&2
+ exit 1
+ fi
+ fi
+
+ for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do
+ cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_spec}.faa.gz
+ break
+ done
+ for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do
+ cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_spec}.fna.gz
+ break
+ done
+
+elif egrep -q $'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err; then
+ echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err."
+ echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt"
+
+elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'ERROR:\tPlacements failed' ${bin}_busco.err; then
+ echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset."
+ echo -e "${bin}\tPlacements failed" >"${bin}_busco.failed_bin.txt"
+
+ message=$(egrep $'INFO:\t\\S+ selected' ${bin}_busco.log)
+ [[ $message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]]
+ db_name_gen="${BASH_REMATCH[1]}"
+ most_spec_db=${db_name_gen}
+ echo "Used generic lineage dataset: ${db_name_gen}"
+ cp BUSCO/auto_lineage/run_${db_name_gen}/short_summary.txt short_summary.domain.${db_name_gen}.${bin}.txt
+
+ for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do
+ cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz
+ break
+ done
+ for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do
+ cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz
+ break
+ done
+
+else
+ echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2
+ exit 1
+fi
+
+# additionally output genes predicted with Prodigal (GFF3)
+if [ -f BUSCO/logs/prodigal_out.log ]; then
+ mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff"
+fi
+
+# output value of most_spec_db
+echo ${most_spec_db} > info_most_spec_db.txt
+
+# if needed delete temporary BUSCO files
+if [ ${busco_clean} = "Y" ]; then
+ find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \;
+ find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \;
+ find . -depth -type d -name "run_*" -execdir rm -rf "{}" +
+fi
diff --git a/bin/split_fasta.py b/bin/split_fasta.py
index c9149f25..87cb9dfa 100755
--- a/bin/split_fasta.py
+++ b/bin/split_fasta.py
@@ -45,10 +45,10 @@
)
# contigs to retain and pool
elif length >= min_length_to_retain_contig:
- pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name))
+ pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
# remaining sequences
else:
- remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name))
+ remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
else:
with open(input_file) as f:
fasta_sequences = SeqIO.parse(f, "fasta")
@@ -64,10 +64,10 @@
)
# contigs to retain and pool
elif length >= min_length_to_retain_contig:
- pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name))
+ pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
# remaining sequences
else:
- remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name))
+ remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
# Sort sequences above threshold by length
df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True)
@@ -77,10 +77,10 @@
for index, row in df_above_threshold.iterrows():
if index + 1 <= max_sequences:
print("write " + out_base + "." + str(index + 1) + ".fa")
- out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"])
+ out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta")
else:
- pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"]))
+ pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description=""))
print("write " + out_base + ".pooled.fa")
SeqIO.write(pooled, out_base + ".pooled.fa", "fasta")
diff --git a/conf/base.config b/conf/base.config
index ca06b1a6..7dec9e28 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -14,7 +14,7 @@ process {
memory = { check_max( 7.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
- errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
+ errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
maxRetries = 3
maxErrors = '-1'
@@ -117,7 +117,7 @@ process {
memory = { check_max (40.GB * task.attempt, 'memory' ) }
time = { check_max (12.h * task.attempt, 'time' ) }
}
- withName: GTDBTK_CLASSIFY {
+ withName: GTDBTK_CLASSIFYWF {
cpus = { check_max (10 * task.attempt, 'cpus' ) }
memory = { check_max (128.GB * task.attempt, 'memory' ) }
time = { check_max (12.h * task.attempt, 'time' ) }
diff --git a/conf/igenomes.config b/conf/igenomes.config
index 7a1b3ac6..3f114377 100644
--- a/conf/igenomes.config
+++ b/conf/igenomes.config
@@ -36,6 +36,14 @@ params {
macs_gsize = "2.7e9"
blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed"
}
+ 'CHM13' {
+ fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa"
+ bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/"
+ bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/"
+ gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/CHM13/Annotation/Genes/genes.gtf"
+ gff = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
+ mito_name = "chrM"
+ }
'GRCm38' {
fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa"
bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/"
diff --git a/conf/modules.config b/conf/modules.config
index e61906cb..08167030 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -27,6 +27,8 @@ process {
mode: params.publish_dir_mode,
pattern: "*.html"
]
+ ext.prefix = { "${meta.id}_run${meta.run}_raw" }
+ tag = { "${meta.id}_run${meta.run}_raw" }
}
withName: FASTP {
@@ -50,6 +52,8 @@ process {
enabled: params.save_clipped_reads
]
]
+ ext.prefix = { "${meta.id}_run${meta.run}_fastp" }
+ tag = { "${meta.id}_run${meta.run}" }
}
withName: ADAPTERREMOVAL_PE {
@@ -72,7 +76,8 @@ process {
enabled: params.save_clipped_reads
]
]
- ext.prefix = { "${meta.id}_ar2" }
+ ext.prefix = { "${meta.id}_run${meta.run}_ar2" }
+ tag = { "${meta.id}_run${meta.run}" }
}
withName: ADAPTERREMOVAL_SE {
@@ -87,11 +92,12 @@ process {
mode: params.publish_dir_mode,
pattern: "*.{settings}"
]
- ext.prefix = { "${meta.id}_ar2" }
+ ext.prefix = { "${meta.id}_run${meta.run}_ar2" }
+ tag = { "${meta.id}_run${meta.run}" }
}
withName: BOWTIE2_PHIX_REMOVAL_ALIGN {
- ext.prefix = { "${meta.id}.phix_removed" }
+ ext.prefix = { "${meta.id}_run${meta.run}_phix_removed" }
publishDir = [
[
path: { "${params.outdir}/QC_shortreads/remove_phix" },
@@ -105,12 +111,13 @@ process {
enabled: params.save_phixremoved_reads
]
]
+ tag = { "${meta.id}_run${meta.run}" }
}
withName: BOWTIE2_HOST_REMOVAL_ALIGN {
ext.args = params.host_removal_verysensitive ? "--very-sensitive" : "--sensitive"
ext.args2 = params.host_removal_save_ids ? "--host_removal_save_ids" : ''
- ext.prefix = { "${meta.id}.host_removed" }
+ ext.prefix = { "${meta.id}_run${meta.run}_host_removed" }
publishDir = [
[
path: { "${params.outdir}/QC_shortreads/remove_host" },
@@ -124,16 +131,40 @@ process {
enabled: params.save_hostremoved_reads
]
]
+ tag = { "${meta.id}_run${meta.run}" }
}
withName: FASTQC_TRIMMED {
ext.args = '--quiet'
- ext.prefix = { "${meta.id}.trimmed" }
+ ext.prefix = { "${meta.id}_run${meta.run}_trimmed" }
publishDir = [
path: { "${params.outdir}/QC_shortreads/fastqc" },
mode: params.publish_dir_mode,
pattern: "*.html"
]
+ tag = { "${meta.id}_run${meta.run}" }
+ }
+
+ withName: BBMAP_BBNORM {
+ ext.args = [
+ params.bbnorm_target ? "target=${params.bbnorm_target}" : '',
+ params.bbnorm_min ? "min=${params.bbnorm_min}" : '',
+ ].join(' ').trim()
+ publishDir = [
+ [
+ path : { "${params.outdir}/bbmap/bbnorm/logs" },
+ enabled: params.save_bbnorm_reads,
+ mode : params.publish_dir_mode,
+ pattern: "*.log"
+ ],
+ [
+ path : { "${params.outdir}/bbmap/bbnorm/"},
+ enabled: params.save_bbnorm_reads,
+ mode : params.publish_dir_mode,
+ pattern: "*.fastq.gz"
+ ]
+ ]
}
withName: PORECHOP {
@@ -143,6 +174,7 @@ process {
pattern: "*_porechop.fastq",
enabled: params.save_porechop_reads
]
+ ext.prefix = { "${meta.id}_run${meta.run}_trimmed" }
}
withName: FILTLONG {
@@ -152,6 +184,7 @@ process {
pattern: "*_lr_filtlong.fastq.gz",
enabled: params.save_filtlong_reads
]
+ ext.prefix = { "${meta.id}_run${meta.run}_lengthfiltered" }
}
withName: NANOLYSE {
@@ -168,6 +201,7 @@ process {
enabled: params.save_lambdaremoved_reads
]
]
+ ext.prefix = { "${meta.id}_run${meta.run}_lambdafiltered" }
}
withName: NANOPLOT_RAW {
@@ -252,29 +286,42 @@ process {
]
}
- withName: BOWTIE2_ASSEMBLY_ALIGN {
- ext.args = params.bowtie2_mode ? params.bowtie2_mode : params.ancient_dna ? '--very-sensitive -N 1' : ''
+ withName: GENOMAD_ENDTOEND {
+ ext.args = [
+ "--cleanup",
+ "--min-score ${params.genomad_min_score}",
+ "--splits ${params.genomad_splits}",
+ ].join(' ').trim()
publishDir = [
- path: { "${params.outdir}/Assembly/${assembly_meta.assembler}/QC/${assembly_meta.id}" },
+ path: { "${params.outdir}/VirusIdentification/geNomad/${meta.id}" },
mode: params.publish_dir_mode,
- pattern: "*.log"
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
- withName: 'MAG_DEPTHS_PLOT|MAG_DEPTHS_SUMMARY|MAG_DEPTHS_PLOT_REFINED' {
+ withName: BOWTIE2_ASSEMBLY_ALIGN {
+ ext.args = params.bowtie2_mode ? params.bowtie2_mode : params.ancient_dna ? '--very-sensitive -N 1' : ''
+ ext.prefix = { "${meta.id}.assembly" }
publishDir = [
- path: { "${params.outdir}/GenomeBinning/depths/bins" },
- mode: params.publish_dir_mode,
- pattern: "*.{png,tsv}"
+ [
+ path: { "${params.outdir}/Assembly/${assembly_meta.assembler}/QC/${assembly_meta.id}" },
+ mode: params.publish_dir_mode,
+ pattern: "*.log"
+ ],
+ [
+ path: { "${params.outdir}/Assembly/${assembly_meta.assembler}/QC/${assembly_meta.id}" },
+ mode: params.publish_dir_mode,
+ pattern: "*.{bam,bai}",
+ enabled: params.save_assembly_mapped_reads
+ ],
]
}
- withName: 'MAG_DEPTHS_SUMMARY_REFINED' {
- ext.prefix = "bin_refined_depths_summary"
+ withName: 'MAG_DEPTHS_PLOT|MAG_DEPTHS_SUMMARY' {
publishDir = [
path: { "${params.outdir}/GenomeBinning/depths/bins" },
mode: params.publish_dir_mode,
- pattern: "*.{tsv}"
+ pattern: "*.{png,tsv}"
]
}
@@ -409,7 +456,7 @@ process {
]
}
- withName: GTDBTK_CLASSIFY {
+ withName: GTDBTK_CLASSIFYWF {
ext.args = "--extension fa"
publishDir = [
path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" },
@@ -430,7 +477,7 @@ process {
withName: PROKKA {
ext.args = "--metagenome"
publishDir = [
- path: { "${params.outdir}/Prokka/${meta.assembler}" },
+ path: { "${params.outdir}/Annotation/Prokka/${meta.assembler}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
@@ -448,7 +495,7 @@ process {
withName: PRODIGAL {
ext.args = "-p meta"
publishDir = [
- path: { "${params.outdir}/Prodigal/${meta.assembler}/${meta.id}" },
+ path: { "${params.outdir}/Annotation/Prodigal/${meta.assembler}/${meta.id}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
@@ -572,7 +619,7 @@ process {
[
path: { "${params.outdir}/GenomeBinning/MaxBin2/bins/" },
mode: params.publish_dir_mode,
- pattern: '*/*.fa.gz'
+ pattern: '*.fa.gz'
],
]
}
@@ -622,6 +669,10 @@ process {
ext.prefix = { "${meta.assembler}-MaxBin2-${meta.id}" }
}
+ withName: DASTOOL_FASTATOCONTIG2BIN_TIARA {
+ ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}" }
+ }
+
withName: DASTOOL_DASTOOL {
publishDir = [
[
@@ -650,6 +701,57 @@ process {
]
}
+ withName: TIARA_TIARA {
+ publishDir = [
+ [
+ path: { "${params.outdir}/Taxonomy/Tiara" },
+ mode: params.publish_dir_mode,
+ pattern: { "${meta.assembler}-${meta.id}.tiara.{txt}" }
+ ],
+ [
+ path: { "${params.outdir}/Taxonomy/Tiara/log" },
+ mode: params.publish_dir_mode,
+ pattern: { "log_${meta.assembler}-${meta.id}.tiara.{txt}" }
+ ]
+ ]
+ ext.args = { "--min_len ${params.tiara_min_length} --probabilities" }
+ ext.prefix = { "${meta.assembler}-${meta.id}.tiara" }
+ }
+
+ withName: TIARA_CLASSIFY {
+ ext.args = { "--join_prokaryotes --assembler ${meta.assembler}" }
+ ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.bin}-${meta.id}" }
+ }
+
+ withName: TIARA_SUMMARY {
+ publishDir = [
+ path: { "${params.outdir}/Taxonomy/" },
+ mode: params.publish_dir_mode,
+ pattern: "tiara_summary.tsv"
+ ]
+ ext.prefix = "tiara_summary"
+ }
+
+ withName: MMSEQS_DATABASES {
+ ext.prefix = { "${params.metaeuk_mmseqs_db.replaceAll("/", "-")}" }
+ publishDir = [
+ path: { "${params.outdir}/Annotation/mmseqs_db/" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+ enabled: params.save_mmseqs_db
+ ]
+ }
+
+ withName: METAEUK_EASYPREDICT {
+ ext.args = ""
+ ext.prefix = { "${meta.id}" }
+ publishDir = [
+ path: { "${params.outdir}/Annotation/MetaEuk/${meta.assembler}/${meta.id}" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
diff --git a/conf/test.config b/conf/test.config
index 21a099a6..348b95d5 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -20,7 +20,7 @@ params {
max_time = '6.h'
// Input data
- input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
+ input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv'
centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz"
kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz"
skip_krona = true
@@ -28,6 +28,6 @@ params {
max_unbinned_contigs = 2
busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
busco_clean = true
- gtdb = false
+ skip_gtdbtk = true
skip_concoct = true
}
diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config
index b4621c81..92d51aec 100644
--- a/conf/test_adapterremoval.config
+++ b/conf/test_adapterremoval.config
@@ -11,8 +11,8 @@
*/
params {
- config_profile_name = 'Test profile for running with AdapterRemoval'
- config_profile_description = 'Minimal test dataset to check pipeline function with AdapterRemoval data'
+ config_profile_name = 'Test profile for running with AdapterRemoval and domain classification'
+ config_profile_description = 'Minimal test dataset to check pipeline function with AdapterRemoval data and domain classification.'
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
@@ -20,14 +20,16 @@ params {
max_time = '6.h'
// Input data
- input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
+ input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.euk.csv'
centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz"
kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz"
+ metaeuk_db = "https://github.com/nf-core/test-datasets/raw/modules/data/proteomics/database/yeast_UPS.fasta"
skip_krona = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
- gtdb = false
+ skip_gtdbtk = true
clip_tool = 'adapterremoval'
skip_concoct = true
+ bin_domain_classification = true
}
diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config
index dcb8f7c9..325362fc 100644
--- a/conf/test_ancient_dna.config
+++ b/conf/test_ancient_dna.config
@@ -27,7 +27,7 @@ params {
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
- gtdb = false
+ skip_gtdbtk = true
ancient_dna = true
binning_map_mode = 'own'
skip_spades = false
diff --git a/conf/test_no_clipping.config b/conf/test_bbnorm.config
similarity index 78%
rename from conf/test_no_clipping.config
rename to conf/test_bbnorm.config
index a4f1881e..5f481adf 100644
--- a/conf/test_no_clipping.config
+++ b/conf/test_bbnorm.config
@@ -11,8 +11,8 @@
*/
params {
- config_profile_name = 'Test profile for skipping all short read preprocessing'
- config_profile_description = 'Minimal test dataset to check pipeline function when all short read preprocessing is skipped.'
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
// Limit resources so that this can run on GitHub Actions
max_cpus = 2
@@ -21,14 +21,20 @@ params {
// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
+ keep_phix = true
+ skip_clipping = true
+ skip_prokka = true
+ skip_prodigal = true
+ skip_quast = true
+ skip_binning = true
centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz"
kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz"
- skip_clipping = true
- keep_phix = true
skip_krona = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
- gtdb = false
- skip_concoct = true
+ busco_clean = true
+ skip_gtdbtk = true
+ bbnorm = true
+ coassemble_group = true
}
diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config
index ddf44ceb..85dda8db 100644
--- a/conf/test_binrefinement.config
+++ b/conf/test_binrefinement.config
@@ -21,14 +21,17 @@ params {
// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
+ assembly_input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/assembly_samplesheet.csv'
centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz"
kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz"
skip_krona = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
- gtdb = false
+ skip_gtdbtk = true
refine_bins_dastool = true
refine_bins_dastool_threshold = 0
- postbinning_input = 'both'
+ // TODO not using 'both' until #489 merged
+ postbinning_input = 'refined_bins_only'
+ busco_clean = true
}
diff --git a/conf/test_busco_auto.config b/conf/test_busco_auto.config
index 9480575c..6479012f 100644
--- a/conf/test_busco_auto.config
+++ b/conf/test_busco_auto.config
@@ -24,7 +24,7 @@ params {
skip_spades = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
- gtdb = false
+ skip_gtdbtk = true
skip_prokka = true
skip_prodigal = true
skip_quast = true
diff --git a/conf/test_full.config b/conf/test_full.config
index 039fff67..4917332e 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -10,8 +10,6 @@
----------------------------------------------------------------------------------------
*/
-cleanup = true
-
params {
config_profile_name = 'Full test profile'
config_profile_description = 'Full test dataset to check pipeline function'
@@ -24,7 +22,7 @@ params {
centrifuge_db = "s3://ngi-igenomes/test-data/mag/p_compressed+h+v.tar.gz"
kraken2_db = "s3://ngi-igenomes/test-data/mag/minikraken_8GB_202003.tgz"
cat_db = "s3://ngi-igenomes/test-data/mag/CAT_prepare_20210107.tar.gz"
- gtdb = "s3://ngi-igenomes/test-data/mag/gtdbtk_r202_data.tar.gz"
+ gtdb_db = "s3://ngi-igenomes/test-data/mag/gtdbtk_r202_data.tar.gz"
// reproducibility options for assembly
spades_fix_cpus = 10
diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config
index f91ef48c..b3487c6b 100644
--- a/conf/test_host_rm.config
+++ b/conf/test_host_rm.config
@@ -25,6 +25,6 @@ params {
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
- gtdb = false
+ skip_gtdbtk = true
skip_concoct = true
}
diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config
index 8cf5e525..bc22d3d2 100644
--- a/conf/test_hybrid.config
+++ b/conf/test_hybrid.config
@@ -24,6 +24,6 @@ params {
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
- gtdb = false
+ skip_gtdbtk = true
skip_concoct = true
}
diff --git a/conf/test_hybrid_host_rm.config b/conf/test_hybrid_host_rm.config
index 8a37b813..7a0e4a15 100644
--- a/conf/test_hybrid_host_rm.config
+++ b/conf/test_hybrid_host_rm.config
@@ -26,4 +26,5 @@ params {
max_unbinned_contigs = 2
skip_binqc = true
skip_concoct = true
+ skip_gtdbtk = true
}
diff --git a/conf/test_nothing.config b/conf/test_nothing.config
new file mode 100644
index 00000000..53df219f
--- /dev/null
+++ b/conf/test_nothing.config
@@ -0,0 +1,43 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Runs input data but skipping all possible steps to allow for a fast testing
+ profile for input checks etc.
+
+ Use as follows:
+ nextflow run nf-core/mag -profile test_nothing,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
+
+ // Limit resources so that this can run on GitHub Actions
+ max_cpus = 2
+ max_memory = '6.GB'
+ max_time = '6.h'
+
+ // Input data
+ input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
+ centrifuge_db = null
+ kraken2_db = null
+ skip_krona = true
+ skip_clipping = true
+ skip_adapter_trimming = true
+ skip_spades = true
+ skip_spadeshybrid = true
+ skip_megahit = true
+ skip_quast = true
+ skip_prodigal = true
+ skip_binning = true
+ skip_metabat2 = true
+ skip_maxbin2 = true
+ skip_concoct = true
+ skip_prokka = true
+ skip_binqc = true
+ skip_gtdbtk = true
+}
diff --git a/conf/test_virus_identification.config b/conf/test_virus_identification.config
new file mode 100644
index 00000000..e15fab7d
--- /dev/null
+++ b/conf/test_virus_identification.config
@@ -0,0 +1,42 @@
+/*
+========================================================================================
+ Nextflow config file for running minimal tests
+========================================================================================
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+ nextflow run nf-core/mag -profile test_virus_identification,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+ config_profile_name = 'Test profile for running virus_identification'
+ config_profile_description = 'Minimal test dataset to check pipeline function virus identification'
+
+ // Limit resources so that this can run on GitHub Actions
+ max_cpus = 2
+ max_memory = '6.GB'
+ max_time = '6.h'
+
+ // Input data
+ input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
+ run_virus_identification = true
+ genomad_splits = 7
+
+ // For computational efficiency
+ reads_minlength = 150
+ coassemble_group = true
+ skip_gtdbtk = true
+ skip_binning = true
+ skip_prokka = true
+ skip_spades = true
+ skip_spadeshybrid = true
+ skip_quast = true
+ skip_prodigal = true
+ skip_krona = true
+ skip_adapter_trimming = true
+ skip_metabat2 = true
+ skip_maxbin2 = true
+ skip_busco = true
+}
diff --git a/docs/images/mag_workflow.png b/docs/images/mag_workflow.png
index f476287a..d4cda1a0 100644
Binary files a/docs/images/mag_workflow.png and b/docs/images/mag_workflow.png differ
diff --git a/docs/images/mag_workflow.svg b/docs/images/mag_workflow.svg
index f847cec3..cf9dfc1f 100644
--- a/docs/images/mag_workflow.svg
+++ b/docs/images/mag_workflow.svg
[SVG markup not reproduced: the workflow diagram is updated from v2.3.0 to v2.4.0, adding "Domain classification" (Tiara, MetaEuk) and "Virus identification" (geNomad) steps and regrouping the bin "Evaluation" boxes (BUSCO, CheckM, GUNC, QUAST) under bin post-processing.]
diff --git a/docs/output.md b/docs/output.md
index 301dc6ad..31b86883 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -12,8 +12,10 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Quality control](#quality-control) of input reads - trimming and contaminant removal
- [Taxonomic classification of trimmed reads](#taxonomic-classification-of-trimmed-reads)
+- [Digital sequencing normalisation](#digital-normalization-with-bbnorm)
- [Assembly](#assembly) of trimmed reads
- [Protein-coding gene prediction](#gene-prediction) of assemblies
+- [Virus identification](#virus-identification-in-assemblies) of assemblies
- [Binning and binning refinement](#binning-and-binning-refinement) of assembled contigs
- [Taxonomic classification of binned genomes](#taxonomic-classification-of-binned-genomes)
- [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes)
@@ -129,6 +131,20 @@ NanoPlot is used to calculate various metrics and plots about the quality and le
+## Digital normalization with BBnorm
+
+If the pipeline is called with the `--bbnorm` option, it will normalize the sequencing depth of libraries prior to assembly by removing reads, in order to 1) reduce the coverage of very abundant kmers and 2) delete very rare kmers (see the `--bbnorm_target` and `--bbnorm_min` parameters).
+When called in conjunction with `--coassemble_group`, BBnorm will operate on interleaved (merged) FastQ files, producing only a single output file.
+If the `--save_bbnorm_reads` parameter is set, the resulting FastQ files are saved together with log output.
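+
+A minimal sketch of a run with depth normalization enabled (the target and minimum depth values below are purely illustrative, not recommendations):
+
+```bash
+# normalize read depth with BBnorm before assembly and keep the normalized reads
+nextflow run nf-core/mag -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --bbnorm \
+    --bbnorm_target 100 \
+    --bbnorm_min 5 \
+    --save_bbnorm_reads
+```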
+
+
+Output files
+
+- `bbmap/bbnorm/[sample]\*.fastq.gz`
+- `bbmap/bbnorm/logs/[sample].bbnorm.log`
+
+
+
## Taxonomic classification of trimmed reads
### Kraken
@@ -177,6 +193,7 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl
- `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs
- `MEGAHIT-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set.
- `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap").
+ - `MEGAHIT-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly.
@@ -195,6 +212,7 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl
- `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs
- `SPAdes-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set.
- `SPAdes-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap").
+ - `SPAdes-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly.
@@ -213,6 +231,7 @@ SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) soft
- `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs
- `SPAdesHybrid-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set.
- `SPAdesHybrid-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap").
+ - `SPAdesHybrid-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly.
@@ -246,11 +265,40 @@ Protein-coding genes are predicted for each assembly.
Output files
-- `Prodigal/`
- - `[sample/group].gff`: Gene Coordinates in GFF format
- - `[sample/group].faa`: The protein translation file consists of all the proteins from all the sequences in multiple FASTA format.
- - `[sample/group].fna`: Nucleotide sequences of the predicted proteins using the DNA alphabet, not mRNA (so you will see 'T' in the output and not 'U').
- - `[sample/group]_all.txt`: Information about start positions of genes.
+- `Annotation/Prodigal/`
+ - `[assembler]-[sample/group].gff.gz`: Gene Coordinates in GFF format
+ - `[assembler]-[sample/group].faa.gz`: The protein translation file consists of all the proteins from all the sequences in multiple FASTA format.
+ - `[assembler]-[sample/group].fna.gz`: Nucleotide sequences of the predicted proteins using the DNA alphabet, not mRNA (so you will see 'T' in the output and not 'U').
+ - `[assembler]-[sample/group]_all.txt.gz`: Information about start positions of genes.
+
+
+
+## Virus identification in assemblies
+
+### geNomad
+
+[geNomad](https://github.com/apcamargo/genomad) identifies viruses and plasmids in sequencing data (isolates, metagenomes, and metatranscriptomes).
+
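+A minimal sketch of enabling virus identification on the assemblies (the geNomad score and splits values are illustrative; see the parameter documentation for defaults):
+
+```bash
+# screen assemblies for viruses and plasmids with geNomad
+nextflow run nf-core/mag -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --run_virus_identification \
+    --genomad_min_score 0.7 \
+    --genomad_splits 7
+```
+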
+
+Output files
+
+- `VirusIdentification/geNomad/[assembler]-[sample/group]*/`
+ - `[assembler]-[sample/group]*_annotate`
+ - `[assembler]-[sample/group]*_taxonomy.tsv`: Taxonomic assignment data
+ - `[assembler]-[sample/group]*_aggregated_classification`
+ - `[assembler]-[sample/group]*_aggregated_classification.tsv`: Sequence classification in tabular format
+ - `[assembler]-[sample/group]*_find_proviruses`
+ - `[assembler]-[sample/group]*_provirus.tsv`: Characteristics of proviruses identified by geNomad
+ - `[assembler]-[sample/group]*_summary`
+ - `[assembler]-[sample/group]*_virus_summary.tsv`: Virus classification summary file in tabular format
+ - `[assembler]-[sample/group]*_plasmid_summary.tsv`: Plasmid classification summary file in tabular format
+ - `[assembler]-[sample/group]*_viruses_genes.tsv`: Virus gene annotation data in tabular format
+ - `[assembler]-[sample/group]*_plasmids_genes.tsv`: Plasmid gene annotation data in tabular format
+ - `[assembler]-[sample/group]*_viruses.fna`: Virus nucleotide sequences in FASTA format
+ - `[assembler]-[sample/group]*_plasmids.fna`: Plasmid nucleotide sequences in FASTA format
+ - `[assembler]-[sample/group]*_viruses_proteins.faa`: Virus protein sequences in FASTA format
+ - `[assembler]-[sample/group]*_plasmids_proteins.faa`: Plasmid protein sequences in FASTA format
+ - `[assembler]-[sample/group]*.log`: Plain text log file detailing the steps executed by geNomad (annotate, find-proviruses, marker-classification, nn-classification, aggregated-classification and summary)
@@ -377,6 +425,22 @@ By default, only the raw bins (and unbinned contigs) from the actual binning met
⚠️ Due to the ability to perform downstream QC of both raw and refined bins in parallel (via `--postbinning_input`), bin names in DAS Tool's `*_allBins.eval` file will include `Refined`. However, for this particular file, they _actually_ refer to the 'raw' input bins. The pipeline renames the input files prior to running DAS Tool to ensure they can be disambiguated from the original bin files in the downstream QC steps.
+### Tiara
+
+Tiara is a contig classifier that identifies the domain (prokarya, eukarya) of contigs within an assembly. In this pipeline it is used to rapidly, and with few computational resources, assign the most likely domain of each bin (or set of unbinned contigs) based on the classifications of its contigs.
+
+
+Output files
+
+- `Taxonomy/Tiara/`
+ - `[assembler]-[sample/group].tiara.txt` - Tiara output classifications (with probabilities) for all contigs within the specified sample/group assembly
+ - `log/log_[assembler]-[sample/group].txt` - log file detailing the parameters used by the Tiara model for contig classification.
+- `GenomeBinning/tiara_summary.tsv` - Summary of Tiara domain classification for all bins.
+
+
+
+Typically, you would use `tiara_summary.tsv` as the primary file to see at a glance which bins (or unbinned contig sets) have been assigned to which domain, whereas `[assembler]-[sample/group].tiara.txt` provides the classification of each individual contig.
+
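+A minimal sketch of enabling bin domain classification (the minimum contig length shown is illustrative only):
+
+```bash
+# classify each bin's contigs with Tiara and summarise the per-bin domains
+nextflow run nf-core/mag -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --bin_domain_classification \
+    --tiara_min_length 3000
+```
+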
### Bin sequencing depth
For each bin or refined bin the median sequencing depth is computed based on the corresponding contig depths.
@@ -565,19 +629,34 @@ Whole genome annotation is the process of identifying features of interest in a
Output files
-- `Prokka/[assembler]/[bin]/`
- - `[bin].gff`: annotation in GFF3 format, containing both sequences and annotations
- - `[bin].gbk`: annotation in GenBank format, containing both sequences and annotations
- - `[bin].fna`: nucleotide FASTA file of the input contig sequences
- - `[bin].faa`: protein FASTA file of the translated CDS sequences
- - `[bin].ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA)
- - `[bin].sqn`: an ASN1 format "Sequin" file for submission to Genbank
- - `[bin].fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file
- - `[bin].tbl`: feature Table file, used by "tbl2asn" to create the .sqn file
- - `[bin].err`: unacceptable annotations - the NCBI discrepancy report.
- - `[bin].log`: contains all the output that Prokka produced during its run
- - `[bin].txt`: statistics relating to the annotated features found
- - `[bin].tsv`: tab-separated file of all features (locus_tag, ftype, len_bp, gene, EC_number, COG, product)
+- `Annotation/Prokka/[assembler]/[bin]/`
+ - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format, containing both sequences and annotations
+ - `[assembler]-[binner]-[bin].gbk`: annotation in GenBank format, containing both sequences and annotations
+ - `[assembler]-[binner]-[bin].fna`: nucleotide FASTA file of the input contig sequences
+ - `[assembler]-[binner]-[bin].faa`: protein FASTA file of the translated CDS sequences
+ - `[assembler]-[binner]-[bin].ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA)
+ - `[assembler]-[binner]-[bin].sqn`: an ASN1 format "Sequin" file for submission to Genbank
+ - `[assembler]-[binner]-[bin].fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file
+ - `[assembler]-[binner]-[bin].tbl`: feature Table file, used by "tbl2asn" to create the .sqn file
+ - `[assembler]-[binner]-[bin].err`: unacceptable annotations - the NCBI discrepancy report.
+ - `[assembler]-[binner]-[bin].log`: contains all the output that Prokka produced during its run
+ - `[assembler]-[binner]-[bin].txt`: statistics relating to the annotated features found
+ - `[assembler]-[binner]-[bin].tsv`: tab-separated file of all features (locus_tag, ftype, len_bp, gene, EC_number, COG, product)
+
+
+
+### MetaEuk
+
+In cases where eukaryotic genomes are recovered during binning, [MetaEuk](https://github.com/soedinglab/metaeuk) is also available to quickly annotate these genomes with standards-compliant output files.
+
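+A minimal sketch of how this might be enabled, assuming domain classification is switched on so that eukaryotic bins can be identified, and a protein reference is supplied via `--metaeuk_db` (alternatively, a pre-built MMseqs2 database can be supplied via `--metaeuk_mmseqs_db`); `proteins.fasta` is a placeholder path:
+
+```bash
+# annotate bins classified as eukaryotic with MetaEuk
+nextflow run nf-core/mag -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --bin_domain_classification \
+    --metaeuk_db proteins.fasta   # placeholder protein reference
+```
+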
+
+Output files
+
+- `Annotation/MetaEuk/[assembler]/[bin]`
+ - `[assembler]-[binner]-[bin].fas`: fasta file of protein sequences identified by MetaEuk
+ - `[assembler]-[binner]-[bin].codon.fas`: fasta file of nucleotide sequences corresponding to the protein sequences fasta
+ - `[assembler]-[binner]-[bin].headersMap.tsv`: tab-separated table containing the information from each header in the fasta files
+ - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format
@@ -610,7 +689,7 @@ Optional, only running when parameter `-profile ancient_dna` is specified.
### `variant_calling`
-Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus on the contig sequence. To avoid this situation, the consensus is re-called with a variant calling software using the reads aligned back to the contigs
+Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus on the contig sequence. To avoid this situation, the consensus is optionally re-called with variant calling software, using the reads aligned back to the contigs, when `--run_ancient_damagecorrection` is supplied.
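+
+A minimal sketch of a run with ancient DNA mode and damage-corrected consensus re-calling enabled (input and output paths are placeholders):
+
+```bash
+nextflow run nf-core/mag -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --ancient_dna \
+    --run_ancient_damagecorrection
+```
+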
Output files
@@ -640,6 +719,21 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc
Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+The general stats table at the top of the report will by default only display the most relevant pre- and post-processing statistics prior to assembly, i.e., FastQC, fastp/AdapterRemoval, and Bowtie2 PhiX and host removal mapping results.
+
+Note that the FastQC raw and processed columns are placed right next to each other for easier visual comparison; the processed columns represent the input reads _after_ fastp/AdapterRemoval processing (the dedicated columns of those tools come directly after the two sets of FastQC columns). Hover your cursor over each column name to see which tool the column is derived from.
+
+Tool-specific summary plots and tables are currently displayed for the following tools (if activated):
+
+- FastQC (pre- and post-trimming)
+- fastp
+- Adapter Removal
+- bowtie2
+- BUSCO
+- QUAST
+- Kraken2 / Centrifuge
+- PROKKA
+
### Pipeline information
diff --git a/docs/usage.md b/docs/usage.md
index ea80671f..c991434c 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -6,7 +6,7 @@
## Input specifications
-The input data can be passed to nf-core/mag in two possible ways using the `--input` parameter.
+The input data can be passed to nf-core/mag in three possible ways, using the `--input` and/or `--assembly_input` parameters.
### Direct FASTQ input (short reads only)
@@ -27,12 +27,13 @@ Please note the following additional requirements:
- When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs
- To run single-end data you must additionally specify `--single_end`
- If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`
+- Sample name and run combinations must be unique
### Samplesheet input file
-Alternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata.
+Alternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata. Furthermore, when a `run` column is present, the pipeline will also perform run- or lane-wise concatenation, for cases where you may have a sample or library sequenced with the same sequencing configuration across multiple runs. The optional run merging happens after short read QC (adapter clipping, host/PhiX removal etc.), and prior to normalisation, taxonomic profiling, and assembly.
-This CSV file should contain the following columns:
+At a minimum, the CSV file should contain the following columns:
`sample,group,short_reads_1,short_reads_2,long_reads`
@@ -53,12 +54,22 @@ sample1,0,data/sample1.fastq.gz,,
sample2,0,data/sample2.fastq.gz,,
```
+or, to additionally perform run merging of two runs of sample1:
+
+```bash
+sample,run,group,short_reads_1,short_reads_2,long_reads
+sample1,1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
+sample1,2,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
+sample2,0,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz,data/sample2.fastq.gz
+sample3,1,0,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz,
+```
+
Please note the following requirements:
-- 5 comma-seperated columns
+- a minimum of 5 comma-separated columns
- Valid file extension: `.csv`
-- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads`
-- Sample IDs must be unique
+- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads` (where `run` can be optionally added)
+- Run IDs must be unique within a multi-run sample. A sample with multiple runs will be automatically concatenated.
- FastQ files must be compressed (`.fastq.gz`, `.fq.gz`)
- `long_reads` can only be provided in combination with paired-end short read data
- Within one samplesheet either only single-end or only paired-end reads can be specified
@@ -66,6 +77,47 @@ Please note the following requirements:
Again, by default, the group information is only used to compute co-abundances for the binning step, but not for group-wise co-assembly (see the parameter docs for [`--coassemble_group`](https://nf-co.re/mag/parameters#coassemble_group) and [`--binning_map_mode`](https://nf-co.re/mag/parameters#binning_map_mode) for more information about how this group information can be used).
+### Supplying pre-computed assemblies
+
+It is also possible to run nf-core/mag on pre-computed assemblies, by supplying a CSV file to the parameter `--assembly_input` in addition to the raw reads supplied to `--input`. Supplying assembly input skips all read pre-processing and assembly, jumping straight to the binning stage of the pipeline.
+
+The assembly CSV file should contain the following columns:
+
+`id,group,assembler,fasta`
+
+Where `id` is the ID of the assembly, `group` is the assembly/binning group (see the samplesheet information section for more details), `assembler` is the assembler used to produce the assembly (one of `MEGAHIT`, `SPAdes`, or `SPAdesHybrid`), and `fasta` is the path to the assembly fasta file. Input fasta files can be compressed or uncompressed; compressed assemblies will be automatically uncompressed for use within the pipeline. The exact information required for each supplied assembly depends on whether the assemblies provided are single assemblies or group-wise co-assemblies. For the following example `--input` CSV:
+
+```bash
+sample,group,short_reads_1,short_reads_2,long_reads
+sample1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,
+sample2,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz,
+sample3,1,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz,
+```
+
+If the assemblies are single assemblies, then the `id` and `group` columns should match those supplied in the `--input` reads CSV file for each read set:
+
+```bash
+id,group,assembler,fasta
+sample1,0,MEGAHIT,MEGAHIT-sample1.contigs.fa.gz
+sample1,0,SPAdes,SPAdes-sample1.fasta.gz
+sample2,0,MEGAHIT,MEGAHIT-sample2.contigs.fa.gz
+sample2,0,SPAdes,SPAdes-sample2.contigs.fasta.gz
+sample3,1,MEGAHIT,MEGAHIT-sample3.contigs.fa.gz
+sample3,1,SPAdes,SPAdes-sample3.contigs.fasta.gz
+```
+
+If the assemblies are co-assemblies, the parameter `--coassemble_group` should additionally be specified. In this case, the `id` column should uniquely identify the assembly, while `group` should match those specified in the `--input` CSV file:
+
+```bash
+id,group,assembler,fasta
+group-0,0,MEGAHIT,MEGAHIT-group-0.contigs.fa.gz
+group-0,0,SPAdes,SPAdes-group-0.contigs.fasta.gz
+group-1,1,MEGAHIT,MEGAHIT-group-1.contigs.fa.gz
+group-1,1,SPAdes,SPAdes-group-1.contigs.fasta.gz
+```
+
+When supplying pre-computed assemblies, reads **must** also be provided in the CSV input format to `--input`, and should be the reads used to build the assemblies, i.e., adapter-removed, run-merged etc. Preprocessing steps will not be run on raw reads when pre-computed assemblies are supplied. As long reads are only used for assembly, any long read FASTQ files listed in the reads CSV are ignored.
+
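+A minimal sketch combining raw reads with pre-computed assemblies (file names are placeholders):
+
+```bash
+# reads are still required via --input; read pre-processing and assembly are skipped
+nextflow run nf-core/mag -profile docker \
+    --input samplesheet.csv \
+    --assembly_input assembly_samplesheet.csv \
+    --outdir results
+```
+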
## Running the pipeline
The typical command for running the pipeline is as follows:
@@ -85,7 +137,29 @@ work # Directory containing the nextflow working files
# Other nextflow hidden files, eg. history of pipeline runs and old logs.
```
-See the [nf-core/mag website documentation](https://nf-co.re/mag/usage#usage) for more information about pipeline specific parameters.
+If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file.
+
+Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`.
+
+> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+
+The above pipeline run specified with a params file in yaml format:
+
+```bash
+nextflow run nf-core/mag -profile docker -params-file params.yaml
+```
+
+with `params.yaml` containing:
+
+```yaml
+input: './samplesheet.csv'
+outdir: './results/'
+<...>
+```
+
+You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
+
+See the [nf-core/mag website documentation](https://nf-co.re/mag/parameters) for more information about pipeline specific parameters.
### Updating the pipeline
@@ -103,6 +177,10 @@ First, go to the [nf-core/mag releases page](https://github.com/nf-core/mag/rele
This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.
+To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
+
+> 💡 If you wish to share such a profile (e.g. upload it as supplementary material for academic publications), make sure to NOT include cluster-specific paths to files, nor institution-specific profiles.
+
Additionally, to also enable reproducible results from the individual assembly tools, this pipeline provides extra parameters. SPAdes is designed to be deterministic for a given number of threads. To generate reproducible results, set the number of cpus with `--spades_fix_cpus` or `--spadeshybrid_fix_cpus`. This will overwrite the number of cpus specified in the `base.config` file and additionally ensure that it is not increased in case of retries for individual samples. MEGAHIT only generates reproducible results when run single-threaded.
You can fix this by using the parameter `--megahit_fix_cpu_1`. In both cases, do not specify the number of cpus for these processes in additional custom config files, as this would result in an error.
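+
+For example, a run pinning the assembler CPU usage for reproducibility might look like the following sketch (the CPU count is illustrative):
+
+```bash
+nextflow run nf-core/mag -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --spades_fix_cpus 10 \
+    --spadeshybrid_fix_cpus 10 \
+    --megahit_fix_cpu_1
+```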
@@ -112,6 +190,8 @@ To allow also reproducible bin QC with BUSCO, run BUSCO providing already downlo
For the taxonomic bin classification with [CAT](https://github.com/dutilh/CAT), when running the pipeline with `--cat_db_generate` the parameter `--save_cat_db` can be used to also save the generated database to allow reproducibility in future runs. Note that when specifying a pre-built database with `--cat_db`, currently the database can not be saved.
+When it comes to visualizing taxonomic data using [Krona](https://github.com/marbl/Krona), you have the option to provide a taxonomy file, such as `taxonomy.tab`, using the `--krona_db` parameter. If you don't supply a taxonomy file, Krona is designed to automatically download the required taxonomy data for visualization.
+
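+For example (a sketch, assuming you already have a Krona taxonomy file available locally):
+
+```bash
+nextflow run nf-core/mag -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --krona_db taxonomy.tab
+```
+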
The taxonomic classification of bins with GTDB-Tk is not guaranteed to be reproducible, since the placement of bins in the reference tree is non-deterministic. However, the authors of the GTDB-Tk article examined the reproducibility on a set of 100 genomes across 50 trials and did not observe any difference (see [https://doi.org/10.1093/bioinformatics/btz848](https://doi.org/10.1093/bioinformatics/btz848)).
## Core Nextflow arguments
@@ -122,7 +202,7 @@ The taxonomic classification of bins with GTDB-Tk is not guaranteed to be reprod
Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.
-Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below.
+Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below.
> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.
@@ -146,8 +226,10 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
- A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/)
- `charliecloud`
- A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/)
+- `apptainer`
+ - A generic configuration profile to be used with [Apptainer](https://apptainer.org/)
- `conda`
- - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud.
+ - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer.
### `-resume`
@@ -163,58 +245,19 @@ Specify the path to a specific config file (this is a core Nextflow command). Se
Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped.
-For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue:
-
-```bash
-[62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1)
-Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)'
-
-Caused by:
- Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137)
+To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website.
-Command executed:
- STAR \
- --genomeDir star \
- --readFilesIn WT_REP1_trimmed.fq.gz \
- --runThreadN 2 \
- --outFileNamePrefix WT_REP1. \
-
+### Custom Containers
-Command exit status:
- 137
+In some cases, you may wish to change the container or conda environment that a step of the pipeline uses for a particular tool. By default, nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However, in some cases the pipeline-specified version may be out of date.
-Command output:
- (empty)
+To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website.
-Command error:
- .command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1.
-Work dir:
- /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb
+### Custom Tool Arguments
-Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run`
-```
+A pipeline might not always support every possible argument or option of a particular tool used in the pipeline. Fortunately, nf-core pipelines provide some freedom for users to insert additional parameters that the pipeline does not include by default.
-#### For beginners
-
-A first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. Therefor you can try to increase the resource for the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. Therefore you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB, you than can try to run your pipeline again with `--max_memory 200GB -resume` to skip all process, that were already calculated. If you can not increase the resource of the complete pipeline, you can try to adapt the resource for a single process as mentioned below.
-
-#### Advanced option on process level
-
-To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN).
-We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`.
-If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9).
-The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements.
-The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB.
-Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB.
-The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections.
-
-```nextflow
-process {
- withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' {
- memory = 100.GB
- }
-}
-```
+To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website.
Note, do not change number of CPUs with custom config files for the processes `spades`, `spadeshybrid` or `megahit` when specifying the parameters `--spades_fix_cpus`, `--spadeshybrid_fix_cpus` and `--megahit_fix_cpu_1` respectively.
diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy
deleted file mode 100755
index 33cd4f6e..00000000
--- a/lib/NfcoreSchema.groovy
+++ /dev/null
@@ -1,528 +0,0 @@
-//
-// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template.
-//
-
-import org.everit.json.schema.Schema
-import org.everit.json.schema.loader.SchemaLoader
-import org.everit.json.schema.ValidationException
-import org.json.JSONObject
-import org.json.JSONTokener
-import org.json.JSONArray
-import groovy.json.JsonSlurper
-import groovy.json.JsonBuilder
-
-class NfcoreSchema {
-
- //
- // Resolve Schema path relative to main workflow directory
- //
- public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') {
- return "${workflow.projectDir}/${schema_filename}"
- }
-
- //
- // Function to loop over all parameters defined in schema and check
- // whether the given parameters adhere to the specifications
- //
- /* groovylint-disable-next-line UnusedPrivateMethodParameter */
- public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') {
- def has_error = false
- //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
- // Check for nextflow core params and unexpected params
- def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text
- def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions')
- def nf_params = [
- // Options for base `nextflow` command
- 'bg',
- 'c',
- 'C',
- 'config',
- 'd',
- 'D',
- 'dockerize',
- 'h',
- 'log',
- 'q',
- 'quiet',
- 'syslog',
- 'v',
-
- // Options for `nextflow run` command
- 'ansi',
- 'ansi-log',
- 'bg',
- 'bucket-dir',
- 'c',
- 'cache',
- 'config',
- 'dsl2',
- 'dump-channels',
- 'dump-hashes',
- 'E',
- 'entry',
- 'latest',
- 'lib',
- 'main-script',
- 'N',
- 'name',
- 'offline',
- 'params-file',
- 'pi',
- 'plugins',
- 'poll-interval',
- 'pool-size',
- 'profile',
- 'ps',
- 'qs',
- 'queue-size',
- 'r',
- 'resume',
- 'revision',
- 'stdin',
- 'stub',
- 'stub-run',
- 'test',
- 'w',
- 'with-charliecloud',
- 'with-conda',
- 'with-dag',
- 'with-docker',
- 'with-mpi',
- 'with-notification',
- 'with-podman',
- 'with-report',
- 'with-singularity',
- 'with-timeline',
- 'with-tower',
- 'with-trace',
- 'with-weblog',
- 'without-docker',
- 'without-podman',
- 'work-dir'
- ]
- def unexpectedParams = []
-
- // Collect expected parameters from the schema
- def expectedParams = []
- def enums = [:]
- for (group in schemaParams) {
- for (p in group.value['properties']) {
- expectedParams.push(p.key)
- if (group.value['properties'][p.key].containsKey('enum')) {
- enums[p.key] = group.value['properties'][p.key]['enum']
- }
- }
- }
-
- for (specifiedParam in params.keySet()) {
- // nextflow params
- if (nf_params.contains(specifiedParam)) {
- log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'"
- has_error = true
- }
- // unexpected params
- def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params'
- def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() }
- def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase()
- def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase))
- if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) {
- // Temporarily remove camelCase/camel-case params #1035
- def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()}
- if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){
- unexpectedParams.push(specifiedParam)
- }
- }
- }
-
- //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
- // Validate parameters against the schema
- InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream()
- JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream))
-
- // Remove anything that's in params.schema_ignore_params
- raw_schema = removeIgnoredParams(raw_schema, params)
-
- Schema schema = SchemaLoader.load(raw_schema)
-
- // Clean the parameters
- def cleanedParams = cleanParameters(params)
-
- // Convert to JSONObject
- def jsonParams = new JsonBuilder(cleanedParams)
- JSONObject params_json = new JSONObject(jsonParams.toString())
-
- // Validate
- try {
- schema.validate(params_json)
- } catch (ValidationException e) {
- println ''
- log.error 'ERROR: Validation of pipeline parameters failed!'
- JSONObject exceptionJSON = e.toJSON()
- printExceptions(exceptionJSON, params_json, log, enums)
- println ''
- has_error = true
- }
-
- // Check for unexpected parameters
- if (unexpectedParams.size() > 0) {
- Map colors = NfcoreTemplate.logColours(params.monochrome_logs)
- println ''
- def warn_msg = 'Found unexpected parameters:'
- for (unexpectedParam in unexpectedParams) {
- warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}"
- }
- log.warn warn_msg
- log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}"
- println ''
- }
-
- if (has_error) {
- System.exit(1)
- }
- }
-
- //
- // Beautify parameters for --help
- //
- public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') {
- Map colors = NfcoreTemplate.logColours(params.monochrome_logs)
- Integer num_hidden = 0
- String output = ''
- output += 'Typical pipeline command:\n\n'
- output += " ${colors.cyan}${command}${colors.reset}\n\n"
- Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename))
- Integer max_chars = paramsMaxChars(params_map) + 1
- Integer desc_indent = max_chars + 14
- Integer dec_linewidth = 160 - desc_indent
- for (group in params_map.keySet()) {
- Integer num_params = 0
- String group_output = colors.underlined + colors.bold + group + colors.reset + '\n'
- def group_params = params_map.get(group) // This gets the parameters of that particular group
- for (param in group_params.keySet()) {
- if (group_params.get(param).hidden && !params.show_hidden_params) {
- num_hidden += 1
- continue;
- }
- def type = '[' + group_params.get(param).type + ']'
- def description = group_params.get(param).description
- def defaultValue = group_params.get(param).default != null ? " [default: " + group_params.get(param).default.toString() + "]" : ''
- def description_default = description + colors.dim + defaultValue + colors.reset
- // Wrap long description texts
- // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap
- if (description_default.length() > dec_linewidth){
- List olines = []
- String oline = "" // " " * indent
- description_default.split(" ").each() { wrd ->
- if ((oline.size() + wrd.size()) <= dec_linewidth) {
- oline += wrd + " "
- } else {
- olines += oline
- oline = wrd + " "
- }
- }
- olines += oline
- description_default = olines.join("\n" + " " * desc_indent)
- }
- group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n'
- num_params += 1
- }
- group_output += '\n'
- if (num_params > 0){
- output += group_output
- }
- }
- if (num_hidden > 0){
- output += colors.dim + "!! Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset
- }
- output += NfcoreTemplate.dashedLine(params.monochrome_logs)
- return output
- }
-
- //
- // Groovy Map summarising parameters/workflow options used by the pipeline
- //
- public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') {
- // Get a selection of core Nextflow workflow options
- def Map workflow_summary = [:]
- if (workflow.revision) {
- workflow_summary['revision'] = workflow.revision
- }
- workflow_summary['runName'] = workflow.runName
- if (workflow.containerEngine) {
- workflow_summary['containerEngine'] = workflow.containerEngine
- }
- if (workflow.container) {
- workflow_summary['container'] = workflow.container
- }
- workflow_summary['launchDir'] = workflow.launchDir
- workflow_summary['workDir'] = workflow.workDir
- workflow_summary['projectDir'] = workflow.projectDir
- workflow_summary['userName'] = workflow.userName
- workflow_summary['profile'] = workflow.profile
- workflow_summary['configFiles'] = workflow.configFiles.join(', ')
-
- // Get pipeline parameters defined in JSON Schema
- def Map params_summary = [:]
- def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename))
- for (group in params_map.keySet()) {
- def sub_params = new LinkedHashMap()
- def group_params = params_map.get(group) // This gets the parameters of that particular group
- for (param in group_params.keySet()) {
- if (params.containsKey(param)) {
- def params_value = params.get(param)
- def schema_value = group_params.get(param).default
- def param_type = group_params.get(param).type
- if (schema_value != null) {
- if (param_type == 'string') {
- if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) {
- def sub_string = schema_value.replace('\$projectDir', '')
- sub_string = sub_string.replace('\${projectDir}', '')
- if (params_value.contains(sub_string)) {
- schema_value = params_value
- }
- }
- if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) {
- def sub_string = schema_value.replace('\$params.outdir', '')
- sub_string = sub_string.replace('\${params.outdir}', '')
- if ("${params.outdir}${sub_string}" == params_value) {
- schema_value = params_value
- }
- }
- }
- }
-
- // We have a default in the schema, and this isn't it
- if (schema_value != null && params_value != schema_value) {
- sub_params.put(param, params_value)
- }
- // No default in the schema, and this isn't empty
- else if (schema_value == null && params_value != "" && params_value != null && params_value != false) {
- sub_params.put(param, params_value)
- }
- }
- }
- params_summary.put(group, sub_params)
- }
- return [ 'Core Nextflow options' : workflow_summary ] << params_summary
- }
-
- //
- // Beautify parameters for summary and return as string
- //
- public static String paramsSummaryLog(workflow, params) {
- Map colors = NfcoreTemplate.logColours(params.monochrome_logs)
- String output = ''
- def params_map = paramsSummaryMap(workflow, params)
- def max_chars = paramsMaxChars(params_map)
- for (group in params_map.keySet()) {
- def group_params = params_map.get(group) // This gets the parameters of that particular group
- if (group_params) {
- output += colors.bold + group + colors.reset + '\n'
- for (param in group_params.keySet()) {
- output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n'
- }
- output += '\n'
- }
- }
- output += "!! Only displaying parameters that differ from the pipeline defaults !!\n"
- output += NfcoreTemplate.dashedLine(params.monochrome_logs)
- return output
- }
-
- //
- // Loop over nested exceptions and print the causingException
- //
- private static void printExceptions(ex_json, params_json, log, enums, limit=5) {
- def causingExceptions = ex_json['causingExceptions']
- if (causingExceptions.length() == 0) {
- def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/
- // Missing required param
- if (m.matches()) {
- log.error "* Missing required parameter: --${m[0][1]}"
- }
- // Other base-level error
- else if (ex_json['pointerToViolation'] == '#') {
- log.error "* ${ex_json['message']}"
- }
- // Error with specific param
- else {
- def param = ex_json['pointerToViolation'] - ~/^#\//
- def param_val = params_json[param].toString()
- if (enums.containsKey(param)) {
- def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices"
- if (enums[param].size() > limit) {
- log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... )"
- } else {
- log.error "${error_msg}: ${enums[param].join(', ')})"
- }
- } else {
- log.error "* --${param}: ${ex_json['message']} (${param_val})"
- }
- }
- }
- for (ex in causingExceptions) {
- printExceptions(ex, params_json, log, enums)
- }
- }
-
- //
- // Remove an element from a JSONArray
- //
- private static JSONArray removeElement(json_array, element) {
- def list = []
- int len = json_array.length()
-        for (int i=0;i<len;i++){
-            list.add(json_array.get(i).toString())
-        }
-        list.remove(element)
-        JSONArray jsArray = new JSONArray(list)
-        return jsArray
-    }
-
-    //
-    // Remove ignored parameters
-    //
-    private static JSONObject removeIgnoredParams(raw_schema, params) {
-        // Remove anything that's in params.schema_ignore_params
-        params.schema_ignore_params.split(',').each{ ignore_param ->
- if(raw_schema.keySet().contains('definitions')){
- raw_schema.definitions.each { definition ->
- for (key in definition.keySet()){
- if (definition[key].get("properties").keySet().contains(ignore_param)){
- // Remove the param to ignore
- definition[key].get("properties").remove(ignore_param)
- // If the param was required, change this
- if (definition[key].has("required")) {
- def cleaned_required = removeElement(definition[key].required, ignore_param)
- definition[key].put("required", cleaned_required)
- }
- }
- }
- }
- }
- if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) {
- raw_schema.get("properties").remove(ignore_param)
- }
- if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) {
- def cleaned_required = removeElement(raw_schema.required, ignore_param)
- raw_schema.put("required", cleaned_required)
- }
- }
- return raw_schema
- }
-
- //
- // Clean and check parameters relative to Nextflow native classes
- //
- private static Map cleanParameters(params) {
- def new_params = params.getClass().newInstance(params)
- for (p in params) {
- // remove anything evaluating to false
- if (!p['value']) {
- new_params.remove(p.key)
- }
- // Cast MemoryUnit to String
- if (p['value'].getClass() == nextflow.util.MemoryUnit) {
- new_params.replace(p.key, p['value'].toString())
- }
- // Cast Duration to String
- if (p['value'].getClass() == nextflow.util.Duration) {
- new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day"))
- }
- // Cast LinkedHashMap to String
- if (p['value'].getClass() == LinkedHashMap) {
- new_params.replace(p.key, p['value'].toString())
- }
- }
- return new_params
- }
-
- //
- // This function tries to read a JSON params file
- //
- private static LinkedHashMap paramsLoad(String json_schema) {
- def params_map = new LinkedHashMap()
- try {
- params_map = paramsRead(json_schema)
- } catch (Exception e) {
- println "Could not read parameters settings from JSON. $e"
- params_map = new LinkedHashMap()
- }
- return params_map
- }
-
- //
- // Method to actually read in JSON file using Groovy.
- // Group (as Key), values are all parameters
- // - Parameter1 as Key, Description as Value
- // - Parameter2 as Key, Description as Value
- // ....
- // Group
- // -
- private static LinkedHashMap paramsRead(String json_schema) throws Exception {
- def json = new File(json_schema).text
- def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions')
- def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties')
- /* Tree looks like this in nf-core schema
- * definitions <- this is what the first get('definitions') gets us
- group 1
- title
- description
- properties
- parameter 1
- type
- description
- parameter 2
- type
- description
- group 2
- title
- description
- properties
- parameter 1
- type
- description
- * properties <- parameters can also be ungrouped, outside of definitions
- parameter 1
- type
- description
- */
-
- // Grouped params
- def params_map = new LinkedHashMap()
- schema_definitions.each { key, val ->
- def Map group = schema_definitions."$key".properties // Gets the property object of the group
- def title = schema_definitions."$key".title
- def sub_params = new LinkedHashMap()
- group.each { innerkey, value ->
- sub_params.put(innerkey, value)
- }
- params_map.put(title, sub_params)
- }
-
- // Ungrouped params
- def ungrouped_params = new LinkedHashMap()
- schema_properties.each { innerkey, value ->
- ungrouped_params.put(innerkey, value)
- }
- params_map.put("Other parameters", ungrouped_params)
-
- return params_map
- }
-
- //
- // Get maximum number of characters across all parameter names
- //
- private static Integer paramsMaxChars(params_map) {
- Integer max_chars = 0
- for (group in params_map.keySet()) {
- def group_params = params_map.get(group) // This gets the parameters of that particular group
- for (param in group_params.keySet()) {
- if (param.size() > max_chars) {
- max_chars = param.size()
- }
- }
- }
- return max_chars
- }
-}
diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy
index 2f9a1e42..28ad471d 100755
--- a/lib/NfcoreTemplate.groovy
+++ b/lib/NfcoreTemplate.groovy
@@ -58,9 +58,7 @@ class NfcoreTemplate {
// Set up the e-mail variables
def subject = "[$workflow.manifest.name] Successful: $workflow.runName"
- if (busco_failed_bins.size() > 0) {
- subject = "[$workflow.manifest.name] Partially successful: For ${busco_failed_bins.size()} bin(s) the BUSCO analysis failed because no genes where found or placements failed: $workflow.runName"
- }
+
if (!workflow.success) {
subject = "[$workflow.manifest.name] FAILED: $workflow.runName"
}
@@ -132,7 +130,7 @@ class NfcoreTemplate {
def email_html = html_template.toString()
// Render the sendmail template
- def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit
+ def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit
def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ]
def sf = new File("$projectDir/assets/sendmail_template.txt")
def sendmail_template = engine.createTemplate(sf).make(smail_fields)
@@ -231,32 +229,6 @@ class NfcoreTemplate {
//
public static void summary(workflow, params, log, busco_failed_bins = [:]) {
Map colors = logColours(params.monochrome_logs)
-
- if (busco_failed_bins.size() > 0) {
- def failed_bins_no_genes = ''
- def failed_bins_placements_failed = ''
- def count_no_genes = 0
- def count_placements_failed = 0
- for (bin in busco_failed_bins) {
- if (bin.value == "No genes"){
- count_no_genes += 1
- failed_bins_no_genes += " ${bin.key}\n"
- }
- if (bin.value == "Placements failed"){
- count_placements_failed += 1
- failed_bins_placements_failed += " ${bin.key}\n"
- }
- }
- if (params.busco_reference)
- log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${busco_failed_bins.size()} bin(s) BUSCO did not find any matching genes:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-"
- else {
- if (count_no_genes > 0)
- log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_no_genes} bin(s) the BUSCO analysis failed because no BUSCO genes could be found:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-"
- if (count_placements_failed > 0)
- log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_placements_failed} bin(s) the BUSCO analysis using automated lineage selection failed due to failed placements:\n${failed_bins_placements_failed}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information. Results for selected domain are still used.${colors.reset}-"
- }
- }
-
if (workflow.success) {
if (workflow.stats.ignoredCount == 0) {
log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-"
diff --git a/lib/WorkflowMag.groovy b/lib/WorkflowMag.groovy
index de2d769d..51822e4e 100755
--- a/lib/WorkflowMag.groovy
+++ b/lib/WorkflowMag.groovy
@@ -2,6 +2,7 @@
// This file holds several functions specific to the workflow/mag.nf in the nf-core/mag pipeline
//
+import nextflow.Nextflow
import groovy.text.SimpleTemplateEngine
class WorkflowMag {
@@ -9,25 +10,25 @@ class WorkflowMag {
//
// Check and validate parameters
//
+
public static void initialise(params, log, hybrid) {
// Check if binning mapping mode is valid
if (!['all', 'group', 'own'].contains(params.binning_map_mode)) {
- log.error "Invalid parameter '--binning_map_mode ${params.binning_map_mode}'. Valid values are 'all', 'group' or 'own'."
- System.exit(1)
+ Nextflow.error("Invalid parameter '--binning_map_mode ${params.binning_map_mode}'. Valid values are 'all', 'group' or 'own'.")
}
if (params.coassemble_group && params.binning_map_mode == 'own') {
- log.error "Invalid combination of parameter '--binning_map_mode own' and parameter '--coassemble_group'. Select either 'all' or 'group' mapping mode when performing group-wise co-assembly."
- System.exit(1)
+ Nextflow.error("Invalid combination of parameter '--binning_map_mode own' and parameter '--coassemble_group'. Select either 'all' or 'group' mapping mode when performing group-wise co-assembly.")
+ }
+ if (params.ancient_dna && params.binning_map_mode != 'own') {
+ Nextflow.error("Invalid combination of parameter '--binning_map_mode' and parameter '--ancient_dna'. Ancient DNA mode can only be executed with --binning_map_mode own. You supplied: --binning_map_mode ${params.binning_map_mode}")
}
// Check if specified cpus for SPAdes are available
if ( params.spades_fix_cpus > params.max_cpus ) {
- log.error "Invalid parameter '--spades_fix_cpus ${params.spades_fix_cpus}', max cpus are '${params.max_cpus}'."
- System.exit(1)
+ Nextflow.error("Invalid parameter '--spades_fix_cpus ${params.spades_fix_cpus}', max cpus are '${params.max_cpus}'.")
}
if ( params.spadeshybrid_fix_cpus > params.max_cpus ) {
- log.error "Invalid parameter '--spadeshybrid_fix_cpus ${params.spadeshybrid_fix_cpus}', max cpus are '${params.max_cpus}'."
- System.exit(1)
+ Nextflow.error("Invalid parameter '--spadeshybrid_fix_cpus ${params.spadeshybrid_fix_cpus}', max cpus are '${params.max_cpus}'.")
}
// Check if settings concerning reproducibility of used tools are consistent and print warning if not
if (params.megahit_fix_cpu_1 || params.spades_fix_cpus != -1 || params.spadeshybrid_fix_cpus != -1) {
@@ -52,8 +53,7 @@ class WorkflowMag {
// Check if parameters for host contamination removal are valid
if ( params.host_fasta && params.host_genome) {
- log.error 'Both host fasta reference and iGenomes genome are specified to remove host contamination! Invalid combination, please specify either --host_fasta or --host_genome.'
- System.exit(1)
+ Nextflow.error('Both host fasta reference and iGenomes genome are specified to remove host contamination! Invalid combination, please specify either --host_fasta or --host_genome.')
}
if ( hybrid && (params.host_fasta || params.host_genome) ) {
log.warn 'Host read removal is only applied to short reads. Long reads might be filtered indirectly by Filtlong, which is set to use read qualities estimated based on k-mer matches to the short, already filtered reads.'
@@ -63,25 +63,21 @@ class WorkflowMag {
}
if ( params.host_genome ) {
if (!params.genomes) {
- log.error 'No config file containing genomes provided!'
- System.exit(1)
+ Nextflow.error('No config file containing genomes provided!')
}
// Check if host genome exists in the config file
if (!params.genomes.containsKey(params.host_genome)) {
- log.error '=============================================================================\n' +
+ Nextflow.error('=============================================================================\n' +
" Host genome '${params.host_genome}' not found in any config files provided to the pipeline.\n" +
' Currently, the available genome keys are:\n' +
" ${params.genomes.keySet().join(', ')}\n" +
- '==================================================================================='
- System.exit(1)
+ '===================================================================================')
}
if ( !params.genomes[params.host_genome].fasta ) {
- log.error "No fasta file specified for the host genome ${params.host_genome}!"
- System.exit(1)
+ Nextflow.error("No fasta file specified for the host genome ${params.host_genome}!")
}
if ( !params.genomes[params.host_genome].bowtie2 ) {
- log.error "No Bowtie 2 index file specified for the host genome ${params.host_genome}!"
- System.exit(1)
+ Nextflow.error("No Bowtie 2 index file specified for the host genome ${params.host_genome}!")
}
}
@@ -93,56 +89,54 @@ class WorkflowMag {
         // Check that more than one binner is run for bin refinement (required by DAS Tool)
         // Throw an error unless the number of binners actually run (i.e., not skipped) is more than one
if ( params.refine_bins_dastool && !([ params.skip_metabat2, params.skip_maxbin2, params.skip_concoct ].count(false) > 1) ) {
- log.error 'Bin refinement with --refine_bins_dastool requires at least two binners to be running (not skipped). Check input.'
- System.exit(1)
+ Nextflow.error('Bin refinement with --refine_bins_dastool requires at least two binners to be running (not skipped). Check input.')
}
// Check that bin refinement is actually turned on if any of the refined bins are requested for downstream
if (!params.refine_bins_dastool && params.postbinning_input != 'raw_bins_only') {
- log.error 'The parameter '--postbinning_input ${ params.postbinning_input }' for downstream steps can only be specified if bin refinement is activated with --refine_bins_dastool! Check input.'
- System.exit(1)
+ Nextflow.error("The parameter '--postbinning_input ${ params.postbinning_input }' for downstream steps can only be specified if bin refinement is activated with --refine_bins_dastool! Check input.")
}
// Check if BUSCO parameters combinations are valid
if (params.skip_binqc && params.binqc_tool == 'checkm') {
- log.error 'Both --skip_binqc and --binqc_tool \'checkm\' are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool.'
- System.exit(1)
+ Nextflow.error('Both --skip_binqc and --binqc_tool \'checkm\' are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool.')
}
if (params.skip_binqc) {
if (params.busco_reference) {
- log.error 'Both --skip_binqc and --busco_reference are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_reference.'
- System.exit(1)
+ Nextflow.error('Both --skip_binqc and --busco_reference are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_reference.')
}
if (params.busco_download_path) {
- log.error 'Both --skip_binqc and --busco_download_path are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_download_path.'
- System.exit(1)
+ Nextflow.error('Both --skip_binqc and --busco_download_path are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_download_path.')
}
if (params.busco_auto_lineage_prok) {
- log.error 'Both --skip_binqc and --busco_auto_lineage_prok are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_auto_lineage_prok.'
- System.exit(1)
+ Nextflow.error('Both --skip_binqc and --busco_auto_lineage_prok are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_auto_lineage_prok.')
}
}
if (params.busco_reference && params.busco_download_path) {
- log.error 'Both --busco_reference and --busco_download_path are specified! Invalid combination, please specify either --busco_reference or --busco_download_path.'
- System.exit(1)
+ Nextflow.error('Both --busco_reference and --busco_download_path are specified! Invalid combination, please specify either --busco_reference or --busco_download_path.')
}
if (params.busco_auto_lineage_prok && params.busco_reference) {
- log.error 'Both --busco_auto_lineage_prok and --busco_reference are specified! Invalid combination, please specify either --busco_auto_lineage_prok or --busco_reference.'
- System.exit(1)
+ Nextflow.error('Both --busco_auto_lineage_prok and --busco_reference are specified! Invalid combination, please specify either --busco_auto_lineage_prok or --busco_reference.')
}
- if (params.skip_binqc && params.gtdb) {
- log.warn '--skip_binqc and --gtdb are specified! GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.'
+ if (params.skip_binqc && !params.skip_gtdbtk) {
+            log.warn '--skip_binqc is specified, but --skip_gtdbtk is explicitly set to run! GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.'
}
// Check if CAT parameters are valid
if (params.cat_db && params.cat_db_generate) {
- log.error 'Invalid combination of parameters --cat_db and --cat_db_generate is specified! Please specify either --cat_db or --cat_db_generate.'
- System.exit(1)
+ Nextflow.error('Invalid combination of parameters --cat_db and --cat_db_generate is specified! Please specify either --cat_db or --cat_db_generate.')
}
if (params.save_cat_db && !params.cat_db_generate) {
- log.error 'Invalid parameter combination: parameter --save_cat_db specified, but not --cat_db_generate! Note also that the parameter --save_cat_db does not work in combination with --cat_db.'
- System.exit(1)
+ Nextflow.error('Invalid parameter combination: parameter --save_cat_db specified, but not --cat_db_generate! Note also that the parameter --save_cat_db does not work in combination with --cat_db.')
+ }
+
+        // Check MetaEuk db parameters
+ if (params.metaeuk_mmseqs_db && params.metaeuk_db) {
+ Nextflow.error('Invalid parameter combination: both --metaeuk_mmseqs_db and --metaeuk_db are specified! Please specify either --metaeuk_mmseqs_db or --metaeuk_db.')
+ }
+ if (params.save_mmseqs_db && !params.metaeuk_mmseqs_db) {
+ Nextflow.error('Invalid parameter combination: --save_mmseqs_db supplied but no database has been requested for download with --metaeuk_mmseqs_db!')
}
}
@@ -173,14 +167,56 @@ class WorkflowMag {
return yaml_file_text
}
- public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) {
+ //
+ // Generate methods description for MultiQC
+ //
+
+ public static String toolCitationText(params) {
+
+ // TODO Optionally add in-text citation tools to this list.
+ // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "",
+ // Uncomment function in methodsDescriptionText to render in MultiQC report
+ def citation_text = [
+ "Tools used in the workflow included:",
+ "FastQC (Andrews 2010),",
+ "MultiQC (Ewels et al. 2016)",
+ "."
+ ].join(' ').trim()
+
+ return citation_text
+ }
+
+ public static String toolBibliographyText(params) {
+
+ // TODO Optionally add bibliographic entries to this list.
+        // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "<li>Author (2023) Pub name, Journal, DOI</li>" : "",
+ // Uncomment function in methodsDescriptionText to render in MultiQC report
+ def reference_text = [
+                "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>"
+ ].join(' ').trim()
+
+ return reference_text
+ }
+
+ public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) {
         // Convert to a named map so it can be used with the familiar NXF ${workflow} variable syntax in the MultiQC YML file
def meta = [:]
meta.workflow = run_workflow.toMap()
meta['manifest_map'] = run_workflow.manifest.toMap()
- meta['doi_text'] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : ''
-        meta['nodoi_text'] = meta.manifest_map.doi ? '' : '<em>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </em>'