diff --git a/CHANGELOG.md b/CHANGELOG.md
index 21b03689..e3477c4a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,28 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## 2.5.1 - [2023-11-17]
+
+### `Added`
+
+### `Changed`
+
+### `Fixed`
+
+- [#489](https://github.com/nf-core/mag/pull/489) Fix file name collisions for CHECKM, CAT, GTDBTK, and QUAST (reported by @tillenglert and @maxibor, fix by @maxibor)
+- [#533](https://github.com/nf-core/mag/pull/533) Fix glob pattern for publishing MetaBAT2 bins in results (reported by @patriciatran, fix by @jfy133)
+- [#535](https://github.com/nf-core/mag/pull/535) Fix input validation pattern to again allow direct FASTQ input (reported by @lennijusten, @emnilsson, fix by @jfy133, @d4straub, @mahesh-panchal, @nvnieuwk)
+
+### `Dependencies`
+
+| Tool | Previous version | New version |
+| ---- | ---------------- | ----------- |
+| CAT | 4.6 | 5.2.3 |
+
+### `Deprecated`
+
+- [#536](https://github.com/nf-core/mag/pull/536) Replace the custom file extension-checking function with native Nextflow functionality (reported by @d4straub, fix by @jfy133)
+
## 2.5.0 - [2023-10-10]
### `Added`
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 49878c35..29a077f7 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,8 +1,8 @@
report_comment: >
-  This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.0" target="_blank">nf-core/mag</a>
+  This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.1" target="_blank">nf-core/mag</a>
analysis pipeline. For information about how to interpret these results, please see the
-  <a href="https://nf-co.re/mag/2.5.0/docs/output" target="_blank">documentation</a>.
+  <a href="https://nf-co.re/mag/2.5.1/docs/output" target="_blank">documentation</a>.
report_section_order:
"nf-core-mag-methods-description":
diff --git a/conf/modules.config b/conf/modules.config
index c87d71d9..cbfa51fb 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -377,8 +377,8 @@ process {
}
withName: 'CHECKM_LINEAGEWF' {
- tag = { "${meta.assembler}-${meta.binner}-${meta.id}" }
- ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_wf" }
+ tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
+ ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" }
publishDir = [
path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
mode: params.publish_dir_mode,
@@ -387,7 +387,7 @@ process {
}
withName: 'CHECKM_QA' {
- ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_qa" }
+ ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" }
ext.args = "-o 2 --tab_table"
publishDir = [
path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
@@ -458,6 +458,7 @@ process {
withName: GTDBTK_CLASSIFYWF {
ext.args = "--extension fa"
+ ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
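+        // domain and refinement are part of the prefix so bins reaching GTDB-Tk via different refinement routes cannot produce colliding file names (PR #489)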
publishDir = [
path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" },
mode: params.publish_dir_mode,
@@ -569,9 +570,9 @@ process {
withName: METABAT2_METABAT2 {
publishDir = [
[
- path: { "${params.outdir}/GenomeBinning/MetaBAT2/" },
+ path: { "${params.outdir}/GenomeBinning/MetaBAT2/bins/" },
mode: params.publish_dir_mode,
- pattern: 'bins/*.fa.gz'
+ pattern: '*[!lowDepth|tooShort|unbinned].fa.gz'
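+                // [!...] is a negated character class, not alternation: numbered bins (ending in digits) are kept, while names whose final character belongs to the lowDepth/tooShort/unbinned suffixes are dropped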
],
[
path: { "${params.outdir}/GenomeBinning/MetaBAT2/discarded" },
diff --git a/docs/output.md b/docs/output.md
index 1061870f..88aba227 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -333,6 +333,8 @@ These depth files are used for downstream binning steps.
All the files and contigs in these folders will be assessed by QUAST and BUSCO.
+All other files, i.e. those that were discarded by the binner or that contain low-quality or unbinned contigs, can also be found here.
+
Output files
@@ -476,6 +478,7 @@ For each bin or refined bin the median sequencing depth is computed based on the
- `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format
- `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor)
- `GenomeBinning/QC/`
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/group.
- `quast_summary.tsv`: QUAST output for all bins summarized
@@ -531,9 +534,9 @@ By default, nf-core/mag runs CheckM with the `check_lineage` workflow that place
Output files
- `GenomeBinning/QC/CheckM/`
- - `[assembler]-[binner]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
- - `[assembler]-[binner]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
- - `[assembler]-[binner]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamination scores (output of `checkm qa`). This should normally be your main file for evaluating your results.
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
- `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`).
@@ -581,14 +584,14 @@ If `--gunc_save_db` is specified, the output directory will also contain the req
Output files
- `Taxonomy/CAT/[assembler]/[binner]/`
- - `[assembler]-[binner]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
- - `[assembler]-[binner]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
- `Taxonomy/CAT/[assembler]/[binner]/raw/`
- - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
- - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
- - `[assembler]-[binner]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
- - `[assembler]-[binner]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
- - `[assembler]-[binner]-[sample/group].log`: Log files
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
+ - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files
@@ -609,14 +612,14 @@ If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally
Output files
- `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/`
- - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html).
- - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
- - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
- - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
- - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
- - `gtdbtk.[assembler]-[binner]-[sample/group].*.log`: Log files.
- - `gtdbtk.[assembler]-[binner]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
-- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk ((listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)).
+ - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
+ - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
+ - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
+ - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
+ - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files.
+ - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
+- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
diff --git a/modules.json b/modules.json
index 73a43c4d..e9162243 100644
--- a/modules.json
+++ b/modules.json
@@ -118,7 +118,7 @@
},
"gtdbtk/classifywf": {
"branch": "master",
- "git_sha": "c67eaf89682a12966f60008a8fa30f5dd29239df",
+ "git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
"installed_by": ["modules"]
},
"gunc/downloaddb": {
diff --git a/modules/local/cat.nf b/modules/local/cat.nf
index 48af75c0..bda355c6 100644
--- a/modules/local/cat.nf
+++ b/modules/local/cat.nf
@@ -1,39 +1,42 @@
process CAT {
- tag "${meta.assembler}-${meta.binner}-${meta.id}-${db_name}"
+ tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}"
- conda "bioconda::cat=4.6 bioconda::diamond=2.0.6"
+ conda "bioconda::cat=5.2.3"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' :
- 'biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }"
+ 'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' :
+ 'biocontainers/cat:5.2.3--hdfd78af_1' }"
input:
tuple val(meta), path("bins/*")
tuple val(db_name), path("database/*"), path("taxonomy/*")
output:
- path("*.names.txt.gz") , emit: tax_classification
- path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca
- path("raw/*.predicted_proteins.faa.gz"), emit: faa
- path("raw/*.predicted_proteins.gff.gz"), emit: gff
- path("raw/*.log") , emit: log
- path("raw/*.bin2classification.txt.gz"), emit: tax_classification_taxids
- path "versions.yml" , emit: versions
+ path("*.ORF2LCA.names.txt.gz") , emit: orf2lca_classification
+ path("*.bin2classification.names.txt.gz") , emit: tax_classification_names
+ path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca
+ path("raw/*.predicted_proteins.faa.gz") , emit: faa
+ path("raw/*.predicted_proteins.gff.gz") , emit: gff
+ path("raw/*.log") , emit: log
+ path("raw/*.bin2classification.txt.gz") , emit: tax_classification_taxids
+ path "versions.yml" , emit: versions
script:
def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : ""
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
"""
- CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing
- CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
- CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
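+    # note: CAT only accepts a non-default --top value together with its --I_know_what_Im_doing flag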
+ CAT bins $args -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${prefix}" --I_know_what_Im_doing
+ CAT add_names -i "${prefix}.ORF2LCA.txt" -o "${prefix}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
+ CAT add_names -i "${prefix}.bin2classification.txt" -o "${prefix}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
mkdir raw
mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
- gzip "raw/${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" \
- "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.faa" \
- "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.gff" \
- "raw/${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" \
- "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" \
- "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt"
+ gzip "raw/${prefix}.ORF2LCA.txt" \
+ "raw/${prefix}.concatenated.predicted_proteins.faa" \
+ "raw/${prefix}.concatenated.predicted_proteins.gff" \
+ "raw/${prefix}.bin2classification.txt" \
+ "${prefix}.ORF2LCA.names.txt" \
+ "${prefix}.bin2classification.names.txt"
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/modules/local/quast_bins.nf b/modules/local/quast_bins.nf
index b8015ad5..e8ae58e7 100644
--- a/modules/local/quast_bins.nf
+++ b/modules/local/quast_bins.nf
@@ -1,5 +1,5 @@
process QUAST_BINS {
- tag "${meta.assembler}-${meta.binner}-${meta.id}"
+ tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
conda "bioconda::quast=5.0.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -15,15 +15,16 @@ process QUAST_BINS {
path "versions.yml" , emit: versions
script:
+ def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
"""
BINS=\$(echo \"$bins\" | sed 's/[][]//g')
IFS=', ' read -r -a bins <<< \"\$BINS\"
for bin in \"\${bins[@]}\"; do
metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}"
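+        # the first bin initialises the summary file including the header row; subsequent bins append their data rows only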
- if ! [ -f "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" ]; then
- cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv"
+ if ! [ -f "QUAST/${prefix}-quast_summary.tsv" ]; then
+ cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${prefix}-quast_summary.tsv"
else
- tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv"
+ tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${prefix}-quast_summary.tsv"
fi
done
diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf
index 0b6b76cc..00da4459 100644
--- a/modules/nf-core/gtdbtk/classifywf/main.nf
+++ b/modules/nf-core/gtdbtk/classifywf/main.nf
@@ -1,28 +1,29 @@
process GTDBTK_CLASSIFYWF {
- tag "${meta.assembler}-${meta.id}"
+ tag "${prefix}"
label 'process_medium'
// WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions.
- conda "bioconda::gtdbtk=2.1.1"
+ conda "bioconda::gtdbtk=2.3.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/gtdbtk:2.1.1--pyhdfd78af_1' :
- 'biocontainers/gtdbtk:2.1.1--pyhdfd78af_1' }"
+ 'https://depot.galaxyproject.org/singularity/gtdbtk:2.3.2--pyhdfd78af_0' :
+ 'biocontainers/gtdbtk:2.3.2--pyhdfd78af_0' }"
input:
tuple val(meta), path("bins/*")
tuple val(db_name), path("database/*")
+ path(mash_db)
output:
- path "gtdbtk.${meta.assembler}-${meta.id}.*.summary.tsv" , emit: summary
- path "gtdbtk.${meta.assembler}-${meta.id}.*.classify.tree.gz" , emit: tree
- path "gtdbtk.${meta.assembler}-${meta.id}.*.markers_summary.tsv", emit: markers
- path "gtdbtk.${meta.assembler}-${meta.id}.*.msa.fasta.gz" , emit: msa
- path "gtdbtk.${meta.assembler}-${meta.id}.*.user_msa.fasta" , emit: user_msa
- path "gtdbtk.${meta.assembler}-${meta.id}.*.filtered.tsv" , emit: filtered
- path "gtdbtk.${meta.assembler}-${meta.id}.log" , emit: log
- path "gtdbtk.${meta.assembler}-${meta.id}.warnings.log" , emit: warnings
- path "gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv" , emit: failed
- path "versions.yml" , emit: versions
+ tuple val(meta), path("gtdbtk.${prefix}.*.summary.tsv") , emit: summary
+ tuple val(meta), path("gtdbtk.${prefix}.*.classify.tree.gz") , emit: tree, optional: true
+ tuple val(meta), path("gtdbtk.${prefix}.*.markers_summary.tsv") , emit: markers, optional: true
+ tuple val(meta), path("gtdbtk.${prefix}.*.msa.fasta.gz") , emit: msa, optional: true
+ tuple val(meta), path("gtdbtk.${prefix}.*.user_msa.fasta.gz") , emit: user_msa, optional: true
+ tuple val(meta), path("gtdbtk.${prefix}.*.filtered.tsv") , emit: filtered, optional: true
+ tuple val(meta), path("gtdbtk.${prefix}.failed_genomes.tsv") , emit: failed, optional: true
+ tuple val(meta), path("gtdbtk.${prefix}.log") , emit: log
+ tuple val(meta), path("gtdbtk.${prefix}.warnings.log") , emit: warnings
+ path("versions.yml") , emit: versions
when:
task.ext.when == null || task.ext.when
@@ -30,6 +31,8 @@ process GTDBTK_CLASSIFYWF {
script:
def args = task.ext.args ?: ''
def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : ""
+ def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen"
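+    // recent GTDB-Tk releases require an explicit choice: run ANI screening against a Mash db, or pass --skip_ani_screen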
+ prefix = task.ext.prefix ?: "${meta.id}"
"""
export GTDBTK_DATA_PATH="\${PWD}/database"
@@ -40,17 +43,25 @@ process GTDBTK_CLASSIFYWF {
gtdbtk classify_wf \\
$args \\
--genome_dir bins \\
- --prefix "gtdbtk.${meta.assembler}-${meta.id}" \\
+ --prefix "gtdbtk.${prefix}" \\
--out_dir "\${PWD}" \\
--cpus $task.cpus \\
- --pplacer_cpus $params.gtdbtk_pplacer_cpus \\
+ $mash_mode \\
$pplacer_scratch \\
--min_perc_aa $params.gtdbtk_min_perc_aa \\
--min_af $params.gtdbtk_min_af
- gzip "gtdbtk.${meta.assembler}-${meta.id}".*.classify.tree "gtdbtk.${meta.assembler}-${meta.id}".*.msa.fasta
- mv gtdbtk.log "gtdbtk.${meta.assembler}-${meta.id}.log"
- mv gtdbtk.warnings.log "gtdbtk.${meta.assembler}-${meta.id}.warnings.log"
+ mv classify/* .
+
+ mv identify/* .
+
+    mv align/* .
+
+ mv gtdbtk.log "gtdbtk.${prefix}.log"
+
+ mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log"
+
+    find . -name "gtdbtk.${prefix}.*.classify.tree" | xargs -r gzip # do not fail if .tree is missing
cat <<-END_VERSIONS > versions.yml
"${task.process}":
@@ -59,18 +70,18 @@ process GTDBTK_CLASSIFYWF {
"""
stub:
- def VERSION = '2.1.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
-
+ def VERSION = '2.3.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+ prefix = task.ext.prefix ?: "${meta.id}"
"""
- touch gtdbtk.${meta.assembler}-${meta.id}.stub.summary.tsv
- touch gtdbtk.${meta.assembler}-${meta.id}.stub.classify.tree.gz
- touch gtdbtk.${meta.assembler}-${meta.id}.stub.markers_summary.tsv
- touch gtdbtk.${meta.assembler}-${meta.id}.stub.msa.fasta.gz
- touch gtdbtk.${meta.assembler}-${meta.id}.stub.user_msa.fasta
- touch gtdbtk.${meta.assembler}-${meta.id}.stub.filtered.tsv
- touch gtdbtk.${meta.assembler}-${meta.id}.log
- touch gtdbtk.${meta.assembler}-${meta.id}.warnings.log
- touch gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv
+ touch gtdbtk.${prefix}.stub.summary.tsv
+ touch gtdbtk.${prefix}.stub.classify.tree.gz
+ touch gtdbtk.${prefix}.stub.markers_summary.tsv
+ touch gtdbtk.${prefix}.stub.msa.fasta.gz
+ touch gtdbtk.${prefix}.stub.user_msa.fasta.gz
+ touch gtdbtk.${prefix}.stub.filtered.tsv
+ touch gtdbtk.${prefix}.log
+ touch gtdbtk.${prefix}.warnings.log
+ touch gtdbtk.${prefix}.failed_genomes.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml
index 4e7ec5f1..4319bc74 100644
--- a/modules/nf-core/gtdbtk/classifywf/meta.yml
+++ b/modules/nf-core/gtdbtk/classifywf/meta.yml
@@ -31,6 +31,10 @@ input:
type: file
description: The local copy of the taxonomic database used by GTDB-tk (unzipped copy)
pattern: "*"
+ - mash_db:
+ type: file
+ description: The local copy of the Mash sketch database used by GTDB-tk if `ani_screen` mode is used (optional)
+ pattern: "*.msh"
output:
- meta:
diff --git a/nextflow.config b/nextflow.config
index 56bc9857..764a48b1 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -88,6 +88,7 @@ params {
save_cat_db = false
skip_gtdbtk = false
gtdb_db = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz"
+ gtdb_mash = null
gtdbtk_min_completeness = 50.0
gtdbtk_max_contamination = 10.0
gtdbtk_min_perc_aa = 10
@@ -322,11 +323,6 @@ docker.registry = 'quay.io'
podman.registry = 'quay.io'
singularity.registry = 'quay.io'
-// Nextflow plugins
-plugins {
- id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet
-}
-
// Load igenomes.config if required
if (!params.igenomes_ignore) {
includeConfig 'conf/igenomes.config'
@@ -372,7 +368,7 @@ manifest {
description = """Assembly, binning and annotation of metagenomes"""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
- version = '2.5.0'
+ version = '2.5.1'
doi = '10.1093/nargab/lqac007'
}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 837b4e20..13f7e6bc 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -14,11 +14,10 @@
"properties": {
"input": {
"type": "string",
- "format": "file-path",
+ "format": "file-path-pattern",
"exists": true,
- "mimetype": "text/csv",
- "pattern": "^\\S+\\.csv$",
- "description": "Input FastQ files or CSV samplesheet file containing information about the samples in the experiment.",
+ "pattern": "^\\S+\\.csv$|^\\S+\\.(fastq|fq).gz$",
+ "description": "Input FastQ files (gzip compressed) or CSV samplesheet file containing information about the samples in the experiment.",
"help_text": "Use this to specify the location of your input FastQ files. For example:\n\n```bash\n--input 'path/to/data/sample_*_{1,2}.fastq.gz'\n``` \n\nAlternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file with 5 columns and the following header: sample,group,short_reads_1,short_reads_2,long_reads. See [usage docs](https://nf-co.re/mag/usage#input-specifications).",
"fa_icon": "fas fa-file-csv"
},
@@ -525,6 +524,10 @@
"description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.",
"default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz"
},
+ "gtdb_mash": {
+ "type": "string",
+ "description": "Specify the location of a GTDBTK mash database. If missing, GTDB-Tk will skip the ani_screening step"
+ },
"gtdbtk_min_completeness": {
"type": "number",
"default": 50.0,
@@ -808,7 +811,7 @@
"default": "raw_bins_only",
"description": "Specify which binning output is sent for downstream annotation, taxonomic classification, bin quality control etc.",
"help_text": "`raw_bins_only`: only bins (and unbinned contigs) from the binners.\n`refined_bins_only`: only bins (and unbinned contigs) from the bin refinement step .\n\n ~~`both`: bins and unbinned contigs from both the binning and bin refinement steps.~~ `both` option is disabled in v2.4 due a bug that will be fixed in a later release.",
- "enum": ["raw_bins_only", "refined_bins_only"]
+ "enum": ["raw_bins_only", "refined_bins_only", "both"]
},
"run_gunc": {
"type": "boolean",
diff --git a/subworkflows/local/binning_refinement.nf b/subworkflows/local/binning_refinement.nf
index eea8c76a..360bffaa 100644
--- a/subworkflows/local/binning_refinement.nf
+++ b/subworkflows/local/binning_refinement.nf
@@ -25,7 +25,7 @@ workflow BINNING_REFINEMENT {
// everything here is either unclassified or a prokaryote
ch_bins = bins
.map { meta, bins ->
- def meta_new = meta - meta.subMap('domain')
+ def meta_new = meta - meta.subMap(['domain','refinement'])
[meta_new, bins]
}
.groupTuple()
@@ -88,7 +88,7 @@ workflow BINNING_REFINEMENT {
.map {
meta, bins ->
def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified'
- def meta_new = meta + [domain: domain_class]
+ def meta_new = meta + [refinement: 'dastool_refined', domain: domain_class]
[ meta_new, bins ]
}
@@ -96,14 +96,21 @@ workflow BINNING_REFINEMENT {
.map {
meta, bins ->
def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified'
- def meta_new = meta + [binner: 'DASTool', domain: domain_class]
+ def meta_new = meta + [refinement: 'dastool_refined', binner: 'DASTool', domain: domain_class]
[ meta_new, bins ]
}
RENAME_POSTDASTOOL ( ch_input_for_renamedastool )
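+    // label DAS Tool's leftover unbinned contigs separately from the refined bins, keeping downstream file prefixes unique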
+ refined_unbins = RENAME_POSTDASTOOL.out.refined_unbins
+ .map {
+ meta, bins ->
+ def meta_new = meta + [refinement: 'dastool_refined_unbinned']
+ [meta_new, bins]
+ }
+
emit:
refined_bins = ch_dastool_bins_newmeta
- refined_unbins = RENAME_POSTDASTOOL.out.refined_unbins
+ refined_unbins = refined_unbins
versions = ch_versions
}
diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf
index ce92efd5..a5c3be8d 100644
--- a/subworkflows/local/busco_qc.nf
+++ b/subworkflows/local/busco_qc.nf
@@ -65,6 +65,10 @@ workflow BUSCO_QC {
BUSCO_SAVE_DOWNLOAD ( ch_downloads )
}
BUSCO_SUMMARY (
BUSCO.out.summary_domain.map{it[1]}.collect().ifEmpty([]),
BUSCO.out.summary_specific.map{it[1]}.collect().ifEmpty([]),
diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf
index 81c93c6f..012899ad 100644
--- a/subworkflows/local/depths.nf
+++ b/subworkflows/local/depths.nf
@@ -19,21 +19,32 @@ workflow DEPTHS {
main:
ch_versions = Channel.empty()
// Compute bin depths for different samples (according to `binning_map_mode`)
- // Create a new meta joining key first, but copy meta so that
+ // Create a new meta combine key first, but copy meta so that
// we retain the information about binners and domain classification
ch_depth_input = bins_unbins
- .map { meta, bins ->
- def meta_join = meta - meta.subMap('binner','domain')
- [ meta_join, meta, bins ]
- }
- .combine( depths, by: 0 )
- .map { meta_join, meta, bins, contig_depths_file ->
- def meta_new = meta - meta.subMap('domain')
- [ meta_new, bins, contig_depths_file ]
+ .map {
+ meta, bins ->
+ def meta_combine = meta - meta.subMap('binner','domain','refinement')
+ [meta_combine, meta, bins]
}
+ .groupTuple()
+ .combine(depths, by: 0)
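+        // every grouped set of bin files is now paired with its sample-level depths file; transpose() then emits one bin per tuple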
.transpose()
+ .map {
+ meta_combine, meta, bins, depth ->
+ def meta_new = meta - meta.subMap('domain','refinement')
+ [meta_new, bins, depth]
+ }
.groupTuple(by: [0,2])
+ .map {
+ meta, bins, depth ->
+ [meta, bins.unique().flatten(), depth]
+ }
+
MAG_DEPTHS ( ch_depth_input )
diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf
index 7586b142..2f110a43 100644
--- a/subworkflows/local/gtdbtk.nf
+++ b/subworkflows/local/gtdbtk.nf
@@ -12,6 +12,7 @@ workflow GTDBTK {
busco_summary // channel: path
checkm_summary // channel: path
gtdb // channel: path
+ gtdb_mash // channel: path
main:
// Filter bins: classify only medium & high quality MAGs
@@ -46,6 +47,7 @@ workflow GTDBTK {
}
}
+
// Filter bins based on collected metrics: completeness, contamination
ch_filtered_bins = bins
.transpose()
@@ -76,14 +78,17 @@ workflow GTDBTK {
GTDBTK_CLASSIFYWF (
ch_filtered_bins.passed.groupTuple(),
- ch_db_for_gtdbtk
+ ch_db_for_gtdbtk,
+ gtdb_mash
)
GTDBTK_SUMMARY (
ch_filtered_bins.discarded.map{it[1]}.collect().ifEmpty([]),
-        GTDBTK_CLASSIFYWF.out.summary.collect().ifEmpty([]),
+        GTDBTK_CLASSIFYWF.out.summary.map{it[1]}.collect().ifEmpty([]),
- GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([]),
- GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([])
+        [], // was: GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([])
+        []  // was: GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([])
)
emit:
diff --git a/workflows/mag.nf b/workflows/mag.nf
index 11449c8c..160928d2 100644
--- a/workflows/mag.nf
+++ b/workflows/mag.nf
@@ -11,11 +11,8 @@ def citation = '\n' + WorkflowMain.citation(workflow) + '\n'
def summary_params = paramsSummaryMap(workflow)
// Check already if long reads are provided
-def hasExtension(it, extension) {
- it.toString().toLowerCase().endsWith(extension.toLowerCase())
-}
def hybrid = false
-if(hasExtension(params.input, "csv")){
+if(file(params.input).extension == 'csv'){
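+    // Path.extension is provided natively by Nextflow, replacing the removed hasExtension() helper (see #536)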
Channel
.from(file(params.input))
.splitCsv(header: true)
@@ -207,6 +204,7 @@ gtdb = ( params.skip_binqc || params.skip_gtdbtk ) ? false : params.gtdb_db
if (gtdb) {
gtdb = file( "${gtdb}", checkIfExists: true)
+ gtdb_mash = params.gtdb_mash ? file("${params.gtdb_mash}", checkIfExists: true) : []
} else {
gtdb = []
}
@@ -781,6 +779,20 @@ workflow MAG {
* DAS Tool: binning refinement
*/
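+    // tag all raw binning results with a refinement status first; results passing through
+    // DAS Tool are re-labelled 'dastool_refined' / 'dastool_refined_unbinned' later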
+ ch_binning_results_bins = ch_binning_results_bins
+ .map { meta, bins ->
+ def meta_new = meta + [refinement:'unrefined']
+ [meta_new , bins]
+ }
+
+ ch_binning_results_unbins = ch_binning_results_unbins
+ .map { meta, bins ->
+ def meta_new = meta + [refinement:'unrefined_unbinned']
+ [meta_new, bins]
+ }
+
// If any two of the binners are both skipped at once, do not run because DAS_Tool needs at least one
if ( params.refine_bins_dastool ) {
ch_prokarya_bins_dastool = ch_binning_results_bins
@@ -801,7 +813,13 @@ workflow MAG {
}
BINNING_REFINEMENT ( ch_contigs_for_binrefinement, ch_prokarya_bins_dastool )
- ch_refined_bins = ch_eukarya_bins_dastool.mix(BINNING_REFINEMENT.out.refined_bins)
+ // ch_refined_bins = ch_eukarya_bins_dastool
+ // .map{ meta, bins ->
+ // def meta_new = meta + [refinement: 'eukaryote_unrefined']
+ // [meta_new, bins]
+ // }.mix( BINNING_REFINEMENT.out.refined_bins)
+
+ ch_refined_bins = BINNING_REFINEMENT.out.refined_bins
ch_refined_unbins = BINNING_REFINEMENT.out.refined_unbins
ch_versions = ch_versions.mix(BINNING_REFINEMENT.out.versions)
@@ -813,10 +831,10 @@ workflow MAG {
ch_input_for_postbinning_bins_unbins = ch_refined_bins.mix(ch_refined_unbins)
// TODO REACTIVATE ONCE PR #489 IS READY!
// TODO RE-ADD BOTH TO SCHEMA ONCE RE-ADDING
- // } else if ( params.postbinning_input == 'both' ) {
- // ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins)
- // ch_input_for_postbinning_bins = ch_all_bins
- // ch_input_for_postbinning_bins_unbins = ch_all_bins.mix(ch_binning_results_unbins).mix(ch_refined_unbins)
+ } else if ( params.postbinning_input == 'both' ) {
+ ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins)
+ ch_input_for_postbinning_bins = ch_all_bins
+ ch_input_for_postbinning_bins_unbins = ch_all_bins.mix(ch_binning_results_unbins).mix(ch_refined_unbins)
}
} else {
ch_input_for_postbinning_bins = ch_binning_results_bins
@@ -888,9 +906,9 @@ workflow MAG {
ch_input_for_quast_bins = ch_input_for_postbinning_bins_unbins
.groupTuple()
.map {
- meta, reads ->
- def new_reads = reads.flatten()
- [meta, new_reads]
+ meta, bins ->
+ def new_bins = bins.flatten()
+ [meta, new_bins]
}
QUAST_BINS ( ch_input_for_quast_bins )
@@ -915,7 +933,7 @@ workflow MAG {
ch_cat_db
)
CAT_SUMMARY(
- CAT.out.tax_classification.collect()
+ CAT.out.tax_classification_names.collect()
)
ch_versions = ch_versions.mix(CAT.out.versions.first())
ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions)
@@ -938,7 +956,8 @@ workflow MAG {
ch_gtdb_bins,
ch_busco_summary,
ch_checkm_summary,
- gtdb
+ gtdb,
+ gtdb_mash
)
ch_versions = ch_versions.mix(GTDBTK.out.versions.first())
ch_gtdbtk_summary = GTDBTK.out.summary