diff --git a/CHANGELOG.md b/CHANGELOG.md
index 916c78035..6558baf06 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,8 +7,36 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Enhancements & fixes
+- [PR #1351](https://github.com/nf-core/rnaseq/pull/1351) - Adding Kraken2/Bracken on unaligned reads as an additional quality control step to detect sample contamination
- [PR #1186](https://github.com/nf-core/rnaseq/pull/1186) - Bump pipeline version to 3.16.0dev
+### Parameters
+
+| Old parameter | New parameter |
+| ------------- | --------------------------- |
+| | `--contaminant_screening` |
+| | `--kraken_db` |
+| | `--save_kraken_assignments` |
+| | `--save_kraken_unassigned` |
+| | `--bracken_precision` |
+
+> **NB:** Parameter has been **updated** if both old and new parameter information is present.
+>
+> **NB:** Parameter has been **added** if just the new parameter information is present.
+>
+> **NB:** Parameter has been **removed** if new parameter information isn't present.
+
+### Software dependencies
+
+| Dependency | Old version | New version |
+| ---------- | ----------- | ----------- |
+| `Kraken2` | | 2.1.3 |
+| `Bracken` | | 2.9 |
+
+> **NB:** Dependency has been **updated** if both old and new version information is present.
+>
+> **NB:** Dependency has been **added** if just the new version information is present.
+>
+> **NB:** Dependency has been **removed** if new version information isn't present.
+
## [[3.15.1](https://github.com/nf-core/rnaseq/releases/tag/3.15.1)] - 2024-09-16
### Enhancements & fixes
diff --git a/CITATIONS.md b/CITATIONS.md
index 48a60b8e4..5eaeea7f8 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -16,6 +16,10 @@
> Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. doi: 10.1093/bioinformatics/btq033. Epub 2010 Jan 28. PubMed PMID: 20110278; PubMed Central PMCID: PMC2832824.
+- [Bracken](https://doi.org/10.7717/peerj-cs.104)
+
+ > Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ. Computer Science, 3(e104), e104. https://doi.org/10.7717/peerj-cs.104
+
- [fastp](https://www.ncbi.nlm.nih.gov/pubmed/30423086/)
> Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018 Sep 1;34(17):i884-i890. doi: 10.1093/bioinformatics/bty560. PubMed PMID: 30423086; PubMed Central PMCID: PMC6129281.
@@ -38,6 +42,10 @@
> Kim D, Paggi JM, Park C, Bennett C, Salzberg SL. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. 2019 Aug;37(8):907-915. doi: 10.1038/s41587-019-0201-4. Epub 2019 Aug 2. PubMed PMID: 31375807.
+- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0)
+
+ > Wood, D. E., Lu, J., & Langmead, B. (2019). Improved metagenomic analysis with Kraken 2. Genome Biology, 20(1), 257. https://doi.org/10.1186/s13059-019-1891-0
+
- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
diff --git a/README.md b/README.md
index c5f58478e..01da6cf8b 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,7 @@
3. [`dupRadar`](https://bioconductor.org/packages/release/bioc/html/dupRadar.html)
4. [`Preseq`](http://smithlabresearch.org/software/preseq/)
5. [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)
+ 6. [`Kraken2`](https://ccb.jhu.edu/software/kraken2/) -> [`Bracken`](https://ccb.jhu.edu/software/bracken/) on unaligned sequences; _optional_
15. Pseudoalignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/) or ['Kallisto'](https://pachterlab.github.io/kallisto/); _optional_)
16. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
diff --git a/docs/images/bracken-top-n-plot.png b/docs/images/bracken-top-n-plot.png
new file mode 100644
index 000000000..ac2d485f6
Binary files /dev/null and b/docs/images/bracken-top-n-plot.png differ
diff --git a/docs/images/nf-core-rnaseq_metro_map_grey.png b/docs/images/nf-core-rnaseq_metro_map_grey.png
index d2c040315..1cb1cf32f 100644
Binary files a/docs/images/nf-core-rnaseq_metro_map_grey.png and b/docs/images/nf-core-rnaseq_metro_map_grey.png differ
diff --git a/docs/images/nf-core-rnaseq_metro_map_grey.svg b/docs/images/nf-core-rnaseq_metro_map_grey.svg
index 7b55ae7e5..c36c3e839 100644
--- a/docs/images/nf-core-rnaseq_metro_map_grey.svg
+++ b/docs/images/nf-core-rnaseq_metro_map_grey.svg
@@ -7,7 +7,7 @@
viewBox="0 0 646.4851 269.92565"
version="1.1"
id="svg8"
- inkscape:version="1.3.2 (1:1.3.2+202311252150+091e20ef0f)"
+ inkscape:version="1.3.2 (091e20e, 2023-11-25)"
sodipodi:docname="nf-core-rnaseq_metro_map_grey.svg"
inkscape:export-filename="nf-core-rnaseq_metro_map_grey.png"
inkscape:export-xdpi="89"
@@ -2775,24 +2775,39 @@
id="stop23-7" />RSeQC(multiple modules)Kraken2/Bracken5RSeQC(multiple modules)PreseqPreseqdupRadarDESeq2(PCA only)MultiQCQualimaprnaseqHTMLHTMLHTMLdupRadarMultiQCQualimaprnaseqDESeq2(PCA only)
+### Kraken2/Bracken
+
+
+Output files
+
+- `<ALIGNER>/contaminants/kraken2/kraken_reports`
+ - `*.kraken2.report.txt`: Classification of unaligned reads in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details
+ - `*.classified*.fastq.gz`: If `--save_kraken_assignments` is specified, a FastQ file is output for each sample with each classified read annotated with its taxonomic identification from Kraken2.
+ - `*.unclassified*.fastq.gz`: If `--save_kraken_unassigned` is specified, a FastQ file is output containing all reads that were not classified by Kraken2.
+- `<ALIGNER>/contaminants/bracken/`
+ - `*.kraken2.report_bracken.txt`: Kraken-style reports of the Bracken abundance estimate results. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details.
+ - `*.tsv`: Summary of estimated reads for each taxon member at the given classification level and what corrections were made from Kraken2.
+
+
+
+[Kraken2](https://ccb.jhu.edu/software/kraken2/) is a taxonomic classification tool that uses k-mer matches paired with a lowest common ancestor (LCA) algorithm to classify species reads. [Bracken](https://ccb.jhu.edu/software/bracken/) is a statistical method to generate abundance estimates based on the Kraken2 output. These algorithms are run on unaligned sequences to detect potential contamination of samples. MultiQC reports the top 5 taxon members detected at the level of classification used for Bracken, with toggles available for higher taxonomic levels. If Bracken is skipped, MultiQC will report the top 5 species detected by Kraken2.
+
+![MultiQC - Bracken top species plot](images/bracken-top-n-plot.png)
+
### MultiQC
@@ -675,7 +695,7 @@ Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQ
### Pseudoalignment
-The principal output files are the same between Salmon and Kallsto:
+The principal output files are the same between Salmon and Kallisto:
Output files
diff --git a/docs/usage.md b/docs/usage.md
index b613ab79d..cd42a399b 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -296,6 +296,14 @@ Notes:
By default, the input GTF file will be filtered to ensure that sequence names correspond to those in the genome fasta file, and to remove rows with empty transcript identifiers. Filtering can be bypassed completely where you are confident it is not necessary, using the `--skip_gtf_filter` parameter. If you just want to skip the 'transcript_id' checking component of the GTF filtering script used in the pipeline this can be disabled specifically using the `--skip_gtf_transcript_filter` parameter.
+## Contamination screening options
+
+The pipeline provides the option to scan unaligned reads for contamination from other species using [Kraken2](https://ccb.jhu.edu/software/kraken2/), with the possibility of applying corrections from [Bracken](https://ccb.jhu.edu/software/bracken/). Since running Bracken is not computationally expensive, we recommend always using it to refine the abundance estimates generated by Kraken2.
+
+It is important to note that the accuracy of Kraken2 is [highly dependent on the database](https://doi.org/10.1099/mgen.0.000949) used. Specifically, it is [crucial](https://doi.org/10.1128/mbio.01607-23) to ensure that the host genome is included in the database. If you are particularly concerned about certain contaminants, it may be beneficial to use a smaller, more focused database containing primarily those contaminants instead of the full standard database. Various pre-built databases [are available for download](https://benlangmead.github.io/aws-indexes/k2), and instructions for building a custom database can be found in the [Kraken2 documentation](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown). Additionally, genomes of contaminants detected in previous sequencing experiments are available on the [OpenContami website](https://openlooper.hgc.jp/opencontami/help/help_oct.php).
+
+While Kraken2 is capable of detecting low-abundance contaminants in a sample, false positives can occur. Therefore, if only a very small number of reads from a contaminating species are detected, these results should be interpreted with caution.
+
## Running the pipeline
The typical command for running the pipeline is as follows:
diff --git a/modules.json b/modules.json
index 30090ab26..4c9ba76f1 100644
--- a/modules.json
+++ b/modules.json
@@ -15,6 +15,11 @@
"git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",
"installed_by": ["modules"]
},
+ "bracken/bracken": {
+ "branch": "master",
+ "git_sha": "c214fad97b328eb6d6233f779be9ba44814a9136",
+ "installed_by": ["modules"]
+ },
"cat/fastq": {
"branch": "master",
"git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",
@@ -68,7 +73,8 @@
"hisat2/align": {
"branch": "master",
"git_sha": "ad30f90cfc383dfaa505771d24f9e292c53157ab",
- "installed_by": ["fastq_align_hisat2"]
+ "installed_by": ["fastq_align_hisat2"],
+ "patch": "modules/nf-core/hisat2/align/hisat2-align.diff"
},
"hisat2/build": {
"branch": "master",
@@ -90,6 +96,11 @@
"git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",
"installed_by": ["modules", "quantify_pseudo_alignment"]
},
+ "kraken2/kraken2": {
+ "branch": "master",
+ "git_sha": "a13d5d945742a60bbef6e5c177e81cda540f75dc",
+ "installed_by": ["modules"]
+ },
"multiqc": {
"branch": "master",
"git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",
diff --git a/modules/nf-core/bracken/bracken/environment.yml b/modules/nf-core/bracken/bracken/environment.yml
new file mode 100644
index 000000000..6eb5b1b9a
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/environment.yml
@@ -0,0 +1,7 @@
+name: bracken_bracken
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+dependencies:
+ - "bioconda::bracken=2.9"
diff --git a/modules/nf-core/bracken/bracken/main.nf b/modules/nf-core/bracken/bracken/main.nf
new file mode 100644
index 000000000..e3d32fb21
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/main.nf
@@ -0,0 +1,55 @@
+process BRACKEN_BRACKEN {
+ tag "$meta.id"
+ label 'process_low'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/bracken:2.9--py38h2494328_0':
+ 'biocontainers/bracken:2.9--py38h2494328_0' }"
+
+ input:
+ tuple val(meta), path(kraken_report)
+ path database
+
+ output:
+ tuple val(meta), path(bracken_report) , emit: reports
+ tuple val(meta), path(bracken_kraken_style_report), emit: txt
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ""
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ bracken_report = "${prefix}.tsv"
+ bracken_kraken_style_report = "${prefix}.kraken2.report_bracken.txt"
+ """
+ bracken \\
+ ${args} \\
+ -d '${database}' \\
+ -i '${kraken_report}' \\
+ -o '${bracken_report}' \\
+ -w '${bracken_kraken_style_report}'
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bracken: \$(echo \$(bracken -v) | cut -f2 -d'v')
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ bracken_report = "${prefix}.tsv"
+ bracken_kraken_style_report = "${prefix}.kraken2.report_bracken.txt"
+ """
+ touch ${prefix}.tsv
+ touch ${bracken_kraken_style_report}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ bracken: \$(echo \$(bracken -v) | cut -f2 -d'v')
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/bracken/bracken/meta.yml b/modules/nf-core/bracken/bracken/meta.yml
new file mode 100644
index 000000000..b7ff4489f
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/meta.yml
@@ -0,0 +1,51 @@
+name: bracken_bracken
+description: Re-estimate taxonomic abundance of metagenomic samples analyzed by kraken.
+keywords:
+ - bracken
+ - metagenomics
+ - abundance
+ - kraken2
+tools:
+ - bracken:
+ description: Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample.
+ homepage: https://ccb.jhu.edu/software/bracken/
+ documentation: https://ccb.jhu.edu/software/bracken/index.shtml?t=manual
+ tool_dev_url: https://github.com/jenniferlu717/Bracken
+ doi: "10.7717/peerj-cs.104"
+ licence: ["GPL v3"]
+input:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - kraken_report:
+ type: file
+ description: TSV file with six columns coming from kraken2 output
+ pattern: "*.{tsv}"
+ - database:
+ type: file
+ description: Directory containing the kraken2/Bracken files for analysis
+ pattern: "*"
+output:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+ - reports:
+ type: file
+ description: TSV output report of the re-estimated abundances
+ pattern: "*.{tsv}"
+ - txt:
+ type: file
+ description: TXT file of bracken corrected results of Kraken2 report output
+ pattern: "*.txt"
+authors:
+ - "@Midnighter"
+maintainers:
+ - "@Midnighter"
diff --git a/modules/nf-core/bracken/bracken/nextflow.config b/modules/nf-core/bracken/bracken/nextflow.config
new file mode 100644
index 000000000..975a9793a
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/nextflow.config
@@ -0,0 +1,13 @@
+if (!params.skip_alignment && !params.skip_qc) { // settings below apply only when neither alignment nor QC is skipped
+    if (params.contaminant_screening == 'kraken2_bracken') { // ...and only when the Kraken2 + Bracken screening mode is selected
+        process {
+            withName: 'BRACKEN' { // NOTE(review): the module process is named BRACKEN_BRACKEN; presumably it is included under the alias BRACKEN - confirm against the workflow include
+                ext.args = "-l ${params.bracken_precision}" // forwards --bracken_precision to bracken's -l option
+                publishDir = [
+                    path: { "${params.outdir}/${params.aligner}/contaminants/bracken" }, // closure: resolved per task, results land under the aligner's output directory
+                    mode: params.publish_dir_mode
+                ]
+            }
+        }
+    }
+}
diff --git a/modules/nf-core/bracken/bracken/tests/genus_test.config b/modules/nf-core/bracken/bracken/tests/genus_test.config
new file mode 100644
index 000000000..bc5f63ae0
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/tests/genus_test.config
@@ -0,0 +1,5 @@
+process {
+ withName: BRACKEN_BRACKEN {
+ ext.args = "-l G -t 10 -r 150"
+ }
+}
diff --git a/modules/nf-core/bracken/bracken/tests/main.nf.test b/modules/nf-core/bracken/bracken/tests/main.nf.test
new file mode 100644
index 000000000..9d2105ded
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/tests/main.nf.test
@@ -0,0 +1,167 @@
+nextflow_process {
+
+ name "Test Process BRACKEN_BRACKEN"
+ script "../main.nf"
+ process "BRACKEN_BRACKEN"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "bracken"
+ tag "bracken/bracken"
+ tag "kraken2/kraken2"
+ tag "untar"
+
+ setup {
+ run ("UNTAR") {
+ script "../../../untar/main.nf"
+ process {
+ """
+ input[0] = [[],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2_bracken.tar.gz', checkIfExists: true)]
+ """
+ }
+ }
+ }
+
+ test("sarscov2 - single-end - fastq") {
+
+ config "./nextflow.config"
+
+ setup {
+ run("KRAKEN2_KRAKEN2") {
+ script "../../../kraken2/kraken2/main.nf"
+ process {
+ """
+ input[0] = [[id: 'test', single_end: true], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)]
+ input[1] = UNTAR.out.untar.map{it[1]}
+ input[2] = false
+ input[3] = false
+ """
+ }
+ }
+ }
+
+ when {
+ process {
+ """
+ input[0] = KRAKEN2_KRAKEN2.out.report
+ input[1] = UNTAR.out.untar.map{it[1]}
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+ test("sarscov2 - paired-end - fastq - genus config") {
+
+ config "./genus_test.config"
+
+ setup {
+ run("KRAKEN2_KRAKEN2") {
+ script "../../../kraken2/kraken2/main.nf"
+ process {
+ """
+ input[0] = [
+ [id: 'test', single_end: false],
+ [
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)
+ ]
+ ]
+ input[1] = UNTAR.out.untar.map{it[1]}
+ input[2] = false
+ input[3] = false
+ """
+ }
+ }
+ }
+
+ when {
+ process {
+ """
+ input[0] = KRAKEN2_KRAKEN2.out.report
+ input[1] = UNTAR.out.untar.map{it[1]}
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+ test("sarscov2 - paired-end - fastq") {
+
+ config "./nextflow.config"
+
+ setup {
+ run("KRAKEN2_KRAKEN2") {
+ script "../../../kraken2/kraken2/main.nf"
+ process {
+ """
+ input[0] = [
+ [id: 'test', single_end: false],
+ [
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)
+ ]
+ ]
+ input[1] = UNTAR.out.untar.map{it[1]}
+ input[2] = false
+ input[3] = false
+ """
+ }
+ }
+ }
+
+ when {
+ process {
+ """
+ input[0] = KRAKEN2_KRAKEN2.out.report
+ input[1] = UNTAR.out.untar.map{it[1]}
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+ test("sarscov2 - stub - fastq") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [[id: 'test'],file(params.modules_testdata_base_path + 'generic/txt/hello.txt', checkIfExists: true)]
+ input[1] = UNTAR.out.untar.map{it[1]}
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+}
diff --git a/modules/nf-core/bracken/bracken/tests/main.nf.test.snap b/modules/nf-core/bracken/bracken/tests/main.nf.test.snap
new file mode 100644
index 000000000..c97a4c71b
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/tests/main.nf.test.snap
@@ -0,0 +1,210 @@
+{
+ "sarscov2 - single-end - fastq": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.tsv:md5,4a21ae14ff8a0311d55f139af5247838"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.kraken2.report_bracken.txt:md5,ca0fbeedc4353b5fdd081688823a33df"
+ ]
+ ],
+ "2": [
+ "versions.yml:md5,599fbbf4c1cd5851022a98788f2afdba"
+ ],
+ "reports": [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.tsv:md5,4a21ae14ff8a0311d55f139af5247838"
+ ]
+ ],
+ "txt": [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.kraken2.report_bracken.txt:md5,ca0fbeedc4353b5fdd081688823a33df"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,599fbbf4c1cd5851022a98788f2afdba"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-08-06T11:35:03.995620397"
+ },
+ "sarscov2 - paired-end - fastq - genus config": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.tsv:md5,f609b09d6edb5ebc1ea1435d1dd46cde"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.kraken2.report_bracken.txt:md5,2ce58814420a3690da1f08e10e8d3a30"
+ ]
+ ],
+ "2": [
+ "versions.yml:md5,599fbbf4c1cd5851022a98788f2afdba"
+ ],
+ "reports": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.tsv:md5,f609b09d6edb5ebc1ea1435d1dd46cde"
+ ]
+ ],
+ "txt": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.kraken2.report_bracken.txt:md5,2ce58814420a3690da1f08e10e8d3a30"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,599fbbf4c1cd5851022a98788f2afdba"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-08-06T12:13:33.399680181"
+ },
+ "sarscov2 - paired-end - fastq": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.tsv:md5,4a21ae14ff8a0311d55f139af5247838"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.kraken2.report_bracken.txt:md5,ca0fbeedc4353b5fdd081688823a33df"
+ ]
+ ],
+ "2": [
+ "versions.yml:md5,599fbbf4c1cd5851022a98788f2afdba"
+ ],
+ "reports": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.tsv:md5,4a21ae14ff8a0311d55f139af5247838"
+ ]
+ ],
+ "txt": [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.kraken2.report_bracken.txt:md5,ca0fbeedc4353b5fdd081688823a33df"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,599fbbf4c1cd5851022a98788f2afdba"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-08-06T12:09:15.465609745"
+ },
+ "sarscov2 - stub - fastq": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test"
+ },
+ "test.kraken2.report_bracken.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "2": [
+ "versions.yml:md5,599fbbf4c1cd5851022a98788f2afdba"
+ ],
+ "reports": [
+ [
+ {
+ "id": "test"
+ },
+ "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "txt": [
+ [
+ {
+ "id": "test"
+ },
+ "test.kraken2.report_bracken.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,599fbbf4c1cd5851022a98788f2afdba"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.0",
+ "nextflow": "24.04.4"
+ },
+ "timestamp": "2024-08-06T11:35:42.965471207"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/bracken/bracken/tests/nextflow.config b/modules/nf-core/bracken/bracken/tests/nextflow.config
new file mode 100644
index 000000000..b550b7489
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+ withName: BRACKEN_BRACKEN {
+ ext.args = "-l S -t 10 -r 150"
+ }
+}
diff --git a/modules/nf-core/bracken/bracken/tests/tags.yml b/modules/nf-core/bracken/bracken/tests/tags.yml
new file mode 100644
index 000000000..6a2cbd384
--- /dev/null
+++ b/modules/nf-core/bracken/bracken/tests/tags.yml
@@ -0,0 +1,2 @@
+bracken/bracken:
+ - "modules/nf-core/bracken/bracken/**"
diff --git a/modules/nf-core/hisat2/align/hisat2-align.diff b/modules/nf-core/hisat2/align/hisat2-align.diff
new file mode 100644
index 000000000..2a64e4f4f
--- /dev/null
+++ b/modules/nf-core/hisat2/align/hisat2-align.diff
@@ -0,0 +1,57 @@
+Changes in module 'nf-core/hisat2/align'
+--- modules/nf-core/hisat2/align/main.nf
++++ modules/nf-core/hisat2/align/main.nf
+@@ -34,7 +34,7 @@
+ ss = "$splicesites" ? "--known-splicesite-infile $splicesites" : ''
+ def seq_center = params.seq_center ? "--rg-id ${prefix} --rg SM:$prefix --rg CN:${params.seq_center.replaceAll('\\s','_')}" : "--rg-id ${prefix} --rg SM:$prefix"
+ if (meta.single_end) {
+- def unaligned = params.save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
++ def unaligned = params.save_unaligned || params.contaminant_screening ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
+ """
+ INDEX=`find -L ./ -name "*.1.ht2" | sed 's/\\.1.ht2\$//'`
+ hisat2 \\
+@@ -56,7 +56,7 @@
+ END_VERSIONS
+ """
+ } else {
+- def unaligned = params.save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
++ def unaligned = params.save_unaligned || params.contaminant_screening ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
+ """
+ INDEX=`find -L ./ -name "*.1.ht2" | sed 's/\\.1.ht2\$//'`
+ hisat2 \\
+@@ -91,7 +91,8 @@
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+- def unaligned = params.save_unaligned ? "echo '' | gzip > ${prefix}.unmapped_1.fastq.gz \n echo '' | gzip > ${prefix}.unmapped_2.fastq.gz" : ''
++ def unaligned = params.save_unaligned || params.contaminant_screening ? "echo '' | gzip > ${prefix}.unmapped_1.fastq.gz \n echo '' | gzip > ${prefix}.unmapped_2.fastq.gz" : ''
++ def VERSION = '2.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+ """
+ ${unaligned}
+
+
+--- modules/nf-core/hisat2/align/tests/main.nf.test
++++ modules/nf-core/hisat2/align/tests/main.nf.test
+@@ -3,12 +3,6 @@
+ name "Test Process HISAT2_ALIGN"
+ script "../main.nf"
+ process "HISAT2_ALIGN"
+- tag "modules"
+- tag "modules_nfcore"
+- tag "hisat2"
+- tag "hisat2/align"
+- tag "hisat2/build"
+- tag "hisat2/extractsplicesites"
+
+ test("Single-End") {
+
+
+--- modules/nf-core/hisat2/align/tests/tags.yml
++++ /dev/null
+@@ -1,4 +0,0 @@
+-hisat2/align:
+- - modules/nf-core/hisat2/align/**
+- - modules/nf-core/hisat2/build/**
+- - modules/nf-core/hisat2/extractsplicesites/**
+
+************************************************************
diff --git a/modules/nf-core/hisat2/align/main.nf b/modules/nf-core/hisat2/align/main.nf
index f45f9bccb..464a28e2b 100644
--- a/modules/nf-core/hisat2/align/main.nf
+++ b/modules/nf-core/hisat2/align/main.nf
@@ -34,7 +34,7 @@ process HISAT2_ALIGN {
ss = "$splicesites" ? "--known-splicesite-infile $splicesites" : ''
def seq_center = params.seq_center ? "--rg-id ${prefix} --rg SM:$prefix --rg CN:${params.seq_center.replaceAll('\\s','_')}" : "--rg-id ${prefix} --rg SM:$prefix"
if (meta.single_end) {
- def unaligned = params.save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
+ def unaligned = params.save_unaligned || params.contaminant_screening ? "--un-gz ${prefix}.unmapped.fastq.gz" : ''
"""
INDEX=`find -L ./ -name "*.1.ht2" | sed 's/\\.1.ht2\$//'`
hisat2 \\
@@ -56,7 +56,7 @@ process HISAT2_ALIGN {
END_VERSIONS
"""
} else {
- def unaligned = params.save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
+ def unaligned = params.save_unaligned || params.contaminant_screening ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ''
"""
INDEX=`find -L ./ -name "*.1.ht2" | sed 's/\\.1.ht2\$//'`
hisat2 \\
@@ -91,7 +91,8 @@ process HISAT2_ALIGN {
stub:
def prefix = task.ext.prefix ?: "${meta.id}"
- def unaligned = params.save_unaligned ? "echo '' | gzip > ${prefix}.unmapped_1.fastq.gz \n echo '' | gzip > ${prefix}.unmapped_2.fastq.gz" : ''
+ def unaligned = params.save_unaligned || params.contaminant_screening ? "echo '' | gzip > ${prefix}.unmapped_1.fastq.gz \n echo '' | gzip > ${prefix}.unmapped_2.fastq.gz" : ''
+ def VERSION = '2.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
"""
${unaligned}
diff --git a/modules/nf-core/kraken2/kraken2/environment.yml b/modules/nf-core/kraken2/kraken2/environment.yml
new file mode 100644
index 000000000..0c067feeb
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/environment.yml
@@ -0,0 +1,9 @@
+name: kraken2_kraken2
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+dependencies:
+ - "bioconda::kraken2=2.1.3"
+ - "coreutils=9.4"
+ - "pigz=2.8"
diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf
new file mode 100644
index 000000000..364a6fe24
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/main.nf
@@ -0,0 +1,85 @@
+process KRAKEN2_KRAKEN2 {
+ tag "$meta.id"
+ label 'process_high'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-8706a1dd73c6cc426e12dd4dd33a5e917b3989ae:c8cbdc8ff4101e6745f8ede6eb5261ef98bdaff4-0' :
+ 'biocontainers/mulled-v2-8706a1dd73c6cc426e12dd4dd33a5e917b3989ae:c8cbdc8ff4101e6745f8ede6eb5261ef98bdaff4-0' }"
+
+ input:
+ tuple val(meta), path(reads)
+ path db
+ val save_output_fastqs
+ val save_reads_assignment
+
+ output:
+ tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq
+ tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq
+ tuple val(meta), path('*classifiedreads.txt') , optional:true, emit: classified_reads_assignment
+ tuple val(meta), path('*report.txt') , emit: report
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def paired = meta.single_end ? "" : "--paired"
+ def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq"
+ def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq"
+ def classified_option = save_output_fastqs ? "--classified-out ${classified}" : ""
+ def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : ""
+ def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null"
+ def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : ""
+
+ """
+ kraken2 \\
+ --db $db \\
+ --threads $task.cpus \\
+ --report ${prefix}.kraken2.report.txt \\
+ --gzip-compressed \\
+ $unclassified_option \\
+ $classified_option \\
+ $readclassification_option \\
+ $paired \\
+ $args \\
+ $reads
+
+ $compress_reads_command
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
+ pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+ END_VERSIONS
+ """
+
+ stub:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def paired = meta.single_end ? "" : "--paired"
+ def classified = meta.single_end ? "${prefix}.classified.fastq.gz" : "${prefix}.classified_1.fastq.gz ${prefix}.classified_2.fastq.gz"
+ def unclassified = meta.single_end ? "${prefix}.unclassified.fastq.gz" : "${prefix}.unclassified_1.fastq.gz ${prefix}.unclassified_2.fastq.gz"
+ def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null"
+ def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : ""
+
+ """
+ touch ${prefix}.kraken2.report.txt
+ if [ "$save_output_fastqs" == "true" ]; then
+ touch $classified
+ touch $unclassified
+ fi
+ if [ "$save_reads_assignment" == "true" ]; then
+ touch ${prefix}.kraken2.classifiedreads.txt
+ fi
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//')
+ pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+ END_VERSIONS
+ """
+
+}
diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml
new file mode 100644
index 000000000..7909ffe7e
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/meta.yml
@@ -0,0 +1,78 @@
+name: kraken2_kraken2
+description: Classifies metagenomic sequence data
+keywords:
+ - classify
+ - metagenomics
+ - fastq
+ - db
+tools:
+ - kraken2:
+ description: |
+ Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads
+ homepage: https://ccb.jhu.edu/software/kraken2/
+ documentation: https://github.com/DerrickWood/kraken2/wiki/Manual
+ doi: 10.1186/s13059-019-1891-0
+ licence: ["MIT"]
+input:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - reads:
+ type: file
+ description: |
+ List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+ respectively.
+ - db:
+ type: directory
+ description: Kraken2 database
+ - save_output_fastqs:
+ type: string
+ description: |
+ If true, optional commands are added to save classified and unclassified reads
+ as fastq files
+ - save_reads_assignment:
+ type: string
+ description: |
+ If true, an optional command is added to save a file reporting the taxonomic
+ classification of each input read
+output:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - classified_reads_fastq:
+ type: file
+ description: |
+ Reads classified as belonging to any of the taxa
+ on the Kraken2 database.
+ pattern: "*{fastq.gz}"
+ - unclassified_reads_fastq:
+ type: file
+ description: |
+ Reads not classified to any of the taxa
+ on the Kraken2 database.
+ pattern: "*{fastq.gz}"
+ - classified_reads_assignment:
+ type: file
+ description: |
+ Kraken2 output file indicating the taxonomic assignment of
+ each input read
+ - report:
+ type: file
+ description: |
+ Kraken2 report containing stats about classified
+ and not classified reads.
+ pattern: "*.{report.txt}"
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@joseespinosa"
+ - "@drpatelh"
+maintainers:
+ - "@joseespinosa"
+ - "@drpatelh"
diff --git a/modules/nf-core/kraken2/kraken2/nextflow.config b/modules/nf-core/kraken2/kraken2/nextflow.config
new file mode 100644
index 000000000..7289bd9a1
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/nextflow.config
@@ -0,0 +1,15 @@
+if (!params.skip_alignment && !params.skip_qc) {
+ if (params.contaminant_screening in ['kraken2', 'kraken2_bracken']) {
+ process {
+ withName: 'KRAKEN2' {
+ // --confidence 0.05: value chosen following https://doi.org/10.1080/19490976.2024.2323235
+ // --minimum-hit-groups 3: value chosen following https://doi.org/10.1038/s41596-022-00738-y
+ ext.args = "--confidence 0.05 --minimum-hit-groups 3"
+ publishDir = [
+ path: { "${params.outdir}/${params.aligner}/contaminants/kraken2/kraken_reports" },
+ mode: params.publish_dir_mode
+ ]
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/kraken2/kraken2/tests/main.nf.test b/modules/nf-core/kraken2/kraken2/tests/main.nf.test
new file mode 100644
index 000000000..c0843df29
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/tests/main.nf.test
@@ -0,0 +1,143 @@
+nextflow_process {
+ name "Test Process KRAKEN2_KRAKEN2"
+ script "../main.nf"
+ process "KRAKEN2_KRAKEN2"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "untar"
+ tag "kraken2"
+ tag "kraken2/kraken2"
+
+ setup {
+ run("UNTAR") {
+ script "modules/nf-core/untar/main.nf"
+ process {
+ """
+ input[0] = Channel.of([
+ [],
+ file(
+ params.modules_testdata_base_path + "genomics/sarscov2/genome/db/kraken2.tar.gz",
+ checkIfExists: true
+ )
+ ])
+ """
+ }
+ }
+ }
+
+ test("sarscov2 illumina single end [fastq]") {
+ when {
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:true ], // meta map
+ [ file(
+ params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz",
+ checkIfExists: true
+ )]
+ ]
+ input[1] = UNTAR.out.untar.map{ it[1] }
+ input[2] = true
+ input[3] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(
+ process.out.report,
+ process.out.versions,
+ ).match()
+ },
+ { assert process.out.classified_reads_fastq.get(0).get(1) ==~ ".*/test.classified.fastq.gz" },
+ { assert process.out.unclassified_reads_fastq.get(0).get(1) ==~ ".*/test.unclassified.fastq.gz" },
+ )
+ }
+ }
+
+ test("sarscov2 illumina paired end [fastq]") {
+ when {
+ params {
+ outdir = "$outputDir"
+ }
+
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:false ], // meta map
+ [
+ file(
+ params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz",
+ checkIfExists: true
+ ),
+ file(
+ params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz",
+ checkIfExists: true
+ )
+
+ ]
+ ]
+ input[1] = UNTAR.out.untar.map{ it[1] }
+ input[2] = true
+ input[3] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(
+ process.out.report,
+ process.out.versions,
+ ).match()
+ },
+ { assert process.out.classified_reads_fastq.get(0).get(1).get(0)
+ ==~ ".*/test.classified_1.fastq.gz" },
+ { assert process.out.classified_reads_fastq.get(0).get(1).get(1)
+ ==~ ".*/test.classified_2.fastq.gz" },
+ { assert process.out.unclassified_reads_fastq.get(0).get(1).get(0)
+ ==~ ".*/test.unclassified_1.fastq.gz" },
+ { assert process.out.unclassified_reads_fastq.get(0).get(1).get(1)
+ ==~ ".*/test.unclassified_2.fastq.gz" },
+ )
+ }
+ }
+
+ test("sarscov2 illumina single end [fastq] + save_reads_assignment") {
+ when {
+ params {
+ outdir = "$outputDir"
+ }
+
+ process {
+ """
+ input[0] = [
+ [ id:'test', single_end:true ], // meta map
+ [ file(
+ params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz",
+ checkIfExists: true
+ )]
+ ]
+ input[1] = UNTAR.out.untar.map{ it[1] }
+ input[2] = false
+ input[3] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(
+ process.out.report,
+ process.out.classified_reads_assignment,
+ process.out.versions,
+ ).match()
+ },
+ )
+ }
+ }
+}
diff --git a/modules/nf-core/kraken2/kraken2/tests/main.nf.test.snap b/modules/nf-core/kraken2/kraken2/tests/main.nf.test.snap
new file mode 100644
index 000000000..b432f878f
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/tests/main.nf.test.snap
@@ -0,0 +1,74 @@
+{
+ "sarscov2 illumina single end [fastq]": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.kraken2.report.txt:md5,4227755fe40478b8d7dc8634b489761e"
+ ]
+ ],
+ [
+ "versions.yml:md5,79adf2ca1cfc625cb77e391b27142c43"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.1"
+ },
+ "timestamp": "2024-04-04T18:47:03.745692"
+ },
+ "sarscov2 illumina paired end [fastq]": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": false
+ },
+ "test.kraken2.report.txt:md5,4227755fe40478b8d7dc8634b489761e"
+ ]
+ ],
+ [
+ "versions.yml:md5,79adf2ca1cfc625cb77e391b27142c43"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.1"
+ },
+ "timestamp": "2024-04-04T18:47:13.75649"
+ },
+ "sarscov2 illumina single end [fastq] + save_reads_assignment": {
+ "content": [
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.kraken2.report.txt:md5,4227755fe40478b8d7dc8634b489761e"
+ ]
+ ],
+ [
+ [
+ {
+ "id": "test",
+ "single_end": true
+ },
+ "test.kraken2.classifiedreads.txt:md5,e7a90531f0d8d777316515c36fe4cae0"
+ ]
+ ],
+ [
+ "versions.yml:md5,79adf2ca1cfc625cb77e391b27142c43"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.1"
+ },
+ "timestamp": "2024-04-04T18:47:22.459465"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/kraken2/kraken2/tests/tags.yml b/modules/nf-core/kraken2/kraken2/tests/tags.yml
new file mode 100644
index 000000000..9ebfd7ab6
--- /dev/null
+++ b/modules/nf-core/kraken2/kraken2/tests/tags.yml
@@ -0,0 +1,3 @@
+kraken2/kraken2:
+ - modules/nf-core/kraken2/kraken2/**
+ - modules/nf-core/untar/**
diff --git a/nextflow.config b/nextflow.config
index d5e00c6f4..94481109e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -89,6 +89,11 @@ params {
skip_preseq = true
skip_dupradar = false
skip_qualimap = false
+ contaminant_screening = null
+ kraken_db = null
+ save_kraken_assignments = false
+ save_kraken_unassigned = false
+ bracken_precision = "S"
skip_rseqc = false
skip_biotype_qc = false
skip_deseq2_qc = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index d89b6398e..e7cd6d468 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -518,6 +518,18 @@
"fa_icon": "fas fa-save",
"description": "Where possible, save unaligned reads from either STAR, HISAT2 or Salmon to the results directory.",
"help_text": "This may either be in the form of FastQ or BAM files depending on the options available for that particular tool."
+ },
+ "save_kraken_assignments": {
+ "type": "boolean",
+ "fa_icon": "fas fa-save",
+ "description": "Save read-by-read assignments from Kraken2.",
+ "help_text": "`--kraken_db` parameter must be provided."
+ },
+ "save_kraken_unassigned": {
+ "type": "boolean",
+ "fa_icon": "fas fa-save",
+ "description": "Save reads that were not given an assignment by Kraken2.",
+ "help_text": "`--kraken_db` parameter must be provided."
}
}
},
@@ -539,6 +551,26 @@
"default": "bam_stat,inner_distance,infer_experiment,junction_annotation,junction_saturation,read_distribution,read_duplication",
"fa_icon": "fas fa-chart-pie",
"description": "Specify the RSeQC modules to run."
+ },
+ "contaminant_screening": {
+ "type": "string",
+ "description": "Tool to use for detecting contaminants in unaligned reads - available options are 'kraken2' and 'kraken2_bracken'",
+ "fa_icon": "fas fa-virus-slash",
+ "enum": ["kraken2", "kraken2_bracken"]
+ },
+ "kraken_db": {
+ "type": "string",
+ "description": "Database when using Kraken2/Bracken for contaminant screening.",
+ "help_text": "See the usage tab for more information",
+ "fa_icon": "fas fa-fish"
+ },
+ "bracken_precision": {
+ "type": "string",
+ "default": "S",
+ "fa_icon": "fas fa-tree",
+ "description": "Taxonomic level for Bracken abundance estimations.",
+ "help_text": "First letter of Domain / Phylum / Class / Order / Family / Genus / Species",
+ "enum": ["D", "P", "C", "O", "F", "G", "S"]
}
}
},
diff --git a/subworkflows/local/align_star/nextflow.config b/subworkflows/local/align_star/nextflow.config
index 119eb6e4a..511ff6e63 100644
--- a/subworkflows/local/align_star/nextflow.config
+++ b/subworkflows/local/align_star/nextflow.config
@@ -22,7 +22,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
--outSAMattributes NH HI AS NM MD
--quantTranscriptomeBan Singleend
--outSAMstrandField intronMotif
- ${params.save_unaligned ? '--outReadsUnmapped Fastx' : ''}
+ ${params.save_unaligned || params.contaminant_screening ? '--outReadsUnmapped Fastx' : ''}
""".trim())
// Consolidate the extra arguments
diff --git a/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf b/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf
index b2b890479..5d55ce0a8 100644
--- a/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf
@@ -256,6 +256,29 @@ def validateInputParameters() {
}
}
+ // Contaminant screening runs on the aligner's unaligned reads, which star_rsem does not save
+ if (params.contaminant_screening) {
+ if (params.aligner == 'star_rsem') {
+ error("Contaminant screening cannot be done with --aligner star_rsem since unaligned reads are not saved. Please use --aligner star_salmon or --aligner hisat2.")
+ }
+ }
+
+ // Kraken2 (with or without Bracken) cannot run without a database
+ if (params.contaminant_screening in ['kraken2', 'kraken2_bracken']) {
+ if (!params.kraken_db) {
+ error("Contaminant screening set to ${params.contaminant_screening} but no database is provided. Please provide a database with the --kraken_db option.")
+ }
+ } else {
+ // Kraken2 is not being run: warn about Kraken2/Bracken parameters that will be ignored
+ if (params.bracken_precision != 'S') { // null-safe, unlike .equals() on an unset value
+ brackenPrecisionWithoutKrakenDBWarn()
+ }
+
+ if (params.save_kraken_assignments || params.save_kraken_unassigned || params.kraken_db) {
+ krakenArgumentsWithoutKrakenDBWarn()
+ }
+ }
+
// Check which RSeQC modules we are running
def valid_rseqc_modules = ['bam_stat', 'inner_distance', 'infer_experiment', 'junction_annotation', 'junction_saturation', 'read_distribution', 'read_duplication', 'tin']
def rseqc_modules = params.rseqc_modules ? params.rseqc_modules.split(',').collect{ it.trim().toLowerCase() } : []
@@ -471,6 +494,26 @@ def additionaFastaIndexWarn(index) {
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
}
+//
+// Print a warning if --kraken_db, --save_kraken_assignments or --save_kraken_unassigned is provided while Kraken2 contaminant screening is not enabled
+//
+def krakenArgumentsWithoutKrakenDBWarn() {
+ log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
+ " Kraken2 related arguments have been provided without setting contaminant\n" +
+ " screening to Kraken2. Kraken2 is not being run so these will not be used.\n" +
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+}
+
+//
+// Print a warning if a non-default --bracken_precision is provided while Kraken2/Bracken contaminant screening is not enabled
+//
+def brackenPrecisionWithoutKrakenDBWarn() {
+ log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
+ " '--bracken_precision' parameter has been provided without Kraken2 contaminant screening.\n" +
+ " Bracken will not run so precision will not be set.\n" +
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+}
+
//
// Function to generate an error if contigs in genome fasta file > 512 Mbp
//
diff --git a/workflows/rnaseq/assets/multiqc/multiqc_config.yml b/workflows/rnaseq/assets/multiqc/multiqc_config.yml
index ff154e1cb..63fad09f1 100644
--- a/workflows/rnaseq/assets/multiqc/multiqc_config.yml
+++ b/workflows/rnaseq/assets/multiqc/multiqc_config.yml
@@ -93,6 +93,8 @@ run_modules:
- preseq
- rseqc
- qualimap
+ - kraken
+ - bracken
# Order of modules
@@ -156,6 +158,12 @@ sp:
samtools/idxstats:
fn: "*.idxstats*"
+ kraken:
+ fn: "*report.txt"
+
+ bracken:
+ fn: "*.kraken2.report_bracken.txt"
+
rseqc/bam_stat:
fn: "*.bam_stat.txt"
rseqc/gene_body_coverage:
diff --git a/workflows/rnaseq/main.nf b/workflows/rnaseq/main.nf
index 2ee8384f4..8a85cf845 100755
--- a/workflows/rnaseq/main.nf
+++ b/workflows/rnaseq/main.nf
@@ -33,13 +33,15 @@ include { methodsDescriptionText } from '../../subworkflows/local/utils_
//
// MODULE: Installed directly from nf-core/modules
//
-include { DUPRADAR } from '../../modules/nf-core/dupradar'
-include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort'
-include { PRESEQ_LCEXTRAP } from '../../modules/nf-core/preseq/lcextrap'
-include { QUALIMAP_RNASEQ } from '../../modules/nf-core/qualimap/rnaseq'
-include { STRINGTIE_STRINGTIE } from '../../modules/nf-core/stringtie/stringtie'
-include { SUBREAD_FEATURECOUNTS } from '../../modules/nf-core/subread/featurecounts'
-include { MULTIQC } from '../../modules/nf-core/multiqc'
+include { DUPRADAR } from '../../modules/nf-core/dupradar'
+include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort'
+include { PRESEQ_LCEXTRAP } from '../../modules/nf-core/preseq/lcextrap'
+include { QUALIMAP_RNASEQ } from '../../modules/nf-core/qualimap/rnaseq'
+include { STRINGTIE_STRINGTIE } from '../../modules/nf-core/stringtie/stringtie'
+include { SUBREAD_FEATURECOUNTS } from '../../modules/nf-core/subread/featurecounts'
+include { KRAKEN2_KRAKEN2 as KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main'
+include { BRACKEN_BRACKEN as BRACKEN } from '../../modules/nf-core/bracken/bracken/main'
+include { MULTIQC } from '../../modules/nf-core/multiqc'
include { UMITOOLS_PREPAREFORRSEM as UMITOOLS_PREPAREFORSALMON } from '../../modules/nf-core/umitools/prepareforrsem'
include { BEDTOOLS_GENOMECOV as BEDTOOLS_GENOMECOV_FW } from '../../modules/nf-core/bedtools/genomecov'
include { BEDTOOLS_GENOMECOV as BEDTOOLS_GENOMECOV_REV } from '../../modules/nf-core/bedtools/genomecov'
@@ -171,9 +173,10 @@ workflow RNASEQ {
//
// SUBWORKFLOW: Alignment with STAR and gene/transcript quantification with Salmon
//
- ch_genome_bam = Channel.empty()
- ch_genome_bam_index = Channel.empty()
- ch_star_log = Channel.empty()
+ ch_genome_bam = Channel.empty()
+ ch_genome_bam_index = Channel.empty()
+ ch_star_log = Channel.empty()
+ ch_unaligned_sequences = Channel.empty()
if (!params.skip_alignment && params.aligner == 'star_salmon') {
// Check if an AWS iGenome has been provided to use the appropriate version of STAR
def is_aws_igenome = false
@@ -193,10 +196,11 @@ workflow RNASEQ {
is_aws_igenome,
ch_fasta.map { [ [:], it ] }
)
- ch_genome_bam = ALIGN_STAR.out.bam
- ch_genome_bam_index = ALIGN_STAR.out.bai
- ch_transcriptome_bam = ALIGN_STAR.out.bam_transcript
- ch_star_log = ALIGN_STAR.out.log_final
+ ch_genome_bam = ALIGN_STAR.out.bam
+ ch_genome_bam_index = ALIGN_STAR.out.bai
+ ch_transcriptome_bam = ALIGN_STAR.out.bam_transcript
+ ch_star_log = ALIGN_STAR.out.log_final
+ ch_unaligned_sequences = ALIGN_STAR.out.fastq
ch_multiqc_files = ch_multiqc_files.mix(ALIGN_STAR.out.stats.collect{it[1]})
ch_multiqc_files = ch_multiqc_files.mix(ALIGN_STAR.out.flagstat.collect{it[1]})
ch_multiqc_files = ch_multiqc_files.mix(ALIGN_STAR.out.idxstats.collect{it[1]})
@@ -350,8 +354,9 @@ workflow RNASEQ {
ch_splicesites.map { [ [:], it ] },
ch_fasta.map { [ [:], it ] }
)
- ch_genome_bam = FASTQ_ALIGN_HISAT2.out.bam
- ch_genome_bam_index = FASTQ_ALIGN_HISAT2.out.bai
+ ch_genome_bam = FASTQ_ALIGN_HISAT2.out.bam
+ ch_genome_bam_index = FASTQ_ALIGN_HISAT2.out.bai
+ ch_unaligned_sequences = FASTQ_ALIGN_HISAT2.out.fastq
ch_multiqc_files = ch_multiqc_files.mix(FASTQ_ALIGN_HISAT2.out.stats.collect{it[1]})
ch_multiqc_files = ch_multiqc_files.mix(FASTQ_ALIGN_HISAT2.out.flagstat.collect{it[1]})
ch_multiqc_files = ch_multiqc_files.mix(FASTQ_ALIGN_HISAT2.out.idxstats.collect{it[1]})
@@ -647,6 +652,28 @@ workflow RNASEQ {
ch_multiqc_files = ch_multiqc_files.mix(ch_fail_strand_multiqc.collectFile(name: 'fail_strand_check_mqc.tsv'))
}
+
+ if (params.contaminant_screening in ['kraken2', 'kraken2_bracken'] ) { // screen the aligner's unaligned reads for contaminants
+ KRAKEN2 (
+ ch_unaligned_sequences,
+ params.kraken_db,
+ params.save_kraken_unassigned, // 3rd input = save_output_fastqs: keep classified/unclassified reads as FastQ
+ params.save_kraken_assignments // 4th input = save_reads_assignment: keep the per-read taxonomic assignment table
+ )
+ ch_kraken_reports = KRAKEN2.out.report
+ ch_versions = ch_versions.mix(KRAKEN2.out.versions)
+
+ if (params.contaminant_screening == 'kraken2') { // Kraken2 only: its report goes straight to MultiQC
+ ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2.out.report.collect{it[1]})
+ } else if (params.contaminant_screening == 'kraken2_bracken') { // Bracken consumes the Kraken2 report; only the Bracken output is sent to MultiQC
+ BRACKEN (
+ ch_kraken_reports,
+ params.kraken_db
+ )
+ ch_versions = ch_versions.mix(BRACKEN.out.versions)
+ ch_multiqc_files = ch_multiqc_files.mix(BRACKEN.out.txt.collect{it[1]})
+ }
+ }
}
//
diff --git a/workflows/rnaseq/nextflow.config b/workflows/rnaseq/nextflow.config
index c7eeac733..9cbf0cd30 100644
--- a/workflows/rnaseq/nextflow.config
+++ b/workflows/rnaseq/nextflow.config
@@ -8,6 +8,8 @@ includeConfig "../../modules/nf-core/qualimap/rnaseq/nextflow.config"
includeConfig "../../modules/nf-core/sortmerna/nextflow.config"
includeConfig "../../modules/nf-core/stringtie/stringtie/nextflow.config"
includeConfig "../../modules/nf-core/subread/featurecounts/nextflow.config"
+includeConfig "../../modules/nf-core/kraken2/kraken2/nextflow.config"
+includeConfig "../../modules/nf-core/bracken/bracken/nextflow.config"
includeConfig "../../subworkflows/local/align_star/nextflow.config"
includeConfig "../../subworkflows/local/quantify_rsem/nextflow.config"
includeConfig "../../subworkflows/nf-core/quantify_pseudo_alignment/nextflow.config"