Merge pull request #406 from nf-core/krona-ktimporttext-input-collision-fix

Disambiguate same-db name but bracken step kraken from profile mergin…
nf-core · Oct 26, 2023 · 267e1a1 · 267e1a1
2 parents f18df85 + da8bf5a
commit 267e1a1
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Fixed`
 
 - [#405] Fix database to tool mismatching in KAIJU2KRONA input (❤️ to @MajoroMask for reporting, fix by @jfy133)
+- [#406] Fixed overwriting of bracken-derived kraken2 outputs when the database name is shared between Bracken/Kraken2. (❤️ to @MajoroMask for reporting, fix by @jfy133)
 
 ### `Dependencies`
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -485,7 +485,7 @@ process {
     }
 
     withName: KRAKENTOOLS_COMBINEKREPORTS_KRAKEN {
-        ext.prefix = { "kraken2_${meta.id}_combined_reports" }
+        ext.prefix = { "kraken2_${meta.db_name}_combined_reports" }
         publishDir = [
             path: { "${params.outdir}/kraken2/" },
             mode: params.publish_dir_mode,

diff --git a/docs/output.md b/docs/output.md
@@ -360,6 +360,7 @@ The main taxonomic profiling file from Bracken is the `*.tsv` file. This provide
 
 - `kraken2/`
   - `<db_name>_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `krakentools`)
+    - If you have also run Bracken, the original Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2-<mydatabase>-bracken.tsv`. However, in most cases you want to use the actual Bracken file (i.e., `bracken_<mydatabase>.tsv`).
   - `<db_name>/`
     - `<sample_id>_<db_name>.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample
     - `<sample_id>_<db_name>.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample
@@ -582,6 +583,7 @@ The resulting HTML files can be loaded into your web browser for exploration. Ea
   - `<tool>_<database>*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The standard format is the `tsv`.
     - The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample.
     - Note that the file naming scheme will apply regardless of whether `TAXPASTA_MERGE` (multiple sample run) or `TAXPASTA_STANDARDISE` (single sample run) are executed.
+    - If you have also run Bracken, the initial Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2-<mydatabase>-bracken.tsv`. However, in most cases you want to use the actual Bracken file (i.e., `bracken_<mydatabase>.tsv`).
 
   </details>
 

diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf
@@ -172,9 +172,13 @@ workflow PROFILING {
         ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment )
         ch_raw_profiles        = ch_raw_profiles.mix(
             KRAKEN2_KRAKEN2.out.report
-                // Set the tool to be strictly 'kraken2' instead of potentially 'bracken' for downstream use.
-                // Will remain distinct from 'pure' Kraken2 results due to distinct database names in file names.
-                .map { meta, report -> [meta + [tool: 'kraken2'], report]}
+                // Rename tool in the meta for the for-bracken files to disambiguate from only-kraken2 results in downstream steps.
+                // Note: may need to rename back to just 'bracken' in those downstream steps, depending on context.
+                .map {
+                    meta, report ->
+                        def new_tool = 
+                    [meta + [tool: meta.tool == 'bracken' ? 'kraken2-bracken' : meta.tool], report]
+                }
         )
 
     }

diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf
@@ -52,12 +52,19 @@ workflow STANDARDISATION_PROFILES {
                             .map {
                                     meta, profile ->
                                         def meta_new = [:]
-                                        meta_new.id = meta.db_name
                                         meta_new.tool = meta.tool == 'malt' ? 'megan6' : meta.tool
+                                        meta_new.db_name = meta.db_name
                                         [meta_new, profile]
                             }
                             .groupTuple ()
-                            .map { [ it[0], it[1].flatten() ] }
+                            .map {
+                                meta, profiles ->
+                                    meta = meta + [
+                                        tool: meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool, // replace to get the right output-format description
+                                        id: meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
+                                    ]
+                                [ meta, profiles.flatten() ]
+                            }
 
     ch_taxpasta_tax_dir = params.taxpasta_taxonomy_dir ? Channel.fromPath(params.taxpasta_taxonomy_dir, checkIfExists: true).collect() : []
 
@@ -85,7 +92,7 @@ workflow STANDARDISATION_PROFILES {
             centrifuge: it[0]['tool'] == 'centrifuge'
             ganon: it[0]['tool'] == 'ganon'
             kmcp: it [0]['tool'] == 'kmcp'
-            kraken2: it[0]['tool'] == 'kraken2'
+            kraken2: it[0]['tool'] == 'kraken2' || it[0]['tool'] == 'kraken2-bracken'
             metaphlan: it[0]['tool'] == 'metaphlan'
             motus: it[0]['tool'] == 'motus'
             unknown: true
@@ -158,11 +165,15 @@ workflow STANDARDISATION_PROFILES {
     // Have to sort by size to ensure first file actually has hits otherwise
     // the script fails
     ch_profiles_for_kraken2 = ch_input_profiles.kraken2
-                                .map { [it[0]['db_name'], it[1]] }
-                                .groupTuple(sort: {-it.size()} )
                                 .map {
-                                    [[id:it[0]], it[1]]
+                                    meta, profiles ->
+                                        def new_meta = [:]
+                                        new_meta.tool = meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool // replace to get the right output-format description
+                                        new_meta.id = meta.tool // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
+                                        new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken
+                                    [ new_meta, profiles ]
                                 }
+                                .groupTuple(sort: {-it.size()})
 
     KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 )
     ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt )

diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf
@@ -27,7 +27,7 @@ workflow VISUALIZATION_KRONA {
     ch_input_profiles = profiles
         .branch {
             centrifuge: it[0]['tool'] == 'centrifuge'
-            kraken2: it[0]['tool'] == 'kraken2'
+            kraken2: it[0]['tool'] == 'kraken2' || it[0]['tool'] == 'kraken2-bracken'
             unknown: true
         }
     ch_input_classifications = classifications
@@ -41,7 +41,11 @@ workflow VISUALIZATION_KRONA {
         Convert Kraken2 formatted reports into Krona text files
     */
     ch_kraken_reports = ch_input_profiles.kraken2
-        .mix( ch_input_profiles.centrifuge )
+            .map {
+                meta, report ->
+                [meta +  [tool: meta.tool == 'bracken' ? 'kraken2-bracken' : meta.tool], report]
+            }
+            .mix( ch_input_profiles.centrifuge )
     KRAKENTOOLS_KREPORT2KRONA ( ch_kraken_reports )
     ch_krona_text = ch_krona_text.mix( KRAKENTOOLS_KREPORT2KRONA.out.txt )
     ch_versions = ch_versions.mix( KRAKENTOOLS_KREPORT2KRONA.out.versions.first() )