From 320cb64f1e824d5a7672fc513e13935af3ec9075 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 28 Mar 2024 04:12:28 +0000 Subject: [PATCH 01/40] feat: flag for busco --- nextflow.config | 4 ++++ workflows/phoenix.nf | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 20e2a82f..687f8749 100755 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,10 @@ params { busco_db_path = null coverage = 30 // can only increase above 30 + // Run flags + run_busco = false + + // Additional input parameters for -entry SCAFFOLDS and CDC_SCAFFOLDS indir = null scaffolds_ext = '.scaffolds.fa.gz' diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 42fee2c8..0815080d 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -121,7 +121,7 @@ workflow PHOENIX_EXTERNAL { //fairy compressed file corruption check & generate read stats CORRUPTION_CHECK ( - INPUT_CHECK.out.reads, false // true says busco is being run in this workflow + INPUT_CHECK.out.reads, params.run_busco ) ch_versions = ch_versions.mix(CORRUPTION_CHECK.out.versions) From f0876a8857efee7c05eecefad91cc3ea43ef6bf2 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 28 Mar 2024 21:05:15 +0000 Subject: [PATCH 02/40] refactor: change filtering for failed samples #147 --- conf/modules.config | 2 +- modules/local/fairy_corruption_check.nf | 5 ++--- workflows/phoenix.nf | 5 +++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 1e91a084..42764e8d 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,7 +43,7 @@ process { [ path: { "${params.outdir}/${meta.id}/file_integrity" }, mode: 'copy', - pattern: "*{_summary.txt}" + pattern: "*{_summary_fairy.txt}" ] ] } diff --git a/modules/local/fairy_corruption_check.nf b/modules/local/fairy_corruption_check.nf index b09c675b..8e0ebb6b 100644 --- a/modules/local/fairy_corruption_check.nf +++ b/modules/local/fairy_corruption_check.nf @@ -9,8 +9,7 @@ process CORRUPTION_CHECK { val(busco_val) output: - tuple val(meta), path('*_summary.txt'), emit: outcome - tuple val(meta), path('*_summary_old.txt'), emit: outcome_to_edit + tuple val(meta), path('*_summary_fairy.txt'), emit: outcome path('*_summaryline.tsv'), optional:true, emit: summary_line tuple val(meta), path('*.synopsis'), optional:true, emit: synopsis path("versions.yml"), emit: versions @@ -37,7 +36,7 @@ process CORRUPTION_CHECK { script_version=\$(${ica}fairy_proc.sh -V) #making a copy of the summary file to pass to READ_COUNT_CHECKS to handle file names being the same - cp ${prefix}_summary.txt ${prefix}_summary_old.txt + mv ${prefix}_summary.txt ${prefix}_summary_fairy.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 0815080d..3bfbefff 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -127,8 +127,9 @@ workflow PHOENIX_EXTERNAL { //Combining reads with output of corruption check. 
By=2 is for getting R1 and R2 results
     //The mapping here is just to get things in the right bracket so we can call var[0]
-    read_stats_ch = INPUT_CHECK.out.reads.join(CORRUPTION_CHECK.out.outcome_to_edit, by: [0,0])
-        .join(CORRUPTION_CHECK.out.outcome.splitCsv(strip:true, by:2).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0]]]}, by: [0,0])
+    read_stats_ch = INPUT_CHECK.out.reads.join(CORRUPTION_CHECK.out.outcome, by: [0,0])
+        .join(CORRUPTION_CHECK.out.outcome.splitCsv(strip:true, by:2).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0]]]}, by: [0,0])
+        .filter { it[3].findAll {!it.contains('FAILED')}}

     //Get stats on raw reads if the reads aren't corrupted
     GET_RAW_STATS (

From 4e8df4c7a44d09e6567ca44154b1890da0abc0b4 Mon Sep 17 00:00:00 2001
From: slsevilla
Date: Sun, 31 Mar 2024 05:18:35 +0000
Subject: [PATCH 03/40] refactor: ica handling #149

---
 main.nf                            | 1 +
 modules/local/samplesheet_check.nf | 9 +++------
 nextflow.config                    | 1 +
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index 50b9daef..daf9b032 100755
--- a/main.nf
+++ b/main.nf
@@ -49,6 +49,7 @@ workflow PHOENIX {
     //input on command line
     if (params.input) { ch_input = file(params.input) } else { exit 1, 'For -entry PHOENIX: Input samplesheet not specified!' }
     ch_versions = Channel.empty() // Used to collect the software versions
+    if (params.ica != true && params.ica != false) {exit 1, "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods."}

     main:
         PHOENIX_EXTERNAL ( ch_input, ch_versions, true )

diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
index 952934ae..bc25439b 100644
--- a/modules/local/samplesheet_check.nf
+++ b/modules/local/samplesheet_check.nf
@@ -12,22 +12,19 @@ process SAMPLESHEET_CHECK {
     path("versions.yml"), emit: versions

     script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/
-    // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory.
-    if (params.ica==false) { ica = "" }
-    else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" }
-    else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." }
     // define variables
     def container_version = "base_v2.1.0"
     def container = task.container.toString() - "quay.io/jvhagey/phoenix@"
+    def script = params.ica ? "python ${params.ica_path}/check_samplesheet.py" : "check_samplesheet.py"
     """
-    ${ica}check_samplesheet.py \\
+    ${script} \\
        $samplesheet \\
        samplesheet.valid.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        python: \$(python --version | sed 's/Python //g')
-        check_samplesheet.py: \$(${ica}check_samplesheet.py --version )
+        check_samplesheet.py: \$(${script} --version )
        phoenix_base_container_tag: ${container_version}
        phoenix_base_container: ${container}
    END_VERSIONS

diff --git a/nextflow.config b/nextflow.config
index 687f8749..4c935260 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -41,6 +41,7 @@ params {
     // Terra and ICA specific options
     terra = false
     ica = false
+    ica_path = "${launchDir}/bin"

     // Database paths
     //path2db = "${baseDir}/assets/databases" // need this for kraken2db if you place those files in the assets folder

From 5ad96bafddabebb5c756d37a643d2492b3a1cb7d Mon Sep 17 00:00:00 2001
From: slsevilla
Date: Sun, 31 Mar 2024 15:29:14 +0000
Subject: [PATCH 04/40] refactor: updating new ica handling #149

---
 modules/local/fairy_corruption_check.nf | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/modules/local/fairy_corruption_check.nf b/modules/local/fairy_corruption_check.nf
index 8e0ebb6b..104c9c6b 100644
--- a/modules/local/fairy_corruption_check.nf
+++ b/modules/local/fairy_corruption_check.nf
@@ -15,10 +15,6 @@ process CORRUPTION_CHECK {
     path("versions.yml"), emit: versions

    script:
-    // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory.
-    if (params.ica==false) { ica = "" }
-    else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" }
-    else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." }
    // define variables
    def prefix = task.ext.prefix ?: "${meta.id}"
    def num1 = "${reads[0]}".minus(".fastq.gz")
    def num2 = "${reads[1]}".minus(".fastq.gz")
    def busco_parameter = busco_val ? "-b" : ""
    def container_version = "base_v2.1.0"
    def container = task.container.toString() - "quay.io/jvhagey/phoenix@"
+    def script = params.ica ? "bash ${params.ica_path}/fairy_proc.sh" : "fairy_proc.sh"
    """
    #set +e
    #check for file integrity and log errors
    #if there is a corruption problem the script will create a *_summaryline.tsv and *.synopsis file for the sample.
- ${ica}fairy_proc.sh -r ${reads[0]} -p ${prefix} ${busco_parameter} - ${ica}fairy_proc.sh -r ${reads[1]} -p ${prefix} ${busco_parameter} + ${script} -r ${reads[0]} -p ${prefix} ${busco_parameter} + ${script} -r ${reads[1]} -p ${prefix} ${busco_parameter} - script_version=\$(${ica}fairy_proc.sh -V) + script_version=\$(${script} -V) #making a copy of the summary file to pass to READ_COUNT_CHECKS to handle file names being the same mv ${prefix}_summary.txt ${prefix}_summary_fairy.txt From f131ab602ecc1d760b4a96c5c2060009fb95a303 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 15:32:17 +0000 Subject: [PATCH 05/40] refactor: handling errors, ica handling, filtering #147 #149 --- conf/modules.config | 2 +- modules/local/get_raw_stats.nf | 40 +++++++++++++++------------------- workflows/phoenix.nf | 40 +++++++++++++++++++--------------- 3 files changed, 40 insertions(+), 42 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 42764e8d..62982ad7 100755 --- a/conf/modules.config +++ b/conf/modules.config @@ -67,7 +67,7 @@ process { [ path: { "${params.outdir}/${meta.id}/file_integrity" }, mode: 'copy', - pattern: "*{_summary.txt}" + pattern: "*{_summary_rawstats.txt}" ] ] } diff --git a/modules/local/get_raw_stats.nf b/modules/local/get_raw_stats.nf index aa3add05..24f1ceef 100644 --- a/modules/local/get_raw_stats.nf +++ b/modules/local/get_raw_stats.nf @@ -11,46 +11,40 @@ process GET_RAW_STATS { output: tuple val(meta), path('*_stats.txt'), emit: raw_stats tuple val(meta), path('*_raw_read_counts.txt'), emit: combined_raw_stats - tuple val(meta), path('*_summary.txt'), emit: outcome - path('*_summaryline.tsv'), optional:true, emit: summary_line - tuple val(meta), path('*_summary_old_2.txt'), emit: outcome_to_edit - tuple val(meta), path('*.synopsis'), optional:true, emit: synopsis + tuple val(meta), path('*_summary_rawstats.txt'), emit: outcome + path('*_summaryline.tsv'), optional:true, emit: summary_line + tuple val(meta), path('*.synopsis'), optional:true, emit: synopsis path("versions.yml"), emit: versions - when: - //if the files are not corrupt then get the read stats - "${fairy_corrupt_outcome[0]}" == "PASSED: File ${meta.id}_R1 is not corrupt." && "${fairy_corrupt_outcome[1]}" == "PASSED: File ${meta.id}_R2 is not corrupt." - script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def busco_parameter = busco_val ? "--busco" : "" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" - def path_to_bin = "${workflow.launchDir}/bin/" + def script_q30 = params.ica ? "python ${params.ica_path}/q30.py" : "q30.py" + def script_stats = params.ica ? "python ${params.ica_path}/create_raw_stats_output.py" : "create_raw_stats_output.py" + def script_fairy = params.ica ? 
"python ${params.ica_path}/fairy.py" : "fairy.py" """ - ${ica}q30.py -i ${reads[0]} > ${prefix}_R1_stats.txt - ${ica}q30.py -i ${reads[1]} > ${prefix}_R2_stats.txt - ${ica}create_raw_stats_output.py -n ${prefix} -r1 ${prefix}_R1_stats.txt -r2 ${prefix}_R2_stats.txt + ${script_q30} -i ${reads[0]} > ${prefix}_R1_stats.txt + ${script_q30} -i ${reads[1]} > ${prefix}_R2_stats.txt + ${script_stats} -n ${prefix} -r1 ${prefix}_R1_stats.txt -r2 ${prefix}_R2_stats.txt - ## checking that read counts match before moving on + # making a copy of the summary file - this avoids writing to the previous file + cp ${fairy_outcome} ${prefix}_input.txt # Output check for messages indicating read pairs that do not match - ${ica}fairy.py -r ${prefix}_raw_read_counts.txt -f ${fairy_outcome} ${busco_parameter} + ${script_fairy} -r ${prefix}_raw_read_counts.txt -f ${prefix}_input.txt ${busco_parameter} - #making a copy of the summary file to pass to BBMAP_REFORMAT to handle file names being the same - cp ${prefix}_summary.txt ${prefix}_summary_old_2.txt + # rename output file + mv ${prefix}_summary.txt ${prefix}_summary_rawstats.txt cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - q30.py: \$(${ica}q30.py --version ) - create_raw_stats_output.py: \$(${ica}create_raw_stats_output.py --version ) - fairy.py: \$(${ica}fairy.py --version ) + q30.py: \$(${script_q30} --version ) + create_raw_stats_output.py: \$(${script_stats} --version ) + fairy.py: \$(${script_fairy} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 3bfbefff..4869176f 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -113,7 +113,7 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - //unzip any zipped databases + // unzip any zipped databases ASSET_CHECK ( params.zipped_sketch, params.custom_mlstdb, kraken2_db_path ) @@ -127,13 +127,15 @@ workflow PHOENIX_EXTERNAL { //Combining reads with output of corruption check. 
By=2 is for getting R1 and R2 results //The mapping here is just to get things in the right bracket so we can call var[0] - read_stats_ch = INPUT_CHECK.out.reads.join(CORRUPTION_CHECK.out.outcome, by: [0,0]) - .join(CORRUPTION_CHECK.out.outcome.splitCsv(strip:true, by:2).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0]]]}, by: [0,0]) + read_stats_ch = INPUT_CHECK.out.reads + .join(CORRUPTION_CHECK.out.outcome, by: [0,0]) + .join(CORRUPTION_CHECK.out.outcome.splitCsv(strip:true, by:2) + .map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0]]]}, by: [0,0]) .filter { it[3].findAll {!it.contains('FAILED')}} - //Get stats on raw reads if the reads aren't corrupted + // Get stats on raw reads if the reads aren't corrupted GET_RAW_STATS ( - read_stats_ch, false // false says no busco is being run + read_stats_ch, params.run_busco // false says no busco is being run ) ch_versions = ch_versions.mix(GET_RAW_STATS.out.versions) @@ -475,19 +477,21 @@ workflow PHOENIX_EXTERNAL { ch_versions = ch_versions.mix(MULTIQC.out.versions) emit: - scaffolds = BBMAP_REFORMAT.out.filtered_scaffolds - trimmed_reads = FASTP_TRIMD.out.reads - mlst = DO_MLST.out.checked_MLSTs - amrfinder_output = AMRFINDERPLUS_RUN.out.report - gamma_ar = GAMMA_AR.out.gamma - phx_summary = GATHER_SUMMARY_LINES.out.summary_report - //output for phylophoenix - griphin_tsv = GRIPHIN.out.griphin_report - griphin_excel = GRIPHIN.out.griphin_tsv_report - dir_samplesheet = GRIPHIN.out.converted_samplesheet - //output for ncbi upload - ncbi_sra_sheet = params.create_ncbi_sheet ? CREATE_NCBI_UPLOAD_SHEET.out.ncbi_sra : null - ncbi_biosample_sheet = params.create_ncbi_sheet ? CREATE_NCBI_UPLOAD_SHEET.out.ncbi_biosample : null + check = INPUT_CHECK.out.versions + // outcome = GET_RAW_STATS.out.outcome + // scaffolds = BBMAP_REFORMAT.out.filtered_scaffolds + // trimmed_reads = FASTP_TRIMD.out.reads + // mlst = DO_MLST.out.checked_MLSTs + // amrfinder_output = AMRFINDERPLUS_RUN.out.report + // gamma_ar = GAMMA_AR.out.gamma + // phx_summary = GATHER_SUMMARY_LINES.out.summary_report + // //output for phylophoenix + // griphin_tsv = GRIPHIN.out.griphin_report + // griphin_excel = GRIPHIN.out.griphin_tsv_report + // dir_samplesheet = GRIPHIN.out.converted_samplesheet + // //output for ncbi upload + // ncbi_sra_sheet = params.create_ncbi_sheet ? CREATE_NCBI_UPLOAD_SHEET.out.ncbi_sra : null + // ncbi_biosample_sheet = params.create_ncbi_sheet ? 
CREATE_NCBI_UPLOAD_SHEET.out.ncbi_biosample : null } /* From 401ec10db6bd1686258592b5b8e1824bc859a314 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 15:33:42 +0000 Subject: [PATCH 06/40] refactor: check ica param is as expected 149 --- main.nf | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/main.nf b/main.nf index daf9b032..b78411f8 100755 --- a/main.nf +++ b/main.nf @@ -43,30 +43,29 @@ workflow PHOENIX { // Check input path parameters to see if they exist def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db] //removed , params.fasta to stop issue w/connecting to aws and igenomes not used for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } - - // Check mandatory parameters + if (params.ica != true && params.ica != false) {exit 1, "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods."} //input on command line if (params.input) { ch_input = file(params.input) } else { exit 1, 'For -entry PHOENIX: Input samplesheet not specified!' } ch_versions = Channel.empty() // Used to collect the software versions - if (params.ica != true && params.ica != false) {exit 1, "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods."} - + main: PHOENIX_EXTERNAL ( ch_input, ch_versions, true ) emit: - scaffolds = PHOENIX_EXTERNAL.out.scaffolds - trimmed_reads = PHOENIX_EXTERNAL.out.trimmed_reads - mlst = PHOENIX_EXTERNAL.out.mlst - amrfinder_output = PHOENIX_EXTERNAL.out.amrfinder_output - gamma_ar = PHOENIX_EXTERNAL.out.gamma_ar - phx_summary = PHOENIX_EXTERNAL.out.phx_summary - //output for phylophoenix - griphin_tsv = PHOENIX_EXTERNAL.out.griphin_tsv - griphin_excel = PHOENIX_EXTERNAL.out.griphin_excel - dir_samplesheet = PHOENIX_EXTERNAL.out.dir_samplesheet - //output for ncbi upload - ncbi_sra_sheet = params.create_ncbi_sheet ? PHOENIX_EXTERNAL.out.ncbi_sra_sheet : null - ncbi_biosample_sheet = params.create_ncbi_sheet ? PHOENIX_EXTERNAL.out.ncbi_biosample_sheet : null + check = PHOENIX_EXTERNAL.out.check + // scaffolds = PHOENIX_EXTERNAL.out.scaffolds + // trimmed_reads = PHOENIX_EXTERNAL.out.trimmed_reads + // mlst = PHOENIX_EXTERNAL.out.mlst + // amrfinder_output = PHOENIX_EXTERNAL.out.amrfinder_output + // gamma_ar = PHOENIX_EXTERNAL.out.gamma_ar + // phx_summary = PHOENIX_EXTERNAL.out.phx_summary + // //output for phylophoenix + // griphin_tsv = PHOENIX_EXTERNAL.out.griphin_tsv + // griphin_excel = PHOENIX_EXTERNAL.out.griphin_excel + // dir_samplesheet = PHOENIX_EXTERNAL.out.dir_samplesheet + // //output for ncbi upload + // ncbi_sra_sheet = params.create_ncbi_sheet ? PHOENIX_EXTERNAL.out.ncbi_sra_sheet : null + // ncbi_biosample_sheet = params.create_ncbi_sheet ? 
PHOENIX_EXTERNAL.out.ncbi_biosample_sheet : null } // From 47a0c5cf91ab185d2e40b990dcd676371cb2fe4b Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 15:35:30 +0000 Subject: [PATCH 07/40] refactor: create ncbi_excel_creation flag #150 --- main.nf | 2 +- nextflow.config | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b78411f8..a7a14d98 100755 --- a/main.nf +++ b/main.nf @@ -50,7 +50,7 @@ workflow PHOENIX { ch_versions = Channel.empty() // Used to collect the software versions main: - PHOENIX_EXTERNAL ( ch_input, ch_versions, true ) + PHOENIX_EXTERNAL ( ch_input, ch_versions, params.ncbi_excel_creation ) emit: check = PHOENIX_EXTERNAL.out.check // scaffolds = PHOENIX_EXTERNAL.out.scaffolds diff --git a/nextflow.config b/nextflow.config index 4c935260..55eb08ef 100755 --- a/nextflow.config +++ b/nextflow.config @@ -33,6 +33,7 @@ params { use_sra = false // For NCBI spreadsheet creation + ncbi_excel_creation = true microbe_example = "${baseDir}/assets/Microbe.1.0_Example_Data.xlsx" sra_metadata = "${baseDir}/assets/SRA_metadata_example.xlsx" osii_bioprojects = "${baseDir}/assets/osii-bioprojects.yaml" From 5221309ff887dca4be93c0b4c566725d0d4283a8 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 15:48:22 +0000 Subject: [PATCH 08/40] refactor: filtering strategy bbduk #147 --- modules/local/bbduk.nf | 4 ---- workflows/phoenix.nf | 5 ++++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/local/bbduk.nf b/modules/local/bbduk.nf index b409dbbb..e4b1e8f9 100755 --- a/modules/local/bbduk.nf +++ b/modules/local/bbduk.nf @@ -13,10 +13,6 @@ process BBDUK { tuple val(meta), path('*.log') , emit: log path "versions.yml" , emit: versions - when: - //if the files are not corrupt and there are equal number of reads in each file then run bbduk - "${fairy_outcome[0]}" == "PASSED: File ${meta.id}_R1 is not corrupt." && "${fairy_outcome[1]}" == "PASSED: File ${meta.id}_R2 is not corrupt." && "${fairy_outcome[2]}" == "PASSED: Read pairs for ${meta.id} are equal." 
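
The guard deleted here was the old gating mechanism: the process re-checked the fairy outcome strings itself before running. The replacement strategy, visible in the workflow-side hunk later in this patch (and in patch 02 above), moves that decision upstream into the channel, so a failed sample never reaches the process at all. A minimal standalone sketch of the pattern, with invented channel names (ch_reads, ch_fairy_outcome) and assuming outcome is the per-sample list of PASSED/FAILED strings:

    // hedged sketch of the upstream-filter pattern; not the literal pipeline code
    ch_reads
        .join(ch_fairy_outcome, by: [0,0])            // -> [meta, reads, outcome]
        .filter { meta, reads, outcome ->
            outcome.every { !it.contains('FAILED') }  // keep a sample only if every check passed
        }
        .set { ch_reads_passing }

Two differences from the workflow hunks are worth noting: findAll (as used there) returns the non-FAILED entries, so the sample is kept whenever at least one check passed, while every in this sketch demands that all of them pass; and with a [meta, reads, outcome] tuple the outcome list sits at it[2], not it[3], which is the index the later "fix meta tuple called" commit corrects.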
- script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 4869176f..33e447b2 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -140,7 +140,10 @@ workflow PHOENIX_EXTERNAL { ch_versions = ch_versions.mix(GET_RAW_STATS.out.versions) // Combining reads with output of corruption check - bbduk_ch = INPUT_CHECK.out.reads.join(GET_RAW_STATS.out.outcome.splitCsv(strip:true, by:3).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0]]]}, by: [0,0]) + bbduk_ch = INPUT_CHECK.out.reads + .join(GET_RAW_STATS.out.outcome.splitCsv(strip:true, by:3) + .map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0]]]}, by: [0,0]) + .filter { it[3].findAll {!it.contains('FAILED')}} // Remove PhiX reads BBDUK ( From f33a04a8280b26703b4a154795e2d7502b765d74 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 16:31:44 +0000 Subject: [PATCH 09/40] chore: fix meta tuple called --- workflows/phoenix.nf | 646 +++++++++++++++++++++---------------------- 1 file changed, 323 insertions(+), 323 deletions(-) diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 33e447b2..9d7357af 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -143,7 +143,7 @@ workflow PHOENIX_EXTERNAL { bbduk_ch = INPUT_CHECK.out.reads .join(GET_RAW_STATS.out.outcome.splitCsv(strip:true, by:3) .map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0]]]}, by: [0,0]) - .filter { it[3].findAll {!it.contains('FAILED')}} + .filter { it[2].findAll {!it.contains('FAILED')}} // Remove PhiX reads BBDUK ( @@ -157,330 +157,330 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(FASTP_TRIMD.out.versions) - // Rerun on unpaired reads to get stats, nothing removed - FASTP_SINGLES ( - FASTP_TRIMD.out.reads_fail - ) - ch_versions = ch_versions.mix(FASTP_SINGLES.out.versions) - - // Combining fastp json outputs based on meta.id - fastp_json_ch = FASTP_TRIMD.out.json.join(FASTP_SINGLES.out.json, by: [0,0])\ - .join(GET_RAW_STATS.out.combined_raw_stats, by: [0,0])\ - .join(GET_RAW_STATS.out.outcome_to_edit, by: [0,0]) - - // Script gathers data from fastp jsons for pipeline stats file - GET_TRIMD_STATS ( - fastp_json_ch, false // false says no busco is being run - ) - ch_versions = ch_versions.mix(GET_TRIMD_STATS.out.versions) - - // combing fastp_trimd information with fairy check of reads to confirm there are reads after filtering - trimd_reads_file_integrity_ch = FASTP_TRIMD.out.reads.join(GET_TRIMD_STATS.out.outcome.splitCsv(strip:true, by:5).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0,0]) - - // Running Fastqc on trimmed reads - FASTQCTRIMD ( - trimd_reads_file_integrity_ch - ) - ch_versions = ch_versions.mix(FASTQCTRIMD.out.versions.first()) - - // Checking for Contamination in trimmed reads, creating krona plots and best hit files - KRAKEN2_TRIMD ( - FASTP_TRIMD.out.reads, GET_TRIMD_STATS.out.outcome, "trimd", GET_TRIMD_STATS.out.fastp_total_qc, [], ASSET_CHECK.out.kraken_db, "reads" - ) - ch_versions = ch_versions.mix(KRAKEN2_TRIMD.out.versions) - - SPADES_WF ( - FASTP_SINGLES.out.reads, \ - FASTP_TRIMD.out.reads, \ - GET_TRIMD_STATS.out.fastp_total_qc, \ - GET_RAW_STATS.out.combined_raw_stats, \ - [], \ - KRAKEN2_TRIMD.out.report, \ - KRAKEN2_TRIMD.out.krona_html, \ - 
KRAKEN2_TRIMD.out.k2_bh_summary, \ - false - ) - ch_versions = ch_versions.mix(SPADES_WF.out.versions) - - // Rename scaffold headers - RENAME_FASTA_HEADERS ( - SPADES_WF.out.spades_ch - ) - ch_versions = ch_versions.mix(RENAME_FASTA_HEADERS.out.versions) - - // Removing scaffolds <500bp - BBMAP_REFORMAT ( - RENAME_FASTA_HEADERS.out.renamed_scaffolds - ) - ch_versions = ch_versions.mix(BBMAP_REFORMAT.out.versions) - - // Combine bbmap log with the fairy outcome file - scaffold_check_ch = BBMAP_REFORMAT.out.log.map{meta, log -> [[id:meta.id], log]}\ - .join(GET_TRIMD_STATS.out.outcome_to_edit.map{ meta, outcome_to_edit -> [[id:meta.id], outcome_to_edit]}, by: [0])\ - .join(GET_RAW_STATS.out.combined_raw_stats.map{meta, combined_raw_stats -> [[id:meta.id], combined_raw_stats]}, by: [0])\ - .join(GET_TRIMD_STATS.out.fastp_total_qc.map{ meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}, by: [0])\ - .join(KRAKEN2_TRIMD.out.report.map{ meta, report -> [[id:meta.id], report]}, by: [0])\ - .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}, by: [0])\ - .join(KRAKEN2_TRIMD.out.krona_html.map{ meta, krona_html -> [[id:meta.id], krona_html]}, by: [0]) - - // Checking that there are still scaffolds left after filtering - SCAFFOLD_COUNT_CHECK ( - scaffold_check_ch, false, params.coverage, params.nodes, params.names - ) - ch_versions = ch_versions.mix(SCAFFOLD_COUNT_CHECK.out.versions) - - //combing scaffolds with scaffold check information to ensure processes that need scaffolds only run when there are scaffolds in the file - filtered_scaffolds_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]} - .join(SCAFFOLD_COUNT_CHECK.out.outcome.splitCsv(strip:true, by:5).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0]) - - // Running gamma to identify hypervirulence genes in scaffolds - GAMMA_HV ( - filtered_scaffolds_ch, params.hvgamdb - ) - ch_versions = ch_versions.mix(GAMMA_HV.out.versions) - - // Running gamma to identify AR genes in scaffolds - GAMMA_AR ( - filtered_scaffolds_ch, params.ardb - ) - ch_versions = ch_versions.mix(GAMMA_AR.out.versions) - - GAMMA_PF ( - filtered_scaffolds_ch, params.gamdbpf - ) - ch_versions = ch_versions.mix(GAMMA_PF.out.versions) - - // Getting Assembly Stats - QUAST ( - filtered_scaffolds_ch - ) - ch_versions = ch_versions.mix(QUAST.out.versions) - - // Creating krona plots and best hit files for weighted assembly - KRAKEN2_WTASMBLD ( - BBMAP_REFORMAT.out.filtered_scaffolds, SCAFFOLD_COUNT_CHECK.out.outcome, "wtasmbld", [], QUAST.out.report_tsv, ASSET_CHECK.out.kraken_db, "reads" - ) - ch_versions = ch_versions.mix(KRAKEN2_WTASMBLD.out.versions) - - // combine filtered scaffolds and mash_sketch so mash_sketch goes with each filtered_scaffolds file - mash_dist_ch = filtered_scaffolds_ch.combine(ASSET_CHECK.out.mash_sketch) - - // Running Mash distance to get top 20 matches for fastANI to speed things up - MASH_DIST ( - mash_dist_ch - ) - ch_versions = ch_versions.mix(MASH_DIST.out.versions) - - // Combining mash dist with filtered scaffolds and the outcome of the scaffolds count check based on meta.id - top_mash_hits_ch = MASH_DIST.out.dist.join(filtered_scaffolds_ch, by: [0]) - - // Generate file with list of paths of top taxa for fastANI - DETERMINE_TOP_MASH_HITS ( - top_mash_hits_ch - ) - ch_versions = ch_versions.mix(DETERMINE_TOP_MASH_HITS.out.versions) - - 
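
The blocks in this section all remap tuples to [[id:meta.id], value] before joining, because join matches keys by object equality and different processes can decorate meta with different keys. A self-contained toy example of the failure mode and the remap (channel names are invented for illustration):

    // without the remap these would NOT join: [id:'a', single_end:false] != [id:'a']
    ch_a = Channel.of( [ [id:'a', single_end:false], 'a.scaffolds.fa' ] )
    ch_b = Channel.of( [ [id:'a'], 'a.taxa.txt' ] )

    ch_a.map { meta, f -> [ [id:meta.id], f ] }
        .join( ch_b.map { meta, f -> [ [id:meta.id], f ] }, by: [0] )
        .view()   // emits: [ [id:a], a.scaffolds.fa, a.taxa.txt ]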
// Combining filtered scaffolds with the top taxa list based on meta.id - top_taxa_list_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]}\ - .join(DETERMINE_TOP_MASH_HITS.out.top_taxa_list.map{ meta, top_taxa_list -> [[id:meta.id], top_taxa_list ]}, by: [0])\ - .join(DETERMINE_TOP_MASH_HITS.out.reference_dir.map{ meta, reference_dir -> [[id:meta.id], reference_dir ]}, by: [0]) - - // Getting species ID - FASTANI ( - top_taxa_list_ch - ) - ch_versions = ch_versions.mix(FASTANI.out.versions) - - // Reformat ANI headers - FORMAT_ANI ( - FASTANI.out.ani - ) - ch_versions = ch_versions.mix(FORMAT_ANI.out.versions) - - // Combining weighted kraken report with the FastANI hit based on meta.id - best_hit_ch = KRAKEN2_WTASMBLD.out.k2_bh_summary.map{meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}\ - .join(FORMAT_ANI.out.ani_best_hit.map{ meta, ani_best_hit -> [[id:meta.id], ani_best_hit ]}, by: [0])\ - .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary ]}, by: [0]) - - // Getting ID from either FastANI or if fails, from Kraken2 - DETERMINE_TAXA_ID ( - best_hit_ch, params.nodes, params.names - ) - ch_versions = ch_versions.mix(DETERMINE_TAXA_ID.out.versions) - - // Perform MLST steps on isolates (with srst2 on internal samples) - DO_MLST ( - BBMAP_REFORMAT.out.filtered_scaffolds, \ - SCAFFOLD_COUNT_CHECK.out.outcome, \ - FASTP_TRIMD.out.reads, \ - DETERMINE_TAXA_ID.out.taxonomy, \ - ASSET_CHECK.out.mlst_db, \ - false - ) - ch_versions = ch_versions.mix(DO_MLST.out.versions) - - // get gff and protein files for amrfinder+ - PROKKA ( - filtered_scaffolds_ch, [], [] - ) - ch_versions = ch_versions.mix(PROKKA.out.versions) - - /*// Fetch AMRFinder Database - AMRFINDERPLUS_UPDATE( ) - ch_versions = ch_versions.mix(AMRFINDERPLUS_UPDATE.out.versions)*/ - - // Create file that has the organism name to pass to AMRFinder - GET_TAXA_FOR_AMRFINDER ( - DETERMINE_TAXA_ID.out.taxonomy - ) - ch_versions = ch_versions.mix(GET_TAXA_FOR_AMRFINDER.out.versions) - - // Combining taxa and scaffolds to run amrfinder and get the point mutations. 
- amr_channel = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, reads -> [[id:meta.id], reads]}\ - .join(GET_TAXA_FOR_AMRFINDER.out.amrfinder_taxa.splitCsv(strip:true).map{meta, amrfinder_taxa -> [[id:meta.id], amrfinder_taxa ]}, by: [0])\ - .join(PROKKA.out.faa.map{ meta, faa -> [[id:meta.id], faa ]}, by: [0])\ - .join(PROKKA.out.gff.map{ meta, gff -> [[id:meta.id], gff ]}, by: [0]) - - // Run AMRFinder - AMRFINDERPLUS_RUN ( - amr_channel, params.amrfinder_db - ) - ch_versions = ch_versions.mix(AMRFINDERPLUS_RUN.out.versions) - - // Combining determined taxa with the assembly stats based on meta.id - assembly_ratios_ch = DETERMINE_TAXA_ID.out.taxonomy.map{meta, taxonomy -> [[id:meta.id], taxonomy]}\ - .join(QUAST.out.report_tsv.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0]) - - // Calculating the assembly ratio and gather GC% stats - CALCULATE_ASSEMBLY_RATIO ( - assembly_ratios_ch, params.ncbi_assembly_stats - ) - ch_versions = ch_versions.mix(CALCULATE_ASSEMBLY_RATIO.out.versions) - - GENERATE_PIPELINE_STATS_WF ( - GET_RAW_STATS.out.combined_raw_stats, \ - GET_TRIMD_STATS.out.fastp_total_qc, \ - [], \ - KRAKEN2_TRIMD.out.report, \ - KRAKEN2_TRIMD.out.krona_html, \ - KRAKEN2_TRIMD.out.k2_bh_summary, \ - RENAME_FASTA_HEADERS.out.renamed_scaffolds, \ - BBMAP_REFORMAT.out.filtered_scaffolds, \ - DO_MLST.out.checked_MLSTs, \ - GAMMA_HV.out.gamma, \ - GAMMA_AR.out.gamma, \ - GAMMA_PF.out.gamma, \ - QUAST.out.report_tsv, \ - [], [], [], [], \ - KRAKEN2_WTASMBLD.out.report, \ - KRAKEN2_WTASMBLD.out.krona_html, \ - KRAKEN2_WTASMBLD.out.k2_bh_summary, \ - DETERMINE_TAXA_ID.out.taxonomy, \ - FORMAT_ANI.out.ani_best_hit, \ - CALCULATE_ASSEMBLY_RATIO.out.ratio, \ - AMRFINDERPLUS_RUN.out.mutation_report, \ - CALCULATE_ASSEMBLY_RATIO.out.gc_content, \ - false - ) - ch_versions = ch_versions.mix(GENERATE_PIPELINE_STATS_WF.out.versions) - - // Combining output based on meta.id to create summary by sample -- is this verbose, ugly and annoying? yes, if anyone has a slicker way to do this we welcome the input. 
- line_summary_ch = GET_TRIMD_STATS.out.fastp_total_qc.map{meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}\ - .join(DO_MLST.out.checked_MLSTs.map{ meta, checked_MLSTs -> [[id:meta.id], checked_MLSTs]}, by: [0])\ - .join(GAMMA_HV.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ - .join(GAMMA_AR.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ - .join(GAMMA_PF.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ - .join(QUAST.out.report_tsv.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0])\ - .join(CALCULATE_ASSEMBLY_RATIO.out.ratio.map{ meta, ratio -> [[id:meta.id], ratio]}, by: [0])\ - .join(GENERATE_PIPELINE_STATS_WF.out.pipeline_stats.map{ meta, pipeline_stats -> [[id:meta.id], pipeline_stats]}, by: [0])\ - .join(DETERMINE_TAXA_ID.out.taxonomy.map{ meta, taxonomy -> [[id:meta.id], taxonomy]}, by: [0])\ - .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}, by: [0])\ - .join(AMRFINDERPLUS_RUN.out.report.map{ meta, report -> [[id:meta.id], report]}, by: [0])\ - .join(FORMAT_ANI.out.ani_best_hit.map{ meta, ani_best_hit -> [[id:meta.id], ani_best_hit]}, by: [0]) - - // Generate summary per sample that passed SPAdes - CREATE_SUMMARY_LINE ( - line_summary_ch - ) - ch_versions = ch_versions.mix(CREATE_SUMMARY_LINE.out.versions) - - // Collect all the summary files prior to fetch step to force the fetch process to wait - failed_summaries_ch = SPADES_WF.out.line_summary.collect().ifEmpty(params.placeholder) // if no spades failure pass empty file to keep it moving... - // If you only run one sample and it fails spades there is nothing in the create line summary so pass an empty list to keep it moving... - summaries_ch = CREATE_SUMMARY_LINE.out.line_summary.collect().ifEmpty( [] ) - - // This will check the output directory for an files ending in "_summaryline_failure.tsv" and add them to the output channel - FETCH_FAILED_SUMMARIES ( - outdir_path, failed_summaries_ch, summaries_ch - ) - ch_versions = ch_versions.mix(FETCH_FAILED_SUMMARIES.out.versions) - - // combine all line summaries into one channel - spades_failure_summaries_ch = FETCH_FAILED_SUMMARIES.out.spades_failure_summary_line - fairy_summary_ch = CORRUPTION_CHECK.out.summary_line.collect().ifEmpty( [] )\ - .combine(GET_RAW_STATS.out.summary_line.collect().ifEmpty( [] ))\ - .combine(GET_TRIMD_STATS.out.summary_line.collect().ifEmpty( [] ))\ - .combine(SCAFFOLD_COUNT_CHECK.out.summary_line.collect().ifEmpty( [] ))\ - .ifEmpty( [] ) - - // pulling it all together - all_summaries_ch = spades_failure_summaries_ch.combine(failed_summaries_ch).combine(summaries_ch).combine(fairy_summary_ch) - - // Combining sample summaries into final report - GATHER_SUMMARY_LINES ( - all_summaries_ch, outdir_path, false - ) - ch_versions = ch_versions.mix(GATHER_SUMMARY_LINES.out.versions) - - //create GRiPHin report - GRIPHIN ( - all_summaries_ch, INPUT_CHECK.out.valid_samplesheet, params.ardb, outdir_path, params.coverage, true, false - ) - ch_versions = ch_versions.mix(GRIPHIN.out.versions) - - if (ncbi_excel_creation == true && params.create_ncbi_sheet == true) { - // requiring files so that this process doesn't start until needed files are made. 
- required_files_ch = FASTP_TRIMD.out.reads.map{ meta, reads -> reads[0]}.collect().combine(DO_MLST.out.checked_MLSTs.map{ meta, checked_MLSTs -> checked_MLSTs}.collect()).combine(DETERMINE_TAXA_ID.out.taxonomy.map{ meta, taxonomy -> taxonomy}.collect()) - - //Fill out NCBI excel sheets for upload based on what PHX found - CREATE_NCBI_UPLOAD_SHEET ( - required_files_ch, params.microbe_example, params.sra_metadata, params.osii_bioprojects, outdir_path, GRIPHIN.out.griphin_tsv_report - ) - ch_versions = ch_versions.mix(CREATE_NCBI_UPLOAD_SHEET.out.versions) - } - - // Collecting the software versions - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - - // - // MODULE: MultiQC - // - workflow_summary = WorkflowPhoenix.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) - ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQCTRIMD.out.zip.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_TRIMD.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLES.out.json.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(BBDUK.out.log.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(QUAST.out.report_tsv.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_TRIMD.out.report.collect{it[1]}.ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_WTASMBLD.out.report.collect{it[1]}.ifEmpty([])) - - MULTIQC ( - ch_multiqc_files.collect() - ) - multiqc_report = MULTIQC.out.report.toList() - ch_versions = ch_versions.mix(MULTIQC.out.versions) + // // Rerun on unpaired reads to get stats, nothing removed + // FASTP_SINGLES ( + // FASTP_TRIMD.out.reads_fail + // ) + // ch_versions = ch_versions.mix(FASTP_SINGLES.out.versions) + + // // Combining fastp json outputs based on meta.id + // fastp_json_ch = FASTP_TRIMD.out.json.join(FASTP_SINGLES.out.json, by: [0,0])\ + // .join(GET_RAW_STATS.out.combined_raw_stats, by: [0,0])\ + // .join(GET_RAW_STATS.out.outcome_to_edit, by: [0,0]) + + // // Script gathers data from fastp jsons for pipeline stats file + // GET_TRIMD_STATS ( + // fastp_json_ch, false // false says no busco is being run + // ) + // ch_versions = ch_versions.mix(GET_TRIMD_STATS.out.versions) + + // // combing fastp_trimd information with fairy check of reads to confirm there are reads after filtering + // trimd_reads_file_integrity_ch = FASTP_TRIMD.out.reads.join(GET_TRIMD_STATS.out.outcome.splitCsv(strip:true, by:5).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0,0]) + + // // Running Fastqc on trimmed reads + // FASTQCTRIMD ( + // trimd_reads_file_integrity_ch + // ) + // ch_versions = ch_versions.mix(FASTQCTRIMD.out.versions.first()) + + // // Checking for Contamination in trimmed reads, creating krona plots and best hit files + // KRAKEN2_TRIMD ( + // FASTP_TRIMD.out.reads, GET_TRIMD_STATS.out.outcome, "trimd", GET_TRIMD_STATS.out.fastp_total_qc, [], ASSET_CHECK.out.kraken_db, 
"reads" + // ) + // ch_versions = ch_versions.mix(KRAKEN2_TRIMD.out.versions) + + // SPADES_WF ( + // FASTP_SINGLES.out.reads, \ + // FASTP_TRIMD.out.reads, \ + // GET_TRIMD_STATS.out.fastp_total_qc, \ + // GET_RAW_STATS.out.combined_raw_stats, \ + // [], \ + // KRAKEN2_TRIMD.out.report, \ + // KRAKEN2_TRIMD.out.krona_html, \ + // KRAKEN2_TRIMD.out.k2_bh_summary, \ + // false + // ) + // ch_versions = ch_versions.mix(SPADES_WF.out.versions) + + // // Rename scaffold headers + // RENAME_FASTA_HEADERS ( + // SPADES_WF.out.spades_ch + // ) + // ch_versions = ch_versions.mix(RENAME_FASTA_HEADERS.out.versions) + + // // Removing scaffolds <500bp + // BBMAP_REFORMAT ( + // RENAME_FASTA_HEADERS.out.renamed_scaffolds + // ) + // ch_versions = ch_versions.mix(BBMAP_REFORMAT.out.versions) + + // // Combine bbmap log with the fairy outcome file + // scaffold_check_ch = BBMAP_REFORMAT.out.log.map{meta, log -> [[id:meta.id], log]}\ + // .join(GET_TRIMD_STATS.out.outcome_to_edit.map{ meta, outcome_to_edit -> [[id:meta.id], outcome_to_edit]}, by: [0])\ + // .join(GET_RAW_STATS.out.combined_raw_stats.map{meta, combined_raw_stats -> [[id:meta.id], combined_raw_stats]}, by: [0])\ + // .join(GET_TRIMD_STATS.out.fastp_total_qc.map{ meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}, by: [0])\ + // .join(KRAKEN2_TRIMD.out.report.map{ meta, report -> [[id:meta.id], report]}, by: [0])\ + // .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}, by: [0])\ + // .join(KRAKEN2_TRIMD.out.krona_html.map{ meta, krona_html -> [[id:meta.id], krona_html]}, by: [0]) + + // // Checking that there are still scaffolds left after filtering + // SCAFFOLD_COUNT_CHECK ( + // scaffold_check_ch, false, params.coverage, params.nodes, params.names + // ) + // ch_versions = ch_versions.mix(SCAFFOLD_COUNT_CHECK.out.versions) + + // //combing scaffolds with scaffold check information to ensure processes that need scaffolds only run when there are scaffolds in the file + // filtered_scaffolds_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]} + // .join(SCAFFOLD_COUNT_CHECK.out.outcome.splitCsv(strip:true, by:5).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0]) + + // // Running gamma to identify hypervirulence genes in scaffolds + // GAMMA_HV ( + // filtered_scaffolds_ch, params.hvgamdb + // ) + // ch_versions = ch_versions.mix(GAMMA_HV.out.versions) + + // // Running gamma to identify AR genes in scaffolds + // GAMMA_AR ( + // filtered_scaffolds_ch, params.ardb + // ) + // ch_versions = ch_versions.mix(GAMMA_AR.out.versions) + + // GAMMA_PF ( + // filtered_scaffolds_ch, params.gamdbpf + // ) + // ch_versions = ch_versions.mix(GAMMA_PF.out.versions) + + // // Getting Assembly Stats + // QUAST ( + // filtered_scaffolds_ch + // ) + // ch_versions = ch_versions.mix(QUAST.out.versions) + + // // Creating krona plots and best hit files for weighted assembly + // KRAKEN2_WTASMBLD ( + // BBMAP_REFORMAT.out.filtered_scaffolds, SCAFFOLD_COUNT_CHECK.out.outcome, "wtasmbld", [], QUAST.out.report_tsv, ASSET_CHECK.out.kraken_db, "reads" + // ) + // ch_versions = ch_versions.mix(KRAKEN2_WTASMBLD.out.versions) + + // // combine filtered scaffolds and mash_sketch so mash_sketch goes with each filtered_scaffolds file + // mash_dist_ch = filtered_scaffolds_ch.combine(ASSET_CHECK.out.mash_sketch) + + // // Running Mash distance to get top 20 
matches for fastANI to speed things up + // MASH_DIST ( + // mash_dist_ch + // ) + // ch_versions = ch_versions.mix(MASH_DIST.out.versions) + + // // Combining mash dist with filtered scaffolds and the outcome of the scaffolds count check based on meta.id + // top_mash_hits_ch = MASH_DIST.out.dist.join(filtered_scaffolds_ch, by: [0]) + + // // Generate file with list of paths of top taxa for fastANI + // DETERMINE_TOP_MASH_HITS ( + // top_mash_hits_ch + // ) + // ch_versions = ch_versions.mix(DETERMINE_TOP_MASH_HITS.out.versions) + + // // Combining filtered scaffolds with the top taxa list based on meta.id + // top_taxa_list_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]}\ + // .join(DETERMINE_TOP_MASH_HITS.out.top_taxa_list.map{ meta, top_taxa_list -> [[id:meta.id], top_taxa_list ]}, by: [0])\ + // .join(DETERMINE_TOP_MASH_HITS.out.reference_dir.map{ meta, reference_dir -> [[id:meta.id], reference_dir ]}, by: [0]) + + // // Getting species ID + // FASTANI ( + // top_taxa_list_ch + // ) + // ch_versions = ch_versions.mix(FASTANI.out.versions) + + // // Reformat ANI headers + // FORMAT_ANI ( + // FASTANI.out.ani + // ) + // ch_versions = ch_versions.mix(FORMAT_ANI.out.versions) + + // // Combining weighted kraken report with the FastANI hit based on meta.id + // best_hit_ch = KRAKEN2_WTASMBLD.out.k2_bh_summary.map{meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}\ + // .join(FORMAT_ANI.out.ani_best_hit.map{ meta, ani_best_hit -> [[id:meta.id], ani_best_hit ]}, by: [0])\ + // .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary ]}, by: [0]) + + // // Getting ID from either FastANI or if fails, from Kraken2 + // DETERMINE_TAXA_ID ( + // best_hit_ch, params.nodes, params.names + // ) + // ch_versions = ch_versions.mix(DETERMINE_TAXA_ID.out.versions) + + // // Perform MLST steps on isolates (with srst2 on internal samples) + // DO_MLST ( + // BBMAP_REFORMAT.out.filtered_scaffolds, \ + // SCAFFOLD_COUNT_CHECK.out.outcome, \ + // FASTP_TRIMD.out.reads, \ + // DETERMINE_TAXA_ID.out.taxonomy, \ + // ASSET_CHECK.out.mlst_db, \ + // false + // ) + // ch_versions = ch_versions.mix(DO_MLST.out.versions) + + // // get gff and protein files for amrfinder+ + // PROKKA ( + // filtered_scaffolds_ch, [], [] + // ) + // ch_versions = ch_versions.mix(PROKKA.out.versions) + + // /*// Fetch AMRFinder Database + // AMRFINDERPLUS_UPDATE( ) + // ch_versions = ch_versions.mix(AMRFINDERPLUS_UPDATE.out.versions)*/ + + // // Create file that has the organism name to pass to AMRFinder + // GET_TAXA_FOR_AMRFINDER ( + // DETERMINE_TAXA_ID.out.taxonomy + // ) + // ch_versions = ch_versions.mix(GET_TAXA_FOR_AMRFINDER.out.versions) + + // // Combining taxa and scaffolds to run amrfinder and get the point mutations. 
+ // amr_channel = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, reads -> [[id:meta.id], reads]}\ + // .join(GET_TAXA_FOR_AMRFINDER.out.amrfinder_taxa.splitCsv(strip:true).map{meta, amrfinder_taxa -> [[id:meta.id], amrfinder_taxa ]}, by: [0])\ + // .join(PROKKA.out.faa.map{ meta, faa -> [[id:meta.id], faa ]}, by: [0])\ + // .join(PROKKA.out.gff.map{ meta, gff -> [[id:meta.id], gff ]}, by: [0]) + + // // Run AMRFinder + // AMRFINDERPLUS_RUN ( + // amr_channel, params.amrfinder_db + // ) + // ch_versions = ch_versions.mix(AMRFINDERPLUS_RUN.out.versions) + + // // Combining determined taxa with the assembly stats based on meta.id + // assembly_ratios_ch = DETERMINE_TAXA_ID.out.taxonomy.map{meta, taxonomy -> [[id:meta.id], taxonomy]}\ + // .join(QUAST.out.report_tsv.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0]) + + // // Calculating the assembly ratio and gather GC% stats + // CALCULATE_ASSEMBLY_RATIO ( + // assembly_ratios_ch, params.ncbi_assembly_stats + // ) + // ch_versions = ch_versions.mix(CALCULATE_ASSEMBLY_RATIO.out.versions) + + // GENERATE_PIPELINE_STATS_WF ( + // GET_RAW_STATS.out.combined_raw_stats, \ + // GET_TRIMD_STATS.out.fastp_total_qc, \ + // [], \ + // KRAKEN2_TRIMD.out.report, \ + // KRAKEN2_TRIMD.out.krona_html, \ + // KRAKEN2_TRIMD.out.k2_bh_summary, \ + // RENAME_FASTA_HEADERS.out.renamed_scaffolds, \ + // BBMAP_REFORMAT.out.filtered_scaffolds, \ + // DO_MLST.out.checked_MLSTs, \ + // GAMMA_HV.out.gamma, \ + // GAMMA_AR.out.gamma, \ + // GAMMA_PF.out.gamma, \ + // QUAST.out.report_tsv, \ + // [], [], [], [], \ + // KRAKEN2_WTASMBLD.out.report, \ + // KRAKEN2_WTASMBLD.out.krona_html, \ + // KRAKEN2_WTASMBLD.out.k2_bh_summary, \ + // DETERMINE_TAXA_ID.out.taxonomy, \ + // FORMAT_ANI.out.ani_best_hit, \ + // CALCULATE_ASSEMBLY_RATIO.out.ratio, \ + // AMRFINDERPLUS_RUN.out.mutation_report, \ + // CALCULATE_ASSEMBLY_RATIO.out.gc_content, \ + // false + // ) + // ch_versions = ch_versions.mix(GENERATE_PIPELINE_STATS_WF.out.versions) + + // // Combining output based on meta.id to create summary by sample -- is this verbose, ugly and annoying? yes, if anyone has a slicker way to do this we welcome the input. 
+ // line_summary_ch = GET_TRIMD_STATS.out.fastp_total_qc.map{meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}\ + // .join(DO_MLST.out.checked_MLSTs.map{ meta, checked_MLSTs -> [[id:meta.id], checked_MLSTs]}, by: [0])\ + // .join(GAMMA_HV.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ + // .join(GAMMA_AR.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ + // .join(GAMMA_PF.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ + // .join(QUAST.out.report_tsv.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0])\ + // .join(CALCULATE_ASSEMBLY_RATIO.out.ratio.map{ meta, ratio -> [[id:meta.id], ratio]}, by: [0])\ + // .join(GENERATE_PIPELINE_STATS_WF.out.pipeline_stats.map{ meta, pipeline_stats -> [[id:meta.id], pipeline_stats]}, by: [0])\ + // .join(DETERMINE_TAXA_ID.out.taxonomy.map{ meta, taxonomy -> [[id:meta.id], taxonomy]}, by: [0])\ + // .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}, by: [0])\ + // .join(AMRFINDERPLUS_RUN.out.report.map{ meta, report -> [[id:meta.id], report]}, by: [0])\ + // .join(FORMAT_ANI.out.ani_best_hit.map{ meta, ani_best_hit -> [[id:meta.id], ani_best_hit]}, by: [0]) + + // // Generate summary per sample that passed SPAdes + // CREATE_SUMMARY_LINE ( + // line_summary_ch + // ) + // ch_versions = ch_versions.mix(CREATE_SUMMARY_LINE.out.versions) + + // // Collect all the summary files prior to fetch step to force the fetch process to wait + // failed_summaries_ch = SPADES_WF.out.line_summary.collect().ifEmpty(params.placeholder) // if no spades failure pass empty file to keep it moving... + // // If you only run one sample and it fails spades there is nothing in the create line summary so pass an empty list to keep it moving... + // summaries_ch = CREATE_SUMMARY_LINE.out.line_summary.collect().ifEmpty( [] ) + + // // This will check the output directory for an files ending in "_summaryline_failure.tsv" and add them to the output channel + // FETCH_FAILED_SUMMARIES ( + // outdir_path, failed_summaries_ch, summaries_ch + // ) + // ch_versions = ch_versions.mix(FETCH_FAILED_SUMMARIES.out.versions) + + // // combine all line summaries into one channel + // spades_failure_summaries_ch = FETCH_FAILED_SUMMARIES.out.spades_failure_summary_line + // fairy_summary_ch = CORRUPTION_CHECK.out.summary_line.collect().ifEmpty( [] )\ + // .combine(GET_RAW_STATS.out.summary_line.collect().ifEmpty( [] ))\ + // .combine(GET_TRIMD_STATS.out.summary_line.collect().ifEmpty( [] ))\ + // .combine(SCAFFOLD_COUNT_CHECK.out.summary_line.collect().ifEmpty( [] ))\ + // .ifEmpty( [] ) + + // // pulling it all together + // all_summaries_ch = spades_failure_summaries_ch.combine(failed_summaries_ch).combine(summaries_ch).combine(fairy_summary_ch) + + // // Combining sample summaries into final report + // GATHER_SUMMARY_LINES ( + // all_summaries_ch, outdir_path, false + // ) + // ch_versions = ch_versions.mix(GATHER_SUMMARY_LINES.out.versions) + + // //create GRiPHin report + // GRIPHIN ( + // all_summaries_ch, INPUT_CHECK.out.valid_samplesheet, params.ardb, outdir_path, params.coverage, true, false + // ) + // ch_versions = ch_versions.mix(GRIPHIN.out.versions) + + // if (ncbi_excel_creation == true && params.create_ncbi_sheet == true) { + // // requiring files so that this process doesn't start until needed files are made. 
+ // required_files_ch = FASTP_TRIMD.out.reads.map{ meta, reads -> reads[0]}.collect().combine(DO_MLST.out.checked_MLSTs.map{ meta, checked_MLSTs -> checked_MLSTs}.collect()).combine(DETERMINE_TAXA_ID.out.taxonomy.map{ meta, taxonomy -> taxonomy}.collect()) + + // //Fill out NCBI excel sheets for upload based on what PHX found + // CREATE_NCBI_UPLOAD_SHEET ( + // required_files_ch, params.microbe_example, params.sra_metadata, params.osii_bioprojects, outdir_path, GRIPHIN.out.griphin_tsv_report + // ) + // ch_versions = ch_versions.mix(CREATE_NCBI_UPLOAD_SHEET.out.versions) + // } + + // // Collecting the software versions + // CUSTOM_DUMPSOFTWAREVERSIONS ( + // ch_versions.unique().collectFile(name: 'collated_versions.yml') + // ) + + // // + // // MODULE: MultiQC + // // + // workflow_summary = WorkflowPhoenix.paramsSummaryMultiqc(workflow, summary_params) + // ch_workflow_summary = Channel.value(workflow_summary) + + // ch_multiqc_files = Channel.empty() + // ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) + // ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) + // ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + // ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) + // ch_multiqc_files = ch_multiqc_files.mix(FASTQCTRIMD.out.zip.collect{it[1]}.ifEmpty([])) + // ch_multiqc_files = ch_multiqc_files.mix(FASTP_TRIMD.out.json.collect{it[1]}.ifEmpty([])) + // ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLES.out.json.collect{it[1]}.ifEmpty([])) + // ch_multiqc_files = ch_multiqc_files.mix(BBDUK.out.log.collect{it[1]}.ifEmpty([])) + // ch_multiqc_files = ch_multiqc_files.mix(QUAST.out.report_tsv.collect{it[1]}.ifEmpty([])) + // ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_TRIMD.out.report.collect{it[1]}.ifEmpty([])) + // ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2_WTASMBLD.out.report.collect{it[1]}.ifEmpty([])) + + // MULTIQC ( + // ch_multiqc_files.collect() + // ) + // multiqc_report = MULTIQC.out.report.toList() + // ch_versions = ch_versions.mix(MULTIQC.out.versions) emit: - check = INPUT_CHECK.out.versions + check = FASTP_TRIMD.out.reads // outcome = GET_RAW_STATS.out.outcome // scaffolds = BBMAP_REFORMAT.out.filtered_scaffolds // trimmed_reads = FASTP_TRIMD.out.reads From e9abaa22377d0f50afc41c8bf7db47a78d953fc5 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 16:32:21 +0000 Subject: [PATCH 10/40] refactor: move fastp variables to config #150 --- nextflow.config | 2 ++ workflows/phoenix.nf | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 55eb08ef..a6841eb1 100755 --- a/nextflow.config +++ b/nextflow.config @@ -27,6 +27,8 @@ params { // Params for filtering minlength = 500 phred = 33 + save_trimmed_fail = true + save_merged = false // Additional input parameters for -entry SRA and CDC_SRA input_sra = null diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 9d7357af..df8e7c93 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -153,7 +153,7 @@ workflow PHOENIX_EXTERNAL { // Trim and remove low quality reads FASTP_TRIMD ( - BBDUK.out.reads, true, false + BBDUK.out.reads, params.save_trimmed_fail, params.save_merged ) ch_versions = ch_versions.mix(FASTP_TRIMD.out.versions) From 3c87d90aac30e9420806de37ec8752892d9b7b61 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 21:06:59 +0000 Subject: [PATCH 11/40] 
refactor: ica handling, filtering #147 #149 --- modules/local/get_trimd_stats.nf | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/modules/local/get_trimd_stats.nf b/modules/local/get_trimd_stats.nf index a14d4653..6ea2a575 100644 --- a/modules/local/get_trimd_stats.nf +++ b/modules/local/get_trimd_stats.nf @@ -13,41 +13,39 @@ process GET_TRIMD_STATS { output: tuple val(meta), path('*_trimmed_read_counts.txt'), emit: fastp_total_qc - tuple val(meta), path('*_summary.txt'), emit: outcome - path('*_summaryline.tsv'), optional:true, emit: summary_line - tuple val(meta), path('*_summary_old_3.txt'), emit: outcome_to_edit - tuple val(meta), path('*.synopsis'), optional:true, emit: synopsis + path('*_summaryline.tsv'), optional:true, emit: summary_line + tuple val(meta), path('*_summary_fastp.txt'), emit: outcome + tuple val(meta), path('*.synopsis'), optional:true, emit: synopsis path("versions.yml"), emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def busco_parameter = busco_val ? "--busco" : "" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script_fastp = params.ica ? "python ${params.ica_path}/FastP_QC.py" : "FastP_QC.py" + def script_fairy = params.ica ? "python ${params.ica_path}/fairy.py" : "fairy.py" """ - ${ica}FastP_QC.py \\ + ${script_fastp} \\ --trimmed_json ${fastp_trimd_json} \\ --single_json ${fastp_singles_json} \\ --name ${prefix} - # Check that there are still reads in R1 and R2 before fastqc. If there aren't reads then fastqc dies. + # making a copy of the summary file - this avoids writing to the previous file + cp ${fairy_outcome} ${prefix}_input.txt # Output check for messages indicating there are no trimmed reads after filtering. 
- ${ica}fairy.py -r ${raw_qc} -f ${fairy_outcome} -t ${prefix}_trimmed_read_counts.txt ${busco_parameter} + ${script_fairy} -r ${raw_qc} -f ${prefix}_input.txt -t ${prefix}_trimmed_read_counts.txt ${busco_parameter} #making a copy of the summary file to pass to BBMAP_REFORMAT to handle file names being the same - cp ${prefix}_summary.txt ${prefix}_summary_old_3.txt + mv ${prefix}_summary.txt ${prefix}_summary_fastp.txt cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - fairy.py: \$( ${ica}fairy.py --version ) - FastP_QC.py: \$(${ica}FastP_QC.py --version ) + fairy.py: \$( ${script_fairy} --version ) + FastP_QC.py: \$(${script_fastp} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS From fccbc89f4ae22c711960cf571300965bf890c4cb Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 21:07:45 +0000 Subject: [PATCH 12/40] refactor: ica handling #147 --- modules/local/fastp_singles.nf | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/local/fastp_singles.nf b/modules/local/fastp_singles.nf index 6328f8ed..dd4a1c60 100755 --- a/modules/local/fastp_singles.nf +++ b/modules/local/fastp_singles.nf @@ -19,14 +19,11 @@ process FASTP_SINGLES { task.ext.when == null || task.ext.when script: - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def container = task.container.toString() - "staphb/fastp@" + def script = params.ica ? "${params.ica_path}/create_empty_fastp_json.sh" : "create_empty_fastp_json.sh" """ echo "Debugging: Emptiness of reads[0] and reads[1]" > debug_status.log if [[ ! -s ${reads[0]} ]] && [[ ! -s ${reads[1]} ]]; then @@ -34,7 +31,7 @@ process FASTP_SINGLES { echo "!!!!! - Both are empty" # Both are empty, do nothing??? 
Nope we handle now #Create psuedo file as empty aint cutting it - ${ica}create_empty_fastp_json.sh -n ${prefix} + ${script} -n ${prefix} touch "${prefix}_empty.html" touch ${prefix}.singles.fastq gzip ${prefix}.singles.fastq @@ -69,7 +66,7 @@ process FASTP_SINGLES { 2> ${prefix}.fastp.log fi - script_version=\$(${ica}create_empty_fastp_json.sh -V) + script_version=\$(${script} -V) cat <<-END_VERSIONS > versions.yml "${task.process}": From 25a2ca6d25021d122ad0f1a6cbe30ece69146a9c Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 21:09:16 +0000 Subject: [PATCH 13/40] refactor: ica handling, filtering #147 #149 --- workflows/phoenix.nf | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index df8e7c93..1d4d9a2c 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -157,22 +157,22 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(FASTP_TRIMD.out.versions) - // // Rerun on unpaired reads to get stats, nothing removed - // FASTP_SINGLES ( - // FASTP_TRIMD.out.reads_fail - // ) - // ch_versions = ch_versions.mix(FASTP_SINGLES.out.versions) + // Rerun on unpaired reads to get stats, nothing removed + FASTP_SINGLES ( + FASTP_TRIMD.out.reads_fail + ) + ch_versions = ch_versions.mix(FASTP_SINGLES.out.versions) - // // Combining fastp json outputs based on meta.id - // fastp_json_ch = FASTP_TRIMD.out.json.join(FASTP_SINGLES.out.json, by: [0,0])\ - // .join(GET_RAW_STATS.out.combined_raw_stats, by: [0,0])\ - // .join(GET_RAW_STATS.out.outcome_to_edit, by: [0,0]) + // Combining fastp json outputs based on meta.id + fastp_json_ch = FASTP_TRIMD.out.json.join(FASTP_SINGLES.out.json, by: [0,0])\ + .join(GET_RAW_STATS.out.combined_raw_stats, by: [0,0])\ + .join(GET_RAW_STATS.out.outcome, by: [0,0]) - // // Script gathers data from fastp jsons for pipeline stats file - // GET_TRIMD_STATS ( - // fastp_json_ch, false // false says no busco is being run - // ) - // ch_versions = ch_versions.mix(GET_TRIMD_STATS.out.versions) + // Script gathers data from fastp jsons for pipeline stats file + GET_TRIMD_STATS ( + fastp_json_ch, params.run_busco // false says no busco is being run + ) + ch_versions = ch_versions.mix(GET_TRIMD_STATS.out.versions) // // combing fastp_trimd information with fairy check of reads to confirm there are reads after filtering // trimd_reads_file_integrity_ch = FASTP_TRIMD.out.reads.join(GET_TRIMD_STATS.out.outcome.splitCsv(strip:true, by:5).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0,0]) From 1a6fa53d36ed47109b5c7bae502af59b6021646e Mon Sep 17 00:00:00 2001 From: slsevilla Date: Sun, 31 Mar 2024 21:19:46 +0000 Subject: [PATCH 14/40] refactor: fastqc ica handling, filtering #147 #149 --- modules/local/fastqc.nf | 4 ---- workflows/phoenix.nf | 17 ++++++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/modules/local/fastqc.nf b/modules/local/fastqc.nf index aa7cf58c..863971ce 100755 --- a/modules/local/fastqc.nf +++ b/modules/local/fastqc.nf @@ -7,10 +7,6 @@ process FASTQC { input: tuple val(meta), path(reads), val(fairy_outcome) - when: - //if there are scaffolds left after filtering - "${fairy_outcome[3]}" == "PASSED: There are reads in ${meta.id} R1/R2 after trimming." 
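    // (illustrative sketch, not part of the original patch) the guard above moves to
    // the workflow level as a channel filter. Note that
    //   .filter { it[2].findAll { !it.contains('FAILED') } }
    // keeps a sample as long as *any* check line passed, since a non-empty list is
    // truthy in Groovy; a stricter variant that drops a sample on any failed check
    // would be:
    //   trimd_reads_file_integrity_ch.filter { meta, reads, outcome ->
    //       !outcome.any { line -> line.contains('FAILED') }
    //   }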
- output: tuple val(meta), path("*.html"), emit: html tuple val(meta), path("*.zip") , emit: zip diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 1d4d9a2c..bddc3b3b 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -174,14 +174,17 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(GET_TRIMD_STATS.out.versions) - // // combing fastp_trimd information with fairy check of reads to confirm there are reads after filtering - // trimd_reads_file_integrity_ch = FASTP_TRIMD.out.reads.join(GET_TRIMD_STATS.out.outcome.splitCsv(strip:true, by:5).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0,0]) + // combing fastp_trimd information with fairy check of reads to confirm there are reads after filtering + trimd_reads_file_integrity_ch = FASTP_TRIMD.out.reads + .join(GET_TRIMD_STATS.out.outcome.splitCsv(strip:true, by:5) + .map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0,0]) + .filter { it[2].findAll {!it.contains('FAILED')}} - // // Running Fastqc on trimmed reads - // FASTQCTRIMD ( - // trimd_reads_file_integrity_ch - // ) - // ch_versions = ch_versions.mix(FASTQCTRIMD.out.versions.first()) + // Running Fastqc on trimmed reads + FASTQCTRIMD ( + trimd_reads_file_integrity_ch + ) + ch_versions = ch_versions.mix(FASTQCTRIMD.out.versions.first()) // // Checking for Contamination in trimmed reads, creating krona plots and best hit files // KRAKEN2_TRIMD ( From 18132a9af8c6cd0a5e549bc88871458dd5d36318 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 03:42:43 +0000 Subject: [PATCH 15/40] refactor: move kraken params to config #150 --- nextflow.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow.config b/nextflow.config index a6841eb1..827c48f1 100755 --- a/nextflow.config +++ b/nextflow.config @@ -29,6 +29,8 @@ params { phred = 33 save_trimmed_fail = true save_merged = false + save_output_fastqs = true + save_reads_assignment = true // Additional input parameters for -entry SRA and CDC_SRA input_sra = null From 10c7bdcb90a727bfcc08619b71c1b46ddbaef3a4 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 03:43:42 +0000 Subject: [PATCH 16/40] refactor: krakenbh handle ica and terra #149 --- modules/local/kraken_bh.nf | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/modules/local/kraken_bh.nf b/modules/local/kraken_bh.nf index 508a3531..a5f9af65 100644 --- a/modules/local/kraken_bh.nf +++ b/modules/local/kraken_bh.nf @@ -13,22 +13,16 @@ process KRAKEN_BEST_HIT { path("versions.yml") , emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // terra=true sets paths for bc/wget for terra container paths - if (params.terra==false) { terra = ""} - else if (params.terra==true) { terra = "-t terra" } - else { error "Please set params.terra to either \"true\" or \"false\"" } - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." 
} // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "${params.ica_path}/kraken2_best_hit.sh" : "kraken2_best_hit.sh" + def terra = params.terra ? "-t terra" : "" """ - ${ica}kraken2_best_hit.sh -i $kraken_summary -q $count_file -n ${prefix} $terra + ${script} -i $kraken_summary -q $count_file -n ${prefix} $terra - script_version=\$(${ica}kraken2_best_hit.sh -V) + script_version=\$(${script} -V) mv ${prefix}.summary.txt ${prefix}.kraken2_${kraken_type}.top_kraken_hit.txt From 606e6e5b758e9d4286b7fcc9936dd24f3a4d2879 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 03:45:05 +0000 Subject: [PATCH 17/40] refactor: kraken subwf and modules for ica, reorg wf calls #149 --- modules/local/krakentools_kreport2krona.nf | 7 +- modules/local/krakentools_kreport2mpa.nf | 7 +- subworkflows/local/kraken2krona.nf | 175 ++++++--------------- workflows/phoenix.nf | 10 +- 4 files changed, 61 insertions(+), 138 deletions(-) diff --git a/modules/local/krakentools_kreport2krona.nf b/modules/local/krakentools_kreport2krona.nf index d4d3d2ae..c6bd89cf 100644 --- a/modules/local/krakentools_kreport2krona.nf +++ b/modules/local/krakentools_kreport2krona.nf @@ -13,17 +13,14 @@ process KRAKEN2_KRONA { path("versions.yml") , emit: versions script: // This script is bundled with the pipeline, in phoenix/bin/ orginally from https://github.com/jenniferlu717/KrakenTools on 6/15/2022 - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def krakentools_version = "1.2" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "python ${params.ica_path}/kreport2krona.py" : "kreport2krona.py" """ - ${ica}kreport2krona.py \\ + ${script} \\ --report ${kraken_report} \\ --output ${prefix}_${type}.krona diff --git a/modules/local/krakentools_kreport2mpa.nf b/modules/local/krakentools_kreport2mpa.nf index 01abaaa0..c14d4cb9 100644 --- a/modules/local/krakentools_kreport2mpa.nf +++ b/modules/local/krakentools_kreport2mpa.nf @@ -12,17 +12,14 @@ process KRAKENTOOLS_KREPORT2MPA { path("versions.yml") , emit: versions script: // This script is bundled with the pipeline, in phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def krakentools_version = "1.2" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? 
"python ${params.ica_path}/kreport2mpa.py" : "kreport2mpa.py" """ - ${ica}kreport2mpa.py \\ + ${script} \\ --report-file ${kraken_report} \\ --output ${prefix}.mpa diff --git a/subworkflows/local/kraken2krona.nf b/subworkflows/local/kraken2krona.nf index c89fac81..40e1c5c1 100755 --- a/subworkflows/local/kraken2krona.nf +++ b/subworkflows/local/kraken2krona.nf @@ -2,21 +2,12 @@ // Subworkflow: run Kraken2 // -include { KRAKEN2_KRAKEN2 as KRAKEN2_TRIMD } from '../../modules/local/kraken2' -include { KRAKEN2_KRAKEN2 as KRAKEN2_ASMBLD } from '../../modules/local/kraken2' -include { KRAKEN2_KRAKEN2 as KRAKEN2_WTASMBLD } from '../../modules/local/kraken2' -include { KRAKEN2_KRONA as KREPORT2KRONA_TRIMD } from '../../modules/local/krakentools_kreport2krona' -include { KRAKEN2_KRONA as KREPORT2KRONA_ASMBLD } from '../../modules/local/krakentools_kreport2krona' -include { KRAKEN2_KRONA as KREPORT2KRONA_WTASMBLD } from '../../modules/local/krakentools_kreport2krona' -include { KRONA_KTIMPORTTEXT as KRONA_KTIMPORTTEXT_TRIMD } from '../../modules/local/ktimporttext' -include { KRONA_KTIMPORTTEXT as KRONA_KTIMPORTTEXT_ASMBLD } from '../../modules/local/ktimporttext' -include { KRONA_KTIMPORTTEXT as KRONA_KTIMPORTTEXT_WTASMBLD } from '../../modules/local/ktimporttext' -include { KRAKENTOOLS_KREPORT2MPA as KREPORT2MPA_TRIMD } from '../../modules/local/krakentools_kreport2mpa' -include { KRAKENTOOLS_KREPORT2MPA as KREPORT2MPA_ASMBLD } from '../../modules/local/krakentools_kreport2mpa' -include { KRAKENTOOLS_MAKEKREPORT } from '../../modules/local/krakentools_makekreport' -include { KRAKEN_BEST_HIT as KRAKEN2_BH_TRIMD } from '../../modules/local/kraken_bh' -include { KRAKEN_BEST_HIT as KRAKEN2_BH_ASMBLD } from '../../modules/local/kraken_bh' -include { KRAKEN_BEST_HIT as KRAKEN2_BH_WTASMBLD } from '../../modules/local/kraken_bh' +include { KRAKEN2_KRAKEN2 } from '../../modules/local/kraken2' +include { KRAKEN2_KRONA } from '../../modules/local/krakentools_kreport2krona' +include { KRONA_KTIMPORTTEXT } from '../../modules/local/ktimporttext' +include { KRAKENTOOLS_KREPORT2MPA } from '../../modules/local/krakentools_kreport2mpa' +include { KRAKENTOOLS_MAKEKREPORT } from '../../modules/local/krakentools_makekreport' +include { KRAKEN_BEST_HIT } from '../../modules/local/kraken_bh' workflow KRAKEN2_WF { take: @@ -46,95 +37,27 @@ workflow KRAKEN2_WF { } } - if(type =="trimd") { + // Checking for Contamination in trimmed reads + KRAKEN2_KRAKEN2 ( + fasta_ch, type, params.save_output_fastqs, params.save_reads_assignment + ) + ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions) - // Checking for Contamination in trimmed reads - KRAKEN2_TRIMD ( - fasta_ch, "trimd", true, true - ) - ch_versions = ch_versions.mix(KRAKEN2_TRIMD.out.versions) - - // Create mpa file - KREPORT2MPA_TRIMD ( - KRAKEN2_TRIMD.out.report - ) - ch_versions = ch_versions.mix(KREPORT2MPA_TRIMD.out.versions) - - // Converting kraken report to krona file to have hierarchical output in krona plot - KREPORT2KRONA_TRIMD ( - KRAKEN2_TRIMD.out.report, "trimd" - ) - ch_versions = ch_versions.mix(KREPORT2KRONA_TRIMD.out.versions) - - // Create krona plot from kraken report - KRONA_KTIMPORTTEXT_TRIMD ( - KREPORT2KRONA_TRIMD.out.krona, "trimd" - ) - ch_versions = ch_versions.mix(KRONA_KTIMPORTTEXT_TRIMD.out.versions) - - // Combining kraken report with quast report based on meta.id - kraken_bh_trimd_ch = KRAKEN2_TRIMD.out.report.map{meta, report -> [[id:meta.id], report]}\ - .join(qc_stats.map{ meta, fastp_total_qc -> [[id:meta.id], 
fastp_total_qc]}, by: [0]) - - // Getting Kraken best hit for assembled data - KRAKEN2_BH_TRIMD ( - kraken_bh_trimd_ch, "trimd" - ) - ch_versions = ch_versions.mix(KRAKEN2_BH_TRIMD.out.versions) - - report = KRAKEN2_TRIMD.out.report - k2_bh_summary = KRAKEN2_BH_TRIMD.out.ksummary - krona_html = KRONA_KTIMPORTTEXT_TRIMD.out.html - - } else if(type =="asmbld") { - - // Checking for Contamination in scaffolds - KRAKEN2_ASMBLD ( - fasta_ch, "asmbld", true, true - ) - ch_versions = ch_versions.mix(KRAKEN2_ASMBLD.out.versions) - - // Create mpa file - KREPORT2MPA_ASMBLD ( - KRAKEN2_ASMBLD.out.report - ) - ch_versions = ch_versions.mix(KREPORT2MPA_ASMBLD.out.versions) + // Create mpa file + KRAKENTOOLS_KREPORT2MPA ( + KRAKEN2_KRAKEN2.out.report + ) + ch_versions = ch_versions.mix(KRAKENTOOLS_KREPORT2MPA.out.versions) + if (type == "trimd" || type == "asmbld"){ // Converting kraken report to krona file to have hierarchical output in krona plot - KREPORT2KRONA_ASMBLD ( - KRAKEN2_ASMBLD.out.report, "asmbld" - ) - ch_versions = ch_versions.mix(KREPORT2KRONA_ASMBLD.out.versions) - - // Create krona plot from kraken report - KRONA_KTIMPORTTEXT_ASMBLD ( - KREPORT2KRONA_ASMBLD.out.krona, "asmbld" + KRAKEN2_KRONA ( + KRAKEN2_KRAKEN2.out.report, type ) - ch_versions = ch_versions.mix(KRONA_KTIMPORTTEXT_ASMBLD.out.versions) - - kraken_bh_asmbld_ch = KRAKEN2_ASMBLD.out.report.map{meta, report -> [[id:meta.id], report]}\ - .join(quast.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0]) - - // Getting Kraken best hit for assembled data - KRAKEN2_BH_ASMBLD ( - kraken_bh_asmbld_ch, "asmbld" - ) - ch_versions = ch_versions.mix(KRAKEN2_BH_ASMBLD.out.versions) - - report = KRAKEN2_ASMBLD.out.report - k2_bh_summary = KRAKEN2_BH_ASMBLD.out.ksummary - krona_html = KRONA_KTIMPORTTEXT_ASMBLD.out.html - - } else if(type=="wtasmbld") { - - // Getting species ID as back up for FastANI and checking contamination isn't in assembly - KRAKEN2_WTASMBLD ( - fasta_ch, "wtasmbld", true, true - ) - ch_versions = ch_versions.mix(KRAKEN2_WTASMBLD.out.versions) - + report = KRAKEN2_KRAKEN2.out.report + } else if (type == "wtasmbld"){ // Add in krakendb into the kraken reads channel so each fasta has a krakendb to go with it. 
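    // (illustrative, not part of the original patch) combine() with no `by` emits the
    // cartesian product, so a one-element krakendb channel is appended to every
    // classified-reads tuple, e.g.:
    //   Channel.of(['s1', 'reads1.txt'], ['s2', 'reads2.txt'])
    //          .combine(Channel.of('/path/to/k2db'))
    //   // -> ['s1', 'reads1.txt', '/path/to/k2db'], ['s2', 'reads2.txt', '/path/to/k2db']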
- make_report_ch = KRAKEN2_WTASMBLD.out.classified_reads_assignment.combine(kraken2_db_path) + make_report_ch = KRAKEN2_KRAKEN2.out.classified_reads_assignment.combine(kraken2_db_path) // Create weighted kraken report based on scaffold length KRAKENTOOLS_MAKEKREPORT ( @@ -143,38 +66,44 @@ workflow KRAKEN2_WF { ch_versions = ch_versions.mix(KRAKENTOOLS_MAKEKREPORT.out.versions) // Converting kraken report to krona file to have hierarchical output in krona plot - KREPORT2KRONA_WTASMBLD ( + KRAKEN2_KRONA ( KRAKENTOOLS_MAKEKREPORT.out.kraken_weighted_report, "wtasmbld" ) - ch_versions = ch_versions.mix(KREPORT2KRONA_WTASMBLD.out.versions) + + report = KRAKENTOOLS_MAKEKREPORT.out.kraken_weighted_report + } + ch_versions = ch_versions.mix(KRAKEN2_KRONA.out.versions) + + + // Create krona plot from kraken report + KRONA_KTIMPORTTEXT ( + KRAKEN2_KRONA.out.krona, type + ) + ch_versions = ch_versions.mix(KRONA_KTIMPORTTEXT.out.versions) + if (type == "trimd"){ // Combining kraken report with quast report based on meta.id - kraken_bh_wtasmbld_ch = KRAKENTOOLS_MAKEKREPORT.out.kraken_weighted_report.map{meta, kraken_weighted_report -> [[id:meta.id], kraken_weighted_report]}\ + kraken_bh_ch = KRAKEN2_KRAKEN2.out.report.map{meta, report -> [[id:meta.id], report]}\ + .join(qc_stats.map{ meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}, by: [0]) + } else if (type == "asmbld"){ + // Combining kraken report with quast report based on meta.id + kraken_bh_ch = KRAKEN2_KRAKEN2.out.report.map{meta, report -> [[id:meta.id], report]}\ + .join(quast.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0]) + } else if (type == "wtasmbld"){ + // Combining kraken report with quast report based on meta.id + kraken_bh_ch = KRAKENTOOLS_MAKEKREPORT.out.kraken_weighted_report.map{meta, kraken_weighted_report -> [[id:meta.id], kraken_weighted_report]}\ .join(quast.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0]) - - // Getting Kraken best hit for assembled data - KRAKEN2_BH_WTASMBLD ( - kraken_bh_wtasmbld_ch, "wtasmbld" - ) - ch_versions = ch_versions.mix(KRAKEN2_BH_WTASMBLD.out.versions) - - KRONA_KTIMPORTTEXT_WTASMBLD ( - KREPORT2KRONA_WTASMBLD.out.krona, "wtasmbld" - ) - ch_versions = ch_versions.mix(KRONA_KTIMPORTTEXT_WTASMBLD.out.versions) - - report = KRAKENTOOLS_MAKEKREPORT.out.kraken_weighted_report - k2_bh_summary = KRAKEN2_BH_WTASMBLD.out.ksummary - krona_html = KRONA_KTIMPORTTEXT_WTASMBLD.out.html - - } else { - println("Type options are: wtasmbld, asmbld or trimd") } + + // Getting Kraken best hit for assembled data + KRAKEN_BEST_HIT ( + kraken_bh_ch, type + ) + ch_versions = ch_versions.mix(KRAKEN_BEST_HIT.out.versions) emit: report = report - k2_bh_summary = k2_bh_summary - krona_html = krona_html + k2_bh_summary = KRAKEN_BEST_HIT.out.ksummary + krona_html = KRONA_KTIMPORTTEXT.out.html versions = ch_versions // channel: [ versions.yml ] - } \ No newline at end of file diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index bddc3b3b..370fbf2b 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -186,11 +186,11 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(FASTQCTRIMD.out.versions.first()) - // // Checking for Contamination in trimmed reads, creating krona plots and best hit files - // KRAKEN2_TRIMD ( - // FASTP_TRIMD.out.reads, GET_TRIMD_STATS.out.outcome, "trimd", GET_TRIMD_STATS.out.fastp_total_qc, [], ASSET_CHECK.out.kraken_db, "reads" - // ) - // ch_versions = ch_versions.mix(KRAKEN2_TRIMD.out.versions) + // Checking for Contamination in trimmed 
reads, creating krona plots and best hit files + KRAKEN2_TRIMD ( + FASTP_TRIMD.out.reads, GET_TRIMD_STATS.out.outcome, "trimd", GET_TRIMD_STATS.out.fastp_total_qc, [], ASSET_CHECK.out.kraken_db, "reads" + ) + ch_versions = ch_versions.mix(KRAKEN2_TRIMD.out.versions) // SPADES_WF ( // FASTP_SINGLES.out.reads, \ From 5896dcd9b30d34e44a5e8f93133dfe6f7de27a86 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 03:46:17 +0000 Subject: [PATCH 18/40] refactor: add check for terra parms #151 --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index a7a14d98..a34f5b49 100755 --- a/main.nf +++ b/main.nf @@ -44,6 +44,7 @@ workflow PHOENIX { def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db] //removed , params.fasta to stop issue w/connecting to aws and igenomes not used for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } if (params.ica != true && params.ica != false) {exit 1, "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods."} + if (params.terra != true && params.terra != false) {exit 1, "Please set params.terra to either \"true\" if running on terra or \"false\" for all other methods."} //input on command line if (params.input) { ch_input = file(params.input) } else { exit 1, 'For -entry PHOENIX: Input samplesheet not specified!' } From c75a6a916f730d5837e74ee8f69530b212164e56 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 20:34:14 +0000 Subject: [PATCH 19/40] refactor: add param for extended_qc #151 --- main.nf | 3 +++ workflows/phoenix.nf | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/main.nf b/main.nf index a34f5b49..4635262c 100755 --- a/main.nf +++ b/main.nf @@ -84,6 +84,9 @@ workflow CDC_PHOENIX { if (params.input) { ch_input = file(params.input) } else { exit 1, 'For -entry CDC_PHOENIX: Input samplesheet not specified!' 
} ch_versions = Channel.empty() // Used to collect the software versions + // true is for -entry CDC_PHOENIX and CDC_SCAFFOLDS - used in SPADES + extended_qc=false + main: PHOENIX_EXQC ( ch_input, ch_versions, true ) diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 370fbf2b..5ec00a70 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -192,18 +192,18 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(KRAKEN2_TRIMD.out.versions) - // SPADES_WF ( - // FASTP_SINGLES.out.reads, \ - // FASTP_TRIMD.out.reads, \ - // GET_TRIMD_STATS.out.fastp_total_qc, \ - // GET_RAW_STATS.out.combined_raw_stats, \ - // [], \ - // KRAKEN2_TRIMD.out.report, \ - // KRAKEN2_TRIMD.out.krona_html, \ - // KRAKEN2_TRIMD.out.k2_bh_summary, \ - // false - // ) - // ch_versions = ch_versions.mix(SPADES_WF.out.versions) + SPADES_WF ( + FASTP_SINGLES.out.reads, \ + FASTP_TRIMD.out.reads, \ + GET_TRIMD_STATS.out.fastp_total_qc, \ + GET_RAW_STATS.out.combined_raw_stats, \ + [], \ + KRAKEN2_TRIMD.out.report, \ + KRAKEN2_TRIMD.out.krona_html, \ + KRAKEN2_TRIMD.out.k2_bh_summary, \ + params.extended_qc + ) + ch_versions = ch_versions.mix(SPADES_WF.out.versions) // // Rename scaffold headers // RENAME_FASTA_HEADERS ( From e2c3145b4dcdf6ee0deae1971b45bf4108fe7152 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 20:35:22 +0000 Subject: [PATCH 20/40] refactor: spades for ica #149 --- modules/local/spades.nf | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/modules/local/spades.nf b/modules/local/spades.nf index e919591d..0f49c6e8 100755 --- a/modules/local/spades.nf +++ b/modules/local/spades.nf @@ -24,10 +24,6 @@ process SPADES { tuple val(meta), path("*_spades_outcome.csv") , emit: spades_outcome script: - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" @@ -37,14 +33,17 @@ process SPADES { def phred_offset = params.phred def extended_qc_arg = extended_qc ? "-c" : "" def container = task.container.toString() - "staphb/spades@" + def script_stats = params.ica ? "${params.ica_path}/pipeline_stats_writer_trimd.sh" : "pipeline_stats_writer_trimd.sh" + def script_before = params.ica ? "${params.ica_path}/beforeSpades.sh" : "beforeSpades.sh" + def script_after = params.ica ? "${params.ica_path}/afterSpades.sh" : "afterSpades.sh" """ # preemptively create _summary_line.csv and .synopsis file in case spades fails (no contigs or scaffolds created) we can still collect upstream stats. 
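    # (descriptive aside, not part of the original patch) these pre-written stats
    # files are only kept when SPAdes fails; afterSpades.sh records the outcome in
    # *_spades_outcome.csv (run_failure / no_scaffolds / no_contigs), the same
    # values the SPADES_WF failure branch filters on before running the
    # GENERATE_PIPELINE_STATS_FAILURE and CREATE_SUMMARY_LINE_FAILURE modules.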
- ${ica}pipeline_stats_writer_trimd.sh -a ${fastp_raw_qc} -b ${fastp_total_qc} -c ${reads[0]} -d ${reads[1]} -e ${kraken2_trimd_report} -f ${k2_bh_summary} -g ${krona_trimd} - ${ica}beforeSpades.sh -k ${k2_bh_summary} -n ${prefix} -d ${full_outdir} ${extended_qc_arg} + ${script_stats} -a ${fastp_raw_qc} -b ${fastp_total_qc} -c ${reads[0]} -d ${reads[1]} -e ${kraken2_trimd_report} -f ${k2_bh_summary} -g ${krona_trimd} + ${script_before} -k ${k2_bh_summary} -n ${prefix} -d ${full_outdir} ${extended_qc_arg} #get version information - bspades_version=\$(${ica}beforeSpades.sh -V) - pipestats_version=\$(${ica}pipeline_stats_writer_trimd.sh -V) - aspades_version=\$(${ica}afterSpades.sh -V) + bspades_version=\$(${script_before} -V) + pipestats_version=\$(${script_stats} -V) + aspades_version=\$(${script_after} -V) cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -93,6 +92,6 @@ process SPADES { #Create a summaryline file that will be deleted later if spades is successful if not this line shows up in the final Phoenix_output_summary file #create file '*_spades_outcome.csv' to state if spades fails, if contigs or scaffolds are created. See spades_failure.nf subworkflow #This file will determine if downstream process GENERATE_PIPELINE_STATS_FAILURE and CREATE_SUMMARY_LINE_FAILURE will run (if spades creates contigs, but not scaffolds). - ${ica}afterSpades.sh + ${script_after} """ } From 05343daa1a555cec8e8878959b9fb00c7a38384b Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 20:36:28 +0000 Subject: [PATCH 21/40] refactor: spades wf modules for ica #149 --- modules/local/determine_taxa_id_failure.nf | 9 +++------ modules/local/generate_pipeline_stats_failure.nf | 14 ++++---------- .../local/generate_pipeline_stats_failure_exqc.nf | 14 ++++---------- modules/local/phoenix_summary_line_failure.nf | 9 +++------ 4 files changed, 14 insertions(+), 32 deletions(-) diff --git a/modules/local/determine_taxa_id_failure.nf b/modules/local/determine_taxa_id_failure.nf index e7aa7b82..6a11e57b 100644 --- a/modules/local/determine_taxa_id_failure.nf +++ b/modules/local/determine_taxa_id_failure.nf @@ -17,18 +17,15 @@ process DETERMINE_TAXA_ID_FAILURE { "${spades_outcome[0]}" == "run_failure" || "${spades_outcome[1]}" == "no_scaffolds" || "${spades_outcome[2]}" == "no_contigs" script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? 
"bash ${params.ica_path}/determine_taxID.sh" : "determine_taxID.sh" """ - ${ica}determine_taxID.sh -r $k2_bh_summary -s $meta.id -d $nodes_file -m $names_file + ${script} -r $k2_bh_summary -s $meta.id -d $nodes_file -m $names_file - script_version=\$(${ica}determine_taxID.sh -V) + script_version=\$(${script} -V) cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/generate_pipeline_stats_failure.nf b/modules/local/generate_pipeline_stats_failure.nf index af4181fd..a0fa9e79 100644 --- a/modules/local/generate_pipeline_stats_failure.nf +++ b/modules/local/generate_pipeline_stats_failure.nf @@ -22,20 +22,14 @@ process GENERATE_PIPELINE_STATS_FAILURE { "${spades_outcome[0]}" == "run_failure" || "${spades_outcome[1]}" == "no_scaffolds" || "${spades_outcome[2]}" == "no_contigs" script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // terra=true sets paths for bc/wget for terra container paths - if (params.terra==false) { terra = ""} - else if (params.terra==true) { terra = "-2 terra" } - else { error "Please set params.terra to either \"true\" or \"false\"" } - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "bash ${params.ica_path}/pipeline_stats_writer.sh" : "pipeline_stats_writer.sh" + def terra = params.terra ? "-2 terra" : "" """ - ${ica}pipeline_stats_writer.sh \\ + ${script} \\ -a $raw_qc \\ -b $fastp_total_qc \\ -d ${prefix} \\ @@ -46,7 +40,7 @@ process GENERATE_PIPELINE_STATS_FAILURE { -5 $coverage \\ $terra - script_version=\$(${ica}pipeline_stats_writer.sh -V) + script_version=\$(${script} -V) cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/generate_pipeline_stats_failure_exqc.nf b/modules/local/generate_pipeline_stats_failure_exqc.nf index 6006d332..9cf7fabc 100644 --- a/modules/local/generate_pipeline_stats_failure_exqc.nf +++ b/modules/local/generate_pipeline_stats_failure_exqc.nf @@ -23,21 +23,15 @@ process GENERATE_PIPELINE_STATS_FAILURE_EXQC { "${spades_outcome[0]}" == "run_failure" || "${spades_outcome[1]}" == "no_scaffolds" || "${spades_outcome[2]}" == "no_contigs" script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // terra=true sets paths for bc/wget for terra container paths - if (params.terra==false) { terra = ""} - else if (params.terra==true) { terra = "-2 terra" } - else { error "Please set params.terra to either \"true\" or \"false\"" } - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? 
"bash ${params.ica_path}/pipeline_stats_writer.sh" : "pipeline_stats_writer.sh" + def terra = params.terra ? "-2 terra" : "" """ # this runs with -entry CDC_PHEONIX when SPAdes fails (creates contigs and not scaffolds) - ${ica}pipeline_stats_writer.sh \\ + ${script} \\ -a $raw_qc \\ -b $fastp_total_qc \\ -d ${prefix} \\ @@ -49,7 +43,7 @@ process GENERATE_PIPELINE_STATS_FAILURE_EXQC { -5 $coverage \\ $terra - script_version=\$(${ica}pipeline_stats_writer.sh -V) + script_version=\$(${script} -V) cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/phoenix_summary_line_failure.nf b/modules/local/phoenix_summary_line_failure.nf index 3f63c7dd..d46eabd7 100644 --- a/modules/local/phoenix_summary_line_failure.nf +++ b/modules/local/phoenix_summary_line_failure.nf @@ -20,17 +20,14 @@ process CREATE_SUMMARY_LINE_FAILURE { "${spades_outcome[0]}" == "run_failure" || "${spades_outcome[1]}" == "no_scaffolds" || "${spades_outcome[2]}" == "no_contigs" script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def extended_qc_arg = extended_qc ? "--extended_qc" : "" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "python ${params.ica_path}/Phoenix_summary_line.py" : "Phoenix_summary_line.py" """ - ${ica}Phoenix_summary_line.py \\ + ${script} \\ -n ${prefix} \\ -k $trimd_ksummary \\ -t $fastp_total_qc \\ @@ -42,7 +39,7 @@ process CREATE_SUMMARY_LINE_FAILURE { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - Phoenix_summary_line.py: \$(${ica}Phoenix_summary_line.py --version ) + Phoenix_summary_line.py: \$(${script} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS From 66e8f4640c135f7c8d86a443013bcc5b73cc7f15 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 20:40:10 +0000 Subject: [PATCH 22/40] refactor: rename_headers for ica #149 --- modules/local/rename_fasta_headers.nf | 9 +++------ workflows/phoenix.nf | 10 +++++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/modules/local/rename_fasta_headers.nf b/modules/local/rename_fasta_headers.nf index c59cac46..9239daaf 100644 --- a/modules/local/rename_fasta_headers.nf +++ b/modules/local/rename_fasta_headers.nf @@ -12,26 +12,23 @@ process RENAME_FASTA_HEADERS { path "versions.yml" , emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." 
} // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "python ${params.ica_path}/rename_fasta_headers.py" : "rename_fasta_headers.py" """ gunzip --force ${assembled_scaffolds} unzipped=\$(basename ${assembled_scaffolds} .gz) #adding this in to allow alternative file names with -entry SCAFFOLDS --scaffolds_ext - ${ica}rename_fasta_headers.py --input \$unzipped --output ${prefix}.renamed.scaffolds.fa --name ${prefix} + ${script} --input \$unzipped --output ${prefix}.renamed.scaffolds.fa --name ${prefix} gzip --force ${prefix}.renamed.scaffolds.fa cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - rename_fasta_headers.py: \$(${ica}rename_fasta_headers.py --version ) + rename_fasta_headers.py: \$(${script} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 5ec00a70..a05229d0 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -205,11 +205,11 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(SPADES_WF.out.versions) - // // Rename scaffold headers - // RENAME_FASTA_HEADERS ( - // SPADES_WF.out.spades_ch - // ) - // ch_versions = ch_versions.mix(RENAME_FASTA_HEADERS.out.versions) + // Rename scaffold headers + RENAME_FASTA_HEADERS ( + SPADES_WF.out.spades_ch + ) + ch_versions = ch_versions.mix(RENAME_FASTA_HEADERS.out.versions) // // Removing scaffolds <500bp // BBMAP_REFORMAT ( From f72f42a92de20d708e5d7e540ef2f7075b5bf7c3 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Mon, 1 Apr 2024 21:53:34 +0000 Subject: [PATCH 23/40] refactor: ica, output file handling #149 --- modules/local/fairy_scaffold_count_check.nf | 109 +++++++++++--------- workflows/phoenix.nf | 40 +++---- 2 files changed, 80 insertions(+), 69 deletions(-) diff --git a/modules/local/fairy_scaffold_count_check.nf b/modules/local/fairy_scaffold_count_check.nf index b7f914d2..30a67c45 100644 --- a/modules/local/fairy_scaffold_count_check.nf +++ b/modules/local/fairy_scaffold_count_check.nf @@ -17,25 +17,12 @@ process SCAFFOLD_COUNT_CHECK { path(names_file) output: - tuple val(meta), path('*_summary.txt'), emit: outcome - path('*_summaryline.tsv'), optional:true, emit: summary_line - tuple val(meta), path('*.synopsis'), optional:true, emit: synopsis - path("versions.yml"), emit: versions + tuple val(meta), path('*_summary_complete.txt'), emit: outcome + path('*_summaryline.tsv'), optional:true, emit: summary_line + tuple val(meta), path('*.synopsis'), optional:true, emit: synopsis + path("versions.yml"), emit: versions script: - // terra=true sets paths for bc/wget for terra container paths - if (params.terra==false) { terra = ""} - else if (params.terra==true) { terra = "-2 terra" } - else { error "Please set params.terra to either \"true\" or \"false\"" } - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { - ica_python = "" - ica_bash = "" - } else if (params.ica==true) { - ica_python = "python ${workflow.launchDir}/bin/" - ica_bash = "bash ${workflow.launchDir}/bin/" - } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." 
} // define variables def prefix = task.ext.prefix ?: "${meta.id}" def fairy_read_count_outcome_file = fairy_read_count_outcome ? "$fairy_read_count_outcome" : "" @@ -49,58 +36,82 @@ process SCAFFOLD_COUNT_CHECK { def extended_qc_arg = extended_qc ? "--extended_qc" : "" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script_id = params.ica ? "${params.ica_path}/determine_taxID.sh" : "determine_taxID.sh" + def script_writer = params.ica ? "${params.ica_path}/pipeline_stats_writer.sh" : "pipeline_stats_writer.sh" + def script_summary = params.ica ? "python ${params.ica_path}/Phoenix_summary_line.py" : "Phoenix_summary_line.py" + def script_edit = params.ica ? "python ${params.ica_path}/edit_line_summary.py" : "edit_line_summary.py" + def terra = params.terra ? "-2 terra" : "" """ - #checking that the output contains scaffolds still: + # set new final script name + complete_summary="${prefix}_summary_complete.txt" + + # handle -entry SCAFFOLDS + scaffold_entry_file() { + cat <<> \${complete_summary} + PASSED: Using Scaffold entry no corruption check run on R1. + PASSED: Using Scaffold entry no corruption check run on R2. + PASSED: Using Scaffold entry no paired reads to check. + PASSED: Using Scaffold entry no trimd reads to check. + FAILED: No scaffolds in ${prefix} after filtering! + EOT + } + + # checking that the output contains scaffolds still: if grep "Output: 0 reads (0.00%) 0 bases (0.00%)" ${bbmap_log}; then #Check if the file exists already (it won't with -entry SCAFFOLDS) - if [ -f ${prefix}_summary_old_3.txt ]; then - #replace end of line with actual error message - sed -i 's/End_of_File/FAILED: No scaffolds in ${prefix} after filtering!/' ${fairy_read_count_outcome_file} + if [ -f ${fairy_read_count_outcome} ]; then + # replace end of line with actual error message + cp ${fairy_read_count_outcome} \${complete_summary} + sed -i 's/End_of_File/FAILED: No scaffolds in ${prefix} after filtering!/' \${complete_summary} else - echo "PASSED: Using Scaffold entry no corruption check run on R1." > ${prefix}_summary_old_3.txt - echo "PASSED: Using Scaffold entry no corruption check run on R2." >> ${prefix}_summary_old_3.txt - echo "PASSED: Using Scaffold entry no paired reads to check." >> ${prefix}_summary_old_3.txt - echo "PASSED: Using Scaffold entry no trimd reads to check." >> ${prefix}_summary_old_3.txt - echo "FAILED: No scaffolds in ${prefix} after filtering!" >> ${prefix}_summary_old_3.txt + scaffold_entry_file + echo "FAILED: No scaffolds in ${prefix} after filtering!" >> \${complete_summary} fi # if the sample has no scaffolds left make the summaryline and synopsis file for it. 
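    # (hedged note, not part of the original patch) 'cat <<>' in scaffold_entry_file()
    # above is not valid shell redirection; following the <<-END_VERSIONS heredoc
    # style used elsewhere in this module, the intended form was presumably:
    #   cat <<-EOT >> \${complete_summary}
    # Downstream this summary is read with splitCsv(strip:true, by:5) and filtered on
    # 'PASSED: More than 0 scaffolds', so it must end up exactly five lines; since
    # the heredoc already ends with the FAILED line, the extra echo in the branch
    # above appears to write a sixth. The ${script_summary} call below also appears
    # to drop the line continuation after '-x ${prefix}.tax', which would truncate
    # the command.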
# get taxa ID - ${ica_bash}determine_taxID.sh -r $kraken2_trimd_summary -s ${prefix} -d $nodes_file -m $names_file + ${script_id} -r $kraken2_trimd_summary -s ${prefix} -d $nodes_file -m $names_file - #write synopsis file - ${ica_bash}pipeline_stats_writer.sh -d ${prefix} -q ${prefix}.tax -5 $coverage $raw_qc $fastp_total_qc_pipeline_stats \\ - $kraken2_trimd_report $kraken2_trimd_summary_pipeline_stats $krona_trimd $terra + # write synopsis file + ${script_writer} \\ + -d ${prefix} \\ + -q ${prefix}.tax \\ + -5 $coverage \\ + $raw_qc \\ + $fastp_total_qc_pipeline_stats \\ + $kraken2_trimd_report \\ + $kraken2_trimd_summary_pipeline_stats \\ + $krona_trimd $terra # write summary_line file - ${ica_python}Phoenix_summary_line.py -n ${prefix} -s ${prefix}.synopsis -x ${prefix}.tax -o ${prefix}_summaryline.tsv\\ - $kraken2_trimd_summary_summaryline $fastp_total_qc_summaryline $extended_qc_arg + ${script_summary} \\ + -n ${prefix} \\ + -s ${prefix}.synopsis \\ + -x ${prefix}.tax + -o ${prefix}_summaryline.tsv\\ + $kraken2_trimd_summary_summaryline \\ + $fastp_total_qc_summaryline \\ + $extended_qc_arg # change pass to fail and add in error - ${ica_python}edit_line_summary.py -i ${prefix}_summaryline.tsv - - #change file name. - cp ${prefix}_summary_old_3.txt ${prefix}_summary.txt + ${script_edit} -i ${prefix}_summaryline.tsv # if there are scaffolds left after filtering do the following... else #Check if the file exists already (it won't with -entry SCAFFOLDS) - if [ -f ${prefix}_summary_old_3.txt ]; then + if [ -f ${fairy_read_count_outcome} ]; then #replace end of line with actual error message - sed -i 's/End_of_File/PASSED: More than 0 scaffolds in ${prefix} after filtering./' ${fairy_read_count_outcome_file} + cp ${fairy_read_count_outcome} \${complete_summary} + sed -i 's/End_of_File/PASSED: More than 0 scaffolds in ${prefix} after filtering./' \${complete_summary} else - echo "PASSED: Using Scaffold entry no corruption check run on R1." > ${prefix}_summary_old_3.txt - echo "PASSED: Using Scaffold entry no corruption check run on R2." >> ${prefix}_summary_old_3.txt - echo "PASSED: Using Scaffold entry no paired reads to check." >> ${prefix}_summary_old_3.txt - echo "PASSED: Using Scaffold entry no trimd reads to check." >> ${prefix}_summary_old_3.txt - echo "PASSED: More than 0 scaffolds in ${prefix} after filtering." >> ${prefix}_summary_old_3.txt + scaffold_entry_file + echo "PASSED: More than 0 scaffolds in ${prefix} after filtering." 
>> \${complete_summary} fi - cp ${prefix}_summary_old_3.txt ${prefix}_summary.txt fi #gettings script versions - dettaxid_version=\$(${ica_bash}determine_taxID.sh -V) - pipestats_version=\$(${ica_bash}pipeline_stats_writer.sh -V) + dettaxid_version=\$(${script_id} -V) + pipestats_version=\$(${script_writer} -V) cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -109,8 +120,8 @@ process SCAFFOLD_COUNT_CHECK { phoenix_base_container: ${container} \${dettaxid_version} \${pipestats_version} - Phoenix_summary_line.py: \$(${ica_python}Phoenix_summary_line.py --version ) - edit_line_summary.py: \$(${ica_python}edit_line_summary.py --version ) + Phoenix_summary_line.py: \$(${script_summary} --version ) + edit_line_summary.py: \$(${script_edit} --version ) END_VERSIONS """ -} +} \ No newline at end of file diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index a05229d0..f1a4e128 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -211,26 +211,26 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(RENAME_FASTA_HEADERS.out.versions) - // // Removing scaffolds <500bp - // BBMAP_REFORMAT ( - // RENAME_FASTA_HEADERS.out.renamed_scaffolds - // ) - // ch_versions = ch_versions.mix(BBMAP_REFORMAT.out.versions) - - // // Combine bbmap log with the fairy outcome file - // scaffold_check_ch = BBMAP_REFORMAT.out.log.map{meta, log -> [[id:meta.id], log]}\ - // .join(GET_TRIMD_STATS.out.outcome_to_edit.map{ meta, outcome_to_edit -> [[id:meta.id], outcome_to_edit]}, by: [0])\ - // .join(GET_RAW_STATS.out.combined_raw_stats.map{meta, combined_raw_stats -> [[id:meta.id], combined_raw_stats]}, by: [0])\ - // .join(GET_TRIMD_STATS.out.fastp_total_qc.map{ meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}, by: [0])\ - // .join(KRAKEN2_TRIMD.out.report.map{ meta, report -> [[id:meta.id], report]}, by: [0])\ - // .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}, by: [0])\ - // .join(KRAKEN2_TRIMD.out.krona_html.map{ meta, krona_html -> [[id:meta.id], krona_html]}, by: [0]) - - // // Checking that there are still scaffolds left after filtering - // SCAFFOLD_COUNT_CHECK ( - // scaffold_check_ch, false, params.coverage, params.nodes, params.names - // ) - // ch_versions = ch_versions.mix(SCAFFOLD_COUNT_CHECK.out.versions) + // Removing scaffolds <500bp + BBMAP_REFORMAT ( + RENAME_FASTA_HEADERS.out.renamed_scaffolds + ) + ch_versions = ch_versions.mix(BBMAP_REFORMAT.out.versions) + + // Combine bbmap log with the fairy outcome file + scaffold_check_ch = BBMAP_REFORMAT.out.log.map{ meta, log -> [[id:meta.id], log]}\ + .join(GET_TRIMD_STATS.out.outcome.map{ meta, outcome -> [[id:meta.id], outcome]}, by: [0])\ + .join(GET_RAW_STATS.out.combined_raw_stats.map{ meta, combined_raw_stats -> [[id:meta.id], combined_raw_stats]}, by: [0])\ + .join(GET_TRIMD_STATS.out.fastp_total_qc.map{ meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}, by: [0])\ + .join(KRAKEN2_TRIMD.out.report.map{ meta, report -> [[id:meta.id], report]}, by: [0])\ + .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}, by: [0])\ + .join(KRAKEN2_TRIMD.out.krona_html.map{ meta, krona_html -> [[id:meta.id], krona_html]}, by: [0]) + + // Checking that there are still scaffolds left after filtering + SCAFFOLD_COUNT_CHECK ( + scaffold_check_ch, params.extended_qc, params.coverage, params.nodes, params.names + ) + ch_versions = ch_versions.mix(SCAFFOLD_COUNT_CHECK.out.versions) // //combing scaffolds with scaffold check information 
to ensure processes that need scaffolds only run when there are scaffolds in the file // filtered_scaffolds_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]} From 2fe0069cc81ec6f4e566237e793b10c3e7d2b5ee Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 02:57:04 +0000 Subject: [PATCH 24/40] refactor: create extended_qc variable #151 --- nextflow.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow.config b/nextflow.config index 827c48f1..660127e1 100755 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,8 @@ params { // Additional input parameters for -entry SCAFFOLDS and CDC_SCAFFOLDS indir = null scaffolds_ext = '.scaffolds.fa.gz' + extended_qc = false + // Params for filtering minlength = 500 From 3059e03515369f1bdc2519b15645e69d7f3fb112 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 13:27:57 +0000 Subject: [PATCH 25/40] refactor: filtering #147 --- subworkflows/local/spades_failure.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/spades_failure.nf b/subworkflows/local/spades_failure.nf index 878b73bd..c5a70acc 100755 --- a/subworkflows/local/spades_failure.nf +++ b/subworkflows/local/spades_failure.nf @@ -85,7 +85,8 @@ workflow SPADES_WF { // Combining weighted kraken report with the FastANI hit based on meta.id best_hit_ch = k2_bh_summary.map{ meta, ksummary -> [[id:meta.id], ksummary]}\ .join(SPADES.out.spades_outcome.splitCsv(strip:true).map{meta, spades_outcome -> [[id:meta.id], spades_outcome]}) - + .filter { it[2][0].contains('run_failure') || it[2][1].contains('no_scaffolds') || it[2][2].contains('no_contigs')} + // Getting ID from either FastANI or if fails, from Kraken2 DETERMINE_TAXA_ID_FAILURE ( best_hit_ch, params.nodes, params.names From 7720ec4004eb67bac062db298c4cd4e4538cf329 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 13:42:50 +0000 Subject: [PATCH 26/40] refactor: move filtering to workflow level #147 --- modules/local/gamma.nf | 4 -- modules/local/mash_distance.nf | 4 -- modules/local/prokka.nf | 16 +----- modules/local/quast.nf | 4 -- workflows/phoenix.nf | 96 +++++++++++++++++----------------- 5 files changed, 51 insertions(+), 73 deletions(-) diff --git a/modules/local/gamma.nf b/modules/local/gamma.nf index 93c8a1db..1cc433d7 100755 --- a/modules/local/gamma.nf +++ b/modules/local/gamma.nf @@ -15,10 +15,6 @@ process GAMMA { tuple val(meta), path("*.fasta"), optional:true , emit: fasta path "versions.yml" , emit: versions - when: - //if there are scaffolds left after filtering - "${fairy_outcome[4]}" == "PASSED: More than 0 scaffolds in ${meta.id} after filtering." - script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/mash_distance.nf b/modules/local/mash_distance.nf index af7d7ac9..f969b95b 100755 --- a/modules/local/mash_distance.nf +++ b/modules/local/mash_distance.nf @@ -11,10 +11,6 @@ process MASH_DIST { tuple val(meta), path("*.txt"), emit: dist path("versions.yml") , emit: versions - when: - //if there are scaffolds left after filtering - "${fairy_outcome[4]}" == "PASSED: More than 0 scaffolds in ${meta.id} after filtering." 
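    // (descriptive aside, not part of the original patch) the retired guard above
    // compared fairy_outcome[4], the fifth line of the summary file; in the workflow
    // the same list is rebuilt with
    //   .splitCsv(strip:true, by:5)
    // where each summary line parses as a one-field CSV row (a single-element list),
    // which is why the workflow mapping indexes fairy_outcome[n][0] before filtering
    // on the 'PASSED: More than 0 scaffolds' line.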
- script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/modules/local/prokka.nf b/modules/local/prokka.nf index ceee5c6e..c3902891 100755 --- a/modules/local/prokka.nf +++ b/modules/local/prokka.nf @@ -24,27 +24,15 @@ process PROKKA { tuple val(meta), path("*.tsv"), emit: tsv path "versions.yml" , emit: versions - when: - //if there are scaffolds left after filtering - "${fairy_outcome[4]}" == "PASSED: More than 0 scaffolds in ${meta.id} after filtering." - script: - //set up for terra - if (params.terra==false) { - terra = "" - terra_exit = "" - } else if (params.terra==true) { - terra = "PATH=/opt/conda/envs/prokka/bin:\$PATH" - terra_exit = """PATH="\$(printf '%s\\n' "\$PATH" | sed 's|/opt/conda/envs/prokka/bin:||')" """ - } else { - error "Please set params.terra to either \"true\" or \"false\"" - } //define variables def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" def proteins_opt = proteins ? "--proteins ${proteins[0]}" : "" def prodigal_opt = prodigal_tf ? "--prodigaltf ${prodigal_tf[0]}" : "" def container = task.container.toString() - "staphb/prokka@" + def terra = params.terra ? "PATH=/opt/conda/envs/prokka/bin:\$PATH" : "" + def terra_exit = params.terra ? """PATH="\$(printf '%s\\n' "\$PATH" | sed 's|/opt/conda/envs/prokka/bin:||')" """ : "" """ #adding python path for running busco on terra $terra diff --git a/modules/local/quast.nf b/modules/local/quast.nf index 59bf2726..92e3df1b 100755 --- a/modules/local/quast.nf +++ b/modules/local/quast.nf @@ -11,10 +11,6 @@ process QUAST { tuple val(meta), path('*.tsv') , emit: report_tsv path "versions.yml" , emit: versions - when: - //if the files are not corrupt and there are equal number of reads in each file then run bbduk - "${fairy_outcome[4]}" == "PASSED: More than 0 scaffolds in ${meta.id} after filtering." 
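    // (illustrative, not part of the original patch) the terra toggle in PROKKA above
    // amounts to a scoped PATH swap inside the script block; in plain bash terms:
    //   PATH=/opt/conda/envs/prokka/bin:$PATH                                    # enter the env
    //   PATH="$(printf '%s\n' "$PATH" | sed 's|/opt/conda/envs/prokka/bin:||')"  # leave it
    // the sed strip removes the first occurrence, so it assumes the prefix was not
    // already present on the incoming PATH.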
- script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index f1a4e128..0153ab07 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -232,56 +232,64 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(SCAFFOLD_COUNT_CHECK.out.versions) - // //combing scaffolds with scaffold check information to ensure processes that need scaffolds only run when there are scaffolds in the file - // filtered_scaffolds_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]} - // .join(SCAFFOLD_COUNT_CHECK.out.outcome.splitCsv(strip:true, by:5).map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0]) + //combing scaffolds with scaffold check information to ensure processes that need scaffolds only run when there are scaffolds in the file + filtered_scaffolds_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]} + .join(SCAFFOLD_COUNT_CHECK.out.outcome.splitCsv(strip:true, by:5) + .map{meta, fairy_outcome -> [meta, [fairy_outcome[0][0], fairy_outcome[1][0], fairy_outcome[2][0], fairy_outcome[3][0], fairy_outcome[4][0]]]}, by: [0]) + .filter { it[2].findAll {it.contains('PASSED: More than 0 scaffolds')}} + + // Running gamma to identify hypervirulence genes in scaffolds + GAMMA_HV ( + filtered_scaffolds_ch, params.hvgamdb + ) + ch_versions = ch_versions.mix(GAMMA_HV.out.versions) - // // Running gamma to identify hypervirulence genes in scaffolds - // GAMMA_HV ( - // filtered_scaffolds_ch, params.hvgamdb - // ) - // ch_versions = ch_versions.mix(GAMMA_HV.out.versions) + // Running gamma to identify AR genes in scaffolds + GAMMA_AR ( + filtered_scaffolds_ch, params.ardb + ) + ch_versions = ch_versions.mix(GAMMA_AR.out.versions) - // // Running gamma to identify AR genes in scaffolds - // GAMMA_AR ( - // filtered_scaffolds_ch, params.ardb - // ) - // ch_versions = ch_versions.mix(GAMMA_AR.out.versions) + GAMMA_PF ( + filtered_scaffolds_ch, params.gamdbpf + ) + ch_versions = ch_versions.mix(GAMMA_PF.out.versions) - // GAMMA_PF ( - // filtered_scaffolds_ch, params.gamdbpf - // ) - // ch_versions = ch_versions.mix(GAMMA_PF.out.versions) + // Getting Assembly Stats + QUAST ( + filtered_scaffolds_ch + ) + ch_versions = ch_versions.mix(QUAST.out.versions) - // // Getting Assembly Stats - // QUAST ( - // filtered_scaffolds_ch - // ) - // ch_versions = ch_versions.mix(QUAST.out.versions) + // get gff and protein files for amrfinder+ + PROKKA ( + filtered_scaffolds_ch, [], [] + ) + ch_versions = ch_versions.mix(PROKKA.out.versions) - // // Creating krona plots and best hit files for weighted assembly - // KRAKEN2_WTASMBLD ( - // BBMAP_REFORMAT.out.filtered_scaffolds, SCAFFOLD_COUNT_CHECK.out.outcome, "wtasmbld", [], QUAST.out.report_tsv, ASSET_CHECK.out.kraken_db, "reads" - // ) - // ch_versions = ch_versions.mix(KRAKEN2_WTASMBLD.out.versions) + // Creating krona plots and best hit files for weighted assembly + KRAKEN2_WTASMBLD ( + BBMAP_REFORMAT.out.filtered_scaffolds, SCAFFOLD_COUNT_CHECK.out.outcome, "wtasmbld", [], QUAST.out.report_tsv, ASSET_CHECK.out.kraken_db, "reads" + ) + ch_versions = ch_versions.mix(KRAKEN2_WTASMBLD.out.versions) - // // combine filtered scaffolds and mash_sketch so mash_sketch goes with each filtered_scaffolds file - // mash_dist_ch = 
filtered_scaffolds_ch.combine(ASSET_CHECK.out.mash_sketch) + // combine filtered scaffolds and mash_sketch so mash_sketch goes with each filtered_scaffolds file + mash_dist_ch = filtered_scaffolds_ch.combine(ASSET_CHECK.out.mash_sketch) - // // Running Mash distance to get top 20 matches for fastANI to speed things up - // MASH_DIST ( - // mash_dist_ch - // ) - // ch_versions = ch_versions.mix(MASH_DIST.out.versions) + // Running Mash distance to get top 20 matches for fastANI to speed things up + MASH_DIST ( + mash_dist_ch + ) + ch_versions = ch_versions.mix(MASH_DIST.out.versions) - // // Combining mash dist with filtered scaffolds and the outcome of the scaffolds count check based on meta.id - // top_mash_hits_ch = MASH_DIST.out.dist.join(filtered_scaffolds_ch, by: [0]) + // Combining mash dist with filtered scaffolds and the outcome of the scaffolds count check based on meta.id + top_mash_hits_ch = MASH_DIST.out.dist.join(filtered_scaffolds_ch, by: [0]) - // // Generate file with list of paths of top taxa for fastANI - // DETERMINE_TOP_MASH_HITS ( - // top_mash_hits_ch - // ) - // ch_versions = ch_versions.mix(DETERMINE_TOP_MASH_HITS.out.versions) + // Generate file with list of paths of top taxa for fastANI + DETERMINE_TOP_MASH_HITS ( + top_mash_hits_ch + ) + ch_versions = ch_versions.mix(DETERMINE_TOP_MASH_HITS.out.versions) // // Combining filtered scaffolds with the top taxa list based on meta.id // top_taxa_list_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]}\ @@ -322,12 +330,6 @@ workflow PHOENIX_EXTERNAL { // ) // ch_versions = ch_versions.mix(DO_MLST.out.versions) - // // get gff and protein files for amrfinder+ - // PROKKA ( - // filtered_scaffolds_ch, [], [] - // ) - // ch_versions = ch_versions.mix(PROKKA.out.versions) - // /*// Fetch AMRFinder Database // AMRFINDERPLUS_UPDATE( ) // ch_versions = ch_versions.mix(AMRFINDERPLUS_UPDATE.out.versions)*/ From c22897ca9e4dd7041083613b9b5dbb7da1eeda98 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 13:51:50 +0000 Subject: [PATCH 27/40] refactor: scaffolds samplesshet ica #149 --- modules/local/scaffolds_samplesheet_check.nf | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/local/scaffolds_samplesheet_check.nf b/modules/local/scaffolds_samplesheet_check.nf index cd701ae0..084bf95b 100644 --- a/modules/local/scaffolds_samplesheet_check.nf +++ b/modules/local/scaffolds_samplesheet_check.nf @@ -12,22 +12,19 @@ process SCAFFOLDS_SAMPLESHEET_CHECK { path "versions.yml", emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? 
"python ${params.ica_path}/check_assembly_samplesheet.py" : "check_assembly_samplesheet.py" """ - ${ica}check_assembly_samplesheet.py \\ + ${script} \\ $samplesheet \\ samplesheet.valid.csv cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - check_assembly_samplesheet.py: \$(${ica}check_assembly_samplesheet.py --version ) + check_assembly_samplesheet.py: \$(${script} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS From ef6339bbbd22184d28e6b6c5dc9574b1e32ecf16 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 14:01:11 +0000 Subject: [PATCH 28/40] refactor: kraken2 makereport, top mash hits ica #149 --- modules/local/determine_top_mash_hits.nf | 14 ++++---------- modules/local/krakentools_makekreport.nf | 7 ++----- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/modules/local/determine_top_mash_hits.nf b/modules/local/determine_top_mash_hits.nf index 7b809a56..94f47217 100644 --- a/modules/local/determine_top_mash_hits.nf +++ b/modules/local/determine_top_mash_hits.nf @@ -17,25 +17,19 @@ process DETERMINE_TOP_MASH_HITS { "${fairy_outcome[4]}" == "PASSED: More than 0 scaffolds in ${meta.id} after filtering." script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // terra=true sets paths for bc/wget for terra container paths - if (params.terra==false) { terra = ""} - else if (params.terra==true) { terra = "-t terra" } - else { error "Please set params.terra to either \"true\" or \"false\"" } - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def sample_name = "${mash_dists}" - ".txt" //get full sample name with REFSEQ_DATE def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "${params.ica_path}/sort_and_prep_dist.sh" : "sort_and_prep_dist.sh" + def terra = params.terra ? "-t terra" : "" """ mkdir reference_dir - ${ica}sort_and_prep_dist.sh -a $assembly_scaffolds -x $mash_dists -o reference_dir $terra + ${script} -a $assembly_scaffolds -x $mash_dists -o reference_dir $terra - script_version=\$(${ica}sort_and_prep_dist.sh -V) + script_version=\$(${script} -V) if [[ ! -f ${sample_name}_best_MASH_hits.txt ]]; then echo "No MASH hit found" > ${sample_name}_best_MASH_hits.txt diff --git a/modules/local/krakentools_makekreport.nf b/modules/local/krakentools_makekreport.nf index 508d1566..b86b66bc 100644 --- a/modules/local/krakentools_makekreport.nf +++ b/modules/local/krakentools_makekreport.nf @@ -13,17 +13,14 @@ process KRAKENTOOLS_MAKEKREPORT { script: // This script is bundled with the pipeline, in phoenix/bin/ // This script has to be run with kraken output that does not use --use-names flag https://github.com/jenniferlu717/KrakenTools/issues/29 - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. 
- if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def krakentools_version = "1.2" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "python ${params.ica_path}/make_kreport.py" : "make_kreport.py" """ - ${ica}make_kreport.py \\ + ${script} \\ --input ${kraken_output} \\ --output ${prefix}.kraken2_wtasmbld.summary.txt \\ --taxonomy ${kraken2db_path}/ktaxonomy.tsv \\ From 5ab72ca9187a7d537da9c270d67bb55c49994e75 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 14:01:40 +0000 Subject: [PATCH 29/40] refactor: phoenix wf filtering, ica #147 #149 --- workflows/phoenix.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 0153ab07..6d93ec9a 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -291,10 +291,10 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(DETERMINE_TOP_MASH_HITS.out.versions) - // // Combining filtered scaffolds with the top taxa list based on meta.id - // top_taxa_list_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]}\ - // .join(DETERMINE_TOP_MASH_HITS.out.top_taxa_list.map{ meta, top_taxa_list -> [[id:meta.id], top_taxa_list ]}, by: [0])\ - // .join(DETERMINE_TOP_MASH_HITS.out.reference_dir.map{ meta, reference_dir -> [[id:meta.id], reference_dir ]}, by: [0]) + // Combining filtered scaffolds with the top taxa list based on meta.id + top_taxa_list_ch = BBMAP_REFORMAT.out.filtered_scaffolds.map{meta, filtered_scaffolds -> [[id:meta.id], filtered_scaffolds]}\ + .join(DETERMINE_TOP_MASH_HITS.out.top_taxa_list.map{ meta, top_taxa_list -> [[id:meta.id], top_taxa_list ]}, by: [0])\ + .join(DETERMINE_TOP_MASH_HITS.out.reference_dir.map{ meta, reference_dir -> [[id:meta.id], reference_dir ]}, by: [0]) // // Getting species ID // FASTANI ( From fd2463ac891332c9a2ad9ccbf522556585c8dfff Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 14:07:16 +0000 Subject: [PATCH 30/40] refactor: determine taxaID, fast ani ica #149 --- modules/local/determine_taxa_id.nf | 9 +++---- modules/local/format_ANI_best_hit.nf | 14 +++------- workflows/phoenix.nf | 38 ++++++++++++++-------------- 3 files changed, 26 insertions(+), 35 deletions(-) diff --git a/modules/local/determine_taxa_id.nf b/modules/local/determine_taxa_id.nf index 5f826a76..b5cf971f 100644 --- a/modules/local/determine_taxa_id.nf +++ b/modules/local/determine_taxa_id.nf @@ -14,20 +14,17 @@ process DETERMINE_TAXA_ID { path("versions.yml") , emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" // -r needs to be last as in -entry SCAFFOLDS/CDC_SCAFFOLDS k2_bh_summary is not passed so its a blank argument def k2_bh_file = k2_bh_summary ? 
"-r $k2_bh_summary" : "" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "${params.ica_path}/determine_taxID.sh" : "determine_taxID.sh" """ - ${ica}determine_taxID.sh -k $kraken_weighted -s $meta.id -f $formatted_ani_file -d $nodes_file -m $names_file $k2_bh_file + ${script} -k $kraken_weighted -s $meta.id -f $formatted_ani_file -d $nodes_file -m $names_file $k2_bh_file - script_version=\$(${ica}determine_taxID.sh -V) + script_version=\$(${script} -V) cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/format_ANI_best_hit.nf b/modules/local/format_ANI_best_hit.nf index bafb13e5..fd706a63 100644 --- a/modules/local/format_ANI_best_hit.nf +++ b/modules/local/format_ANI_best_hit.nf @@ -12,18 +12,12 @@ process FORMAT_ANI { path("versions.yml"), emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // terra=true sets paths for bc/wget for terra container paths - if (params.terra==false) { terra = ""} - else if (params.terra==true) { terra = "-t terra" } - else { error "Please set params.terra to either \"true\" or \"false\"" } - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "bash ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "${params.ica_path}/ANI_best_hit_formatter.sh" : "ANI_best_hit_formatter.sh" + def terra = params.terra ? 
"-t terra" : "" """ line=\$(head -n1 ${ani_file}) if [[ "\${line}" == "Mash/FastANI Error:"* ]]; then @@ -35,10 +29,10 @@ process FORMAT_ANI { db_version="REFSEQ_unknown" fi # script also checks that match is 80 or > otherwise an error is thrown - ${ica}ANI_best_hit_formatter.sh -a ${ani_file} -n ${prefix} -d \${db_version} ${terra} + ${script} -a ${ani_file} -n ${prefix} -d \${db_version} ${terra} fi - script_version=\$(${ica}ANI_best_hit_formatter.sh -V) + script_version=\$(${script} -V) cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 6d93ec9a..335ed968 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -296,28 +296,28 @@ workflow PHOENIX_EXTERNAL { .join(DETERMINE_TOP_MASH_HITS.out.top_taxa_list.map{ meta, top_taxa_list -> [[id:meta.id], top_taxa_list ]}, by: [0])\ .join(DETERMINE_TOP_MASH_HITS.out.reference_dir.map{ meta, reference_dir -> [[id:meta.id], reference_dir ]}, by: [0]) - // // Getting species ID - // FASTANI ( - // top_taxa_list_ch - // ) - // ch_versions = ch_versions.mix(FASTANI.out.versions) + // Getting species ID + FASTANI ( + top_taxa_list_ch + ) + ch_versions = ch_versions.mix(FASTANI.out.versions) - // // Reformat ANI headers - // FORMAT_ANI ( - // FASTANI.out.ani - // ) - // ch_versions = ch_versions.mix(FORMAT_ANI.out.versions) + // Reformat ANI headers + FORMAT_ANI ( + FASTANI.out.ani + ) + ch_versions = ch_versions.mix(FORMAT_ANI.out.versions) - // // Combining weighted kraken report with the FastANI hit based on meta.id - // best_hit_ch = KRAKEN2_WTASMBLD.out.k2_bh_summary.map{meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}\ - // .join(FORMAT_ANI.out.ani_best_hit.map{ meta, ani_best_hit -> [[id:meta.id], ani_best_hit ]}, by: [0])\ - // .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary ]}, by: [0]) + // Combining weighted kraken report with the FastANI hit based on meta.id + best_hit_ch = KRAKEN2_WTASMBLD.out.k2_bh_summary.map{meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}\ + .join(FORMAT_ANI.out.ani_best_hit.map{ meta, ani_best_hit -> [[id:meta.id], ani_best_hit ]}, by: [0])\ + .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary ]}, by: [0]) - // // Getting ID from either FastANI or if fails, from Kraken2 - // DETERMINE_TAXA_ID ( - // best_hit_ch, params.nodes, params.names - // ) - // ch_versions = ch_versions.mix(DETERMINE_TAXA_ID.out.versions) + // Getting ID from either FastANI or if fails, from Kraken2 + DETERMINE_TAXA_ID ( + best_hit_ch, params.nodes, params.names + ) + ch_versions = ch_versions.mix(DETERMINE_TAXA_ID.out.versions) // // Perform MLST steps on isolates (with srst2 on internal samples) // DO_MLST ( From b05647854b11b4a7d022a7199d9ecba9d6972fe8 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 14:22:52 +0000 Subject: [PATCH 31/40] refactor: mlst ica #149 --- modules/local/check_mlst_with_srst2.nf | 11 ++++------- modules/local/get_mlst_srst2.nf | 9 +++------ modules/local/mlst.nf | 5 +---- workflows/phoenix.nf | 20 ++++++++++---------- 4 files changed, 18 insertions(+), 27 deletions(-) diff --git a/modules/local/check_mlst_with_srst2.nf b/modules/local/check_mlst_with_srst2.nf index eb2476d3..753e41d2 100644 --- a/modules/local/check_mlst_with_srst2.nf +++ b/modules/local/check_mlst_with_srst2.nf @@ -17,18 +17,15 @@ process CHECK_MLST_WITH_SRST2 { task.ext.when == null || task.ext.when script: - // Adding if/else for if running on ICA it is a 
requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "python ${params.ica_path}/fix_MLST2.py" : "fix_MLST2.py" """ if [[ "${status[0]}" == "True" ]]; then - ${ica}fix_MLST2.py --input $mlst_file --srst2 $srst2_file --taxonomy $taxonomy_file --mlst_database $local_dbases + ${script} --input $mlst_file --srst2 $srst2_file --taxonomy $taxonomy_file --mlst_database $local_dbases elif [[ "${status[0]}" == "False" ]]; then - ${ica}fix_MLST2.py --input $mlst_file --taxonomy $taxonomy_file --mlst_database $local_dbases + ${script} --input $mlst_file --taxonomy $taxonomy_file --mlst_database $local_dbases else echo "Something went very wrong, please open an issue on Github for the PHoeNIx developers to address." fi @@ -36,7 +33,7 @@ process CHECK_MLST_WITH_SRST2 { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - fix_MLST2.py: \$(${ica}fix_MLST2.py --version ) + fix_MLST2.py: \$(${script} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS diff --git a/modules/local/get_mlst_srst2.nf b/modules/local/get_mlst_srst2.nf index 24e66b6d..b8316d77 100755 --- a/modules/local/get_mlst_srst2.nf +++ b/modules/local/get_mlst_srst2.nf @@ -21,13 +21,10 @@ process GET_MLST_SRST2 { (task.ext.when == null || task.ext.when) //& "${status[0]}" == "False" script: - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = task.container.toString() - "quay.io/biocontainers/python@" + def script = params.ica ? 
"python ${params.ica_path}/local_MLST_converter.py" : "local_MLST_converter.py" """ if [[ "${status[0]}" == "False" ]]; then genus="empty" @@ -47,7 +44,7 @@ process GET_MLST_SRST2 { echo "\${genus}___\${species}" # Old way, now use provided DB with different name format # convert_taxonomy_with_complexes_to_pubMLST.py --genus "\${genus}" --species "\${species}" > DB_defs.txt - ${ica}local_MLST_converter.py --genus "\${genus}" --species "\${species}" > DB_defs.txt + ${script} --genus "\${genus}" --species "\${species}" > DB_defs.txt dbline=\$(tail -n1 DB_defs.txt) echo "\$dbline" @@ -100,7 +97,7 @@ process GET_MLST_SRST2 { cat <<-END_VERSIONS > versions.yml "${task.process}": - local_MLST_converter.py: \$(${ica}local_MLST_converter.py --version ) + local_MLST_converter.py: \$(${script} --version ) python: \$(python --version | sed 's/Python //g') python_container: ${container_version} END_VERSIONS diff --git a/modules/local/mlst.nf b/modules/local/mlst.nf index ed14b0f9..4a324c62 100644 --- a/modules/local/mlst.nf +++ b/modules/local/mlst.nf @@ -16,10 +16,6 @@ process MLST { "${fairy_outcome[4]}" == "PASSED: More than 0 scaffolds in ${meta.id} after filtering." script: - // helps set correct paths to get database version being used - if (params.terra==false) { terra = false } - else if (params.terra==true) { terra = true} - else { error "Please set params.terra to either \"true\" or \"false\""} //define variables def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" @@ -27,6 +23,7 @@ process MLST { def container = task.container.toString() - "quay.io/jvhagey/mlst@" def mlst_version = "2.23.0_01242024" def mlst_version_clean = mlst_version.split("_")[0] + def terra = params.terra ? "true" : "false" """ if [[ ${fasta} = *.gz ]] then diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 335ed968..4d8d7b07 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -319,16 +319,16 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(DETERMINE_TAXA_ID.out.versions) - // // Perform MLST steps on isolates (with srst2 on internal samples) - // DO_MLST ( - // BBMAP_REFORMAT.out.filtered_scaffolds, \ - // SCAFFOLD_COUNT_CHECK.out.outcome, \ - // FASTP_TRIMD.out.reads, \ - // DETERMINE_TAXA_ID.out.taxonomy, \ - // ASSET_CHECK.out.mlst_db, \ - // false - // ) - // ch_versions = ch_versions.mix(DO_MLST.out.versions) + // Perform MLST steps on isolates (with srst2 on internal samples) + DO_MLST ( + BBMAP_REFORMAT.out.filtered_scaffolds, \ + SCAFFOLD_COUNT_CHECK.out.outcome, \ + FASTP_TRIMD.out.reads, \ + DETERMINE_TAXA_ID.out.taxonomy, \ + ASSET_CHECK.out.mlst_db, \ + params.run_srst2_mlst + ) + ch_versions = ch_versions.mix(DO_MLST.out.versions) // /*// Fetch AMRFinder Database // AMRFINDERPLUS_UPDATE( ) From 8c768434532a6f331f651112c3575b371761ffc2 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 14:44:31 +0000 Subject: [PATCH 32/40] refactor: mlst, amrfinder for terra, ica #149 --- modules/local/check_mlst.nf | 8 ++-- modules/local/get_taxa_for_amrfinder.nf | 7 +--- modules/local/run_amrfinder.nf | 17 ++------ workflows/phoenix.nf | 52 ++++++++++++------------- 4 files changed, 34 insertions(+), 50 deletions(-) diff --git a/modules/local/check_mlst.nf b/modules/local/check_mlst.nf index 8c02f03a..d84beead 100644 --- a/modules/local/check_mlst.nf +++ b/modules/local/check_mlst.nf @@ -12,9 +12,6 @@ process CHECK_MLST { tuple val(meta), path("*_status.txt"), emit: status path("versions.yml") , emit: versions - when: - task.ext.when == null || 
task.ext.when - script: // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. if (params.ica==false) { ica = "" } @@ -22,13 +19,14 @@ process CHECK_MLST { else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "python ${params.ica_path}/fix_MLST2.py" : "fix_MLST2.py" """ - ${ica}fix_MLST2.py --input $mlst_file --taxonomy $taxonomy_file --mlst_database ${local_dbases} + ${script} --input $mlst_file --taxonomy $taxonomy_file --mlst_database ${local_dbases} cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - fix_MLST2.py: \$(${ica}fix_MLST2.py --version ) + fix_MLST2.py: \$(${script} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS diff --git a/modules/local/get_taxa_for_amrfinder.nf b/modules/local/get_taxa_for_amrfinder.nf index 906b26ae..af6f46dc 100644 --- a/modules/local/get_taxa_for_amrfinder.nf +++ b/modules/local/get_taxa_for_amrfinder.nf @@ -12,16 +12,13 @@ process GET_TAXA_FOR_AMRFINDER { path("versions.yml"), emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "python ${params.ica_path}/get_taxa_for_amrfinder.py" : "get_taxa_for_amrfinder.py" """ - ${ica}get_taxa_for_amrfinder.py -t $taxa_file -o ${prefix}_AMRFinder_Organism.csv + ${script} -t $taxa_file -o ${prefix}_AMRFinder_Organism.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/run_amrfinder.nf b/modules/local/run_amrfinder.nf index e6964722..4d41e09f 100755 --- a/modules/local/run_amrfinder.nf +++ b/modules/local/run_amrfinder.nf @@ -18,24 +18,13 @@ process AMRFINDERPLUS_RUN { task.ext.when == null || task.ext.when script: - // use --organism - if ( "${organism_param[0]}" != "No Match Found") { - organism = "--organism ${organism_param[0]}" - } else { organism = "" } - //set up for terra - if (params.terra==false) { - terra = "" - terra_exit = "" - } else if (params.terra==true) { - terra = "PATH=/opt/conda/envs/amrfinderplus/bin:\$PATH" - terra_exit = """PATH="\$(printf '%s\\n' "\$PATH" | sed 's|/opt/conda/envs/amrfinderplus/bin:||')" """ - } else { - error "Please set params.terra to either \"true\" or \"false\"" - } // define variables def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def container = task.container.toString() - "staphb/ncbi-amrfinderplus@" + def terra = params.terra ? "PATH=/opt/conda/envs/amrfinderplus/bin:\$PATH" : "" + def terra_exit = params.terra ? """PATH="\$(printf '%s\\n' "\$PATH" | sed 's|/opt/conda/envs/amrfinderplus/bin:||')" """ : "" + def organism = "${organism_param[0]}" != "No Match Found" ? 
"--organism ${organism_param[0]}" : "" //get name of amrfinder database file db_name = db.toString() - '.tar.gz' """ diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 4d8d7b07..00233ed0 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -330,37 +330,37 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(DO_MLST.out.versions) - // /*// Fetch AMRFinder Database - // AMRFINDERPLUS_UPDATE( ) - // ch_versions = ch_versions.mix(AMRFINDERPLUS_UPDATE.out.versions)*/ + /*// Fetch AMRFinder Database + AMRFINDERPLUS_UPDATE( ) + ch_versions = ch_versions.mix(AMRFINDERPLUS_UPDATE.out.versions)*/ - // // Create file that has the organism name to pass to AMRFinder - // GET_TAXA_FOR_AMRFINDER ( - // DETERMINE_TAXA_ID.out.taxonomy - // ) - // ch_versions = ch_versions.mix(GET_TAXA_FOR_AMRFINDER.out.versions) + // Create file that has the organism name to pass to AMRFinder + GET_TAXA_FOR_AMRFINDER ( + DETERMINE_TAXA_ID.out.taxonomy + ) + ch_versions = ch_versions.mix(GET_TAXA_FOR_AMRFINDER.out.versions) - // // Combining taxa and scaffolds to run amrfinder and get the point mutations. - // amr_channel = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, reads -> [[id:meta.id], reads]}\ - // .join(GET_TAXA_FOR_AMRFINDER.out.amrfinder_taxa.splitCsv(strip:true).map{meta, amrfinder_taxa -> [[id:meta.id], amrfinder_taxa ]}, by: [0])\ - // .join(PROKKA.out.faa.map{ meta, faa -> [[id:meta.id], faa ]}, by: [0])\ - // .join(PROKKA.out.gff.map{ meta, gff -> [[id:meta.id], gff ]}, by: [0]) + // Combining taxa and scaffolds to run amrfinder and get the point mutations. + amr_channel = BBMAP_REFORMAT.out.filtered_scaffolds.map{ meta, reads -> [[id:meta.id], reads]}\ + .join(GET_TAXA_FOR_AMRFINDER.out.amrfinder_taxa.splitCsv(strip:true).map{meta, amrfinder_taxa -> [[id:meta.id], amrfinder_taxa ]}, by: [0])\ + .join(PROKKA.out.faa.map{ meta, faa -> [[id:meta.id], faa ]}, by: [0])\ + .join(PROKKA.out.gff.map{ meta, gff -> [[id:meta.id], gff ]}, by: [0]) - // // Run AMRFinder - // AMRFINDERPLUS_RUN ( - // amr_channel, params.amrfinder_db - // ) - // ch_versions = ch_versions.mix(AMRFINDERPLUS_RUN.out.versions) + // Run AMRFinder + AMRFINDERPLUS_RUN ( + amr_channel, params.amrfinder_db + ) + ch_versions = ch_versions.mix(AMRFINDERPLUS_RUN.out.versions) - // // Combining determined taxa with the assembly stats based on meta.id - // assembly_ratios_ch = DETERMINE_TAXA_ID.out.taxonomy.map{meta, taxonomy -> [[id:meta.id], taxonomy]}\ - // .join(QUAST.out.report_tsv.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0]) + // Combining determined taxa with the assembly stats based on meta.id + assembly_ratios_ch = DETERMINE_TAXA_ID.out.taxonomy.map{meta, taxonomy -> [[id:meta.id], taxonomy]}\ + .join(QUAST.out.report_tsv.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0]) - // // Calculating the assembly ratio and gather GC% stats - // CALCULATE_ASSEMBLY_RATIO ( - // assembly_ratios_ch, params.ncbi_assembly_stats - // ) - // ch_versions = ch_versions.mix(CALCULATE_ASSEMBLY_RATIO.out.versions) + // Calculating the assembly ratio and gather GC% stats + CALCULATE_ASSEMBLY_RATIO ( + assembly_ratios_ch, params.ncbi_assembly_stats + ) + ch_versions = ch_versions.mix(CALCULATE_ASSEMBLY_RATIO.out.versions) // GENERATE_PIPELINE_STATS_WF ( // GET_RAW_STATS.out.combined_raw_stats, \ From 2700945ba7bf073b714e5f09fce100c63948f969 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Wed, 3 Apr 2024 15:43:16 +0000 Subject: [PATCH 33/40] refactor: summary lines ica #149 --- 
modules/local/phoenix_summary.nf | 9 +-- modules/local/phoenix_summary_line.nf | 9 +-- workflows/phoenix.nf | 108 +++++++++++++++----------- 3 files changed, 68 insertions(+), 58 deletions(-) diff --git a/modules/local/phoenix_summary.nf b/modules/local/phoenix_summary.nf index a7250efc..1b268c1b 100644 --- a/modules/local/phoenix_summary.nf +++ b/modules/local/phoenix_summary.nf @@ -13,23 +13,20 @@ process GATHER_SUMMARY_LINES { path("versions.yml") , emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def busco_parameter = busco_val ? "--busco" : "" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? "python ${params.ica_path}/Create_phoenix_summary_tsv.py" : "Create_phoenix_summary_tsv.py" """ - ${ica}Create_phoenix_summary_tsv.py \\ + ${script} \\ --out Phoenix_Summary.tsv \\ $busco_parameter cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - Create_phoenix_summary_tsv.py: \$(${ica}Create_phoenix_summary_tsv.py --version ) + Create_phoenix_summary_tsv.py: \$(${script} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS diff --git a/modules/local/phoenix_summary_line.nf b/modules/local/phoenix_summary_line.nf index 8effa698..e4aefef1 100644 --- a/modules/local/phoenix_summary_line.nf +++ b/modules/local/phoenix_summary_line.nf @@ -23,10 +23,6 @@ process CREATE_SUMMARY_LINE { path("versions.yml") , emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def prefix = task.ext.prefix ?: "${meta.id}" // allowing for some optional parameters for -entry SCAFFOLDS/CDC_SCAFFOLDS nothing should be passed. @@ -35,8 +31,9 @@ process CREATE_SUMMARY_LINE { def fastani_file = fastani ? "-f $fastani" : "" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" + def script = params.ica ? 
"python ${params.ica_path}/Phoenix_summary_line.py" : "Phoenix_summary_line.oy" """ - ${ica}Phoenix_summary_line.py \\ + ${script} \\ -q $quast_report \\ $trimmed_qc_data \\ -a $ar_gamma_file \\ @@ -55,7 +52,7 @@ process CREATE_SUMMARY_LINE { cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - Phoenix_summary_line.py: \$(${ica}Phoenix_summary_line.py --version ) + Phoenix_summary_line.py: \$(${script} --version ) phoenix_base_container_tag: ${container_version} phoenix_base_container: ${container} END_VERSIONS diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 00233ed0..4646b080 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -362,52 +362,68 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(CALCULATE_ASSEMBLY_RATIO.out.versions) - // GENERATE_PIPELINE_STATS_WF ( - // GET_RAW_STATS.out.combined_raw_stats, \ - // GET_TRIMD_STATS.out.fastp_total_qc, \ - // [], \ - // KRAKEN2_TRIMD.out.report, \ - // KRAKEN2_TRIMD.out.krona_html, \ - // KRAKEN2_TRIMD.out.k2_bh_summary, \ - // RENAME_FASTA_HEADERS.out.renamed_scaffolds, \ - // BBMAP_REFORMAT.out.filtered_scaffolds, \ - // DO_MLST.out.checked_MLSTs, \ - // GAMMA_HV.out.gamma, \ - // GAMMA_AR.out.gamma, \ - // GAMMA_PF.out.gamma, \ - // QUAST.out.report_tsv, \ - // [], [], [], [], \ - // KRAKEN2_WTASMBLD.out.report, \ - // KRAKEN2_WTASMBLD.out.krona_html, \ - // KRAKEN2_WTASMBLD.out.k2_bh_summary, \ - // DETERMINE_TAXA_ID.out.taxonomy, \ - // FORMAT_ANI.out.ani_best_hit, \ - // CALCULATE_ASSEMBLY_RATIO.out.ratio, \ - // AMRFINDERPLUS_RUN.out.mutation_report, \ - // CALCULATE_ASSEMBLY_RATIO.out.gc_content, \ - // false - // ) - // ch_versions = ch_versions.mix(GENERATE_PIPELINE_STATS_WF.out.versions) - - // // Combining output based on meta.id to create summary by sample -- is this verbose, ugly and annoying? yes, if anyone has a slicker way to do this we welcome the input. 
- // line_summary_ch = GET_TRIMD_STATS.out.fastp_total_qc.map{meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}\ - // .join(DO_MLST.out.checked_MLSTs.map{ meta, checked_MLSTs -> [[id:meta.id], checked_MLSTs]}, by: [0])\ - // .join(GAMMA_HV.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ - // .join(GAMMA_AR.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ - // .join(GAMMA_PF.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ - // .join(QUAST.out.report_tsv.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0])\ - // .join(CALCULATE_ASSEMBLY_RATIO.out.ratio.map{ meta, ratio -> [[id:meta.id], ratio]}, by: [0])\ - // .join(GENERATE_PIPELINE_STATS_WF.out.pipeline_stats.map{ meta, pipeline_stats -> [[id:meta.id], pipeline_stats]}, by: [0])\ - // .join(DETERMINE_TAXA_ID.out.taxonomy.map{ meta, taxonomy -> [[id:meta.id], taxonomy]}, by: [0])\ - // .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}, by: [0])\ - // .join(AMRFINDERPLUS_RUN.out.report.map{ meta, report -> [[id:meta.id], report]}, by: [0])\ - // .join(FORMAT_ANI.out.ani_best_hit.map{ meta, ani_best_hit -> [[id:meta.id], ani_best_hit]}, by: [0]) - - // // Generate summary per sample that passed SPAdes - // CREATE_SUMMARY_LINE ( - // line_summary_ch - // ) - // ch_versions = ch_versions.mix(CREATE_SUMMARY_LINE.out.versions) + // prepare inputs to the stats wf + if (params.run_srst2_mlst){ + fullgene_results=SRST2_TRIMD_AR.out.fullgene_results + } else { + fullgene_results=[] + } + if (params.asmbld){ + asmbld_report=KRAKEN2_ASMBLD.out.report // channel: tuple (meta) path(report) + asmbld_krona_html=KRAKEN2_ASMBLD.out.krona_html // channel: tuple (meta) path(krona_html) + asmbld_k2_bh_summary=KRAKEN2_ASMBLD.out.k2_bh_summary // channel: tuple (meta) path(k2_bh_summary) + } else{ + asmbld_report=[] + asmbld_krona_html=[] + asmbld_k2_bh_summary=[] + } + + GENERATE_PIPELINE_STATS_WF ( + GET_RAW_STATS.out.combined_raw_stats, \ + GET_TRIMD_STATS.out.fastp_total_qc, \ + fullgene_results, \ + KRAKEN2_TRIMD.out.report, \ + KRAKEN2_TRIMD.out.krona_html, \ + KRAKEN2_TRIMD.out.k2_bh_summary, \ + RENAME_FASTA_HEADERS.out.renamed_scaffolds, \ + BBMAP_REFORMAT.out.filtered_scaffolds, \ + DO_MLST.out.checked_MLSTs, \ + GAMMA_HV.out.gamma, \ + GAMMA_AR.out.gamma, \ + GAMMA_PF.out.gamma, \ + QUAST.out.report_tsv, \ + params.busco, asmbld_report, asmbld_krona_html, asmbld_k2_bh_summary, \ + KRAKEN2_WTASMBLD.out.report, \ + KRAKEN2_WTASMBLD.out.krona_html, \ + KRAKEN2_WTASMBLD.out.k2_bh_summary, \ + DETERMINE_TAXA_ID.out.taxonomy, \ + FORMAT_ANI.out.ani_best_hit, \ + CALCULATE_ASSEMBLY_RATIO.out.ratio, \ + AMRFINDERPLUS_RUN.out.mutation_report, \ + CALCULATE_ASSEMBLY_RATIO.out.gc_content, \ + params.extended_qc + ) + ch_versions = ch_versions.mix(GENERATE_PIPELINE_STATS_WF.out.versions) + + // Combining output based on meta.id to create summary by sample -- is this verbose, ugly and annoying? yes, if anyone has a slicker way to do this we welcome the input. 
+ line_summary_ch = GET_TRIMD_STATS.out.fastp_total_qc.map{meta, fastp_total_qc -> [[id:meta.id], fastp_total_qc]}\ + .join(DO_MLST.out.checked_MLSTs.map{ meta, checked_MLSTs -> [[id:meta.id], checked_MLSTs]}, by: [0])\ + .join(GAMMA_HV.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ + .join(GAMMA_AR.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ + .join(GAMMA_PF.out.gamma.map{ meta, gamma -> [[id:meta.id], gamma]}, by: [0])\ + .join(QUAST.out.report_tsv.map{ meta, report_tsv -> [[id:meta.id], report_tsv]}, by: [0])\ + .join(CALCULATE_ASSEMBLY_RATIO.out.ratio.map{ meta, ratio -> [[id:meta.id], ratio]}, by: [0])\ + .join(GENERATE_PIPELINE_STATS_WF.out.pipeline_stats.map{ meta, pipeline_stats -> [[id:meta.id], pipeline_stats]}, by: [0])\ + .join(DETERMINE_TAXA_ID.out.taxonomy.map{ meta, taxonomy -> [[id:meta.id], taxonomy]}, by: [0])\ + .join(KRAKEN2_TRIMD.out.k2_bh_summary.map{ meta, k2_bh_summary -> [[id:meta.id], k2_bh_summary]}, by: [0])\ + .join(AMRFINDERPLUS_RUN.out.report.map{ meta, report -> [[id:meta.id], report]}, by: [0])\ + .join(FORMAT_ANI.out.ani_best_hit.map{ meta, ani_best_hit -> [[id:meta.id], ani_best_hit]}, by: [0]) + + // Generate summary per sample that passed SPAdes + CREATE_SUMMARY_LINE ( + line_summary_ch + ) + ch_versions = ch_versions.mix(CREATE_SUMMARY_LINE.out.versions) // // Collect all the summary files prior to fetch step to force the fetch process to wait // failed_summaries_ch = SPADES_WF.out.line_summary.collect().ifEmpty(params.placeholder) // if no spades failure pass empty file to keep it moving... From 9437d8d5d73e144b1c003cc54c96b1db49dcc78b Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 4 Apr 2024 03:13:10 +0000 Subject: [PATCH 34/40] chore: fix missed ica flag --- modules/local/check_mlst.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/modules/local/check_mlst.nf b/modules/local/check_mlst.nf index d84beead..1ee95fdc 100644 --- a/modules/local/check_mlst.nf +++ b/modules/local/check_mlst.nf @@ -14,9 +14,6 @@ process CHECK_MLST { script: // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" def script = params.ica ? 
"python ${params.ica_path}/fix_MLST2.py" : "fix_MLST2.py" From bb780f33f12b1c2a602bf705de0e2c4981742862 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Thu, 4 Apr 2024 03:14:34 +0000 Subject: [PATCH 35/40] refactor: summary lines ica #149 --- modules/local/phoenix_summary.nf | 1 - modules/local/phoenix_summary_line.nf | 2 +- workflows/phoenix.nf | 57 ++++++++++++++------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/modules/local/phoenix_summary.nf b/modules/local/phoenix_summary.nf index 1b268c1b..39f42ef4 100644 --- a/modules/local/phoenix_summary.nf +++ b/modules/local/phoenix_summary.nf @@ -5,7 +5,6 @@ process GATHER_SUMMARY_LINES { input: path(summary_line_files) - path(outdir_path) val(busco_val) output: diff --git a/modules/local/phoenix_summary_line.nf b/modules/local/phoenix_summary_line.nf index e4aefef1..c4d7fc29 100644 --- a/modules/local/phoenix_summary_line.nf +++ b/modules/local/phoenix_summary_line.nf @@ -31,7 +31,7 @@ process CREATE_SUMMARY_LINE { def fastani_file = fastani ? "-f $fastani" : "" def container_version = "base_v2.1.0" def container = task.container.toString() - "quay.io/jvhagey/phoenix@" - def script = params.ica ? "python ${params.ica_path}/Phoenix_summary_line.py" : "Phoenix_summary_line.oy" + def script = params.ica ? "python ${params.ica_path}/Phoenix_summary_line.py" : "Phoenix_summary_line.py" """ ${script} \\ -q $quast_report \\ diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 4646b080..55b073e8 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -392,7 +392,7 @@ workflow PHOENIX_EXTERNAL { GAMMA_AR.out.gamma, \ GAMMA_PF.out.gamma, \ QUAST.out.report_tsv, \ - params.busco, asmbld_report, asmbld_krona_html, asmbld_k2_bh_summary, \ + params.run_busco, asmbld_report, asmbld_krona_html, asmbld_k2_bh_summary, \ KRAKEN2_WTASMBLD.out.report, \ KRAKEN2_WTASMBLD.out.krona_html, \ KRAKEN2_WTASMBLD.out.k2_bh_summary, \ @@ -425,33 +425,36 @@ workflow PHOENIX_EXTERNAL { ) ch_versions = ch_versions.mix(CREATE_SUMMARY_LINE.out.versions) - // // Collect all the summary files prior to fetch step to force the fetch process to wait - // failed_summaries_ch = SPADES_WF.out.line_summary.collect().ifEmpty(params.placeholder) // if no spades failure pass empty file to keep it moving... - // // If you only run one sample and it fails spades there is nothing in the create line summary so pass an empty list to keep it moving... - // summaries_ch = CREATE_SUMMARY_LINE.out.line_summary.collect().ifEmpty( [] ) + // Collect all the summary files prior to fetch step to force the fetch process to wait + failed_summaries_ch = SPADES_WF.out.line_summary.collect().ifEmpty(params.placeholder) // if no spades failure pass empty file to keep it moving... + // If you only run one sample and it fails spades there is nothing in the create line summary so pass an empty list to keep it moving... 
summaries_ch = CREATE_SUMMARY_LINE.out.line_summary.collect().ifEmpty( [] )
+
+        // This will check the output directory for any files ending in "_summaryline_failure.tsv" and add them to the output channel
+        FETCH_FAILED_SUMMARIES (
+            outdir_path, failed_summaries_ch, summaries_ch
+        )
+        ch_versions = ch_versions.mix(FETCH_FAILED_SUMMARIES.out.versions)
+
+        // combine all line summaries into one channel
+        spades_failure_summaries_ch = FETCH_FAILED_SUMMARIES.out.spades_failure_summary_line
+        fairy_summary_ch = CORRUPTION_CHECK.out.summary_line.collect().ifEmpty( [] )\
+            .combine(GET_RAW_STATS.out.summary_line.collect().ifEmpty( [] ))\
+            .combine(GET_TRIMD_STATS.out.summary_line.collect().ifEmpty( [] ))\
+            .combine(SCAFFOLD_COUNT_CHECK.out.summary_line.collect().ifEmpty( [] ))\
+            .ifEmpty( [] )
+
+        // pulling it all together
+        all_summaries_ch = spades_failure_summaries_ch
+            .combine(failed_summaries_ch)
+            .combine(summaries_ch)
+            .combine(fairy_summary_ch)
+
+        // Combining sample summaries into final report
+        GATHER_SUMMARY_LINES (
+            all_summaries_ch, params.run_busco
+        )
+        ch_versions = ch_versions.mix(GATHER_SUMMARY_LINES.out.versions)

From a33862285000ca2da04ab89541bbd398e44d69a6 Mon Sep 17 00:00:00 2001
From: slsevilla
Date: Thu, 4 Apr 2024 03:35:53 +0000
Subject: [PATCH 36/40] feat: flags for execution #153

---
 nextflow.config | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 660127e1..bb7d2165 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -18,14 +18,14 @@ params {
 
     // Run flags
     run_busco = false
-
+    run_srst2_mlst = false
+    run_griphin = false
 
     // Additional input parameters for -entry SCAFFOLDS and CDC_SCAFFOLDS
     indir = null
     scaffolds_ext = '.scaffolds.fa.gz'
     extended_qc = false
-
     // Params for filtering
     minlength = 500
     phred = 33
@@ -37,9 +37,10 @@ params {
     // Additional input parameters for -entry SRA and CDC_SRA
     input_sra = null
     use_sra = false
+    asmbld = false
 
     // For NCBI spreadsheet creation
-    ncbi_excel_creation = true
+    ncbi_excel_creation = false
     microbe_example = "${baseDir}/assets/Microbe.1.0_Example_Data.xlsx"
     sra_metadata = "${baseDir}/assets/SRA_metadata_example.xlsx"
     osii_bioprojects = "${baseDir}/assets/osii-bioprojects.yaml"

From 28f902916cd0de2da9e4b6a6dcb62333bb7cc563 Mon Sep 17 00:00:00 2001
From: slsevilla
Date: Thu, 4 Apr 2024 03:36:10 +0000
Subject: 
[PATCH 37/40] refactor: griphin ica #149 --- modules/local/griphin.nf | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/local/griphin.nf b/modules/local/griphin.nf index e64e9133..1d20535c 100755 --- a/modules/local/griphin.nf +++ b/modules/local/griphin.nf @@ -18,23 +18,20 @@ process GRIPHIN { path("versions.yml"), emit: versions script: // This script is bundled with the pipeline, in cdcgov/phoenix/bin/ - // Adding if/else for if running on ICA it is a requirement to state where the script is, however, this causes CLI users to not run the pipeline from any directory. - if (params.ica==false) { ica = "" } - else if (params.ica==true) { ica = "python ${workflow.launchDir}/bin/" } - else { error "Please set params.ica to either \"true\" if running on ICA or \"false\" for all other methods." } // define variables def phoenix = entry ? "--phoenix" : "" def scaffolds = scaffolds_entry ? "--scaffolds" : "" def container = task.container.toString() - "quay.io/jvhagey/phoenix:" + def script = params.ica ? "python ${params.ica_path}/GRiPHin.py" : "GRiPHin.py" """ full_path=\$(readlink -f ${outdir}) - ${ica}GRiPHin.py -d \$full_path -a $db --output ${outdir} --coverage ${coverage} ${phoenix} ${scaffolds} + ${script} -d \$full_path -a $db --output ${outdir} --coverage ${coverage} ${phoenix} ${scaffolds} cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') - griphin.py: \$(${ica}GRiPHin.py --version) + griphin.py: \$(${script} --version) phoenix_base_container: ${container} END_VERSIONS """ From 2168e14cce593529cb38cc1c0262a5c920c0d22c Mon Sep 17 00:00:00 2001 From: slsevilla Date: Tue, 9 Apr 2024 03:28:13 +0000 Subject: [PATCH 38/40] chore: unblock expected outputs --- main.nf | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/main.nf b/main.nf index 4635262c..83f89e31 100755 --- a/main.nf +++ b/main.nf @@ -53,20 +53,19 @@ workflow PHOENIX { main: PHOENIX_EXTERNAL ( ch_input, ch_versions, params.ncbi_excel_creation ) emit: - check = PHOENIX_EXTERNAL.out.check - // scaffolds = PHOENIX_EXTERNAL.out.scaffolds - // trimmed_reads = PHOENIX_EXTERNAL.out.trimmed_reads - // mlst = PHOENIX_EXTERNAL.out.mlst - // amrfinder_output = PHOENIX_EXTERNAL.out.amrfinder_output - // gamma_ar = PHOENIX_EXTERNAL.out.gamma_ar - // phx_summary = PHOENIX_EXTERNAL.out.phx_summary - // //output for phylophoenix - // griphin_tsv = PHOENIX_EXTERNAL.out.griphin_tsv - // griphin_excel = PHOENIX_EXTERNAL.out.griphin_excel - // dir_samplesheet = PHOENIX_EXTERNAL.out.dir_samplesheet - // //output for ncbi upload - // ncbi_sra_sheet = params.create_ncbi_sheet ? PHOENIX_EXTERNAL.out.ncbi_sra_sheet : null - // ncbi_biosample_sheet = params.create_ncbi_sheet ? PHOENIX_EXTERNAL.out.ncbi_biosample_sheet : null + scaffolds = PHOENIX_EXTERNAL.out.scaffolds + trimmed_reads = PHOENIX_EXTERNAL.out.trimmed_reads + mlst = PHOENIX_EXTERNAL.out.mlst + amrfinder_output = PHOENIX_EXTERNAL.out.amrfinder_output + gamma_ar = PHOENIX_EXTERNAL.out.gamma_ar + phx_summary = PHOENIX_EXTERNAL.out.phx_summary + //output for phylophoenix + griphin_tsv = params.run_griphin ? PHOENIX_EXTERNAL.out.griphin_tsv : null + griphin_excel = params.run_griphin ? PHOENIX_EXTERNAL.out.griphin_excel : null + dir_samplesheet = params.run_griphin ? PHOENIX_EXTERNAL.out.dir_samplesheet : null + //output for ncbi upload + ncbi_sra_sheet = params.create_ncbi_sheet ? 
PHOENIX_EXTERNAL.out.ncbi_sra_sheet : null + ncbi_biosample_sheet = params.create_ncbi_sheet ? PHOENIX_EXTERNAL.out.ncbi_biosample_sheet : null } // From 4556847eb9505e5dd8c16b34726b6f2b8c868e62 Mon Sep 17 00:00:00 2001 From: slsevilla Date: Tue, 9 Apr 2024 03:29:14 +0000 Subject: [PATCH 39/40] chore: unblock griphin, outputs --- workflows/phoenix.nf | 62 +++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/workflows/phoenix.nf b/workflows/phoenix.nf index 55b073e8..a4b3feeb 100644 --- a/workflows/phoenix.nf +++ b/workflows/phoenix.nf @@ -450,21 +450,31 @@ workflow PHOENIX_EXTERNAL { .combine(summaries_ch) .combine(fairy_summary_ch) - // Combining sample summaries into final report - GATHER_SUMMARY_LINES ( - all_summaries_ch, params.run_busco - ) - ch_versions = ch_versions.mix(GATHER_SUMMARY_LINES.out.versions) - - // //create GRiPHin report - // GRIPHIN ( - // all_summaries_ch, INPUT_CHECK.out.valid_samplesheet, params.ardb, outdir_path, params.coverage, true, false + // // Combining sample summaries into final report + // GATHER_SUMMARY_LINES ( + // all_summaries_ch, params.run_busco // ) - // ch_versions = ch_versions.mix(GRIPHIN.out.versions) + // ch_versions = ch_versions.mix(GATHER_SUMMARY_LINES.out.versions) - // if (ncbi_excel_creation == true && params.create_ncbi_sheet == true) { + // //create GRiPHin report + // if(params.run_griphin) { + // GRIPHIN ( + // all_summaries_ch, INPUT_CHECK.out.valid_samplesheet, params.ardb, outdir_path, params.coverage, true, false + // ) + // ch_versions = ch_versions.mix(GRIPHIN.out.versions) + // } + + // // Create NCBI sheet + // if (params.ncbi_excel_creation == true && params.create_ncbi_sheet == true) { // // requiring files so that this process doesn't start until needed files are made. - // required_files_ch = FASTP_TRIMD.out.reads.map{ meta, reads -> reads[0]}.collect().combine(DO_MLST.out.checked_MLSTs.map{ meta, checked_MLSTs -> checked_MLSTs}.collect()).combine(DETERMINE_TAXA_ID.out.taxonomy.map{ meta, taxonomy -> taxonomy}.collect()) + // required_files_ch = FASTP_TRIMD.out.reads + // .map{ meta, reads -> reads[0]} + // .collect() + // .combine(DO_MLST.out.checked_MLSTs + // .map{ meta, checked_MLSTs -> checked_MLSTs} + // .collect()) + // .combine(DETERMINE_TAXA_ID.out.taxonomy + // .map{ meta, taxonomy -> taxonomy}.collect()) // //Fill out NCBI excel sheets for upload based on what PHX found // CREATE_NCBI_UPLOAD_SHEET ( @@ -504,21 +514,19 @@ workflow PHOENIX_EXTERNAL { // ch_versions = ch_versions.mix(MULTIQC.out.versions) emit: - check = FASTP_TRIMD.out.reads - // outcome = GET_RAW_STATS.out.outcome - // scaffolds = BBMAP_REFORMAT.out.filtered_scaffolds - // trimmed_reads = FASTP_TRIMD.out.reads - // mlst = DO_MLST.out.checked_MLSTs - // amrfinder_output = AMRFINDERPLUS_RUN.out.report - // gamma_ar = GAMMA_AR.out.gamma - // phx_summary = GATHER_SUMMARY_LINES.out.summary_report - // //output for phylophoenix - // griphin_tsv = GRIPHIN.out.griphin_report - // griphin_excel = GRIPHIN.out.griphin_tsv_report - // dir_samplesheet = GRIPHIN.out.converted_samplesheet - // //output for ncbi upload - // ncbi_sra_sheet = params.create_ncbi_sheet ? CREATE_NCBI_UPLOAD_SHEET.out.ncbi_sra : null - // ncbi_biosample_sheet = params.create_ncbi_sheet ? 
CREATE_NCBI_UPLOAD_SHEET.out.ncbi_biosample : null
}

/*

From 46f43a4537157d796b534b8b717c572b5f44c921 Mon Sep 17 00:00:00 2001
From: slsevilla
Date: Tue, 9 Apr 2024 03:34:38 +0000
Subject: [PATCH 40/40] docs: changes added to log

---
 CHANGELOG.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 55874b35..d4e04661 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -243,3 +243,23 @@
 - [ARG-ANNOT](http://backup.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/2041/arg-annot-v4-aa-may2018_doc.fasta) hasn't changed since the last time the database was created and contains updates since version [NT v6 July 2019](https://www.mediterranee-infection.com/acces-ressources/base-de-donnees/arg-annot-2/)
 - [ResFinder](https://bitbucket.org/genomicepidemiology/resfinder_db/src/master/)
   - Includes until 2024-01-28 [commit 97d1fe0cd0a119172037f6bdb29f8a1c7c6e6019](https://bitbucket.org/genomicepidemiology/resfinder_db/commits/branch/master)
+
+## [v3.1.0](https://github.com/CDCgov/phoenix/releases/tag/v3.1.0) (04/08/2024)
+**Implemented Enhancements**
+- refactors filtering of failed samples for fairy
+- refactors ICA and Terra handling
+- adds param flags in nextflow.config
+  - execution-based
+    - run_busco
+    - ncbi_excel_creation
+    - extended_qc
+    - run_srst2_mlst
+    - run_griphin
+  - feature-based
+    - save_trimmed_fail
+    - save_merged
+    - save_output_fastqs
+    - save_reads_assignment
+- moves parameter checks upstream to main.nf
+  - ICA
+  - TERRA
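
For reference, a minimal sketch of how the new v3.1.0 execution flags might be toggled from a user-supplied configuration file (the file name `my_params.config` is hypothetical; the parameter names come from the `nextflow.config` changes in this patch series, where they all default to false):

```groovy
// my_params.config -- hypothetical user config; a sketch, not part of the patches.
// Every parameter below is declared in nextflow.config by this patch series.
params {
    run_busco      = true   // enable the optional BUSCO steps
    run_srst2_mlst = true   // enable the srst2 mode passed to DO_MLST (PATCH 31)
    run_griphin    = true   // emit the GRiPHin report outputs (PATCH 38/39)
    ica            = false  // set to true only on ICA, with params.ica_path pointing at the pipeline's bin/ scripts
}
```

Such a file would be passed with Nextflow's standard `-c` option, e.g. `nextflow run main.nf -c my_params.config`.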