From 3b8104d0b58b5cfaf094ab0d2b369ef3b84e3e05 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Tue, 17 Dec 2024 17:33:14 +0100 Subject: [PATCH 1/3] Improved output logic --- CHANGELOG.md | 51 ++++++++++++++++++++++++++++++++++++++ src/runner/config.vsh.yaml | 15 +++++------ src/runner/main.nf | 23 ++++++++++++----- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bb38b9..5d3f40b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,54 @@ +# demultiplex v0.3.3 + +## Breaking change + +- The `runner` defines the output differently now: + + - The last part of the `--input` path is expected to be the run ID and this run ID is used to create the output directory. + - If the input is `file.tar.gz` instead of a directory, the `file` part is used as the run ID. + +- The output structure is then as follows: + + ``` + $publish_dir/<run_id>/<date_time>_demultiplex_<version>/ + ``` + + For instance: + + ``` + $publish_dir + └── 200624_A00834_0183_BHMTFYDRXX + └── 20241217_051404_demultiplex_v1.2 + ├── fastq + │   ├── Sample1_S1_L001_R1_001.fastq.gz + │   ├── Sample23_S3_L001_R1_001.fastq.gz + │   ├── SampleA_S2_L001_R1_001.fastq.gz + │   ├── Undetermined_S0_L001_R1_001.fastq.gz + │   └── sampletest_S4_L001_R1_001.fastq.gz + └── qc + ├── fastqc + │   ├── Sample1_S1_L001_R1_001.fastq.gz_fastqc_data.txt + │   ├── Sample1_S1_L001_R1_001.fastq.gz_fastqc_report.html + │   ├── Sample1_S1_L001_R1_001.fastq.gz_summary.txt + │   ├── Sample23_S3_L001_R1_001.fastq.gz_fastqc_data.txt + │   ├── Sample23_S3_L001_R1_001.fastq.gz_fastqc_report.html + │   ├── Sample23_S3_L001_R1_001.fastq.gz_summary.txt + │   ├── SampleA_S2_L001_R1_001.fastq.gz_fastqc_data.txt + │   ├── SampleA_S2_L001_R1_001.fastq.gz_fastqc_report.html + │   ├── SampleA_S2_L001_R1_001.fastq.gz_summary.txt + │   ├── Undetermined_S0_L001_R1_001.fastq.gz_fastqc_data.txt + │   ├── Undetermined_S0_L001_R1_001.fastq.gz_fastqc_report.html + │   ├── Undetermined_S0_L001_R1_001.fastq.gz_summary.txt + │   ├── 
sampletest_S4_L001_R1_001.fastq.gz_fastqc_data.txt + │   ├── sampletest_S4_L001_R1_001.fastq.gz_fastqc_report.html + │   └── sampletest_S4_L001_R1_001.fastq.gz_summary.txt + └── multiqc_report.html + + ``` + +- This logic can be avoided by providing the flag `--plain_output`. + + # demultiplex v0.3.2 # Bug fixes diff --git a/src/runner/config.vsh.yaml b/src/runner/config.vsh.yaml index 50e4a1c..1767720 100644 --- a/src/runner/config.vsh.yaml +++ b/src/runner/config.vsh.yaml @@ -4,7 +4,9 @@ argument_groups: - name: Input arguments arguments: - name: --input - description: Base directory of the form `s3://Sequencing///` + description: | + Base directory of the canonical form `s3://///`. + Please note that the workflow supports xxx type: file required: true - name: --run_information @@ -27,15 +29,10 @@ argument_groups: required. - name: Annotation flags arguments: - - name: --add_date_time - description: | - Add date and time to the output directory name. This is useful - when running the same pipeline multiple times on the same input - directory. - type: boolean_true - - name: --add_workflow_id + - name: --plain_output description: | - Add a workflow identifier to the output directory name. + Flag to indicate that the output should be stored directly under $publish_dir rather than + under a subdirectory structure runID/<date_time>_demultiplex_<version>/. type: boolean_true - name: Output arguments arguments: diff --git a/src/runner/main.nf b/src/runner/main.nf index 5a92905..4fd6aef 100644 --- a/src/runner/main.nf +++ b/src/runner/main.nf @@ -9,6 +9,16 @@ workflow run_wf { main: output_ch = input_ch + // Extract the ID from the input. + // If this is a directory or tar-gz file containing `.`'s, + // only select the part before the first `.`. 
+ | map{ id, state -> + def id_with_dots = state.input.getFileName().toString() + [ + id, + state + [ run_id: id_with_dots - ~/(\.\w+)*$/ ] + ] + } | demultiplex.run( fromState: [ "input": "input", @@ -24,14 +34,15 @@ workflow run_wf { ) | publish.run( fromState: { id, state -> - def id1 = (params.add_date_time) ? "${id}_${date}" : id - def id2 = (params.add_workflow_id) ? "${id1}_demultiplex_${version}" : id1 + println(state.plain_output) + def id1 = (state.plain_output) ? id : "${state.run_id}/${date}" + def id2 = (state.plain_output) ? id : "${id1}_demultiplex_${version}" - def fastq_output_1 = (id == "run") ? state.fastq_output : "${id2}/" + state.fastq_output - def falco_output_1 = (id == "run") ? state.falco_output : "${id2}/" + state.falco_output - def multiqc_output_1 = (id == "run") ? state.multiqc_output : "${id2}/" + state.multiqc_output + def fastq_output_1 = (id2 == "run") ? state.fastq_output : "${id2}/" + state.fastq_output + def falco_output_1 = (id2 == "run") ? state.falco_output : "${id2}/" + state.falco_output + def multiqc_output_1 = (id2 == "run") ? state.multiqc_output : "${id2}/" + state.multiqc_output - if (id == "run") { + if (id2 == "run") { println("Publising to ${params.publish_dir}") } else { println("Publising to ${params.publish_dir}/${id2}") From c63361741b0517f3fc4023629e7696a7f08ccb2c Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Wed, 18 Dec 2024 14:40:19 +0100 Subject: [PATCH 2/3] Strip suffix only + update description --- src/runner/config.vsh.yaml | 3 ++- src/runner/main.nf | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/runner/config.vsh.yaml b/src/runner/config.vsh.yaml index 1767720..020be85 100644 --- a/src/runner/config.vsh.yaml +++ b/src/runner/config.vsh.yaml @@ -6,7 +6,8 @@ argument_groups: - name: --input description: | Base directory of the canonical form `s3://///`. 
- Please note that the workflow supports xxx + A tarball (.tar.gz, .tgz, .tar) containing run information can be provided in which + case the RunID is set to the name of the tarball without the extension. type: file required: true - name: --run_information diff --git a/src/runner/main.nf b/src/runner/main.nf index 4fd6aef..0412e39 100644 --- a/src/runner/main.nf +++ b/src/runner/main.nf @@ -10,13 +10,12 @@ workflow run_wf { main: output_ch = input_ch // Extract the ID from the input. - // If this is a directory or tar-gz file containing `.`'s, - // only select the part before the first `.`. + // If the input is a tarball, strip the suffix. | map{ id, state -> - def id_with_dots = state.input.getFileName().toString() + def id_with_suffix = state.input.getFileName().toString() [ id, - state + [ run_id: id_with_dots - ~/(\.\w+)*$/ ] + state + [ run_id: id_with_suffix.replaceAll(".tgz", "").replaceAll(".tar.gz", "").replaceAll(".tar", "") ] ] } | demultiplex.run( From 00ee87ce80e8578a9f4a1f439f48c9ccbd1eb7e3 Mon Sep 17 00:00:00 2001 From: Toni Verbeiren Date: Wed, 18 Dec 2024 15:32:24 +0100 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --- src/runner/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runner/main.nf b/src/runner/main.nf index 0412e39..ef78f71 100644 --- a/src/runner/main.nf +++ b/src/runner/main.nf @@ -15,7 +15,7 @@ workflow run_wf { def id_with_suffix = state.input.getFileName().toString() [ id, - state + [ run_id: id_with_suffix.replaceAll(".tgz", "").replaceAll(".tar.gz", "").replaceAll(".tar", "") ] + state + [ run_id: id_with_suffix - ~/\.(tar\.gz|tgz|tar)$/ ] ] } | demultiplex.run(