Improved output logic (#30)

* Improved output logic
* Strip suffix only + update description
* Apply suggestions from code review

Co-authored-by: Dries Schaumont <[email protected]>

---------

Co-authored-by: Dries Schaumont <[email protected]>
viash-hub · Dec 18, 2024 · 8d3c288 · 8d3c288
1 parent 45accaa
commit 8d3c288
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,54 @@
+# demultiplex v0.3.3
+
+## Breaking change
+
+- The `runner` defines the output differently now:
+
+  - The last part of the `--input` path is expected to be the run ID and this run ID is used to create the output directory.
+  - If the input is `file.tar.gz` instead of a directory, the `file` part is used as the run ID.
+
+- The output structure is then as follows:
+
+    ```
+    $publish_dir/<run_id>/<date_time_stamp>_demultiplex_<version>/
+    ```
+
+    For instance:
+
+    ```
+    $publish_dir
+    └── 200624_A00834_0183_BHMTFYDRXX
+        └── 20241217_051404_demultiplex_v1.2
+            ├── fastq
+            │   ├── Sample1_S1_L001_R1_001.fastq.gz
+            │   ├── Sample23_S3_L001_R1_001.fastq.gz
+            │   ├── SampleA_S2_L001_R1_001.fastq.gz
+            │   ├── Undetermined_S0_L001_R1_001.fastq.gz
+            │   └── sampletest_S4_L001_R1_001.fastq.gz
+            └── qc
+                ├── fastqc
+                │   ├── Sample1_S1_L001_R1_001.fastq.gz_fastqc_data.txt
+                │   ├── Sample1_S1_L001_R1_001.fastq.gz_fastqc_report.html
+                │   ├── Sample1_S1_L001_R1_001.fastq.gz_summary.txt
+                │   ├── Sample23_S3_L001_R1_001.fastq.gz_fastqc_data.txt
+                │   ├── Sample23_S3_L001_R1_001.fastq.gz_fastqc_report.html
+                │   ├── Sample23_S3_L001_R1_001.fastq.gz_summary.txt
+                │   ├── SampleA_S2_L001_R1_001.fastq.gz_fastqc_data.txt
+                │   ├── SampleA_S2_L001_R1_001.fastq.gz_fastqc_report.html
+                │   ├── SampleA_S2_L001_R1_001.fastq.gz_summary.txt
+                │   ├── Undetermined_S0_L001_R1_001.fastq.gz_fastqc_data.txt
+                │   ├── Undetermined_S0_L001_R1_001.fastq.gz_fastqc_report.html
+                │   ├── Undetermined_S0_L001_R1_001.fastq.gz_summary.txt
+                │   ├── sampletest_S4_L001_R1_001.fastq.gz_fastqc_data.txt
+                │   ├── sampletest_S4_L001_R1_001.fastq.gz_fastqc_report.html
+                │   └── sampletest_S4_L001_R1_001.fastq.gz_summary.txt
+                └── multiqc_report.html
+
+    ```
+
+- To skip this logic and store the output directly under `$publish_dir`, provide the flag `--plain_output`.
+
+
 # demultiplex v0.3.2
 
 # Bug fixes

diff --git a/src/runner/config.vsh.yaml b/src/runner/config.vsh.yaml
@@ -4,7 +4,10 @@ argument_groups:
   - name: Input arguments
     arguments:
       - name: --input
-        description: Base directory of the form `s3:/<bucket>/Sequencing/<Sequencer>/<RunID>/`
+        description: |
+          Base directory of the canonical form `s3://<bucket>/<path>/<RunID>/`.
+          A tarball (.tar.gz, .tgz, .tar) containing run information can be provided instead, in which
+          case the RunID is set to the name of the tarball without the extension.
         type: file
         required: true
       - name: --run_information
@@ -27,15 +30,10 @@ argument_groups:
           required.
   - name: Annotation flags
     arguments:
-      - name: --add_date_time
-        description: |
-          Add date and time to the output directory name. This is useful
-          when running the same pipeline multiple times on the same input
-          directory.
-        type: boolean_true
-      - name: --add_workflow_id
+      - name: --plain_output
         description: |
-          Add a workflow identifier to the output directory name.
+          Flag to indicate that the output should be stored directly under $publish_dir rather than
+          under the subdirectory structure <run_id>/<date_time_stamp>_demultiplex_<version>/.
         type: boolean_true
   - name: Output arguments
     arguments:

diff --git a/src/runner/main.nf b/src/runner/main.nf
@@ -9,6 +9,15 @@ workflow run_wf {
 
   main:
     output_ch = input_ch
+      // Extract the ID from the input.
+      // If the input is a tarball, strip the suffix.
+      | map{ id, state ->
+        def id_with_suffix = state.input.getFileName().toString()
+        [
+          id,
+          state + [ run_id: id_with_suffix - ~/\.(tar.gz|tgz|tar)$/ ]
+        ]
+      }
       | demultiplex.run(
         fromState: [
           "input": "input",
@@ -24,14 +33,15 @@ workflow run_wf {
       )
       | publish.run(
         fromState: { id, state ->
-          def id1 = (params.add_date_time) ? "${id}_${date}" : id
-          def id2 = (params.add_workflow_id) ? "${id1}_demultiplex_${version}" : id1
+          println(state.plain_output)
+          def id1 = (state.plain_output) ? id : "${state.run_id}/${date}"
+          def id2 = (state.plain_output) ? id : "${id1}_demultiplex_${version}"
 
-          def fastq_output_1 = (id == "run") ? state.fastq_output : "${id2}/" + state.fastq_output
-          def falco_output_1 = (id == "run") ? state.falco_output : "${id2}/" + state.falco_output
-          def multiqc_output_1 = (id == "run") ? state.multiqc_output : "${id2}/" + state.multiqc_output
+          def fastq_output_1 = (id2 == "run") ? state.fastq_output : "${id2}/" + state.fastq_output
+          def falco_output_1 = (id2 == "run") ? state.falco_output : "${id2}/" + state.falco_output
+          def multiqc_output_1 = (id2 == "run") ? state.multiqc_output : "${id2}/" + state.multiqc_output
 
-          if (id == "run") {
+          if (id2 == "run") {
             println("Publising to ${params.publish_dir}")
           } else {
             println("Publising to ${params.publish_dir}/${id2}")