diff --git a/CHANGELOG.md b/CHANGELOG.md index c5a5880..bf4ad97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,34 @@ +# demultiplex x.y.z + +## Major updates +The way the workflow publishes its output has been refactored to be more flexible. This is done by creating a wrapper workflow `runner` that wraps the native `demultiplex` workflow. The `runner` workflow is responsible for setting the output directory based on the input arguments. + +Three arguments specify the relative location of the three _outputs_ of the workflow: + +- `fastq_output`: The directory where the demultiplexed fastq files are stored. +- `falco_output`: The directory for the `fastqc`/`falco` reports. +- `multiqc_output`: The filename for the `multiqc` report. + +The target location path is determined by the following logic: + +- If no `id` is provided, the output directory is set to `$publish_dir`. +- If an `id` is explicitly set using Seqera Cloud or by adding `--id <id>`, the output directory is set to `$publish_dir/<id>`. + +The workflow has two optional flags to be used in combination with `--id`: + +- `--add_date_time`: rather than publishing the results under `$publish_dir/<id>`, this appends a date-time stamp to the directory name: `$publish_dir/<id>_<date-time-stamp>`. This is useful when you want to keep track of multiple runs of the workflow (example: `20240322_093020`). +- `--add_workflow_id`: adding this flag will add `_demultiplex_<version>` to the output directory name (example: `demultiplex_v0.2.0`). When starting the workflow from a non-release, the version will be set to `unknown_version`. + +The default structure in the output directory is: + +- Two sub-directories: + - `fastq` + - `qc` for the reports: + - `multiqc_report.html` + - `fastqc/` directory containing the different fastqc (falco) reports. + +The `$publish_dir` variable corresponds to the argument provided with `--publish_dir`. The `date-time-stamp` is generated by the workflow based on when it was launched and is thus guaranteed to be unique. 
+ # demultiplex v0.2.0 ## Breaking changes diff --git a/_viash.yaml b/_viash.yaml index 5229d3a..a0e3528 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -15,3 +15,4 @@ viash_version: 0.9.0 config_mods: | .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].directives.tag := '$id' diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..6536fa9 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,12 @@ +manifest { + homePage = 'https://github.com/viash-hub/demultiplex' + description = 'Demultiplexing pipeline for sequencing data' + mainScript = 'target/nextflow/demultiplex/main.nf' +} + +process { + withName: publishStatesProc { + publishDir = [ enabled: false ] + } +} + diff --git a/src/demultiplex/config.vsh.yaml b/src/demultiplex/config.vsh.yaml index d275653..73ca153 100644 --- a/src/demultiplex/config.vsh.yaml +++ b/src/demultiplex/config.vsh.yaml @@ -3,6 +3,9 @@ description: Demultiplexing of raw sequencing data argument_groups: - name: Input arguments arguments: + - name: --id + description: Unique identifier for the run + type: string - name: --input description: Directory containing raw sequencing data type: file @@ -31,19 +34,20 @@ argument_groups: description: Directory to write fastq data to type: file direction: output - required: true + required: false + default: "$id/fastq" - name: "--output_falco" description: Directory to write falco output to type: file direction: output required: false - default: "$id/falco" + default: "$id/qc/fastqc" - name: "--output_multiqc" description: Directory to write falco output to type: file direction: output required: false - default: "$id/multiqc_report.html" + default: "$id/qc/multiqc_report.html" resources: - type: nextflow_script path: main.nf diff --git a/src/demultiplex/integration_tests.sh b/src/demultiplex/integration_tests.sh index d2e486f..53f7283 100755 --- a/src/demultiplex/integration_tests.sh +++ b/src/demultiplex/integration_tests.sh @@ -6,7 +6,7 @@ REPO_ROOT=$(git rev-parse 
--show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -viash ns build --setup cb +viash ns build --setup cb -q demultiplex nextflow run . \ -main-script src/demultiplex/test.nf \ diff --git a/src/demultiplex/main.nf b/src/demultiplex/main.nf index c4cb8ab..4f2f7fb 100644 --- a/src/demultiplex/main.nf +++ b/src/demultiplex/main.nf @@ -4,6 +4,7 @@ workflow run_wf { main: samples_ch = input_ch + // untar input if needed | untar.run( directives: [label: ["lowmem", "lowcpu"]], @@ -109,11 +110,15 @@ workflow run_wf { | bcl_convert.run( runIf: {id, state -> state.demultiplexer in ["bclconvert"]}, directives: [label: ["highmem", "midcpu"]], - fromState: [ - "bcl_input_directory": "input", - "sample_sheet": "run_information", - "output_directory": "output", - ], + fromState: { id, state -> + [ + bcl_input_directory: state.input, + sample_sheet: state.run_information, + output_directory: state.output, + reports: "reports", + logs: "logs" + ] + }, toState: {id, result, state -> def toAdd = [ "output_demultiplexer" : result.output_directory, @@ -159,6 +164,8 @@ workflow run_wf { ) output_ch = samples_ch + + | combine_samples.run( fromState: { id, state -> [ @@ -186,10 +193,10 @@ workflow run_wf { }, toState: { id, result, state -> state + [ "output_falco" : result.outdir ] - }, + } ) | multiqc.run( - directives: [label: ["lowcpu", "lowmem"]], + directives: [label: ["midcpu", "midmem"]], fromState: {id, state -> def new_state = [ "input": [state.output_falco], @@ -206,10 +213,11 @@ workflow run_wf { }, toState: { id, result, state -> state + [ "output_multiqc" : result.output_report ] - }, + } ) | setState( [ + //"_meta": "_meta", "output": "output_demultiplexer", "output_falco": "output_falco", "output_multiqc": "output_multiqc" diff --git a/src/io/publish/code.sh b/src/io/publish/code.sh new file mode 100755 index 0000000..76927a7 --- /dev/null +++ b/src/io/publish/code.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Copy the demultiplexed fastq data, the falco reports and the multiqc report to their publish locations. +# Exit on the first failing command so that a failed copy is not masked by the later steps. +set -e + +echo "Publishing 
$par_input -> $par_output" +echo "Publishing $par_input_falco -> $par_output_falco" +echo "Publishing $par_input_multiqc -> $par_output_multiqc" + +echo +echo "Creating directory if it does not exist:" +mkdir -p "$(dirname "$par_output")" && echo "Containing directory $par_output created" +mkdir -p "$(dirname "$par_output_falco")" && echo "Containing directory $par_output_falco created" +mkdir -p "$(dirname "$par_output_multiqc")" && echo "Containing directory $par_output_multiqc created" + +echo +echo "Copying files..." +cp -rL "$par_input" "$par_output" +cp -rL "$par_input_falco" "$par_output_falco" +cp -rL "$par_input_multiqc" "$par_output_multiqc" + +echo +echo "Output files:" +echo "par_output:" +ls "$par_output" + +echo +echo "par_output_falco:" +ls "$par_output_falco" + +echo +echo "par_output_multiqc:" +ls "$par_output_multiqc" diff --git a/src/io/publish/config.vsh.yaml b/src/io/publish/config.vsh.yaml new file mode 100644 index 0000000..5683dc2 --- /dev/null +++ b/src/io/publish/config.vsh.yaml @@ -0,0 +1,48 @@ +name: "publish" +namespace: "io" +description: "Publish the processed results of the run" +argument_groups: + - name: Input arguments + arguments: + - name: --input + description: Directory containing the demultiplexed fastq data to publish + type: file + required: true + - name: "--input_falco" + description: Directory containing the falco output to publish + type: file + required: true + - name: "--input_multiqc" + description: MultiQC report to publish + type: file + required: true + - name: Output arguments + arguments: + - name: --output + type: file + direction: output + default: "fastq" + - name: --output_falco + type: file + direction: output + default: "qc/fastqc" + - name: --output_multiqc + type: file + direction: output + default: "qc/multiqc_report.html" + +resources: + - type: bash_script + path: ./code.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: + - procps + +runners: + - type: executable + - type: nextflow diff --git 
a/src/runner/config.vsh.yaml b/src/runner/config.vsh.yaml new file mode 100644 index 0000000..53ca7db --- /dev/null +++ b/src/runner/config.vsh.yaml @@ -0,0 +1,52 @@ +name: runner +description: Runner for demultiplexing of raw sequencing data +argument_groups: + - name: Input arguments + arguments: + - name: --input + description: Base directory of the form `s3://Sequencing///` + type: file + required: true + - name: Annotation flags + arguments: + - name: --add_date_time + description: | + Add date and time to the output directory name. This is useful + when running the same pipeline multiple times on the same input + directory. + type: boolean_true + - name: --add_workflow_id + description: | + Add a workflow identifier to the output directory name. + type: boolean_true + - name: Output arguments + arguments: + - name: --fastq_output + type: file + direction: output + default: "fastq" + - name: --falco_output + type: file + direction: output + default: "qc/fastqc" + - name: --multiqc_output + type: file + direction: output + default: "qc/multiqc_report.html" + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +dependencies: + - name: demultiplex + repository: local + - name: io/publish + repository: local + +runners: + - type: nextflow + +engines: + - type: native diff --git a/src/runner/main.nf b/src/runner/main.nf new file mode 100644 index 0000000..404f40d --- /dev/null +++ b/src/runner/main.nf @@ -0,0 +1,69 @@ +// Launch timestamp used by --add_date_time; HH is the 24-hour clock (hh is the 12-hour clock, so AM and PM runs would get the same stamp). +def date = new Date().format('yyyyMMdd_HHmmss') + +def viash_config = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString() + "/_viash.yaml" +def version = get_version(viash_config) + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + | demultiplex.run( + fromState: [ + "input": "input", + "output": "fastq", + "output_falco": "qc/fastqc", + "output_multiqc": "qc/multiqc_report.html", + ], + toState: { id, result, state -> + state + result + }, + ) + | publish.run( + 
fromState: { id, state -> + def id1 = (params.add_date_time) ? "${id}_${date}" : id + def id2 = (params.add_workflow_id) ? "${id1}_demultiplex_${version}" : id1 + + def fastq_output_1 = (id == "run") ? state.fastq_output : "${id2}/" + state.fastq_output + def falco_output_1 = (id == "run") ? state.falco_output : "${id2}/" + state.falco_output + def multiqc_output_1 = (id == "run") ? state.multiqc_output : "${id2}/" + state.multiqc_output + + if (id == "run") { + println("Publising to ${params.publish_dir}") + } else { + println("Publising to ${params.publish_dir}/${id2}") + } + + [ + input: state.output, + input_falco: state.output_falco, + input_multiqc: state.output_multiqc, + output: fastq_output_1, + output_falco: falco_output_1, + output_multiqc: multiqc_output_1 + ] + }, + toState: { id, result, state -> [:] }, + directives: [ + publishDir: [ + path: "${params.publish_dir}", + overwrite: false, + mode: "copy" + ] + ] + ) + + emit: + output_ch +} + +// Read the `version` field from the given viash YAML config; falls back to "unknown_version" when the field is missing or empty. +def get_version(inputFile) { + def yamlSlurper = new groovy.yaml.YamlSlurper() + def loaded_viash_config = yamlSlurper.parse(file(inputFile)) + def version = (loaded_viash_config.version) ? loaded_viash_config.version : "unknown_version" + println("Version to be used: ${version}") + return version +} diff --git a/src/runner/nextflow.config b/src/runner/nextflow.config new file mode 100644 index 0000000..20297b8 --- /dev/null +++ b/src/runner/nextflow.config @@ -0,0 +1,12 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +process { + withName: publishStatesProc { + publishDir = [ enabled: false ] + } +} + +// include common settings +includeConfig("${params.rootDir}/src/config/labels.config")