Skip to content

Commit

Permalink
Adjust structure of output (#19)
Browse files Browse the repository at this point in the history
  • Loading branch information
tverbeiren authored Dec 11, 2024
1 parent 6e6be28 commit 2474718
Show file tree
Hide file tree
Showing 11 changed files with 277 additions and 12 deletions.
31 changes: 31 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,34 @@
# demultiplex x.y.z

## Major updates
The output handling of the workflow has been refactored to be more flexible. This is done by creating a wrapper workflow `runner` that wraps the native `demultiplex` workflow. The `runner` workflow is responsible for setting the output directory based on the input arguments:

3 arguments exist for specifying the relative location of the 3 _outputs_ of the workflow:

- `fastq_output`: The directory where the demultiplexed fastq files are stored.
- `falco_output`: The directory for the `fastqc`/`falco` reports.
- `multiqc_output`: The filename for the `multiqc` report.

The target location path is determined by the following logic:

- If no `id` is provided, the output directory is set to `$publish_dir`.
- If an `id` is explicitly set using Seqera Cloud or by adding `--id <>`, the output directory is set to `$publish_dir/<id>`.

The workflow has two optional flags to be used in combination with `--id`:

- `--add_date_time`: rather than publishing the results under `$publish_dir/<id>`, this appends a date-time stamp to the run directory, giving `$publish_dir/<id>_<date-time-stamp>/`. This is useful when you want to keep track of multiple runs of the workflow (example stamp: `20240322_143020`).
- `--add_workflow_id`: adding this flag will append `_demultiplex_<version>` to the output directory name (example: `<id>_demultiplex_v0.2.0`). When starting the workflow from a non-release, the version will be set to `unknown_version`.

The default structure in the output directory is:

- Two sub-directories:
  - `fastq`
  - `qc` for the reports:
    - `multiqc_report.html`
    - `fastqc/` directory containing the different fastqc (falco) reports.

The `$publish_dir` variable corresponds to the argument provided with `--publish_dir`. The `date-time-stamp` is generated by the workflow based on when it was launched, so it is unique for every launch time (at one-second resolution).

# demultiplex v0.2.0

## Breaking changes
Expand Down
1 change: 1 addition & 0 deletions _viash.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ viash_version: 0.9.0

config_mods: |
.requirements.commands := ['ps']
.runners[.type == 'nextflow'].directives.tag := '$id'
12 changes: 12 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Project-level Nextflow configuration.

// Pipeline metadata; `mainScript` points at the built demultiplex workflow
// under target/nextflow.
manifest {
homePage = 'https://github.com/viash-hub/demultiplex'
description = 'Demultiplexing pipeline for sequencing data'
mainScript = 'target/nextflow/demultiplex/main.nf'
}

// Turn off the publishDir directive for the process named `publishStatesProc`.
// NOTE(review): presumably because results are published through the
// dedicated publish component instead — confirm.
process {
withName: publishStatesProc {
publishDir = [ enabled: false ]
}
}

10 changes: 7 additions & 3 deletions src/demultiplex/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ description: Demultiplexing of raw sequencing data
argument_groups:
- name: Input arguments
arguments:
- name: --id
description: Unique identifier for the run
type: string
- name: --input
description: Directory containing raw sequencing data
type: file
Expand Down Expand Up @@ -31,19 +34,20 @@ argument_groups:
description: Directory to write fastq data to
type: file
direction: output
required: true
required: false
default: "$id/fastq"
- name: "--output_falco"
description: Directory to write falco output to
type: file
direction: output
required: false
default: "$id/falco"
default: "$id/qc/fastqc"
- name: "--output_multiqc"
description: Directory to write falco output to
type: file
direction: output
required: false
default: "$id/multiqc_report.html"
default: "$id/qc/multiqc_report.html"
resources:
- type: nextflow_script
path: main.nf
Expand Down
2 changes: 1 addition & 1 deletion src/demultiplex/integration_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

viash ns build --setup cb
viash ns build --setup cb -q demultiplex

nextflow run . \
-main-script src/demultiplex/test.nf \
Expand Down
24 changes: 16 additions & 8 deletions src/demultiplex/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ workflow run_wf {

main:
samples_ch = input_ch

// untar input if needed
| untar.run(
directives: [label: ["lowmem", "lowcpu"]],
Expand Down Expand Up @@ -109,11 +110,15 @@ workflow run_wf {
| bcl_convert.run(
runIf: {id, state -> state.demultiplexer in ["bclconvert"]},
directives: [label: ["highmem", "midcpu"]],
fromState: [
"bcl_input_directory": "input",
"sample_sheet": "run_information",
"output_directory": "output",
],
fromState: { id, state ->
[
bcl_input_directory: state.input,
sample_sheet: state.run_information,
output_directory: state.output,
reports: "reports",
logs: "logs"
]
},
toState: {id, result, state ->
def toAdd = [
"output_demultiplexer" : result.output_directory,
Expand Down Expand Up @@ -159,6 +164,8 @@ workflow run_wf {
)

output_ch = samples_ch


| combine_samples.run(
fromState: { id, state ->
[
Expand Down Expand Up @@ -186,10 +193,10 @@ workflow run_wf {
},
toState: { id, result, state ->
state + [ "output_falco" : result.outdir ]
},
}
)
| multiqc.run(
directives: [label: ["lowcpu", "lowmem"]],
directives: [label: ["midcpu", "midmem"]],
fromState: {id, state ->
def new_state = [
"input": [state.output_falco],
Expand All @@ -206,10 +213,11 @@ workflow run_wf {
},
toState: { id, result, state ->
state + [ "output_multiqc" : result.output_report ]
},
}
)
| setState(
[
//"_meta": "_meta",
"output": "output_demultiplexer",
"output_falco": "output_falco",
"output_multiqc": "output_multiqc"
Expand Down
30 changes: 30 additions & 0 deletions src/io/publish/code.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Publish the three results of a run (fastq directory, falco reports and the
# MultiQC report) by copying every $par_input* path to the matching
# $par_output* location. The par_* variables are not set in this script and
# must be provided by the caller.

# Abort on the first failing command so a failed mkdir/cp is not hidden by the
# directory listings printed at the end.
set -eo pipefail

echo "Publishing $par_input -> $par_output"
echo "Publishing $par_input_falco -> $par_output_falco"
echo "Publishing $par_input_multiqc -> $par_output_multiqc"

echo
echo "Creating directory if it does not exist:"
# The command substitutions are quoted so that a parent path containing
# whitespace or glob characters reaches mkdir as one single argument.
mkdir -p "$(dirname "$par_output")"
echo "Containing directory $par_output created"
mkdir -p "$(dirname "$par_output_falco")"
echo "Containing directory $par_output_falco created"
mkdir -p "$(dirname "$par_output_multiqc")"
echo "Containing directory $par_output_multiqc created"

echo
echo "Copying files..."
# -r: copy directories recursively, -L: follow symbolic links in the source.
cp -rL "$par_input" "$par_output"
cp -rL "$par_input_falco" "$par_output_falco"
cp -rL "$par_input_multiqc" "$par_output_multiqc"

echo
echo "Output files:"
echo "par_output:"
ls "$par_output"

echo
echo "par_output_falco:"
ls "$par_output_falco"

echo
echo "par_output_multiqc:"
ls "$par_output_multiqc"
48 changes: 48 additions & 0 deletions src/io/publish/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Component that copies the results of a run to their final location.
name: "publish"
namespace: "io"
description: "Publish the processed results of the run"
argument_groups:
  # Locations of the results that have to be published.
  - name: Input arguments
    arguments:
      - name: --input
        description: Directory containing the fastq data to publish
        type: file
        required: true
      - name: "--input_falco"
        description: Directory containing the falco output to publish
        type: file
        required: true
      - name: "--input_multiqc"
        description: MultiQC report to publish
        type: file
        required: true
  # Target locations the results are copied to.
  - name: Output arguments
    arguments:
      - name: --output
        description: Location to publish the fastq data to
        type: file
        direction: output
        default: "fastq"
      - name: --output_falco
        description: Location to publish the falco output to
        type: file
        direction: output
        default: "qc/fastqc"
      - name: --output_multiqc
        description: Location to publish the MultiQC report to
        type: file
        direction: output
        default: "qc/multiqc_report.html"

resources:
  - type: bash_script
    path: ./code.sh

engines:
  - type: docker
    image: debian:stable-slim
    setup:
      - type: apt
        packages:
          - procps

runners:
  - type: executable
  - type: nextflow
52 changes: 52 additions & 0 deletions src/runner/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Wrapper workflow around `demultiplex` that also publishes the results.
name: runner
description: Runner for demultiplexing of raw sequencing data
argument_groups:
  - name: Input arguments
    arguments:
      - name: --input
        description: Base directory of the form `s3://<bucket>/Sequencing/<Sequencer>/<RunID>/`
        type: file
        required: true
  - name: Annotation flags
    arguments:
      - name: --add_date_time
        description: |
          Add date and time to the output directory name. This is useful
          when running the same pipeline multiple times on the same input
          directory.
        type: boolean_true
      - name: --add_workflow_id
        description: |
          Add a workflow identifier to the output directory name.
        type: boolean_true
  # Relative locations of the three outputs of the workflow.
  - name: Output arguments
    arguments:
      - name: --fastq_output
        description: The directory where the demultiplexed fastq files are stored.
        type: file
        direction: output
        default: "fastq"
      - name: --falco_output
        description: The directory for the fastqc/falco reports.
        type: file
        direction: output
        default: "qc/fastqc"
      - name: --multiqc_output
        description: The filename for the multiqc report.
        type: file
        direction: output
        default: "qc/multiqc_report.html"

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf

dependencies:
  - name: demultiplex
    repository: local
  - name: io/publish
    repository: local

runners:
  - type: nextflow

engines:
  - type: native
67 changes: 67 additions & 0 deletions src/runner/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Launch time stamp that is appended to the run id when --add_date_time is set.
// `HH` is the 24-hour field; the 12-hour field `hh` would produce the same
// stamp for a morning and an afternoon run (e.g. 02:30:20 and 14:30:20).
def date = new Date().format('yyyyMMdd_HHmmss')

// Absolute path of `_viash.yaml`, resolved three directory levels above
// projectDir; the version read from it is used for --add_workflow_id.
def viash_config = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString() + "/_viash.yaml"
def version = get_version(viash_config)

// Wrapper workflow: runs `demultiplex` and hands its results to `publish`,
// which copies them below params.publish_dir.
workflow run_wf {
  take:
    input_ch

  main:
    output_ch = input_ch
      | demultiplex.run(
        // NOTE(review): in the map form the values are looked up as state
        // keys; `fastq`, `qc/fastqc` and `qc/multiqc_report.html` do not match
        // any argument declared in the runner config — confirm this is intended.
        fromState: [
          "input": "input",
          "output": "fastq",
          "output_falco": "qc/fastqc",
          "output_multiqc": "qc/multiqc_report.html",
        ],
        // Keep the incoming state and add everything demultiplex returned.
        toState: { id, result, state ->
          state + result
        },
      )
      | publish.run(
        fromState: { id, state ->
          // Build the name of the run directory: <id>[_<date>][_demultiplex_<version>]
          def id1 = (params.add_date_time) ? "${id}_${date}" : id
          def id2 = (params.add_workflow_id) ? "${id1}_demultiplex_${version}" : id1

          // With the id "run" the results go directly below the publish
          // directory; any other id gets its own sub-directory.
          def useRoot = (id == "run")
          def target = { relative -> useRoot ? relative : "${id2}/" + relative }

          if (useRoot) {
            println("Publishing to ${params.publish_dir}")
          } else {
            println("Publishing to ${params.publish_dir}/${id2}")
          }

          [
            input: state.output,
            input_falco: state.output_falco,
            input_multiqc: state.output_multiqc,
            output: target(state.fastq_output),
            output_falco: target(state.falco_output),
            output_multiqc: target(state.multiqc_output)
          ]
        },
        // Nothing of the publish step is carried over into the state.
        toState: { id, result, state -> [:] },
        directives: [
          publishDir: [
            path: "${params.publish_dir}",
            overwrite: false,
            mode: "copy"
          ]
        ]
      )

  emit:
    output_ch
}

// Read the `version` entry of the YAML file at `inputFile`, print it and
// return it; "unknown_version" is used when the entry is missing or empty.
def get_version(inputFile) {
  def parsedConfig = new groovy.yaml.YamlSlurper().parse(file(inputFile))
  def version = parsedConfig.version ?: "unknown_version"
  println("Version to be used: ${version}")
  return version
}
12 changes: 12 additions & 0 deletions src/runner/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Nextflow configuration for the runner workflow.

// Required Nextflow version.
// NOTE(review): per the Nextflow manifest documentation the leading `!` makes
// this a hard requirement rather than a warning — confirm.
manifest {
nextflowVersion = '!>=20.12.1-edge'
}

// Turn off the publishDir directive for the process named `publishStatesProc`.
process {
withName: publishStatesProc {
publishDir = [ enabled: false ]
}
}

// include common settings
// (path is built from params.rootDir, which is not set in this file)
includeConfig("${params.rootDir}/src/config/labels.config")

0 comments on commit 2474718

Please sign in to comment.