From 66b9bfc5de84f00ae7859a2805f1db7f408f4e77 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Wed, 4 Sep 2024 22:29:31 -0300 Subject: [PATCH] Add nf-schema, remove unnecessary parameters. (#4) --- README.md | 32 +++++++------ assets/samples_data_schema.json | 11 +++++ conf/params.config | 8 ++-- main.nf | 28 +++++------- nextflow.config | 24 +++++++--- nextflow_schema.json | 74 +++++++++++++++++++++++++++++++ params.example.yml | 3 -- subworkflows/basecalling_demux.nf | 20 +++++---- subworkflows/reports.nf | 13 ++++-- 9 files changed, 154 insertions(+), 59 deletions(-) create mode 100644 assets/samples_data_schema.json create mode 100644 nextflow_schema.json diff --git a/README.md b/README.md index 34a74d4..ad8e8d5 100644 --- a/README.md +++ b/README.md @@ -30,25 +30,23 @@ Uses Dorado for basecalling and demultiplexing. ## Pipeline Parameters -| Parameter | Required | Default | Description | -| --------------------------------- | -------- | ---------------------------------- | ----------------------------------------------------------------------------------------------- | -| `experiment_name` | No | - | Name of the experiment, used for final reports (title and filename). | -| `data_dir` | Yes | - | Path to the directory containing POD5 files. | -| `sample_data` | Yes | `input/samples.csv` | Path to the CSV file containing the sample data (required if demultiplexing). | -| `output_dir` | No | `demultiplex_results` | Directory for saving results. | -| `fastq_output` | No | `true` | Generates FASTQ files if `true`; otherwise, generates UBAM files. | -| `qscore_filter` | No | `10` | Minimum QScore threshold for "pass" data, used in demultiplexing. | -| `dorado_basecalling_model` | No | `sup` | Model used for basecalling. Check Dorado help for available options. | -| `dorado_basecalling_extra_config` | No | - | Additional configuration options for Dorado basecalling. | -| `dorado_basecalling_gpus` | No | `1` | Number of GPUs to allocate for basecalling. 
| -| `skip_demultiplexing` | No | `false` | Skips demultiplexing if `true`. | -| `dorado_demux_kit` | No | `EXP-NBD196` | Kit identifier used for demultiplexing. | -| `dorado_demux_both_ends` | No | `false` | Demultiplexes using barcodes on both ends (5' and 3') if `true`. | -| `dorado_demux_extra_config` | No | - | Additional configuration options for Dorado demultiplexing. | -| `use_dorado_container` | No | `true` | Uses Dorado via container if `true`; expects a local installation if `false`. | -| `qc_tools` | No | `['fastqc', 'nanoq', 'toulligqc']` | Specifies which QC tools to run. Options: 'nanoq', 'nanoplot', 'fastqc', 'toulligqc', 'pycoqc'. | +| Parameter | Required | Default | Description | +| -------------------------- | -------- | ---------------------------------- | ----------------------------------------------------------------------------------------------- | +| `experiment_name` | No | - | Name of the experiment, used for final reports (title and filename). | +| `data_dir` | Yes | - | Path to the directory containing POD5 files. | +| `sample_data` | No | - | Path to the CSV file containing the sample data (if not provided, demultiplexing is skipped). | +| `output_dir` | No | `results/` | Directory for saving results. | +| `fastq_output` | No | `true` | Generates FASTQ files if `true`; otherwise, generates UBAM files. | +| `qscore_filter` | No | `10` | Minimum QScore threshold for "pass" data, used in demultiplexing. | +| `dorado_basecalling_model` | No | `sup` | Model used for basecalling. Check Dorado help for available options. | +| `dorado_basecalling_gpus` | No | `1` | Number of GPUs to allocate for basecalling. | +| `dorado_demux_kit` | No | `SQK-NBD114-96` | Kit identifier used for demultiplexing. | +| `dorado_demux_both_ends` | No | `false` | Demultiplexes using barcodes on both ends (5' and 3') if `true`. | +| `use_dorado_container` | No | `true` | Uses Dorado via container if `true`; expects a local installation if `false`. 
| +| `qc_tools` | No | `['fastqc', 'nanoq', 'toulligqc']` | Specifies which QC tools to run. Options: 'nanoq', 'nanoplot', 'fastqc', 'toulligqc', 'pycoqc'. | ## Considerations - The pipeline is compatible with SLURM clusters; use `-profile slurm`. - GPU resources are required for basecalling. On SLURM, ensure jobs request GPUs with the `--gres=gpu:X` option. +- Extra arguments can be passed to the Dorado basecalling and demultiplexing steps by setting `ext.args` on the `basecalling` and `demultiplexing` processes in your Nextflow configuration. diff --git a/assets/samples_data_schema.json b/assets/samples_data_schema.json new file mode 100644 index 0000000..b451cf9 --- /dev/null +++ b/assets/samples_data_schema.json @@ -0,0 +1,11 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "array", + "items": { + "type": "object", + "properties": { + "barcode": { "type": "string" }, + "sample": { "type": "string" } + } + } +} diff --git a/conf/params.config b/conf/params.config index 6e9d38c..d6dee49 100644 --- a/conf/params.config +++ b/conf/params.config @@ -1,17 +1,15 @@ params { experiment_name = '' data_dir = null - sample_data = 'input/samples.csv' - output_dir = 'demultiplex_results/' + sample_data = null + output_dir = 'results/' fastq_output = true qscore_filter = 10 dorado_basecalling_model = 'sup' - dorado_basecalling_extra_config = '' dorado_basecalling_gpus = 1 - skip_demultiplexing = false dorado_demux_kit = 'SQK-NBD114-96' dorado_demux_both_ends = false - dorado_demux_extra_config = '' use_dorado_container = true qc_tools = ['fastqc', 'nanoq', 'toulligqc'] + showHidden = false } diff --git a/main.nf b/main.nf index 1bdf0eb..7e5d527 100644 --- a/main.nf +++ b/main.nf @@ -1,27 +1,23 @@ #!/usr/bin/env nextflow -include { BasecallingAndDemux } from './subworkflows/basecalling_demux.nf' -include { QualityCheck } from './subworkflows/quality_check.nf' -include { GenerateReports } from './subworkflows/reports.nf' -include { CollectVersions } from './subworkflows/versions.nf' +include { validateParameters; samplesheetToList } from 
'plugin/nf-schema' +include { BasecallingAndDemux } from './subworkflows/basecalling_demux.nf' +include { QualityCheck } from './subworkflows/quality_check.nf' +include { GenerateReports } from './subworkflows/reports.nf' +include { CollectVersions } from './subworkflows/versions.nf' -// check and prepare input channels -data_dir = file(params.data_dir, checkIfExists: true, type: 'dir') +// validate and prepare input channels +validateParameters() + +data_dir = file(params.data_dir, type: 'dir') multiqc_config = file("${workflow.projectDir}/tool_conf/multiqc_config.yaml", checkIfExists: true) -if (params.skip_demultiplexing) { - samples = channel.fromList([]) +if (params.sample_data) { + samples = channel.fromList(samplesheetToList(params.sample_data, "assets/samples_data_schema.json")) } else { - file(params.sample_data, checkIfExists: true) - channel - .fromPath(params.sample_data) - .splitCsv(header: true) - .map { row -> [row.barcode, row.sample] } - .set { samples } + samples = channel.empty() } -params.qc_tools = params.qc_tools.each { it.toLowerCase() } - workflow { BasecallingAndDemux(samples, data_dir) diff --git a/nextflow.config b/nextflow.config index 64e5731..9302c1e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,15 +1,29 @@ +plugins { + id 'nf-schema@2.1.0' +} + +validation { + help { + enabled = true + showHidden = false + } + ignoreParams = ['showHidden', 'show-hidden'] +} + resume = true process { errorStrategy = 'finish' } -report { - enabled = true - file = "${params.output_dir}/reports/nextflow_report.html" - overwrite = true -} includeConfig 'conf/params.config' includeConfig 'conf/profiles.config' includeConfig 'conf/containers.config' + + +def trace_timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') +report { + enabled = true + file = "${params.output_dir}/reports/execution_report_${trace_timestamp}.html" +} diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000..1ddd49f --- /dev/null 
+++ b/nextflow_schema.json @@ -0,0 +1,74 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com//master/nextflow_schema.json", + "title": " pipeline parameters", + "description": "", + "type": "object", + "properties": { + "experiment_name": { + "type": "string", + "description": "Name of the experiment, used for final reports (title and filename)." + }, + "data_dir": { + "type": "string", + "description": "Path to the directory containing POD5 files.", + "format": "directory-path" + }, + "sample_data": { + "type": "string", + "default": "input/samples.csv", + "format": "file-path", + "schema": "/assets/samples_data_schema.json", + "mimetype": "text/csv", + "description": "Path to the CSV file containing the sample data (if not provided, will not perform demux)." + }, + "output_dir": { + "type": "string", + "default": "results/", + "description": "Directory for saving results." + }, + "fastq_output": { + "type": "boolean", + "default": true, + "description": "Generates FASTQ files if `true`; otherwise, generates UBAM files." + }, + "qscore_filter": { + "type": "integer", + "default": 10, + "description": "Minimum QScore threshold for \"pass\" data, used in demultiplexing." + }, + "dorado_basecalling_model": { + "type": "string", + "default": "sup", + "description": "Model used for basecalling. Check Dorado help for available options." + }, + "dorado_basecalling_gpus": { + "type": "integer", + "default": 1, + "description": "Number of GPUs to allocate for basecalling." + }, + "dorado_demux_kit": { + "type": "string", + "default": "SQK-NBD114-96", + "description": "Kit identifier used for demultiplexing." + }, + "dorado_demux_both_ends": { + "type": "boolean", + "description": "Demultiplexes using barcodes on both ends (5' and 3') if `true`." + }, + "use_dorado_container": { + "type": "boolean", + "default": true, + "description": "Uses Dorado via container if `true`; expects a local installation if `false`." 
+ }, + "qc_tools": { + "type": "array", + "items": { + "type": "string", + "enum": ["nanoq", "nanoplot", "fastqc", "toulligqc", "pycoqc"] + }, + "description": "Specifies which QC tools to run." + } + }, + "required": ["data_dir"] +} diff --git a/params.example.yml b/params.example.yml index dff8b5d..ceecbc3 100644 --- a/params.example.yml +++ b/params.example.yml @@ -5,12 +5,9 @@ output_dir: demultiplex_results/ fastq_output: true qscore_filter: 10 dorado_basecalling_model: sup -dorado_basecalling_extra_config: '' dorado_basecalling_gpus: 1 -skip_demultiplexing: false dorado_demux_kit: SQK-NBD114-96 dorado_demux_both_ends: false -dorado_demux_extra_config: '' use_dorado_container: true qc_tools: - fastqc diff --git a/subworkflows/basecalling_demux.nf b/subworkflows/basecalling_demux.nf index 2fc10d9..c46c5e6 100644 --- a/subworkflows/basecalling_demux.nf +++ b/subworkflows/basecalling_demux.nf @@ -17,9 +17,7 @@ workflow BasecallingAndDemux { basecalled_reads = basecalled_reads_qscore_filtered } - if (params.skip_demultiplexing) { - sequences_to_postprocess = basecalled_reads - } else { + if (params.sample_data) { demultiplexing(qscoreFiltering.out.reads_pass) demultiplexing.out.classified @@ -32,6 +30,8 @@ workflow BasecallingAndDemux { | mix(demultiplexing.out.unclassified.map { ['unclassified', it] }) | mix(basecalled_reads) | set { sequences_to_postprocess } + } else { + sequences_to_postprocess = basecalled_reads } emit: @@ -58,17 +58,18 @@ process basecalling { path('sequencing_summary.txt') , emit: sequencing_summary script: - if (params.skip_demultiplexing) { - demux_opts = '' - } else { + if (params.sample_data) { demux_opts = "--kit-name ${params.dorado_demux_kit}" demux_opts += params.dorado_demux_both_ends ? 
' --barcode-both-ends' : '' + } else { + demux_opts = '' } + extra_args = task.ext.args ?: '' """ dorado basecaller \ --recursive \ --device 'cuda:all' \ - ${params.dorado_basecalling_extra_config} \ + ${extra_args} \ ${params.dorado_basecalling_model} \ ${demux_opts} \ ${data_dir} \ @@ -84,7 +85,7 @@ process qscoreFiltering { publishDir "${params.output_dir}/basecalled/", \ pattern: '*.bam', \ mode: 'copy', \ - enabled: params.skip_demultiplexing && !params.fastq_output + enabled: !params.sample_data && !params.fastq_output cpus 4 input: @@ -119,13 +120,14 @@ process demultiplexing { script: emit_fastq = params.fastq_output ? '--emit-fastq' : '' + extra_args = task.ext.args ?: '' """ dorado demux \ --output-dir demultiplexed/ \ --no-classify \ ${emit_fastq} \ --threads ${task.cpus} \ - ${params.dorado_demux_extra_config} \ + ${extra_args} \ ${basecalled_reads} """ } diff --git a/subworkflows/reports.nf b/subworkflows/reports.nf index 85369c4..edb47e1 100644 --- a/subworkflows/reports.nf +++ b/subworkflows/reports.nf @@ -62,9 +62,14 @@ process toulligQC { tuple val('ToulligQC'), eval('toulligqc --version'), topic: versions script: - report_filename = "${slugify(params.experiment_name)}_toulligqc.html" - barcode_list = ["NB23", "NB24"] - barcodes_opt = !params.skip_demultiplexing && barcode_list + if (params.experiment_name) { + report_filename = "${slugify(params.experiment_name)}_toulligqc.html" + name_opt = "--report-name ${params.experiment_name}" + } else { + report_filename = "toulligqc.html" + name_opt = '' + } + barcodes_opt = params.sample_data && barcode_list ? "--barcoding --barcodes ${barcode_list.join(',')}" : '' """ @@ -74,8 +79,8 @@ process toulligQC { --sequencing-summary-source sequencing_summary.mod.txt \ --pod5-source ${data_dir} \ --html-report-path ${report_filename} \ - --report-name ${params.experiment_name} \ --qscore-threshold ${params.qscore_filter} \ + ${name_opt} \ ${barcodes_opt} """ }