Skip to content

Commit

Permalink
Add nf-schema, remove unnecessary parameters. (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
dialvarezs authored Sep 5, 2024
1 parent e5d715c commit 66b9bfc
Show file tree
Hide file tree
Showing 9 changed files with 154 additions and 59 deletions.
32 changes: 15 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,23 @@ Uses Dorado for basecalling and demultiplexing.

## Pipeline Parameters

| Parameter | Required | Default | Description |
| --------------------------------- | -------- | ---------------------------------- | ----------------------------------------------------------------------------------------------- |
| `experiment_name` | No | - | Name of the experiment, used for final reports (title and filename). |
| `data_dir` | Yes | - | Path to the directory containing POD5 files. |
| `sample_data` | Yes | `input/samples.csv` | Path to the CSV file containing the sample data (required if demultiplexing). |
| `output_dir` | No | `demultiplex_results` | Directory for saving results. |
| `fastq_output` | No | `true` | Generates FASTQ files if `true`; otherwise, generates UBAM files. |
| `qscore_filter` | No | `10` | Minimum QScore threshold for "pass" data, used in demultiplexing. |
| `dorado_basecalling_model` | No | `sup` | Model used for basecalling. Check Dorado help for available options. |
| `dorado_basecalling_extra_config` | No | - | Additional configuration options for Dorado basecalling. |
| `dorado_basecalling_gpus` | No | `1` | Number of GPUs to allocate for basecalling. |
| `skip_demultiplexing` | No | `false` | Skips demultiplexing if `true`. |
| `dorado_demux_kit` | No | `EXP-NBD196` | Kit identifier used for demultiplexing. |
| `dorado_demux_both_ends` | No | `false` | Demultiplexes using barcodes on both ends (5' and 3') if `true`. |
| `dorado_demux_extra_config` | No | - | Additional configuration options for Dorado demultiplexing. |
| `use_dorado_container` | No | `true` | Uses Dorado via container if `true`; expects a local installation if `false`. |
| `qc_tools` | No | `['fastqc', 'nanoq', 'toulligqc']` | Specifies which QC tools to run. Options: 'nanoq', 'nanoplot', 'fastqc', 'toulligqc', 'pycoqc'. |
| Parameter | Required | Default | Description |
| -------------------------- | -------- | ---------------------------------- | ----------------------------------------------------------------------------------------------- |
| `experiment_name` | No | - | Name of the experiment, used for final reports (title and filename). |
| `data_dir` | Yes | - | Path to the directory containing POD5 files. |
| `sample_data` | No | - | Path to the CSV file containing the sample data (if not provided, will not perform demux). |
| `output_dir` | No | `results/` | Directory for saving results. |
| `fastq_output` | No | `true` | Generates FASTQ files if `true`; otherwise, generates UBAM files. |
| `qscore_filter` | No | `10` | Minimum QScore threshold for "pass" data, used in demultiplexing. |
| `dorado_basecalling_model` | No | `sup` | Model used for basecalling. Check Dorado help for available options. |
| `dorado_basecalling_gpus` | No | `1` | Number of GPUs to allocate for basecalling. |
| `dorado_demux_kit` | No | `SQK-NBD114-96` | Kit identifier used for demultiplexing. |
| `dorado_demux_both_ends` | No | `false` | Demultiplexes using barcodes on both ends (5' and 3') if `true`. |
| `use_dorado_container` | No | `true` | Uses Dorado via container if `true`; expects a local installation if `false`. |
| `qc_tools` | No | `['fastqc', 'nanoq', 'toulligqc']` | Specifies which QC tools to run. Options: 'nanoq', 'nanoplot', 'fastqc', 'toulligqc', 'pycoqc'. |

## Considerations

- The pipeline is compatible with SLURM clusters; use `-profile slurm`.
- GPU resources are required for basecalling. On SLURM, ensure jobs request GPUs with the `--gres=gpu:X` option.
- Extra arguments can be passed to the Dorado basecalling and demultiplexing processes through the `ext.args` process setting.
11 changes: 11 additions & 0 deletions assets/samples_data_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "array",
"items": {
"type": "object",
"properties": {
"barcode": { "type": "string" },
"sample": { "type": "string" }
}
}
}
8 changes: 3 additions & 5 deletions conf/params.config
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
params {
experiment_name = ''
data_dir = null
sample_data = 'input/samples.csv'
output_dir = 'demultiplex_results/'
sample_data = null
output_dir = 'results/'
fastq_output = true
qscore_filter = 10
dorado_basecalling_model = 'sup'
dorado_basecalling_extra_config = ''
dorado_basecalling_gpus = 1
skip_demultiplexing = false
dorado_demux_kit = 'SQK-NBD114-96'
dorado_demux_both_ends = false
dorado_demux_extra_config = ''
use_dorado_container = true
qc_tools = ['fastqc', 'nanoq', 'toulligqc']
showHidden = false
}
28 changes: 12 additions & 16 deletions main.nf
Original file line number Diff line number Diff line change
@@ -1,27 +1,23 @@
#!/usr/bin/env nextflow
include { BasecallingAndDemux } from './subworkflows/basecalling_demux.nf'
include { QualityCheck } from './subworkflows/quality_check.nf'
include { GenerateReports } from './subworkflows/reports.nf'
include { CollectVersions } from './subworkflows/versions.nf'
include { validateParameters; samplesheetToList } from 'plugin/nf-schema'
include { BasecallingAndDemux } from './subworkflows/basecalling_demux.nf'
include { QualityCheck } from './subworkflows/quality_check.nf'
include { GenerateReports } from './subworkflows/reports.nf'
include { CollectVersions } from './subworkflows/versions.nf'


// check and prepare input channels
data_dir = file(params.data_dir, checkIfExists: true, type: 'dir')
// validate and prepare input channels
validateParameters()

data_dir = file(params.data_dir, type: 'dir')
multiqc_config = file("${workflow.projectDir}/tool_conf/multiqc_config.yaml", checkIfExists: true)

if (params.skip_demultiplexing) {
samples = channel.fromList([])
if (params.sample_data) {
// Build the samples channel from the validated sample sheet. The schema path is
// anchored to the pipeline directory (as done for multiqc_config) so that it does
// not depend on the directory the pipeline is launched from.
samples = channel.fromList(samplesheetToList(params.sample_data, "${workflow.projectDir}/assets/samples_data_schema.json"))
} else {
file(params.sample_data, checkIfExists: true)
channel
.fromPath(params.sample_data)
.splitCsv(header: true)
.map { row -> [row.barcode, row.sample] }
.set { samples }
samples = channel.empty()
}

// Normalise the requested QC tool names to lower case. `collect` returns the
// transformed list, whereas `each` returns the original list unchanged.
// NOTE(review): Nextflow may ignore script-level reassignment of a parameter that
// is already set in config or on the command line — confirm this takes effect.
params.qc_tools = params.qc_tools.collect { it.toLowerCase() }


workflow {
BasecallingAndDemux(samples, data_dir)
Expand Down
24 changes: 19 additions & 5 deletions nextflow.config
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
plugins {
    // NOTE(review): the plugin id was garbled by e-mail obfuscation in this view.
    // It is restored as nf-schema (the plugin main.nf includes from); version 2.1.0
    // is assumed because the `validation.help` scope used in this file appears to
    // need nf-schema >= 2.1.0 — confirm the pinned version against the repository.
    id 'nf-schema@2.1.0'
}

validation {
help {
enabled = true
showHidden = false
}
ignoreParams = ['showHidden', 'show-hidden']
}

resume = true

process {
errorStrategy = 'finish'
}

report {
enabled = true
file = "${params.output_dir}/reports/nextflow_report.html"
overwrite = true
}

includeConfig 'conf/params.config'
includeConfig 'conf/profiles.config'
includeConfig 'conf/containers.config'


def trace_timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
report {
enabled = true
file = "${params.output_dir}/reports/execution_report_${trace_timestamp}.html"
}
74 changes: 74 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://raw.githubusercontent.com//master/nextflow_schema.json",
"title": " pipeline parameters",
"description": "",
"type": "object",
"properties": {
"experiment_name": {
"type": "string",
"description": "Name of the experiment, used for final reports (title and filename)."
},
"data_dir": {
"type": "string",
"description": "Path to the directory containing POD5 files.",
"format": "directory-path"
},
"sample_data": {
"type": "string",
"format": "file-path",
"schema": "/assets/samples_data_schema.json",
"mimetype": "text/csv",
"description": "Path to the CSV file containing the sample data (if not provided, will not perform demux)."
},
"output_dir": {
"type": "string",
"default": "results/",
"description": "Directory for saving results."
},
"fastq_output": {
"type": "boolean",
"default": true,
"description": "Generates FASTQ files if `true`; otherwise, generates UBAM files."
},
"qscore_filter": {
"type": "integer",
"default": 10,
"description": "Minimum QScore threshold for \"pass\" data, used in demultiplexing."
},
"dorado_basecalling_model": {
"type": "string",
"default": "sup",
"description": "Model used for basecalling. Check Dorado help for available options."
},
"dorado_basecalling_gpus": {
"type": "integer",
"default": 1,
"description": "Number of GPUs to allocate for basecalling."
},
"dorado_demux_kit": {
"type": "string",
"default": "SQK-NBD114-96",
"description": "Kit identifier used for demultiplexing."
},
"dorado_demux_both_ends": {
"type": "boolean",
"description": "Demultiplexes using barcodes on both ends (5' and 3') if `true`."
},
"use_dorado_container": {
"type": "boolean",
"default": true,
"description": "Uses Dorado via container if `true`; expects a local installation if `false`."
},
"qc_tools": {
"type": "array",
"items": {
"type": "string",
"enum": ["nanoq", "nanoplot", "fastqc", "toulligqc", "pycoqc"]
},
"description": "Specifies which QC tools to run."
}
},
"required": ["data_dir"]
}
3 changes: 0 additions & 3 deletions params.example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,9 @@ output_dir: demultiplex_results/
fastq_output: true
qscore_filter: 10
dorado_basecalling_model: sup
dorado_basecalling_extra_config: ''
dorado_basecalling_gpus: 1
skip_demultiplexing: false
dorado_demux_kit: SQK-NBD114-96
dorado_demux_both_ends: false
dorado_demux_extra_config: ''
use_dorado_container: true
qc_tools:
- fastqc
Expand Down
20 changes: 11 additions & 9 deletions subworkflows/basecalling_demux.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@ workflow BasecallingAndDemux {
basecalled_reads = basecalled_reads_qscore_filtered
}

if (params.skip_demultiplexing) {
sequences_to_postprocess = basecalled_reads
} else {
if (params.sample_data) {
demultiplexing(qscoreFiltering.out.reads_pass)

demultiplexing.out.classified
Expand All @@ -32,6 +30,8 @@ workflow BasecallingAndDemux {
| mix(demultiplexing.out.unclassified.map { ['unclassified', it] })
| mix(basecalled_reads)
| set { sequences_to_postprocess }
} else {
sequences_to_postprocess = basecalled_reads
}

emit:
Expand All @@ -58,17 +58,18 @@ process basecalling {
path('sequencing_summary.txt') , emit: sequencing_summary

script:
if (params.skip_demultiplexing) {
demux_opts = ''
} else {
if (params.sample_data) {
demux_opts = "--kit-name ${params.dorado_demux_kit}"
demux_opts += params.dorado_demux_both_ends ? ' --barcode-both-ends' : ''
} else {
demux_opts = ''
}
extra_args = task.ext.args ?: ''
"""
dorado basecaller \
--recursive \
--device 'cuda:all' \
${params.dorado_basecalling_extra_config} \
${extra_args} \
${params.dorado_basecalling_model} \
${demux_opts} \
${data_dir} \
Expand All @@ -84,7 +85,7 @@ process qscoreFiltering {
publishDir "${params.output_dir}/basecalled/", \
pattern: '*.bam', \
mode: 'copy', \
enabled: params.skip_demultiplexing && !params.fastq_output
enabled: !params.sample_data && !params.fastq_output
cpus 4

input:
Expand Down Expand Up @@ -119,13 +120,14 @@ process demultiplexing {

script:
emit_fastq = params.fastq_output ? '--emit-fastq' : ''
extra_args = task.ext.args ?: ''
"""
dorado demux \
--output-dir demultiplexed/ \
--no-classify \
${emit_fastq} \
--threads ${task.cpus} \
${params.dorado_demux_extra_config} \
${extra_args} \
${basecalled_reads}
"""
}
Expand Down
13 changes: 9 additions & 4 deletions subworkflows/reports.nf
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,14 @@ process toulligQC {
tuple val('ToulligQC'), eval('toulligqc --version'), topic: versions

script:
report_filename = "${slugify(params.experiment_name)}_toulligqc.html"
barcode_list = ["NB23", "NB24"]
barcodes_opt = !params.skip_demultiplexing && barcode_list
if (params.experiment_name) {
report_filename = "${slugify(params.experiment_name)}_toulligqc.html"
name_opt = "--report-name ${params.experiment_name}"
} else {
report_filename = "toulligqc.html"
name_opt = ''
}
barcodes_opt = params.sample_data && barcode_list
? "--barcoding --barcodes ${barcode_list.join(',')}"
: ''
"""
Expand All @@ -74,8 +79,8 @@ process toulligQC {
--sequencing-summary-source sequencing_summary.mod.txt \
--pod5-source ${data_dir} \
--html-report-path ${report_filename} \
--report-name ${params.experiment_name} \
--qscore-threshold ${params.qscore_filter} \
${name_opt} \
${barcodes_opt}
"""
}
Expand Down

0 comments on commit 66b9bfc

Please sign in to comment.