From 66b9bfc5de84f00ae7859a2805f1db7f408f4e77 Mon Sep 17 00:00:00 2001
From: "Diego Alvarez S." <dialvarezs@gmail.com>
Date: Wed, 4 Sep 2024 22:29:31 -0300
Subject: [PATCH] Add nf-schema, remove unnecessary parameters. (#4)

---
 README.md                         | 32 +++++++------
 assets/samples_data_schema.json   | 11 +++++
 conf/params.config                |  8 ++--
 main.nf                           | 28 +++++-------
 nextflow.config                   | 24 +++++++---
 nextflow_schema.json              | 74 +++++++++++++++++++++++++++++++
 params.example.yml                |  3 --
 subworkflows/basecalling_demux.nf | 20 +++++----
 subworkflows/reports.nf           | 13 ++++--
 9 files changed, 154 insertions(+), 59 deletions(-)
 create mode 100644 assets/samples_data_schema.json
 create mode 100644 nextflow_schema.json

diff --git a/README.md b/README.md
index 34a74d4..ad8e8d5 100644
--- a/README.md
+++ b/README.md
@@ -30,25 +30,23 @@ Uses Dorado for basecalling and demultiplexing.
 
 ## Pipeline Parameters
 
-| Parameter                         | Required | Default                            | Description                                                                                     |
-| --------------------------------- | -------- | ---------------------------------- | ----------------------------------------------------------------------------------------------- |
-| `experiment_name`                 | No       | -                                  | Name of the experiment, used for final reports (title and filename).                            |
-| `data_dir`                        | Yes      | -                                  | Path to the directory containing POD5 files.                                                    |
-| `sample_data`                     | Yes      | `input/samples.csv`                | Path to the CSV file containing the sample data (required if demultiplexing).                   |
-| `output_dir`                      | No       | `demultiplex_results`              | Directory for saving results.                                                                   |
-| `fastq_output`                    | No       | `true`                             | Generates FASTQ files if `true`; otherwise, generates UBAM files.                               |
-| `qscore_filter`                   | No       | `10`                               | Minimum QScore threshold for "pass" data, used in demultiplexing.                               |
-| `dorado_basecalling_model`        | No       | `sup`                              | Model used for basecalling. Check Dorado help for available options.                            |
-| `dorado_basecalling_extra_config` | No       | -                                  | Additional configuration options for Dorado basecalling.                                        |
-| `dorado_basecalling_gpus`         | No       | `1`                                | Number of GPUs to allocate for basecalling.                                                     |
-| `skip_demultiplexing`             | No       | `false`                            | Skips demultiplexing if `true`.                                                                 |
-| `dorado_demux_kit`                | No       | `EXP-NBD196`                       | Kit identifier used for demultiplexing.                                                         |
-| `dorado_demux_both_ends`          | No       | `false`                            | Demultiplexes using barcodes on both ends (5' and 3') if `true`.                                |
-| `dorado_demux_extra_config`       | No       | -                                  | Additional configuration options for Dorado demultiplexing.                                     |
-| `use_dorado_container`            | No       | `true`                             | Uses Dorado via container if `true`; expects a local installation if `false`.                   |
-| `qc_tools`                        | No       | `['fastqc', 'nanoq', 'toulligqc']` | Specifies which QC tools to run. Options: 'nanoq', 'nanoplot', 'fastqc', 'toulligqc', 'pycoqc'. |
+| Parameter                  | Required | Default                            | Description                                                                                     |
+| -------------------------- | -------- | ---------------------------------- | ----------------------------------------------------------------------------------------------- |
+| `experiment_name`          | No       | -                                  | Name of the experiment, used for final reports (title and filename).                            |
+| `data_dir`                 | Yes      | -                                  | Path to the directory containing POD5 files.                                                    |
+| `sample_data`              | No       | -                                  | Path to the CSV file containing the sample data (if not provided, will not perform demux).      |
+| `output_dir`               | No       | `results/`                         | Directory for saving results.                                                                   |
+| `fastq_output`             | No       | `true`                             | Generates FASTQ files if `true`; otherwise, generates UBAM files.                               |
+| `qscore_filter`            | No       | `10`                               | Minimum QScore threshold for "pass" data, used in demultiplexing.                               |
+| `dorado_basecalling_model` | No       | `sup`                              | Model used for basecalling. Check Dorado help for available options.                            |
+| `dorado_basecalling_gpus`  | No       | `1`                                | Number of GPUs to allocate for basecalling.                                                     |
+| `dorado_demux_kit`         | No       | `SQK-NBD114-96`                    | Kit identifier used for demultiplexing.                                                         |
+| `dorado_demux_both_ends`   | No       | `false`                            | Demultiplexes using barcodes on both ends (5' and 3') if `true`.                                |
+| `use_dorado_container`     | No       | `true`                             | Uses Dorado via container if `true`; expects a local installation if `false`.                   |
+| `qc_tools`                 | No       | `['fastqc', 'nanoq', 'toulligqc']` | Specifies which QC tools to run. Options: 'nanoq', 'nanoplot', 'fastqc', 'toulligqc', 'pycoqc'. |
 
 ## Considerations
 
 - The pipeline is compatible with SLURM clusters; use `-profile slurm`.
 - GPU resources are required for basecalling. On SLURM, ensure jobs request GPUs with the `--gres=gpu:X` option.
+- Extra arguments can be passed to the Dorado basecalling and demultiplexing processes via `ext.args` in the process configuration.
diff --git a/assets/samples_data_schema.json b/assets/samples_data_schema.json
new file mode 100644
index 0000000..b451cf9
--- /dev/null
+++ b/assets/samples_data_schema.json
@@ -0,0 +1,11 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "array",
+  "items": {
+    "type": "object",
+    "properties": {
+      "barcode": { "type": "string" },
+      "sample": { "type": "string" }
+    }
+  }
+}
diff --git a/conf/params.config b/conf/params.config
index 6e9d38c..d6dee49 100644
--- a/conf/params.config
+++ b/conf/params.config
@@ -1,17 +1,15 @@
 params {
 	experiment_name = ''
 	data_dir = null
-	sample_data = 'input/samples.csv'
-	output_dir = 'demultiplex_results/'
+	sample_data = null
+	output_dir = 'results/'
 	fastq_output = true
 	qscore_filter = 10
 	dorado_basecalling_model = 'sup'
-	dorado_basecalling_extra_config = ''
 	dorado_basecalling_gpus = 1
-	skip_demultiplexing = false
 	dorado_demux_kit = 'SQK-NBD114-96'
 	dorado_demux_both_ends = false
-	dorado_demux_extra_config = ''
 	use_dorado_container = true
 	qc_tools = ['fastqc', 'nanoq', 'toulligqc']
+	showHidden = false
 }
diff --git a/main.nf b/main.nf
index 1bdf0eb..7e5d527 100644
--- a/main.nf
+++ b/main.nf
@@ -1,27 +1,23 @@
 #!/usr/bin/env nextflow
-include { BasecallingAndDemux } from './subworkflows/basecalling_demux.nf'
-include { QualityCheck }        from './subworkflows/quality_check.nf'
-include { GenerateReports }     from './subworkflows/reports.nf'
-include { CollectVersions }     from './subworkflows/versions.nf'
+include { validateParameters; samplesheetToList } from 'plugin/nf-schema'
+include { BasecallingAndDemux }                   from './subworkflows/basecalling_demux.nf'
+include { QualityCheck }                          from './subworkflows/quality_check.nf'
+include { GenerateReports }                       from './subworkflows/reports.nf'
+include { CollectVersions }                       from './subworkflows/versions.nf'
 
 
-// check and prepare input channels
-data_dir = file(params.data_dir, checkIfExists: true, type: 'dir')
+// validate and prepare input channels
+validateParameters()
+
+data_dir = file(params.data_dir, checkIfExists: true, type: 'dir')  // fail fast here: do not rely on schema validation alone to prove the directory exists
 multiqc_config = file("${workflow.projectDir}/tool_conf/multiqc_config.yaml", checkIfExists: true)
 
-if (params.skip_demultiplexing) {
-  samples = channel.fromList([])
+if (params.sample_data) {
+  samples = channel.fromList(samplesheetToList(params.sample_data, "assets/samples_data_schema.json"))
 } else {
-  file(params.sample_data, checkIfExists: true)
-  channel
-    .fromPath(params.sample_data)
-    .splitCsv(header: true)
-    .map { row -> [row.barcode, row.sample] }
-    .set { samples }
+  samples = channel.empty()
 }
 
-params.qc_tools = params.qc_tools.each { it.toLowerCase() }
-
 
 workflow {
   BasecallingAndDemux(samples, data_dir)
diff --git a/nextflow.config b/nextflow.config
index 64e5731..9302c1e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,15 +1,29 @@
+plugins {
+  id 'nf-schema@2.1.0'
+}
+
+validation {
+  help {
+    enabled    = true
+    showHidden = false
+  }
+  ignoreParams = ['showHidden', 'show-hidden']
+}
+
 resume = true
 
 process {
   errorStrategy = 'finish'
 }
 
-report {
-  enabled = true
-  file = "${params.output_dir}/reports/nextflow_report.html"
-  overwrite = true
-}
 
 includeConfig 'conf/params.config'
 includeConfig 'conf/profiles.config'
 includeConfig 'conf/containers.config'
+
+
+def trace_timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
+report {
+  enabled = true
+  file    = "${params.output_dir}/reports/execution_report_${trace_timestamp}.html"
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
new file mode 100644
index 0000000..1ddd49f
--- /dev/null
+++ b/nextflow_schema.json
@@ -0,0 +1,74 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://raw.githubusercontent.com//master/nextflow_schema.json",
+  "title": "Pipeline parameters",
+  "description": "",
+  "type": "object",
+  "properties": {
+    "experiment_name": {
+      "type": "string",
+      "description": "Name of the experiment, used for final reports (title and filename)."
+    },
+    "data_dir": {
+      "type": "string",
+      "description": "Path to the directory containing POD5 files.",
+      "format": "directory-path"
+    },
+    "sample_data": {
+      "type": "string",
+      "default": "input/samples.csv",
+      "format": "file-path",
+      "schema": "/assets/samples_data_schema.json",
+      "mimetype": "text/csv",
+      "description": "Path to the CSV file containing the sample data (if not provided, will not perform demux)."
+    },
+    "output_dir": {
+      "type": "string",
+      "default": "results/",
+      "description": "Directory for saving results."
+    },
+    "fastq_output": {
+      "type": "boolean",
+      "default": true,
+      "description": "Generates FASTQ files if `true`; otherwise, generates UBAM files."
+    },
+    "qscore_filter": {
+      "type": "integer",
+      "default": 10,
+      "description": "Minimum QScore threshold for \"pass\" data, used in demultiplexing."
+    },
+    "dorado_basecalling_model": {
+      "type": "string",
+      "default": "sup",
+      "description": "Model used for basecalling. Check Dorado help for available options."
+    },
+    "dorado_basecalling_gpus": {
+      "type": "integer",
+      "default": 1,
+      "description": "Number of GPUs to allocate for basecalling."
+    },
+    "dorado_demux_kit": {
+      "type": "string",
+      "default": "SQK-NBD114-96",
+      "description": "Kit identifier used for demultiplexing."
+    },
+    "dorado_demux_both_ends": {
+      "type": "boolean",
+      "description": "Demultiplexes using barcodes on both ends (5' and 3') if `true`."
+    },
+    "use_dorado_container": {
+      "type": "boolean",
+      "default": true,
+      "description": "Uses Dorado via container if `true`; expects a local installation if `false`."
+    },
+    "qc_tools": {
+      "type": "array",
+      "items": {
+        "type": "string",
+        "enum": ["nanoq", "nanoplot", "fastqc", "toulligqc", "pycoqc"]
+      },
+      "description": "Specifies which QC tools to run."
+    }
+  },
+  "required": ["data_dir"]
+}
diff --git a/params.example.yml b/params.example.yml
index dff8b5d..ceecbc3 100644
--- a/params.example.yml
+++ b/params.example.yml
@@ -5,12 +5,9 @@ output_dir: demultiplex_results/
 fastq_output: true
 qscore_filter: 10
 dorado_basecalling_model: sup
-dorado_basecalling_extra_config: ''
 dorado_basecalling_gpus: 1
-skip_demultiplexing: false
 dorado_demux_kit: SQK-NBD114-96
 dorado_demux_both_ends: false
-dorado_demux_extra_config: ''
 use_dorado_container: true
 qc_tools:
   - fastqc
diff --git a/subworkflows/basecalling_demux.nf b/subworkflows/basecalling_demux.nf
index 2fc10d9..c46c5e6 100644
--- a/subworkflows/basecalling_demux.nf
+++ b/subworkflows/basecalling_demux.nf
@@ -17,9 +17,7 @@ workflow BasecallingAndDemux {
       basecalled_reads = basecalled_reads_qscore_filtered
     }
 
-    if (params.skip_demultiplexing) {
-      sequences_to_postprocess = basecalled_reads
-    } else {
+    if (params.sample_data) {
       demultiplexing(qscoreFiltering.out.reads_pass)
 
       demultiplexing.out.classified
@@ -32,6 +30,8 @@ workflow BasecallingAndDemux {
         | mix(demultiplexing.out.unclassified.map { ['unclassified', it] })
         | mix(basecalled_reads)
         | set { sequences_to_postprocess }
+    } else {
+      sequences_to_postprocess = basecalled_reads
     }
 
   emit:
@@ -58,17 +58,18 @@ process basecalling {
   path('sequencing_summary.txt')                  , emit: sequencing_summary
 
   script:
-  if (params.skip_demultiplexing) {
-    demux_opts = ''
-  } else {
+  if (params.sample_data) {
     demux_opts = "--kit-name ${params.dorado_demux_kit}"
     demux_opts += params.dorado_demux_both_ends ? ' --barcode-both-ends' : ''
+  } else {
+    demux_opts = ''
   }
+  extra_args = task.ext.args ?: ''
   """
   dorado basecaller \
     --recursive \
     --device 'cuda:all' \
-    ${params.dorado_basecalling_extra_config} \
+    ${extra_args} \
     ${params.dorado_basecalling_model} \
     ${demux_opts} \
     ${data_dir} \
@@ -84,7 +85,7 @@ process qscoreFiltering {
   publishDir "${params.output_dir}/basecalled/", \
     pattern: '*.bam', \
     mode: 'copy', \
-    enabled: params.skip_demultiplexing && !params.fastq_output
+    enabled: !params.sample_data && !params.fastq_output
   cpus 4
 
   input:
@@ -119,13 +120,14 @@ process demultiplexing {
 
   script:
   emit_fastq = params.fastq_output ? '--emit-fastq' : ''
+  extra_args = task.ext.args ?: ''
   """
   dorado demux \
     --output-dir demultiplexed/ \
     --no-classify \
     ${emit_fastq} \
     --threads ${task.cpus} \
-    ${params.dorado_demux_extra_config} \
+    ${extra_args} \
     ${basecalled_reads}
   """
 }
diff --git a/subworkflows/reports.nf b/subworkflows/reports.nf
index 85369c4..edb47e1 100644
--- a/subworkflows/reports.nf
+++ b/subworkflows/reports.nf
@@ -62,9 +62,14 @@ process toulligQC {
   tuple val('ToulligQC'), eval('toulligqc --version'), topic: versions
   
   script:
-  report_filename = "${slugify(params.experiment_name)}_toulligqc.html"
-  barcode_list = ["NB23", "NB24"]
-  barcodes_opt = !params.skip_demultiplexing && barcode_list
+  if (params.experiment_name) {  // name given: slugified report file name plus an explicit --report-name flag
+    report_filename = "${slugify(params.experiment_name)}_toulligqc.html"
+    name_opt = "--report-name '${params.experiment_name}'"  // single-quoted so a name containing spaces stays one shell argument
+  } else {  // no name: fixed file name and no --report-name flag
+    report_filename = "toulligqc.html"
+    name_opt = ''
+  }
+  barcodes_opt = params.sample_data && barcode_list
     ? "--barcoding --barcodes ${barcode_list.join(',')}"
     : ''
   """
@@ -74,8 +79,8 @@ process toulligQC {
     --sequencing-summary-source sequencing_summary.mod.txt \
     --pod5-source ${data_dir} \
     --html-report-path ${report_filename} \
-    --report-name ${params.experiment_name} \
     --qscore-threshold ${params.qscore_filter} \
+    ${name_opt} \
     ${barcodes_opt}
   """
 }