WIP

epi2me-labs · Feb 17, 2023 · 7afb2ea · 7afb2ea
1 parent 6985e44
commit 7afb2ea
Show file tree

Hide file tree

Showing 99 changed files with 35 additions and 1,703 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,77 +4,6 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [unreleased]
-### Added
-- GitHub issues template.
-- Return of metadata with fastqingress.
-- Check of number of samples and barcoded directories.
-- Example of how to use the metadata from `fastqingress`.
-- Implemented `--version`
-- `fastcat_extra_args` option to `fastq_ingress` to pass arbitrary arguments to `fastcat` (defaults to empty string).
-- `fastcat_stats` option to `fastq_ingress` to force generation of `fastcat` stats even when the input is only a single file (default is false).
-### Changed
-- `fastq_ingress` now returns `[metamap, path-to-fastcat-seqs, path-to-fastcat-stats | null]`.
-- Bumped base container to v0.2.0.
-- Use groovy script to ping after workflow has run.
-- Removed sanitize fastq option.
-- fastq_ingress now removes unclassified read folders by default.
-- Workflow name and version is now more prominently displayed on start
-### Fixed
-- Output argument in Fastqingress homogenised.
-- Sanitize fastq intermittent null object error.
-- Add `*.pyc` and `*.pyo` ignores to wf-template .gitignore
-### Note
-- Bumped version to `v4` to align versioning with Launcher v4
-
-## [v0.2.0]
-### Added
-- default process label parameter
-- Added `params.wf.example_cmd` list to populate `--help`
-### Changed
-- Update WorkflowMain.groovy to provide better `--help`
-
-## [v0.1.0]
-### Changed
-- `sample_name` to `sample_id` throughout to match MinKNOW samplesheet.
-### Added
-- Singularity profile included in base config.
-- Numerous other changes that have been lost to the mists of time.
-
-## [v0.0.7]
-### Added
-- Fastqingress module for common handling of (possibly
-  multiplexed) inputs.
-- Optimized container size through removal of various
-  conda cruft.
-### Changed
-- Use mamba by default for building conda environments.
-- Cut down README to items specific to workflow.
-### Fixed
-- Incorrect specification of conda environment file in Nextflow config.
-
-## [v0.0.6]
-### Changed
-- Explicitly install into base conda env
-
-## [v0.0.5]
-### Added
-- Software versioning report example.
-
-## [v0.0.4]
-### Changed
-- Version bump to test CI.
-
-## [v0.0.3]
-### Changed
-- Moved all CI to templates.
-- Use canned aplanat report components.
-
-## [v0.0.2]
-### Added
-- CI release checks.
-- Create pre-releases in CI from dev branch.
-
 ## [v0.0.1]
 
 First release.
diff --git a/lib/fastqingress.nf b/lib/fastqingress.nf
@@ -167,7 +167,7 @@ def watch_path(Map margs) {
 
 
 process mv_or_pigz {
-    label "wftemplate"
+    label "wfamplicon"
     cpus params.threads
     input:
         tuple val(meta), path(input)
@@ -190,7 +190,7 @@ process mv_or_pigz {
 
 
 process fastcat {
-    label "wftemplate"
+    label "wfamplicon"
     cpus params.threads
     input:
         tuple val(meta), path(input)
@@ -408,7 +408,7 @@ def get_sample_sheet(Path sample_sheet) {
  * @return: string (optional)
  */
 process validate_sample_sheet {
-    label "wftemplate"
+    label "wfamplicon"
     input: path csv
     output: stdout
     """

diff --git a/main.nf b/main.nf
@@ -1,24 +1,16 @@
 #!/usr/bin/env nextflow
 
-// Developer notes
-//
-// This template workflow provides a basic structure to copy in order
-// to create a new workflow. Current recommended practices are:
-//     i) create a simple command-line interface.
-//    ii) include an abstract workflow scope named "pipeline" to be used
-//        in a module fashion
-//   iii) a second concrete, but anonymous, workflow scope to be used
-//        as an entry point when using this workflow in isolation.
-
 import groovy.json.JsonBuilder
 nextflow.enable.dsl = 2
 
 include { fastq_ingress } from './lib/fastqingress'
+include { clusterReads } from './subworkflows/clustering/vsearch'
+include { draftAssembly } from './subworkflows/assembly/flye'
 
 OPTIONAL_FILE = file("$projectDir/data/OPTIONAL_FILE")
 
 process getVersions {
-    label "wftemplate"
+    label "wfamplicon"
     cpus 1
     output:
         path "versions.txt"
@@ -31,7 +23,7 @@ process getVersions {
 
 
 process getParams {
-    label "wftemplate"
+    label "wfamplicon"
     cpus 1
     output:
         path "params.json"
@@ -44,38 +36,14 @@ process getParams {
 }
 
 
-process makeReport {
-    label "wftemplate"
-    input:
-        val metadata
-        path per_read_stats
-        path "versions/*"
-        path "params.json"
-    output:
-        path "wf-template-*.html"
-    script:
-        String report_name = "wf-template-report.html"
-        String metadata = new JsonBuilder(metadata).toPrettyString()
-        String stats_args = \
-            (per_read_stats.name == OPTIONAL_FILE.name) ? "" : "--stats $per_read_stats"
-    """
-    echo '${metadata}' > metadata.json
-    workflow-glue report $report_name \
-        --versions versions \
-        $stats_args \
-        --params params.json \
-        --metadata metadata.json
-    """
-}
-
 
 // See https://github.com/nextflow-io/nextflow/issues/1636. This is the only way to
 // publish files from a workflow whilst decoupling the publish from the process steps.
 // The process takes a tuple containing the filename and the name of a sub-directory to
 // put the file into. If the latter is `null`, puts it into the top-level directory.
 process output {
     // publish inputs to output directory
-    label "wftemplate"
+    label "wfamplicon"
     publishDir (
         params.out_dir,
         mode: "copy",
@@ -89,52 +57,31 @@ process output {
     """
 }
 
-// Creates a new directory named after the sample alias and moves the fastcat results
-// into it.
-process collect_fastq_ingress_results_in_dir {
-    label "wftemplate"
-    input:
-        tuple val(meta), path(concat_seqs), path(fastcat_stats)
-    output:
-        path "*"
-    script:
-    String outdir = meta["alias"]
-    String fastcat_stats = \
-        (fastcat_stats.name == OPTIONAL_FILE.name) ? "" : fastcat_stats
-    """
-    mkdir $outdir
-    mv $concat_seqs $fastcat_stats $outdir
-    """
-}
-
 // workflow module
 workflow pipeline {
     take:
         reads
     main:
-        per_read_stats = reads.map {
-            it[2] ? it[2].resolve('per-read-stats.tsv') : null
-        }
-        | collectFile ( keepHeader: true )
-        | ifEmpty ( OPTIONAL_FILE )
         software_versions = getVersions()
         workflow_params = getParams()
-        metadata = reads.map { it[0] }.toList()
-        report = makeReport(
-            metadata, per_read_stats, software_versions.collect(), workflow_params
+
+        // the reads have already been filtered by `fastcat` --> cluster next
+        clustering = clusterReads(
+            reads.map {it[0..1]},
+            params.min_cluster_size,
         )
-        reads
-        | map { [it[0], it[1], it[2] ?: OPTIONAL_FILE ] }
-        | collect_fastq_ingress_results_in_dir
+
+
     emit:
-        fastq_ingress_results = collect_fastq_ingress_results_in_dir.out
-        report
         workflow_params
-        // TODO: use something more useful as telemetry
-        telemetry = workflow_params
 }
 
 
+params.min_read_length = 300
+params.max_read_length = 3600
+params.min_read_qual = 8
+params.min_cluster_size = 0.2
+
 // entrypoint workflow
 WorkflowMain.initialise(workflow, params, log)
 workflow {
@@ -143,23 +90,29 @@ workflow {
         Pinguscript.ping_post(workflow, "start", "none", params.out_dir, params)
     }
 
+    ArrayList fastcat_extra_args = []
+    if (params.min_read_length) { fastcat_extra_args << "-a $params.min_read_length" }
+    if (params.max_read_length) { fastcat_extra_args << "-b $params.max_read_length" }
+    if (params.min_read_qual) { fastcat_extra_args << "-q $params.min_read_qual" }
 
     samples = fastq_ingress([
         "input":params.fastq,
         "sample":params.sample,
         "sample_sheet":params.sample_sheet,
         "analyse_unclassified":params.analyse_unclassified,
-        "fastcat_stats": params.wf.fastcat_stats,
-        "fastcat_extra_args": ""])
+        "fastcat_stats": false,
+        "fastcat_extra_args": fastcat_extra_args.join(" ")])
+
+    // looks like this is the most robust way to check if a `param` coming from the
+    // command line is a number
+    if (
+        params.min_cluster_size instanceof String ||
+        !params.min_cluster_size.toString().isNumber()
+    ) {
+        error "`--min_cluster_size` must be a float or integer."
+    }
 
     pipeline(samples)
-    pipeline.out.fastq_ingress_results
-    | map { [it, "fastq_ingress_results"] }
-    | concat (
-        pipeline.out.report.concat(pipeline.out.workflow_params)
-        | map { [it, null] }
-    )
-    | output
 }
 
 if (params.disable_ping == false) {

diff --git a/test/run_fastq_ingress_test.sh b/test/run_fastq_ingress_test.sh