From 513d14a28e8c9c6c932cb0a8f07a44c85fac7027 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Wed, 13 Sep 2023 15:44:55 -0400
Subject: [PATCH 1/3] chore: rename mkdocs action

---
 .github/workflows/{build_mkdocs.yml => docs.yml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename .github/workflows/{build_mkdocs.yml => docs.yml} (100%)

diff --git a/.github/workflows/build_mkdocs.yml b/.github/workflows/docs.yml
similarity index 100%
rename from .github/workflows/build_mkdocs.yml
rename to .github/workflows/docs.yml

From fc2878d64e2fec51725cac0a07a51abd048e9f5a Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Wed, 13 Sep 2023 16:04:37 -0400
Subject: [PATCH 2/3] feat: input check for samplesheet

---
 assets/README.md                   |   6 ++
 assets/samplesheet_test.csv        |   2 +
 bin/check_samplesheet.py           | 166 +++++++++++++++++++++++++++++
 conf/modules.config                |  10 +-
 conf/test.config                   |  24 +++++
 main.nf                            |  10 ++
 modules/local/samplesheet_check.nf |  27 +++++
 modules/local/trim.nf              |  42 ++++++++
 nextflow.config                    |  12 +++
 submodules/local/input_check.nf    |  51 +++++++++
 10 files changed, 348 insertions(+), 2 deletions(-)
 create mode 100644 assets/README.md
 create mode 100644 assets/samplesheet_test.csv
 create mode 100755 bin/check_samplesheet.py
 create mode 100644 conf/test.config
 create mode 100644 modules/local/samplesheet_check.nf
 create mode 100644 modules/local/trim.nf
 create mode 100644 submodules/local/input_check.nf

diff --git a/assets/README.md b/assets/README.md
new file mode 100644
index 0000000..4846036
--- /dev/null
+++ b/assets/README.md
@@ -0,0 +1,6 @@
+
+## test dataset
+
+- geo https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111309
+- paper https://doi.org/10.1016/j.celrep.2019.02.041
+- sample sheet with just one sample: assets/samplesheet_test.csv
diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv
new file mode 100644
index 0000000..d7be801
--- /dev/null
+++ b/assets/samplesheet_test.csv
@@ -0,0 +1,2 @@
+sample,fastq_1,fastq_2
+haploid_hESC_library_Day30_rep1_run2,data/test/SRR6795299.fastq.gz,
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
new file mode 100755
index 0000000..b6dc239
--- /dev/null
+++ b/bin/check_samplesheet.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+
+"""
+adapted from https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/bin/check_samplesheet.py
+"""
+
+import os
+import sys
+import errno
+import argparse
+
+
+def parse_args(args=None):
+    Description = "Reformat samplesheet file and check its contents."
+    Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("FILE_OUT", help="Output file.")
+    return parser.parse_args(args)
+
+
+def make_dir(path):
+    if len(path) > 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise exception
+
+
+def print_error(error, context="Line", context_str=""):
+    error_str = "ERROR: Please check samplesheet -> {}".format(error)
+    if context != "" and context_str != "":
+        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
+            error, context.strip(), context_str.strip()
+        )
+    print(error_str)
+    sys.exit(1)
+
+
+def check_samplesheet(file_in, file_out):
+    """
+    This function checks that the samplesheet follows the following structure:
+    sample,fastq_1,fastq_2
+    SPT5_T0_REP1,SRR1822153_1.fastq.gz,SRR1822153_2.fastq.gz
+    SPT5_T0_REP2,SRR1822154_1.fastq.gz,SRR1822154_2.fastq.gz
+    """
+
+    sample_mapping_dict = {}
+    with open(file_in, "r", encoding="utf-8-sig") as fin:
+        ## Check header
+        MIN_COLS = 2
+        HEADER = ["sample", "fastq_1", "fastq_2"]
+        header = [x.strip('"') for x in fin.readline().strip().split(",")]
+        if header[: len(HEADER)] != HEADER:
+            print(
+                f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
+            )
+            sys.exit(1)
+
+        ## Check sample entries
+        for line in fin:
+            lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+
+            # Check valid number of columns per row
+            if len(lspl) < len(HEADER):
+                print_error(
+                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                    "Line",
+                    line,
+                )
+            num_cols = len([x for x in lspl if x])
+            if num_cols < MIN_COLS:
+                print_error(
+                    "Invalid number of populated columns (minimum = {})!".format(
+                        MIN_COLS
+                    ),
+                    "Line",
+                    line,
+                )
+
+            ## Check sample name entries
+            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+            if sample.find(" ") != -1:
+                print(
+                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
+                )
+                sample = sample.replace(" ", "_")
+            if not sample:
+                print_error("Sample entry has not been specified!", "Line", line)
+
+            ## Check FastQ file extension
+            for fastq in [fastq_1, fastq_2]:
+                if fastq:
+                    if fastq.find(" ") != -1:
+                        print_error("FastQ file contains spaces!", "Line", line)
+                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
+                        print_error(
+                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
+                            "Line",
+                            line,
+                        )
+
+            ## Auto-detect paired-end/single-end
+            sample_info = []  ## [single_end, fastq_1, fastq_2]
+            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
+                sample_info = ["0", fastq_1, fastq_2]
+            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
+                sample_info = ["1", fastq_1, fastq_2]
+            else:
+                print_error("Invalid combination of columns provided!", "Line", line)
+
+            ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2 ]]}
+            if sample not in sample_mapping_dict:
+                sample_mapping_dict[sample] = [sample_info]
+            else:
+                if sample_info in sample_mapping_dict[sample]:
+                    print_error("Samplesheet contains duplicate rows!", "Line", line)
+                else:
+                    sample_mapping_dict[sample].append(sample_info)
+
+    ## Write validated samplesheet with appropriate columns
+    if len(sample_mapping_dict) > 0:
+        out_dir = os.path.dirname(file_out)
+        make_dir(out_dir)
+        with open(file_out, "w") as fout:
+            fout.write(
+                ",".join(
+                    [
+                        "sample",
+                        "single_end",
+                        "fastq_1",
+                        "fastq_2",
+                    ]
+                )
+                + "\n"
+            )
+            for sample in sorted(sample_mapping_dict.keys()):
+                ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
+                if not all(
+                    x[0] == sample_mapping_dict[sample][0][0]
+                    for x in sample_mapping_dict[sample]
+                ):
+                    print_error(
+                        "Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
+                        "Sample",
+                        sample,
+                    )
+
+                for idx, val in enumerate(sample_mapping_dict[sample]):
+                    plus_T = (
+                        f"_T{idx+1}" if len(sample_mapping_dict[sample]) > 1 else ""
+                    )  # do not append _T{idx} if not needed
+                    fout.write(",".join([f"{sample}{plus_T}"] + val) + "\n")
+    else:
+        print_error("No entries to process!", "Samplesheet", file_in)
+
+
+def main(args=None):
+    args = parse_args(args)
+    check_samplesheet(args.FILE_IN, args.FILE_OUT)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/modules.config b/conf/modules.config
index fdc7887..00794a9 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -25,13 +25,19 @@ process {
 
 process {
     // catch any process without a container.
-    // this must be first before any custom process containers
-    withName: ".*" {
+    // this must be first before any other withName selectors.
+    // note: withName selectors have a higher priority than withLabel
+    // selectors, so this default overrides any withLabel containers.
+    // https://www.nextflow.io/docs/latest/config.html#selector-priority
+    withName: '.*' {
         container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
     }
 
     // custom process containers
+    withName: 'TRIM.*' {
+        container = 'nciccbr/ncigb_cutadapt_v1.18:latest'
+    }
 
     withName: 'MAGECK.*' {
         container = 'quay.io/biocontainers/mageck:0.5.9.5--py39h1f90b4d_3'
     }
diff --git a/conf/test.config b/conf/test.config
new file mode 100644
index 0000000..8602356
--- /dev/null
+++ b/conf/test.config
@@ -0,0 +1,24 @@
+params {
+    config_profile_name        = 'Test dataset'
+    config_profile_description = 'Minimal test dataset for pipeline functionality'
+
+    input  = 'assets/samplesheet_test.csv'
+    outdir = 'results/test'
+    genome = null
+
+    max_cpus   = 32 // for GitHub Actions https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources
+    max_memory = '120.GB'
+    max_time   = '12.h'
+
+    publish_dir_mode = 'symlink'
+}
+dag {
+    enabled   = true
+    overwrite = true
+    file      = 'assets/dag.png'
+}
+report {
+    enabled   = true
+    overwrite = true
+    file      = "${params.outdir}/pipeline_info/execution_report.html"
+}
diff --git a/main.nf b/main.nf
index c8f1faf..ca008c3 100644
--- a/main.nf
+++ b/main.nf
@@ -16,6 +16,13 @@ reads : ${params.input}
 """
 .stripIndent()
 
+
+// SUBMODULES
+include { INPUT_CHECK } from './submodules/local/input_check.nf'
+
+// MODULES
+include { TRIM_SE } from './modules/local/trim.nf'
+
 process BAGEL {
     output:
         path("output.txt")
@@ -76,4 +83,7 @@ workflow {
     BAGEL()
     MAGECK()
     VISPR()
+    INPUT_CHECK(file(params.input), params.seq_center)
+    INPUT_CHECK.out.reads.set{ raw_fastqs }
+    raw_fastqs | TRIM_SE
 }
diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
new file mode 100644
index 0000000..e0ca829
--- /dev/null
+++ b/modules/local/samplesheet_check.nf
@@ -0,0 +1,27 @@
+// adapted from https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/modules/local/samplesheet_check.nf
+process SAMPLESHEET_CHECK {
+    tag "$samplesheet"
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
+        'quay.io/biocontainers/python:3.8.3' }"
+
+    input:
+    path samplesheet
+
+    output:
+    path '*.csv'       , emit: csv
+    path "versions.yml", emit: versions
+
+    script:
+    """
+    check_samplesheet.py \\
+        $samplesheet \\
+        samplesheet.valid.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/trim.nf b/modules/local/trim.nf
new file mode 100644
index 0000000..434a9f3
--- /dev/null
+++ b/modules/local/trim.nf
@@ -0,0 +1,42 @@
+// https://github.com/CCBR/ChiP-seek/blob/9ba449e4855f9710e86f2db7c1d9560de634b3f1/workflow/rules/align.smk#L21
+// https://github.com/nf-core/ampliseq/blob/dev/subworkflows/local/cutadapt_workflow.nf
+// https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/modules/nf-core/modules/trimgalore/main.nf
+process TRIM_SE {
+    tag { meta.id }
+    label 'qc'
+    label 'process_high'
+
+    input:
+    tuple val(meta), path(fastq)
+
+    output:
+    tuple val(meta), path("*.fastq.gz")
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if (meta.single_end) {
+        """
+        nseqs_raw=\$(zgrep "^@" ${fastq} | wc -l)
+        echo "\$nseqs_raw in ${fastq}"
+        cutadapt \
+            --nextseq-trim=2 \
+            --trim-n \
+            -n 5 -O 5 \
+            -q ${params.cutadapt.leadingquality},${params.cutadapt.trailingquality} \
+            -m ${params.cutadapt.minlen} \
+            -b file:${params.cutadapt.adapters} \
+            -j $task.cpus \
+            $fastq |\
+            pigz -p ${task.cpus} > ${prefix}.trimmed.fastq.gz
+        nseqs_trimmed=\$(zgrep "^@" ${prefix}.trimmed.fastq.gz | wc -l)
+        echo "\$nseqs_trimmed in ${prefix}.trimmed.fastq.gz"
+        """
+    } else {
+        error "Paired-end reads are not supported yet"
+    }
+
+    stub:
+    """
+    touch ${meta.id}.trimmed.fastq.gz
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index d9e414b..fb0a2fb 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -4,9 +4,18 @@ params {
     input = null
     outdir = 'results'
     genome = null
+    seq_center = null
+    enable_conda = false
 
     publish_dir_mode = "copy"
 
+    cutadapt {
+        adapters = '/opt2/TruSeq_and_nextera_adapters.consolidated.fa' // this is in the cutadapt container
+        minlen = 20
+        leadingquality = 10
+        trailingquality = 10
+    }
+
 }
 
 includeConfig 'conf/base.config'
@@ -36,6 +45,9 @@ profiles {
     ci_stub {
         includeConfig "conf/ci_stub.config"
     }
+    test {
+        includeConfig "conf/test.config"
+    }
 }
 
 // Export these variables to prevent local Python/R libraries from conflicting with those in the container
diff --git a/submodules/local/input_check.nf b/submodules/local/input_check.nf
new file mode 100644
index 0000000..8c8702c
--- /dev/null
+++ b/submodules/local/input_check.nf
@@ -0,0 +1,51 @@
+// source: https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/subworkflows/local/input_check.nf
+//
+// Check input samplesheet and get read channels
+//
+
+include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check.nf'
+
+workflow INPUT_CHECK {
+    take:
+    samplesheet // file: /path/to/samplesheet.csv
+    seq_center  // string: sequencing center for read group
+
+    main:
+    SAMPLESHEET_CHECK ( samplesheet )
+        .csv
+        .splitCsv ( header:true, sep:',' )
+        .map { create_fastq_channel(it, seq_center) }
+        .set { reads }
+
+    emit:
+    reads                                     // channel: [ val(meta), [ reads ] ]
+    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
+}
+
+// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
+def create_fastq_channel(LinkedHashMap row, String seq_center) {
+    def meta = [:]
+    meta.id = row.sample
+    meta.single_end = row.single_end.toBoolean()
+
+    def read_group = "\'@RG\\tID:${meta.id}\\tSM:${meta.id.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${meta.id}\\tPU:1\'"
+    if (seq_center) {
+        read_group = "\'@RG\\tID:${meta.id}\\tSM:${meta.id.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${meta.id}\\tPU:1\\tCN:${seq_center}\'"
+    }
+    meta.read_group = read_group
+
+    // add path(s) of the fastq file(s) to the meta map
+    def fastq_meta = []
+    if (!file(row.fastq_1).exists()) {
+        exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}"
+    }
+    if (meta.single_end) {
+        fastq_meta = [ meta, [ file(row.fastq_1) ] ]
+    } else {
+        if (!file(row.fastq_2).exists()) {
+            exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
+        }
+        fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
+    }
+    return fastq_meta
+}

From 43eaefe4d8307051ef508dabe7c2dcb88f677b53 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Wed, 13 Sep 2023 16:08:01 -0400
Subject: [PATCH 3/3] chore: move dummy processes to modules

---
 main.nf                 | 66 ++++-----------------------------------
 modules/local/bagel.nf  | 11 +++++++
 modules/local/drugz.nf  | 11 +++++++
 modules/local/mageck.nf | 27 +++++++++++++++++
 4 files changed, 54 insertions(+), 61 deletions(-)
 create mode 100644 modules/local/bagel.nf
 create mode 100644 modules/local/drugz.nf
 create mode 100644 modules/local/mageck.nf

diff --git a/main.nf b/main.nf
index ca008c3..24bf9b6 100644
--- a/main.nf
+++ b/main.nf
@@ -16,74 +16,18 @@ reads : ${params.input}
 """
 .stripIndent()
 
-
 // SUBMODULES
 include { INPUT_CHECK } from './submodules/local/input_check.nf'
 
 // MODULES
 include { TRIM_SE } from './modules/local/trim.nf'
 
-process BAGEL {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        which BAGEL.py >> output.txt
-        """
-}
-
-process DRUGZ {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        which drugz.py >> output.txt
-        """
-}
-
-process MAGECK {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        which mageck >> output.txt
-        """
-}
-
-process VISPR {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        which vispr >> output.txt
-        """
-}
-
-process BASE {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        python -V >> output.txt
-        """
-}
-workflow {
-    BASE()
-    DRUGZ()
-    BAGEL()
-    MAGECK()
-    VISPR()
+workflow CRUISE {
     INPUT_CHECK(file(params.input), params.seq_center)
     INPUT_CHECK.out.reads.set{ raw_fastqs }
     raw_fastqs | TRIM_SE
 }
+
+workflow {
+    CRUISE()
+}
diff --git a/modules/local/bagel.nf b/modules/local/bagel.nf
new file mode 100644
index 0000000..43c5eb9
--- /dev/null
+++ b/modules/local/bagel.nf
@@ -0,0 +1,11 @@
+
+process BAGEL {
+    output:
+        path("output.txt")
+
+    script:
+        """
+        uname -a >> output.txt
+        which BAGEL.py >> output.txt
+        """
+}
diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf
new file mode 100644
index 0000000..0562dbe
--- /dev/null
+++ b/modules/local/drugz.nf
@@ -0,0 +1,11 @@
+
+process DRUGZ {
+    output:
+        path("output.txt")
+
+    script:
+        """
+        uname -a >> output.txt
+        which drugz.py >> output.txt
+        """
+}
diff --git a/modules/local/mageck.nf b/modules/local/mageck.nf
new file mode 100644
index 0000000..96b714f
--- /dev/null
+++ b/modules/local/mageck.nf
@@ -0,0 +1,27 @@
+
+process MAGECK_COUNT {
+
+}
+
+process MAGECK {
+    output:
+        path("output.txt")
+
+    script:
+        """
+        uname -a >> output.txt
+        which mageck >> output.txt
+        """
+}
+
+
+process VISPR {
+    output:
+        path("output.txt")
+
+    script:
+        """
+        uname -a >> output.txt
+        which vispr >> output.txt
+        """
+}
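
Usage note: with these three patches applied, the new samplesheet check, test
profile, and trimming step can be exercised roughly as follows, assuming the
FASTQ listed in assets/samplesheet_test.csv has been downloaded to data/test/
and a container engine is configured for the images above (any profiles besides
'test' depend on parts of nextflow.config not shown in these patches):

    # run the bundled single-sample test dataset
    nextflow run main.nf -profile test

    # or validate a samplesheet on its own; check_samplesheet.py writes a
    # reformatted copy with an added single_end column and exits non-zero
    # on any validation error
    python3 bin/check_samplesheet.py assets/samplesheet_test.csv samplesheet.valid.csv

For that samplesheet, each element emitted on INPUT_CHECK.out.reads takes the form
[ [id:'haploid_hESC_library_Day30_rep1_run2', single_end:true, read_group:...], [ SRR6795299.fastq.gz ] ],
which is the tuple shape TRIM_SE expects.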