Merge branch 'main' into ci-build

CCBR · Sep 13, 2023 · 061374f · 061374f
2 parents b9654a1 + 43eaefe
commit 061374f
Show file tree

Hide file tree

Showing 14 changed files with 396 additions and 57 deletions.
diff --git a/.github/workflows/build_mkdocs.yml → .github/workflows/docs.yml b/.github/workflows/build_mkdocs.yml → .github/workflows/docs.yml
diff --git a/assets/README.md b/assets/README.md
diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+
+"""
+adapted from https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/bin/check_samplesheet.py
+"""
+
+import os
+import sys
+import errno
+import argparse
+
+
+def parse_args(args=None):
+    """
+    Parse the command-line arguments of the samplesheet checker.
+
+    args: optional list of argument strings, handed to argparse unchanged
+    (argparse reads sys.argv when it is None).
+    Returns the argparse namespace with the attributes FILE_IN and FILE_OUT.
+    """
+    cli = argparse.ArgumentParser(
+        description="Reformat samplesheet file and check its contents.",
+        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
+    )
+    for name, help_text in (
+        ("FILE_IN", "Input samplesheet file."),
+        ("FILE_OUT", "Output file."),
+    ):
+        cli.add_argument(name, help=help_text)
+    return cli.parse_args(args)
+
+
+def make_dir(path):
+    """
+    Create the directory `path`, including missing parents.
+
+    An empty path is ignored, and an already existing path is not an error;
+    any other OSError is re-raised.
+    """
+    if len(path) == 0:
+        return
+    try:
+        os.makedirs(path)
+    except OSError as exception:
+        # Only "already exists" is tolerated; everything else propagates.
+        if exception.errno == errno.EEXIST:
+            return
+        raise exception
+
+
+def print_error(error, context="Line", context_str=""):
+    """
+    Print a samplesheet error message and exit with status 1.
+
+    error: description of the problem.
+    context, context_str: label and offending text; they are appended on a
+    second line (both stripped) only when neither of them is empty.
+    """
+    if context == "" or context_str == "":
+        message = "ERROR: Please check samplesheet -> {}".format(error)
+    else:
+        message = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
+            error, context.strip(), context_str.strip()
+        )
+    print(message)
+    sys.exit(1)
+
+
+def check_samplesheet(file_in, file_out):
+    """
+    Validate a samplesheet and write a reformatted copy of it.
+
+    The input is expected to follow this structure:
+
+    sample,fastq_1,fastq_2
+    SPT5_T0_REP1,SRR1822153_1.fastq.gz,SRR1822153_2.fastq.gz
+    SPT5_T0_REP2,SRR1822154_1.fastq.gz,SRR1822154_2.fastq.gz
+
+    The output has the columns sample,single_end,fastq_1,fastq_2, where
+    single_end is "1" for rows without a fastq_2 entry and "0" otherwise.
+    Samples that occur on more than one row get a _T1, _T2, ... suffix.
+
+    file_in: path of the samplesheet to check.
+    file_out: path of the validated samplesheet; its directory is created
+    when missing.
+
+    Every validation failure prints an error message and terminates the
+    program with exit status 1. Nothing is written to file_out in that case.
+    """
+
+    sample_mapping_dict = {}
+    with open(file_in, "r", encoding="utf-8-sig") as fin:
+        ## Check header
+        MIN_COLS = 2
+        HEADER = ["sample", "fastq_1", "fastq_2"]
+        header = [x.strip('"') for x in fin.readline().strip().split(",")]
+        if header[: len(HEADER)] != HEADER:
+            print(
+                f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
+            )
+            sys.exit(1)
+
+        ## Check sample entries
+        for line in fin:
+            ## Skip blank lines, e.g. an empty line at the end of the file
+            if not line.strip():
+                continue
+
+            lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+
+            # Check valid number of columns per row
+            if len(lspl) < len(HEADER):
+                print_error(
+                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                    "Line",
+                    line,
+                )
+            num_cols = sum(1 for x in lspl if x)
+            if num_cols < MIN_COLS:
+                print_error(
+                    "Invalid number of populated columns (minimum = {})!".format(
+                        MIN_COLS
+                    ),
+                    "Line",
+                    line,
+                )
+
+            ## Check sample name entries
+            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+            if " " in sample:
+                print(
+                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
+                )
+                sample = sample.replace(" ", "_")
+            if not sample:
+                print_error("Sample entry has not been specified!", "Line", line)
+
+            ## Check FastQ file extension
+            for fastq in [fastq_1, fastq_2]:
+                if fastq:
+                    if " " in fastq:
+                        print_error("FastQ file contains spaces!", "Line", line)
+                    if not fastq.endswith((".fastq.gz", ".fq.gz")):
+                        print_error(
+                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
+                            "Line",
+                            line,
+                        )
+
+            ## Auto-detect paired-end/single-end
+            sample_info = []  ## [single_end, fastq_1, fastq_2]
+            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
+                sample_info = ["0", fastq_1, fastq_2]
+            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
+                sample_info = ["1", fastq_1, fastq_2]
+            else:
+                print_error("Invalid combination of columns provided!", "Line", line)
+
+            ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2,]]}
+            if sample not in sample_mapping_dict:
+                sample_mapping_dict[sample] = [sample_info]
+            else:
+                if sample_info in sample_mapping_dict[sample]:
+                    print_error("Samplesheet contains duplicate rows!", "Line", line)
+                else:
+                    sample_mapping_dict[sample].append(sample_info)
+
+    ## The label and the path are passed separately so that print_error
+    ## actually includes the samplesheet path in the message
+    if not sample_mapping_dict:
+        print_error("No entries to process!", "Samplesheet", file_in)
+
+    ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
+    ## This happens before the output is opened so a failure leaves no partial file behind
+    for sample in sorted(sample_mapping_dict):
+        runs = sample_mapping_dict[sample]
+        if any(run[0] != runs[0][0] for run in runs):
+            print_error(
+                "Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
+                "Sample",
+                sample,
+            )
+
+    ## Write validated samplesheet with appropriate columns
+    make_dir(os.path.dirname(file_out))
+    with open(file_out, "w") as fout:
+        fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
+        for sample in sorted(sample_mapping_dict):
+            runs = sample_mapping_dict[sample]
+            for idx, val in enumerate(runs, start=1):
+                # do not append _T{idx} if not needed
+                plus_T = f"_T{idx}" if len(runs) > 1 else ""
+                fout.write(",".join([f"{sample}{plus_T}"] + val) + "\n")
+
+
+def main(args=None):
+    """
+    Command-line entry point: parse the arguments and check the samplesheet.
+
+    args: optional list of argument strings forwarded to parse_args.
+    """
+    parsed = parse_args(args)
+    check_samplesheet(parsed.FILE_IN, parsed.FILE_OUT)
+
+
+# Script entry point. main() has no return value, so sys.exit() receives None
+# here; validation failures exit earlier with status 1 via sys.exit(1).
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/modules.config b/conf/modules.config
@@ -25,13 +25,19 @@ process {
 process {
 
     // catch any process without a container.
-    // this must be first before any custom process containers
-    withName: ".*" {
+    // this must be first before any other withName selectors.
+    // note that it also overrides any withLabel selectors, because
+    // withName selectors have a higher priority than withLabel selectors.
+    // https://www.nextflow.io/docs/latest/config.html#selector-priority
+    withName: '.*' {
         container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
     }
 
     // custom process containers
 
+    withName: 'TRIM.*' {
+        container = 'nciccbr/ncigb_cutadapt_v1.18:latest'
+    }
     withName: 'MAGECK.*' {
         container = 'quay.io/biocontainers/mageck:0.5.9.5--py39h1f90b4d_3'
     }

diff --git a/conf/test.config b/conf/test.config
@@ -0,0 +1,24 @@
+// Parameters of the 'Test dataset' profile: a small bundled samplesheet,
+// a dedicated output directory and no reference genome.
+params {
+    config_profile_name = 'Test dataset'
+    config_profile_description = 'Minimal test dataset for pipeline functionality'
+
+    input = 'assets/samplesheet_test.csv'
+    outdir = 'results/test'
+    genome = null
+
+    // Resource ceilings for the test run.
+    // NOTE(review): the link below describes GitHub-hosted runner hardware;
+    // confirm these limits match the runner the tests actually execute on.
+    max_cpus = 32        // for GitHub Actions https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources
+    max_memory = '120.GB'
+    max_time   = '12.h'
+
+    publish_dir_mode = 'symlink'
+}
+// Write the workflow DAG to assets/dag.png, replacing an existing file.
+dag {
+    enabled = true
+    overwrite = true
+    file = 'assets/dag.png'
+}
+// Write the execution report below the output directory, replacing an
+// existing file.
+report {
+    enabled = true
+    overwrite = true
+    file = "${params.outdir}/pipeline_info/execution_report.html"
+}
diff --git a/main.nf b/main.nf
@@ -16,64 +16,18 @@ reads        : ${params.input}
 """
 .stripIndent()
 
-process BAGEL {
-    output:
-        path("output.txt")
+// SUBMODULES
+include { INPUT_CHECK } from './submodules/local/input_check.nf'
 
-    script:
-    """
-    uname -a >> output.txt
-    which BAGEL.py >> output.txt
-    """
-}
-
-process DRUGZ {
-    output:
-        path("output.txt")
-
-    script:
-    """
-    uname -a >> output.txt
-    which drugz.py >> output.txt
-    """
-}
-
-process MAGECK {
-    output:
-        path("output.txt")
+// MODULES
+include { TRIM_SE } from './modules/local/trim.nf'
 
-    script:
-    """
-    uname -a >> output.txt
-    which mageck >> output.txt
-    """
+// Named workflow: run INPUT_CHECK on the samplesheet path and the
+// sequencing-center parameter, then pipe its `reads` output channel
+// (named raw_fastqs here) into TRIM_SE.
+workflow CRUISE {
+    INPUT_CHECK(file(params.input), params.seq_center)
+    INPUT_CHECK.out.reads.set{ raw_fastqs }
+    raw_fastqs | TRIM_SE
 }
 
-process VISPR {
-    output:
-        path("output.txt")
-
-    script:
-    """
-    uname -a >> output.txt
-    which vispr >> output.txt
-    """
-}
-
-process BASE {
-    output:
-        path("output.txt")
-
-    script:
-    """
-    uname -a >> output.txt
-    python -V >> output.txt
-    """
-}
 workflow {
-    BASE()
-    DRUGZ()
-    BAGEL()
-    MAGECK()
-    VISPR()
+    CRUISE()
 }
diff --git a/modules/local/bagel.nf b/modules/local/bagel.nf
@@ -0,0 +1,11 @@
+
+// Appends the output of `uname -a` and the resolved path of BAGEL.py
+// (`which BAGEL.py`) to output.txt, the only declared output.
+// NOTE(review): looks like a check that the tool is on PATH rather than an
+// analysis step — confirm.
+process BAGEL {
+    output:
+        path("output.txt")
+
+    script:
+    """
+    uname -a >> output.txt
+    which BAGEL.py >> output.txt
+    """
+}
diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf
@@ -0,0 +1,11 @@
+
+// Appends the output of `uname -a` and the resolved path of drugz.py
+// (`which drugz.py`) to output.txt, the only declared output.
+// NOTE(review): looks like a check that the tool is on PATH rather than an
+// analysis step — confirm.
+process DRUGZ {
+    output:
+        path("output.txt")
+
+    script:
+    """
+    uname -a >> output.txt
+    which drugz.py >> output.txt
+    """
+}
diff --git a/modules/local/mageck.nf b/modules/local/mageck.nf
@@ -0,0 +1,27 @@
+
+// NOTE(review): empty process body — no input, output or script is defined
+// here; presumably a stub for a later count step, confirm before using it.
+process MAGECK_COUNT {
+
+}
+
+// Appends the output of `uname -a` and the resolved path of mageck
+// (`which mageck`) to output.txt, the only declared output.
+// NOTE(review): looks like a check that the tool is on PATH rather than an
+// analysis step — confirm.
+process MAGECK {
+    output:
+        path("output.txt")
+
+    script:
+    """
+    uname -a >> output.txt
+    which mageck >> output.txt
+    """
+}
+
+
+// Appends the output of `uname -a` and the resolved path of vispr
+// (`which vispr`) to output.txt, the only declared output.
+// NOTE(review): looks like a check that the tool is on PATH rather than an
+// analysis step — confirm.
+process VISPR {
+    output:
+        path("output.txt")
+
+    script:
+    """
+    uname -a >> output.txt
+    which vispr >> output.txt
+    """
+}
diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
@@ -0,0 +1,27 @@
+// adapted from https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/modules/local/samplesheet_check.nf
+// Runs check_samplesheet.py on the staged samplesheet, writing the result to
+// samplesheet.valid.csv, and records the Python version in versions.yml.
+process SAMPLESHEET_CHECK {
+    tag "$samplesheet"
+
+    // Use the Galaxy depot Singularity image when the container engine is
+    // singularity and task.ext.singularity_pull_docker_container is not set;
+    // otherwise use the quay.io biocontainers image.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
+        'quay.io/biocontainers/python:3.8.3' }"
+
+    input:
+    path samplesheet
+
+    output:
+    path '*.csv'       , emit: csv
+    path "versions.yml", emit: versions
+
+    // The here-document below writes the process name and the version
+    // reported by `python --version` (with the "Python " prefix removed).
+    script:
+    """
+    check_samplesheet.py \\
+        $samplesheet \\
+        samplesheet.valid.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}