From 513d14a28e8c9c6c932cb0a8f07a44c85fac7027 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Wed, 13 Sep 2023 15:44:55 -0400
Subject: [PATCH 1/3] chore: rename mkdocs action

---
 .github/workflows/{build_mkdocs.yml => docs.yml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename .github/workflows/{build_mkdocs.yml => docs.yml} (100%)

diff --git a/.github/workflows/build_mkdocs.yml b/.github/workflows/docs.yml
similarity index 100%
rename from .github/workflows/build_mkdocs.yml
rename to .github/workflows/docs.yml

From fc2878d64e2fec51725cac0a07a51abd048e9f5a Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Wed, 13 Sep 2023 16:04:37 -0400
Subject: [PATCH 2/3] feat: input check for samplesheet

---
 assets/README.md                   |   6 ++
 assets/samplesheet_test.csv        |   2 +
 bin/check_samplesheet.py           | 166 +++++++++++++++++++++++++++++
 conf/modules.config                |  10 +-
 conf/test.config                   |  24 +++++
 main.nf                            |  10 ++
 modules/local/samplesheet_check.nf |  27 +++++
 modules/local/trim.nf              |  42 ++++++++
 nextflow.config                    |  12 +++
 submodules/local/input_check.nf    |  51 +++++++++
 10 files changed, 348 insertions(+), 2 deletions(-)
 create mode 100644 assets/README.md
 create mode 100644 assets/samplesheet_test.csv
 create mode 100755 bin/check_samplesheet.py
 create mode 100644 conf/test.config
 create mode 100644 modules/local/samplesheet_check.nf
 create mode 100644 modules/local/trim.nf
 create mode 100644 submodules/local/input_check.nf

diff --git a/assets/README.md b/assets/README.md
new file mode 100644
index 0000000..4846036
--- /dev/null
+++ b/assets/README.md
@@ -0,0 +1,6 @@
+
+## test dataset
+
+- geo https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE111309
+- paper https://doi.org/10.1016/j.celrep.2019.02.041
+- sample sheet with just one sample: assets/samplesheet_test.csv
diff --git a/assets/samplesheet_test.csv b/assets/samplesheet_test.csv
new file mode 100644
index 0000000..d7be801
--- /dev/null
+++ b/assets/samplesheet_test.csv
@@ -0,0 +1,2 @@
+sample,fastq_1,fastq_2
+haploid_hESC_library_Day30_rep1_run2,data/test/SRR6795299.fastq.gz,
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
new file mode 100755
index 0000000..b6dc239
--- /dev/null
+++ b/bin/check_samplesheet.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+
+"""
+adapted from https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/bin/check_samplesheet.py
+"""
+
+import os
+import sys
+import errno
+import argparse
+
+
+def parse_args(args=None):
+    Description = "Reformat samplesheet file and check its contents."
+    Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("FILE_OUT", help="Output file.")
+    return parser.parse_args(args)
+
+
+def make_dir(path):
+    if len(path) > 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise exception
+
+
+def print_error(error, context="Line", context_str=""):
+    error_str = "ERROR: Please check samplesheet -> {}".format(error)
+    if context != "" and context_str != "":
+        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
+            error, context.strip(), context_str.strip()
+        )
+    print(error_str)
+    sys.exit(1)
+
+
+def check_samplesheet(file_in, file_out):
+    """
+    This function checks that the samplesheet follows the following structure:
+    sample,fastq_1,fastq_2
+    SPT5_T0_REP1,SRR1822153_1.fastq.gz,SRR1822153_2.fastq.gz
+    SPT5_T0_REP2,SRR1822154_1.fastq.gz,SRR1822154_2.fastq.gz
+    """
+
+    sample_mapping_dict = {}
+    with open(file_in, "r", encoding="utf-8-sig") as fin:
+        ## Check header
+        MIN_COLS = 2
+        HEADER = ["sample", "fastq_1", "fastq_2"]
+        header = [x.strip('"') for x in fin.readline().strip().split(",")]
+        if header[: len(HEADER)] != HEADER:
+            print(
+                f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
+            )
+            sys.exit(1)
+
+        ## Check sample entries
+        for line in fin:
+            lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+
+            # Check valid number of columns per row
+            if len(lspl) < len(HEADER):
+                print_error(
+                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                    "Line",
+                    line,
+                )
+            num_cols = len([x for x in lspl if x])
+            if num_cols < MIN_COLS:
+                print_error(
+                    "Invalid number of populated columns (minimum = {})!".format(
+                        MIN_COLS
+                    ),
+                    "Line",
+                    line,
+                )
+
+            ## Check sample name entries
+            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+            if sample.find(" ") != -1:
+                print(
+                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
+                )
+                sample = sample.replace(" ", "_")
+            if not sample:
+                print_error("Sample entry has not been specified!", "Line", line)
+
+            ## Check FastQ file extension
+            for fastq in [fastq_1, fastq_2]:
+                if fastq:
+                    if fastq.find(" ") != -1:
+                        print_error("FastQ file contains spaces!", "Line", line)
+                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
+                        print_error(
+                            "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
+                            "Line",
+                            line,
+                        )
+
+            ## Auto-detect paired-end/single-end
+            sample_info = []  ## [single_end, fastq_1, fastq_2]
+            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
+                sample_info = ["0", fastq_1, fastq_2]
+            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
+                sample_info = ["1", fastq_1, fastq_2]
+            else:
+                print_error("Invalid combination of columns provided!", "Line", line)
+
+            ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2 ]]}
+            if sample not in sample_mapping_dict:
+                sample_mapping_dict[sample] = [sample_info]
+            else:
+                if sample_info in sample_mapping_dict[sample]:
+                    print_error("Samplesheet contains duplicate rows!", "Line", line)
+                else:
+                    sample_mapping_dict[sample].append(sample_info)
+
+    ## Write validated samplesheet with appropriate columns
+    if len(sample_mapping_dict) > 0:
+        out_dir = os.path.dirname(file_out)
+        make_dir(out_dir)
+        with open(file_out, "w") as fout:
+            fout.write(
+                ",".join(
+                    [
+                        "sample",
+                        "single_end",
+                        "fastq_1",
+                        "fastq_2",
+                    ]
+                )
+                + "\n"
+            )
+            for sample in sorted(sample_mapping_dict.keys()):
+                ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
+                if not all(
+                    x[0] == sample_mapping_dict[sample][0][0]
+                    for x in sample_mapping_dict[sample]
+                ):
+                    print_error(
+                        "Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
+                        "Sample",
+                        sample,
+                    )
+
+                for idx, val in enumerate(sample_mapping_dict[sample]):
+                    plus_T = (
+                        f"_T{idx+1}" if len(sample_mapping_dict[sample]) > 1 else ""
+                    )  # do not append _T{idx} if not needed
+                    fout.write(",".join([f"{sample}{plus_T}"] + val) + "\n")
+    else:
+        print_error("No entries to process!", "Samplesheet", file_in)
+
+
+def main(args=None):
+    args = parse_args(args)
+    check_samplesheet(args.FILE_IN, args.FILE_OUT)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/modules.config b/conf/modules.config
index fdc7887..00794a9 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -25,13 +25,19 @@ process {
 
 process {
     // catch any process without a container.
-    // this must be first before any custom process containers
-    withName: ".*" {
+    // this must be first before any other withName selectors.
+    // note: withName selectors have a higher priority than withLabel
+    // selectors, so this default overrides any withLabel containers.
+    // https://www.nextflow.io/docs/latest/config.html#selector-priority
+    withName: '.*' {
         container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
     }
 
     // custom process containers
+    withName: 'TRIM.*' {
+        container = 'nciccbr/ncigb_cutadapt_v1.18:latest'
+    }
 
     withName: 'MAGECK.*' {
         container = 'quay.io/biocontainers/mageck:0.5.9.5--py39h1f90b4d_3'
     }
diff --git a/conf/test.config b/conf/test.config
new file mode 100644
index 0000000..8602356
--- /dev/null
+++ b/conf/test.config
@@ -0,0 +1,24 @@
+params {
+    config_profile_name        = 'Test dataset'
+    config_profile_description = 'Minimal test dataset for pipeline functionality'
+
+    input  = 'assets/samplesheet_test.csv'
+    outdir = 'results/test'
+    genome = null
+
+    max_cpus   = 32 // for GitHub Actions https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources
+    max_memory = '120.GB'
+    max_time   = '12.h'
+
+    publish_dir_mode = 'symlink'
+}
+dag {
+    enabled   = true
+    overwrite = true
+    file      = 'assets/dag.png'
+}
+report {
+    enabled   = true
+    overwrite = true
+    file      = "${params.outdir}/pipeline_info/execution_report.html"
+}
diff --git a/main.nf b/main.nf
index c8f1faf..ca008c3 100644
--- a/main.nf
+++ b/main.nf
@@ -16,6 +16,13 @@ reads : ${params.input}
 """
 .stripIndent()
 
+
+// SUBMODULES
+include { INPUT_CHECK } from './submodules/local/input_check.nf'
+
+// MODULES
+include { TRIM_SE } from './modules/local/trim.nf'
+
 process BAGEL {
     output:
         path("output.txt")
@@ -76,4 +83,7 @@ workflow {
     BAGEL()
     MAGECK()
     VISPR()
+    INPUT_CHECK(file(params.input), params.seq_center)
+    INPUT_CHECK.out.reads.set{ raw_fastqs }
+    raw_fastqs | TRIM_SE
 }
diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
new file mode 100644
index 0000000..e0ca829
--- /dev/null
+++ b/modules/local/samplesheet_check.nf
@@ -0,0 +1,27 @@
+// adapted from https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/modules/local/samplesheet_check.nf
+process SAMPLESHEET_CHECK {
+    tag "$samplesheet"
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.8.3' :
+        'quay.io/biocontainers/python:3.8.3' }"
+
+    input:
+    path samplesheet
+
+    output:
+    path '*.csv'       , emit: csv
+    path "versions.yml", emit: versions
+
+    script:
+    """
+    check_samplesheet.py \\
+        $samplesheet \\
+        samplesheet.valid.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/trim.nf b/modules/local/trim.nf
new file mode 100644
index 0000000..434a9f3
--- /dev/null
+++ b/modules/local/trim.nf
@@ -0,0 +1,42 @@
+// https://github.com/CCBR/ChiP-seek/blob/9ba449e4855f9710e86f2db7c1d9560de634b3f1/workflow/rules/align.smk#L21
+// https://github.com/nf-core/ampliseq/blob/dev/subworkflows/local/cutadapt_workflow.nf
+// https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/modules/nf-core/modules/trimgalore/main.nf
+process TRIM_SE {
+    tag { meta.id }
+    label 'qc'
+    label 'process_high'
+
+    input:
+    tuple val(meta), path(fastq)
+
+    output:
+    tuple val(meta), path("*.fastq.gz")
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if (meta.single_end) {
+        """
+        nseqs_raw=\$(zgrep "^@" ${fastq} | wc -l)
+        echo "\$nseqs_raw in ${fastq}"
+        cutadapt \
+            --nextseq-trim=2 \
+            --trim-n \
+            -n 5 -O 5 \
+            -q ${params.cutadapt.leadingquality},${params.cutadapt.trailingquality} \
+            -m ${params.cutadapt.minlen} \
+            -b file:${params.cutadapt.adapters} \
+            -j $task.cpus \
+            $fastq |\
+            pigz -p ${task.cpus} > ${prefix}.trimmed.fastq.gz
+        nseqs_trimmed=\$(zgrep "^@" ${prefix}.trimmed.fastq.gz | wc -l)
+        echo "\$nseqs_trimmed in ${prefix}.trimmed.fastq.gz"
+        """
+    } else {
+        error "Paired-end reads are not supported yet"
+    }
+
+    stub:
+    """
+    touch ${meta.id}.trimmed.fastq.gz
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index d9e414b..fb0a2fb 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -4,9 +4,18 @@ params {
     input = null
     outdir = 'results'
     genome = null
+    seq_center = null
+    enable_conda = false
 
     publish_dir_mode = "copy"
 
+    cutadapt {
+        adapters = '/opt2/TruSeq_and_nextera_adapters.consolidated.fa' // this is in the cutadapt container
+        minlen = 20
+        leadingquality = 10
+        trailingquality = 10
+    }
+
 }
 
 includeConfig 'conf/base.config'
@@ -36,6 +45,9 @@ profiles {
     ci_stub {
         includeConfig "conf/ci_stub.config"
     }
+    test {
+        includeConfig "conf/test.config"
+    }
 }
 
 // Export these variables to prevent local Python/R libraries from conflicting with those in the container
diff --git a/submodules/local/input_check.nf b/submodules/local/input_check.nf
new file mode 100644
index 0000000..8c8702c
--- /dev/null
+++ b/submodules/local/input_check.nf
@@ -0,0 +1,51 @@
+// source: https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/subworkflows/local/input_check.nf
+//
+// Check input samplesheet and get read channels
+//
+
+include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check.nf'
+
+workflow INPUT_CHECK {
+    take:
+    samplesheet // file: /path/to/samplesheet.csv
+    seq_center  // string: sequencing center for read group
+
+    main:
+    SAMPLESHEET_CHECK ( samplesheet )
+        .csv
+        .splitCsv ( header:true, sep:',' )
+        .map { create_fastq_channel(it, seq_center) }
+        .set { reads }
+
+    emit:
+    reads                                     // channel: [ val(meta), [ reads ] ]
+    versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ]
+}
+
+// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
+def create_fastq_channel(LinkedHashMap row, String seq_center) {
+    def meta = [:]
+    meta.id = row.sample
+    meta.single_end = row.single_end.toBoolean()
+
+    def read_group = "\'@RG\\tID:${meta.id}\\tSM:${meta.id.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${meta.id}\\tPU:1\'"
+    if (seq_center) {
+        read_group = "\'@RG\\tID:${meta.id}\\tSM:${meta.id.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${meta.id}\\tPU:1\\tCN:${seq_center}\'"
+    }
+    meta.read_group = read_group
+
+    // add path(s) of the fastq file(s) to the meta map
+    def fastq_meta = []
+    if (!file(row.fastq_1).exists()) {
+        exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}"
+    }
+    if (meta.single_end) {
+        fastq_meta = [ meta, [ file(row.fastq_1) ] ]
+    } else {
+        if (!file(row.fastq_2).exists()) {
+            exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
+        }
+        fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
+    }
+    return fastq_meta
+}

From 43eaefe4d8307051ef508dabe7c2dcb88f677b53 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Wed, 13 Sep 2023 16:08:01 -0400
Subject: [PATCH 3/3] chore: move dummy processes to modules

---
 main.nf                 | 66 ++++-----------------------------------
 modules/local/bagel.nf  | 11 +++++++
 modules/local/drugz.nf  | 11 +++++++
 modules/local/mageck.nf | 27 +++++++++++++++++
 4 files changed, 54 insertions(+), 61 deletions(-)
 create mode 100644 modules/local/bagel.nf
 create mode 100644 modules/local/drugz.nf
 create mode 100644 modules/local/mageck.nf

diff --git a/main.nf b/main.nf
index ca008c3..24bf9b6 100644
--- a/main.nf
+++ b/main.nf
@@ -16,74 +16,18 @@ reads : ${params.input}
 """
 .stripIndent()
 
-
 // SUBMODULES
 include { INPUT_CHECK } from './submodules/local/input_check.nf'
 
 // MODULES
 include { TRIM_SE } from './modules/local/trim.nf'
 
-process BAGEL {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        which BAGEL.py >> output.txt
-        """
-}
-
-process DRUGZ {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        which drugz.py >> output.txt
-        """
-}
-
-process MAGECK {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        which mageck >> output.txt
-        """
-}
-
-process VISPR {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        which vispr >> output.txt
-        """
-}
-
-process BASE {
-    output:
-        path("output.txt")
-
-    script:
-        """
-        uname -a >> output.txt
-        python -V >> output.txt
-        """
-}
-workflow {
-    BASE()
-    DRUGZ()
-    BAGEL()
-    MAGECK()
-    VISPR()
+workflow CRUISE {
     INPUT_CHECK(file(params.input), params.seq_center)
     INPUT_CHECK.out.reads.set{ raw_fastqs }
     raw_fastqs | TRIM_SE
 }
+
+workflow {
+    CRUISE()
+}
diff --git a/modules/local/bagel.nf b/modules/local/bagel.nf
new file mode 100644
index 0000000..43c5eb9
--- /dev/null
+++ b/modules/local/bagel.nf
@@ -0,0 +1,11 @@
+
+process BAGEL {
+    output:
+        path("output.txt")
+
+    script:
+        """
+        uname -a >> output.txt
+        which BAGEL.py >> output.txt
+        """
+}
diff --git a/modules/local/drugz.nf b/modules/local/drugz.nf
new file mode 100644
index 0000000..0562dbe
--- /dev/null
+++ b/modules/local/drugz.nf
@@ -0,0 +1,11 @@
+
+process DRUGZ {
+    output:
+        path("output.txt")
+
+    script:
+        """
+        uname -a >> output.txt
+        which drugz.py >> output.txt
+        """
+}
diff --git a/modules/local/mageck.nf b/modules/local/mageck.nf
new file mode 100644
index 0000000..96b714f
--- /dev/null
+++ b/modules/local/mageck.nf
@@ -0,0 +1,27 @@
+
+process MAGECK_COUNT {
+
+}
+
+process MAGECK {
+    output:
+        path("output.txt")
+
+    script:
+        """
+        uname -a >> output.txt
+        which mageck >> output.txt
+        """
+}
+
+
+process VISPR {
+    output:
+        path("output.txt")
+
+    script:
+        """
+        uname -a >> output.txt
+        which vispr >> output.txt
+        """
+}
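
Usage note: with these three patches applied, the new samplesheet check, test
profile, and trimming step can be exercised roughly as follows, assuming the
FASTQ listed in assets/samplesheet_test.csv has been downloaded to data/test/
and a container engine is configured for the images above (any profiles besides
'test' depend on parts of nextflow.config not shown in these patches):

    # run the bundled single-sample test dataset
    nextflow run main.nf -profile test

    # or validate a samplesheet on its own; check_samplesheet.py writes a
    # reformatted copy with an added single_end column and exits non-zero
    # on any validation error
    python3 bin/check_samplesheet.py assets/samplesheet_test.csv samplesheet.valid.csv

For that samplesheet, each element emitted on INPUT_CHECK.out.reads takes the form
[ [id:'haploid_hESC_library_Day30_rep1_run2', single_end:true, read_group:...], [ SRR6795299.fastq.gz ] ],
which is the tuple shape TRIM_SE expects.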