Skip to content

Commit

Permalink
Merge branch 'main' into ci-build
Browse files Browse the repository at this point in the history
  • Loading branch information
kelly-sovacool committed Sep 13, 2023
2 parents b9654a1 + 43eaefe commit 061374f
Show file tree
Hide file tree
Showing 14 changed files with 396 additions and 57 deletions.
File renamed without changes.
6 changes: 6 additions & 0 deletions assets/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions assets/samplesheet_test.csv

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

166 changes: 166 additions & 0 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/usr/bin/env python3

"""
adapted from https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/bin/check_samplesheet.py
"""

import os
import sys
import errno
import argparse


def parse_args(args=None):
    """Parse the command-line arguments of the samplesheet checker.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            reads the arguments from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with the positional values ``FILE_IN``
        and ``FILE_OUT``.
    """
    cli = argparse.ArgumentParser(
        description="Reformat samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    positionals = (
        ("FILE_IN", "Input samplesheet file."),
        ("FILE_OUT", "Output file."),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, help=help_text)
    return cli.parse_args(args)


def make_dir(path):
    """Create directory ``path`` (with parents) unless it already exists.

    An empty ``path`` is a no-op, which is what ``os.path.dirname`` yields
    for a bare file name.

    Raises:
        OSError: if the directory cannot be created for any reason other
            than it already existing.
    """
    if len(path) == 0:
        return
    try:
        os.makedirs(path)
    except OSError as err:
        ## An already-existing path is fine; anything else is a real failure
        if err.errno == errno.EEXIST:
            return
        raise err


def print_error(error, context="Line", context_str=""):
    """Print a samplesheet error to stdout and terminate with exit status 1.

    Args:
        error: Description of the problem.
        context: Label for the offending item (e.g. "Line" or "Sample").
        context_str: The offending item itself. The context line is only
            appended when both ``context`` and ``context_str`` are non-empty.
    """
    message = "ERROR: Please check samplesheet -> {}".format(error)
    has_context = not (context == "" or context_str == "")
    if has_context:
        message += "\n{}: '{}'".format(context.strip(), context_str.strip())
    print(message)
    sys.exit(1)


def check_samplesheet(file_in, file_out):
    """
    Validate a samplesheet and write a reformatted copy of it.

    The input samplesheet must follow this structure:
    sample,fastq_1,fastq_2
    SPT5_T0_REP1,SRR1822153_1.fastq.gz,SRR1822153_2.fastq.gz
    SPT5_T0_REP2,SRR1822154_1.fastq.gz,SRR1822154_2.fastq.gz

    ``fastq_2`` may be empty for single-end data. Blank lines are ignored.
    Spaces in sample names are replaced by underscores (with a warning).

    The output has the columns ``sample,single_end,fastq_1,fastq_2``, where
    ``single_end`` is "1" for single-end and "0" for paired-end rows. Samples
    are written in sorted order; when a sample has several runs, ``_T1``,
    ``_T2``, ... is appended to its name.

    Any validation failure prints an error and exits with status 1 (via
    ``print_error`` / ``sys.exit``). All validation happens before the output
    file is opened, so a failed check never leaves a partial output behind.

    Args:
        file_in: Path to the input samplesheet (UTF-8, optional BOM).
        file_out: Path of the validated samplesheet to write; its parent
            directory is created if needed.
    """
    MIN_COLS = 2
    HEADER = ["sample", "fastq_1", "fastq_2"]
    FASTQ_EXTENSIONS = (".fastq.gz", ".fq.gz")

    ## {sample: [[single_end, fastq_1, fastq_2], ...]}
    sample_mapping_dict = {}
    with open(file_in, "r", encoding="utf-8-sig") as fin:
        ## Check header
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            ## Tolerate blank lines (e.g. a trailing empty line)
            if not line.strip():
                continue

            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            # Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(
                        MIN_COLS
                    ),
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
            if " " in sample:
                print(
                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
                )
                sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)

            ## Check FastQ file extension
            for fastq in (fastq_1, fastq_2):
                if not fastq:
                    continue
                if " " in fastq:
                    print_error("FastQ file contains spaces!", "Line", line)
                if not fastq.endswith(FASTQ_EXTENSIONS):
                    print_error(
                        "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!",
                        "Line",
                        line,
                    )

            ## Auto-detect paired-end/single-end
            if not fastq_1:
                ## fastq_2 without fastq_1 is not a valid layout
                print_error("Invalid combination of columns provided!", "Line", line)
            single_end = "0" if fastq_2 else "1"
            sample_info = [single_end, fastq_1, fastq_2]

            ## Register the run, rejecting exact duplicates
            runs = sample_mapping_dict.setdefault(sample, [])
            if sample_info in runs:
                print_error("Samplesheet contains duplicate rows!", "Line", line)
            runs.append(sample_info)

    if not sample_mapping_dict:
        ## Pass the file name as context_str so it is actually reported
        print_error("No entries to process!", "Samplesheet", file_in)

    ## Check that multiple runs of the same sample are of the same datatype
    ## i.e. single-end / paired-end. Done before any output is written.
    for sample in sorted(sample_mapping_dict):
        runs = sample_mapping_dict[sample]
        if any(run[0] != runs[0][0] for run in runs):
            print_error(
                "Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
                "Sample",
                sample,
            )

    ## Write validated samplesheet with appropriate columns
    make_dir(os.path.dirname(file_out))
    ## The input is decoded as UTF-8, so encode the output the same way
    with open(file_out, "w", encoding="utf-8") as fout:
        fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
        for sample in sorted(sample_mapping_dict):
            runs = sample_mapping_dict[sample]
            for idx, val in enumerate(runs, start=1):
                ## do not append _T{idx} if not needed
                plus_T = f"_T{idx}" if len(runs) > 1 else ""
                fout.write(",".join([f"{sample}{plus_T}"] + val) + "\n")


def main(args=None):
    """Command-line entry point: parse the arguments and check the samplesheet.

    Args:
        args: Optional list of argument strings forwarded to ``parse_args``.
    """
    parsed = parse_args(args)
    check_samplesheet(parsed.FILE_IN, parsed.FILE_OUT)


if __name__ == "__main__":
    ## Run the checker only when executed as a script, and hand main()'s
    ## return value to sys.exit as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
10 changes: 8 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,19 @@ process {
process {

// catch any process without a container.
// this must be first before any custom process containers
withName: ".*" {
// this must be first before any other withName selectors.
// note: withName selectors have a higher priority than withLabel selectors,
// so this catch-all also overrides any container set via a withLabel selector.
// https://www.nextflow.io/docs/latest/config.html#selector-priority
withName: '.*' {
container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
}

// custom process containers

withName: 'TRIM.*' {
container = 'nciccbr/ncigb_cutadapt_v1.18:latest'
}
withName: 'MAGECK.*' {
container = 'quay.io/biocontainers/mageck:0.5.9.5--py39h1f90b4d_3'
}
Expand Down
24 changes: 24 additions & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Parameter values for running the pipeline on the minimal test dataset.
params {
config_profile_name = 'Test dataset'
config_profile_description = 'Minimal test dataset for pipeline functionality'

// Test samplesheet and output directory; no genome is set.
input = 'assets/samplesheet_test.csv'
outdir = 'results/test'
genome = null

// NOTE(review): presumably upper bounds on per-process resources; confirm how
// max_* are consumed and that they match the runner hardware at the linked page.
max_cpus = 32 // for GitHub Actions https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources
max_memory = '120.GB'
max_time = '12.h'

publish_dir_mode = 'symlink'
}
// Render the workflow DAG to assets/dag.png, replacing any existing file.
dag {
enabled = true
overwrite = true
file = 'assets/dag.png'
}
// Write the execution report under <outdir>/pipeline_info, replacing any existing file.
report {
enabled = true
overwrite = true
file = "${params.outdir}/pipeline_info/execution_report.html"
}
64 changes: 9 additions & 55 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,64 +16,18 @@ reads : ${params.input}
"""
.stripIndent()

process BAGEL {
output:
path("output.txt")
// SUBMODULES
include { INPUT_CHECK } from './submodules/local/input_check.nf'

script:
"""
uname -a >> output.txt
which BAGEL.py >> output.txt
"""
}

process DRUGZ {
output:
path("output.txt")

script:
"""
uname -a >> output.txt
which drugz.py >> output.txt
"""
}

process MAGECK {
output:
path("output.txt")
// MODULES
include { TRIM_SE } from './modules/local/trim.nf'

script:
"""
uname -a >> output.txt
which mageck >> output.txt
"""
workflow CRUISE {
INPUT_CHECK(file(params.input), params.seq_center)
INPUT_CHECK.out.reads.set{ raw_fastqs }
raw_fastqs | TRIM_SE
}

process VISPR {
output:
path("output.txt")

script:
"""
uname -a >> output.txt
which vispr >> output.txt
"""
}

process BASE {
output:
path("output.txt")

script:
"""
uname -a >> output.txt
python -V >> output.txt
"""
}
workflow {
BASE()
DRUGZ()
BAGEL()
MAGECK()
VISPR()
CRUISE()
}
11 changes: 11 additions & 0 deletions modules/local/bagel.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

// Records `uname -a` and the location of BAGEL.py (via `which`) in output.txt,
// which is the process's only declared output. Takes no inputs.
process BAGEL {
output:
path("output.txt")

script:
"""
uname -a >> output.txt
which BAGEL.py >> output.txt
"""
}
11 changes: 11 additions & 0 deletions modules/local/drugz.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

// Records `uname -a` and the location of drugz.py (via `which`) in output.txt,
// which is the process's only declared output. Takes no inputs.
process DRUGZ {
output:
path("output.txt")

script:
"""
uname -a >> output.txt
which drugz.py >> output.txt
"""
}
27 changes: 27 additions & 0 deletions modules/local/mageck.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

// NOTE(review): empty placeholder with no inputs, outputs or script block.
// Presumably Nextflow rejects a process without a script if it is ever invoked; confirm before including it.
process MAGECK_COUNT {

}

// Records `uname -a` and the location of the mageck executable (via `which`) in
// output.txt, which is the process's only declared output. Takes no inputs.
process MAGECK {
output:
path("output.txt")

script:
"""
uname -a >> output.txt
which mageck >> output.txt
"""
}


// Records `uname -a` and the location of the vispr executable (via `which`) in
// output.txt, which is the process's only declared output. Takes no inputs.
process VISPR {
output:
path("output.txt")

script:
"""
uname -a >> output.txt
which vispr >> output.txt
"""
}
27 changes: 27 additions & 0 deletions modules/local/samplesheet_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// adapted from https://github.com/nf-core/chipseq/blob/51eba00b32885c4d0bec60db3cb0a45eb61e34c5/modules/local/samplesheet_check.nf
// Runs check_samplesheet.py on the staged samplesheet, writing samplesheet.valid.csv,
// and records the Python version in versions.yml.
process SAMPLESHEET_CHECK {
tag "$samplesheet"

// Use the Galaxy depot Singularity image when the container engine is Singularity and
// ext.singularity_pull_docker_container is not set; otherwise use the biocontainers image.
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/python:3.8.3' :
'quay.io/biocontainers/python:3.8.3' }"

input:
path samplesheet

output:
// Files matching *.csv are emitted as `csv`; versions.yml is emitted as `versions`.
path '*.csv' , emit: csv
path "versions.yml", emit: versions

script:
// `\\` yields a literal backslash (shell line continuation) and `\$` leaves the
// command substitution for the shell rather than Groovy interpolation.
"""
check_samplesheet.py \\
$samplesheet \\
samplesheet.valid.csv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""
}
Loading

0 comments on commit 061374f

Please sign in to comment.