forked from s-andrews/nextflow_pipelines
-
Notifications
You must be signed in to change notification settings - Fork 2
/
nf_cellranger
executable file
·219 lines (162 loc) · 9.34 KB
/
nf_cellranger
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// last modified 23 August 2021
// ---------------------------------------------------------------------------
// Default parameters. Each one can be overridden on the command line with the
// matching double-dash option (e.g. --outdir, --genome, --sample).
// ---------------------------------------------------------------------------
params.outdir          = "."            // output directory handed to CELLRANGER_COUNT
params.genome          = ""             // genome ID, looked up further down via getGenome()
params.verbose         = false          // print additional status messages
params.cellranger_args = ''             // extra arguments handed to CELLRANGER_COUNT
params.single_end      = false          // default mode is auto-detect
params.fastqs_dir      = ''             // directory containing the input FastQ files (mandatory)
params.sample_sheet    = 'samples.csv'  // for testing purposes, not currently used
params.sample          = ''             // sample ID (mandatory)
params.list_genomes    = false          // list the available genomes
params.help            = false          // show the help message and exit

// Show help message and exit. Asking for help is a successful request, not an
// error, so the exit status is 0.
if (params.help) {
    helpMessage()
    exit 0
}

// Announce the genome listing early; the listing itself happens once the
// listGenomes module has been included.
if (params.list_genomes) {
    println ("[WORKFLOW] List genomes selected")
}

// In verbose mode, echo the extra arguments that will be handed to CellRanger.
if (params.verbose) {
    println ("[WORKFLOW] CELLRANGER ARGS ARE: " + params.cellranger_args)
}
// Project-local helper modules; their implementations live under ./nf_modules and are not visible here.
// NOTE(review): makeFilesChannel and getFileBaseNames are only referenced from commented-out code in this script.
include { makeFilesChannel; getFileBaseNames } from './nf_modules/files.mod.nf'
include { getGenome } from './nf_modules/genomes.mod.nf'
include { listGenomes } from './nf_modules/genomes.mod.nf'
// --list_genomes can only be acted on here, after listGenomes has been included.
if (params.list_genomes){
listGenomes() // this lists all available genomes, and exits
}
// Look up the genome for the ID given with --genome.
// NOTE(review): the structure of the returned value is defined in genomes.mod.nf - confirm there.
genome = getGenome(params.genome)
// This include has to come after the genome lookup: the result is handed to the
// CellRanger module through the params(genome: ...) clause of the include.
include { CELLRANGER_COUNT } from './nf_modules/cellranger.mod.nf' params(genome: genome)
// Not currently using a file channel for CellRanger runs, but requiring an option --sample sampleID
// file_ch = makeFilesChannel(args)
// file_ch.view()
// absolute_path = ''
// println("Absolute path:" + absolute_path)
// In verbose mode, echo the sample ID that will be handed to CellRanger.
if (params.sample && params.verbose) {
    println ("[WORKFLOW] Cellranger sampleID: " + params.sample)
}

// --sample is mandatory: without it, print a hint and stop with a non-zero status.
if (!params.sample) {
    println ("\nPlease specify a sample ID with --sample=Sample_Name\n\n(For the sample ID, please no illegal characters! The characters not allowed are the space character and the following: ? ( ) [ ] / \\ = + < > : ; ' \" , * ^ | & . ")
    exit 1
}

// --fastqs_dir is mandatory as well: without it, print a hint and stop with a non-zero status.
if (!params.fastqs_dir) {
    println ("\nPlease specify a directory containing the input FastQ files with --fastqs_dir='/path/to/FastQs/'\n\n")
    exit 1
}
// Illumina FASTQ files use the following naming scheme when processed with bcl2fastq:
// <sample name>_<barcode sequence>_L<lane>_R<read number>_<setnumber>.fastq.gz
// Illegal Characters
// Project and sample names in the sample sheet cannot contain illegal characters not
// allowed by some file systems. The characters not allowed are the space character and the
// following:
// ? ( ) [ ] / \ = + < > : ; " ' , * ^ | & .
// Let's assume these are some sample files:
// SIGAG5_9_dnmt3ab_DKO_S2_L005_I1_001.fastq.gz
// SIGAG5_9_dnmt3ab_DKO_S2_L005_R1_001.fastq.gz
// SIGAG5_9_dnmt3ab_DKO_S2_L005_R2_001.fastq.gz
// Then this could be a typical Cellranger count run:
// cellranger count --id='SIGAG5_9_dnmt3ab_DKO' \
// --fastqs='/bi/cellranger/testrun/' \
// --transcriptome='/bi/cellranger/references/' \
// --sample=SIGAG5_9_dnmt3ab_DKO \
// --localcores=16 \
// --localmem=35
workflow {

    main:
        // Unlike our usual pipelines, this one does not necessarily need the input
        // file names - a sample name is enough. A possible future extension is to
        // extract the sample names automatically when several different sets of
        // files share the same folder.
        CELLRANGER_COUNT(
            params.sample,
            params.fastqs_dir,
            params.outdir,
            params.cellranger_args,
            params.verbose
        )

        // The following channel-based code does not work, but I'll leave it here for the time being
        // file_ch
        // subscribe a function to the channel printing the emitted values
        // .map {absolute_path -> it[1][0]}
        // .subscribe onNext:{ println "Got: $it\n and PATH: $absolute_path" }
        // println("Absolute path now:" + absolute_path)
        // CELLRANGER_COUNT (file_ch, params.outdir, params.cellranger_args, params.verbose)
}
// Since workflows with very long command lines tend to fail to get rendered at all, this is a
// minimal execution summary report so we at least know what the working directory was.
//
// When the run completes, the summary below is built from the `workflow` metadata object
// and e-mailed to <user name>@babraham.ac.uk.
workflow.onComplete {
    // The leading backslash suppresses the first newline; stripIndent() then removes the
    // common leading indentation, so the mail body starts flush-left.
    // Each field is reported exactly once (the success flag used to be listed twice).
    def msg = """\
        Pipeline execution summary
        ---------------------------
        Jobname : ${workflow.runName}
        Success : ${workflow.success}
        Command Line: ${workflow.commandLine}
        Completed at: ${workflow.complete}
        Duration : ${workflow.duration}
        workDir : ${workflow.workDir}
        exit status : ${workflow.exitStatus}
        Nextflow version: ${workflow.nextflow.version}
        """
        .stripIndent()

    // The recipient address is derived from the name of the user who launched the workflow.
    sendMail(to: "${workflow.userName}@babraham.ac.uk", subject: 'Minimal pipeline execution report', body: msg)
}
/**
 * Writes the usage text of this workflow to the Nextflow log at INFO level.
 *
 * Called from the top of the script when --help is given. The text covers the
 * mandatory arguments (--sample, --fastqs_dir, --genome), the tool-specific
 * --cellranger_args option, the remaining script options and the single-dash
 * Nextflow workflow options.
 */
def helpMessage() {
    // The literal is deliberately kept flush-left so that the emitted text is unchanged.
    String usageText = """
>>
SYNOPSIS:
This workflow takes in a sample identifier and aligns corresponding FastQ files containing this identifier to a genome using
'cellranger count' using the stone compute cluster at Babraham.
If called as is, 'cellranger count' is run in default mode (i.e. only counting counts over exons) using '--localcores 16' and
'--localmem 35'. To add additional parameters, please consider tool-specific arguments that are compatible with cellranger count
(see '--cellranger_args' below).
Last modified: 25 12 2021
==============================================================================================================
USAGE:
nf_cellranger [options] --genome <genomeID> --sample='sampleID' --fastqs_dir='/path/to/FastQs/'
Mandatory arguments:
====================
--sample=[str] The unique sampleID of R1, R2, and optionally I1 files. If we assume these input files:
SIGAG5_9_dnmt3ab_DKO_S2_L005_I1_001.fastq.gz
SIGAG5_9_dnmt3ab_DKO_S2_L005_R1_001.fastq.gz
SIGAG5_9_dnmt3ab_DKO_S2_L005_R2_001.fastq.gz
the unique sampleID would be 'SIGAG5_9_dnmt3ab_DKO'. Project and sample names in the sample
sheet cannot contain illegal characters not allowed by some file systems. The characters not
allowed are the space character and the following: ? ( ) [ ] / \\ = + < > : ; \" ' , * ^ | & .
--fastqs_dir='/path/' This is the full path to the folder where CellRanger expects the input FastQ files, e.g.
--fastqs_dir='/bi/cellrangertest/fastq/' or --fastqs_dir=`pwd` if you are in the directory containing
the FastQ files.
--genome [str] Genome build ID to be used for the alignment, e.g. GRCh38 (latest human genome) or GRCm38
(latest mouse genome build). To list all available genomes, see '--list_genomes' below.
Tool-Specific Options:
======================
--cellranger_args="[str]" This option can take any number of options that are compatible with Cellranger count to modify its
default mapping behaviour. For more detailed information on available options please refer
to the CellRanger Guide, or run 'cellranger count --help'. As an example, to include counts over
introns use ' --cellranger_args="--include-introns" '. Please note that the format ="your options"
needs to be strictly adhered to in order to work correctly. [Default: None]
Other Options:
==============
--list_genomes List all genome builds that are currently available to choose from. To see this list
of available genomes with more detailed information about paths and indexes, run
the command as '--list_genomes --verbose'
--verbose More verbose status messages. [Default: OFF]
--help Displays this help message and exits.
Workflow Options:
=================
Please note the single '-' hyphen for the following options!
-resume If a pipeline workflow has been interrupted or stopped (e.g. by accidentally closing a laptop),
this option will attempt to resume the workflow at the point it got interrupted by using
Nextflow's caching mechanism. This may save a lot of time.
-bg Sends the entire workflow into the background, thus disconnecting it from the terminal session.
This option launches a daemon process (which will keep running on the headnode) that watches over
your workflow, and submits new jobs to the SLURM queue as required. Use this option for big pipeline
jobs, or whenever you do not want to watch the status progress yourself. Upon completion, the
pipeline will send you an email with the job details. This option is HIGHLY RECOMMENDED!
-process.executor=local Temporarily changes where the workflow is executed to the 'local' machine. Please also see the Nextflow
config file for details. [Default: slurm]
<<
""".stripIndent()

    log.info(usageText)
}