#!/usr/bin/env nextflow
/*
vim: syntax=groovy
-*- mode: groovy;-*-
========================================================================
IMBA - Gerlich Group - scsHi-C PREPROCESSING PIPELINE
========================================================================
C.C.H. Langer, M. Mitter, A. Goloborodko
========================================================================================
----------------------------------------------------------------------------------------
Pipeline overview:
- 1: Demultiplexing with bcl2fastq
- 2: FastQC (on each sample) for raw sequencing reads quality control
- 3: Merge the fq-files of two lanes
- 4: Alignment using bwa mem
- 5.1: Parse: find ligation junctions in .sam, make .pairsam
- 5.2: Sort the .pairsam files
- 5.3: Dedup: Find and remove PCR/optical duplicates.
- 6: Annotate S4T mutations
- 7.1: Filter cis and trans contacts
- 7.2: Merge ref and comp .pairs file
- 8: Generate cools
- 9: Zoomify and balance: generate a multi-resolution cooler file by coarsening and out-of-core matrix balancing
- 10: Run QC scripts and copy all output to final destination
----------------------------------------------------------------------------------------
*/
//TODO: convert naming to camelCase
//Mandatory parameters: these need to be set in the run script
params.inputfolder
params.sampleheet
params.machinetype
params.experimentID
params.doubleflowcell
//How the hg19 reference genome was fixed for the HeLa SNPs is explained in the paper.
params.refDir
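//A minimal sketch of a direct invocation (hypothetical paths and values; in practice these
//parameters are set in the run_*.sh scripts, see helpMessage() below):
//  nextflow run main.nf \
//    --inputfolder /path/to/bcl_run_folder \
//    --sampleheet /path/to/SampleSheet.csv \
//    --machinetype Novaseq \
//    --experimentID EXP001 \
//    --doubleflowcell true \
//    --refDir /path/to/reference/
//Note: params.refDir must end with a trailing slash, since it is concatenated directly
//with hg19_SNPs.fa in the bwa alignment step.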
outputFolder = "$params.experimentID-$params.machinetype"
//Default parameters can be overridden when calling the pipeline (e.g. in the run_xxx.sh scripts):
params.skipFastQC = false
params.skipDemultiplexing = false
//TODO should this be a parameter? Or fetched automatically?
params.taskcpu = 38
params.chrSizes="${baseDir}/bin/hg19.chrom.sizes"
params.outdir="${baseDir}/nf-out"
//Parameters for s4t mutations
params.min_map_q = 30
params.min_right_muts = 2
params.max_wrong_muts = 0
//Performance Parameters
bwa_threads=38
memory_merge = "20G"
nproc_merge = 19
nproc_pairsam = 19
//Step 5.2 parameters
nproc_sort = 19
nproc_out_sort = 19
memory_sort = "20G"
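//Note: these thread/memory values (and params.taskcpu above) are hard-coded; they presumably
//match the cluster nodes used by the authors and may need adjusting on other systems.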
def helpMessage() {
log.info"""
========================================================================
IMBA - Gerlich Group - scsHi-C PREPROCESSING PIPELINE
========================================================================
C.C.H. Langer, M. Mitter, A. Goloborodko
========================================================================
Usage:
The typical way to run the pipeline is via a run script modified for your
experiment; examples can be found on GitHub:
bash run_cbe.sh
For more information and detailed documentation see:
www.github.com/gerlichlab/scshic_pipeline
========================================================================
""".stripIndent()
}
/*
* SET UP CONFIGURATION VARIABLES
*/
// Pipeline version
version = 1.1
// Show help message
params.help = false
if (params.help){
helpMessage()
exit 0
}
/*
* STEP 1 - Demultiplexing with bcl2fastq
*/
process bcl2fastq{
publishDir "$outputFolder/demultiplexed", mode: "symlink"
output:
file("*.fastq.gz") into CH_demult_raw
when:
!params.skipDemultiplexing
script:
"""
bcl2fastq -R $params.inputfolder --sample-sheet $params.sampleheet -o .
"""
}
/*
* STEP 2 - FastQC (on each sample) for raw sequencing reads quality control
*/
CH_demult_raw
.flatMap() //Attention: do not forget to use .flatMap() here to split the outputs into separate items, since bcl2fastq ran only once and put all files into a single channel item.
.into { CH_demult_for_fastqc; CH_demult_temp }
process fastqc{
publishDir "$outputFolder/fastqc", mode: 'symlink',
saveAs: {filename -> filename.endsWith(".zip") ? "zips/$filename" : "$filename"}
input:
file(fastq) from CH_demult_for_fastqc
output:
file("*.{zip,html}") into CH_fastqc_reports
when:
!params.skipFastQC
script:
"""
fastqc -t $params.taskcpu -q $fastq
"""
}
CH_demult_temp
.map{ item ->
if (! "${item}".contains("Undetermined_")){
return item
}
}
.set{ CH_demult_clean }
CH_demult_clean
.into {CH_demult_iseq; CH_demult_novaseq}
CH_demult_novaseq
.into {CH_demult_novaseq_temp1; CH_demult_novaseq_temp2}
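//bcl2fastq names its output <Sample>_S<index>_L00<lane>_R<read>_001.fastq.gz
//(e.g. the hypothetical ACGTACGT_S1_L001_R1_001.fastq.gz); tokenizing on '_' therefore yields
//the sample barcode at index 0, the sample number at index 1 and the read direction (R1/R2)
//at index 3, which is what the maps below and the rename/merge processes rely on.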
//TODO replace with choice operator
CH_demult_novaseq_temp1
.map { item ->
if ( "${item}".contains("_R1_001.fastq.gz")){
return item
}
}
.map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple(size: 2, sort: true)
.set{CH_lane_pairs_R1}
CH_demult_novaseq_temp2
.map { item ->
if ( "${item}".contains("_R2_001.fastq.gz")){
//TODO: replace with filter and "*_L00{1,2}_R2*.fastq.gz"
return item
}
}
.map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple(size: 2, sort: true)
.set{CH_lane_pairs_R2}
/*
* STEP 3 - Merge the fq-files of two lanes
*/
//An Iseq run has only one lane, so this process simply renames the file
process rename_lanes{
publishDir "$outputFolder/merged_fastq", mode: 'symlink'
input:
file(lane_1) from CH_demult_iseq.filter(~/^.*_001\.fastq\.gz/)
output:
file("*R{1,2}.fastq.gz") into CH_iseq_for_align
when:
//both conditions must hold; note that a when: block evaluates to its last expression
params.machinetype == 'Iseq' && params.doubleflowcell == false
script:
barcode = lane_1.getName().tokenize('_').get(0)
sample_ID = lane_1.getName().tokenize('_').get(1)
read_direction = lane_1.getName().tokenize('_').get(3)
file_name = "${barcode}_${sample_ID}_${read_direction}"
"""
mv $lane_1 ${file_name}.fastq.gz
"""
}
// In a full Novaseq run, the two lanes need to be merged
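// Concatenating the per-lane .fastq.gz files directly works because gzip streams can be
// concatenated; downstream tools read the result as a single compressed file.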
process merge_lanes_R1{
publishDir "$outputFolder/merged_fastq", mode: 'symlink'
input:
set pair_id, file(pair) from CH_lane_pairs_R1
output:
file("*.fastq.gz") into CH_merged_R1
when:
params.machinetype == 'Novaseq' && params.doubleflowcell == true
script:
barcode = pair[0].getName().tokenize('_').get(0)
assert pair_id == barcode
sample_ID = pair[0].getName().tokenize('_').get(1)
read_direction = pair[0].getName().tokenize('_').get(3)
file_name = "${barcode}_${sample_ID}_${read_direction}"
"""
cat ${pair} > ${file_name}.fastq.gz
"""
}
process merge_lanes_R2{
publishDir "$outputFolder/merged_fastq", mode: 'symlink'
input:
set pair_id, file(pair) from CH_lane_pairs_R2
output:
file("*.fastq.gz") into CH_merged_R2
when:
params.machinetype == 'Novaseq' && params.doubleflowcell == true
script:
barcode = pair[0].getName().tokenize('_').get(0)
assert pair_id == barcode
sample_ID = pair[0].getName().tokenize('_').get(1)
read_direction = pair[0].getName().tokenize('_').get(3)
file_name = "${barcode}_${sample_ID}_${read_direction}"
"""
cat ${pair} > ${file_name}.fastq.gz
"""
}
CH_iseq_for_align.map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple(size: 2, sort: true)
.set{ CH_iseq_tuple_for_align}
CH_merged_R1.mix(CH_merged_R2)
.map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple(size: 2, sort: true)
.set{CH_novaseq_tuple_for_align}
//Since the two branches are mutually exclusive, this simply merges them back together
CH_iseq_tuple_for_align.mix(CH_novaseq_tuple_for_align)
.set{CH_tuple_for_align}
//Caution: groupTuple on IDs creates [id, [file1, file2]], whereas joining channels on an id produces [id, file1, file2]
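//For example, with the hypothetical files ACGT_S1_R1.fastq.gz and ACGT_S1_R2.fastq.gz grouped under key "ACGT":
//  groupTuple -> [ "ACGT", [ ACGT_S1_R1.fastq.gz, ACGT_S1_R2.fastq.gz ] ]
//  join       -> [ "ACGT", ACGT_S1_R1.fastq.gz, ACGT_S1_R2.fastq.gz ]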
/*
* STEP 4 - Alignment using bwa mem
*/
// TODO sanity check fastq file names - parse till end
process align_w_bwa{
publishDir "$outputFolder/bwa_results", mode: 'symlink'
input:
set pair_id, file(tuple) from CH_tuple_for_align
output:
file "*.sam" into CH_bwa_results
script:
barcode = tuple[0].getName().tokenize('_').get(0)
assert pair_id == tuple[0].getName().tokenize('_').get(0)
assert pair_id == tuple[1].getName().tokenize('_').get(0)
sample_ID = tuple[0].getName().tokenize('_').get(1)
file_name = "${barcode}_${sample_ID}"
"""
bwa mem -SP5M -t $bwa_threads ${params.refDir}hg19_SNPs.fa ${tuple} > ${file_name}_bwa.sam
"""
}
/*
* STEP 5 - Parse: find ligation junctions in .sam, make .pairsam
*/
process parse_pairs{
publishDir "$outputFolder/parsed_pairsam", mode: 'symlink'
input:
file(bwa_sam) from CH_bwa_results
output:
file "*.pairsam" into CH_parsed_pairsam
file "*.txt" into parsed_stats
script:
barcode = bwa_sam.getName().tokenize('_').get(0)
sample_ID = bwa_sam.getName().tokenize('_').get(1)
file_name = "${barcode}_${sample_ID}"
"""
pairtools parse -c $params.chrSizes --add-columns mapq -o ${file_name}.pairsam --nproc-in $nproc_pairsam --output-stats ${file_name}_stats.txt $bwa_sam
"""
}
/*
* STEP 5.2 - Sort the .pairsam files
*/
process sort_pairs{
publishDir "$outputFolder/sorted_pairsam", mode: 'symlink'
input:
file(pairsam) from CH_parsed_pairsam
output:
file "*.pairsam.lz4" into CH_sorted_pairsam
script:
barcode = pairsam.getName().tokenize('_').get(0)
sample_ID = pairsam.getName().tokenize('_').get(1).tokenize('.').get(0)
file_name = "${barcode}_${sample_ID}"
"""
pairtools sort --memory $memory_sort -o ${file_name}.sorted.pairsam.lz4 --nproc-out $nproc_out_sort --nproc $nproc_sort $pairsam
"""
}
/*
* STEP 5.3 - Dedup: Find and remove PCR/optical duplicates.
*/
process dedup_pairs{
publishDir "$outputFolder/dedup_pairsam", mode: 'symlink',
saveAs: {filename -> filename.endsWith(".stats") ? "stats/$filename" : "$filename"}
input:
file(pairsam) from CH_sorted_pairsam
output:
file "*.pairsam.gz" into CH_dedup_pairsam_main
file "*.stats" into CH_dedup_stats
script:
barcode = pairsam.getName().tokenize('_').get(0)
sample_ID = pairsam.getName().tokenize('_').get(1).tokenize('.').get(0)
file_name = "${barcode}_${sample_ID}"
"""
pairtools dedup -o ${file_name}.dedup.pairsam.gz --output-stats ${file_name}_dedup.stats $pairsam
"""
}
CH_dedup_pairsam_main.into { CH_dedup_pairsam; CH_dedup_for_cooler}
/*
* STEP 6 - Annotate S4T mutations
*/
process detect_s4t{
publishDir "$outputFolder/s4t_pairsam", mode: 'symlink'
input:
file(pairsam) from CH_dedup_pairsam
output:
file "*.pairsam.gz" into CH_s4T_pairsam
script:
barcode = pairsam.getName().tokenize('_').get(0)
sample_ID = pairsam.getName().tokenize('_').get(1).tokenize('.').get(0)
file_name = "${barcode}_${sample_ID}"
//TODO make detect_s4t_mutations.py concurrent
//Reminder: $baseDir is the location of main.nf
"""
python ${baseDir}/bin/detect_s4t_mutations.py --chunksize 5000000 -o ${file_name}.dedup.s4t.pairsam.gz $pairsam
"""
}
/*
* STEP 7.1 - Filter cis and trans contacts
*/
process filter_cis_trans{
publishDir "$outputFolder/s4t_filtered_pairsam", mode: 'symlink'
input:
file(s4t_pairs) from CH_s4T_pairsam
output:
file "*right_muts*trans*pairs.gz" into CH_trans_pairs
file "*right_muts*cis*pairs.gz" into CH_cis_pairs
script:
barcode = s4t_pairs.getName().tokenize('_').get(0)
sample_ID = s4t_pairs.getName().tokenize('_').get(1).tokenize('.').get(0)
sample_name = "${barcode}_${sample_ID}"
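//For reference, the positional arguments passed to filter_single_experiment.sh below are:
//<sample name> <output dir> <input pairsam> <min MAPQ> <min right mutations> <max wrong mutations>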
"""
bash ${baseDir}/bin/filter_single_experiment.sh $sample_name . $s4t_pairs $params.min_map_q $params.min_right_muts $params.max_wrong_muts
"""
}
/*
* STEP 7.2 - Merge ref and comp .pairs file
*/
CH_cis_pairs
.flatMap()
.map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple(size: 2, sort: true)
.set{ CH_cis_pairs_file }
process merge_cis_ref_comp{
publishDir "$outputFolder/s4t_merged_pairsam", mode: 'symlink'
input:
set pair_id, file(set) from CH_cis_pairs_file
output:
file "*cis.pairs.gz" into CH_cis_merged_pairs
script:
barcode = set[0].getName().tokenize('_').get(0)
sample_ID = set[0].getName().tokenize('_').get(1).tokenize('.').get(0)
sample_name = "${barcode}_${sample_ID}"
"""
pairtools merge --memory ${memory_merge} --nproc ${nproc_merge} -o ${sample_name}.cis.pairs.gz ${set}
"""
}
CH_trans_pairs
.flatMap()
.map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple(size: 2, sort: true)
.set{ CH_trans_pairs_file }
process merge_trans_ref_comp{
publishDir "$outputFolder/s4t_merged_pairsam", mode: 'symlink'
input:
set pair_id2, file(set) from CH_trans_pairs_file
output:
file "*trans.pairs.gz" into CH_trans_merged_pairs
script:
barcode = set[0].getName().tokenize('_').get(0)
sample_ID = set[0].getName().tokenize('_').get(1).tokenize('.').get(0)
sample_name = "${barcode}_${sample_ID}"
"""
pairtools merge --memory ${memory_merge} --nproc ${nproc_merge} -o ${sample_name}.trans.pairs.gz ${set}
"""
}
/*
* STEP 8 - Generate cools
*/
process generate_cools{
publishDir "$outputFolder/cooler", mode: 'symlink'
input:
file (all_pairs) from CH_dedup_for_cooler
file (trans_pairs) from CH_trans_merged_pairs
file (cis_pairs) from CH_cis_merged_pairs
output:
file "*cool" into CH_cools
script:
trans_barcode = trans_pairs.getName().tokenize('_').get(0)
trans_sample_ID = trans_pairs.getName().tokenize('_').get(1).tokenize('.').get(0)
trans_sample_name = "${trans_barcode}_${trans_sample_ID}"
all_barcode = all_pairs.getName().tokenize('_').get(0)
all_sample_ID = all_pairs.getName().tokenize('_').get(1).tokenize('.').get(0)
all_sample_name = "${all_barcode}_${all_sample_ID}"
cis_barcode = cis_pairs.getName().tokenize('_').get(0)
cis_sample_ID = cis_pairs.getName().tokenize('_').get(1).tokenize('.').get(0)
cis_sample_name = "${cis_barcode}_${cis_sample_ID}"
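//The --chrom1/--pos1/--chrom2/--pos2 flags are 1-based column indices into the .pairs files,
//matching the standard pairtools column order (readID chrom1 pos1 chrom2 pos2 ...).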
"""
# All contacts
cooler cload pairs ${params.chrSizes}:1000 $all_pairs ${all_sample_name}.all.1000.cool --chrom1 2 --chrom2 4 --pos1 3 --pos2 5
# Cis contacts
cooler cload pairs ${params.chrSizes}:1000 $cis_pairs ${cis_sample_name}.cis.1000.cool --chrom1 2 --chrom2 4 --pos1 3 --pos2 5
# Trans contacts
cooler cload pairs ${params.chrSizes}:1000 $trans_pairs ${trans_sample_name}.trans.1000.cool --chrom1 2 --chrom2 4 --pos1 3 --pos2 5
"""
}
CH_cools.into { CH_cools_for_balancing; CH_cools_iseq }
/*
* STEP 9 - Zoomify and balance: generate a multi-resolution cooler file by coarsening and out-of-core matrix balancing
*/
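//Note: -n sets the number of processes used by cooler zoomify; --balance runs matrix balancing
//at each requested resolution, forwarding --balance-args to `cooler balance`.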
process zoomify_and_balance{
publishDir "$outputFolder/balanced_cooler", mode: 'symlink'
input:
file (cooler_2) from CH_cools_for_balancing.flatMap()
output:
file "*mcool" into CH_mcools_novaseq
when:
params.machinetype=='Novaseq'
script:
"""
cooler zoomify ${cooler_2} -n 4 -r 1000,2000,4000,5000,6000,8000,10000,20000,50000,100000,200000,400000,500000,1000000,5000000 --balance --balance-args '--ignore-diags 0 --mad-max 5 --max-iters 500 --convergence-policy store_nan'
"""
}
/*
* STEP 10 - Run QC scripts and copy all output to final destination
*/
process copy_to_output_iseq{
//fake dependency for synchronization (barrier function)
input:
val (out) from CH_cools_iseq.collect()
when:
params.machinetype=='Iseq'
script:
//Create the folder if it does not exist yet
//Copy to output and dereference symlinks
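//The ../../../ prefix assumes the default work-directory layout (work/xx/yyyy... under the
//launch directory) and therefore resolves back to the launch directory, where publishDir placed the symlinks.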
"""
mkdir -p $params.outdir/$outputFolder/qc_and_stats/qc
cp -rL ../../../$outputFolder/cooler $params.outdir/$outputFolder/unbalanced_cooler
cp -rL ../../../$outputFolder/dedup_pairsam/stats $params.outdir/$outputFolder/qc_and_stats/stats
cp -rL ../../../$outputFolder/fastqc $params.outdir/$outputFolder/qc_and_stats/fastqc
cp -rL ../../../$outputFolder/s4t_pairsam $params.outdir/$outputFolder/all_pairs
cp -rL ../../../$outputFolder/s4t_merged_pairsam $params.outdir/$outputFolder/cis_trans_pairs
python ${baseDir}/bin/check_duplications_trans_cis_dist.py --inputdir $params.outdir/$outputFolder/qc_and_stats/stats --resultsdir $params.outdir/$outputFolder/qc_and_stats/qc
python ${baseDir}/bin/count_double_labeled.py --inputdir ../../../$outputFolder --resultsdir $params.outdir/$outputFolder/qc_and_stats/qc
"""
}
process copy_to_output_novaseq{
//fake dependency for synchronization (barrier function)
input:
val (out) from CH_mcools_novaseq.collect()
when:
params.machinetype=='Novaseq'
script:
//Create the folder if it does not exist yet
//Copy to output and dereference symlinks
"""
mkdir -p $params.outdir/$outputFolder/qc_and_stats/qc
cp -rL ../../../$outputFolder/balanced_cooler $params.outdir/$outputFolder/mcooler
rm $params.outdir/$outputFolder/mcooler/*cis.1000.mcool
rm $params.outdir/$outputFolder/mcooler/*trans.1000.mcool
cp -rL ../../../$outputFolder/dedup_pairsam/stats $params.outdir/$outputFolder/qc_and_stats/stats
cp -rL ../../../$outputFolder/cooler $params.outdir/$outputFolder/unbalanced_cooler
cp -rL ../../../$outputFolder/fastqc $params.outdir/$outputFolder/qc_and_stats/fastqc
cp -rL ../../../$outputFolder/s4t_pairsam $params.outdir/$outputFolder/all_pairs
cp -rL ../../../$outputFolder/s4t_merged_pairsam $params.outdir/$outputFolder/cis_trans_pairs
python ${baseDir}/bin/check_duplications_trans_cis_dist.py --inputdir $params.outdir/$outputFolder/qc_and_stats/stats --resultsdir $params.outdir/$outputFolder/qc_and_stats/qc
python ${baseDir}/bin/count_double_labeled.py --inputdir ../../../$outputFolder --resultsdir $params.outdir/$outputFolder/qc_and_stats/qc
"""
}