forked from IARCbioinfo/gene-fusions-nf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.nf
executable file
·433 lines (368 loc) · 14.9 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
#!/usr/bin/env nextflow
//help function for the tool
def show_help (){
log.info IARC_Header()
log.info tool_header()
log.info"""
Usage:
The typical command for running the pipeline is as follows:
nextflow run iarcbioinfo/gene-fusions-nf --reads '*_R{1,2}.fastq.gz' -profile singularity
Mandatory arguments:
--reads [file] Path to input data
--bams [folder] Path to folder containing BAM files for processing
References
--ref_fa [file] Path to fasta reference
--ref_gtf [file] Path to GTF annotation
--star_index [file] Path to STAR-Index reference
Input alternatives:
--reads_csv file with tabular data for each sample to process [sampleID fwd_path rev_path]
--svs file with tabular data for each sample with structural variants in bedpe format [sampleID sv_path]
Visualization :
--arriba_plot [bool] by default plot all the gene fusions detected by arriba. set to false to inactivate.
Test dataset:
The subdirectory test_dataset contains a small simulated dataset to test the whole workflow.
Run examples:
nextflow run iarc/nf-gene-fusions -r v1.1 -singularity --reads 'test_dataset/reads/*.R{1,2}.fastq.gz' --ref_fa test_dataset/genome.fa --ref_gtf test_dataset/genome.gtf
or
nextflow run iarc/nf-gene-fusions -r v1.1 -singularity --bams '/path/to/bams/' --ref_fa test_dataset/genome.fa --ref_gtf test_dataset/genome.gtf
""".stripIndent()
}
if(params.bams){mode="BAM"}else{mode="reads"}
process build_star_index {
tag "star-index"
label 'load_medium'
publishDir params.outdir, mode: 'copy'
input:
path(fasta)
path(gtf)
output:
path("star-index"), emit: index
when: !(params.star_index)
script:
//adjust a variable for working with a smaller reference
def opt_test = params.debug ? "--genomeSAindexNbases 8" : ''
if(params.debug){
"""
mkdir star-index
"""
}else{
"""
mkdir star-index
STAR \\
--runMode genomeGenerate \\
--runThreadN ${task.cpus} \\
--sjdbGTFfile ${gtf} \\
--sjdbOverhang ${params.read_length - 1} \\
--genomeDir star-index/ \\
--genomeFastaFiles ${fasta} \\
${opt_test}
"""
}
}
//we map the reads only if the bams are not mapped
process star_mapping{
tag "${sample}"
label 'load_low2'
//we can remove this to avoid keeping the bam files
publishDir "${params.outdir}/star_mapping", mode: 'copy', pattern: "${sample}.{Log.final.out,ReadsPerGene.out.tab}"
input:
tuple val(sample), file(reads)
file(star_index)
output:
//star bam files
tuple val(sample), file("${sample}_STAR.bam"), emit: bams
//star mapping stats and gene counts *.{tsv,txt}
tuple val(sample), file("${sample}.{Log.final.out,ReadsPerGene.out.tab,fastq.log}") optional true
script:
if(params.debug){
"""
touch ${sample}_STAR.bam
touch ${sample}.Log.final.out
touch ${sample}.ReadsPerGene.out.tab
"""
//BAMs are given as input
}else if(mode=="BAM"){
"""
#we map the reads by picking pairs from input BAM
samtools collate -un 10 -o paired_${sample}.bam ${reads} tmp_${sample}
samtools fastq -1 paired1.fq.gz -2 paired2.fq.gz -0 /dev/null -s /dev/null paired_${sample}.bam > ${sample}.fastq.log
rm -f paired_${sample}.bam
STAR \\
--runThreadN ${task.cpus} \\
--genomeDir ${star_index} \\
--genomeLoad NoSharedMemory \\
--readFilesIn paired1.fq.gz paired2.fq.gz\\
--readFilesCommand zcat \\
--outSAMtype BAM Unsorted --outSAMunmapped Within \\
--outFilterMultimapNmax 1 --outFilterMismatchNmax 3 \\
--chimSegmentMin 10 --chimOutType WithinBAM SoftClip \\
--chimJunctionOverhangMin 10 \\
--chimScoreMin 1 --chimScoreDropMax 30 \\
--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 \\
--alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3 \\
--quantMode GeneCounts \\
--outFileNamePrefix ${sample}.
#we rename the defaul star output
mv ${sample}.Aligned.out.bam ${sample}_STAR.bam
#we delete the paired reads
rm -f paired1.fq.gz paired2.fq.gz
"""
//normal reads are given as input
}else{
"""
STAR \\
--runThreadN ${task.cpus} \\
--genomeDir ${star_index} \\
--genomeLoad NoSharedMemory \\
--readFilesIn ${reads} \\
--readFilesCommand zcat \\
--outSAMtype BAM Unsorted --outSAMunmapped Within \\
--outFilterMultimapNmax 1 --outFilterMismatchNmax 3 \\
--chimSegmentMin 10 --chimOutType WithinBAM SoftClip \\
--chimJunctionOverhangMin 10 \\
--chimScoreMin 1 --chimScoreDropMax 30 \\
--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 \\
--alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3 \\
--quantMode GeneCounts \\
--outFileNamePrefix ${sample}.
#we rename the defaul star output
mv ${sample}.Aligned.out.bam ${sample}_STAR.bam
"""
}
/*
#The STAR gene counts coincide with those produced by htseq-count with default parameters.
#The file colums are:
#column 1: gene ID
#column 2: counts for unstranded RNA-seq
#column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)
#column 4: counts for the 2nd read strand aligned with RNA (htseq-count option -s reverse
*/
}
//we create a channel with bam and sv files
//}
/*
* run arriba fusion with genomic SVs
* In case of the Variant Call Format, the file must comply with the VCF specification for structural variants.
* In particular, Arriba requires that the SVTYPE field be present in the INFO column and specify one of the four values BND, DEL, DUP, INV.
* In addition, for all SVTYPEs other than BND, the END field must be present and specify the second breakpoint of the structural variant.
* Structural variants with single breakends are silently ignored.
*/
process arriba_sv {
tag "${sample}-arriba_sv"
label 'load_low1'
publishDir "${params.outdir}/arriba/", mode: 'copy'
input:
tuple val(sample), file(vcf), file(bam)
//path(arriba_lib)
file(fasta)
file(gtf)
output:
tuple val(sample), file("${sample}_arriba_sv.tsv") optional true
file("*.{tsv,txt,log}")
//we use this methods when SVs are given
script:
def extra_params = params.arriba_opt ? params.arriba_opt : ''
if(params.debug){
"""
echo arriba \\
-x ${bam} \\
-a ${fasta} \\
-g ${gtf} \\
-d ${vcf} \\
-b ${params.arriba_lib}/blacklist_hg38_GRCh38_v2.1.0.tsv.gz \\
-o ${sample}_arriba_sv.tsv -O ${sample}_discarded_arriba_sv.tsv \\
${extra_params}
touch ${sample}_arriba_sv.tsv
"""
}else{
"""
arriba \\
-x ${bam} \\
-a ${fasta} \\
-g ${gtf} \\
-d ${vcf} \\
-b ${params.arriba_lib}/blacklist_hg38_GRCh38_v2.1.0.tsv.gz \\
-o ${sample}_arriba_sv.tsv -O ${sample}_discarded_arriba_sv.tsv \\
${extra_params} > ${sample}_arriba.log
"""
}
}
/*
* run arriba fusion without genomic structural variants
*/
process arriba {
tag "${sample}-arriba"
label 'load_low1'
publishDir "${params.outdir}/arriba/", mode: 'copy'
input:
tuple val(sample), file(bam)
//path(arriba_lib)
file(fasta)
file(gtf)
output:
tuple val(sample), file("${sample}_arriba.tsv") optional true
file("*.{tsv,txt,log}")
//we use this methods when no SVs are given
script:
def extra_params = params.arriba_opt ? params.arriba_opt : ''
if(params.debug){
"""
echo arriba \\
-x ${bam} \\
-a ${fasta} \\
-g ${gtf} \\
-b ${params.arriba_lib}/blacklist_hg38_GRCh38_v2.1.0.tsv.gz \\
-o ${sample}_arriba.tsv -O ${sample}_discarded_arriba.tsv \\
${extra_params}
touch ${sample}_arriba.tsv
"""
}else{
"""
arriba \\
-x ${bam} \\
-a ${fasta} \\
-g ${gtf} \\
-b ${params.arriba_lib}/blacklist_hg38_GRCh38_v2.1.0.tsv.gz \\
-o ${sample}_arriba.tsv -O ${sample}_discarded_arriba.tsv \\
${extra_params} > ${sample}_arriba.log
"""
}
}
/*
* Arriba plot
*
*/
process arriba_visualization {
tag "${sample}-plot-fusion"
label 'load_low1'
publishDir "${params.outdir}/arriba/plot", mode: 'copy'
input:
//path(arriba_lib)
file(gtf)
tuple val(sample), file(bam), file(fusions)
output:
file("${sample}.pdf") optional true
when: params.arriba_plot
script:
//we do not plot the cytobans and protein domains for the test
def opt_test = params.debug ? "" : "--cytobands=${params.arriba_lib}/cytobands_hg38_GRCh38_v2.1.0.tsv --proteinDomains=${params.arriba_lib}/protein_domains_hg38_GRCh38_v2.1.0.gff3"
if(params.debug){
//bams shold be sorted because were mapped with STAR
"""
echo samtools sort -@ ${task.cpus} -O bam ${bam} > Aligned.sortedByCoord.out.bam
echo samtools index Aligned.sortedByCoord.out.bam
echo draw_fusions.R \\
--fusions=${fusions} \\
--alignments=Aligned.sortedByCoord.out.bam \\
--output=${sample}.pdf \\
--annotation=${gtf} \\
${opt_test}
touch ${sample}.pdf
"""
}else{
"""
samtools sort -@ ${task.cpus} -O bam ${bam} > Aligned.sortedByCoord.out.bam
samtools index Aligned.sortedByCoord.out.bam
draw_fusions.R \\
--fusions=${fusions} \\
--alignments=Aligned.sortedByCoord.out.bam \\
--output=${sample}.pdf \\
--annotation=${gtf} \\
${opt_test}
#we remove the tmp files
rm -f Aligned.sortedByCoord.out.bam
rm -f Aligned.sortedByCoord.out.bam.bai
"""
}
}
//main workflow
workflow{
//display help information
if (params.help){ show_help(); exit 0;}
//display the header of the tool
log.info IARC_Header()
log.info tool_header()
//check star index or fasta and GTF
if (!params.star_index && (!params.ref_fa && !params.ref_gtf)) exit 1, "Either specify a STAR-INDEX or Fasta and GTF files!"
// create input channels
ch_fasta = Channel.value(file(params.ref_fa)).ifEmpty{exit 1, "Fasta file not found: ${params.ref_fa}"}
ch_gtf = Channel.value(file(params.ref_gtf)).ifEmpty{exit 1, "GTF annotation file not found: ${params.ref_gtf}"}
//init arriba values
params.arriba_lib = "/opt/conda/envs/gene-fusions/var/lib/arriba"
//adding arriba lib
//arriba_lib = Channel.value(file(params.arriba_lib)).ifEmpty{exit 1, "Arriba lib directory not found!"}
//expect a file with header "sampleID fwd_path rev_path"
//see file ./test_dataset/sample_fwrev.txt
if(params.reads_csv) {
read_files_star = Channel.fromPath(file(params.reads_csv)).splitCsv(header: true, sep: '\t', strip: true)
.map{row -> [ row.sampleID, [file(row.fwd), file(row.rev)]]}
.ifEmpty{exit 1, "params.reads_csv was empty - no input files supplied" }
//read_files_star.view()
}else if(params.bams){
//we process the bam files
if (file(params.bams).listFiles().findAll { it.name ==~ /.*cram/ }.size() > 0){
println "BAM files found, proceed with arriba";
read_files_star=Channel.fromPath( params.bams+'/*.cram' )
.map { path -> [ path.name.replace(".cram",""), path] }//.view()
mode="BAM"
}else{
println "ERROR: input folder contains no fastq nor BAM files"; System.exit(0)
}
}else{
//expect a regular expresion like '*_{1,2}.fastq.gz'
read_files_star = Channel.fromFilePairs(params.reads, size: 2 )
.ifEmpty{exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!" }
}
//sv are given as input to include in arriba
if (params.svs){
println "SV file provided";
//Channel for vcf files
vcf_files = Channel.fromPath(file(params.svs)).splitCsv(header: true, sep: '\t', strip: true)
.map{row -> [ row.sampleID, file(row.bedpe)]}
.ifEmpty{exit 1, "params.reads_svs was empty - no vcf files supplied" }
//.view()
}
// Build STAR index
if(mode!="BAM"){
build_star_index(ch_fasta,ch_gtf)
ch_star_index = params.star_index ? Channel.value(file(params.star_index)).ifEmpty{exit 1, "STAR index not found: ${params.star_index}" } : build_star_index.out.index
//ch_star_index = ch_star_index
//map the rna-seq reads to the genome
star_mapping(read_files_star,ch_star_index)
read_files_star = star_mapping.out.bams
}
//run arriba
if(params.svs){
println "Run arriba in SV mode";
star_bam_sv = vcf_files.join(read_files_star).view()
arriba_sv(star_bam_sv,ch_fasta,ch_gtf)
}else{
println "Run arriba";
arriba(read_files_star,ch_fasta,ch_gtf)
}
//we merge into a single channel the arriba result + the star mapping
plot_arriba = params.svs ? read_files_star.join(arriba_sv.out[0]):read_files_star.join(arriba.out[0])
arriba_visualization(ch_gtf,plot_arriba)
}
//header for the IARC tools
// the logo was generated using the following page
// http://patorjk.com/software/taag (ANSI logo generator)
def IARC_Header (){
return """
#################################################################################
# ██╗ █████╗ ██████╗ ██████╗██████╗ ██╗ ██████╗ ██╗███╗ ██╗███████╗ ██████╗ #
# ██║██╔══██╗██╔══██╗██╔════╝██╔══██╗██║██╔═══██╗██║████╗ ██║██╔════╝██╔═══██╗ #
# ██║███████║██████╔╝██║ ██████╔╝██║██║ ██║██║██╔██╗ ██║█████╗ ██║ ██║ #
# ██║██╔══██║██╔══██╗██║ ██╔══██╗██║██║ ██║██║██║╚██╗██║██╔══╝ ██║ ██║ #
# ██║██║ ██║██║ ██║╚██████╗██████╔╝██║╚██████╔╝██║██║ ╚████║██║ ╚██████╔╝ #
# ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝╚═════╝ ╚═╝ ╚═════╝ ╚═╝╚═╝ ╚═══╝╚═╝ ╚═════╝ #
# Nextflow pipelines for cancer genomics.########################################
"""
}
//this use ANSI colors to make a short tool description
//useful url: http://www.lihaoyi.com/post/BuildyourownCommandLinewithANSIescapecodes.html
def tool_header (){
return """
F\u001b[31;1mU\u001b[32;1mS\u001b[33;1mI\u001b[0mO\u001b[33;1mN\u001b[31;1m : Gene\u001b[32;1m Fusion\u001b[33;1m Caller\u001b[31;1m (${workflow.manifest.version})
"""
}