-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAnnotation_Manager.sh
729 lines (654 loc) · 29.3 KB
/
Annotation_Manager.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
#!/bin/bash
#
# Annotation_Manager.sh
# This script sets up and launches annotation scripts for CADD, RVIS, and can be expanded to more
# INPUTS: VCF file to be annotated, output dir, temp dir, uploads flags
# OUTPUTS: annotated VCF file with all previous annotations removed
#
# Last modified on: 09/21/15
# Last modified by: Mike Warburton
usage(){
cat << EOF
This script sets up and launches annotation scripts for CADD and RVIS.
Annotate Mode is default mode
-i <VCF file> Required The VCF file to be annotated
-o <output dir> Required The directory in which to save the output
-t <temp dir> Optional The directory in which to save the temporary files; the default is the output directory
-s <number of lines> Optional The number of lines per part when the VCF file is split; by default the input is split into no more than 100 pieces such that each is 1000+ lines
-w <walltime> Optional The Biowulf2 walltime to set per VCF piece. Quick nodes are used by default. This will tell the script to use normal nodes with the specified walltime
-p Optional A flag to skip bcftools if the file already prepared with 1 variant per line and is left-normalized
-e Optional A flag to erase the INFO field of the input VCF file
-k Optional A flag to keep the temporary files after the script finishes
-f <config file> Optional The config file with database paths to use; the default is /data/Udpwork/usr/Common/Git/VCF_scripts/Annotation/databasePaths.config
-c CADD Annotation
-a An optional flag to show all variants, even those without a CADD score in the local database, in the output VCF file (default is to only show those in the *.noCADD.vcf file)
-r RVIS Annotatoin
-h ExAC Homozygote Annotation
Database Mode is set with '-d'
-c CADD Database Update
-n <new scores> Required The file downloaded from CADD's website with the new InDel scores
EOF
}
# Check and collect input
if [ $# -lt 4 ] || [ $# -gt 19 ]; then
echo -e "\nERROR: Too few or too many arguments\n"
usage
exit 1
fi
# Set up the main variables
scriptDir=/data/Udpwork/usr/Common/Git/VCF_scripts/Annotation/
defaultConfigFile=/data/Udpwork/usr/Common/Git/VCF_scripts/Annotation/databasePaths.config
annotateMode=false
databaseMode=false
partition="quick"
walltime="2:00:00"
preppedFile=false
clearInfo=false
keepTemp=false
caddModule=false
outputAllCadd=false
rvisModule=false
exacModule=false
# Pull in the arguments from the command line
while getopts "i:o:t:s:w:pekf:carhdn:" OPTION; do
case $OPTION in
i)
inputFile="$OPTARG"
annotateMode=true;;
o)
outputDir="$OPTARG"
annotateMode=true;;
t)
tempDir="$OPTARG"
annotateMode=true;;
s)
chunkSize="$OPTARG"
annotateMode=true;;
w)
walltime="$OPTARG"
partition="norm";;
p)
preppedFile=true
annotateMode=true;;
e)
clearInfo=true
annotateMode=true;;
k)
keepTemp=true;;
f)
configFile="$OPTARG";;
c)
caddModule=true;;
a)
outputAllCadd=true;;
r)
rvisModule=true;;
h)
exacModule=true;;
d)
databaseMode=true;;
n)
newCADDdata="$OPTARG"
databaseMode=true;;
\?)
echo -e "\nERROR: unrecognized option\n"
usage
exit 1;;
:)
echo -e "\nOption -$OPTARG requires an argument\n"
usage
exit 1;;
esac
done
# Check that the arguments for at least Mode were provided
if ! $annotateMode && ! $databaseMode; then
echo -e "\nERROR: Insufficient arguments to run either Annotate Mode or Database Mode\n"
usage
exit 1
fi
# Check that Annotate Mode and Database Mode were not both run
if $annotateMode && $databaseMode; then
echo -e "\nERROR: Cannot run annotation and database update at the same time\n"
usage
exit 1
fi
# If a config file was not given, use the default file
if [ "$configFile" == "" ]; then
configFile=$defaultConfigFile
fi
# Check that the config file exists
if [ ! -e $configFile ]; then
echo -e "ERROR: the config file with database paths, $configFile, does not exist"
usage
exit 1
fi
# Read the config file and pull out the database paths
caddInDels=`grep "CADD InDels" $configFile | cut -f 2` # The CADD database of InDels
caddSNVs=`grep "CADD SNVs" $configFile | cut -f 2` # The CADD database of SNVs
caddExtras=`grep "CADD Extras" $configFile | cut -f 2` # The local CADD database of scores calculated by CADD's website for variants not in the default databases
rvisDb=`grep "RVIS Database" $configFile | cut -f 2` # The RVIS database of intolerance scores by gene
geneTable=`grep "UCSC Gene Table" $configFile | cut -f 2` # The databse of UCSC gene names and their coverage regions
exacDb=`grep "ExAC Database" $configFile | cut -f 2` # The database of ExAC scores
referenceFasta=`grep "bcftools Reference Fasta" $configFile | cut -f 2` # The reference genome fasta file for bcftools's left-alignment
# If a walltime was given, check its format
if [ "$partition" != "quick" ]; then
if [[ "$walltime" != [0-9]*:[0-9][0-9]:[0-9][0-9] ]]; then
echo -e "\nERROR: The given walltime did not match the required format. Check the Biowulf User Guide.\n"
usage
exit 1
fi
fi
# If Annotate Mode was run, check the inputs
if $annotateMode; then
# Make sure an input file and an output directory were given
if [ "$inputFile" == "" ] || [ "$outputDir" == "" ]; then
echo -e "\nERROR: One of the required inputs is missing\n"
usage
exit 1
fi
# Make sure the input file exists
if [ ! -e "$inputFile" ]; then
echo -e "\nERROR: The input VCF $inputFile does not exist\n"
usage
exit 1
fi
# Make sure the output directory exists; if it doesn't, try to make it
if [ ! -d "$outputDir" ]; then
mkdir -p "$outputDir"
if [ ! -d "$outputDir" ]; then
echo -e "\nERROR: The output directory $outputDir does not exist and could not be made\n"
usage
exit 1
fi
fi
# If the user didn't supply a directory for the temporary files, use the output directory
if [ "$tempDir" == "" ]; then
tempDir="$outputDir"
# If the user gave a temp directory, check that it exists, and try to make it if i doesn't
elif [ ! -d "$tempDir" ]; then
mkdir -p $tempDir
if [ ! -d "$tempDir" ]; then
echo -e "\nERROR: The temp directory $tempDir does not exist and could not be made\n"
usage
exit 1
fi
fi
# Check that the reference fasta file exists
if [ ! -e $referenceFasta ]; then
echo -e "\nERROR: The reference fasta file, $referenceFasta, does not exist\n"
usage
exit 1
fi
# Check that at least one type of annotation was chosen
if ! $caddModule && ! $rvisModule && ! $exacModule; then
echo -e "\nERROR: No annotation type selected\n"
usage
exit 1
fi
# Set up a variable used under multiple circumstances in Annotate Mode
fieldsLine=`grep -n -m1 "^#CHROM" $inputFile | cut -d':' -f 1`
fi
# If Database Mode was run, check the inputs
if $databaseMode; then
# Make sure ExAC module wasn't set
if $exacModule; then
echo -e "\nERROR: ExAC databases cannot be updated with this script\n"
usage
exit 1
fi
# Make sure RVIS module wasn't set
if $rvisModule; then
echo -e "\nERROR: RVIS databases cannot be updated with this script\n"
usage
exit 1
fi
# If the CADD module wasn't set, there is nothing for Database Mode to do
if ! $caddModule; then
echo -e "\nERROR: No database type selected\n"
usage
exit 1
# If the CADD module was set, check that a file of scores was provided and that the file exists
else
if [ "$newCADDdata" == "" ]; then
echo -e "\nERROR: No new CADD scores provided\n"
usage
exit 1
fi
if [ ! -e "$newCADDdata" ]; then
echo -e "\nERROR: New CADD scores file $newCADDdata does not exist\n"
usage
exit 1
fi
fi
fi
# Perform checks specific to the CADD module, if it was set
if $caddModule; then
# Check that the InDel database given in the config file exists
if [ ! -e $caddInDels ]; then
echo -e "\nERROR: $caddInDels does not exist\n"
usage
exit 1
fi
# Get the last-modification time of the InDel database
caddInDelsTime=`stat -c%Y $caddInDels`
# Check that the index for the InDel database exists
if [ ! -e ${caddInDels}.tbi ]; then
echo -e "\nERROR: the index for $caddInDels does not exist. Please make it\n"
usage
exit 1
fi
# Get the last-modification time of the InDel database index
caddInDelsIndexTime=`stat -c%Y ${caddInDels}.tbi`
# Check that the InDel database has not been modified more recently than its index
if [ $caddInDelsTime -gt $caddInDelsIndexTime ]; then
echo -e "\nERROR: the index for $caddInDels is out of date. Please remake it\n"
usage
exit 1
fi
# Check that the SNV database given in the config file exists
if [ ! -e $caddSNVs ]; then
echo -e "\nERROR: $caddSNVs does not exist\n"
usage
exit 1
fi
# Get the last-modification time of the SNV database
caddSNVsTime=`stat -c%Y $caddSNVs`
# Check that the index for the SNV database exists
if [ ! -e ${caddSNVs}.tbi ]; then
echo -e "\nERROR: the index for $caddSNVs does not exist. Please make it\n"
usage
exit 1
fi
# Get the last-modification time of the SNV database index
caddSNVsIndexTime=`stat -c%Y ${caddSNVs}.tbi`
# Check that the SNV database has not been modified more recently than its index
if [ $caddSNVsTime -gt $caddSNVsIndexTime ]; then
echo -e "\nERROR: the index $caddSNVs is out of date. Please remake it\n"
usage
exit 1
fi
# Check that the local non-standard variant database given in the config file exists
if [ ! -e $caddExtras ]; then
echo -e "\nERROR: $caddExtras does not exist\n"
usage
exit 1
fi
# Get the last-modification time of the local non-standard variant database
caddExtrasTime=`stat -c%Y $caddExtras`
# Check that the index for the local non-standard variant database exists
if [ ! -e ${caddExtras}.tbi ]; then
echo -e "\nERROR: the index for $caddExtras does not exist. Please make it\n"
usage
exit 1
fi
# Get the last-modification time of the local non-standard variant database index
caddExtrasIndexTime=`stat -c%Y ${caddExtras}.tbi`
# Check that the local non-standard variant database has not been modified more recently than its index
if [ $caddExtrasTime -gt $caddExtrasIndexTime ]; then
echo -e "\nERROR: the index for $caddExtras is out of date. Please remake it\n"
usage
exit 1
fi
realCaddSNVs=`readlink -f $caddSNVs` # Get the target of the CADD SNV database soft link
caddDir=`dirname $realCaddSNVs` # Get the CADD directory
caddGenome=`basename $caddDir` # Get the genome of the CADD version
caddVersion=`basename $(dirname $caddDir)` # Get the CADD version
elif $outputAllCadd; then
echo -e "\nERROR: -a (output all variants, even ones missing CADD scores) was set without -c (add CADD annotation) also being set.\n"
usage
exit 1
fi
# Perform checks specific to the RVIS module, if it was set
if $rvisModule; then
# Check that the RVIS database exists
if [ ! -e $rvisDb ]; then
echo -e "\nERROR: $rvisDb does not exist\n"
usage
exit 1
fi
# Check that the UCSC gene table exists
if [ ! -e $geneTable ]; then
echo -e "\nERROR: $geneTable does not exist\n"
usage
exit 1
fi
# Get the last-modification time of the UCSC gene table
geneTableTime=`stat -c%Y $geneTable`
# Check that the index for the UCSC gene table exists
if [ ! -e ${geneTable}.tbi ]; then
echo -e "\nERROR: the index for $geneTable does not exist. Please make it\n"
usage
exit 1
fi
# Check that the UCSC gene table has not been modified more recently than its index
geneTableIndexTime=`stat -c%Y ${geneTable}.tbi`
if [ $geneTableTime -gt $geneTableIndexTime ]; then
echo -e "\nERROR: the index for $geneTable is out of date. Please remake it\n"
usage
exit 1
fi
realRvisDb=`readlink -f $rvisDb` # Get the target of the RVIS database soft link
rvisRelease=`basename $(dirname $realRvisDb)` # Get the RVIS release number
realGeneTable=`readlink -f $geneTable` # Get the target of the UCSC gene table soft link
geneTableGenome=`basename $(dirname $realGeneTable)` # Get the genome of the UCSC gene table
fi
# Perform checks specific to the ExAC module, if it was run
if $exacModule; then
# Check that the ExAC database exists
if [ ! -e $exacDb ]; then
echo -e "\nERROR: $exacDb does not exist\n"
usage
exit 1
fi
# Get the last-modified time of the ExAC database
exacDbTime=`stat -c%Y $exacDb`
# Check that the index for the ExAC database exists
if [ ! -e ${exacDb}.tbi ]; then
echo -e "\nERROR: the index for $exacDb does not exist. Please make it\n"
usage
exit 1
fi
# Get the last-modified time of the ExAC database index
exacDbIndexTime=`stat -c%Y ${exacDb}.tbi`
# Check that the ExAC database has not been modified more recently than it index
if [ $exacDbTime -gt $exacDbIndexTime ]; then
echo -e "\nERROR: the index for $exacDb is out of date. Please remake it\n"
usage
exit 1
fi
realExacDb=`readlink -f $exacDb` # Get the target of the ExAC database soft link
exacRelease=`basename $(dirname $realExacDb)` # Get the ExAC release number
exacRefFile=`gunzip -c $exacDb | grep -m1 "^##reference=file:"` # Get the reference genome file used in the creation of the ExAC database
# Check that the reference genome file was the hg19 fasta file
if [ "$exacRefFile" == "##reference=file:///seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta" ]; then
exacGenome="hg19"
else
echo -e "\nERROR: did not recognize reference file ($exacRefFile) in ExAC database $exacDb. Please use a different ExAC database or update this script\n"
usage
exit 1
fi
fi
# Check that script is not being run on Helix
onHelix=false
if [ $HOSTNAME == "helix.nih.gov" ]; then
onHelix=true
fi
if $onHelix && $annotateMode; then
echo -e "\nERROR: Annotate Mode must be run on from Biowulf2\n"
usage
exit 1
fi
# Check that if user claimed that the input file was already prepped, that it does not contain a line with more than one variant
if $preppedFile; then
multiVariants=`tail -n +$fieldsLine $inputFile | cut -f4,5 | grep ","`
if [ "$multiVariants" != "" ]; then
echo -e "\nERROR: The '-p' flag was used with a file that has more than one variant per line\n"
usage
exit 1
fi
fi
# Run Annotation Mode
if $annotateMode; then
# Create the variables for bcftools and the annotate script
inputBase=`basename $inputFile`
inputBaseStem=${inputBase%.vcf}
# Set the file to hold the header of the VCF file
header=$tempDir/${inputBaseStem}.header.vcf
# Pull the header from the input VCF file
((fieldsLine--))
head -$fieldsLine $inputFile | grep -v "^##INFO=<ID" > $header
# If the CADD module was set, print the CADD annotation definitons to the header
if $caddModule; then
echo "##INFO=<ID=cadd_raw,Number=1,Type=Float,Description=\"The raw CADD score for the variant. From CADD ${caddVersion}, $caddGenome\">" >> $header
echo "##INFO=<ID=cadd_phred,Number=1,Type=Float,Description=\"The Phred-scaled CADD score for the variant. From CADD ${caddVersion}, $caddGenome\">" >> $header
fi
# If the RVIS module was set, print the RVIS annotation definition to the header
if $rvisModule; then
echo "##INFO=<ID=rvis,Number=.,Type=String,Description=\"The 0.01% RVIS score and its percentile for the gene(s) of the variant. From RVIS ${rvisRelease}, $geneTableGenome\">" >> $header
fi
# If the ExAC module was set, print the ExAC annotation definition to the header
if $exacModule; then
echo "##INFO=<ID=exac_AC_hom,Number=1,Type=Float,Description=\"The ExAC homozygote allele count. From ExAC ${exacRelease}, ${exacGenome}\">" >> $header
fi
# Pull the '#CHROM...' line from the input VCF file and append it to the header
((fieldsLine++))
head -$fieldsLine $inputFile | tail -1 >> $header
# Set the file to become the VCF file to be annotated
annotationInput=$tempDir/${inputBaseStem}.annotationPrepped.vcf
# If the input VCF file was not prepped, use bcftools to split and left-align it
if ! $preppedFile; then
# Set the files to hold the temporary versions of the input as it's processed
cleanFORMATvcf=$tempDir/${inputBaseStem}.pgt_pid_removed.vcf
splitVCF=$tempDir/${inputBaseStem}.splitVariants.vcf
strippedVCF=$tempDir/${inputBaseStem}.infoStripped.vcf
newBody=$tempDir/${inputBaseStem}.newBody.vcf
# Remove PGT and PID FORMAT fields from any lines that have them
#py
# Strip the INFO fields
[ -e $newBody ] && rm $newBody
echo "Stripping the INFO fields"
awk 'BEGIN {FS="\t"; OFS="\t"} ($1 !~ "\#.*") {$8="."; print $0}' $inputFile > $newBody
# Combine the header with the info-stripped body
cat $header $newBody > $strippedVCF
rm $header
rm $newBody
module load samtools/1.2
echo "Running bcftools to prepare VCF file for annotation"
# Run bcftools to split lines in the VCF file so that there is one variant per line
bcftools norm -m-both -o $splitVCF $strippedVCF
if [ $? != "0" ]; then
echo -e "\nERROR: bcftools failed while trying to split the variants in $strippedVCF\n"
exit 1
fi
# Run bcftools to left-align the REF/ALT sequences on each line
bcftools norm -f "$referenceFasta" -o "$annotationInput" "$splitVCF"
if [ $? != "0" ]; then
echo -e "\nERROR: bcftools failed while trying to left-normalize the sequence positions in $splitVCF\n"
exit 1
fi
# If the user doesn't want to keep the temporary files, remove them
if ! $keepTemp; then
rm $strippedVCF
rm $splitVCF
fi
# If the input VCF file was prepped, combine the appended header with the body
else
((fieldsLine++))
vcfBody=$tempDir/${inputBaseStem}.bodyOnly.vcf
tail -n +$fieldsLine $inputFile > $vcfBody
((fieldsLine--))
cat $header $vcfBody > $annotationInput
fi
echo "Splitting VCF file into parts"
# Set the variables to be used in creating new filenames
annotationBase=`basename $annotationInput`
annotationBaseStem=${annotationBase%.vcf}
annotationPart=$tempDir/${annotationBaseStem}.part
# Split the body of the VCF file to be annotation into parts
fieldsLine=`grep -n -m1 "^#CHROM" $annotationInput | cut -d':' -f1`
((fieldsLine++))
# If the user did not specify a size, set the chunk size so that no more than 100 pieces, each at least 1,000 lines, are created
if [ "$chunkSize" == "" ]; then
lineCount=`tail -n +$fieldsLine $annotationInput | wc -l`
chunkSize=${lineCount%??}
((chunkSize++))
if [ $chunkSize -lt 1000 ]; then
chunkSize=1000
fi
fi
split -a4 -l $chunkSize <(tail -n +$fieldsLine $annotationInput) $annotationPart
((fieldsLine--))
# Set the name of the output VCF file
successFile=$outputDir/$annotationBaseStem.
if $caddModule; then # If the CADD module was set, include it in the name
successFile+="CADD_"
fi
if $rvisModule; then # Same for the RVIS module
successFile+="RVIS_"
fi
if $exacModule; then # Same for the ExAC module
successFile+="ExAC_"
fi
successFile+="annotated.vcf"
echo "Submitting Biowulf2 annotation jobs for each part of VCF file"
# Loop through pieces of the annotation-ready input VCF file
for file in ${annotationPart}*; do
# Set up the variables used between annotation modules
partID=${file##*.}
fileForModule=$file
caddJob=${file%.part*}.caddJob.$partID
rvisJob=${file%.part*}.rvisJob.$partID
exacJob=${file%.part*}.exacJob.$partID
# If the CADD module was set, create an sbatch file and job for it
if $caddModule; then
fileForModuleOut=${fileForModule%.part*}.withCADD.$partID # The output file for the CADD annotation on this piece
fileForModuleCaddFail=${fileForModule%.part*}.noCADD.$partID # The output file for the unsuccessful CADD annotations on this piece
caddCMD=${file%.part*}.caddModuleCMD.$partID # The file to hold the commands submitted to Biowulf2
echo "#!/bin/bash" > $caddCMD # Set the language
echo "#SBATCH --job-name=CADD$partID" >> $caddCMD # Set the job ID
echo "if [ ! -e $fileForModule ]; then" >> $caddCMD # Check that the input file exists
echo -e "\techo \"ERROR: CADD input, $fileForModule, does not exist\"" >> $caddCMD # Print an error if it doesn't
echo -e "\texit 1" >> $caddCMD # Then quit the job
echo "fi" >> $caddCMD
# Execute the CADD annotation worker scrit, feeding the file to annotate, the output file, the failed output file, and the reference databases
echo -n "$scriptDir/annotateCADD.worker.sh -i $fileForModule -o $fileForModuleOut -f $fileForModuleCaddFail -s $caddSNVs -d $caddInDels -e $caddExtras" >> $caddCMD
if ! $preppedFile && $clearInfo; then # If the file wasn't prepped and user wants to erase the INFO fields, set the flag for that
echo -n " -c" >> $caddCMD
fi
if $outputAllCadd; then # If the user wants every variant in the output file, set the flag for that
echo -n " -a" >> $caddCMD
fi
echo "" >> $caddCMD
if ! $keepTemp; then # If the user doesn't want to keep the temporary files, include commands to delete:
echo "rm $fileForModule" >> $caddCMD # the input file to the worker script
echo "rm $caddCMD" >> $caddCMD # the command script itself
fi
# Submit the launch commands for the CADD annotation to Biowulf2, saving the output and error streams to file, and using a 'quick' partition
sbatch --error=$tempDir/${annotationBaseStem}.CADD.${partID}.err --output=$tempDir/${annotationBaseStem}.CADD.${partID}.out --partition=$partition --time=$walltime $caddCMD > $caddJob
# Set the input file for the next module as the output file of this module
fileForModule=$fileForModuleOut
fi
# If the RVIS module was set, create an sbatch file and job for it
if $rvisModule; then
fileForModuleOut=${fileForModule%.part*}.withRVIS.$partID # The output file for the RVIS annotation on this piece
rvisCMD=${file%.part*}.rvisModuleCMD.$partID # The file to hold the commands submitted to Biowulf2
echo "#!/bin/bash" > $rvisCMD # Set the language
echo "#SBATCH --job-name=RVIS$partID" >> $rvisCMD # Set the job ID
echo "if [ ! -e $fileForModule ]; then" >> $rvisCMD # Check that the input file exists
echo -e "\techo \"ERROR: RVIS input, $fileForModule, does not exist\"" >> $rvisCMD # Print an error if it doesn't
echo -e "\texit 1" >> $rvisCMD # Then quit the job
echo "fi" >> $rvisCMD
# Execute the RVIS annotation worker script, feeding the file to annotate, the output file, the RVIS database, and the UCSC gene table
echo -n "$scriptDir/annotateRVIS.worker.sh -i $fileForModule -o $fileForModuleOut -r $rvisDb -g $geneTable" >> $rvisCMD
# If the file wasn't prepped, the CADD module wasn't set, and the user wants to erase the INFO fields, set the flag for that
if ! $preppedFile && ! $caddModule && $clearInfo; then
echo -n " -c" >> $rvisCMD
fi
echo "" >> $rvisCMD
if ! $keepTemp; then # If the user doesn't want to keep the temporary files, include commands to delete:
echo "rm $fileForModule" >> $rvisCMD # the input file to the worker script
echo "rm $rvisCMD" >> $rvisCMD # the command script itself
fi
# Submit the launch commands for the RVIS annotation to Biowulf2, saving the output and error streams to file, and using a 'quick' partition
# If a CADD job was run for this piece, get the job ID for that annotation and set this module to be dependent on that one
if [ -e "$caddJob" ]; then
caddJobID=`cat $caddJob`
if ! $keepTemp; then
rm $caddJob
fi
sbatch --error=$tempDir/${annotationBaseStem}.RVIS.${partID}.err --output=$tempDir/${annotationBaseStem}.RVIS.${partID}.out --partition=$partition --time=$walltime --dependency=afterany:$caddJobID $rvisCMD >> $rvisJob
else
sbatch --error=$tempDir/${annotationBaseStem}.RVIS.${partID}.err --output=$tempDir/${annotationBaseStem}.RVIS.${partID}.out --partition=$partition --time=$walltime $rvisCMD >> $rvisJob
fi
# Set the input file for the next module as the output file of this module
fileForModule=$fileForModuleOut
fi
# If the ExAC module was set, create an sbatch file and job for it
if $exacModule; then
fileForModuleOut=${fileForModule%.part*}.withExAC.$partID # The output file for the ExAC annotation on this piece
exacCMD=${file%.part*}.exacModuleCMD.$partID # The file to hold the commands submitted to Biowulf2
echo "#!/bin/bash" > $exacCMD # Set the language
echo "#SBATCH --job-name=ExAC$partID" >> $exacCMD # Set the job ID
echo "if [ ! -e $fileForModule ]; then" >> $exacCMD # Check that the input file exists
echo -e "\techo \"ERROR: ExAC input, $fileForModule, does not exist\"" >> $exacCMD # Print an error if it doesn't
echo -e "\texit 1" >> $exacCMD # Then quit the job
echo "fi" >> $exacCMD
# Execute the ExAC annotation worker script, feeding the file to annotate, the output file, and the ExAC database
echo -n "$scriptDir/annotateExAC.worker.sh -i $fileForModule -o $fileForModuleOut -d $exacDb" >> $exacCMD
# If the input VCF file wasn't prepped, no other module was set, and the user wants to erase the INFO fields, set the flag for that
if ! $preppedFile && ! $caddModule && ! $rvisModule && $clearInfo; then
echo -n " -c" >> $exacCMD
fi
echo "" >> $exacCMD
if ! $keepTemp; then # If the user doesn't want to keep the temporary files, include commands to delete:
echo "rm $fileForModule" >> $exacCMD # the input file to the worker script
echo "rm $exacCMD" >> $exacCMD # the command scrit itself
fi
# Submit the launch commands for the ExAC annotation to Biowulf2, saving the output and error streams to file, and using a 'quick' partition
# If a RVIS job was run for this piece, get the job ID for that annotation and set this module to be dependent on that one
if [ -e "$rvisJob" ]; then
rvisJobID=`cat $rvisJob`
if ! $keepTemp; then
rm $rvisJob
fi
sbatch --error=$tempDir/${annotationBaseStem}.ExAC.${partID}.err --output=$tempDir/${annotationBaseStem}.ExAC.${partID}.out --partition=$partition --time=$walltime --dependency=afterany:$rvisJobID $exacCMD >> $exacJob
# If not a RVIS but a CADD job was run for this piece, use the job ID to set this module to be dependent on that one
elif [ -e "$caddJob" ]; then
caddJobID=`cat $caddJob`
if ! $keepTemp; then
rm $caddJob
fi
sbatch --error=$tempDir/${annotationBaseStem}.ExAC.${partID}.err --output=$tempDir/${annotationBaseStem}.ExAC.${partID}.out --partition=$partition --time=$walltime --dependency=afterany:$caddJobID $exacCMD >> $exacJob
else
sbatch --error=$tempDir/${annotationBaseStem}.ExAC.${partID}.err --output=$tempDir/${annotationBaseStem}.ExAC.${partID}.out --partition=$partition --time=$walltime $exacCMD >> $exacJob
fi
# Set the input file for the next module as the output file of this module
fileForModule=$fileForModuleOut
fi
done
# Combine the annotation files
allJobs=$tempDir/${inputBase}.allJobs # Set the file name to contain the job IDs for all the annotation jobs
cat ${annotationPart%.part*}.*Job.part* > $allJobs # Print all the job IDs to the file
jobIDs=`tr '\n' ',' < $allJobs` # Change the list from line-separated to comma-delimited
jobIDs=${jobIDs%,} # Remove the trailing comma
concatCMD=$tempDir/${annotationBaseStem}.collectAll.cmd # Set the file to hold the concatenation and clean-up commands
echo "#!/bin/bash" > $concatCMD # Set the language
echo "#SBATCH --job-name=ConcatParts" >> $concatCMD # Set the job ID
# Concatenate the header of the annotation-input file and all the final annotated pieces of the body of the input VCF file
echo "cat <(head -$fieldsLine $annotationInput) ${fileForModule%.part*}* > $successFile" >> $concatCMD
# If the CADD module was set, also concatenate the failed output file from the CADD annotation
if $caddModule; then
echo "cat <(head -$fieldsLine $annotationInput) ${fileForModuleCaddFail%.part*}* > $outputDir/${annotationBaseStem}.noCADD.vcf" >> $concatCMD
# If the user doesn't want to keep the temporary files, remove all the failed CADD pieces
if ! $keepTemp; then
echo "rm ${fileForModuleCaddFail%.part*}.part*" >> $concatCMD
fi
fi
# If the user doesn't want to keep the temporary files, include commnads to remove them
if ! $keepTemp; then
echo "rm $annotationInput" >> $concatCMD
# Create a loop to catch all the sbatch output & error files that aren't empty and save them
echo "for file in $tempDir/${annotationBaseStem}*part*err $tempDir/${annotationBaseStem}*part*out; do" >> $concatCMD
echo " if [ -s \"\$file\" ]; then" >> $concatCMD
echo " newFile=\$(echo \"\$file\" | sed s/\\.part/./)" >> $concatCMD
echo " mv \$file \$newFile" >> $concatCMD
echo " fi" >> $concatCMD
echo "done" >> $concatCMD
echo "rm ${annotationPart%.part*}.*.part*" >> $concatCMD
echo "rm $allJobs" >> $concatCMD
echo "rm $concatCMD" >> $concatCMD
fi
# Submit the lauch commands for the concatenation script, saving the output and error streams to file, using a 'quick' partition, and making it dependent on all the annotation jobs
sbatch --error=$tempDir/${annotationBaseStem}.ConcatParts.err --output=$tempDir/${annotationBaseStem}.ConcatParts.out --partition=quick --dependency=afterany:$jobIDs $concatCMD
echo "This script is finished, but the annotation jobs most likely are not. Check your jobload for further information."
fi
# Run Database Mode
if $databaseMode; then
if $caddModule; then
updateCaddCMD="$scriptDir/append_non-standard-CADD.sh -v $caddVersion -g $caddGenome -n $newCADDdata"
if $caddUploads; then
updateCaddCMD+=" -u"
fi
$updateCaddCMD
fi
fi