forked from neurocompnerds/VariantAnnotationToolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ANNOVARv3.sh
executable file
·131 lines (118 loc) · 7.72 KB
/
ANNOVARv3.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/bin/bash
# ANNOVARv3.sh my.vcf
# Variables that usually don't need changing once set for your system
AnnovarPATH=/opt/annovar # Where the Annovar program is
SCRIPTPATH="$(dirname "$(readlink -f "$0")")" # Where the python & perl scripts for Annovar or other general scripts are
BUILD=hg19 # Genome build used by ANNOVAR either hg18 or $BUILD. This will also be incorporated into file names
AV_INPUT=$1.avinput
AV_DB=/opt/annovar/humandb/
usage()
{
echo "# $0 A script to annotate variants using ANNOVAR
# Usage $0 VCF.filename.vcf | -h
#
# -h --help Prints this message
#
# Example:
# $0 GATK.filtered.snps.vcf
#
# Paths currently set
# AnnovarPATH=$AnnovarPATH
# SCRIPTPATH=$SCRIPTPATH
# BUILD=$BUILD
# AV_DB=$AV_DB
#
# History:
# Original: Mark Corbett
# Modified (date; name; description)
# 25/11/2013; Mark Corbett; Extract Annovar script from BWA-GATK-Annovar pipeline and update to latest version
# 29/11/2013; Mark Corbett; Standardise variable assignments. Fix clean up script creation. Add usage.
# 17/03/2014; Mark Corbett; Add call to AnnovarGenomeSummaryCombo.sh
# 22/04/2014; Mark Corbett; Update to include new databases
# 04/08/2014; Mark Corbett; Change splicing threshold from default 2 to 12
# 04/08/2014; Mark Corbett; Change to using table_annovar.pl as summarise_annovar.pl is deprecated
# 08/09/2014; Thuong Ha; Update to include new databases
# 31/10/2014; Mark Corbett; Add scary number of variants in ExAC database v0.1
# 27/05/2015; Mark Corbett; Bring up to date with the March 2015 version of ANNOVAR
# 04/01/2016; Mark corbett; Replace vcf4old with vcf4 as new GATK outputs this format
# 11/04/2016; Ali Gardner; Update new databases used with Annovar
# 11/07/2018; Mark Corbett; Organise table order see also AnnovarGenomeSummaryCombo.v2.sh
# 03/09/2018; Ali Gardner; Update most recent ID gene list
# 01/03/2019; Mark Corbett; Update to match Ali's December 2018 version. Now annotates multisample VCF.
# 15/03/2019; Mark Corbett; Remove Eye gene list and replace with CP gene bed file.
# 11/06/2019; Ali Gardner; Updated some databases in Annovar
# 28/04/2020; Ali Gardner; Updated Annovar to version 24/10/19
# 15/06/2020; Ali Gardner; Updated Clinvar, ID & Epilepsy lists
# 01/09/2020; Ali Gardner; Added SplicAI data, Mackenzie & 5'UTR genes
"
}
if [ -z "$1" ] ; then
usage
exit
fi
case $1 in
-h | --help ) usage
exit
;;
esac
# Create clean up script
#cp $SCRIPTPATH/BWA-Picard-GATK-CleanUp.sh $1.CleanUp.sh # Items are added to this script that can then be run later to delete all redundant files
#Annovar - Conversion to Annovar file format. Will skip if file exists so if you think the file is dodgy need to delete it
if [ ! -f $AV_INPUT ]; then
perl $AnnovarPATH/convert2annovar.pl --format vcf4 --allsample --withfreq --includeinfo $1 > $AV_INPUT
fi
# Run Annovar
# This section is starting to slow down might need to consider running some filters seperately and in parallel in future
perl $AnnovarPATH/table_annovar.pl -thread 8 $AV_INPUT $AV_DB/ \
--buildver $BUILD \
--remove \
--protocol gene,phastConsElements46way,genomicSuperDups,esp6500siv2_all,ExAC.r0.1.filtered,1000g2015aug_all,UK10K-AF-all,cg69,Wellderly_v1-0-all,popfreq_max_20150413,avsnp150,snp138NonFlagged,clinvar_20210501,dbnsfp35a,\
dbnsfp31a_interpro,dbscsnv11,regsnpintron,gwava,spidex,spliceai_filtered,\
dgvMerged,evoCpg,cpgIslandExt,evofold,gwasCatalog,mirCodeMicroRNAsites,switchDbTss,targetScanS,tfbsConsSites,vistaEnhancers,wgEncodeRegTfbsClusteredV3,wgRna \
--operation g,r,r,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,r,r,r,r,r,r,r,r,r,r,r,r \
--otherinfo \
--nastring . \
--csvout \
-out $1.snps_annotated > $1.pipeline.log 2>&1
# ExAC and gnomAD
perl $AnnovarPATH/annotate_variation.pl --filter --buildver $BUILD --thread 8 --dbtype exac03 $AV_INPUT $AV_DB/ >> $1.pipeline.log 2>&1
perl $AnnovarPATH/annotate_variation.pl --filter --buildver $BUILD --thread 8 --dbtype gnomad211_exome $AV_INPUT $AV_DB/ >> $1.pipeline.log 2>&1
perl $AnnovarPATH/annotate_variation.pl --filter --buildver $BUILD --thread 8 --dbtype gnomad211_genome $AV_INPUT $AV_DB/ >> $1.pipeline.log 2>&1
# Generate special regionanno files
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype bed -bedfile Epilepsy_hg19_genes_June2020.bed $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
awk '{gsub("bed","EpilepsyGene",$1)}1' $AV_INPUT.$BUILD\_bed > $AV_INPUT.$BUILD.bed_new # replace bed with Epilepsy_gene
awk -v OFS="\t" '$1=$1' $AV_INPUT.$BUILD.bed_new > $AV_INPUT.$BUILD\_EpilepsyGene #Replace whitespaces with tabs in linux
rm $AV_INPUT.$BUILD.bed_new $AV_INPUT.$BUILD\_bed
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype bed -bedfile ID_gene_list_hg19_June2020.bed $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
awk '{gsub("bed","IDGene",$1)}1' $AV_INPUT.$BUILD\_bed > $AV_INPUT.$BUILD.bed_new # replace bed with ID_gene
awk -v OFS="\t" '$1=$1' $AV_INPUT.$BUILD.bed_new > $AV_INPUT.$BUILD\_IDGene # Replace whitespaces with tabs in linux
rm $AV_INPUT.$BUILD.bed_new $AV_INPUT.$BUILD\_bed
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype bed -bedfile hg19_CPgenesNov2018.bed $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
awk '{gsub("bed","CPgene",$1)}1' $AV_INPUT.$BUILD\_bed > $AV_INPUT.$BUILD.bed_new # replace bed with CPgene
awk -v OFS="\t" '$1=$1' $AV_INPUT.$BUILD.bed_new > $AV_INPUT.$BUILD\_CPgene #Replace whitespaces with tabs in linux
rm $AV_INPUT.$BUILD.bed_new $AV_INPUT.$BUILD\_bed
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype bed -bedfile Mckenzie_hg19_genes_Aug20.bed $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
awk '{gsub("bed","MckenzieGene",$1)}1' $AV_INPUT.$BUILD\_bed > $AV_INPUT.$BUILD.bed_new # replace bed with Mckenzie_gene
awk -v OFS="\t" '$1=$1' $AV_INPUT.$BUILD.bed_new > $AV_INPUT.$BUILD\_MckenzieGene # Replace whitespaces with tabs in linux
rm $AV_INPUT.$BUILD.bed_new $AV_INPUT.$BUILD\_bed
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype bed -bedfile 5UTR_hg19_genes_Aug20.bed $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
awk '{gsub("bed","5UTRGene",$1)}1' $AV_INPUT.$BUILD\_bed > $AV_INPUT.$BUILD.bed_new # replace bed with 5UTR_gene
awk -v OFS="\t" '$1=$1' $AV_INPUT.$BUILD.bed_new > $AV_INPUT.$BUILD\_5UTRGene # Replace whitespaces with tabs in linux
rm $AV_INPUT.$BUILD.bed_new $AV_INPUT.$BUILD\_bed
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype bed -bedfile CMHg19_080914TH_50bp.bed $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
awk '{gsub("bed","MCDGene",$1)}1' $AV_INPUT.$BUILD\_bed > $AV_INPUT.$BUILD.bed_new # replace bed with MCDgene
awk -v OFS="\t" '$1=$1' $AV_INPUT.$BUILD.bed_new > $AV_INPUT.$BUILD\_MCDGene # Replace whitespaces with tabs in linux
rm $AV_INPUT.$BUILD.bed_new $AV_INPUT.$BUILD\_bed
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype DDG2P --colsWanted 5,6,7,8,9 $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype LoFToolScores --colsWanted 4 $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype RVISExACscores --colsWanted 4,5 $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
perl $AnnovarPATH/annotate_variation.pl -regionanno -buildver $BUILD -dbtype GDIScores --colsWanted 4,5,6,7 $AV_INPUT $AV_DB >> $1.pipeline.log 2>&1
# Combine all variant files
python $SCRIPTPATH/annovar_combine_csv.py $AV_INPUT $AV_INPUT.$BUILD* > $AV_INPUT.combo.csv
#perl $SCRIPTPATH/vcfCSVFix.pl $AV_INPUT.combo.csv > $AV_INPUT.combo.txt
# Create genome summary combo file
$SCRIPTPATH/AnnovarGenomeSummaryCombo.v2.sh -c $AV_INPUT.combo.csv -g $1.snps_annotated.$BUILD\_multianno.csv -o $1
# Add files to clean up script
echo "rm $AV_INPUT" >> $1.CleanUp.sh
echo "rm $AV_INPUT.$BUILD\_*" >> $1.CleanUp.sh