-
Notifications
You must be signed in to change notification settings - Fork 1
/
AnnovarGenomeSummaryCombo.v2_for_hg38.sh
executable file
·136 lines (122 loc) · 6.28 KB
/
AnnovarGenomeSummaryCombo.v2_for_hg38.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/bin/bash
usage()
{
echo "## AnnovarGenomeSummaryCombo.sh -c ComboFile.csv -g GenomeSummaryFile.csv -o OUTPREFIX [-h | --help]
# Script is specific to the annovar pipeline in neurogenetics this will eventually be merged with that script.
# For combining two output files:
# OUTPREFIX.annotated.genome_summmary.fixed.csv
# OUTPREFIX.snps.filtered.avinput.combo.txt
#
# Options:
# -c Combo file
# -g ANNOVAR genome_summary or multianno file file
# -o OUTPREFIX Usually a DNA number that forms the prefix of the file (required)
# -h | --help Prints this message
#
# Example:
# AnnovarGenomeSummaryCombo.sh -c ComboFile.csv -g multianno.csv
#
# History:
# Created by: Mark Corbett, 16/07/2013, [email protected]
# Modified: (Date; Name; Modification)
# 02/12/2013; Mark Corbett; Added generic file specification. Now calculates number of cols in genome summary file
# 31/12/2013; Mark Corbett; Changed evaluation of total line numbers was working on previous version of Ubuntu
# 23/04/2014; Mark Corbett; Minor shuffle of column numbers to account for addition of new databases in ANNOVAR
# 14/05/2014; Mark Corbett; Added generation of a variant key
# 28/08/2014; Mark Corbett; Update to suit July 2014 ANNOVAR. Move script to version 2
# 08/09/2014; Thuong Ha; Minor shuffle of column numbers to account for addition of new databases in ANNOVAR
# 31/10/2014; Mark Corbett; Minor shuffle of column numbers to account for addition of new databases in ANNOVAR
# 31/10/2014; Mark Corbett; Fix missing VCF fields
# 27/05/2015; Mark Corbett; Update to suit March 2015 ANNOVAR
# 04/01/2016; Mark Corbett; Check if input VCF was bgzipped before extracting missing VCF fields
# 11/04/2016; Ali Gardner; Update for use with new databases in Annovar
# 11/07/2018; Mark Corbett; Change ordering and headers of new ANNOVAR output
# 15/03/2019; Mark corbett; Updated script to handle extra Otherinfo columns that get added using table_annovar.pl -allsamples flag
# 11/06/2019; Ali Gardner; Updated some databases in Annovar
# 27/03/2020; Mark Corbett; Fix the annoying otherInfo labels
# 11/06/2020; Ali Gardner; Update cols for hg38 databases
"
}
while [ "$1" != "" ]; do
case $1 in
-c ) shift
ComboFile=$1
;;
-g ) shift
GenomeFile=$1
;;
-o ) shift
OUTPREFIX=$1
;;
-h | --help ) usage
exit 1
;;
* ) usage
exit 1
esac
shift
done
if [ -z "$ComboFile" ]; then #If combo file is empty then do not proceed
echo "#ERROR: You need to specify the name of the ANNOVAR combo file"
usage
exit 1
fi
if [ -z "$GenomeFile" ]; then #If genome summary file is empty then do not proceed
echo "#ERROR: You need to specify the name of the ANNOVAR genome summary file"
usage
exit 1
fi
if [ -z "$OUTPREFIX" ]; then #If no outprefix specified then do not proceed
echo "#ERROR: You need to specify a prefix for the file output"
usage
exit 1
fi
# Sanity Check
LinesA=$(wc -l < $ComboFile | sed 's/[^0-9]*//g')
LinesB=$(wc -l < $GenomeFile | sed 's/[^0-9]*//g')
if [ "$LinesA" -eq "$LinesB" ]; then
echo "# OK We're good to go. Now combining the files"
else
echo "#ERROR: The number of lines in both files is not the same.
If the VCF was from complete genomics try running the output of awk -F "," '\$5 !~/[\<\[\]]/' $ComboFile to a new combo file and use that instead"
wc -l $ComboFile
wc -l $GenomeFile
exit 1
fi
# Fix up field delimiters
awk -F\" '{for(i=1;i<=NF;i+=2) {gsub(",", "\t", $i)}}1' OFS= $ComboFile > $ComboFile.txt
awk -F\" '{for(i=1;i<=NF;i+=2) {gsub(",", "\t", $i)}}1' OFS= $GenomeFile > $GenomeFile.txt
NColsGenomeFile=$(awk -F "\t" 'FNR == 2 {print NF}' $GenomeFile.txt)
# Combine columns (specific to how ANNOVAR is run so may or may not work perfectly)
cut -f 1-5 $ComboFile.txt > $OUTPREFIX.tmp.1.txt # chr,start,end,ref,obs
cut -f 6-14 $GenomeFile.txt > $OUTPREFIX.tmp.2a.txt # Func.gene,Gene.refGene,GeneDetail,ExonicFunc,AAChange,Conserved,SegDup,ESP6500siv2_ALL,1000g2015aug_ALL
cut -f 14-16 $ComboFile.txt > $OUTPREFIX.tmp.2b.txt # exac03,gnomad_exome,gnomad_genome
cut -f 28-33 $GenomeFile.txt > $OUTPREFIX.tmp.3a.txt # snp150, clinvar (5cols)
cut -f 6,7,8,10,12 $ComboFile.txt > $OUTPREFIX.tmp.3b.txt # DDG2P,EpilepsyGene,CPGene,IDGene,MCDGene
cut -f 9,11,13 $ComboFile.txt > $OUTPREFIX.tmp.3c.txt # GDIScores,LoFToolScores,RVISExACscores
cut -f 34-106 $GenomeFile.txt > $OUTPREFIX.tmp.4.txt # All the other annotations
cut -f 15-27 $GenomeFile.txt > $OUTPREFIX.tmp.4b.txt # gnomad30_genome
cut -f 107-$NColsGenomeFile $GenomeFile.txt > $OUTPREFIX.tmp.5.txt # VCF info
# Replace Otherinfo labels with VCF fields"
if [ "${OUTPREFIX:(-2)}" = "gz" ]; then
VCFFIELDS=$(zcat $OUTPREFIX | grep -m1 \#CHROM) #Assumes $OUTPREFIX = a bgzipped vcf file which might be the case if fed from ANNOVARv3.sh
NcolsVCFfields=$(zcat $OUTPREFIX | grep -m1 \#CHROM | awk -F "\t" 'FNR == 1 {print NF}')
else
VCFFIELDS=$(grep -m1 \#CHROM $OUTPREFIX) #Assumes $OUTPREFIX = a vcf file which might be the case if fed from ANNOVARv3.sh
NcolsVCFfields=$(grep -m1 \#CHROM $OUTPREFIX | awk -F "\t" 'FNR == 1 {print NF}')
fi
NColsTmp5=$(awk -F "\t" 'FNR == 2 {print NF}' $OUTPREFIX.tmp.5.txt)
if [ "$NcolsVCFfields" -eq "$NColsTmp5" ]; then
printf "$VCFFIELDS\n" > $OUTPREFIX.tmp.6.txt
sed '1d' $OUTPREFIX.tmp.5.txt >> $OUTPREFIX.tmp.6.txt
mv $OUTPREFIX.tmp.6.txt $OUTPREFIX.tmp.5.txt
else # A bit risky but should work unless something weird happened
printf "Allele_Fraction\t$VCFFIELDS\n" > $OUTPREFIX.tmp.6.txt
cut -f 1,4-$NColsTmp5 $OUTPREFIX.tmp.5.txt | sed '1d'>> $OUTPREFIX.tmp.6.txt # Assume -withzyg info has been added. Keep the Allele fraction but chuck out the Depth and QUAL data
mv $OUTPREFIX.tmp.6.txt $OUTPREFIX.tmp.5.txt
fi
# Create a key for variant lookup chr-start-ref-obs
cut -f 1,2,4,5 $OUTPREFIX.tmp.1.txt | sed 's,\t,-,g' > $OUTPREFIX.tmp.key.txt
paste $OUTPREFIX.tmp.1.txt $OUTPREFIX.tmp.2a.txt $OUTPREFIX.tmp.2b.txt $OUTPREFIX.tmp.3a.txt $OUTPREFIX.tmp.3b.txt $OUTPREFIX.tmp.3c.txt $OUTPREFIX.tmp.4.txt $OUTPREFIX.tmp.4b.txt $OUTPREFIX.tmp.5.txt $OUTPREFIX.tmp.key.txt > $OUTPREFIX.GenomeAnnotationsCombined.txt
# Clean up temp files
rm $OUTPREFIX.tmp.1.txt $OUTPREFIX.tmp.2a.txt $OUTPREFIX.tmp.2b.txt $OUTPREFIX.tmp.3a.txt $OUTPREFIX.tmp.3b.txt $OUTPREFIX.tmp.3c.txt $OUTPREFIX.tmp.4.txt $OUTPREFIX.tmp.4b.txt $OUTPREFIX.tmp.5.txt $ComboFile.txt $GenomeFile.txt