-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfixing_tassel_vcf_seq.sh
270 lines (221 loc) · 8.38 KB
/
fixing_tassel_vcf_seq.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#!/bin/bash
# Function to display usage instructions
usage() {
echo
echo "###############################################"
echo "# #"
echo "# Help Information #"
echo "# #"
echo "###############################################"
echo
echo "Usage:"
echo -e "\t$0 [OPTIONS] ARGUMENT"
echo
echo "Description:"
echo -e "\tThis script will take a known reference genome (.fa) and a molecular marker file (.vcf.gz)"
echo -e "\tand checks for agreement between the marker file and the reference sequences. This function was made in"
echo -e "\treponse to the TASSEL v2.0 Standalone GBS pipeline only putting VCFs out in major/minor allele format."
echo -e "\tThis script ensures that the VCF shares the same ref/alt information as the reference genome."
echo
echo "Options:"
echo -e "\t-h, --help Display this help message and exit."
echo
echo "Arguments:"
echo -e "\t-v, --vcf, The VCF input (.vcf.gz file) file path. This must be the full length real path!"
echo -e "\t-r, --ref, The reference sequence (.fa file) file path that you are checking against the VCF. This must be the full length real path!"
echo -e "\t-n, --name The name of the new vcf.gz output."
echo
echo "Examples:"
echo -e "\tbash $0 -v 'example.vcf.gz' -r 'example.fa' -n 'fixed_example.vcf.gz'"
echo
exit 1
}
# Initialize variables
vcf=""
refseq=""
cores=""
name=""
# Parse command-line options
while getopts "v:r:n:h" opt; do
case ${opt} in
v | --vcf )
vcf=$OPTARG
;;
r | --ref )
refseq=$OPTARG
;;
n | --name )
name=$OPTARG
;;
h | --help )
usage
;;
\? )
echo "Invalid option: $OPTARG" 1>&2
usage
;;
: )
echo "Invalid option: $OPTARG requires an argument" 1>&2
usage
;;
esac
done
shift $((OPTIND -1))
# Check if vcf and refseq are provided
if [ -z "$vcf" ] || [ -z "$refseq" ]; then
echo
echo "*** Error: Both VCF and reference sequence files are required! ***"
usage
fi
# Check if the files exist
if [ ! -f "$vcf" ]; then
echo
echo "*** Error: VCF file '$vcf' not found! ***"
usage
fi
if [ ! -f "$refseq" ]; then
echo
echo "*** Error: Reference sequence '$refseq' not found! ***"
usage
fi
# Echo header
echo
echo "###############################################"
echo "# #"
echo "# TASSEL VCF Fixer v1.0 #"
echo "# #"
echo "###############################################"
echo
echo "Written by: Zachary J. Winn PhD"
echo "Contact information:"
echo -e "\tGovernment Email: [email protected]"
echo -e "\tPersonal Email: [email protected]"
echo
echo "###############################################"
echo "# WARNING: This program is not under warranty #"
echo "# Use at your own discretion! #"
echo "###############################################"
echo
# Function to check if a string is a number
is_number() {
# Regex pattern to match a number
number_pattern="^[0-9]+([.][0-9]+)?$"
# Check if the input matches the number pattern
if [[ $1 =~ $number_pattern ]]; then
echo 0
else
echo 1
fi
}
# Main script
echo "### Provided path of VCF.GZ file: $vcf"
echo
echo "### Provided path of reference sequence FA file: $refseq"
echo
# Print
echo "### Copying, unzipping, and editing vcf file..."
echo
# Make Temp Dir
if [ -d "fixing_tassel_vcf_temp" ]; then
echo "*** Error: The directory "fixing_tassel_vcf_temp" already exist! Delete that directory and try again! ***"
exit 1
fi
# Make a temp directory
mkdir fixing_tassel_vcf_temp
# Write out temp file which has the chromsomes listed as chr1A, chr1B, etc
cd ${PWD}/fixing_tassel_vcf_temp
cp $vcf ${PWD}/temp1.vcf.gz
# gunzip temp1.vcf.gz
# Change id in temp1.vcf.gz if needed
# if ! grep -q "##contig=<ID=Chr" temp1.vcf; then
# echo "*** Warning: string '##contig=<ID=Chr' not found in header, attempting to fix..."
# echo ""
# sed -i 's/##contig=<ID=/##contig=<ID=Chr/g' temp1.vcf
# fi
# if grep -q "UNKNOWN" temp1.vcf; then
# echo "*** Warning: string 'UNKNOWN' found in header, attempting to fix..."
# echo ""
# sed -i 's/UNKNOWN/Unknown/g' temp1.vcf
# fi
# Add Chr to each chromosome
# awk -v OFS='\t' '{ if ($1 !~ /^#/) $1 = "Chr" $1; print }' temp1.vcf > temp_file && mv temp_file temp1.vcf
# Make into GZ
# bgzip temp1.vcf
# Print
echo "### Pulling positions from sequences for the provided VCF..."
echo
# Make a correpsonding list of the correct ref and alt in the reference
bedtools getfasta -fi $refseq -bed temp1.vcf.gz > correct_ref_alt.txt
# Print
echo "### Looping through and fixing ref/alt marker info..."
echo
# Separate vcf header
zcat temp1.vcf.gz | grep --text '^#' > header.txt
# Separate vcf body
zcat temp1.vcf.gz | grep -v '^#' > body.txt
# Get marker info
bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%QUAL\t%FILTER\n' "temp1.vcf.gz" > marker_info.txt
# Find length
length=$(zcat temp1.vcf.gz | grep -v '^#' | wc -l)
# Make an empty body
touch fixed_body.vcf
# Loop and fix
for i in $(seq 1 $length); do
# Pull
CHROM=$(awk -F'\t' -v row=$i -v col=1 'NR==row {print $col}' "marker_info.txt")
POS=$(awk -F'\t' -v row=$i -v col=2 'NR==row {print $col}' "marker_info.txt")
BACKPOS=$((POS-1))
ID=$(awk -F'\t' -v row=$i -v col=3 'NR==row {print $col}' "marker_info.txt")
REF=$(awk -F'\t' -v row=$i -v col=4 'NR==row {print $col}' "marker_info.txt")
ALT=$(awk -F'\t' -v row=$i -v col=5 'NR==row {print $col}' "marker_info.txt")
BEDREF=$(echo ">${CHROM}:${BACKPOS}-${POS}")
BEDLOC=$(grep -n "$BEDREF" "correct_ref_alt.txt" | cut -d: -f1)
TRUEREF=$((BEDLOC+1))
TRUEREF=$(awk "NR == $TRUEREF" "correct_ref_alt.txt")
# Check if the GBS Ref and True Ref agree
if [ "$REF" = "$TRUEREF" ]; then
# Pull that line and put it in the new vcf
NEWLINE=$(awk -F'\t' "NR == $i" "body.txt")
echo "$NEWLINE" >> fixed_body.vcf
elif [ "$REF" != "$TRUEREF" ] && [ "$ALT" != "$TRUEREF" ]; then
# Pull that line, fix the ref alt, and then put in the new vcf
echo "********************************************************************"
echo "Error:"
echo "${ID} (Line = ${i}) is not in correct Ref/Alt!"
echo "The Reference provided states ${TRUEREF} is the correct allele..."
echo "However, the GBS provided states that Ref=${REF} and Alt=${ALT}..."
echo "Something is wrong with this marker! Omitting from the final VCF..."
echo "********************************************************************"
else
#do something else
echo "${ID} (Line = ${i}) is not in correct Ref/Alt! Fixing!"
NEWLINE=$(awk -F'\t' "NR == $i" "body.txt")
NEWLINE=($NEWLINE)
NEWLINE[3]=$TRUEREF
NEWLINE[4]=$REF
NEWLINE_SPACED=$(echo "${NEWLINE[@]}" | tr ' ' '\t')
echo "${NEWLINE_SPACED[@]}" >> fixed_body.vcf
fi
done
# Bind header and body
cat "header.txt" >> "corrected_genotyping_file.vcf"
cat "fixed_body.vcf" >> "corrected_genotyping_file.vcf"
# Change id in temp1.vcf.gz
sed -i 's/##contig=<ID=Chr/##contig=<ID=/g' corrected_genotyping_file.vcf
sed -i 's/Unknown/UNKNOWN/g' corrected_genotyping_file.vcf
sed -i 's/Chr//g' corrected_genotyping_file.vcf
sed -i 's/Reference allele is not known. The major allele was used as reference allele/Alleles reported as reference-alternative/g' corrected_genotyping_file.vcf
# Zip it up using bgzip
bgzip corrected_genotyping_file.vcf
# Move to top directory
mv corrected_genotyping_file.vcf.gz ../
# Change directory back to top
cd ..
# Change name
mv corrected_genotyping_file.vcf.gz $name
# Index the vcf.gz
bcftools index -c $name
# Print
echo "### Done!"
# Remove
rm -rf fixing_tassel_vcf_temp