-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGlobal-local-ancestry-pipe.sh
264 lines (229 loc) · 11.8 KB
/
Global-local-ancestry-pipe.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# Author: Colin Naughton, inspired by Varsha Bhat
# May 2022
# Global Ancestry prediction pipeline.
#
# Pipeline overview: phase the query cohort with Beagle, harmonize it with
# 1000 Genomes reference VCFs, estimate global ancestry with ADMIXTURE, then
# local ancestry with RFMix2 and Tractor. Designed for a PBS cluster: relies
# on $PBS_O_WORKDIR (submission directory) and $PBS_NODEFILE (node list), with
# GNU parallel dispatching one chromosome per node (-N1 -j1 --sshloginfile).
# TODO: Get variable name for number of threads
##################### Phase autosomes with Beagle #########################################
# The presence of a "chr" prefix on chromosomes on either the map of vcf file but not the other causes an error.
# Changed the map files to include the "chr" prefix via:
# NOTE(review): this in-place sed is not idempotent — rerunning it yields
# "chrchr1" prefixes. Run once only (see TODO below about deleting maps).
seq 1 22 | xargs -I{} sed -i 's/^/chr/' beagle_maps/plink.chr{}.GRCh38.map
# TODO: Check for beagle map files. If not present, download. Delete after phasing
# TODO: Check for reference files. If not present, download. Is this necessary if no imputation is done?
# TODO: Check if "phase_out" directory exists, if not, then make
# TODO: Check for total memory available on each node; set as variable
#Note: Expects 96GB of memory total per node with 12 threads per node
mkdir $PBS_O_WORKDIR/phase_out
# Relative paths, resolved against $PBS_O_WORKDIR in the remote commands below.
beagle_maps=beagle_maps
raw_data=raw/genotype
beagle_out=phase_out
# Phase each autosome with Beagle, one chromosome per node, 12 threads each.
# impute=false: phase only; no genotype imputation is performed.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
java -Xmx96g -jar /storage/home/hcoda1/0/cnaughton7/p-ggibson3-0/software/beagle.22Jul22.46e.jar \
gt=$PBS_O_WORKDIR/$raw_data/chr{1}_Emory-alloimmunization.vcf.gz \
map=$PBS_O_WORKDIR/$beagle_maps/plink.chr{1}.GRCh38.map \
out=$PBS_O_WORKDIR/$beagle_out/chr{1}_phased_Emory-alloimmunization \
impute=false \
nthreads=12
# Remove "chr" prefixes from vcf entries.
# (sed anchors on ^chr, so '#'-prefixed header lines are untouched.)
# TODO: Do this before phasing!
seq 1 22 | parallel \
"zcat phase_out/chr{1}_phased_Emory-alloimmunization.vcf.gz | sed 's/^chr//' | bgzip > phase_out/chr{1}_phased_Emory-alloimmunization.fixedChrPrefix.vcf.gz"
# Remove multiallelic sites & add missing variant IDs
# sort removes duplicates based on location;
# awk replaces variants IDs with CHR_POS_REF_ALT from the same entry
mkdir $PBS_O_WORKDIR/rename_dedup_out
echo "Remove multiallelic sites & add missing variant IDs" > rename_dedup_out/README
# Per chromosome: pass the header (first 9 lines) through untouched, sort the
# body numerically on POS (-k2) keeping one record per position (-u drops the
# split records of multiallelic/duplicate sites), then overwrite column 3 (ID)
# with CHROM_POS_REF_ALT and re-bgzip. The $ and " escapes exist because the
# whole command is a double-quoted string handed to parallel.
# NOTE(review): 'head -n 9 && tail -n +10' reading the same pipe assumes head
# consumes exactly 9 lines; some head implementations over-read on pipes and
# drop body lines — verify header line count and behavior on this system.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
"zcat $PBS_O_WORKDIR/phase_out/chr{1}_phased_Emory-alloimmunization.fixedChrPrefix.vcf.gz | (head -n 9 && tail -n +10 | \
sort -n -u -t $'\t' -k2 | \
awk -v OFS='\t' '{\$3=\$1\"_\"\$2\"_\"\$4\"_\"\$5;print \$0}') | \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/bgzip > \
$PBS_O_WORKDIR/rename_dedup_out/chr{1}_phased_Emory-alloimmunization.fixedChrPrefix.biallelic.renamed.vcf.gz"
#Renaming ref variant IDs requires "head -n 22 && tail -n +23"
#################### Get sample names specific to each query population ##################################
# Originally used data from /~/gridftp/1000g/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/
# Now using updated data from
# Run from base directory
# Strip the header row from every igsr* metadata sheet and collect column 1
# (the sample ID). Appends (>>), so a rerun duplicates entries.
for metadata_file in igsr*; do
	tail -n +2 -- "$metadata_file"
done | cut -f1 >> query_population_sampleNames_noGender
# Include gender with sample names. NOTE: This was causing an issue for downstream analysis, so the "nogender" version was used.
for metadata_file in igsr*; do
	tail -n +2 -- "$metadata_file"
done | sed 's/female/F/;s/male/M/' | cut -f1,2 >> query_population_sampleNames_withGender
############# Subset 1000 Genomes VCF based on population-specific sample names #########################
# Subsets each per-chromosome 1000 Genomes VCF down to the reference samples
# listed in query_population_sampleNames_noGender (CEU/YRI panels).
# --force-samples: skip (don't fail on) listed samples absent from a VCF.
# TODO: Include variable for conda env path
# TODO: Add variable associated with reference population acronyms; add to output path
ref_sub_out=ref/subset
mkdir -p $PBS_O_WORKDIR/$ref_sub_out # make output dir if not present (was an open TODO)
# The -name pattern is quoted so the shell cannot glob-expand it before find
# sees it (unquoted, a matching file in the cwd would corrupt the search).
# The {=1 ... =} perl expression strips the directory prefix and release suffix
# from each input path to build the per-chromosome output name.
find ref/ALL_POP/ -name '*ALL.chr[0-9]*.gz' | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/bcftools view \
--threads 12 \
--force-samples \
-S $PBS_O_WORKDIR/query_population_sampleNames_noGender \
-Oz \
-o $PBS_O_WORKDIR/$ref_sub_out/{=1 's/ref\/ALL_POP\/ALL\.//;s/.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz//' =}.CEU-YRI.vcf.gz \
$PBS_O_WORKDIR/{1} #Input
#################### Index Phased Autosomes & Subsetted Ref VCFs#############################################
# bcftools isec (next section) requires both input VCFs to be indexed.
# NOTE(review): the original indexed "chr{1}_phased_.vcf.gz", a filename no
# earlier step produces; index the dedup/renamed query VCFs actually written.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/bcftools index \
$PBS_O_WORKDIR/rename_dedup_out/chr{1}_phased_Emory-alloimmunization.fixedChrPrefix.biallelic.renamed.vcf.gz
# NOTE(review): the subset reference VCFs were written to ref/subset
# ($ref_sub_out), not rename_dedup_out — index them where they live.
# Confirm no manual file move happened between runs.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/bcftools index \
$PBS_O_WORKDIR/ref/subset/chr{1}.CEU-YRI.vcf.gz
#################### Intersect VCFs ##########################################################
# Intersect all chromosome vcfs between St. Jude patient data and CEU-YRI 1000 Genomes data
# Note: VCFs must be indexed
# isec -p writes per-chromosome dirs containing 0002.vcf.gz (records of file 1
# shared with file 2) and 0003.vcf.gz (records of file 2 shared with file 1).
# Batch rename (Extra if needed)
#ls *.gz | sed 'p;s/.gz//' | xargs -n2 -P8 mv
mkdir -p $PBS_O_WORKDIR/intersect_out
# NOTE(review): the bcftools binary name was corrupted to
# "bEmory-alloimmunizationcftools" (stray paste of the cohort name); fixed.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/bcftools isec \
-Oz \
-p $PBS_O_WORKDIR/intersect_out/chr{1} \
$PBS_O_WORKDIR/rename_dedup_out/chr{1}_phased_Emory-alloimmunization.fixedChrPrefix.biallelic.renamed.vcf.gz \
$PBS_O_WORKDIR/ref/subset/chr{1}.CEU-YRI.vcf.gz # NOTE(review): subset VCFs were written to ref/subset, not rename_dedup_out/subset — confirm
################## Add variant names to vcf #################################################
# Sets variant IDs on the intersected VCFs:
#   intersect_out/chrN/0002*.gz = query records shared with the reference panel
#   intersect_out/chrN/0003*.gz = reference records shared with the query
# Remove?
# The following loops work; the subsequent oneliners don't
annotation_out=annotation_out # was referenced below but never defined (outputs would land in "/"); fixed
mkdir -p "$annotation_out"
for file in intersect_out/chr*/0002*gz; do
	# e.g. intersect_out/chr1/0002.vcf.gz -> chr1.vcf.gz
	out_name=$(echo "$file" | sed 's/intersect_out\///;s/\/0002//')
	/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/bcftools annotate \
	--set-id +'%CHROM\_%POS\_%REF\_%FIRST_ALT' \
	-Oz \
	--threads 8 \
	-o "$annotation_out/$out_name" \
	"$file"
done
# NOTE(review): this loop uses ':' as the ID separator while the query loop
# above uses '_' — any downstream join on IDs across the two sets will fail
# to match; confirm whether this divergence is intentional.
for file in intersect_out/chr*/0003*gz; do
	out_name=$(echo "$file" | sed 's/intersect_out\///;s/\/0003//')
	/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/bcftools annotate \
	--set-id +'%CHROM:%POS:%REF:%FIRST_ALT' \
	-Oz \
	--threads 12 \
	-o "$annotation_out/CEU-YRI_$out_name" \
	"$file"
done
###################### Convert to & merge plink files.###############################################
# NOTE: All samples did not have sex recorded in the output; they came up as "ambiguous"..
# Used Plink 1.9
# FUTURE: Make edits for use in PBS script
# Query data conversion
mkdir $PBS_O_WORKDIR/plink_out
# Query samples: isec output 0002 = query records shared with the reference.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/plink \
--vcf $PBS_O_WORKDIR/intersect_out/chr{1}/0002.vcf.gz \
--out $PBS_O_WORKDIR/plink_out/Emory-alloimmunization_chr{1} \
--make-bed \
--threads 12
# Reference samples: isec output 0003 = reference records shared with the query.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/plink \
--vcf $PBS_O_WORKDIR/intersect_out/chr{1}/0003.vcf.gz \
--out $PBS_O_WORKDIR/plink_out/CEU-YRI_chr{1} \
--make-bed \
--threads 12
# Merge chromosomal plink files for downstream ancestry analysis with 'admixture'
# The glob *[bf]* catches .bed/.bim/.fam (ls sorts them adjacently per prefix)
# and xargs -n3 emits one "bed bim fam" triple per line, the --merge-list format.
# NOTE(review): fragile — any other plink_out file whose name contains 'b' or
# 'f' would misalign the triples; verify directory contents before rerunning.
# NOTE(review): bare 'plink' here vs. the full conda path above — confirm both
# resolve to the same Plink 1.9 binary.
ls plink_out/*[bf]* | xargs -n3 echo > mergedList.txt
plink --make-bed --out plink_merge-all_out --merge-list mergedList.txt
################## Associate ancestry with sample names####################################
# Creates file with ancestry acronyms on each line in the case of ref samples, or '-' in the case of non-ref samples
# Takes a few minutes (nested scan of the igsr metadata per sample — O(n*m)).
# TODO: Add output name variable.
# TODO: Add sample prefix variable.
plink_fam_file=plink_merge-all_out.fam
# Read through sample names in the plink fam file.
# NOTE(review): the original backgrounded this whole loop with a trailing '&',
# racing with the admixture step that reads the finished .pop file; it now
# runs in the foreground so the .pop file is complete before admixture starts.
while read -r line; do
	# $line is deliberately unquoted here: echo collapses tabs/runs of spaces
	# so cut -d' ' can split the .fam columns uniformly.
	sample_name=$(echo $line | cut -f1 -d ' ')
	# Check if sample name has prefix of non-reference (query) samples.
	sample_prefix=${sample_name:0:2}
	if [[ $sample_prefix == 'NW' ]]; then
		echo '-' >> plink_merge-all_out.pop
		continue
	fi
	# Loop through ref metadata to associate ancestry with sample name
	while read -r ref_line; do
		pop=$(echo $ref_line | cut -f4 -d ' ') # Population acronym e.g. "CEU"
		pop_sample=$(echo $ref_line | cut -f1 -d ' ') # Reference sample name
		#Check if sample name from plink fam file is same as sample name from ref metadata
		if [[ $sample_name == $pop_sample ]]; then
			echo $pop >> plink_merge-all_out.pop
			break
		fi
	done < <(cat igsr*)
done < "$plink_fam_file"
################### Run admixture #####################################################
# The "-j" option indicates the number of threads
mkdir -p admixture_out
echo "Admixture run with 2 populations." > admixture_out/README
# Supervised run: reference samples' ancestries come from plink_merge-all_out.pop
# ('-' entries mark query samples whose ancestry is to be estimated).
admixture plink_merge-all_out.bed 2 --supervised -j24
paste plink_merge-all_out.fam plink_merge-all_out.pop plink_merge-all_out.2.Q > admixture_out/ancestry_estimates.txt # Associate global ancestry estimates with sample names.
# Make .tsv with admixture ancestry estimates including header.
# The regex is quoted so the shell cannot glob-expand 'NW.*' against files in
# the cwd (the original left it unquoted); '--' guards against option parsing.
grep -- 'NW.*' admixture_out/ancestry_estimates.txt | \
awk 'BEGIN {print "Sample_ID\tCEU\tYRI"} {print $1"\t"$8"\t"$9}' > admixture_out/query_ancestry_estimates.tsv
# R Code For plotting admixture results
# Wrapped in an Rscript here-doc so this file stays a runnable shell script
# (previously a bare R snippet that bash would have tried to execute; the
# quoted delimiter prevents the shell from expanding $CEU etc.).
# TODO: Add population legend labels to the x axis
Rscript - <<'RSCRIPT'
anc <- read.table("query_ancestry_estimates.tsv", header = TRUE)
anc <- anc[order(anc$CEU),]  # sort samples by European proportion for a clean gradient
# NOTE(review): base-graphics calls cannot be chained with '+' (that is ggplot2
# syntax) and barplot() has no 'fill' argument — both removed from the original.
barplot(t(as.matrix(anc[,2:3])), col=c("#0000ff","#F4A500"),
        main="Alloimmunization Cohort Global Ancestry Proportions",
        xlab="Patient (n=355)",
        ylab="Ancestry",
        space = 0,
        border=NA,
        xaxt = 'n',
        cex.lab=2,
        cex.axis = 1.8,
        cex.main = 1.8,
        mgp=c(1,-0.5,-1.5))
legend("top",
       legend = c("European (CEU)", "African (YRI)"),
       fill = c("#0000ff", "#F4A500"),
       cex =1.5)
RSCRIPT
################### Local Ancestry Inference with RFMix2 ##########################
# Paints query haplotypes (isec output 0002) against the CEU/YRI reference
# haplotypes (isec output 0003), one chromosome per node.
# TODO: Isolate local ancestry inference pipeline in separate script
# TODO: Check for out folder
mkdir $PBS_O_WORKDIR/rfmix_out
echo "RFMix2 run per chromsome with 24 cores/node" > $PBS_O_WORKDIR/rfmix_out/README
rfmix_out=$PBS_O_WORKDIR/rfmix_out
# Make file mapping sample names to ancestry required by RFMix2
# (columns from the igsr metadata: 1 = sample ID, 4 = population acronym)
ls igsr* | xargs -I{} tail -n +2 {} | cut -f1,4 > query_population_sampleNames_ancestry
# Edit genomic map files to format required by RFMix2
# PLINK map columns are chrom, id, cM, bp; the awk below emits chrom, bp, cM.
for file in beagle_maps/*chr[1-9]*; do
out_name=$rfmix_out/$(echo $file | sed 's/beagle_maps\/plink/rfmix/')
awk '{print $1"\t"$4"\t"$3}' $file > $out_name
done
# Run RFMix2
# Note: Took ~1.5hours with 22 nodes at 24 cores/node.
# Note: The developers recommended VCFs be joined prior for some reason.
# -f query VCF, -r reference VCF, -m sample->ancestry map, -g genetic map,
# -o per-chromosome output prefix (produces .msp.tsv, .fb.tsv, etc.)
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/rfmix \
-f $PBS_O_WORKDIR/intersect_out/chr{1}/0002.vcf.gz \
-r $PBS_O_WORKDIR/intersect_out/chr{1}/0003.vcf.gz \
-m $PBS_O_WORKDIR/query_population_sampleNames_ancestry \
-g $PBS_O_WORKDIR/rfmix_out/rfmix.chr{1}.GRCh38.map \
-o $PBS_O_WORKDIR/rfmix_out/chr{1} \
--chromosome={1} \
--n-threads=24
################### Tractor Analysis ######################################
# Tractor extracts ancestry-specific haplotypes/dosages from the RFMix2 calls
# for downstream ancestry-aware association analysis.
git clone https://github.com/Atkinson-Lab/Tractor
# Step 1: Recovering tracts
# Required numpy installation
# TODO: Submit pull request for fixes
# Identify and correct switch errors in local ancestry calls
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
python3 $PBS_O_WORKDIR/Tractor/UnkinkMSPfile.py \
--msp $PBS_O_WORKDIR/rfmix_out/chr{1}
# Unzip phased VCFs
# ('>' is quoted so the redirection runs inside the remote parallel job,
# not in the local shell.)
# NOTE(review): no earlier step writes phase_out/chr{N}.vcf.gz — phasing
# produced chr{N}_phased_Emory-alloimmunization.vcf.gz; confirm the intended
# input filename before rerunning.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
gunzip -c $PBS_O_WORKDIR/phase_out/chr{1}.vcf.gz '>' $PBS_O_WORKDIR/phase_out/chr{1}.vcf
# Correcting switch errors in genotype data
# Should this step be done on the subset VCF? Or full VCF?
# TODO: Un-hardcode python.
# NOTE(review): $annotation_out is never assigned in this script, and the
# annotate step wrote bgzipped chr{N}.vcf.gz, not plain .vcf — verify both the
# directory and that the file was decompressed beforehand.
seq 1 22 | parallel -N1 -j1 --sshloginfile $PBS_NODEFILE \
/storage/home/hcoda1/0/cnaughton7/.conda/envs/ancestry-env1/bin/python \
$PBS_O_WORKDIR/Tractor/UnkinkGenofile.py \
--switches $PBS_O_WORKDIR/rfmix_out/chr{1}.switches.txt \
--genofile $PBS_O_WORKDIR/annotation_out/chr{1}.vcf
# Step 2: Extracting tracts & ancestral dosages
# TODO: Wasn't working on multiple nodes, additionally, a single node only supports 15 parallel jobs at one time regardless of whether there is enough cores
# NOTE(review): tractor_out/unkinked/ is never created by this script; the
# unkinked files' location depends on UnkinkGenofile.py's output path — verify.
seq 1 22 | parallel -N1 -j22 --sshloginfile $PBS_NODEFILE \
python3 $PBS_O_WORKDIR/Tractor/ExtractTracts.py \
--msp $PBS_O_WORKDIR/rfmix_out/chr{1} \
--vcf $PBS_O_WORKDIR/tractor_out/unkinked/chr{1}.unkinked \
--num-ancs 2