This repository has been archived by the owner on Feb 18, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
6_DoSimulationAnalysis.sh
executable file
·152 lines (123 loc) · 7.7 KB
/
6_DoSimulationAnalysis.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#! /bin/bash
# Run a simulation analysis with different QTL settings to make sure these methods can actually find causal QTN
scriptdir=$1
simdir=$2
genotypes=$3
phenotypes=$4
covariate_file=$5
traits="$6"
num_qtn="$7"
herits="$8"
nreps=$9
gwas_outdir=${10}
genos_bychrom_prefix=${11}
chromlengths=${12}
permdir=${13}
empirical_pval_cutoff=${14}
maxprocs=${15}
TASSEL5="${16}"
splitdir=$simdir/6a_split_genos
myperms=$simdir/6a_permuted_genos
if [ ! -e $splitdir ]; then mkdir $splitdir; fi
if [ ! -e $myperms ]; then mkdir $myperms; fi
# # Pull phenotypes out from GWAS input file
Rscript $scriptdir/6a_SubsetPhenotypes.r -i $phenotypes -t $traits -o $simdir/6a_target_phenos
# Get BLUPs and residuals after fitting a kinship matrix
$TASSEL5 -h $genotypes -ck -export $simdir/6b_kinship.txt
zcat $genotypes | head -n 2 > $simdir/6b_dummy_genos.hmp.txt
# Subset to just the genotypes that had pvals of 0.001 or better among all real phenotypes. (Meant to speed computation; presumably other sites are mostly irrelevant/underpowered)
python3 $scriptdir/6a_IdentifyMeaningfulSites.py -i $gwas_outdir/*.pvals.txt --max-p 0.001 -o $simdir/6a_target_sites.txt
$TASSEL5 -h $genotypes -includeSiteNamesInFile $simdir/6a_target_sites.txt -export $simdir/6a_genos_short.hmp.txt.gz
# Separate out the above smaller genotype files into individual chromosomes, but still keep the original kinship matrices (since filtering genotypes is just meant to have MLM take less time)
for chr in `seq 1 10`; do
cp ${genos_bychrom_prefix}${chr}.other_chroms.kinship.txt $splitdir # Copy over original kinship matrix of all other chromosomes
$TASSEL5 -h $simdir/6a_genos_short.hmp.txt.gz -separate $chr -export $splitdir/chr$chr.hmp.txt.gz # Separate out per-chromosome genotypes; file name should basically match that of the genos_bychrom_prefix, just in a different directory
done
# Filter all the permuted genotype files to just the above sites
for hmp in $permdir/*.hmp.txt.gz; do
filename=`basename $hmp`
$TASSEL5 -h $hmp -includeSiteNamesInFile $simdir/6a_target_sites.txt -export $myperms/$filename
done
# Make numeric genotypes to do simulations from
$TASSEL5 -h $simdir/6a_genos_short.hmp.txt.gz -NumericalGenotypePlugin -endPlugin -ExportPlugin -saveAs $simdir/6b_genos.numeric.txt -format ReferenceProbability
# Run analysis separately for each traits
seed=0 # Random seed for permuting residuals
for trait in $traits; do
seed=$(($seed + 1))
genos=$simdir/6b_dummy_genos.hmp.txt # Single site since only need residuals
kinship=$simdir/6b_kinship.txt
blups=$simdir/6a_target_phenos.$trait.tassel.txt
# Create subdirectory
workdir=$simdir/6c_$trait
if [ ! -e $workdir ]; then mkdir $workdir; fi
# Run MLM to get residuals
if [ $trait != "flowering_time" ] ; then
$TASSEL5 -fork1 -h $genos -fork2 -t $blups -fork3 -k $kinship -fork99 -t $covariate_file -combine4 -input1 -input2 -input99 -intersect -combine5 -input3 -input4 \
-mlm -mlmMaxP 1 -mlmCompressionLevel None -export $workdir/6c_mlm -runfork1 -runfork2 -runfork3 -runfork99
else
$TASSEL5 -fork1 -h $genos -fork2 -t $blups -fork3 -k $kinship -combine4 -input1 -input2 -intersect -combine5 -input3 -input4 \
-mlm -mlmMaxP 1 -mlmCompressionLevel None -export $workdir/6c_mlm -runfork1 -runfork2 -runfork3
fi
# Rename residuals
mv $workdir/6c_mlm1.txt $workdir/6c_mlm.residuals.txt
mv $workdir/6c_mlm2.txt $workdir/6c_mlm.stats.txt
# Scramble residuals
Rscript $scriptdir/6d_PermuteResiduals.r -i $workdir/6c_mlm.residuals.txt -o $workdir/6d_mlm.residuals.permuted.txt --seed $seed --reps $nreps
# Add QTL
Rscript $scriptdir/6e_SimulateQtl.r -i $workdir/6d_mlm.residuals.permuted.txt --genos $simdir/6b_genos.numeric.txt -o $workdir/6e_residuals_with_qtl --num-qtl $num_qtn --herits $herits --seed $seed
# Add these new residuals back in place of the old ones to get phenotypes ready for complete GWAS analysis
Rscript $scriptdir/6f_RegenerateBlups.r --raw $blups --residuals $workdir/6c_mlm.residuals.txt --perms $workdir/6e_residuals_with_qtl.phenos.txt -o $workdir/6f_blups_with_qtl.txt
# break
done
# Now go through and run GWAS (Put in a separate loop for ease of debugging and analysis
for trait in $traits; do
workdir=$simdir/6c_$trait
genos=$simdir/6b_dummy_genos.hmp.txt
kinship=$simdir/6b_kinship.txt
blups=$workdir/6f_blups_with_qtl.txt
extradir=$workdir/6g_extra_files
if [ ! -e $extradir ]; then mkdir $extradir; fi
# Run MLM on simulated data to get heritabilities
if [ $trait != "flowering_time" ] ; then
$TASSEL5 -fork1 -h $genos -fork2 -t $blups -fork3 -k $kinship -fork99 -t $covariate_file -combine4 -input1 -input2 -input99 -intersect -combine5 -input3 -input4 \
-mlm -mlmMaxP 1 -mlmCompressionLevel None -export $workdir/6g_mlm -runfork1 -runfork2 -runfork3 -runfork99
else
$TASSEL5 -fork1 -h $genos -fork2 -t $blups -fork3 -k $kinship -combine4 -input1 -input2 -intersect -combine5 -input3 -input4 \
-mlm -mlmMaxP 1 -mlmCompressionLevel None -export $workdir/6g_mlm -runfork1 -runfork2 -runfork3
fi
# Parse output files (I really wish TASSEL let me set them individually)
for outfile in $workdir/6g_mlm*.txt; do
is_stats=`head -n 1 $outfile | grep "MarkerR2" | wc -l | cut -f1`
if [[ $is_stats -eq 1 ]] ; then
mv $outfile $outfile.stats
else
mv $outfile $extradir
fi
done
# Extract heritabilities to compare original with new
python3 $scriptdir/6h_PlotPermHeritabilities.py --orig $workdir/6c_mlm.stats.txt --perms $workdir/6g_mlm*.txt.stats -o $workdir/6h_perm_heritabilities.png
# # Run on full dataset and plot GWAS
outprefix=$workdir/6i_gwas
genos_prefix=$splitdir/chr
outdir=$outprefix
if [ ! -e $outdir ]; then mkdir $outdir; fi
$scriptdir/4u_RunTasselGwasByChrom.sh $scriptdir $blups $genos_prefix $outdir $outprefix $covariate_file flowering_time $chromlengths $maxprocs "$TASSEL5" $myperms $empirical_pval_cutoff
# Subset original hapmap to just simulated QTL so have their physical positions
Rscript -e "x=read.delim('$workdir/6e_residuals_with_qtl.simulation_stats.txt', stringsAsFactors=F); qtn=x[,'qtn']; qtn=unlist(lapply(qtn, strsplit, split=',')); write(sort(unique(qtn)), '$workdir/6j_qtn.txt')"
$TASSEL5 -h $genotypes -includeSiteNamesInFile $workdir/6j_qtn.txt -export $workdir/6j_qtn.hmp.txt
# Replot Manhatten plots with simulated QTL marked (code modified from 4u_RunTasselGwasByChrom.sh; overwrites ones made by the above script (but are identical except for QTL locations anyway)
plot_commands=$outprefix.plot_with_qtl_commands.txt
gwasdir=$outdir/4u_orig_gwas
plotdir=$outdir/4v_gwas_results
echo "echo Plotting empirical GWAS results with QTL marked" > $plot_commands
for trait in $gwasdir/*_chr1.pvals.txt; do
trait=`basename $trait`
trait=${trait/_chr1.pvals.txt/}
echo "python3 $scriptdir/6j_ParseGwasResultsWithSimulatedQtl.py -i $gwasdir/${trait}_chr*.pvals.txt --perms $plotdir/$trait.pvals.txt --sparsify 0.1 --empirical-p-cutoff $empirical_pval_cutoff -o $plotdir/$trait.parsed_with_qtl --chromlengths $chromlengths --qtl $workdir/6e_residuals_with_qtl.simulation_stats.txt --hapmap $workdir/6j_qtn.hmp.txt" >> $plot_commands
# break
done
cat $plot_commands | parallel --progress
# Parse QTL results down to a single file
python3 $scriptdir/6k_CollateQtlResults.py -i $plotdir/*.parsed_with_qtl.found_qtl.txt --qtl $workdir/6e_residuals_with_qtl.simulation_stats.txt -o $workdir/6k_qtl_simulation_results -p 0.1 0.05 0.01
# break
done