-
Notifications
You must be signed in to change notification settings - Fork 0
/
0_RerunHempGenotyping.sh
172 lines (141 loc) · 8.41 KB
/
0_RerunHempGenotyping.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#! /bin/bash
# Rerunning the hemp genotyping pipeline because first run filtered out PCR duplicates, which is all GBS is
# BWA alignment step maintained from initial analysis; those commands kept in 0_bwa_commands.sh
TASSEL5="run_pipeline.pl -Xms10g -Xmx40g"
bam_dir="/media/jgwall/Seagate Expansion Drive/Hemp_diversity_GS/Hemp_variant_calling/Bams/"
genome="$HOME/Projects/0_RawData/HempGenome/GCF_900626175.1_cs10_genomic.fna"
restriction_site="CAYNNNNRTG" # MslI restriction site in normal nucleotides. Mostly here for reference
hplc_results="0_HDGS_HPLC_Results.csv"
ncores=7 # number of processor cores to use
# Subdirectories
scriptdir="0_Scripts"
sortdir="1_bams_sorted"
filtdir="1a_filtering"
if [ ! -e $sortdir ] ; then mkdir $sortdir; fi
if [ ! -e $filtdir ] ; then mkdir $filtdir; fi
###################
# CONDA ENVIRONMENT
###################
# Conda environment name. See accompanying 0_CreateCondaEnvironment.sh file for how to create an identical environment
conda_hemp=proj-hemp-genotyping-2020
# Load functions required to be able to activate conda environments within scripts. KEEP THIS COMMAND UNCOMMENTED
. $(conda info --root)/etc/profile.d/conda.sh
conda activate $conda_hemp
##############
# Rebuild genotypes from BAM files
##############
# for sam in "$bam_dir"/*.aligned.sam; do
# name=`basename "$sam"`
# name=${name/.sam/}
# echo $name
#
#
# samtools view -h -S -b "$sam" > $sortdir/$name.bam
# samtools sort -@ $ncores $sortdir/$name.bam $sortdir/$name.sorted
# samtools index $sortdir/$name.sorted.bam
# rm $sortdir/$name.bam
#
# done
# # Call SNPs on all samples (skip indels with --skip-variants b/c these are harder to score and can cause issues). This takes ~36 hours on an 8-core machine
# max_depth=10000
# min_base_quality=20
# bcftools mpileup -d $max_depth -f "$genome" -Q $min_base_quality --output-type u --skip-indels --threads $ncores --annotate FORMAT/AD,FORMAT/DP $sortdir/*.sorted.bam | \
# bcftools call --multiallelic-caller --threads $ncores --skip-variants indels --output 1b_hemp.snp_calls.bcf --output-type b --variants-only
##############
# Basic Filtering - Depth
##############
# bcftools view 1b_hemp.snp_calls.bcf --output-type z > 1b_hemp.snp_calls.vcf.gz
#
# # Get reports
# $TASSEL5 -SortGenotypeFilePlugin -inputFile 1b_hemp.snp_calls.vcf.gz -outputFile 1b_hemp.snp_calls.sorted.vcf.gz
# $TASSEL5 -vcf 1b_hemp.snp_calls.sorted.vcf.gz -genotypeSummary site -export $filtdir/1c_sitesummary.txt
# $TASSEL5 -vcf 1b_hemp.snp_calls.sorted.vcf.gz -genotypeSummary taxa -export $filtdir/1c_taxasummary.txt
# vcftools --gzvcf 1b_hemp.snp_calls.sorted.vcf.gz --site-depth --stdout > $filtdir/1c_site_depth.txt
# # Complicated bash one-liner to count size of predicted restriction fragments in the hemp genome
# cat "$genome" | sed -r -e "s|^>.+|=====|" | tr -d '\n' | tr [:lower:] [:upper:] | \
# sed -r -e "s|(CA[CT]..)(..[AG]TG)|\1\n\2|g" -e "s|=====||g" | \
# awk '{print length}' > $filtdir/1c_restriction_frag_lengths.txt
# # Plot diagnostic plots from the above summary data
# Rscript $scriptdir/1c_PlotGenoSummary.r --sitefile $filtdir/1c_sitesummary.txt --taxafile $filtdir/1c_taxasummary.txt --depthfile $filtdir/1c_site_depth.txt \
# --restriction-frags $filtdir/1c_restriction_frag_lengths.txt --max-datapoints 10000 -o $filtdir/1c_summary_plots.png
# Hmm...based on number of restriction fragments and read depth, I'd expect a true mean depth of single-copy DNA around 5, but actual cluster seems around 100.
# (Ignoring the cluster of aligment errors that only have 1 read) Let's filter for the 100x cluster and see how it works
# # Initial filter for mean depth and % presence
# site_min_mean_depth=15
# site_max_mean_depth=125
# site_max_missing=0.2 # Equivalent to 80% present
# Rscript $scriptdir/1d_GetFilterLists.r --sitefile $filtdir/1c_sitesummary.txt --taxafile $filtdir/1c_taxasummary.txt --depthfile $filtdir/1c_site_depth.txt \
# --site-min-mean-depth $site_min_mean_depth --site-max-mean-depth $site_max_mean_depth --site-max-missing $site_max_missing \
# --outtaxa $filtdir/1d_taxa_to_keep.txt --outsites $filtdir/1d_sites_to_keep.txt
# # Perform actual filtering
# bcftools view --targets-file $filtdir/1d_sites_to_keep.txt --samples-file $filtdir/1d_taxa_to_keep.txt --output-type z --output-file $filtdir/1e_hemp_filter1.vcf.gz 1b_hemp.snp_calls.bcf
# Below for backup purposes
##############
# Final Filtering - Allele frequency and heterozygosity
##############
# # Run post-filter diagnostic plots (really should make this step its own function or subscript or something)
# $TASSEL5 -SortGenotypeFilePlugin -inputFile $filtdir/1e_hemp_filter1.vcf.gz -outputFile $filtdir/1e_hemp_filter1.sorted.vcf.gz
# $TASSEL5 -vcf $filtdir/1e_hemp_filter1.sorted.vcf.gz -genotypeSummary site -export $filtdir/1e_sitesummary.filter1.txt
# $TASSEL5 -vcf $filtdir/1e_hemp_filter1.sorted.vcf.gz -genotypeSummary taxa -export $filtdir/1e_taxasummary.filter1.txt
# vcftools --gzvcf $filtdir/1e_hemp_filter1.sorted.vcf.gz --site-depth --stdout > $filtdir/1e_site_depth.filter1.txt
#
# # Plot
# Rscript $scriptdir/1c_PlotGenoSummary.r --sitefile $filtdir/1e_sitesummary.filter1.txt --taxafile $filtdir/1e_taxasummary.filter1.txt --depthfile $filtdir/1e_site_depth.filter1.txt \
# --restriction-frags $filtdir/1c_restriction_frag_lengths.txt --max-datapoints 10000 -o $filtdir/1e_hemp_filter1.summary_plots.png
#
#
#
# # # Second round of filtering
# site_max_het=0.1
# site_min_maf=0.025
# Rscript $scriptdir/1d_GetFilterLists.r --sitefile $filtdir/1e_sitesummary.filter1.txt --taxafile $filtdir/1e_taxasummary.filter1.txt --depthfile $filtdir/1e_site_depth.filter1.txt \
# --site-max-het $site_max_het --site-min-maf $site_min_maf \
# --outtaxa $filtdir/1f_taxa_to_keep.filt2.txt --outsites $filtdir/1f_sites_to_keep.filt2.txt
#
# # Perform actual filtering
# bcftools view --targets-file $filtdir/1f_sites_to_keep.filt2.txt --samples-file $filtdir/1f_taxa_to_keep.filt2.txt --output-type z --output-file $filtdir/1g_hemp_filter2.vcf.gz $filtdir/1e_hemp_filter1.vcf.gz
#
#
# # # Final post-filter diagnostic plots (mostly as a sanity check)
# $TASSEL5 -SortGenotypeFilePlugin -inputFile $filtdir/1g_hemp_filter2.vcf.gz -outputFile $filtdir/1g_hemp_filter2.sorted.vcf.gz
# $TASSEL5 -vcf $filtdir/1g_hemp_filter2.sorted.vcf.gz -genotypeSummary site -export $filtdir/1g_sitesummary.filter2.txt
# $TASSEL5 -vcf $filtdir/1g_hemp_filter2.sorted.vcf.gz -genotypeSummary taxa -export $filtdir/1g_taxasummary.filter2.txt
# vcftools --gzvcf $filtdir/1g_hemp_filter2.sorted.vcf.gz --site-depth --stdout > $filtdir/1g_site_depth.filter2.txt
#
# # Plot
# Rscript $scriptdir/1c_PlotGenoSummary.r --sitefile $filtdir/1g_sitesummary.filter2.txt --taxafile $filtdir/1g_taxasummary.filter2.txt --depthfile $filtdir/1g_site_depth.filter2.txt \
# --restriction-frags $filtdir/1c_restriction_frag_lengths.txt --max-datapoints 10000 -o $filtdir/1g_hemp_filter2.summary_plots.png
#
#
# # Copy filtered genotypes back to main directory
# cp $filtdir/1g_hemp_filter2.sorted.vcf.gz ./1g_hemp_filtered.sorted.vcf.gz
# ##############
# # Cladogram
# ##############
#
# # Tree
# $TASSEL5 -vcf 1g_hemp_filtered.sorted.vcf.gz -tree Neighbor -treeSaveDistance false -export 1g_hemp_filtered.sorted.tre -exportType Text
# mv 1g_hemp_filtered.sorted.tre.txt 1g_hemp_filtered.sorted.tre
# ##############
# # Genomic prediction
# ##############
#
# # Rename samples to match phenotype info (Tried with 'bcftools reheader', but it did the compression wrong somehow)
# source=1g_hemp_filtered.sorted.vcf.gz
# newfile=1h_hemp_filtered.sorted.renamed.vcf
# zcat $source | grep "^##" > $newfile # Metadata lines
# zcat $source | grep "^#CHROM" | sed -r "s|1_bams_sorted/P.-HDGS-([0-9]+).aligned.sorted.bam|HDGS_\1|g" >> $newfile # Header line
# zcat $source | grep -v "^#" >> $newfile # Genotypes
# gzip $newfile
#
# # Put phenotypes in TASSEL format; also do sqrt and log transformations, just in case
# Rscript $scriptdir/1i_FormatPhenosForTassel.r -i $hplc_results -o 1i_hdgs_phenos --log --sqrt
#
# # Run genomic prediction with TASSEL
# $TASSEL5 -fork1 -vcf 1h_hemp_filtered.sorted.renamed.vcf.gz -ck -fork2 -t 1i_hdgs_phenos.tassel.txt -combine3 -input1 -input2 \
# -GenomicSelectionPlugin -doCV true -kFolds 10 -nIter 1000 -endPlugin -export 1j_gs_results.txt
#
# # Graph results as violin plots
# Rscript $scriptdir/1k_PlotPredictionAccuracy.r -i 1j_gs_results.txt -o 1k_gs_results
#
# # Not great, unfortunately