-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtassel_filter_hapmap.R
226 lines (177 loc) · 7.11 KB
/
tassel_filter_hapmap.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#change working directory
args <- commandArgs(trailingOnly=T)
# args[1] = "pel2b_crosscheck.hmnfas.csv"
# args[2] = "PeliSampleData - SpecimenWell.csv"
# args[3] = "pel2b_crosscheck"
### Set Cutoffs
#This is at the start of the file to make it easy to adjust them
snp.cutoff = 78 # SNP must be present in this many samples (78)
sample.cutoff = 500 # Sample must have data on at least this many SNPS (500)
paralog.cutoff = 35 # If snps have more than this many heterozygotes (35)
### Import data
# Read TASSEL output which contains SNP data
geno <- read.csv(args[1],header=T,na.strings=".")
# strip sample info by discarding columns 1-12
# Gives a <num.samples> X <num.snps> data frame
g <- geno[,-(1:12)]
# Save tag seqs in a vector
g.seqtags <- as.character(geno[,2])
### Image raw data
# Save (to png device)
png(file=paste(args[3],"raw.png",sep="."))
# Create colour map of genotype matrix, White cells are missing data
image(as.matrix(g), main="Raw Data", col=rainbow(3))
dev.off()
### Filter data
# For each row (SNP) in the genotype, count how many samples have data on this snp
snp.sample.counts <- apply(g, 1, function(x) sum(!is.na(x)))
# Plot the number of samples each snp has
# DISABLED TO ALLOW SCRIPTING
# plot(cumsum(table(snp.sample.counts)))
# hist(snp.sample.counts,breaks=96)
# abline(v=snp.cutoff)
# Save histogram which describes the cutoff point used
png(file=paste(args[3], "snp_filtering_hist.png", sep="."))
hist(snp.sample.counts,breaks=96)
abline(v=snp.cutoff)
dev.off()
# How many SNP sites are present after filtering (= TRUE)
table(snp.sample.counts > snp.cutoff)
# Transfer useful SNP sites to new data frame
g.snp <- g[snp.sample.counts > snp.cutoff,]
png(file=paste(args[3], "data_post_SNP_filt.png", sep="."))
image(as.matrix(g.snp), col=rainbow(3), main="Data After SNP filtering")
dev.off()
### Qualtity control samples
# For each column (Samples) in the filterd SNP set, count how many SNPs this sample has
sample.snp.counts<-apply(g.snp,2,function(x) sum(!is.na(x)))
# Plot graphical summary of samples
# DISABLED TO ALLOW SCRIPTING
# hist(sample.snp.counts, breaks=40, xlim=c(0,600))
# abline(v=sample.cutoff)
png(file=paste(args[3], "sample_filtering_hist.png", sep="."))
hist(sample.snp.counts, breaks=40, xlim=c(0,600))
abline(v=sample.cutoff)
dev.off()
table(sample.snp.counts > sample.cutoff)
# perform filtering
gg <- g.snp[,sample.snp.counts > sample.cutoff]
# How much data is still missing?
table(!is.na(gg))
#image(as.matrix(gg), col=rainbow(3))
png(file=paste(args[3], "data_post_sample_filt.png", sep="."))
image(as.matrix(gg), col=rainbow(3), main="Data after SNP and Sample Filtering")
dev.off()
### Calculate major and minor allele encodings
freq <- rowSums(gg, na.rm=T)
tot <- rowSums(!is.na(gg))
# How many samples have major allele encoded as 2?
table(freq > tot)
## re-encode minor allele homo = 2
g.minor <- gg
# Invert encoding
g.minor[gg == 2] <- 0
g.minor[gg == 0] <- 2
# replace encoding for samples which already had minor allele homo = 2
g.minor[freq < tot,] <- gg[freq < tot,]
# Compute minor allele frequencies
min.allele.freq <- rowSums(g.minor, na.rm=T) / (rowSums(!is.na(g.minor)) *2)
#DISABLED TO ALLOW SCRITPING ON CLUSTER
# hist(min.allele.freq, breaks=20)
# Image data after allele correction
png(file=paste(args[3], "data_post_allele_correction.png", sep="."))
image(as.matrix(g.minor), col=rainbow(3), main="Data after allele correction")
dev.off()
### Filter for paralogs
# Excessive heterozygosity across many samples indicates probable paralogs
# Inspect this visually
#DISABLED FOR SCRIPTING
# image(g.minor==1, col=rainbow(3), main="Heterozygous Data (Checking for Paralogs)")
## Filter for excessive hets across samples
hets.per.sample <- colSums(g.minor==1,na.rm=T)
#DISABLED FOR SCRIPTING
# hist(hets.per.sample, breaks=25)
#threshold for too many 'het' calls in some samples??
which(hets.per.sample> 200)
## Filter snps for excessive heterozygosity
hets.per.snp <- rowSums(g.minor==1, na.rm=T)
#DISABLED FOR SCRIPTING
# hist(hets.per.snp, breaks=20)
# filter out paralogs
paralogous.snps <- hets.per.snp > paralog.cutoff
g.final <- g.minor[!paralogous.snps,]
# minor allele frequency distribution
final.min.allele.freq <- rowSums(g.final,na.rm=T)/ (2 * rowSums(!is.na(g.final)))
#DISABLED TO ALLOW SCRIPTING
# hist(final.min.allele.freq, breaks=20)
# How much data is still missing?
table(!is.na(g.final))
# Image this
png(file=paste(args[3], "final_data.png", sep="."))
image(
as.matrix(g.final),
sub=paste("sample_cutoff=", sample.cutoff, "snp_cutoff=", snp.cutoff),
col=rainbow(3),
main=paste("Final Data for dataset", args[3])
)
dev.off()
### Create name matrix. Do it down here, so we can apply this to the phylogenetic grouping
# Names gets the name of each column in g.final (the name of the sample)
names <- names(g.final)
# strsplit splits names of the samples in g on "_"
split.names <- strsplit(names ,split="_")
# unlist creates a single list from the split names
names.lst <- unlist(split.names)
# Create names matrix. nrow splits list into matrix with 5 rows
names.matrix <- matrix(names.lst, nrow=5)
names <- names.matrix[5,]
### Edits name matrix
# Substitute "A01" to "A1". "\\1" is a regex backreference to the first subexpression in parenthesis.
wells <- gsub("([[:upper:]])0([[:digit:]])", replacement="\\1\\2", names.matrix[5,])
plate.num <- names.matrix[3,]
# Import detailed sample data
samples.names <- read.csv(args[2])
# not all samples in samples.names were run through sequecing
# this gets the subset of samples.names which were run
index <- match(
paste(plate.num, wells),
paste(samples.names$Plate.Number,samples.names$Well.Number)
)
# build names table with extra information
names.matrix <- rbind(
names.matrix,
as.character(samples.names$Well.Number[index]),
as.character(samples.names$Locality[index]),
as.numeric(samples.names$Lat[index]),
as.numeric(samples.names$Long[index]),
as.character(samples.names$EntityCode[index]),
as.character(samples.names$State[index])
)
# Rename samples to "<entity.code> <collection.location> <collection.state>"
names(g.final) <- paste(names.matrix[10,],names.matrix[7,],names.matrix[11,])
# write names matrix
write.csv(names.matrix,file=paste(args[3],"names.csv", sep="."))
### Export final filtered data.
g.output <- g.final
g.output$tagseqs <- g.seqtags[as.numeric(row.names(g.output))]
write.csv(g.output,file = paste(args[3],"filtered.csv", sep="."))
# Minor Allele Frequency histogram of final data
png(file=paste(args[3], "minor.allel.freqs.png", sep="."))
hist(final.min.allele.freq,breaks=50, main="Minor Allele Frequencies")
dev.off()
# Genotype call image of final data
png(file=paste(args[3], "final_data.png", sep="."))
image(
as.matrix(g.final),
main="Genotype Call Data",
sub=paste("sample_cutoff=", sample.cutoff, "snp_cutoff=", snp.cutoff),
col=rainbow(3),
ylab="Samples",
xlab="SNPs"
)
dev.off()
### Print final summary
final.dim <- dim(g.final)
final.dat <- as.numeric(table(!is.na(g.final)))
print(paste("Final data matrix has", final.dim[1], "SNP sites in", final.dim[2], "samples"))
print(paste("Final data matrix has", final.dat[1], "pieces of missing data, and", final.dat[2], "pieces of data present"))