-
Notifications
You must be signed in to change notification settings - Fork 0
/
AlphaFamImpute_prep.R
204 lines (149 loc) · 6.35 KB
/
AlphaFamImpute_prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
library(vcfR)
#Family 30.004
vcf = read.vcfR("30.004_snps.vcf.gz")
ad = extract.gt(vcf, element = "AD")
colnames(ad) = gsub("\\.bam*","",basename(colnames(ad)))
ad = t(ad)
gt = extract.gt(vcf, element = "GT")
colnames(gt) = gsub("\\.bam*","",basename(colnames(gt)))
gt[gt=="0/0"] = 0
gt[gt=="0/1"] = 1
gt[gt=="1/1"] = 2
gt = t(gt)
mode(gt) = "numeric"
dp = extract.gt(vcf, element = "DP")
colnames(dp) = gsub("\\.bam*","",basename(colnames(dp)))
dp = t(dp)
mode(dp) = "numeric"
gt[which(dp<5 | dp>100)] = NA
ad[which(dp<5 | dp>100)] = "0,0"
MAF = colMeans(gt,na.rm=T)/2
gt = gt[,-which(MAF<0.15 | MAF>0.85)]
ad = ad[,-which(MAF<0.15 | MAF>0.85)]
ad = ad[,which(colSums(is.na(gt))<nrow(ad)*0.8)]
seqfile = matrix(NA,nrow(ad)*2,ncol(ad))
for(i in 1:nrow(ad)){
seqfile[seq(1,nrow(ad)*2,2)[i],] = sapply(strsplit(ad[i,],","), `[`, 1)
seqfile[seq(1,nrow(ad)*2,2)[i]+1,] = sapply(strsplit(ad[i,],","), `[`, 2)
}
mode(seqfile) = "numeric"
rownames(seqfile) = rep(rownames(ad),each=2)
map = cbind(gsub("Chr","",sapply(strsplit(colnames(ad),"_"), `[`, 1)),sapply(strsplit(colnames(ad),"_"), `[`, 2))
pedigree = rbind(cbind(rownames(ad)[2],0,0),
cbind(rownames(ad)[1],0,0),
cbind(rownames(ad)[-c(1:2)],rownames(ad)[2],rownames(ad)[1]))
write.table(seqfile,"seqfile_30.004.txt",col.names = F,quote = F)
write.table(map,"map_30.004.txt",row.names = F,col.names = F, quote = F)
write.table(pedigree,"pedigree_30.004.txt",row.names = F,col.names = F, quote = F)
write.table(colnames(ad),"snpnames_30.004.txt",row.names = F,col.names = F,quote = F)
#Family 30.058
vcf = read.vcfR("30.058_snps.vcf.gz")
ad = extract.gt(vcf, element = "AD")
colnames(ad) = gsub("\\.bam*","",basename(colnames(ad)))
ad = t(ad)
gt = extract.gt(vcf, element = "GT")
colnames(gt) = gsub("\\.bam*","",basename(colnames(gt)))
gt[gt=="0/0"] = 0
gt[gt=="0/1"] = 1
gt[gt=="1/1"] = 2
gt = t(gt)
mode(gt) = "numeric"
dp = extract.gt(vcf, element = "DP")
colnames(dp) = gsub("\\.bam*","",basename(colnames(dp)))
dp = t(dp)
mode(dp) = "numeric"
gt[which(dp<5 | dp>100)] = NA
ad[which(dp<5 | dp>100)] = "0,0"
MAF = colMeans(gt,na.rm=T)/2
gt = gt[,-which(MAF<0.15 | MAF>0.85)]
ad = ad[,-which(MAF<0.15 | MAF>0.85)]
ad = ad[,which(colSums(is.na(gt))<nrow(ad)*0.8)]
seqfile = matrix(NA,nrow(ad)*2,ncol(ad))
for(i in 1:nrow(ad)){
seqfile[seq(1,nrow(ad)*2,2)[i],] = sapply(strsplit(ad[i,],","), `[`, 1)
seqfile[seq(1,nrow(ad)*2,2)[i]+1,] = sapply(strsplit(ad[i,],","), `[`, 2)
}
mode(seqfile) = "numeric"
rownames(seqfile) = rep(rownames(ad),each=2)
map = cbind(gsub("Chr","",sapply(strsplit(colnames(ad),"_"), `[`, 1)),sapply(strsplit(colnames(ad),"_"), `[`, 2))
pedigree = rbind(cbind(rownames(ad)[2],0,0),
cbind(rownames(ad)[1],0,0),
cbind(rownames(ad)[-c(1:2)],rownames(ad)[2],rownames(ad)[1]))
write.table(seqfile,"seqfile_30.058.txt",col.names = F,quote = F)
write.table(map,"map_30.058.txt",row.names = F,col.names = F, quote = F)
write.table(pedigree,"pedigree_30.058.txt",row.names = F,col.names = F, quote = F)
write.table(colnames(ad),"snpnames_30.058.txt",row.names = F,col.names = F,quote = F)
#Family 30.062
vcf = read.vcfR("30.062_snps.vcf.gz")
ad = extract.gt(vcf, element = "AD")
colnames(ad) = gsub("\\.bam*","",basename(colnames(ad)))
ad = t(ad)
gt = extract.gt(vcf, element = "GT")
colnames(gt) = gsub("\\.bam*","",basename(colnames(gt)))
gt[gt=="0/0"] = 0
gt[gt=="0/1"] = 1
gt[gt=="1/1"] = 2
gt = t(gt)
mode(gt) = "numeric"
dp = extract.gt(vcf, element = "DP")
colnames(dp) = gsub("\\.bam*","",basename(colnames(dp)))
dp = t(dp)
mode(dp) = "numeric"
gt[which(dp<5 | dp>100)] = NA
ad[which(dp<5 | dp>100)] = "0,0"
MAF = colMeans(gt,na.rm=T)/2
gt = gt[,-which(MAF<0.15 | MAF>0.85)]
ad = ad[,-which(MAF<0.15 | MAF>0.85)]
ad = ad[,which(colSums(is.na(gt))<nrow(ad)*0.8)]
seqfile = matrix(NA,nrow(ad)*2,ncol(ad))
for(i in 1:nrow(ad)){
seqfile[seq(1,nrow(ad)*2,2)[i],] = sapply(strsplit(ad[i,],","), `[`, 1)
seqfile[seq(1,nrow(ad)*2,2)[i]+1,] = sapply(strsplit(ad[i,],","), `[`, 2)
}
mode(seqfile) = "numeric"
rownames(seqfile) = rep(rownames(ad),each=2)
map = cbind(gsub("Chr","",sapply(strsplit(colnames(ad),"_"), `[`, 1)),sapply(strsplit(colnames(ad),"_"), `[`, 2))
pedigree = rbind(cbind(rownames(ad)[2],0,0),
cbind(rownames(ad)[1],0,0),
cbind(rownames(ad)[-c(1:2)],rownames(ad)[2],rownames(ad)[1]))
write.table(seqfile,"seqfile_30.062.txt",col.names = F,quote = F)
write.table(map,"map_30.062.txt",row.names = F,col.names = F, quote = F)
write.table(pedigree,"pedigree_30.062.txt",row.names = F,col.names = F, quote = F)
write.table(colnames(ad),"snpnames_30.062.txt",row.names = F,col.names = F,quote = F)
#Family 30.065
vcf = read.vcfR("30.065_snps.vcf.gz")
ad = extract.gt(vcf, element = "AD")
colnames(ad) = gsub("\\.bam*","",basename(colnames(ad)))
ad = t(ad)
gt = extract.gt(vcf, element = "GT")
colnames(gt) = gsub("\\.bam*","",basename(colnames(gt)))
gt[gt=="0/0"] = 0
gt[gt=="0/1"] = 1
gt[gt=="1/1"] = 2
gt = t(gt)
mode(gt) = "numeric"
dp = extract.gt(vcf, element = "DP")
colnames(dp) = gsub("\\.bam*","",basename(colnames(dp)))
dp = t(dp)
mode(dp) = "numeric"
gt[which(dp<5 | dp>100)] = NA
ad[which(dp<5 | dp>100)] = "0,0"
MAF = colMeans(gt,na.rm=T)/2
gt = gt[,-which(MAF<0.15 | MAF>0.85)]
ad = ad[,-which(MAF<0.15 | MAF>0.85)]
ad = ad[,which(colSums(is.na(gt))<nrow(ad)*0.8)]
seqfile = matrix(NA,nrow(ad)*2,ncol(ad))
for(i in 1:nrow(ad)){
seqfile[seq(1,nrow(ad)*2,2)[i],] = sapply(strsplit(ad[i,],","), `[`, 1)
seqfile[seq(1,nrow(ad)*2,2)[i]+1,] = sapply(strsplit(ad[i,],","), `[`, 2)
}
mode(seqfile) = "numeric"
rownames(seqfile) = rep(rownames(ad),each=2)
map = cbind(gsub("Chr","",sapply(strsplit(colnames(ad),"_"), `[`, 1)),sapply(strsplit(colnames(ad),"_"), `[`, 2))
pedigree = rbind(cbind(rownames(ad)[2],0,0),
cbind(rownames(ad)[1],0,0),
cbind(rownames(ad)[-c(1:2)],rownames(ad)[2],rownames(ad)[1]))
write.table(seqfile,"seqfile_30.065.txt",col.names = F,quote = F)
write.table(map,"map_30.065.txt",row.names = F,col.names = F, quote = F)
write.table(pedigree,"pedigree_30.065.txt",row.names = F,col.names = F, quote = F)
write.table(colnames(ad),"snpnames_30.065.txt",row.names = F,col.names = F,quote = F)