generated from stajichlab/PopGenomics_template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Algatr_test_v2.R
330 lines (283 loc) · 12.5 KB
/
Algatr_test_v2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#!/usr/bin/env Rscript
#algatr test
#Dec. 1st 2023
#jnadams
library(algatr)
library(here)
library(raster)
library(sp)
library(vcfR)
library(adegenet)
library(tess3r)
pdf('algatr_plots.pdf')
#load example files
load_algatr_example()
#set working directory
getwd()
#read in vcf file
CCGP_vcf <- read.vcfR("vcf/ASCCGP_v3.All.SNP.combined_selected.vcf", verbose = FALSE)
head(CCGP_vcf, n = 20)
#read in filtered vcf file
CCGP_vcf_filtered <- read.vcfR("vcf/filtered.vcf", verbose = FALSE)
head(CCGP_vcf_filtered, n = 20)
tail(CCGP_vcf_filtered)
View(CCGP_vcf_filtered)
#CCGP_vcf_filtered_v2 <- is.na(CCGP_vcf_filtered)
#head(CCGP_vcf_filtered_v2)
#convert vcf to genotype matrix
Acarospora_dosage <- vcf_to_dosage(CCGP_vcf)
head(Acarospora_dosage)
#read in environmental data; we can use an environmental layer (PC1)
krig_raster <- raster::aggregate(CA_env[[1]], fact = 6)
terra::plot(CA_env[[1]], col = mako(100), axes = FALSE)
#run with compressed file
#read in vcf file
CCGP_vcf_gz <- read.vcfR("vcf/ASCCGP_v3.TestFresh.SNP.combined_selected.vcf.gz", verbose = FALSE)
head(CCGP_vcf_gz)
#run with compressed file from a year ago
CCGP_vcf_gz_v2 <- read.vcfR("vcf/ASCCGP_v1.All.INDEL.combined_selected.vcf.gz", verbose = FALSE)
head(CCGP_vcf_gz_v2)
#run with compressed file from a year ago with goodmapper
CCGP_vcf_gz_v3 <- read.vcfR("vcf/ASCCGP_v1.GoodMapper.SNP.combined_selected.vcf.gz", verbose = FALSE)
head(CCGP_vcf_gz_v3)
CCGP_vcf_gz_v4 <- read.vcfR("vcf/ASCCGP_v1.All.SNP.combined_selected.vcf.gz", verbose = FALSE)
head(CCGP_vcf_gz_v4)
tail(CCGP_vcf_gz_v4)
CCGP_vcf_gz_v5 <- read.vcfR("vcf/ASCCGP_v3.All.SNP.selected_pruned.vcf.gz", verbose = FALSE)
head(CCGP_vcf_gz_v5)
tail(CCGP_vcf_gz_v5)
CCGP_vcf_gz_v6 <- read.vcfR("vcf/CCGP_output_filtered_pruned_v19.vcf.gz", verbose = FALSE)
head(CCGP_vcf_gz_v6)
tail(CCGP_vcf_gz_v6)
CCGP_vcf_gz_v7 <- read.vcfR("vcf/plink_v2.vcf.gz", verbose = FALSE)
head(CCGP_vcf_gz_v7)
tail(CCGP_vcf_gz_v7)
#convert vcf to genotype matrix
Acarospora_dosage_gz <- vcf_to_dosage(CCGP_vcf_gz)
head(Acarospora_dosage_gz)
Acarospora_dosage_gz <- vcf_to_dosage("vcf/ASCCGP_v3.TestFresh.SNP.combined_selected.vcf.gz")
#read in coordinates file
CCGP_coords <- read.csv("/bigdata/stajichlab/shared/projects/Population_Genomics/Asocialis_CCGP/CCGP_coords_v3.csv")
head(CCGP_coords)
tail(CCGP_coords)
#read in coordinates file for 11 samples
CCGP_coords_filtered <- read.csv("/bigdata/stajichlab/shared/projects/Population_Genomics/Asocialis_CCGP/CCGP_coords_filtered.csv")
head(CCGP_coords_filtered, n = 20)
tail(CCGP_coords_filtered)
#read in coordinate file that is reordered
CCGP_coords_reordered <- read.csv("/bigdata/stajichlab/shared/projects/Population_Genomics/Asocialis_CCGP/CCGP_coords_reordered.csv")
head(CCGP_coords_reordered)
tail(CCGP_coords_reordered)
CCGP_coords_reordered_v2 <- read.csv("/bigdata/stajichlab/shared/projects/Population_Genomics/Asocialis_CCGP/CCGP_coords_reordered_v2.csv")
head(CCGP_coords_reordered_v2)
tail(CCGP_coords_reordered_v2)
CCGP_coords_reordered_v3 <- read.csv("/bigdata/stajichlab/shared/projects/Population_Genomics/Asocialis_CCGP/CCGP_coords_reordered_v3.csv")
head(CCGP_coords_reordered_v3)
tail(CCGP_coords_reordered_v3)
CCGP_coords_reordered_v4 <- read.csv("/bigdata/stajichlab/shared/projects/Population_Genomics/Asocialis_CCGP/CCGP_coords_reordered_v4.csv")
head(CCGP_coords_reordered_v4)
tail(CCGP_coords_reordered_v4)
#read in coordinates file
CCGP_coords_v2 <- read.csv("/bigdata/stajichlab/shared/projects/Population_Genomics/Asocialis_CCGP/CCGP_coords_v4.csv")
head(CCGP_coords_v2)
tail(CCGP_coords_v2)
#get rid of NA in CCGP_coords_v2
CCGP_coords_v3 <- na.omit(CCGP_coords_v2)
head(CCGP_coords_v3)
tail(CCGP_coords_v3)
tot <- rowSums(CCGP_coords_filtered)
print(tot)
CCGP_coords_filtered_v2 <- is.na(CCGP_coords_filtered)
head(CCGP_coords_filtered_v2)
tail(CCGP_coords_filtered_v2)
qmat <- qmatrix(tess3_obj, K = bestK)
# First, create a grid for kriging
# We can use one environmental layer (PC1), aggregated (i.e., increased cell size) to increase computational speed
krig_raster <- raster::aggregate(CA_env[[1]], fact = 6)
head(krig_raster)
krig_admix <- tess_krig(tess_CCGP_results_v6$qmat, tess_CCGP_results_v6$coords, tess_CCGP_results_v6$grid)
#Run tess do everything
tess_CCGP_results_v2 <- tess_do_everything(CCGP_vcf_gz_v3, CCGP_coords_v3, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_v2 <- tess_do_everything(CCGP_vcf_gz_v2, CCGP_coords_v3, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_v3 <- tess_do_everything(CCGP_vcf_gz_v2, CCGP_coords_reordered, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_v4 <- tess_do_everything(CCGP_vcf_gz_v2, CCGP_coords_reordered_v2, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_v5 <- tess_do_everything(CCGP_vcf_gz_v2, CCGP_coords_reordered_v3, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_v6 <- tess_do_everything(CCGP_vcf_gz_v4, CCGP_coords_reordered_v3, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_v7 <- tess_do_everything(CCGP_vcf_gz_v5, CCGP_coords_reordered_v3, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_v8 <- tess_do_everything(CCGP_vcf_gz_v6, CCGP_coords_reordered_v3, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_v9 <- tess_do_everything(CCGP_vcf_gz_v7, CCGP_coords_reordered_v4, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_filtered <- tess_do_everything(CCGP_vcf_filtered, CCGP_coords_filtered, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
tess_CCGP_results_all <- tess_do_everything(CCGP_vcf, CCGP_coords_v3, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
#tess_CCGP_results_filtered <- tess_do_everything(CCGP_vcf_filtered_v2, CCGP_coords_filtered, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
#tess_CCGP_results_filtered_v2 <- tess_do_everything(CCGP_vcf_filtered, CCGP_coords_filtered_v2, grid = CA_env[[1]], Kvals = 1:10, K_selection = "auto")
class(Coord) #check the class of Coord
any(is.na(Coord)) # Check for missing values
all(is.numeric(Coord)) # Check if all values are numeric
par(mfrow = c(2, 2), pty = "s", mar = rep(0, 4))
tess_ggplot(tess_CCGP_results_v6$krig_admix, plot_method = "maxQ")
tess_CCGP_results_v6$krig_admix@cpp
#Run tess with k=3
#convert vcf to genotype matrix
Acarospora_dosage <- vcf_to_dosage(CCGP_vcf_gz_v7)
head(Acarospora_dosage)
tess3_obj_noK <- tess3(Acarospora_dosage, coord = as.matrix(CCGP_coords_reordered_v4), K = 3, method = "projected.ls", ploidy = 2)
# Get TESS object and best K from results
tess3_obj <- tess3_result$tess3_obj_noK
bestK <- tess3_result[["K"]]
summary(tess3_obj_noK)
# Get Qmatrix with ancestry coefficients
qmat <- qmatrix(tess3_obj_noK, K = 3)
# qmat contains ancestry coefficient values for each individual (row) and each K value (column)
head(qmat)
krig_admix <- tess_krig(qmat, CCGP_coords_reordered_v4, krig_raster)
#visualize tess results
tess_barplot(qmat)
tess_Acarospora <- tess_ggbarplot(qmat, legend = FALSE)
tess_Acarospora
head(tess_Acarospora$data)
head(tess_Acarospora$labels)
tess_barplot(qmat, legend = FALSE)
par(mfrow = c(2, 2), pty = "s", mar = rep(0, 4))
tess_ggplot(krig_admix, plot_method = "maxQ")
#run tess3r with labeled structure plot
# retrieve tess3 Q matrix for K = 3 clusters
q.matrix <- qmatrix(tess3_obj_noK, K = 3)
# STRUCTURE-like barplot for the Q-matrix
barplot(q.matrix, border = NA, space = 0,
xlab = "SampleID", ylab = "Ancestry proportions",
main = "Ancestry matrix") -> bp
axis(1, at = 1:nrow(q.matrix), labels = bp$order, las = 3, cex.axis = .4)
#run wingen do everything:
CCGP_lyr <- coords_to_raster(CCGP_coords_reordered_v4, res = 0.5, buffer = 5)
envlayer <- rast(CA_env$CA_rPCA1)
CCGP_wingen_results <- wingen_do_everything(
gen = CCGP_vcf_gz_v7,
lyr = CCGP_lyr,
coords = CCGP_coords_reordered_v4,
wdim = 3,
fact = 0,
sample_count = TRUE,
preview = FALSE,
min_n = 2,
stat = "pi",
rarify = FALSE,
kriged = TRUE,
grd = CCGP_lyr,
index = 1:2,
agg_grd = NULL, disagg_grd = 4,
agg_r = NULL, disagg_r = NULL,
masked = TRUE, mask = envlayer,
bkg = envlayer, plot_count = TRUE
)
#run mmrr do everything
#to run mmrr we need a genetic distance matrix, which can be generated from a vcf file
#Now let's calculate PC distances
pc_dists_Acarospora <- gen_dist(CCGP_vcf_gz_v7, dist_type = "pc", npc_selection = "auto", criticalpoint = 2.0234)
set.seed(01)
mmrr_full_Acarospora_everything <- mmrr_do_everything(pc_dists_Acarospora, CCGP_coords_reordered_v4, env = CA_env, geo = TRUE, model = "full")
#run the mmrr vignette
# Install packages
mmrr_packages()
#load required libs
library(algatr)
library(here)
library(raster)
#load example data
#load_algatr_example()
#make a matrix of gendist
Y_v2 <- as.matrix(pc_dists_Acarospora)
# Extract enviro vars
env_v3 <- raster::extract(CA_env, CCGP_coords_reordered_v4)
# Calculate environmental distances
X_v2 <- env_dist(env_v3)
# Add geographic distance to X
X_v2[["geodist"]] <- geo_dist(CCGP_coords_reordered_v4)
#Run mmrr_run
set.seed(10)
results_full_Acarospora <- mmrr_run(Y_v2, X_v2, nperm = 99, stdz = TRUE, model = "full")
# Run MMRR with all variables
set.seed(01)
results_best_Acarospora <- mmrr_run(Y_v2, X_v2, nperm = 99, stdz = TRUE, model = "best")
# Single variable plot
mmrr_plot(Y_v2, X_v2, mod = results_full_Acarospora$mod, plot_type = "vars", stdz = TRUE)
# Fitted variable plot
mmrr_plot(Y_v2, X_v2, mod = results_full_Acarospora$mod, plot_type = "fitted", stdz = TRUE)
# Covariance plot
mmrr_plot(Y_v2, X_v2, mod = results_full_Acarospora$mod, plot_type = "cov", stdz = TRUE)
#mmrr_plot best model
mmrr_plot(Y_v2, X_v2, mod = results_best_Acarospora$mod, plot_type = "all", stdz = TRUE)
#mmrr summary statistics table
mmrr_table(results_full_Acarospora, digits = 2, summary_stats = TRUE)
# here we aggregate the layer for computational speed
lyr_v2 <- aggregate(CA_env$CA_rPCA1, 50)
plot(lyr_v2)
points(CCGP_coords_reordered_v4)
# Recreate MMRR input with resistance distances
# Calculate environmental distances
X_v2 <- env_dist(env_v3)
# Add geographic distance to X
X_v2[["resistdist"]] <- geo_dist(CCGP_coords_reordered_v4, type = "resistance", lyr = lyr_v2)
# Run MMRR with resistance distances
results_resist_Acarospora <- mmrr_run(Y_v2, X_v2, nperm = 99, stdz = TRUE, model = "full")
mmrr_plot(Y_v2, X_v2, mod = results_resist_Acarospora$mod, plot_type = "all", stdz = TRUE)
#table of stats for results_resist
mmrr_table(results_resist_Acarospora)
#Run GDM do everything
gdm_full_Acarospora_everything <- gdm_do_everything(pc_dists_Acarospora,
CCGP_coords_reordered_v4,
envlayers = CA_env,
model = "full",
scale_gendist = TRUE
)
#run GDM vignette
# Install packages
gdm_packages()
library(algatr)
library(raster)
library(terra)
#load example datasets
load_algatr_example()
env_Acarospora <- raster::extract(CA_env, CCGP_coords_reordered_v4)
gdm_full_Acarospora <- gdm_run(
gendist = pc_dists_Acarospora,
coords = CCGP_coords_reordered_v4,
env = env_Acarospora,
model = "full",
scale_gendist = TRUE
)
gdm_full_Acarospora$varimp
gdm_full_Acarospora$model
gdm_full_Acarospora$pvalues
CA_env$CA_rPCA1
CA_env$CA_rPCA2
CA_env$CA_rPCA3
#model selection with gdm_run; this code is not run within the vignette, the code does not work
#gdm_best <- gdm_run(gendist = liz_gendist,
# coords = liz_coords,
# env = env_liz,
# model = "best",
# scale_gendist = TRUE,
# nperm = 500,
# sig = 0.05)
# Look at p-values
#gdm_best$pvalues
#gdm_best$varimp
summary(gdm_full_Acarospora$model)
#plot the gdm
gdm_plot_diss(gdm_full_Acarospora$model)
#plotting fitted I-splines for variables
par(mfrow = c(2, 2))
gdm_plot_isplines(gdm_full_Acarospora$model)
#generate a table of GDM statistics
gdm_table(gdm_full_Acarospora)
# Extract the GDM map from the GDM model object
map_Acarospora <- gdm_map(gdm_full_Acarospora$model, CA_env, CCGP_coords_reordered_v4, plot_vars = FALSE)
maprgb_Acarospora <- map_Acarospora$pcaRastRGB
# Now, use `extrap_mask()` to do buffer-based masking
map_mask_Acarospora <- extrap_mask(CCGP_coords_reordered_v4, maprgb_Acarospora, method = "buffer", buffer_width = 1.25)
# Plot the resulting masked map
p <- plot_extrap_mask(maprgb_Acarospora, map_mask_Acarospora, RGB = TRUE, mask_col = rgb(1, 1, 1, alpha = 0.6))
ggsave("extrap_mask.pdf",p,width=12,height=12)
#RDA vignette
# Install packages
rda_packages()