-
Notifications
You must be signed in to change notification settings - Fork 0
/
LFMM.r
155 lines (127 loc) · 5.33 KB
/
LFMM.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Convert VCF to BED format using PLINK
plink --bfile 04.geno0.1maf0.05mind0.5 --recode vcf --out 04.geno0.1maf0.05mind0.5
# Impute missing genotypes using Beagle
java -jar beagle.01Mar24.d36.jar gt=04.geno0.1maf0.05mind0.5.vcf out=04.geno0.1maf0.05mind0.5.beagle
# Convert back from VCF to BED format using PLINK
plink --vcf 04.geno0.1maf0.05mind0.5.beagle.vcf.gz --make-bed --out 553
# Convert SV to SNP format and adjust .bim file
awk '{print "chr" $1 "\t" "chr"$1"_"$4 "\t" $3 "\t" $4 "\t" "A" "\t" "T"}' 553.bim > chr.bim
cp 553.bed chr.bed
cp 553.fam chr.fam
# Generate raw data file for LFMM format conversion
plink --bfile chr --recodeA --out final
cut -d " " -f 7- final.raw | sed '1d' > final.lfmm
# Copy necessary R script files to the working directory
cp fn-landgen1.R landgen1.R lib.R ./
# Read and process genotype data for LFMM analysis
x = "final1"
Y <- data.table::fread(paste(x, ".raw", sep = ""), na.strings = "NA", header = T)
Y <- Y[, -c(1:6)]
fwrite(Y, paste(x, ".lfmm", sep = ""), col.names = F, row.names = F, sep = "\t", na = "9")
Y <- LEA::lfmm2geno(paste(x, ".lfmm", sep = ""))
# Read environmental data (ecotype)
X <- read.table("eco.txt", h = T, stringsAsFactors = F)
X <- as.matrix(X[, c(5:20)])
# PCA analysis using LEA
pc <- LEA::pca(paste(x, ".lfmm", sep = ""), scale = TRUE)
tw <- LEA::tracy.widom(pc)
print(tw$pvalues[1:10])
# Scree plot of PCA eigenvalues
pdf("1.pdf")
plot(tw$eigenvalues, main = "Scree plot", ylab = "Eigenvalues", xlab = "PCs", t = "b")
dev.off()
# Perform SNP-based clustering (snmf) and plot results
obj.snmf <- LEA::snmf(Y, K = 1:20, entropy = T, ploidy = 2, CPU = 48, project = "new")
pdf("2.pdf")
plot(obj.snmf, pch = 16, col = "blue")
dev.off()
# Barplot of the inferred genetic clusters
K <- 16
pdf("3.pdf")
barplot(t(Q(obj.snmf, K = K)), col = c("forestgreen", "blue", "pink", "black"), border = NA, las = 2)
box(lwd = 2)
dev.off()
# LFMM ridge regression for association testing
mod.lfmm <- lfmm::lfmm_ridge(Y = Y, X = X, K = K)
# Perform LFMM tests for associations with environmental variables
pv <- lfmm::lfmm_test(Y = Y, X = X, lfmm = mod.lfmm, calibrate = "gif")
pvalues <- pv$calibrated.pvalue
# Save p-values and plot histograms
write.table(pvalues, file = "pvalues.txt", sep = "\t")
# Histogram of p-values for each environmental variable
pdf("4.pdf")
par(mfrow = c(3, 3), mar = c(3.5, 3.5, 3, 0.5), mgp = c(2.5, 0.8, 0))
for (i in 1:ncol(pvalues)) {
hist(pvalues[, i], breaks = 20, xlab = "p-values", xlim = c(0, 1), main = paste("Env. variable ", i, sep = ""), col = "darkgray", border = "darkgray")
}
dev.off()
# Q-Q plot for p-values
pdf("5.pdf")
qqplot(rexp(length(pvalues), rate = log(10)), -log10(pvalues), xlab = "Expected quantile", ylab = expression(-log[10] ~ p - values), pch = 19, cex = 0.4)
abline(0, 1)
dev.off()
# Control False Discovery Rate (FDR) with q-values
qobj <- qvalue::qvalue(pvalues[, 1])
pdf("6.pdf")
hist(qobj)
dev.off()
pdf("7.pdf")
plot(qobj)
dev.off()
# Apply q-value calculation for each environmental variable
my_qvalue <- function(x) {
q <- qvalue::qvalue(x)
return(q$qvalues)
}
qvalues <- apply(pvalues, 2, my_qvalue)
# Q-value histograms for each environmental variable
pdf("8.pdf")
par(mfrow = c(3, 3), mar = c(3.5, 3.5, 3, 0.5), mgp = c(2.5, 0.8, 0))
for (i in 1:ncol(qvalues)) {
hist(qvalues[, i], breaks = 20, xlab = "q-values", xlim = c(0, 1), main = paste("Env. variable ", i, sep = ""), col = "darkgray", border = "darkgray")
}
dev.off()
# Writing final results with SNP information and q-values
map <- read.table(paste("chr1", ".map", sep = ""))
qvalues <- as.data.frame(qvalues)
qvalues$SNP <- as.character(map$V2)
qvalues$CHR <- map$V1
qvalues$BP <- map$V4
write.table(qvalues, file = "qvalues.txt", sep = "\t")
# Decide on a q-value cutoff and filter results
lfmm_qvalcut <- function(qvalues, cutoff) {
res <- list()
e <- qvalues[, 1:(ncol(qvalues) - 3)]
s <- qvalues[, (ncol(qvalues) - 2):ncol(qvalues)]
for (i in 1:ncol(e)) {
j <- which(e[, i] < cutoff)
if (length(j) == 0) {
t <- data.frame("ENV" = colnames(e)[i], "qval" = NA, "SNP" = NA, "CHR" = NA, "BP" = NA)
res[[i]] <- t
} else {
t <- data.frame("ENV" = colnames(e)[i], "qval" = e[j, i], "SNP" = s[j, 1], "CHR" = s[j, 2], "BP" = s[j, 3])
res[[i]] <- t
}
}
res <- do.call(rbind.data.frame, res)
res <- na.omit(res)
return(res)
}
# Filter results based on q-value cutoff
lfmm.res <- lfmm_qvalcut(qvalues = qvalues, cutoff = 0.01)
write.table(lfmm.res, file = "lfmm.res.txt", sep = "\t")
# Generate Manhattan plots for results
manh <- pvalues[, c(17:19, 1:16)]
write.table(manh, file = "manh.txt", sep = "\t")
# Create Manhattan plots for each environmental variable
pdf("9.pdf")
par(mfrow = c(3, 1), oma = c(1, 1, 1, 1), mar = c(4, 5, 3, 1), mgp = c(2.5, 0.8, 0))
for (i in 4:ncol(manh)) {
manh_i <- manh[, c(1:3, i)]
SNPs <- lfmm.res$SNP[lfmm.res$ENV == colnames(manh_i)[4]]
colnames(manh_i)[4] <- "P"
qqman::manhattan(manh_i, genomewideline = F, suggestiveline = F, main = paste("LFMM - Env. variable ", colnames(manh)[i], sep = ""), highlight = SNPs, col = c(alpha("gray70", 0.7), alpha("gray30", 0.7)), xlab = "Chromosome", cex.lab = 1.5, cex.axis = 1)
box()
}
dev.off()
# Repeat plotting for the other files as needed