-
Notifications
You must be signed in to change notification settings - Fork 1
/
Analysis of apple trees.R
243 lines (172 loc) · 8.26 KB
/
Analysis of apple trees.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# For the initial exercise use the following code
# to generate your own unique dataset for tree1, 2 & 3
Na <- 20
apples1 <-rnorm(Na,rnorm(1,20,3),rnorm(1,5))
apples2<-rnorm(Na,rnorm(1,40,3),rnorm(1,15))
apples3<-rnorm(Na,rnorm(1,80,3),rnorm(1,10))
# Set up some graphical parameters
par(mfrow=c(3,1))
xvals<-0:100
# Calculate the likelihood curves (normalized)
likeMean1<-dnorm(xvals,mean=mean(apples1),sd=sd(apples1)/Na^.5)*100
likeMean2<-dnorm(xvals,mean=mean(apples2),sd=sd(apples2)/Na^.5)*100
likeMean3<-dnorm(xvals,mean=mean(apples3),sd=sd(apples3)/Na^.5)*100
# Plot the raw data
hist( apples1,xlim=c(0,100),
ylim=c(0,max(likeMean1)),breaks=seq(0,150,5), border='purple',
main='Weights of apples on Tree 1')
lines( xvals,
dnorm(xvals,mean=mean(apples1),sd=sd(apples1))*100,
col='purple')
hist( apples2,xlim=c(0,100),
ylim=c(0,max(likeMean2)),breaks=seq(0,150,5), border='red',
main='Weights of apples on Tree 2')
lines( xvals,
dnorm(xvals,mean=mean(apples2),sd=sd(apples2))*100,
col='red')
hist( apples3,xlim=c(0,100),
ylim=c(0,max(likeMean3)),breaks=seq(0,150,5), border='blue',
main='Weights of apples on Tree 3')
lines( xvals,dnorm(xvals,
mean=mean(apples3),sd=sd(apples3))*100,
col='blue')
# use the mean() and sd() commands to obtain the mean and SD for each tree
# compare them with your neighbour
# use lm() command to find the mean & residual variation for each of apples1-3
# e.g. mod1<-lm(apples1~1); summary(mod1)
# Make sure you understand what the residual standard error is
# make sure you understand what the intercept and 'Std. Error' (of the intercept) is, and why it is different
# How do these values relate to the mean and sd you calculated?
# plot out the likelihood curves, how do they relate to the Std. Error
par(mfrow=c(3,1))
hist(apples1,xlim=c(0,100),ylim=c(0,max(likeMean1)),breaks=seq(0,150,5), border='purple')
lines(xvals,dnorm(xvals,mean=mean(apples1),sd=sd(apples1))*100,col='purple')
lines(xvals,(likeMean1))
hist(apples2,xlim=c(0,100),ylim=c(0,max(likeMean2)),breaks=seq(0,150,5), border='red')
lines(xvals,dnorm(xvals,mean=mean(apples2),sd=sd(apples2))*100,col='red')
lines(xvals,(likeMean2))
hist(apples3,xlim=c(0,100),ylim=c(0,max(likeMean3)),breaks=seq(0,150,5), border='blue')
lines(xvals,dnorm(xvals,mean=mean(apples3),sd=sd(apples3))*100,col='blue')
lines(xvals,(likeMean3))
# Now bind your dataset into one long vector, and create a factor to distinguish the data from each tree
allApples<-c(apples1,apples2,apples3)
treef<-factor(rep(c('Tree1','Tree2','Tree3'),each=Na))
# print them out to have a look by typing their names 'allApples' and 'treef'
# use mod4<-lm(allApples~treef) to fit all 3 tree means.
# use the summary() command to look at the results
# Which mean is the intercept equal to, why?
# Why are the values relating to the other two means not the same as you found before?
# Why are the standard errors for these two values the same, when they have different variances?
# Install the library allowing you to run mixed effects models
library(nlme)
# Fit a simple nlme
mod5<-lme(allApples~1,random=~1|treef)
# Examine the result using the summary() command
# what is the intercept of the fixed effect equal to ?
# what do the StdDev: (Intercept) Residual refer to
# See how these new fitted values compare to the raw mean values
par(mfrow=c(3,1))
hist(apples1,xlim=c(0,100),ylim=c(0,max(likeMean1)),breaks=seq(0,150,5), border='purple')
lines(xvals,dnorm(xvals,mean=mean(apples1),sd=sd(apples1))*100,col='purple')
lines(xvals,(likeMean1))
abline(v=fitted(mod5)[treef=='Tree1'],col='red')
hist(apples2,xlim=c(0,100),ylim=c(0,max(likeMean2)),breaks=seq(0,150,5), border='red')
lines(xvals,dnorm(xvals,mean=mean(apples2),sd=sd(apples2))*100,col='red')
lines(xvals,(likeMean2))
abline(v=fitted(mod5)[treef=='Tree2'],col='red')
hist(apples3,xlim=c(0,100),ylim=c(0,max(likeMean3)),breaks=seq(0,150,5), border='blue')
lines(xvals,dnorm(xvals,mean=mean(apples3),sd=sd(apples3))*100,col='blue')
lines(xvals,(likeMean3))
abline(v=fitted(mod5)[treef=='Tree3'],col='red')
# Are theseequal to the means of each tree?
# Use the plot(fitted()) command to see what the predictions are for each tree
# Use unique(fitted()) combination to pull out the values
# compare them to your previous estimates
# run this command to modify your data so that weight of each apple depends
# on its height in the tree
numTrees<-4 # Number of trees (each has 30 apples)
allApplesh<-rnorm(numTrees*30,40,3) # give all apples a random element to their weight
height<-runif(numTrees*30,1,3) # give all apples a randomly allocated height on the tree
treef<-factor(rep(paste('Tree',1:numTrees),each=30)) # A factor to identify the trees (each with 30 apples)
treeIntercepts<-rnorm(numTrees,0,9) # for each tree choose a value for the intercept
treeSlopes<-rnorm(numTrees,0,3) # and slope for the regression of apple weight on height
# Calculate the effect of height in the tree on each apple in each tree and plot
extraWt<-height*treeSlopes[treef]+treeIntercepts[treef]
plot(height,extraWt)
# add the height effect and underlying variation together
allApplesh<-allApplesh+extraWt
# enable lattice plots
library(lattice)
# Use xyplot to examine the data
xyplot(allApplesh~height|treef)
# Fit a different fixed effect of height for each tree, plus the random effects
mod6<-lme(allApplesh~height+height/treef,random=~1|treef)
mod7<-lm(allApplesh~height*treef)
mod8<-lme(allApplesh~height,random=~height|treef)
xyplot(allApplesh+fitted(mod7)+fitted(mod8)~height|treef)
# Advanced topic: Why do the fitted lines from lm and lme differ from each other
# generate a new modified dataset if your effect is not obvious
##################################################################
# Applying this logic to genetic data
# Generate some genotypes
nIndivs<-30;nLoci=40
genotypes<-matrix(sample(-1:1,nIndivs*nLoci,T),nrow=nIndivs)
# Give the rows and cols of the matrix appropriate names
gnames<-list()
gnames[[1]]<-paste("Ind", 1:nIndivs)
gnames[[2]]<-paste("Loc", 1:nLoci)
dimnames(genotypes)<-gnames
# print out the top left of the matrix
genotypes[1:10,1:7]
# look at the genotype data using the head(command)
# now generate the effects of each locus
locusEffects<-rnorm(nLoci,sd=3)
# Look at the effect size for each locus by printing the values
# Use matrix multiplication to obtain expected phenotype
expectedWT<-100+genotypes %*% locusEffects
# examine expectedWT
# add the environmental variation to get the phenotype
phenotypeWT<-expectedWT+rnorm(nIndivs,sd=2)
# use the plot function to compare expected and observed phenotypes
# see that lm doesnt work
mod10<-lm(phenotypeWT~genotypes)
summary(mod10)
# Why does it not work?
## We can try a special version of mixed effects modelling to analyse this relationship
install.packages('rrBLUP',dependencies=T)
# load the library
library(rrBLUP)
BLUP<-mixed.solve(phenotypeWT,Z=genotypes,
K = NULL, SE = FALSE, return.Hinv=FALSE)
# Show that BLUP has successfully obtained an (over) accurate estimate of phenotype
# by taking the BLUP estimates of the genotypic effects 'u' and multiplying them by
# genotye
est<-as.vector(genotypes%*%BLUP$u)
plot(est,phenotypeWT)
# Actually they have over-fitted the data as you can see by plotting these values
# against the true genotypic effects
plot(expectedWT,est)
# Compare the estimated genotypic effects, BLUP$u
# with the real 'locusEffects' , using plot()
#### Super advanced stuff######
# you can show this in real life by using half the data to estimate u
# then using it to predict the other half
testWT<-phenotypeWT[16:30]
trainWT<-phenotypeWT[1:15]
testG<-genotypes[16:30,]
trainG<-genotypes[1:15,]
BLUP1<-mixed.solve(trainWT,Z=trainG,K = NULL, SE = FALSE, return.Hinv=FALSE)
# Show that estimated effects matrix-multiplied by genotype is the prediction
est1<-as.vector(trainG%*%BLUP1$u)
plot(est1,trainWT)
# compare the prediction vs the true values
plot(est1,expectedWT[1:15])
# now the harder task, use the estimates to predict the phenotypes that were
# excluded from the analysis
est2<-as.vector(testG%*%BLUP1$u)
plot(est2,testWT)
# Advanced: why is the 2nd correlation poorer
# What happens if you add more environmental variation to the phenotype?
######################################################
########### Exercise ends ############################
######################################################