-
Notifications
You must be signed in to change notification settings - Fork 0
/
jadebackup3011
387 lines (294 loc) · 11.6 KB
/
jadebackup3011
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
---
title: "forecasting"
output: html_document
date: "2023-11-30"
---
---
title: "ecotimeseries"
output: html_document
date: "2023-11-27"
---
```{r}
# Load required packages, installing any that are missing.
# NOTE(review): the original called install.packages() unconditionally on
# every knit, which re-downloads each package on every run and fails in
# non-interactive sessions without a configured CRAN mirror.
pkgs <- c("readxl", "dplyr", "zoo", "TTR", "forecast", "ggplot2", "Metrics")
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
```
```{r}
# Read the cage-14 spreadsheet and extract the series analysed below.
c14 <- read_excel("/Users/jadeoh/RStudio/eco_time_series/c14.xlsx")
head(c14)

flies <- c14$TotalFlies
parasites <- c14$TotalPara

# NOTE(review): the original built the paired objects with c(), which
# concatenates the two columns into one long vector instead of pairing them;
# a later chunk calls acf() on fliespara expecting a multivariate
# (two-column) series for a 2x2 plot, so cbind() is used here. The original
# also mixed TotalPara and TotalParaA -- confirm which column is the
# intended parasite count.
fliestime <- cbind(c14$TotalFlies, c14$Week)
paratime <- cbind(c14$TotalParaA, c14$Week)
fliespara <- cbind(c14$TotalFlies, c14$TotalParaA)
```
```{r}
# Keep only the week index and the two count series in a working data frame.
# (Append %>% pull() if a plain vector is ever needed instead.)
c14_df <- c14 %>%
  select(Week, TotalFlies, TotalPara)
head(c14_df)
```
```{r}
# Visual inspection: overlay fly and parasite counts on a shared weekly axis.
ggplot(c14, aes(x = Week)) +
  geom_line(
    aes(y = TotalFlies),
    color = "blue", size = 1, linetype = "solid", alpha = 0.8
  ) +
  geom_line(
    aes(y = TotalPara),
    color = "red", size = 1, linetype = "solid", alpha = 0.8
  ) +
  labs(
    title = "Flies and Parasites Over Time",
    x = "Weeks",
    y = "Counts"
  )
```
```{r}
# Represent both count columns as ts objects (weekly index, no seasonal
# period) and summarise them.
ts_flies <- c14$TotalFlies %>% ts(start = 1, frequency = 1)
ts_para <- c14$TotalPara %>% ts(start = 1, frequency = 1)
summary(ts_flies)
summary(ts_para)
```
The decomposition functions cannot be used because the series is too short and lacks seasonality — which is probably to be expected. The functions are unable to detect a clear periodic pattern in the time series.
```{r}
# Decomposition attempts -- kept commented out for future reference. With
# frequency = 1 and a short record, decompose()/stl() cannot estimate a
# seasonal component and error out, so none of these run successfully.
# decomposition <- decompose(ts_flies)
# decomposition_stl <- stl(ts_flies, s.window = "periodic")
#
# library(seasonal)
# decomposition <- seas(ts_para)
# plot(decomposition)
```
Preparing timeseries for forecasting
```{r}
# Split the series into first-half training and second-half testing windows.
# NOTE(review): the original indexed with `[`, which strips the ts class and
# restarts the test set's time index at 1; window() keeps both halves as ts
# objects aligned to their true positions in the original series.
n <- length(ts_flies)
splitpoint <- floor(n / 2)
ts_flies_train <- window(ts_flies, end = splitpoint)
ts_flies_test <- window(ts_flies, start = splitpoint + 1)
```
```{r}
# Mean Absolute Percentage Error, in percent.
#
# x: actual values; y: predicted values (same length as x).
# na.rm: drop NA terms before averaging (default FALSE, matching mean()).
# Returns a single numeric percentage.
# NOTE(review): MAPE is undefined where x == 0 (division by zero gives
# Inf/NaN), and fly counts can plausibly be zero -- such inputs are flagged
# with a warning rather than silently poisoning the mean.
mape <- function(x, y, na.rm = FALSE) {
  if (any(x == 0, na.rm = TRUE)) {
    warning("MAPE is undefined for zero actual values", call. = FALSE)
  }
  pct_err <- abs((x - y) / x)
  mean(pct_err, na.rm = na.rm) * 100
}
```
Naive Forecasting Method
The simplest forecasting method is to use the most recent observation as the forecast for the next observation. This is called a naive forecast and can be implemented using the 'naive()' function. This method may not be the best forecasting technique, but it often provides a useful benchmark for other, more advanced forecasting methods.
```{r}
# Naive benchmark: carry the last training observation forward as the
# forecast for every held-out week.
# NOTE(review): the original ran naive() on the *test* half with an
# arbitrary h = 80; as a benchmark (per the note above) it should be built
# from the training data and forecast over the held-out period so it can be
# compared against the test observations.
naive_mod <- naive(ts_flies_train, h = length(ts_flies_test))
summary(naive_mod)
print(naive_mod)
plot(naive_mod, main = "Fly population forecast from Naive method", xlab = "Weeks", ylab = "Number of Flies")
```
```{r}
# Simple moving averages: smooth short-term noise to expose longer trends.
library(TTR)
ma_window <- 3 # NB: window width chosen ad hoc; no principled selection yet
MA_flies <- SMA(ts_flies, n = ma_window)
MA_para <- SMA(ts_para, n = ma_window)
```
```{r}
# Linear trend models: flies ~ time, parasites ~ time, and flies as a
# function of parasite counts plus a linear time trend.
time_index <- time(ts_flies)
lm_flies <- lm(ts_flies ~ time_index)
summary(lm_flies)
plot(lm_flies) # lm diagnostic plots (residuals vs fitted, QQ, etc.), not the fitted line
plot(residuals(lm_flies))
# time_index is reassigned here; both series share the same weekly index,
# so the values should be unchanged -- confirm.
time_index <- time(ts_para)
lm_para <- lm(ts_para ~ time_index)
summary(lm_para)
plot(lm_para)
# Flies regressed on parasite counts with a time trend.
lm_fliespara <- lm(ts_flies ~ ts_para+time_index)
summary(lm_fliespara)
plot(lm_fliespara)
```
```{r}
# auto.arima searches over (p,d,q) orders and picks the best model by
# information criterion, then we forecast a year ahead.
library(forecast)
horizon <- 52 # one year of weekly observations

autoarima_flies <- auto.arima(ts_flies)
autoarima_flies_forecast <- forecast(autoarima_flies, horizon)
plot(fitted(autoarima_flies), col = "red", xlab = "Time", ylab = "Value", main = "auto.ARIMA Fitted Values Flies", xlim = c(1, length(ts_flies) + horizon))
lines(autoarima_flies_forecast$mean, col = "pink")
plot(autoarima_flies_forecast) # Historical series, point forecasts, and forecast intervals.

autoarima_para <- auto.arima(ts_para)
autoarima_para_forecast <- forecast(autoarima_para, horizon)
# NOTE(review): the x-limit below originally used length(ts_flies); switched
# to length(ts_para) for consistency (both columns come from the same sheet,
# so the value should be identical -- confirm).
plot(fitted(autoarima_para), col = "red", xlab = "Time", ylab = "Value", main = "auto.ARIMA Fitted Values Parasites", xlim = c(1, length(ts_para) + horizon))
lines(autoarima_para_forecast$mean, col = "pink")
# Long-horizon point forecasts revert toward the process mean, which is why
# the prediction flattens out.
plot(autoarima_para_forecast)
```
Arima model and predictions
```{r}
# Fit an ARIMA to the training half and forecast across the test period.
library(forecast)
ts_flies_model <- auto.arima(ts_flies_train)
# NOTE(review): h = 159 is hard-coded -- presumably meant to be
# length(ts_flies_test); confirm.
ts_flies_prediction <- forecast(ts_flies_model, h = 159)
plot(ts_flies_prediction)
lines(ts_flies_prediction$fitted, col = "pink")
# NOTE(review): the second plot() call below opens a fresh device frame,
# replacing the training plot before it can be seen; lines() (or
# par(new = TRUE) before it) was probably intended.
plot(ts_flies_train, xlim = c(1, length(ts_flies)), ylim = c(-400, 400))
plot(ts_flies_test, xlim = c(1, length(ts_flies)), ylim = c(-400, 400))
par(new = TRUE) # overlay the in-sample fitted values on the test-set axes
plot(ts_flies_prediction$fitted, col = "orange")
```
Highlighting limitations of arima forecasting
because it's an additive model of linear models
try forecasting for a week or two and then
how can i predict the next week from the MA? then 2 weeks? 4 weeks?
Linearising isn't always a good thing to do, so this shows the limitations of linear approaches to time series
Non-linear models e.g. in the bruchids.txt
dependent on multiplicative growth of the population
How far can we get with linear time series analysis? (not very far)
They're not sufficient for modelling the complexity of the interactions
Tells us we need to move to a different, non-linear framework.
How much can we believe in replication? There is variation and heterogeneity between replicates even though they are meant to be in a controlled environment. Do they differ in mean, variation, or coefficient?
Highlight how different these replicates are.
My bit: how well does forecasting work with linear models? (not very well)
Alice: VAR approach reasonably well at first, but when adding complexity it breaks down and thus we need non-linear models.
Environmental stochasticity
Demographic stochasticity: variation within flies, wasps e.g. some less fecund than others, some more resilient than others.
Conglomerate of measurement error, environmental stochasticity, individual stochasticity.
Decompose all that understanding and ideas into measurements associated into all these sources of variation and heterogeneity.
We can only forecast one week ahead with this model. this highlights that linear models aren't the right approach
predator prey interactions oscillate and are not linear.
chaos statistics
Non-linear model, non-normal distribution, and conditioning. This makes it machine learning — but it arises just from the addition of those three things.
A GAM (generalized additive model) would be useful.
GAMs work very well when there's lots of data and less well when there's sparse data.
ARIMA model and predictions attempt 2
```{r}
# Second ARIMA attempt using stats::arima directly with a fixed
# ARIMA(0,1,2) specification: no AR terms, one difference, MA order 2.
library(stats)
# NOTE(review): this fits to the *test* half only -- confirm that is
# intended rather than ts_flies_train or the full series.
stats_arima <- stats::arima(ts_flies_test, order=c(0,1,2))
print(stats_arima)
# With the forecast package attached, plot() on an Arima object appears to
# dispatch to its method -- verify what this renders (stats itself defines
# no plot method for Arima fits).
plot(stats_arima)
```
```{r}
library(Metrics)
# Out-of-sample accuracy of the ARIMA forecast: Mean Absolute Error (MAE),
# Mean Squared Error (MSE), and Root Mean Squared Error (RMSE) against the
# held-out test observations.
# NOTE(review): the original passed the entire forecast object to mae() and
# compared in-sample $fitted values (training-period, training length)
# against the test set; the point forecasts ($mean), truncated to the
# test-set length, are the correct comparison.
h <- length(ts_flies_test)
pred <- as.numeric(ts_flies_prediction$mean)[seq_len(h)]
mae_flies <- mae(as.numeric(ts_flies_test), pred)
mse_flies <- mse(as.numeric(ts_flies_test), pred)
rmse_flies <- sqrt(mse_flies)
cat(mae_flies, mse_flies, rmse_flies)
```
```{r}
# Overlay the ARIMA in-sample fits on the observed series.
# NOTE(review): $fitted comes from the model fit on the training half, so
# the dashed/pink lines cover only the training period, not the test set.
plot(ts_flies, col = "blue", xlab = "Weeks", ylab = "Value", main = "ARIMA prediction vs actual")
lines(ts_flies_prediction$fitted, col = "red", lty = 2)
plot(ts_flies, col = "black", xlab = "Time", ylab = "Value", main = "ARIMA predictions vs Actual", xlim = c(1, length(ts_flies)))
lines(ts_flies_prediction$fitted, col = "pink")
lines(ts_flies_test)
```
```{r}
# Same comparison on fixed axes so the panels are directly comparable.
# NOTE(review): the first plot() is immediately replaced by the second (no
# par(new = TRUE) between them), so only the last two layers end up overlaid.
plot(ts_flies, xlim = c(1, length(ts_flies)), ylim =c(-400, 400), col = "black")
plot(ts_flies_prediction$fitted, xlim = c(1, length(ts_flies)), ylim = c(-400, 400), col = "pink")
par(new = TRUE)
plot(ts_flies_prediction, xlim = c(1, length(ts_flies)), ylim = c(-400, 400))
```
```{r}
# Inspect the autocorrelation structure, then fit a hand-specified
# ARIMA(1,0,1): AR order 1, no differencing (the integrating term is not
# important for us), MA order 1.
plot(ts_flies, main = "ts_flies", ylab = "Value")
lag_structure <- acf(ts_flies)

fitted_arima <- Arima(ts_flies, c(1,0,1))
str(fitted_arima)

# Forecast 48 weeks ahead and inspect the point forecasts.
projection <- forecast(fitted_arima, 48)
str(projection)
projection$mean

# Overlay the forecast path on the observed series.
plot(ts_flies)
lines(projection$mean, col = "pink")
```
```{r}
# ADF test for stationarity of the fly series; difference repeatedly until
# stationary. p-value > 0.05 => fail to reject a unit root => non-stationary.
# NOTE(review): adf.test() comes from the tseries package, which was never
# loaded. Also, the original loop differenced the *original* series on each
# iteration, so if one difference was not enough it would re-test the same
# once-differenced series forever; here each pass differences the current
# working series.
library(tseries)
differencing_count <- 0
ts_flies_diff <- ts_flies
adf_test <- adf.test(ts_flies_diff)
print(adf_test)
while (adf_test$p.value > 0.05) { # Adjust the significance level as needed
  ts_flies_diff <- diff(ts_flies_diff)
  adf_test <- adf.test(ts_flies_diff)
  differencing_count <- differencing_count + 1
  print(adf_test)
}
cat("Number of differencing steps:", differencing_count, "\n")
```
```{r}
# Repeat the d (differencing-order) calculation for the parasite series.
# NOTE(review): same fixes as the fly-series chunk -- tseries must be loaded
# for adf.test(), and each loop pass must difference the *current* working
# series (the original re-differenced ts_para itself every iteration, so the
# loop could never progress past one difference).
library(tseries)
differencing_count <- 0
ts_para_diff <- ts_para
adf_test <- adf.test(ts_para_diff)
print(adf_test)
while (adf_test$p.value > 0.05) { # Adjust the significance level as needed
  ts_para_diff <- diff(ts_para_diff)
  adf_test <- adf.test(ts_para_diff)
  differencing_count <- differencing_count + 1
  print(adf_test)
}
cat("Number of differencing steps:", differencing_count, "\n")
```
```
d = differencing order — the number of times the series needs to be differenced to achieve stationarity.
```{r}
# Partial autocorrelation of the fly series.
# NOTE(review): the original comment labelled this "q - moving average
# order", but in Box-Jenkins practice the PACF cut-off suggests the AR
# order p, while the ACF cut-off suggests the MA order q -- re-check which
# order was meant here.
print(ts_flies)
pacf_result <- pacf(ts_flies)
plot(pacf_result, main = "Partial Autocorrelation Function (PACF)")
```
The highest significant lag in the PACF plot — looks like 11 here? (Note: conventionally the PACF cut-off suggests the AR order p, while the MA order q is read from the ACF — worth re-checking.)
```{r}
# ARIMA with hand-chosen orders: AR = 4 (roughly the fly lifespan in weeks,
# per the notes below), d = the differencing count found above, MA = 11
# (read from the PACF plot).
# NOTE(review): differencing_count at this point holds whichever series was
# differenced last (parasites, if the chunks ran in order) -- confirm the
# intended d for the fly series. Also, an MA order read off the PACF is
# questionable; see the previous chunk.
arima_flies <- arima(ts_flies, order = c(4, differencing_count, 11))
# NOTE(review): n.ahead equal to the full series length is a very long
# horizon; point forecasts will flatten toward the mean well before then.
predicted_values <- predict(arima_flies, n.ahead = length(ts_flies))
# Plot the original data and the predicted values
plot(ts_flies, type = "l", col = "blue", xlab = "Time", ylab = "Value", main = "ARIMA vs Original Data", xlim = c(1, length(ts_flies) + 100))
lines(predicted_values$pred, col = "red", lty = 2)
# Add legend
legend("topright", legend = c("ts_flies", "ARIMA Predictions"), col = c("blue", "red"), lty = c(1, 2))
```
```{r}
# Summary plots: each observed series with its linear trend (red) and ARIMA
# in-sample fit (light green) overlaid.
plot(ts_flies, xlab = "Time", ylab = "Value", main = "Time Series of Flies")
abline(lm_flies, col = "red")
# lines(MA_flies, col = "purple")
lines(fitted(arima_flies), col = "lightgreen")

plot(ts_para, xlab = "Time", ylab = "Value", main = "Time Series of Parasites")
abline(lm_para, col = "red")
# lines(MA_para, col = "purple")
# NOTE(review): the original referenced arima_para, which is never defined
# anywhere in the file (only autoarima_para exists), so this line errored.
# A parasite ARIMA mirroring the flies fit is created here instead --
# confirm the intended model.
arima_para <- arima(ts_para, order = c(4, differencing_count, 11))
lines(fitted(arima_para), col = "lightgreen")
```
```{r}
# ACF of the combined flies/parasite data to inform the AR (p) order in an
# ARIMA model; intended as a 2x2 panel of auto- and cross-correlations.
# NOTE(review): a 2x2 acf plot requires fliespara to be a two-column object
# (cbind); as defined earlier with c() it is a single concatenated vector,
# so acf() would treat it as one long series -- verify the construction
# upstream.
acf(fliespara, lag.max = 20, plot = TRUE)
```
#Not stationary
#Best to use 4 as lifespan of fly, or maybe 5
#Then do for other three series
#Condition the data?
#Flies
```{r}
# Condition (lag-4 difference) both series -- 4 weeks being roughly one fly
# lifespan -- padding the first 4 values with the series mean so the
# conditioned vectors keep the original length.
# NOTE(review): the original applied $ to atomic vectors (flies and
# parasites are plain columns, so flies$TF_c errors) and referenced columns
# TF/TA/TP that do not exist in c14 (which has TotalFlies/TotalPara/
# TotalParaA); rewritten against the actual columns -- confirm TotalPara
# (not TotalParaA) is the intended parasite count.
flies_c <- c(rep(mean(c14$TotalFlies), 4), diff(c14$TotalFlies, lag = 4))
parasites_c <- c(rep(mean(c14$TotalPara), 4), diff(c14$TotalPara, lag = 4))
# Join back together as a two-column matrix for the multivariate ACF below.
fliesparasites_c <- cbind(flies_c, parasites_c)
```
#Replot the conditioned data
```{r}
# Re-examine the auto-/cross-correlations after the lag-4 conditioning; per
# the notes below, the significant lags should now relate to the lag used
# in the diff.
acf(fliesparasites_c, lag.max = 20, plot = TRUE)
```
#Next steps: look at the other cage runs
#Are the significant lags at the same place?
#The lags once corrected then become the number of weeks you specified in the diff
#Can look if there is the same importance of weeks in the other three time series
#Linear modelling and comparison (on ln scale data)
#ARMA model on a log scale, choose the same noisy parameter as the ACF one