-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathARIMA.R
282 lines (200 loc) · 12.2 KB
/
ARIMA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
## 16.867 Fall 2020
### ARIMA MODELING for Time Series ###
##---------------------------------------- TIME SERIES MODELING ----------------------------------------##
##---------------------------------------- Load Packages ----------------------------------------##
library("grDevices")
library("forecast")
library("TTR")
##---------------------------------------- Read in Data ----------------------------------------##
rm(list=ls()) # clear all current variables
# ticker_vec = c("AMZN", "GE", "T", "PFE", "GS")
ticker_vec = c("AMZN")
ticker = 'AMZN'
csv_name_start = "numeric_and_text_training_data_"
# csv_name_start = "numeric_training_data_"
ARIMA_and_REG_Experimentation <- function(ticker, csv_name_start){
csv_name_end = "_9_30_2012_9_30_2020"
filepath = paste("DataCSVs/", csv_name_start, ticker, csv_name_end , ".csv" , sep = '')
filepath
data = read.csv(filepath)
head(data)
data$Date_Column = as.Date(data$Date_Column, format="%Y/%m/%d")
typeof(data$Date_Column)
target_data_train = head(data, round(length(data$TARGET) * 0.7)+1)
NROW(target_data_train)
target_data_test = tail(data, round(length(data$TARGET) * 0.3)-1)
NROW(target_data_test)
start= c(as.numeric(substr(data$Date_Column[1],1,4)), as.numeric(substr(data$Date_Column[1],6,7)), as.numeric(substr(data$Date_Column[1],9,10)))
end= c(as.numeric(substr(data$Date_Column[nrow(data)],1,4)), as.numeric(substr(data$Date_Column[nrow(data)],6,7)), as.numeric(substr(data$Date_Column[nrow(data)],9,10)))
target_ts = as.ts(data['TARGET'], frequency = 365, start = start, end =end)
typeof(target_ts)
target_ts_train <- ts(head(target_ts, round(length(target_ts) * 0.7)+1), frequency = 365)
NROW(target_ts_train)
target_ts_test <- ts(tail(target_ts, round(length(target_ts) * 0.3)-1), frequency = 365)
NROW(target_ts_test)
##---------------------------------------- Plot Basic Time Series Info ----------------------------------------##
main = paste("Daily Returns for", ticker, sep = " ")
xlab = "Time Index"
ylab = "Return"
plot(target_ts_train, xlab = "Time Index", ylab = ylab, main = main)
dev.copy(png, paste("ARIMA_PNGs/", ticker, "(1) ", csv_name_start, " - Return_Time_Series", ".png", sep = ""))
dev.off()
##---------------------------------------- Decompose Time Series ----------------------------------------##
ts_components = decompose(target_ts_train)
ts_components$trend # computed using Moving Average
ts_components$seasonal # compute the average of each month (on the detrended data) to get the seasonal component
ts_components$random
plot(ts_components)
dev.copy(png, paste("ARIMA_PNGs/", ticker, "(2) ", csv_name_start," - Time Series Decomposition", ".png", sep = ""))
dev.off()
ts_components_random = ts_components$random
random_Error = na.omit(ts_components_random)
acf(random_Error, main = paste("Autocorrelation of Random Error -", ticker, sep = " "))
dev.copy(png, paste("ARIMA_PNGs/", ticker, "(3) ", csv_name_start," - ACF of Random Error", ".png", sep = ""))
dev.off()
##---------------------------------------- ARIMA ----------------------------------------##
## Compute ARIMA Model for Random Error
arima = auto.arima(random_Error, seasonal = FALSE)
arima
accuracy(arima) # Get accuracy statistics
## Make Accuracy Plots
# install.packages("ggfortify")
library(ggfortify)
ggtsdiag(arima)
dev.copy(png, paste("ARIMA_PNGs/", ticker, "(4) ", csv_name_start," - Random Error Fits", ".png", sep = ""))
dev.off()
##---------------------------------------- Re-Composing Time Series Fit ----------------------------------------##
fitted_random_ARIMA = fitted(arima) #fitted values for the ARIMA model on the random error component
resid_ARIMA = residuals(arima) # residuals of the ARIMA fit on the random error component
time_series_fitted = ts_components$trend + ts_components$seasonal + fitted_random_ARIMA
### Plot ###
plot(target_ts_train, main = main, xlab = xlab, ylab = ylab)
lines(target_ts_train, col = "blue")
lines(time_series_fitted, col = "green")
legend("topleft", legend=c("Training Target", "Fitted with Full ARIMA"),col=c("blue", "green"), lty=1:2, cex=0.8)
dev.copy(png, paste("ARIMA_PNGs/", ticker, "(5) ", csv_name_start," - Decomposition and ARIMA Training Fit", ".png", sep = ""))
dev.off()
## Calculate Training Residuals on ARIMA Model with NO Additional Features ###
overall_fit_residuals_only_arima_TRAIN = subset(target_ts_train, !is.na(ts_components$trend)) - time_series_fitted
# MSE
mean(overall_fit_residuals_only_arima_TRAIN^2)
# MAD
mean(abs(overall_fit_residuals_only_arima_TRAIN))
training_arima_only = c(mean(overall_fit_residuals_only_arima_TRAIN^2))
##---------------------------------------- Forecasting & Test Fit----------------------------------------##
num_points_to_forecast = nrow(target_ts_test)
num_points_to_forecast
# Forecast Trend
trend = ts(na.omit(ts_components$trend)) # Make to Time Series
trend_forecast = forecast(trend, h = num_points_to_forecast)
plot(trend_forecast)
# Forecast Seasonality
season_forecast = as.vector(ts_components$season)[c(1:num_points_to_forecast)]
plot(season_forecast)
# Forecast Random Error
forecasted_random_error = forecast(arima, level = c(95), h = num_points_to_forecast)
autoplot(forecasted_random_error)
# Forecast Entire Time Series by Construction
constructed_forecast = as.vector(trend_forecast$mean) + as.vector(season_forecast) + as.vector(forecasted_random_error$mean)
# Plot target_ts_train, Forecast from Forecast Function, Forecast from Construction of Forecast
# forecasted_forecast_full_horizon = append(target_ts_train, as.vector(forecast_target_ts_train$mean))
constructed_forecast_full_horizon = append(target_ts_train, as.vector(constructed_forecast))
plot.new()
plot(as.vector(target_ts), col = 'blue', ylab = "returns", xlab = "time index", main = paste(main, "-", "Full Horizon", sep = " "))
lines(as.vector(target_ts), col = 'blue')
# lines(forecasted_forecast_full_horizon, col = "red")
lines(constructed_forecast_full_horizon, col = "green")
legend("topleft", legend=c("by Construction of ts - training set + forecasts",'True Data'), col=c("green", "blue"), lty=1:2, cex=0.8)
dev.copy(png, paste("ARIMA_PNGs/", ticker, "(6) ", csv_name_start," - Forecast on Test", ".png", sep = ""))
dev.off()
## Calculate TESTING FIT Residuals on ARIMA Model with NO Additional Features ###
# Recomposition of TS
overall_fit_residuals_only_arima_TEST = target_ts_test - as.vector(constructed_forecast)
# MSE
mean(overall_fit_residuals_only_arima_TEST^2)
# MAD
mean(abs(overall_fit_residuals_only_arima_TEST))
testing_arima_only = c(mean(overall_fit_residuals_only_arima_TEST^2))
##---------------------------------------- Stationarity / Non-Stationarity in the Time Series ----------------------------------------##
# https://rpubs.com/richkt/269797
# https://datascienceplus.com/time-series-analysis-using-arima-model-in-r/
##---------------------------------------- LINEAR REGRESSION WITH TIME SERIES MODELING - Basic Idea ----------------------------------------##
##---------------------------------------- Time Series with Regression ----------------------------------------##
# Now that we have decomposed our time series and done ARIMA, we are left with residuals that we don't know about.
# We want a model like:
# Need to get indices of resid_ARIMA to use:
vec = vector()
vec = c(vec, 1:NROW(ts_components_random))
vec = subset(vec, !is.na(ts_components_random))
numericalfeatures = target_data_train[vec, 3:ncol(data)]
linreg = lm(resid_ARIMA ~ ., data = numericalfeatures)
summary(linreg)
FIT_ON_resid_ARIMA_given_by_linreg = linreg$fitted.values # fitted values
### Stepwise Regression ###
# library(MASS)
# step <- stepAIC(linreg, direction="both")
# step$anova # display results
#
first_index_to_plot = which(!is.na(ts_components$trend),, TRUE)[1]
time_series_reconstructed = subset(ts_components$trend, !is.na(ts_components$trend)) + subset(ts_components$seasonal, !is.na(ts_components$trend)) + as.vector(FIT_ON_resid_ARIMA_given_by_linreg) # as.vector(fitted_random_ARIMA) +
last_index_to_plot = first_index_to_plot + NROW(time_series_reconstructed)
number_of_indices = last_index_to_plot[1] - first_index_to_plot[1]
x = seq(first_index_to_plot, first_index_to_plot+number_of_indices-1, by=1)
# Plot target_ts_train, Forecast from Construction of Forecast
plot.new()
plot(as.vector(target_ts_train), col = 'blue', ylab = "returns", xlab = "time index", main = paste(main, "- Regression on ARIMA Residuals", sep = " "))
lines(as.vector(target_ts_train), col = 'blue')
lines(x, time_series_reconstructed, col = "red")
legend("topleft", legend=c("Arima with Regression on Residuals - Training Data", "Constructed Time Series - Training"), col=c("red", "blue"), lty=1:2, cex=0.8)
dev.copy(png, paste("ARIMA_PNGs/", ticker, "(7) ", csv_name_start, " - ARIMA Training Fit with Regression on Residuals", ".png", sep = ""))
dev.off()
## Calculate Training Residuals on Full Model with Additional Features ###
overall_fit_residuals_arima_and_regression_TRAIN = subset(target_ts_train, !is.na(ts_components$trend)) - time_series_reconstructed
# MSE
mean(overall_fit_residuals_arima_and_regression_TRAIN^2)
# MAD
mean(abs(overall_fit_residuals_arima_and_regression_TRAIN))
training_arima_and_reg = c(mean(overall_fit_residuals_arima_and_regression_TRAIN^2))
##---------------------------------------- Test Fit----------------------------------------##
## Calculate TESTING FIT Residuals on ARIMA Model with Additional Features ###
pred_reg_on_resid = predict(linreg, target_data_test)
FIT = as.vector(trend_forecast$mean) + as.vector(season_forecast) + as.vector(pred_reg_on_resid)
# Recomposition of TS
overall_fit_residuals_arima_and_regression_TEST = as.vector(target_ts_test) - FIT
# MSE
mean(overall_fit_residuals_arima_and_regression_TEST^2)
# MAD
mean(abs(overall_fit_residuals_arima_and_regression_TEST))
testing_arima_and_reg = c(mean(overall_fit_residuals_arima_and_regression_TEST^2))
plot.new()
plot(as.vector(target_ts_test), col = 'blue', ylab = "TESTING Returns", xlab = "time index", main = paste(main, "- TESTING RETURNS", sep = " "))
lines(as.vector(target_ts_test), col = 'blue')
lines(constructed_forecast, col = "red")
lines(FIT, col = "green")
legend("topleft", legend=c("True Test Data", "ARIMA", "ARIMA w/Regression"), col=c("blue", "red", "green"), lty=1:2, cex=0.8)
dev.copy(png, paste("ARIMA_PNGs/", ticker, "(8) ", csv_name_start," - ARIMA and Reg. Testing Fits", ".png", sep = ""))
dev.off()
print(paste("TRAINING, ARIMA ONLY:", as.character(training_arima_only)))
print(paste("TESTING, ARIMA ONLY:", as.character(testing_arima_only)))
print(paste("TRAINING, ARIMA AND REG:", as.character(training_arima_and_reg)))
print(paste("TESTING, ARIMA AND REG:", as.character(testing_arima_and_reg)))
return(c(training_arima_only, training_arima_and_reg, testing_arima_only, testing_arima_and_reg))
}
csv_name_start = "numeric_training_data_"
results = matrix( 0, nrow = NROW(ticker_vec)+1, ncol=4+1)
results[1,] = c("", "Training, ARIMA, Numeric", "Training, ARIMA and Regression, Numeric", "Testing, ARIMA, Numeric", "Testing, ARIMA and Regression, Numeric")
for (i in 1:NROW(ticker_vec)){
ticker = ticker_vec[i]
results[i+1,1] = ticker
results[i+1,2:NCOL(results)] = ARIMA_and_REG_Experimentation(ticker, csv_name_start)
}
write.table(results, file = "ARIMA_PNGs/Numeric_Results.csv", sep = ",", col.names = FALSE, row.names=FALSE)
csv_name_start = "numeric_and_text_training_data_"
results = matrix( 0, nrow = NROW(ticker_vec)+1, ncol=4+1)
results[1,] = c("", "Training, ARIMA, Numeric and Text", "Training, ARIMA and Regression, Numeric and Text", "Testing, ARIMA, Numeric and Text", "Testing, ARIMA and Regression, Numeric and Text")
for (i in 1:NROW(ticker_vec)){
ticker = ticker_vec[i]
results[i+1,1] = ticker
results[i+1,2:NCOL(results)] = ARIMA_and_REG_Experimentation(ticker, csv_name_start)
}
write.table(results, file = "ARIMA_PNGs/Numeric_and_Text_Results.csv", sep = ",", col.names = FALSE, row.names=FALSE)