diff --git a/R/configure_forecast_run.R b/R/configure_forecast_run.R
index 75b2eafd..ced63f87 100644
--- a/R/configure_forecast_run.R
+++ b/R/configure_forecast_run.R
@@ -33,7 +33,7 @@ get_fourier_periods <- function(fourier_periods,
 #'
 #' @param lag_periods lag_periods override
 #' @param date_type year, quarter, month, week, day
-#' @param forecast_horizon horion input from user
+#' @param forecast_horizon horizon input from user
 #'
 #' @return Returns lag_periods
 #' @noRd
@@ -49,9 +49,7 @@ get_lag_periods <- function(lag_periods,
     "quarter" = c(1,2,3,4),
     "month" = c(1, 2, 3, 6, 9, 12),
     "week" = c(1, 2, 3, 4, 8, 12, 24, 48, 52),
-    "day" = c(1, 2, 3, 4, 5, 6, 7, 14,
-              21, 28, 28*2, 28*3, 28*6,
-              28*9, 28*12, 365)
+    "day" = c(7, 14, 21, 28, 60, 90, 180, 365)
   )

   oplist <- c(oplist,forecast_horizon)
diff --git a/R/forecast_models.R b/R/forecast_models.R
index a681e32b..667c6f7f 100644
--- a/R/forecast_models.R
+++ b/R/forecast_models.R
@@ -164,7 +164,8 @@ invoke_forecast_function <- function(fn_to_invoke,
                                      date_rm_regex,
                                      back_test_spacing,
                                      fiscal_year_start,
-                                     model_type){
+                                     model_type,
+                                     pca){

   exp_arg_list <- formalArgs(fn_to_invoke)
@@ -177,7 +178,8 @@
                          'date_rm_regex' = date_rm_regex,
                          'back_test_spacing' = back_test_spacing,
                          'fiscal_year_start' = fiscal_year_start,
-                         'model_type' = model_type)
+                         'model_type' = model_type,
+                         "pca" = pca)

   avail_names <- names(avail_arg_list)
@@ -248,9 +250,10 @@ construct_forecast_models <- function(full_data_tbl,
                                       back_test_scenarios,
                                       date_regex,
                                       fiscal_year_start,
-                                      seasonal_periods
-){
-
+                                      seasonal_periods,
+                                      pca
+                                      ){
+
   forecast_models <- function(combo_value) {

     cli::cli_h2("Running Combo: {combo_value}")
@@ -335,6 +338,8 @@ construct_forecast_models <- function(full_data_tbl,
     cli::cli_h3("Individual Model Training")

+
+
     # models to run
     model_list <- get_model_functions(models_to_run,
                                       models_not_to_run,
                                       run_deep_learning)
@@ -347,7 +352,13 @@ construct_forecast_models <- function(full_data_tbl,

     models_to_go_over <- names(model_list)

-
+    # PCA
+    if(sum(pca == TRUE) == 1 | (combo_value == "All-Data" & is.null(pca)) | (is.null(pca) & date_type %in% c("day", "week"))) {
+      run_pca <- TRUE
+    } else {
+      run_pca <- FALSE
+    }
+
     for(model_name in models_to_go_over){

       model_fn <- as.character(model_list[model_name])
@@ -372,7 +383,9 @@ construct_forecast_models <- function(full_data_tbl,
                                                  fiscal_year_start = fiscal_year_start,
                                                  tscv_inital = hist_periods_80,
                                                  date_rm_regex = date_regex,
-                                                 model_type = "single"))
+                                                 model_type = "single",
+                                                 pca = run_pca))
+

       try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1,
                                                                      mdl_called,
@@ -392,25 +405,26 @@ construct_forecast_models <- function(full_data_tbl,
           freq_val <- gluon_ts_frequency
           add_name <- paste0(model_name,model_name_suffix)
         }
-
-
-        try(mdl_called <- invoke_forecast_function(fn_to_invoke = model_fn,
-                                                   train_data = train_data_recipe_1,
-                                                   frequency = freq_val,
-                                                   parallel = run_model_parallel,
-                                                   horizon = forecast_horizon,
-                                                   seasonal_period =seasonal_periods,
-                                                   back_test_spacing = back_test_spacing,
-                                                   fiscal_year_start = fiscal_year_start,
-                                                   tscv_inital = hist_periods_80,
-                                                   date_rm_regex = date_regex,
-                                                   model_type = "single"))
-
-        try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1,
-                                                                       mdl_called,
-                                                                       location = "top") %>%
-              update_model_description(1, add_name),
-            silent = TRUE)
+
+        try(mdl_called <- invoke_forecast_function(fn_to_invoke = model_fn,
+                                                   train_data = train_data_recipe_1,
+                                                   frequency = freq_val,
+                                                   parallel = run_model_parallel,
+                                                   horizon = forecast_horizon,
+                                                   seasonal_period =seasonal_periods,
+                                                   back_test_spacing = back_test_spacing,
+                                                   fiscal_year_start = fiscal_year_start,
+                                                   tscv_inital = hist_periods_80,
+                                                   date_rm_regex = date_regex,
+                                                   model_type = "single",
+                                                   pca = run_pca))
+
+        try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1,
+                                                                       mdl_called,
+                                                                       location = "top") %>%
+              update_model_description(1, add_name),
+            silent = TRUE)
+
       }

       if(model_name %in% r2_models & ("R2" %in% recipes_to_run | sum(recipes_to_run == "all") == 1 | (is.null(recipes_to_run) & date_type %in% c("month", "quarter", "year")))){
@@ -426,8 +440,9 @@ construct_forecast_models <- function(full_data_tbl,
                                                    fiscal_year_start = fiscal_year_start,
                                                    tscv_inital = hist_periods_80,
                                                    date_rm_regex = date_regex,
-                                                   model_type = "single"))
-
+                                                   model_type = "single",
+                                                   pca = run_pca))
+
         try(combined_models_recipe_2 <- modeltime::add_modeltime_model(combined_models_recipe_2,
                                                                        mdl_called,
                                                                        location = "top") %>%
@@ -630,7 +645,8 @@ construct_forecast_models <- function(full_data_tbl,
                                                      fiscal_year_start = fiscal_year_start,
                                                      tscv_inital = "1 year",
                                                      date_rm_regex = date_regex,
-                                                     model_type = "ensemble"))
+                                                     model_type = "ensemble",
+                                                     pca = FALSE))

       try(combined_ensemble_models <- modeltime::add_modeltime_model(combined_ensemble_models,
                                                                      mdl_ensemble,
diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R
index 5398df2b..f7b90476 100644
--- a/R/forecast_time_series.R
+++ b/R/forecast_time_series.R
@@ -51,10 +51,12 @@
 #' @param lag_periods List of values to use in creating lag features. Default of NULL automatically chooses these values
 #' based on date_type.
 #' @param rolling_window_periods List of values to use in creating rolling window features. Default of NULL automatically
-#' chooses these values based on date_type.
+#' chooses these values based on date type.
 #' @param recipes_to_run List of recipes to run on multivariate models that can run different recipes. A value of NULL runs
 #' all recipes, but only runs the R1 recipe for weekly and daily date types. A value of "all" runs all recipes, regardless
 #' of date type. A list like c("R1") or c("R2") would only run models with the R1 or R2 recipe.
+#' @param pca Run principal component analysis on any lagged features to speed up model run time. Default of NULL runs
+#' PCA on day and week date types across all local multivariate models, and also for global models across all date types.
 #' @param reticulate_environment File path to python environment to use when training gluonts deep learning models.
 #' Only important when parallel_processing is not set to 'azure_batch'. Azure Batch should use its own docker image
 #' that has python environment already installed.
@@ -116,6 +118,7 @@ forecast_time_series <- function(input_data,
                                  lag_periods = NULL,
                                  rolling_window_periods = NULL,
                                  recipes_to_run = NULL,
+                                 pca = NULL,
                                  reticulate_environment = NULL,
                                  models_to_run = NULL,
                                  models_not_to_run = NULL,
@@ -278,7 +281,8 @@ forecast_time_series <- function(input_data,
                                            back_test_scenarios,
                                            date_regex,
                                            fiscal_year_start,
-                                           seasonal_periods)
+                                           seasonal_periods,
+                                           pca)

   # * Run Forecast ----
   if(forecast_approach == "bottoms_up" & length(unique(full_data_tbl$Combo)) > 1 & (sum(run_global_models == TRUE) == 1 | (is.null(run_global_models) & date_type %in% c("month", "quarter", "year"))) & run_local_models) {
diff --git a/R/models.R b/R/models.R
index 553f357f..be137df4 100644
--- a/R/models.R
+++ b/R/models.R
@@ -43,7 +43,8 @@ get_recipie_configurable <- function(train_data,
                                      dummy_one_hot = TRUE,
                                      character_factor = FALSE,
                                      center_scale=FALSE,
-                                     one_hot = FALSE){
+                                     one_hot = FALSE,
+                                     pca = TRUE){

   mutate_adj_half_fn <- function(df){
     if(mutate_adj_half){
@@ -120,6 +121,16 @@ get_recipie_configurable <- function(train_data,
     }
   }

+  pca_fn <-function(df){
+    if(pca){
+      df %>%
+        recipes::step_pca(tidyselect::contains("lag"), threshold = .99, options = list(center = !center_scale, scale. = !center_scale))
+    }
+    else{
+      df
+    }
+  }
+
   recipes::recipe(Target ~ ., data = train_data %>% dplyr::select(-Combo)) %>%
     recipes::step_mutate(Date_Adj = Date %m+% months(fiscal_year_start-1)) %>%
     timetk::step_timeseries_signature(Date_Adj) %>%
@@ -129,7 +140,8 @@
     norm_date_adj_year_fn() %>%
     dummy_one_hot_fn() %>%
     character_factor_fn() %>%
-    center_scale_fn()
+    center_scale_fn() %>%
+    pca_fn()

 }
@@ -275,7 +287,7 @@ get_tune_grid <- function(train_data,
   tgCall$grid <- 10
   tgCall$control <- tune::control_grid(verbose = FALSE,
                                        allow_par = parallel,
-                                       parallel_over = "everything",
+                                       parallel_over = NULL,
                                        pkgs = get_export_packages())

   if(isMetrics){
@@ -411,7 +423,8 @@ arima_boost <- function(train_data,
                         tscv_initial,
                         date_rm_regex,
                         back_test_spacing,
-                        fiscal_year_start) {
+                        fiscal_year_start,
+                        pca) {

   #create model recipe
   date_rm_regex_final <- paste0(date_rm_regex)
@@ -421,7 +434,8 @@ arima_boost <- function(train_data,
                             date_rm_regex_final,
                             step_nzv = "zv",
                             norm_date_adj_year = TRUE,
-                            one_hot = TRUE)
+                            one_hot = TRUE,
+                            pca = pca)

   #create model spec
   model_spec_arima_boost_tune = modeltime::arima_boost(
@@ -479,7 +493,8 @@ cubist <- function(train_data,
                    tscv_initial,
                    date_rm_regex,
                    back_test_spacing,
-                   fiscal_year_start) {
+                   fiscal_year_start,
+                   pca) {

   date_rm_regex_final <- paste0(date_rm_regex, '|(year)')

@@ -492,14 +507,16 @@ cubist <- function(train_data,
                                date_rm_regex_final,
                                rm_date = "with_adj_index",
                                step_nzv = "nzv",
-                               one_hot = FALSE)
+                               one_hot = FALSE,
+                               pca = pca)
   }else{
     recipe_spec_cubist <-train_data %>%
       get_recipie_configurable(fiscal_year_start,
                                date_rm_regex_final,
                                rm_date = "with_adj",
                                step_nzv = "nzv",
-                               one_hot = FALSE)
+                               one_hot = FALSE,
+                               pca = pca)
   }

   model_spec_cubist <- rules::cubist_rules(
@@ -652,7 +669,8 @@ glmnet <- function(train_data,
                    tscv_initial,
                    date_rm_regex,
                    fiscal_year_start,
-                   back_test_spacing){
+                   back_test_spacing,
+                   pca){

   date_rm_regex_final <- paste0(date_rm_regex, '|(year)')

@@ -663,7 +681,8 @@ glmnet <- function(train_data,
                                rm_date = "with_adj_index",
                                step_nzv = "nzv",
                                one_hot = FALSE,
-                               center_scale = TRUE)
+                               center_scale = TRUE,
+                               pca = pca)
   }else{
     recipe_spec_glmnet <- train_data %>%
       get_recipie_configurable(fiscal_year_start,
@@ -671,7 +690,8 @@ glmnet <- function(train_data,
                                rm_date = "with_adj",
                                step_nzv = "nzv",
                                one_hot = FALSE,
-                               center_scale = TRUE)
+                               center_scale = TRUE,
+                               pca = pca)
   }

   model_spec_glmnet <- parsnip::linear_reg(
@@ -715,12 +735,14 @@ mars <- function(train_data,
                  parallel,
                  model_type = "single",
                  date_rm_regex,
-                 fiscal_year_start) {
+                 fiscal_year_start,
+                 pca) {

   recipe_spec_mars <- train_data %>%
     get_recipie_configurable(fiscal_year_start,
                              date_rm_regex,
-                             rm_date = "with_adj")
+                             rm_date = "with_adj",
+                             pca = pca)

   model_spec_mars <- parsnip::mars(
     mode = "regression",
@@ -893,18 +915,17 @@ nnetar_xregs <- function(train_data,
                          tscv_initial,
                          date_rm_regex,
                          fiscal_year_start,
-                         back_test_spacing) {
-
-
+                         back_test_spacing,
+                         pca) {

   date_rm_regex_final = paste0(date_rm_regex)

-
   recipe_spec_nnetar <- train_data %>%
     get_recipie_configurable(fiscal_year_start,
                              date_rm_regex_final,
                              norm_date_adj_year = TRUE,
-                             one_hot = TRUE)
+                             one_hot = TRUE,
+                             pca = pca)

   model_spec_nnetar = modeltime::nnetar_reg(
     seasonal_period = frequency,
@@ -1015,7 +1036,8 @@ prophet_boost <- function(train_data,
                           tscv_initial,
                           date_rm_regex,
                           fiscal_year_start,
-                          back_test_spacing) {
+                          back_test_spacing,
+                          pca) {

   #create model recipe
   date_rm_regex_final = paste0(date_rm_regex)
@@ -1025,7 +1047,8 @@ prophet_boost <- function(train_data,
                             date_rm_regex_final,
                             step_nzv = "zv",
                             norm_date_adj_year = TRUE,
-                            one_hot = TRUE)
+                            one_hot = TRUE,
+                            pca = pca)

   #create model spec
   model_spec_prophet_boost_tune <- modeltime::prophet_boost(
@@ -1080,7 +1103,8 @@ prophet_xregs <- function(train_data,
                           tscv_initial,
                           date_rm_regex,
                           fiscal_year_start,
-                          back_test_spacing) {
+                          back_test_spacing,
+                          pca) {

   date_rm_regex_final <- paste0(date_rm_regex)

@@ -1089,7 +1113,8 @@ prophet_xregs <- function(train_data,
                             date_rm_regex_final,
                             step_nzv = "zv",
                             dummy_one_hot = FALSE,
-                            character_factor = TRUE)
+                            character_factor = TRUE,
+                            pca = pca)

   model_spec_prophet_xregs <- modeltime::prophet_reg(
     growth = tune::tune(),
@@ -1240,7 +1265,8 @@ svm_poly <- function(train_data,
                      tscv_initial,
                      date_rm_regex,
                      fiscal_year_start,
-                     back_test_spacing) {
+                     back_test_spacing,
+                     pca) {

   if(model_type == 'ensemble') {

@@ -1250,7 +1276,8 @@ svm_poly <- function(train_data,
       get_recipie_configurable(fiscal_year_start,
                                date_rm_regex_final,
                                rm_date = "with_adj_index",
-                               one_hot = FALSE)
+                               one_hot = FALSE,
+                               pca = pca)

   } else {

@@ -1261,7 +1288,8 @@ svm_poly <- function(train_data,
                                date_rm_regex_final,
                                rm_date = "with_adj",
                                norm_date_adj_year = TRUE,
-                               one_hot = FALSE)
+                               one_hot = FALSE,
+                               pca = pca)
   }

   model_spec_svm <- parsnip::svm_poly(
@@ -1316,7 +1344,8 @@ svm_rbf <- function(train_data,
                     tscv_initial,
                     date_rm_regex,
                     fiscal_year_start,
-                    back_test_spacing) {
+                    back_test_spacing,
+                    pca) {

   if(model_type == 'ensemble') {

@@ -1326,7 +1355,8 @@ svm_rbf <- function(train_data,
       get_recipie_configurable(fiscal_year_start,
                                date_rm_regex_final,
                                rm_date = "with_adj_index",
-                               one_hot = FALSE)
+                               one_hot = FALSE,
+                               pca = pca)
   }else{

     date_rm_regex_final = date_rm_regex
@@ -1336,7 +1366,8 @@ svm_rbf <- function(train_data,
                                date_rm_regex_final,
                                norm_date_adj_year = TRUE,
                                rm_date = "with_adj",
-                               one_hot = FALSE)
+                               one_hot = FALSE,
+                               pca = pca)
   }

   model_spec_svm = parsnip::svm_rbf(
@@ -1381,7 +1412,8 @@ svm_rbf <- function(train_data,
 tabnet <- function(train_data,
                    parallel,
                    fiscal_year_start,
-                   date_rm_regex) {
+                   date_rm_regex,
+                   pca) {

   date_rm_regex_final <- "(.xts$)|(.iso$)|(hour)|(minute)|(second)|(am.pm)|(day)|(week)"

   #create model recipe
@@ -1391,7 +1423,8 @@ tabnet <- function(train_data,
                              date_rm_regex_final,
                              mutate_adj_half = FALSE,
                              step_nzv = "none",
-                             one_hot = TRUE)
+                             one_hot = TRUE,
+                             pca = pca)

   model_spec_tabnet <- tabnet::tabnet(
     mode = "regression",
@@ -1503,7 +1536,8 @@ xgboost <-function(train_data,
                    tscv_initial,
                    date_rm_regex,
                    fiscal_year_start,
-                   back_test_spacing) {
+                   back_test_spacing,
+                   pca) {

   #create model recipe
   if(model_type == 'ensemble') {
@@ -1515,7 +1549,8 @@ xgboost <-function(train_data,
                                date_rm_regex_final,
                                rm_date = "with_adj_index",
                                step_nzv = "zv",
-                               one_hot = TRUE)
+                               one_hot = TRUE,
+                               pca = pca)

   } else {

@@ -1526,7 +1561,8 @@ xgboost <-function(train_data,
                                date_rm_regex_final,
                                rm_date = "with_adj",
                                step_nzv = "zv",
-                               one_hot = TRUE)
+                               one_hot = TRUE,
+                               pca = pca)
   }

   model_spec_xgboost <- parsnip::boost_tree(
diff --git a/R/multivariate_data_prep.R b/R/multivariate_data_prep.R
index 9252d288..7d073955 100644
--- a/R/multivariate_data_prep.R
+++ b/R/multivariate_data_prep.R
@@ -179,7 +179,7 @@ multivariate_prep_recipe_2 <- function(data, external_regressors, xregs_future_v

   #add horizon specific features
   if(date_type == 'day') {
-    lag_periods_r2 <- unique(c(1, 2, 3, 4, 5, 6, 7, 14, 21, 28, 28*2, 28*3, 28*6, 28*9, 28*12, 365, forecast_horizon))
+    lag_periods_r2 <- unique(c(7, 14, 21, 30, 90, 180, 365, forecast_horizon))
   } else {
     lag_periods_r2 <- 1:forecast_horizon
   }
diff --git a/man/arima_boost.Rd b/man/arima_boost.Rd
index 32ec7062..65b07184 100644
--- a/man/arima_boost.Rd
+++ b/man/arima_boost.Rd
@@ -12,7 +12,8 @@ arima_boost(
   tscv_initial,
   date_rm_regex,
   back_test_spacing,
-  fiscal_year_start
+  fiscal_year_start,
+  pca
 )
 }
 \arguments{
diff --git a/man/cubist.Rd b/man/cubist.Rd
index 45cfa0ae..a4696162 100644
--- a/man/cubist.Rd
+++ b/man/cubist.Rd
@@ -12,7 +12,8 @@ cubist(
   tscv_initial,
   date_rm_regex,
   back_test_spacing,
-  fiscal_year_start
+  fiscal_year_start,
+  pca
 )
 }
 \arguments{
diff --git a/man/forecast_time_series.Rd b/man/forecast_time_series.Rd
index e0c0bfdc..4a9a95bc 100644
--- a/man/forecast_time_series.Rd
+++ b/man/forecast_time_series.Rd
@@ -34,6 +34,7 @@ forecast_time_series(
   lag_periods = NULL,
   rolling_window_periods = NULL,
   recipes_to_run = NULL,
+  pca = NULL,
   reticulate_environment = NULL,
   models_to_run = NULL,
   models_not_to_run = NULL,
@@ -129,6 +130,9 @@ chooses these values based on date_type.}
 all recipes, but only runs the R1 recipe for weekly and daily date types. A value of "all" runs all recipes, regardless
 of date type. A list like c("R1") or c("R2") would only run models with the R1 or R2 recipe.}

+\item{pca}{Run principal component analysis on any lagged features to speed up model run time. Default of NULL runs
+PCA on day and week date types across all local multivariate models, and also for global models across all date types.}
+
 \item{reticulate_environment}{File path to python environment to use when training gluonts deep learning models.
 Only important when parallel_processing is not set to 'azure_batch'. Azure Batch should use its own docker image
 that has python environment already installed.}
diff --git a/man/glmnet.Rd b/man/glmnet.Rd
index c038632a..de1a3806 100644
--- a/man/glmnet.Rd
+++ b/man/glmnet.Rd
@@ -12,7 +12,8 @@ glmnet(
   tscv_initial,
   date_rm_regex,
   fiscal_year_start,
-  back_test_spacing
+  back_test_spacing,
+  pca
 )
 }
 \arguments{
diff --git a/man/mars.Rd b/man/mars.Rd
index 1a9e3675..eb1c6004 100644
--- a/man/mars.Rd
+++ b/man/mars.Rd
@@ -9,7 +9,8 @@ mars(
   parallel,
   model_type = "single",
   date_rm_regex,
-  fiscal_year_start
+  fiscal_year_start,
+  pca
 )
 }
 \arguments{
diff --git a/man/nnetar_xregs.Rd b/man/nnetar_xregs.Rd
index 36e4ebf9..03238447 100644
--- a/man/nnetar_xregs.Rd
+++ b/man/nnetar_xregs.Rd
@@ -12,7 +12,8 @@ nnetar_xregs(
   tscv_initial,
   date_rm_regex,
   fiscal_year_start,
-  back_test_spacing
+  back_test_spacing,
+  pca
 )
 }
 \arguments{
diff --git a/man/prophet_boost.Rd b/man/prophet_boost.Rd
index ecba2e53..4effdf16 100644
--- a/man/prophet_boost.Rd
+++ b/man/prophet_boost.Rd
@@ -11,7 +11,8 @@ prophet_boost(
   tscv_initial,
   date_rm_regex,
   fiscal_year_start,
-  back_test_spacing
+  back_test_spacing,
+  pca
 )
 }
 \arguments{
diff --git a/man/prophet_xregs.Rd b/man/prophet_xregs.Rd
index 1a2b69e1..793b7720 100644
--- a/man/prophet_xregs.Rd
+++ b/man/prophet_xregs.Rd
@@ -11,7 +11,8 @@ prophet_xregs(
   tscv_initial,
   date_rm_regex,
   fiscal_year_start,
-  back_test_spacing
+  back_test_spacing,
+  pca
 )
 }
 \arguments{
diff --git a/man/svm_poly.Rd b/man/svm_poly.Rd
index 0b05a219..b7fff6ce 100644
--- a/man/svm_poly.Rd
+++ b/man/svm_poly.Rd
@@ -12,7 +12,8 @@ svm_poly(
   tscv_initial,
   date_rm_regex,
   fiscal_year_start,
-  back_test_spacing
+  back_test_spacing,
+  pca
 )
 }
 \arguments{
diff --git a/man/svm_rbf.Rd b/man/svm_rbf.Rd
index 88ab426c..28f628dc 100644
--- a/man/svm_rbf.Rd
+++ b/man/svm_rbf.Rd
@@ -12,7 +12,8 @@ svm_rbf(
   tscv_initial,
   date_rm_regex,
   fiscal_year_start,
-  back_test_spacing
+  back_test_spacing,
+  pca
 )
 }
 \arguments{
diff --git a/man/tabnet.Rd b/man/tabnet.Rd
index 137f7b08..588c4bf8 100644
--- a/man/tabnet.Rd
+++ b/man/tabnet.Rd
@@ -4,7 +4,7 @@
 \alias{tabnet}
 \title{TabNet}
 \usage{
-tabnet(train_data, parallel, fiscal_year_start, date_rm_regex)
+tabnet(train_data, parallel, fiscal_year_start, date_rm_regex, pca)
 }
 \arguments{
 \item{train_data}{Training Data}
diff --git a/man/xgboost.Rd b/man/xgboost.Rd
index 3eaf449e..870db268 100644
--- a/man/xgboost.Rd
+++ b/man/xgboost.Rd
@@ -12,7 +12,8 @@ xgboost(
   tscv_initial,
   date_rm_regex,
   fiscal_year_start,
-  back_test_spacing
+  back_test_spacing,
+  pca
 )
 }
 \arguments{
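The functional core of the patch is the new pca_fn() step in get_recipie_configurable(): when pca is TRUE, lag-based features are collapsed with recipes::step_pca() at a 99% variance threshold before modeling. Below is a minimal, self-contained sketch of what that step does, using toy data and made-up Target_lag* column names rather than anything from the package.

library(dplyr)
library(recipes)

# Toy series with a few correlated lag features, similar in spirit to the
# package's lagged Target columns (column names here are illustrative only).
set.seed(123)
toy <- tibble(Target = as.numeric(arima.sim(list(ar = 0.7), n = 150))) %>%
  mutate(Target_lag1 = lag(Target, 1),
         Target_lag2 = lag(Target, 2),
         Target_lag3 = lag(Target, 3)) %>%
  tidyr::drop_na()

# Same idea as pca_fn(): keep enough principal components of the lag columns
# to explain 99% of their variance, centering and scaling inside prcomp()
# (the patch does this whenever the recipe has not already center-scaled).
rec <- recipe(Target ~ ., data = toy) %>%
  step_pca(tidyselect::contains("lag"),
           threshold = 0.99,
           options = list(center = TRUE, scale. = TRUE))

prep(rec) %>% bake(new_data = NULL) %>% names()
# The lag columns are replaced by PC1, PC2, ...; Target is left untouched.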
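On defaults: construct_forecast_models() resolves pca = NULL into a per-combo decision, enabling PCA for the global "All-Data" combo and for daily/weekly data, while an explicit TRUE or FALSE overrides that everywhere. A logically equivalent sketch of the added if/else, using the same variable names as the patch:

# Equivalent to the run_pca gating added in construct_forecast_models():
# explicit pca = TRUE wins; pca = NULL turns PCA on for the global combo and
# for day/week date types; everything else leaves PCA off.
run_pca <- isTRUE(pca) ||
  (is.null(pca) && (combo_value == "All-Data" || date_type %in% c("day", "week")))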