From 0c7a314a68349d380b76523dc8bbd13398043b02 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 16 Nov 2021 14:31:42 -0800 Subject: [PATCH 1/9] updates to default values of back testing and parallel arguments --- .github/workflows/R-CMD-check.yaml | 3 +-- DESCRIPTION | 3 +++ NAMESPACE | 4 ++++ R/configure_forecast_run.R | 6 +++--- R/forecast_models.R | 6 +++--- R/forecast_time_series.R | 34 +++++++++++++++--------------- R/models.R | 8 +++---- R/multivariate_data_prep.R | 8 +++---- R/prepare_hierarchy_data.R | 4 ++-- R/prepare_input_data.R | 4 ++-- R/utility.R | 10 +++++++++ R/validate_forecasting_inputs.R | 26 +++++++++++------------ man/forecast_time_series.Rd | 18 ++++++++-------- man/reexports.Rd | 16 ++++++++++++++ vignettes/feature-engineering.Rmd | 6 +++--- 15 files changed, 94 insertions(+), 62 deletions(-) create mode 100644 man/reexports.Rd diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f2551675..658bb84e 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -25,8 +25,7 @@ jobs: - {os: macOS-latest, r: 'release'} - {os: windows-latest, r: 'release'} - # Use 3.6 to trigger usage of RTools35 - - {os: windows-latest, r: '3.6'} + # Use older ubuntu to maximise backward compatibility - {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'} diff --git a/DESCRIPTION b/DESCRIPTION index 52b0274c..51a2f60c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,9 @@ Imports: hts, kernlab, lightgbm, + magrittr, matrixcalc, + methods, modeltime.ensemble, modeltime.gluonts, modeltime.resample, @@ -68,6 +70,7 @@ Remotes: Suggests: rmarkdown, knitr, + reactable, testthat (>= 3.0.0) Config/testthat/edition: 3 Depends: diff --git a/NAMESPACE b/NAMESPACE index 0285e430..d2007865 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export("%>%") export(arima) export(arima_boost) export(croston) @@ -25,3 +26,6 @@ export(tabnet) export(tbats) export(theta) export(xgboost) +importFrom(magrittr,"%>%") +importFrom(methods,formalArgs) +importFrom(stats,sd) diff --git a/R/configure_forecast_run.R b/R/configure_forecast_run.R index ced63f87..9fe66871 100644 --- a/R/configure_forecast_run.R +++ b/R/configure_forecast_run.R @@ -164,7 +164,7 @@ get_date_regex <- function(date_type){ #' Gets the back testing spacing #' -#' Checks if back_test_spacing is set to auto and gets the right one +#' Checks if back_test_spacing is set to NULL and gets the right one #' #' #' @param back_test_spacing back_test_spacing override @@ -175,7 +175,7 @@ get_date_regex <- function(date_type){ get_back_test_spacing <- function(back_test_spacing, date_type){ - if(back_test_spacing != "auto") { + if(!is.null(back_test_spacing)) { return(back_test_spacing) } @@ -218,7 +218,7 @@ get_back_test_scenario_hist_periods<- function(full_data_tbl, hist_periods_80 <- floor(historical_periods*0.7) #used with time series CV in multivariate models - if(back_test_scenarios == "auto") { + if(is.null(back_test_scenarios)) { historical_periods_20 <- floor(historical_periods*0.2) diff --git a/R/forecast_models.R b/R/forecast_models.R index 667c6f7f..465a1b1d 100644 --- a/R/forecast_models.R +++ b/R/forecast_models.R @@ -259,7 +259,7 @@ construct_forecast_models <- function(full_data_tbl, cli::cli_h2("Running Combo: {combo_value}") # Copy functions into global environment within azure batch - if(parallel_processing == "azure_batch") { + if(sum(parallel_processing == "azure_batch") == 1) { global_env <- .GlobalEnv 
export_env <- global_env$azbatchenv$exportenv @@ -332,7 +332,7 @@ construct_forecast_models <- function(full_data_tbl, combined_models_recipe_2 <- modeltime::modeltime_table() # parallel processing - if(run_model_parallel==TRUE & parallel_processing!="local_machine") { + if(run_model_parallel==TRUE & sum(parallel_processing!="local_machine") == 1) { parallel_args <- init_parallel_within(parallel_processing, num_cores) } @@ -746,7 +746,7 @@ construct_forecast_models <- function(full_data_tbl, } #stop parallel processing - if(run_model_parallel==TRUE & parallel_processing!="local_machine"){ + if(run_model_parallel==TRUE & sum(parallel_processing == "local_machine") == 0){ exit_parallel_within(parallel_args) } diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R index f7b90476..fe5667bd 100644 --- a/R/forecast_time_series.R +++ b/R/forecast_time_series.R @@ -24,16 +24,16 @@ #' values. #' @param clean_outliers Should outliers be cleaned and inputted with values more in line with historical data? #' @param back_test_scenarios Number of specific back test folds to run when determining the best model. -#' Default of 'auto' will automatically choose the number of back tests to run based on historical data size, +#' Default of NULL will automatically choose the number of back tests to run based on historical data size, #' which tries to always use a minimum of 80% of the data when training a model. -#' @param back_test_spacing Number of periods to move back for each back test scenario. Default of 'auto' moves back 1 +#' @param back_test_spacing Number of periods to move back for each back test scenario. Default of NULL moves back 1 #' period at a time for year, quarter, and month data. Moves back 4 for week and 7 for day data. #' @param modeling_approach How Finn should approach your data. Current default and only option is 'accuracy'. In the #' future this could evolve to other areas like optimizing for interpretability over accuracy. #' @param forecast_approach How the forecast is created. The default of 'bottoms_up' trains models for each individual #' time series. 'grouped_hierarchy' creates a grouped time series to forecast at while 'standard_hierarchy' creates #' a more traditional hierarchical time series to forecast, both based on the hts package. -#' @param parallel_processing Default of 'none' runs no parallel processing and forecasts each individual time series +#' @param parallel_processing Default of NULL runs no parallel processing and forecasts each individual time series #' one after another. 'local_machine' leverages all cores on current machine Finn is running on. 'azure_batch' #' runs time series in parallel on a remote compute cluster in Azure Batch. 
#' @param run_model_parallel Run model training in parallel, only works when parallel_processing is set to @@ -95,18 +95,18 @@ forecast_time_series <- function(input_data, date_type, forecast_horizon, external_regressors = NULL, - run_name = "time_series_forecast", + run_name = "finnts_forecast", hist_start_date = NULL, hist_end_date = NULL, combo_cleanup_date = NULL, fiscal_year_start = 1, clean_missing_values = TRUE, clean_outliers = FALSE, - back_test_scenarios = "auto", - back_test_spacing = "auto", + back_test_scenarios = NULL, + back_test_spacing = NULL, modeling_approach = "accuracy", forecast_approach = "bottoms_up", - parallel_processing = 'none', + parallel_processing = NULL, run_model_parallel = TRUE, num_cores = NULL, azure_batch_credentials = NULL, @@ -299,14 +299,14 @@ forecast_time_series <- function(input_data, } # no parallel processing - if(parallel_processing == "none") { + if(is.null(parallel_processing)) { fcst <- lapply(combo_list, forecast_models_fn) fcst <- do.call(rbind, fcst) } # parallel run on local machine - if(parallel_processing=="local_machine") { + if(sum(parallel_processing=="local_machine") == 1) { fcst <- get_fcast_parallel(combo_list, forecast_models_fn, @@ -315,7 +315,7 @@ forecast_time_series <- function(input_data, } # parallel run within azure batch - if(parallel_processing=="azure_batch") { + if(sum(parallel_processing=="azure_batch") == 1) { fcst <- get_fcast_parallel_azure(combo_list, forecast_models_fn, @@ -356,14 +356,14 @@ forecast_time_series <- function(input_data, #parallel processing - if(run_model_parallel==TRUE & parallel_processing!="local_machine") { + if(run_model_parallel==TRUE & sum(parallel_processing == "local_machine") == 0) { cores <- get_cores(num_cores) cl <- parallel::makeCluster(cores) doParallel::registerDoParallel(cl) #point to the correct libraries within Azure Batch - if(parallel_processing=="azure_batch") { + if(sum(parallel_processing=="azure_batch") == 1) { clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) } @@ -385,7 +385,7 @@ forecast_time_series <- function(input_data, } #stop parallel processing - if(run_model_parallel==TRUE & parallel_processing!="local_machine") {parallel::stopCluster(cl)} + if(run_model_parallel==TRUE & sum(parallel_processing=="local_machine") == 0) {parallel::stopCluster(cl)} } else { @@ -408,14 +408,14 @@ forecast_time_series <- function(input_data, } # no parallel processing - if(parallel_processing == "none") { + if(is.null(parallel_processing)) { combinations_tbl_final <- lapply(2:min(max_model_average, length(model_list)), create_model_averages) combinations_tbl_final <- do.call(rbind, combinations_tbl_final) } # parallel run on local machine - if(parallel_processing=="local_machine") { + if(sum(parallel_processing=="local_machine") == 1) { cores <- get_cores(num_cores) @@ -431,7 +431,7 @@ forecast_time_series <- function(input_data, } # parallel run within azure batch - if(parallel_processing=="azure_batch") { + if(sum(parallel_processing=="azure_batch") == 1) { combinations_tbl_final <- foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', @@ -444,7 +444,7 @@ forecast_time_series <- function(input_data, } - if(parallel_processing == 'azure_batch' & azure_batch_cluster_delete == TRUE) { + if(sum(parallel_processing == 'azure_batch') == 1 & azure_batch_cluster_delete == TRUE) { stopCluster(cluster) } diff --git a/R/models.R b/R/models.R index be137df4..57ac8805 100644 --- a/R/models.R +++ b/R/models.R @@ -61,11 +61,11 @@ get_recipie_configurable <- 
function(train_data, switch(rm_date, "plain" = df %>% - recipes::step_rm(matches(date_rm_regex_final), Date), + recipes::step_rm(tidyselect::matches(date_rm_regex_final), Date), "with_adj" = df %>% - recipes::step_rm(matches(date_rm_regex_final), Date, Date_Adj), + recipes::step_rm(tidyselect::matches(date_rm_regex_final), Date, Date_Adj), "with_adj_index" = df %>% - recipes::step_rm(matches(date_rm_regex_final), Date, Date_Adj, Date_Adj_index.num), + recipes::step_rm(tidyselect::matches(date_rm_regex_final), Date, Date_Adj, Date_Adj_index.num), df) } @@ -272,7 +272,7 @@ get_tune_grid <- function(train_data, params <- dials::parameters(wkflw) if(isBoost){ - params %>% update(learn_rate = dials::learn_rate(range = c(0.15, 0.5), + params %>% stats::update(learn_rate = dials::learn_rate(range = c(0.15, 0.5), trans = NULL)) }else { diff --git a/R/multivariate_data_prep.R b/R/multivariate_data_prep.R index 7d073955..1e97b97d 100644 --- a/R/multivariate_data_prep.R +++ b/R/multivariate_data_prep.R @@ -58,7 +58,7 @@ multivariate_prep_recipe_1 <- function(data, external_regressors, xregs_future_v df_lag_final <- df #apply lags - for(column in colnames(df %>% dplyr::select(contains(c("Target", external_regressors))))) { + for(column in colnames(df %>% dplyr::select(tidyselect::contains(c("Target", external_regressors))))) { df_lag <- df %>% timetk::tk_augment_lags(column, .lags = lag_periods) %>% @@ -115,7 +115,7 @@ multivariate_prep_recipe_1 <- function(data, external_regressors, xregs_future_v }) %>% dplyr::bind_rows() %>% timetk::tk_augment_fourier(Date, .periods = fourier_periods, .K = 2) %>% #add fourier series - tidyr::fill(contains("_roll"), .direction = "down") + tidyr::fill(tidyselect::contains("_roll"), .direction = "down") #drop xregs that do not contain future values data_period <- data_period %>% @@ -200,7 +200,7 @@ multivariate_prep_recipe_2 <- function(data, external_regressors, xregs_future_v #apply lags df_lag_final <- df - for(column in colnames(df %>% dplyr::select(contains(c("Target", external_regressors))))) { + for(column in colnames(df %>% dplyr::select(tidyselect::contains(c("Target", external_regressors))))) { df_lag <- df %>% timetk::tk_augment_lags(column, .lags = unique(c(lag_periods_r2, lag_periods))+(period-1)) %>% @@ -250,7 +250,7 @@ multivariate_prep_recipe_2 <- function(data, external_regressors, xregs_future_v is.na(df_roll) <- sapply(df_roll, is.nan) df_roll <- df_roll %>% - tidyr::fill(contains("_roll"), .direction = "down") + tidyr::fill(tidyselect::contains("_roll"), .direction = "down") df_window_final <- cbind(df_window_final, df_roll %>% dplyr::select(c(stringr::str_c(column, "_roll", rolling_window_periods, "_Avg"), stringr::str_c(column, "_roll", rolling_window_periods, "_Sum"), stringr::str_c(column, "_roll", rolling_window_periods, "_StdDev")))) } diff --git a/R/prepare_hierarchy_data.R b/R/prepare_hierarchy_data.R index 3e1f24df..e958ad78 100644 --- a/R/prepare_hierarchy_data.R +++ b/R/prepare_hierarchy_data.R @@ -116,7 +116,7 @@ get_data_tbl_final <- function(data_tbl, df %>% dplyr::select(-Date) %>% - ts(frequency = frequency_number)%>% + stats::ts(frequency = frequency_number)%>% get_hts(hts_list) %>% hts::allts() %>% data.frame() %>% @@ -129,7 +129,7 @@ get_data_tbl_final <- function(data_tbl, } else if(ret_obj == "hts_gts") { data_ts <- df %>% dplyr::select(-Date) %>% - ts(frequency = frequency_number) + stats::ts(frequency = frequency_number) hts_gts <- data_ts %>% get_hts(hts_list) diff --git a/R/prepare_input_data.R b/R/prepare_input_data.R 
index 5f89a484..de389cbe 100644 --- a/R/prepare_input_data.R +++ b/R/prepare_input_data.R @@ -91,8 +91,8 @@ get_modelling_ready_tbl<-function(data_tbl, data_tbl %>% dplyr::select(c("Combo", - all_of(combo_variables), - all_of(external_regressors), + tidyselect::all_of(combo_variables), + tidyselect::all_of(external_regressors), "Date", "Target")) %>% dplyr::filter(Date <= hist_end_date) %>% dplyr::arrange(Combo, Date) %>% diff --git a/R/utility.R b/R/utility.R index 6ba24df4..6a57b066 100644 --- a/R/utility.R +++ b/R/utility.R @@ -1,3 +1,13 @@ +#' @importFrom magrittr %>% +#' @export +magrittr::`%>%` + +#' @importFrom methods formalArgs +NULL + +#' @importFrom stats sd +NULL + # * cbind.fill custom function ---- #create function to cbind dataframes that contain different amounts of rows #https://github.com/cvarrichio/rowr/blob/master/R/rowr.R diff --git a/R/validate_forecasting_inputs.R b/R/validate_forecasting_inputs.R index 8173bf89..393ebc9a 100644 --- a/R/validate_forecasting_inputs.R +++ b/R/validate_forecasting_inputs.R @@ -16,11 +16,11 @@ #' @param fiscal_year_start Month number of start of fiscal year #' @param clean_missing_values Cleaning missing values #' @param clean_outliers Cleaning outliers in data -#' @param back_test_scenarios Auto or 1,2,3, etc. -#' @param back_test_spacing Auto or 1,2,3, etc. +#' @param back_test_scenarios NULL or 1,2,3, etc. +#' @param back_test_spacing NULL or 1,2,3, etc. #' @param modeling_approach Currently only accuracy is supported -#' @param forecast_approach Bottoms_up, grouped_hierarchy, standard_hierarchy -#' @param parallel_processing azure_batch, local_machine, none +#' @param forecast_approach bottoms_up, grouped_hierarchy, standard_hierarchy +#' @param parallel_processing azure_batch, local_machine, NULL #' @param num_cores number of cores for parallel processing #' @param run_model_parallel run hyperparameter search and model in parallel #' @param azure_batch_credentials Azure Batch Credentials @@ -137,13 +137,13 @@ validate_forecasting_inputs<-function(input_data, } #back test scenarios formatting - if((!is.numeric(back_test_scenarios) & back_test_scenarios != "auto") | back_test_scenarios < 1) { - stop("back test scenarios input value must be either a number greater than 0 or set to 'auto'") + if((!is.numeric(back_test_scenarios) & !is.null(back_test_scenarios)) | sum(back_test_scenarios < 1) == 1) { + stop("back test scenarios input value must be either a number greater than 0 or set to NULL") } #back test spacing - if((!is.numeric(back_test_spacing) & back_test_spacing != "auto") | back_test_spacing < 1) { - stop("back test spacing input value must be either a number greater than 0 or set to 'auto'") + if((!is.numeric(back_test_spacing) & !is.null(back_test_spacing)) | sum(back_test_spacing < 1) == 1) { + stop("back test spacing input value must be either a number greater than 0 or set to NULL") } #modeling approach formatting @@ -182,13 +182,13 @@ validate_forecasting_inputs<-function(input_data, } #parallel processing within data combos and each model - if(parallel_processing == "local_machine" & run_model_parallel) { + if(sum(parallel_processing == "local_machine") == 1 & run_model_parallel) { stop("cannot run parallel process (run model parallel input) within another parallel process (parallel processing input)") } #parallel processing formatting - if(!(parallel_processing %in% c("none", "local_machine", "azure_batch"))) { - stop("parallel processing input must be one of these values: 'none', 'local_machine', 'azure_batch'") + 
if(!is.null(parallel_processing) & sum(parallel_processing %in% c("local_machine", "azure_batch")) == 0) { + stop("parallel processing input must be one of these values: NULL, 'local_machine', 'azure_batch'") } #number of cores formatting @@ -197,12 +197,12 @@ validate_forecasting_inputs<-function(input_data, } #check if azure credentials are given in case of parallel_processing = azure_batch - if(parallel_processing == "azure_batch" & is.null(azure_batch_credentials)){ + if(sum(parallel_processing == "azure_batch") == 1 & is.null(azure_batch_credentials)){ stop("cannot run parallel_processing on azure_batch without batch credentials") } #check if azure batch cluster info is given in case of parallel_processing = azure_batch - if(parallel_processing == "azure_batch" & is.null(azure_batch_cluster_config)){ + if(sum(parallel_processing == "azure_batch") == 1 & is.null(azure_batch_cluster_config)){ stop("cannot run parallel_processing on azure_batch without cluster config") } diff --git a/man/forecast_time_series.Rd b/man/forecast_time_series.Rd index 4a9a95bc..08ae1498 100644 --- a/man/forecast_time_series.Rd +++ b/man/forecast_time_series.Rd @@ -11,18 +11,18 @@ forecast_time_series( date_type, forecast_horizon, external_regressors = NULL, - run_name = "time_series_forecast", + run_name = "finnts_forecast", hist_start_date = NULL, hist_end_date = NULL, combo_cleanup_date = NULL, fiscal_year_start = 1, clean_missing_values = TRUE, clean_outliers = FALSE, - back_test_scenarios = "auto", - back_test_spacing = "auto", + back_test_scenarios = NULL, + back_test_spacing = NULL, modeling_approach = "accuracy", forecast_approach = "bottoms_up", - parallel_processing = "none", + parallel_processing = NULL, run_model_parallel = TRUE, num_cores = NULL, azure_batch_credentials = NULL, @@ -83,10 +83,10 @@ values.} \item{clean_outliers}{Should outliers be cleaned and inputted with values more in line with historical data?} \item{back_test_scenarios}{Number of specific back test folds to run when determining the best model. -Default of 'auto' will automatically choose the number of back tests to run based on historical data size, +Default of NULL will automatically choose the number of back tests to run based on historical data size, which tries to always use a minimum of 80\% of the data when training a model.} -\item{back_test_spacing}{Number of periods to move back for each back test scenario. Default of 'auto' moves back 1 +\item{back_test_spacing}{Number of periods to move back for each back test scenario. Default of NULL moves back 1 period at a time for year, quarter, and month data. Moves back 4 for week and 7 for day data.} \item{modeling_approach}{How Finn should approach your data. Current default and only option is 'accuracy'. In the @@ -96,7 +96,7 @@ future this could evolve to other areas like optimizing for interpretability ove time series. 'grouped_hierarchy' creates a grouped time series to forecast at while 'standard_hierarchy' creates a more traditional hierarchical time series to forecast, both based on the hts package.} -\item{parallel_processing}{Default of 'none' runs no parallel processing and forecasts each individual time series +\item{parallel_processing}{Default of NULL runs no parallel processing and forecasts each individual time series one after another. 'local_machine' leverages all cores on current machine Finn is running on. 
'azure_batch' runs time series in parallel on a remote compute cluster in Azure Batch.} @@ -124,7 +124,7 @@ these values based on the date_type.} based on date_type.} \item{rolling_window_periods}{List of values to use in creating rolling window features. Default of NULL automatically -chooses these values based on date_type.} +chooses these values based on date type.} \item{recipes_to_run}{List of recipes to run on multivariate models that can run different recipes. A value of NULL runs all recipes, but only runs the R1 recipe for weekly and daily date types. A value of "all" runs all recipes, regardless @@ -132,7 +132,7 @@ of date type. A list like c("R1") or c("R2") would only run models with the R1 o \item{pca}{Run principle component analysis on any lagged features to speed up model run time. Default of NULL runs PCA on day and week date types across all local multivariate models, and also for global models across all date types.} - + \item{reticulate_environment}{File path to python environment to use when training gluonts deep learning models. Only important when parallel_processing is not set to 'azure_batch'. Azure Batch should use its own docker image that has python environment already installed.} diff --git a/man/reexports.Rd b/man/reexports.Rd new file mode 100644 index 00000000..788fc1ee --- /dev/null +++ b/man/reexports.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utility.R +\docType{import} +\name{reexports} +\alias{reexports} +\alias{\%>\%} +\title{Objects exported from other packages} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} +}} + diff --git a/vignettes/feature-engineering.Rmd b/vignettes/feature-engineering.Rmd index 88a02995..188c1bd8 100644 --- a/vignettes/feature-engineering.Rmd +++ b/vignettes/feature-engineering.Rmd @@ -75,9 +75,9 @@ tibble( ) %>% dplyr::mutate(Date = as.Date(Date)) %>% timetk::tk_augment_lags(.value = "Target", .lags = c(3,6), .names = "auto") %>% - tidyr::fill(contains("lag"), .direction = "up") %>% - tk_augment_slidify(contains("lag"), .f = sum, .period = c(3, 6), .align = "right") %>% - tidyr::fill(contains("lag"), .direction = "up") %>% + tidyr::fill(tidyselect::contains("lag"), .direction = "up") %>% + tk_augment_slidify(tidyselect::contains("lag"), .f = sum, .period = c(3, 6), .align = "right") %>% + tidyr::fill(tidyselect::contains("lag"), .direction = "up") %>% dplyr::rename( Target_lag3_roll3_sum = Target_lag3_roll_3, Target_lag6_roll3_sum = Target_lag6_roll_3, From 9fb7c2ee2de6b509bf8c47d4d7523503da04a12f Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Wed, 17 Nov 2021 19:37:16 -0800 Subject: [PATCH 2/9] final push before PR --- .Rbuildignore | 5 ++ DESCRIPTION | 12 ++--- NAMESPACE | 5 +- NEWS.md | 4 ++ R/azure_batch_parallel.R | 2 +- R/forecast_models.R | 10 ++-- R/forecast_time_series.R | 57 +++++++++++++--------- R/general_parallel.R | 4 +- R/utility.R | 21 +++++++- _pkgdown.yml | 4 ++ cran-comments.md | 19 ++++++++ man/forecast_time_series.Rd | 29 +++++------ man/reexports.Rd | 16 ------ tests/testthat/test-forecast_time_series.R | 2 +- 14 files changed, 118 insertions(+), 72 deletions(-) create mode 100644 NEWS.md create mode 100644 cran-comments.md delete mode 100644 man/reexports.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 1aa19d3a..1d244201 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,8 @@ ^docs$ 
^pkgdown$ ^\.github$ +CODE_OF_CONDUCT.md +SECURITY.md +SUPPORT.md +cran-comments.md +NEWS.md \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 51a2f60c..077c6bb6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.0.0.9000 +Version: 0.1.0 Authors@R: c(person(given = "Mike", family = "Tokic", role = c("aut", "cre"), - email = "mitokic@microsoft.com", + email = "mftokic@gmail.com", comment = c(ORCID = "0000-0002-7630-7055")), person(given = "Aadharsh", family = "Kannan", @@ -34,15 +34,17 @@ Imports: doParallel, dplyr, earth, + foreach, generics, glmnet, gtools, hts, kernlab, lightgbm, + lubridate, magrittr, matrixcalc, - methods, + methods, modeltime.ensemble, modeltime.gluonts, modeltime.resample, @@ -74,8 +76,6 @@ Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 Depends: - R (>= 3.6.0), - lubridate, - foreach, + R (>= 3.6.0), modeltime VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index d2007865..46b1f09a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export("%>%") export(arima) export(arima_boost) export(croston) @@ -26,6 +25,10 @@ export(tabnet) export(tbats) export(theta) export(xgboost) +import(modeltime) +importFrom(foreach,"%do%") +importFrom(foreach,"%dopar%") +importFrom(lubridate,"%m+%") importFrom(magrittr,"%>%") importFrom(methods,formalArgs) importFrom(stats,sd) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..8b530d86 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,4 @@ + +# finnts 0.1.0 + +* Initial CRAN Release \ No newline at end of file diff --git a/R/azure_batch_parallel.R b/R/azure_batch_parallel.R index 43a55704..b6315c85 100644 --- a/R/azure_batch_parallel.R +++ b/R/azure_batch_parallel.R @@ -22,7 +22,7 @@ get_fcast_parallel_azure <- function(combo_list, cli::cli_h2("Submitting Tasks to Azure Batch") - fcst <- foreach(i = combo_list, .combine = 'rbind', + fcst <- foreach::foreach(i = combo_list, .combine = 'rbind', .packages = get_export_packages(), .export = get_transfer_functions(), .options.azure = list(maxTaskRetryCount = 0, diff --git a/R/forecast_models.R b/R/forecast_models.R index 465a1b1d..84cf35d4 100644 --- a/R/forecast_models.R +++ b/R/forecast_models.R @@ -332,7 +332,7 @@ construct_forecast_models <- function(full_data_tbl, combined_models_recipe_2 <- modeltime::modeltime_table() # parallel processing - if(run_model_parallel==TRUE & sum(parallel_processing!="local_machine") == 1) { + if(run_model_parallel == TRUE & sum(parallel_processing == "local_machine") == 0) { parallel_args <- init_parallel_within(parallel_processing, num_cores) } @@ -390,7 +390,7 @@ construct_forecast_models <- function(full_data_tbl, try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1, mdl_called, location = "top") %>% - update_model_description(1, model_name), + modeltime::update_model_description(1, model_name), silent = TRUE) }else{ @@ -422,7 +422,7 @@ construct_forecast_models <- function(full_data_tbl, try(combined_models_recipe_1 <- modeltime::add_modeltime_model(combined_models_recipe_1, mdl_called, location = "top") %>% - update_model_description(1, add_name), + modeltime::update_model_description(1, add_name), silent = TRUE) } @@ -446,7 +446,7 @@ construct_forecast_models <- function(full_data_tbl, try(combined_models_recipe_2 <- modeltime::add_modeltime_model(combined_models_recipe_2, mdl_called, location = "top") %>% - 
update_model_description(1, add_name),
+                                             modeltime::update_model_description(1, add_name),
                silent = TRUE)
       }
@@ -651,7 +651,7 @@ construct_forecast_models <- function(full_data_tbl,
         try(combined_ensemble_models <- modeltime::add_modeltime_model(combined_ensemble_models,
                                                                        mdl_ensemble,
                                                                        location = "top") %>%
-                                          update_model_description(1, add_name),
+                                          modeltime::update_model_description(1, add_name),
             silent = TRUE)
       }
diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R
index fe5667bd..7b0f6361 100644
--- a/R/forecast_time_series.R
+++ b/R/forecast_time_series.R
@@ -19,10 +19,10 @@
 #' that specified date. Default of NULL is to not remove any time series and attempt to forecast all of them.
 #' @param fiscal_year_start Month number of start of fiscal year of input data, aids in building out date features.
 #' Formatted as a numeric value. Default of 1 assumes fiscal year starts in January.
-#' @param clean_missing_values Should missing values be inputted? Only inputes values for missing data within an
+#' @param clean_missing_values If TRUE, cleans missing values. Only imputes values for missing data within an
 #' existing series, and does not add new values onto the beginning or end, but does provide a value of 0 for said
 #' values.
-#' @param clean_outliers Should outliers be cleaned and inputted with values more in line with historical data?
+#' @param clean_outliers If TRUE, outliers are cleaned and imputed with values more in line with historical data.
 #' @param back_test_scenarios Number of specific back test folds to run when determining the best model.
 #' Default of NULL will automatically choose the number of back tests to run based on historical data size,
 #' which tries to always use a minimum of 80% of the data when training a model.
@@ -36,16 +36,16 @@
 #' @param parallel_processing Default of NULL runs no parallel processing and forecasts each individual time series
 #' one after another. 'local_machine' leverages all cores on current machine Finn is running on. 'azure_batch'
 #' runs time series in parallel on a remote compute cluster in Azure Batch.
-#' @param run_model_parallel Run model training in parallel, only works when parallel_processing is set to
+#' @param run_model_parallel If TRUE, runs model training in parallel; only works when parallel_processing is set to
 #' 'local_machine' or 'azure_batch'.
 #' @param num_cores Number of cores to run when parallel processing is set up. Used when running parallel computations
 #' on local machine or within Azure. Default of NULL uses total amount of cores on machine minus one. Can't be greater
 #' than number of cores on machine minus 1.
 #' @param azure_batch_credentials Credentials to run parallel_processing in Azure Batch.
 #' @param azure_batch_cluster_config Compute cluster specification to run parallel_processing in Azure Batch.
-#' @param azure_batch_cluster_delete Delete the Azure Batch compute cluster after Finn finished running.
-#' @param target_log_transformation Log transform target variable before training models.
-#' @param negative_fcst Allow forecasts to dip below zero.
+#' @param azure_batch_cluster_delete If TRUE, deletes the Azure Batch compute cluster after Finn has finished running.
+#' @param target_log_transformation If TRUE, log transform target variable before training models.
+#' @param negative_fcst If TRUE, allow forecasts to dip below zero.
 #' @param fourier_periods List of values to use in creating fourier series as features. Default of NULL automatically chooses
 #' these values based on the date_type.
#' @param lag_periods List of values to use in creating lag features. Default of NULL automatically chooses these values
 #' based on date_type.
 #' @param rolling_window_periods List of values to use in creating rolling window features. Default of NULL automatically
 #' chooses these values based on date type.
 #' @param recipes_to_run List of recipes to run on multivariate models that can run different recipes. A value of NULL runs
 #' all recipes, but only runs the R1 recipe for weekly and daily date types. A value of "all" runs all recipes, regardless
 #' of date type. A list like c("R1") or c("R2") would only run models with the R1 or R2 recipe.
-#' @param pca Run principle component analysis on any lagged features to speed up model run time. Default of NULL runs
+#' @param pca If TRUE, run principal component analysis on any lagged features to speed up model run time. Default of NULL runs
 #' PCA on day and week date types across all local multivariate models, and also for global models across all date types.
 #' @param reticulate_environment File path to python environment to use when training gluonts deep learning models.
 #' Only important when parallel_processing is not set to 'azure_batch'. Azure Batch should use its own docker image
 #' that has python environment already installed.
 #' @param models_to_run List of models to run. Default of NULL runs all models.
 #' @param models_not_to_run List of models not to run, overrides values in models_to_run. Default of NULL doesn't turn off
 #' any model.
-#' @param run_deep_learning Run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and
+#' @param run_deep_learning If TRUE, run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and
 #' models_not_to_run.
-#' @param run_global_models Run multivariate models on the entire data set (across all time series) as a global model.
+#' @param run_global_models If TRUE, run multivariate models on the entire data set (across all time series) as a global model.
 #' Can be overridden by models_not_to_run. Default of NULL runs global models for all date types except week and day.
-#' @param run_local_models Run models by individual time series as local models.
-#' @param run_ensemble_models Run ensemble models
-#' @param average_models Create simple averages of individual models.
+#' @param run_local_models If TRUE, run models by individual time series as local models.
+#' @param run_ensemble_models If TRUE, run ensemble models. Default of NULL runs ensemble models only for quarter and month
+#' date types.
+#' @param average_models If TRUE, create simple averages of individual models.
 #' @param max_model_average Max number of models to average together. Will create model averages for 2 models up until input value
 #' or max number of models run.
-#' @param weekly_to_daily Convert a week forecast down to day by evenly splitting across each day of week. Helps when aggregating
+#' @param weekly_to_daily If TRUE, convert a week forecast down to day by evenly splitting across each day of week. Helps when aggregating
 #' up to higher temporal levels like month or quarter.
 #'
 #' @return A list of three separate data sets: the future forecast, the back test results, and the best model per time series.
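As a concrete illustration of the defaults documented above, here is a minimal sketch of a Finn call that leaves every NULL-able argument unset so it is derived from date_type. It mirrors the monthly m750 setup used in tests/testthat/test-forecast_time_series.R later in this series; the argument values are illustrative only, not a recommended configuration.

    library(finnts)
    library(magrittr)  # the %>% re-export is removed from finnts in this patch

    # monthly example data, as in the package tests
    inp_data <- modeltime::m750 %>%
      dplyr::rename(Date = date) %>%
      dplyr::mutate(id = as.character(id))

    finn_output <- forecast_time_series(
      input_data       = inp_data,
      combo_variables  = c("id"),    # columns identifying each time series
      target_variable  = "value",
      date_type        = "month",
      forecast_horizon = 3,
      models_to_run    = c("arima", "ets")
      # back_test_scenarios, back_test_spacing, parallel_processing, and
      # run_ensemble_models stay NULL, so Finn derives them from date_type
    )
    # returns a list: future forecast, back test results, best model per series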
@@ -125,7 +126,7 @@ forecast_time_series <- function(input_data, run_deep_learning = FALSE, run_global_models = NULL, run_local_models = TRUE, - run_ensemble_models = TRUE, + run_ensemble_models = NULL, average_models = TRUE, max_model_average = 3, weekly_to_daily = TRUE @@ -195,12 +196,20 @@ forecast_time_series <- function(input_data, back_test_spacing <- get_back_test_spacing(back_test_spacing, date_type) - # * Yearly Forecast Adjustment ---- - if(date_type =="year") { + # * Ensemble Models Adjustment ---- + if(is.null(run_ensemble_models) & date_type %in% c("quarter", "month")) { + run_ensemble_models <- TRUE + } else if(is.null(run_ensemble_models) & date_type %in% c("week", "day")) { + run_ensemble_models <- FALSE + } else if(sum(run_ensemble_models == TRUE) == 1 & date_type %in% c("quarter", "month", "week", "day")) { + run_ensemble_models <- TRUE + } else if(sum(run_ensemble_models == TRUE) == 1 & date_type =="year") { run_ensemble_models = FALSE warning("ensemble models have been turned off for yearly forecasts") + } else { + run_ensemble_models = FALSE } - + # 4. Prep Data ---- cli::cli_h1("Prepping Data") @@ -354,17 +363,16 @@ forecast_time_series <- function(input_data, model_combinations$All <- model_combinations %>% tidyr::unite(All, colnames(model_combinations)) model_combinations <- model_combinations$All - #parallel processing if(run_model_parallel==TRUE & sum(parallel_processing == "local_machine") == 0) { - + cores <- get_cores(num_cores) cl <- parallel::makeCluster(cores) doParallel::registerDoParallel(cl) #point to the correct libraries within Azure Batch if(sum(parallel_processing=="azure_batch") == 1) { - clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) + parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) } combinations_tbl <- foreach::foreach(i = model_combinations[[1]], .combine = 'rbind', @@ -422,7 +430,7 @@ forecast_time_series <- function(input_data, cl <- parallel::makeCluster(cores) doParallel::registerDoParallel(cl) - combinations_tbl_final <- foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', + combinations_tbl_final <- foreach::foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', .packages = get_export_packages(), .export = c("fcst_prep", "get_cores")) %dopar% {create_model_averages(i)} @@ -434,7 +442,7 @@ forecast_time_series <- function(input_data, if(sum(parallel_processing=="azure_batch") == 1) { - combinations_tbl_final <- foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', + combinations_tbl_final <- foreach::foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', .packages = get_export_packages(), .export = c("fcst_prep", "get_cores"), .options.azure = list(maxTaskRetryCount = 0, autoDeleteJob = TRUE, @@ -445,7 +453,7 @@ forecast_time_series <- function(input_data, } if(sum(parallel_processing == 'azure_batch') == 1 & azure_batch_cluster_delete == TRUE) { - stopCluster(cluster) + parallel::stopCluster(cluster) } # combine with individual model data @@ -653,7 +661,8 @@ forecast_time_series <- function(input_data, dplyr::group_by(Combo, .id, Model) %>% dplyr::mutate(Horizon = dplyr::row_number()) %>% dplyr::ungroup() %>% - dplyr::select(Combo, .id, Date, Model, Horizon, FCST, Target, MAPE) %>% + dplyr::mutate(Best_Model = ifelse(Model == "Best-Model", "Yes", "No")) %>% + dplyr::select(Combo, .id, Date, Model, Horizon, FCST, Target, MAPE, Best_Model) %>% tidyr::separate(Combo, into = combo_variables, sep = '--', 
remove = FALSE) %>% dplyr::rename(Back_Test_Scenario = .id) diff --git a/R/general_parallel.R b/R/general_parallel.R index 65174b94..ea27573a 100644 --- a/R/general_parallel.R +++ b/R/general_parallel.R @@ -28,7 +28,7 @@ init_parallel_within <-function(type, num_cores){ doParallel::registerDoParallel(cl) #point to the correct libraries within Azure Batch - if(type == "azure_batch") { + if(sum(type == "azure_batch") == 1) { parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) } @@ -72,7 +72,7 @@ get_fcast_parallel<- function(combo_list, cli::cli_alert_info("Running across {cores} cores") - fcst <- foreach(i = combo_list, + fcst <- foreach::foreach(i = combo_list, .combine = 'rbind', .packages = get_export_packages(), .export = get_transfer_functions() diff --git a/R/utility.R b/R/utility.R index 6a57b066..c61a019d 100644 --- a/R/utility.R +++ b/R/utility.R @@ -1,6 +1,14 @@ +# define global variables to prevent notes in R CMD Check +utils::globalVariables(c(".id", ".key", ".model_desc", ".pred", ".resample_id", "All", "Best_Model", "Combo", + "Combo_Test_Date", "Combo_Total", "Count", "Date", "Date_Adj", "Date_Adj_half", + "Date_Adj_index.num", "Date_Adj_quarter", "Date_Adj_year", "Date_Day", "FCST", + "Horizon", "MAPE", "Model", "Number", "Number_Char", "Origin", "Residual", + "Residual_Std_Dev", "Rolling_MAPE", "Slice", "Sum", "Target", "Type", "Variable", + "cluster", "frequency", "gluon_ts_frequency", "hi.80", "hi.95", "i", "lo.80", "lo.95", + "weighted_MAPE", "where", "as2")) + #' @importFrom magrittr %>% -#' @export -magrittr::`%>%` +NULL #' @importFrom methods formalArgs NULL @@ -8,6 +16,15 @@ NULL #' @importFrom stats sd NULL +#' @importFrom foreach %do% %dopar% +NULL + +#' @importFrom lubridate %m+% +NULL + +#' @import modeltime +NULL + # * cbind.fill custom function ---- #create function to cbind dataframes that contain different amounts of rows #https://github.com/cvarrichio/rowr/blob/master/R/rowr.R diff --git a/_pkgdown.yml b/_pkgdown.yml index 42657634..59e0a626 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -8,3 +8,7 @@ authors: href: https://aadharshkannan.com/ Mike Tokic: href: https://www.linkedin.com/in/michaeltokic/ + +template: + params: + ganalytics: G-6X0DS5856B diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 00000000..4dac61b2 --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,19 @@ +## R CMD check results +There were no ERRORs or WARNINGs. + +There was 1 NOTE: + +* checking dependencies in R code ... NOTE + Imports includes 38 non-default packages. + Importing from so many packages makes the package vulnerable to any of + them becoming unavailable. Move as many as possible to Suggests and + use conditionally. + + This package does leverage many outside packages. The main feature of this package is + that it consolidates a lot of different models into one package to run them automatically. + So having many required packages is important to the package. + + Also this is my first cran submission. 
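For background on the R CMD check NOTE discussed above: the utils::globalVariables() call and roxygen @importFrom tags added to R/utility.R earlier in this patch follow the standard recipe for silencing "no visible binding for global variable" NOTEs in packages built on dplyr-style non-standard evaluation. A condensed sketch of that recipe, with illustrative column names (this is package-source code, meant to live in a file like R/utility.R):

    # declare NSE column names as known globals so that code like
    # dplyr::mutate(df, MAPE = abs((Target - FCST) / Target)) passes
    # R CMD check without "no visible binding" NOTEs
    utils::globalVariables(c("Target", "FCST", "MAPE"))

    # import operators once here instead of via Depends, so roxygen
    # writes the importFrom() directives into NAMESPACE
    #' @importFrom magrittr %>%
    NULL

    #' @importFrom stats sd
    NULL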
+
+## Downstream dependencies
+There are currently no downstream dependencies for this package
\ No newline at end of file
diff --git a/man/forecast_time_series.Rd b/man/forecast_time_series.Rd
index 08ae1498..a31bf6eb 100644
--- a/man/forecast_time_series.Rd
+++ b/man/forecast_time_series.Rd
@@ -41,7 +41,7 @@ forecast_time_series(
   run_deep_learning = FALSE,
   run_global_models = NULL,
   run_local_models = TRUE,
-  run_ensemble_models = TRUE,
+  run_ensemble_models = NULL,
   average_models = TRUE,
   max_model_average = 3,
   weekly_to_daily = TRUE
@@ -76,11 +76,11 @@ that specified date. Default of NULL is to not remove any time series and attemp
 \item{fiscal_year_start}{Month number of start of fiscal year of input data, aids in building out date features.
 Formatted as a numeric value. Default of 1 assumes fiscal year starts in January.}

-\item{clean_missing_values}{Should missing values be inputted? Only inputes values for missing data within an
+\item{clean_missing_values}{If TRUE, cleans missing values. Only imputes values for missing data within an
 existing series, and does not add new values onto the beginning or end, but does provide a value of 0 for said
 values.}

-\item{clean_outliers}{Should outliers be cleaned and inputted with values more in line with historical data?}
+\item{clean_outliers}{If TRUE, outliers are cleaned and imputed with values more in line with historical data.}

 \item{back_test_scenarios}{Number of specific back test folds to run when determining the best model.
 Default of NULL will automatically choose the number of back tests to run based on historical data size,
 which tries to always use a minimum of 80\% of the data when training a model.}

@@ -100,7 +100,7 @@ a more traditional hierarchical time series to forecast, both based on the hts p
 one after another. 'local_machine' leverages all cores on current machine Finn is running on. 'azure_batch'
 runs time series in parallel on a remote compute cluster in Azure Batch.}

-\item{run_model_parallel}{Run model training in parallel, only works when parallel_processing is set to
+\item{run_model_parallel}{If TRUE, runs model training in parallel; only works when parallel_processing is set to
 'local_machine' or 'azure_batch'.}

 \item{num_cores}{Number of cores to run when parallel processing is set up. Used when running parallel computations
 on local machine or within Azure. Default of NULL uses total amount of cores on machine minus one. Can't be greater
 than number of cores on machine minus 1.}

 \item{azure_batch_credentials}{Credentials to run parallel_processing in Azure Batch.}

 \item{azure_batch_cluster_config}{Compute cluster specification to run parallel_processing in Azure Batch.}

-\item{azure_batch_cluster_delete}{Delete the Azure Batch compute cluster after Finn finished running.}
+\item{azure_batch_cluster_delete}{If TRUE, deletes the Azure Batch compute cluster after Finn has finished running.}

-\item{target_log_transformation}{Log transform target variable before training models.}
+\item{target_log_transformation}{If TRUE, log transform target variable before training models.}

-\item{negative_fcst}{Allow forecasts to dip below zero.}
+\item{negative_fcst}{If TRUE, allow forecasts to dip below zero.}

 \item{fourier_periods}{List of values to use in creating fourier series as features. Default of NULL automatically chooses
 these values based on the date_type.}

@@ -130,7 +130,7 @@ chooses these values based on date type.}
 all recipes, but only runs the R1 recipe for weekly and daily date types. A value of "all" runs all recipes, regardless
 of date type. A list like c("R1") or c("R2") would only run models with the R1 or R2 recipe.}

-\item{pca}{Run principle component analysis on any lagged features to speed up model run time. Default of NULL runs
+\item{pca}{If TRUE, run principal component analysis on any lagged features to speed up model run time. Default of NULL runs
 PCA on day and week date types across all local multivariate models, and also for global models across all date types.}

 \item{reticulate_environment}{File path to python environment to use when training gluonts deep learning models.
@@ -142,22 +142,23 @@ that has python environment already installed.}
 \item{models_not_to_run}{List of models not to run, overrides values in models_to_run. Default of NULL doesn't turn off
 any model.}

-\item{run_deep_learning}{Run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and
+\item{run_deep_learning}{If TRUE, run deep learning models from gluonts (deepar and nbeats). Overrides models_to_run and
 models_not_to_run.}

-\item{run_global_models}{Run multivariate models on the entire data set (across all time series) as a global model.
+\item{run_global_models}{If TRUE, run multivariate models on the entire data set (across all time series) as a global model.
 Can be overridden by models_not_to_run. Default of NULL runs global models for all date types except week and day.}

-\item{run_local_models}{Run models by individual time series as local models.}
+\item{run_local_models}{If TRUE, run models by individual time series as local models.}

-\item{run_ensemble_models}{Run ensemble models}
+\item{run_ensemble_models}{If TRUE, run ensemble models. Default of NULL runs ensemble models only for quarter and month
+date types.}

-\item{average_models}{Create simple averages of individual models.}
+\item{average_models}{If TRUE, create simple averages of individual models.}

 \item{max_model_average}{Max number of models to average together. Will create model averages for 2 models up until input value
 or max number of models run.}

-\item{weekly_to_daily}{Convert a week forecast down to day by evenly splitting across each day of week. Helps when aggregating
+\item{weekly_to_daily}{If TRUE, convert a week forecast down to day by evenly splitting across each day of week. Helps when aggregating
 up to higher temporal levels like month or quarter.}
 }
 \value{
diff --git a/man/reexports.Rd b/man/reexports.Rd
deleted file mode 100644
index 788fc1ee..00000000
--- a/man/reexports.Rd
+++ /dev/null
@@ -1,16 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utility.R
-\docType{import}
-\name{reexports}
-\alias{reexports}
-\alias{\%>\%}
-\title{Objects exported from other packages}
-\keyword{internal}
-\description{
-These objects are imported from other packages. Follow the links
-below to see their documentation.
- -\describe{ - \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} -}} - diff --git a/tests/testthat/test-forecast_time_series.R b/tests/testthat/test-forecast_time_series.R index d6e1b94e..272ce970 100644 --- a/tests/testthat/test-forecast_time_series.R +++ b/tests/testthat/test-forecast_time_series.R @@ -14,7 +14,7 @@ forecast_horizon <- 3 target_variable <- "value" combo_variables <- c("id") models_to_run <- c("arima", "ets") -inp_data <- m750 %>% dplyr::rename(Date = date) %>% dplyr::mutate(id = as.character(id)) +inp_data <- modeltime::m750 %>% dplyr::rename(Date = date) %>% dplyr::mutate(id = as.character(id)) dt_type <- "month" finn_forecast <- forecast_time_series( From e9e30965d4ea316d0f1faa9ff71201c985f8e082 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Thu, 18 Nov 2021 09:46:07 -0800 Subject: [PATCH 3/9] pad_by_time workaround --- R/prepare_hierarchy_data.R | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/R/prepare_hierarchy_data.R b/R/prepare_hierarchy_data.R index e958ad78..38d8d271 100644 --- a/R/prepare_hierarchy_data.R +++ b/R/prepare_hierarchy_data.R @@ -313,16 +313,20 @@ get_full_data_tbl <- function(data_tbl, Target, external_regressors) %>% dplyr::group_by(Combo) %>% - timetk::pad_by_time(Date, - .by = date_type, - .pad_value = pad_value, - .end_date = hist_end_date) %>% #fill in missing values in between existing data points - timetk::pad_by_time(Date, - .by = date_type, - .pad_value = 0, - .start_date = hist_start_date, - .end_date = hist_end_date) %>% #fill in missing values at beginning of time series with zero - dplyr::ungroup()%>% + dplyr::group_split() %>% + purrr::map(.f = function(df) { # latest update to timetk as of 11.18.21 doesn't allow for non NA pad value within dplyr groups. Filed bug and will update once fixed + df %>% + timetk::pad_by_time(Date, + .by = date_type, + .pad_value = pad_value, + .end_date = hist_end_date) %>% #fill in missing values in between existing data points + timetk::pad_by_time(Date, + .by = date_type, + .pad_value = 0, + .start_date = hist_start_date, + .end_date = hist_end_date) #fill in missing values at beginning of time series with zero + }) %>% + dplyr::bind_rows() %>% get_log_transformation(target_log_transformation) %>% dplyr::group_by(Combo) %>% timetk::future_frame(Date, From f064f304e9df3f33409bc4cc78aa04871427541f Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Fri, 19 Nov 2021 09:34:44 -0800 Subject: [PATCH 4/9] update fix to pad_by_time to remove bugs --- R/forecast_models.R | 2 +- R/prepare_hierarchy_data.R | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/R/forecast_models.R b/R/forecast_models.R index 84cf35d4..70cc6d7b 100644 --- a/R/forecast_models.R +++ b/R/forecast_models.R @@ -274,7 +274,7 @@ construct_forecast_models <- function(full_data_tbl, run_data_full_tbl <- full_data_tbl %>% combo_specific_filter(combo_value, combo_variables) - + cli::cli_h3("Initial Feature Engineering") # recipe 1: standard feature engineering diff --git a/R/prepare_hierarchy_data.R b/R/prepare_hierarchy_data.R index 38d8d271..9191e951 100644 --- a/R/prepare_hierarchy_data.R +++ b/R/prepare_hierarchy_data.R @@ -315,7 +315,11 @@ get_full_data_tbl <- function(data_tbl, dplyr::group_by(Combo) %>% dplyr::group_split() %>% purrr::map(.f = function(df) { # latest update to timetk as of 11.18.21 doesn't allow for non NA pad value within dplyr groups. 
Filed bug and will update once fixed - df %>% + + combo <- unique(df$Combo) + + pad_data <- df %>% + dplyr::select(-Combo) %>% timetk::pad_by_time(Date, .by = date_type, .pad_value = pad_value, @@ -324,7 +328,11 @@ get_full_data_tbl <- function(data_tbl, .by = date_type, .pad_value = 0, .start_date = hist_start_date, - .end_date = hist_end_date) #fill in missing values at beginning of time series with zero + .end_date = hist_end_date) %>% #fill in missing values at beginning of time series with zero + dplyr::mutate(Combo = combo) %>% + dplyr::select(Combo, Date, Target, external_regressors) + + return(pad_data) }) %>% dplyr::bind_rows() %>% get_log_transformation(target_log_transformation) %>% From 2f9a119ac367a0e4b94f2f5b908fb962c2973207 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Fri, 19 Nov 2021 10:01:17 -0800 Subject: [PATCH 5/9] update back test mape to remove any inf or NA MAPE values. --- R/forecast_time_series.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R index 7b0f6361..15bd6a1d 100644 --- a/R/forecast_time_series.R +++ b/R/forecast_time_series.R @@ -657,6 +657,7 @@ forecast_time_series <- function(input_data, dplyr::select(Combo, Date, Target)) %>% dplyr::mutate(FCST = ifelse(is.na(FCST) | is.nan(FCST), 0, FCST), Target = ifelse(is.na(Target) | is.nan(Target), 0, Target)) %>% + dplyr::mutate(Target = ifelse(Target == 0, 0.1, Target)) %>% dplyr::mutate(MAPE = abs((Target-FCST)/Target)) %>% dplyr::group_by(Combo, .id, Model) %>% dplyr::mutate(Horizon = dplyr::row_number()) %>% @@ -705,6 +706,7 @@ forecast_time_series <- function(input_data, dplyr::mutate(FCST = ifelse(is.na(FCST) | is.nan(FCST), 0, FCST)) %>% dplyr::left_join(accuracy_final) %>% dplyr::mutate(Best_Model = ifelse(is.na(Best_Model), "No", "Yes"), + Target = ifelse(Target == 0, 0.1, Target), MAPE = abs((Target-FCST)/Target)) %>% dplyr::select(Combo, .id, Date, Model, Horizon, FCST, Target, MAPE, Best_Model) %>% tidyr::separate(Combo, into = combo_variables, sep = '--', remove = FALSE) %>% From a28ae121f3b68c384e4e8fce733bcdf51c3ada42 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 22 Nov 2021 08:46:03 -0800 Subject: [PATCH 6/9] updated negative_fcst argument to negative_forecast to match similar argument names in forecast_time_series function --- R/forecast_time_series.R | 10 +++++----- R/prepare_output_data.R | 6 +++--- man/forecast_time_series.Rd | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R index 15bd6a1d..7f8447fd 100644 --- a/R/forecast_time_series.R +++ b/R/forecast_time_series.R @@ -45,7 +45,7 @@ #' @param azure_batch_cluster_config Compute cluster specification to run parallel_processing in Azure Batch. #' @param azure_batch_cluster_delete If TRUE, deletes the Azure Batch compute cluster after Finn finished running. #' @param target_log_transformation If TRUE, log transform target variable before training models. -#' @param negative_fcst If TRUE, allow forecasts to dip below zero. +#' @param negative_forecast If TRUE, allow forecasts to dip below zero. #' @param fourier_periods List of values to use in creating fourier series as features. Default of NULL automatically chooses #' these values based on the date_type. #' @param lag_periods List of values to use in creating lag features. 
Default of NULL automatically chooses these values @@ -114,7 +114,7 @@ forecast_time_series <- function(input_data, azure_batch_cluster_config = NULL, azure_batch_cluster_delete = FALSE, target_log_transformation = FALSE, - negative_fcst = FALSE, + negative_forecast = FALSE, fourier_periods = NULL, lag_periods = NULL, rolling_window_periods = NULL, @@ -336,7 +336,7 @@ forecast_time_series <- function(input_data, # Adjust for NaNs and Negative Forecasts fcst <- fcst %>% - get_forecast_negative_adjusted(negative_fcst) + get_forecast_negative_adjusted(negative_forecast) # * Create Average Ensembles ---- @@ -628,11 +628,11 @@ forecast_time_series <- function(input_data, if(forecast_approach == "standard_hierarchy") { ts_combined <- data.frame(hts::combinef(ts, nodes = hts::get_nodes(hts_gts_list$hts_gts), weights = (1/colMeans(temp_residuals^2, na.rm = TRUE)), - keep ="bottom", nonnegative = !negative_fcst)) + keep ="bottom", nonnegative = !negative_forecast)) colnames(ts_combined) <- colnames(hts_gts_list$data_ts) } else if(forecast_approach == "grouped_hierarchy") { ts_combined <- data.frame(hts::combinef(ts, groups = hts::get_groups(hts_gts_list$hts_gts), weights = (1/colMeans(temp_residuals^2, na.rm = TRUE)), - keep ="bottom", nonnegative = !negative_fcst)) + keep ="bottom", nonnegative = !negative_forecast)) colnames(ts_combined) <- colnames(hts_gts_list$data_ts) } diff --git a/R/prepare_output_data.R b/R/prepare_output_data.R index 36d0cab4..60c3a295 100644 --- a/R/prepare_output_data.R +++ b/R/prepare_output_data.R @@ -3,10 +3,10 @@ #' Function to rectify forecast non-negativity #' #' @param fcst Input Data Frame -#' @param negative_fcst is negative forecast allowed +#' @param negative_forecast is negative forecast allowed #' @noRd get_forecast_negative_adjusted <- function(fcst, - negative_fcst){ + negative_forecast){ #TODO: Should re-write this as dplyr @@ -16,7 +16,7 @@ get_forecast_negative_adjusted <- function(fcst, fcst[is.na(fcst)] = 0 # convert negative forecasts to zero - if(negative_fcst == FALSE) {fcst$FCST <- replace(fcst$FCST, + if(negative_forecast == FALSE) {fcst$FCST <- replace(fcst$FCST, which(fcst$FCST < 0), 0)} return (fcst) diff --git a/man/forecast_time_series.Rd b/man/forecast_time_series.Rd index a31bf6eb..4b3db184 100644 --- a/man/forecast_time_series.Rd +++ b/man/forecast_time_series.Rd @@ -29,7 +29,7 @@ forecast_time_series( azure_batch_cluster_config = NULL, azure_batch_cluster_delete = FALSE, target_log_transformation = FALSE, - negative_fcst = FALSE, + negative_forecast = FALSE, fourier_periods = NULL, lag_periods = NULL, rolling_window_periods = NULL, @@ -115,7 +115,7 @@ than number of cores on machine minus 1.} \item{target_log_transformation}{If TRUE, log transform target variable before training models.} -\item{negative_fcst}{If TRUE, allow forecasts to dip below zero.} +\item{negative_forecast}{If TRUE, allow forecasts to dip below zero.} \item{fourier_periods}{List of values to use in creating fourier series as features. Default of NULL automatically chooses these values based on the date_type.} From 54b46a5d531e8b126482d8444def5f504f6efde9 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 22 Nov 2021 17:02:30 -0800 Subject: [PATCH 7/9] updated logic to better deal with function arguments that can either have NULL, boolean, or character values. 
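Distilled, the pattern this commit adopts is: test is.null() before any equality check, because comparing NULL with == returns logical(0), which if() rejects with "argument is of length zero"; and where a possibly-NULL value must still feed an equality check, wrap it in sum(), which collapses logical(0) to 0. A minimal sketch of both idioms, with illustrative function names not taken from the package (%in% stands in for == as the vector-safe variant):

    # idiom 1: NULL branch first, then value checks
    resolve_recipes_override <- function(recipes_to_run = NULL) {
      if (is.null(recipes_to_run)) {
        FALSE                          # NULL default: no override
      } else if ("all" %in% recipes_to_run) {
        TRUE                           # run every recipe
      } else {
        FALSE
      }
    }

    # idiom 2: sum() makes the comparison NULL-safe in a single expression,
    # since sum(logical(0)) is 0
    is_azure <- function(parallel_processing = NULL) {
      sum(parallel_processing == "azure_batch") == 1
    }

    is_azure(NULL)           # FALSE
    is_azure("azure_batch")  # TRUE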
--- R/forecast_models.R | 46 +++++++--- R/forecast_time_series.R | 141 ++++++++++++++++-------------- R/general_parallel.R | 7 +- R/validate_forecasting_inputs.R | 34 +++---- vignettes/parallel-processing.Rmd | 5 +- 5 files changed, 132 insertions(+), 101 deletions(-) diff --git a/R/forecast_models.R b/R/forecast_models.R index 70cc6d7b..0d247001 100644 --- a/R/forecast_models.R +++ b/R/forecast_models.R @@ -259,12 +259,16 @@ construct_forecast_models <- function(full_data_tbl, cli::cli_h2("Running Combo: {combo_value}") # Copy functions into global environment within azure batch - if(sum(parallel_processing == "azure_batch") == 1) { - global_env <- .GlobalEnv - export_env <- global_env$azbatchenv$exportenv - - for(n in ls(export_env , all.names=TRUE)) { - assign(n, get(n, export_env), global_env) + if(!is.null(parallel_processing)) { + if(parallel_processing == "azure_batch") { + + global_env <- .GlobalEnv + export_env <- global_env$azbatchenv$exportenv + + for(n in ls(export_env , all.names=TRUE)) { + assign(n, get(n, export_env), global_env) + } + } } @@ -277,10 +281,19 @@ construct_forecast_models <- function(full_data_tbl, cli::cli_h3("Initial Feature Engineering") + # Run all recipes + if(is.null(recipes_to_run)) { + run_all_recipes_override <- FALSE + } else if(recipes_to_run == "all") { + run_all_recipes_override <- TRUE + } else { + run_all_recipes_override <- FALSE + } + # recipe 1: standard feature engineering run_data_full_recipe_1 <- NULL - if(is.null(recipes_to_run) | "R1" %in% recipes_to_run | sum(recipes_to_run == "all") == 1) { + if(is.null(recipes_to_run) | "R1" %in% recipes_to_run | run_all_recipes_override) { run_data_full_recipe_1 <- run_data_full_tbl %>% multivariate_prep_recipe_1(external_regressors = external_regressors, @@ -301,7 +314,7 @@ construct_forecast_models <- function(full_data_tbl, # recipe 2: custom horizon specific feature engineering run_data_full_recipe_2 <- NULL - if((is.null(recipes_to_run) & date_type %in% c("month", "quarter", "year")) | "R2" %in% recipes_to_run | sum(recipes_to_run == "all") == 1) { + if((is.null(recipes_to_run) & date_type %in% c("month", "quarter", "year")) | "R2" %in% recipes_to_run | run_all_recipes_override) { run_data_full_recipe_2 <- run_data_full_tbl %>% multivariate_prep_recipe_2(external_regressors = external_regressors, @@ -332,7 +345,7 @@ construct_forecast_models <- function(full_data_tbl, combined_models_recipe_2 <- modeltime::modeltime_table() # parallel processing - if(run_model_parallel == TRUE & sum(parallel_processing == "local_machine") == 0) { + if(run_model_parallel == TRUE) { parallel_args <- init_parallel_within(parallel_processing, num_cores) } @@ -353,12 +366,17 @@ construct_forecast_models <- function(full_data_tbl, models_to_go_over <- names(model_list) # PCA - if(sum(pca == TRUE) == 1 | (combo_value == "All-Data" & is.null(pca)) | (is.null(pca) & date_type %in% c("day", "week"))) { + if((combo_value == "All-Data" & is.null(pca)) | (is.null(pca) & date_type %in% c("day", "week"))) { + run_pca <- TRUE + } else if(is.null(pca)) { + run_pca <- FALSE + } else if(pca == TRUE) { run_pca <- TRUE } else { run_pca <- FALSE } - + + # train each model for(model_name in models_to_go_over){ model_fn <- as.character(model_list[model_name]) @@ -398,7 +416,7 @@ construct_forecast_models <- function(full_data_tbl, freq_val <- frequency - if(((model_name %in% r1_models) | (model_name %in% r2_models)) & (is.null(recipes_to_run) | sum(recipes_to_run == "all") == 1 | "R1" %in% recipes_to_run)){ + if(((model_name %in% 
r1_models) | (model_name %in% r2_models)) & (is.null(recipes_to_run) | run_all_recipes_override | "R1" %in% recipes_to_run)){ add_name <- paste0(model_name,"-R1",model_name_suffix) if(model_name %in% deep_nn_models){ @@ -427,7 +445,7 @@ construct_forecast_models <- function(full_data_tbl, } - if(model_name %in% r2_models & ("R2" %in% recipes_to_run | sum(recipes_to_run == "all") == 1 | (is.null(recipes_to_run) & date_type %in% c("month", "quarter", "year")))){ + if(model_name %in% r2_models & ("R2" %in% recipes_to_run | run_all_recipes_override | (is.null(recipes_to_run) & date_type %in% c("month", "quarter", "year")))){ add_name <- paste0(model_name,"-R2",model_name_suffix) try(mdl_called <- invoke_forecast_function(fn_to_invoke = model_fn, @@ -746,7 +764,7 @@ construct_forecast_models <- function(full_data_tbl, } #stop parallel processing - if(run_model_parallel==TRUE & sum(parallel_processing == "local_machine") == 0){ + if(run_model_parallel==TRUE){ exit_parallel_within(parallel_args) } diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R index 7f8447fd..2a5f7ac5 100644 --- a/R/forecast_time_series.R +++ b/R/forecast_time_series.R @@ -196,20 +196,31 @@ forecast_time_series <- function(input_data, back_test_spacing <- get_back_test_spacing(back_test_spacing, date_type) - # * Ensemble Models Adjustment ---- + # * NULL Argument Adjustment ---- + + # run ensemble models if(is.null(run_ensemble_models) & date_type %in% c("quarter", "month")) { run_ensemble_models <- TRUE - } else if(is.null(run_ensemble_models) & date_type %in% c("week", "day")) { + } else if(is.null(run_ensemble_models) & date_type %in% c("year", "week", "day")) { run_ensemble_models <- FALSE - } else if(sum(run_ensemble_models == TRUE) == 1 & date_type %in% c("quarter", "month", "week", "day")) { + } else if(run_ensemble_models == TRUE & date_type %in% c("quarter", "month", "week", "day")) { run_ensemble_models <- TRUE - } else if(sum(run_ensemble_models == TRUE) == 1 & date_type =="year") { - run_ensemble_models = FALSE + } else if(run_ensemble_models == TRUE & date_type == "year") { + run_ensemble_models <- FALSE warning("ensemble models have been turned off for yearly forecasts") } else { - run_ensemble_models = FALSE + run_ensemble_models <- FALSE } - + + # run global models + if(is.null(run_global_models) & date_type %in% c("month", "quarter", "year")) { + run_global_models <- TRUE + } else if(is.null(run_global_models) & date_type %in% c("day", "week")) { + run_global_models <- FALSE + } else { + # keep existing value of run_global_models + } + # 4. 
Prep Data ---- cli::cli_h1("Prepping Data") @@ -294,11 +305,11 @@ forecast_time_series <- function(input_data, pca) # * Run Forecast ---- - if(forecast_approach == "bottoms_up" & length(unique(full_data_tbl$Combo)) > 1 & (sum(run_global_models == TRUE) == 1 | (is.null(run_global_models) & date_type %in% c("month", "quarter", "year"))) & run_local_models) { + if(forecast_approach == "bottoms_up" & length(unique(full_data_tbl$Combo)) > 1 & run_global_models & run_local_models) { combo_list <- c('All-Data', unique(full_data_tbl$Combo)) - } else if(forecast_approach == "bottoms_up" & length(unique(full_data_tbl$Combo)) > 1 & (sum(run_global_models == TRUE) == 1 | (is.null(run_global_models) & date_type %in% c("month", "quarter", "year"))) & run_local_models == FALSE) { + } else if(forecast_approach == "bottoms_up" & length(unique(full_data_tbl$Combo)) > 1 & run_global_models & run_local_models == FALSE) { combo_list <- c('All-Data') @@ -307,31 +318,29 @@ forecast_time_series <- function(input_data, combo_list <- unique(full_data_tbl$Combo) } - # no parallel processing - if(is.null(parallel_processing)) { + # call run function + if(is.null(parallel_processing)) { # no parallel processing fcst <- lapply(combo_list, forecast_models_fn) fcst <- do.call(rbind, fcst) - } - - # parallel run on local machine - if(sum(parallel_processing=="local_machine") == 1) { - fcst <- get_fcast_parallel(combo_list, - forecast_models_fn, - num_cores) + } else if(parallel_processing=="local_machine") { # parallel run on local machine + + fcst <- get_fcast_parallel(combo_list, + forecast_models_fn, + num_cores) + + } else if(parallel_processing=="azure_batch") { # parallel run within azure batch - } - - # parallel run within azure batch - if(sum(parallel_processing=="azure_batch") == 1) { - fcst <- get_fcast_parallel_azure(combo_list, forecast_models_fn, azure_batch_credentials, azure_batch_cluster_config, run_name) + } else { + stop("error during forecast run function call") + } # Adjust for NaNs and Negative Forecasts @@ -364,66 +373,66 @@ forecast_time_series <- function(input_data, model_combinations <- model_combinations$All #parallel processing - if(run_model_parallel==TRUE & sum(parallel_processing == "local_machine") == 0) { + if(run_model_parallel==TRUE) { cores <- get_cores(num_cores) cl <- parallel::makeCluster(cores) doParallel::registerDoParallel(cl) #point to the correct libraries within Azure Batch - if(sum(parallel_processing=="azure_batch") == 1) { - parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) + if(!is.null(parallel_processing)) { + if(parallel_processing == "azure_batch") { + parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) + } } - - combinations_tbl <- foreach::foreach(i = model_combinations[[1]], .combine = 'rbind', - .packages = c('rlist', 'tidyverse', 'lubridate', - "doParallel", "parallel", "gtools"), - .export = c("fcst_prep", "get_cores")) %dopar% { - + + combinations_tbl <- foreach::foreach(i = model_combinations[[1]], .combine = 'rbind', + .packages = c('rlist', 'tidyverse', 'lubridate', + "doParallel", "parallel", "gtools"), + .export = c("fcst_prep")) %dopar% { + fcst_combination_temp <- fcst_prep %>% dplyr::filter(Model %in% strsplit(i, split = "_")[[1]]) %>% dplyr::group_by(.id, Combo, Date, Horizon) %>% - dplyr::summarise(FCST = mean(FCST, na.rm=TRUE), + dplyr::summarise(FCST = mean(FCST, na.rm=TRUE), Target = mean(Target, nam.rm=FALSE)) %>% dplyr::ungroup() %>% dplyr::mutate(Model = i) - + return(fcst_combination_temp) - + } - 
+ #stop parallel processing - if(run_model_parallel==TRUE & sum(parallel_processing=="local_machine") == 0) {parallel::stopCluster(cl)} - + parallel::stopCluster(cl) + } else { - + combinations_tbl <- foreach::foreach(i = model_combinations[[1]], .combine = 'rbind') %do% { - + fcst_combination_temp <- fcst_prep %>% dplyr::filter(Model %in% strsplit(i, split = "_")[[1]]) %>% dplyr::group_by(.id, Combo, Date, Horizon) %>% - dplyr::summarise(FCST = mean(FCST, na.rm=TRUE), + dplyr::summarise(FCST = mean(FCST, na.rm=TRUE), Target = mean(Target, nam.rm=FALSE)) %>% dplyr::ungroup() %>% dplyr::mutate(Model = i) - + return(fcst_combination_temp) - + } } return(combinations_tbl) } - # no parallel processing - if(is.null(parallel_processing)) { + # kick off model average run + if(is.null(parallel_processing)) { # no parallel processing combinations_tbl_final <- lapply(2:min(max_model_average, length(model_list)), create_model_averages) combinations_tbl_final <- do.call(rbind, combinations_tbl_final) - } - - # parallel run on local machine - if(sum(parallel_processing=="local_machine") == 1) { + + } else if(parallel_processing == "local_machine") { # run on local machine cores <- get_cores(num_cores) @@ -431,35 +440,33 @@ forecast_time_series <- function(input_data, doParallel::registerDoParallel(cl) combinations_tbl_final <- foreach::foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', - .packages = get_export_packages(), - .export = c("fcst_prep", "get_cores")) %dopar% {create_model_averages(i)} + .packages = get_export_packages(), + .export = c("fcst_prep", "get_cores")) %dopar% {create_model_averages(i)} parallel::stopCluster(cl) - } - - # parallel run within azure batch - if(sum(parallel_processing=="azure_batch") == 1) { - + } else if(parallel_processing == "azure_batch") { # run on azure batch combinations_tbl_final <- foreach::foreach(i = 2:min(max_model_average, length(model_list)), .combine = 'rbind', - .packages = get_export_packages(), - .export = c("fcst_prep", "get_cores"), - .options.azure = list(maxTaskRetryCount = 0, autoDeleteJob = TRUE, - job = substr(paste0('finn-model-avg-combo-', strftime(Sys.time(), format="%H%M%S"), '-', - tolower(gsub(" ", "-", trimws(gsub("\\s+", " ", gsub("[[:punct:]]", '', run_name)))))), 1, 63)), - .errorhandling = "remove") %dopar% {create_model_averages(i)} - - } - - if(sum(parallel_processing == 'azure_batch') == 1 & azure_batch_cluster_delete == TRUE) { - parallel::stopCluster(cluster) + .packages = get_export_packages(), + .export = c("fcst_prep", "get_cores"), + .options.azure = list(maxTaskRetryCount = 0, autoDeleteJob = TRUE, + job = substr(paste0('finn-model-avg-combo-', strftime(Sys.time(), format="%H%M%S"), '-', + tolower(gsub(" ", "-", trimws(gsub("\\s+", " ", gsub("[[:punct:]]", '', run_name)))))), 1, 63)), + .errorhandling = "remove") %dopar% {create_model_averages(i)} } # combine with individual model data fcst_combination <- rbind(fcst_combination, combinations_tbl_final) } + # delete azure batch cluster + if(!is.null(parallel_processing)) { + if(parallel_processing == "azure_batch" & azure_batch_cluster_delete == TRUE) { + parallel::stopCluster(cluster) + } + } + # 6. 
Final Finn Outputs ---- cli::cli_h1("Final Finn Outputs") diff --git a/R/general_parallel.R b/R/general_parallel.R index ea27573a..df5f4e28 100644 --- a/R/general_parallel.R +++ b/R/general_parallel.R @@ -1,6 +1,7 @@ #' Get number of cores to use when registering parallel back end #' #' @param num_cores number of cores for parallel processing +#' #' @noRd get_cores <-function(num_cores){ @@ -28,8 +29,10 @@ init_parallel_within <-function(type, num_cores){ doParallel::registerDoParallel(cl) #point to the correct libraries within Azure Batch - if(sum(type == "azure_batch") == 1) { - parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) + if(!is.null(type)) { + if(type == "azure_batch") { + parallel::clusterEvalQ(cl, .libPaths("/mnt/batch/tasks/shared/R/packages")) + } } cli::cli_alert_info("Running across {cores} cores") diff --git a/R/validate_forecasting_inputs.R b/R/validate_forecasting_inputs.R index 393ebc9a..d86e9f3c 100644 --- a/R/validate_forecasting_inputs.R +++ b/R/validate_forecasting_inputs.R @@ -181,14 +181,27 @@ validate_forecasting_inputs<-function(input_data, } - #parallel processing within data combos and each model - if(sum(parallel_processing == "local_machine") == 1 & run_model_parallel) { - stop("cannot run parallel process (run model parallel input) within another parallel process (parallel processing input)") - } + # parallel processing formatting + if(is.null(parallel_processing)) { + + # no further checks needed - #parallel processing formatting - if(!is.null(parallel_processing) & sum(parallel_processing %in% c("local_machine", "azure_batch")) == 0) { + } else if(parallel_processing %in% c("local_machine", "azure_batch") == FALSE) { stop("parallel processing input must be one of these values: NULL, 'local_machine', 'azure_batch'") + } else if(parallel_processing == "local_machine" & run_model_parallel) { + + stop("cannot run parallel process (run model parallel input) within another parallel process (parallel processing input)") + + } else if(parallel_processing == "azure_batch" & (is.null(azure_batch_credentials) | is.null(azure_batch_cluster_config))) { + + stop("cannot run parallel_processing on azure_batch without batch credentials and cluster configuration") + + } else { + + # no further checks needed + } #number of cores formatting @@ -196,15 +209,6 @@ validate_forecasting_inputs<-function(input_data, stop("num_cores should be NULL or a numeric value") } - #check if azure credentials are given in case of parallel_processing = azure_batch - if(sum(parallel_processing == "azure_batch") == 1 & is.null(azure_batch_credentials)){ - stop("cannot run parallel_processing on azure_batch without batch credentials") - } - - #check if azure batch cluster info is given in case of parallel_processing = azure_batch - if(sum(parallel_processing == "azure_batch") == 1 & is.null(azure_batch_cluster_config)){ - stop("cannot run parallel_processing on azure_batch without cluster config") - } #max model average formatting if(!is.numeric(max_model_average)) { diff --git a/vignettes/parallel-processing.Rmd b/vignettes/parallel-processing.Rmd index fc7d458a..e822eef9 100644 --- a/vignettes/parallel-processing.Rmd +++ b/vignettes/parallel-processing.Rmd @@ -18,13 +18,12 @@ knitr::opts_chunk$set( When the "parallel_processing" input within "forecast_time_series" is set to "local_machine", each time series (including training models on the entire data set) is run in parallel on the user's local machine. 
Each time series will run on a separate core of the machine. Hyperparameter tuning, model refitting, and model averaging will be run sequentially, which cannot be done in parallel since a parallel process is already running on the machine for each time series. This works well for data that contains many time series where you might only want to run a few simpler models, and in scenarios where cloud computing is not available. -If "parallel_processing" is set to "none" and "run_model_parallel" is set to "TRUE" within the "forecast_time_series" function, then each time series is ran sequentially but the hyperparameter tuning, model refitting, and model averaging is ran in parallel. This works great for data that has a limited number of time series where you want to run a lot of back testing and build dozens of models within Finn. +If "parallel_processing" is set to NULL and "run_model_parallel" is set to TRUE within the "forecast_time_series" function, then each time series is run sequentially but the hyperparameter tuning, model refitting, and model averaging are run in parallel. This works great for data that has a limited number of time series where you want to run a lot of back testing and build dozens of models within Finn. -Lastly, if "parallel_processing" is set to "none" and "run_model_parallel" is set to "FALSE", then everything will run sequentially. This is definitely the slowest way to build forecasts with Finn, but could be the best choice if no cloud options are available and your local machine may not have many cores or large RAM. ## Within Azure using Azure Batch -To leverage the full power of Finn, running within Azure is the best choice in building production ready forecasts that can easily scale. The most efficient way to run Finn is to set "parallel_processing" to "azure_batch" and set "run_model_parallel" to "TRUE" within the "forecast_time_series" function. This will run each time series in separate virtual machines (VM) in Azure. Within each VM, hyperparameter tuning, modeling refitting, and model averaging are all done in parallel across the cores available on the machine. +To leverage the full power of Finn, running within Azure is the best choice in building production ready forecasts that can easily scale. The most efficient way to run Finn is to set "parallel_processing" to "azure_batch" and set "run_model_parallel" to TRUE within the "forecast_time_series" function. This will run each time series in a separate virtual machine (VM) in Azure. Within each VM, hyperparameter tuning, model refitting, and model averaging are all done in parallel across the cores available on the machine. [Azure Batch](https://azure.microsoft.com/en-us/services/batch/#overview) is a powerful resource from Microsoft Azure that allows for easily scalable parallel compute. Finn leverages the [doAzureParallel](https://github.com/Azure/doAzureParallel) and rAzureBatch packages built by Microsoft to connect to Azure Batch. Refer to their [github site](https://github.com/Azure/doAzureParallel) for more information about how it works under the hood and how to set up your own Azure Batch resource to use with Finn. 
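The three supported configurations described in the vignette above line up with the validation rules this patch adds to validate_forecasting_inputs.R. A self-contained sketch of those rules follows; the helper name check_parallel_args is hypothetical and only mirrors the checks in this patch, it is not part of the package API.

    # hypothetical standalone helper mirroring the validation added in this patch
    check_parallel_args <- function(parallel_processing = NULL,
                                    run_model_parallel = FALSE,
                                    azure_batch_credentials = NULL,
                                    azure_batch_cluster_config = NULL) {

      if(is.null(parallel_processing)) {
        return(invisible(TRUE))  # sequential across series; run_model_parallel may still be TRUE
      }

      if(!parallel_processing %in% c("local_machine", "azure_batch")) {
        stop("parallel processing input must be one of these values: NULL, 'local_machine', 'azure_batch'")
      }

      if(parallel_processing == "local_machine" & run_model_parallel) {
        stop("cannot run parallel process (run model parallel input) within another parallel process (parallel processing input)")
      }

      if(parallel_processing == "azure_batch" & (is.null(azure_batch_credentials) | is.null(azure_batch_cluster_config))) {
        stop("cannot run parallel_processing on azure_batch without batch credentials and cluster configuration")
      }

      invisible(TRUE)
    }

    check_parallel_args("local_machine")                  # OK: parallel across time series
    check_parallel_args(NULL, run_model_parallel = TRUE)  # OK: parallel within each time series
    # check_parallel_args("local_machine", run_model_parallel = TRUE)  # errors: nested parallelism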
From 4c98476361513bd7338fe085f9e6ff72e174bb3a Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 22 Nov 2021 17:13:53 -0800 Subject: [PATCH 8/9] last logic fix --- R/validate_forecasting_inputs.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/validate_forecasting_inputs.R b/R/validate_forecasting_inputs.R index d86e9f3c..2713d1fe 100644 --- a/R/validate_forecasting_inputs.R +++ b/R/validate_forecasting_inputs.R @@ -137,7 +137,9 @@ validate_forecasting_inputs<-function(input_data, } #back test scenarios formatting - if((!is.numeric(back_test_scenarios) & !is.null(back_test_scenarios)) | sum(back_test_scenarios < 1) == 1) { + if(!is.numeric(back_test_scenarios) & !is.null(back_test_scenarios)) { + stop("back test scenarios input value must be either a number greater than 0 or set to NULL") + } else if(back_test_scenarios < 1) { stop("back test scenarios input value must be either a number greater than 0 or set to NULL") } From 6ef2f73db7c84bcc936edcd25eb73c68a669fbb7 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 22 Nov 2021 17:19:17 -0800 Subject: [PATCH 9/9] bug fix from prev commit --- R/validate_forecasting_inputs.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/validate_forecasting_inputs.R b/R/validate_forecasting_inputs.R index 2713d1fe..fe0e923b 100644 --- a/R/validate_forecasting_inputs.R +++ b/R/validate_forecasting_inputs.R @@ -139,6 +139,8 @@ validate_forecasting_inputs<-function(input_data, #back test scenarios formatting if(!is.numeric(back_test_scenarios) & !is.null(back_test_scenarios)) { stop("back test scenarios input value must be either a number greater than 0 or set to NULL") + } else if(is.null(back_test_scenarios)) { + # do nothing } else if(back_test_scenarios < 1) { stop("back test scenarios input value must be either a number greater than 0 or set to NULL") }
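Taken together, PATCH 8/9 order the back_test_scenarios checks so that a NULL value never reaches a numeric comparison. A plain-R sketch of why the intermediate PATCH 8 version failed (illustrative only):

    back_test_scenarios <- NULL

    back_test_scenarios < 1
    #> logical(0)                          # comparing NULL yields a zero-length logical

    # after PATCH 8 alone, NULL fell through to the numeric branch:
    # if(back_test_scenarios < 1) { ... }
    #> Error in if (...) : argument is of length zero

    # PATCH 9 ordering: handle NULL before any numeric comparison is attempted
    if(!is.numeric(back_test_scenarios) & !is.null(back_test_scenarios)) {
      stop("back test scenarios input value must be either a number greater than 0 or set to NULL")
    } else if(is.null(back_test_scenarios)) {
      # do nothing: NULL is resolved to an automatic value downstream
    } else if(back_test_scenarios < 1) {
      stop("back test scenarios input value must be either a number greater than 0 or set to NULL")
    }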