From dd5725af2029bdfdacb4ff05dd0001421d877124 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Thu, 30 May 2024 10:41:49 -0700 Subject: [PATCH 1/2] global model bug fix for multistep horizon forecasting --- DESCRIPTION | 2 +- NEWS.md | 3 ++- R/multistep_cubist.R | 30 +++++++++++++++++++++++++----- R/multistep_glmnet.R | 26 +++++++++++++++++++++++--- R/multistep_mars.R | 26 +++++++++++++++++++++++--- R/multistep_svm_poly.R | 26 +++++++++++++++++++++++--- R/multistep_svm_rbf.R | 26 +++++++++++++++++++++++--- R/multistep_xgboost.R | 36 +++++++++++++++++++++++++++++------- R/train_models.R | 4 +--- R/utility.R | 2 +- 10 files changed, 151 insertions(+), 30 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 99ff60e9..0f210549 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.4.0.9003 +Version: 0.4.0.9004 Authors@R: c(person(given = "Mike", family = "Tokic", diff --git a/NEWS.md b/NEWS.md index c5a6999a..444a0ad6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,8 +1,9 @@ -# finnts 0.4.0.9003 (DEVELOPMENT VERSION) +# finnts 0.4.0.9004 (DEVELOPMENT VERSION) ## Improvements - Added support for hierarchical forecasting with external regressors +- Allow global models for hierarchical forecasts - Multistep horizon forecasts for R1 recipe, listed as `multistep_horizon` within `prep_data()` ## Bug Fixes diff --git a/R/multistep_cubist.R b/R/multistep_cubist.R index efd7f99b..8a005a4c 100644 --- a/R/multistep_cubist.R +++ b/R/multistep_cubist.R @@ -439,6 +439,13 @@ predict.cubist_multistep_fit_impl <- function(object, new_data, ...) { #' @export cubist_multistep_predict_impl <- function(object, new_data, ...) { + # Date Mapping Table + date_tbl <- new_data %>% + dplyr::select(Date, Date_index.num) %>% + dplyr::distinct() %>% + dplyr::arrange(Date) %>% + dplyr::mutate(Run_Number = dplyr::row_number()) + # PREPARE INPUTS xreg_recipe <- object$extras$xreg_recipe h_horizon <- nrow(new_data) @@ -447,14 +454,16 @@ cubist_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl <- modeltime::bake_xreg_recipe(xreg_recipe, new_data, format = "tbl" - ) + ) %>% + dplyr::left_join(date_tbl, by = "Date_index.num") %>% + dplyr::mutate(Row_Num = dplyr::row_number()) # PREDICTIONS - final_prediction <- c() + final_prediction <- tibble::tibble() start_val <- 1 for (model_name in names(object$models)) { - if (start_val > nrow(xreg_tbl)) { + if (start_val > nrow(date_tbl)) { break } @@ -463,7 +472,10 @@ cubist_multistep_predict_impl <- function(object, new_data, ...) { cubist_model <- object$models[[model_name]] xreg_tbl_final <- xreg_tbl %>% - dplyr::slice(start_val:lag_number) + dplyr::filter( + Run_Number >= start_val, + Run_Number <= lag_number + ) if (!is.null(xreg_tbl)) { preds_cubist <- predict(cubist_model, xreg_tbl_final) @@ -471,9 +483,17 @@ cubist_multistep_predict_impl <- function(object, new_data, ...) { preds_cubist <- rep(0, h_horizon) } + preds_cubist <- tibble::tibble(.pred = preds_cubist) %>% + dplyr::mutate(Row_Num = xreg_tbl_final$Row_Num) + start_val <- as.numeric(lag_number) + 1 - final_prediction <- c(final_prediction, preds_cubist) + final_prediction <- rbind(final_prediction, preds_cubist) } + # Ensure it's sorted correctly for global models + final_prediction <- final_prediction %>% + dplyr::arrange(Row_Num) %>% + dplyr::pull(.pred) + return(final_prediction) } diff --git a/R/multistep_glmnet.R b/R/multistep_glmnet.R index 0ab6f155..f177aaa0 100644 --- a/R/multistep_glmnet.R +++ b/R/multistep_glmnet.R @@ -427,6 +427,13 @@ predict.glmnet_multistep_fit_impl <- function(object, new_data, ...) { #' @export glmnet_multistep_predict_impl <- function(object, new_data, ...) { + # Date Mapping Table + date_tbl <- new_data %>% + dplyr::select(Date, Date_index.num) %>% + dplyr::distinct() %>% + dplyr::arrange(Date) %>% + dplyr::mutate(Run_Number = dplyr::row_number()) + # PREPARE INPUTS xreg_recipe <- object$extras$xreg_recipe h_horizon <- nrow(new_data) @@ -435,14 +442,16 @@ glmnet_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl <- modeltime::bake_xreg_recipe(xreg_recipe, new_data, format = "tbl" - ) + ) %>% + dplyr::left_join(date_tbl, by = "Date_index.num") %>% + dplyr::mutate(Row_Num = dplyr::row_number()) # PREDICTIONS final_prediction <- tibble::tibble() start_val <- 1 for (model_name in names(object$models)) { - if (start_val > nrow(xreg_tbl)) { + if (start_val > nrow(date_tbl)) { break } @@ -451,7 +460,10 @@ glmnet_multistep_predict_impl <- function(object, new_data, ...) { glmnet_model <- object$models[[model_name]] xreg_tbl_final <- xreg_tbl %>% - dplyr::slice(start_val:lag_number) + dplyr::filter( + Run_Number >= start_val, + Run_Number <= lag_number + ) if (!is.null(xreg_tbl)) { preds_glmnet <- predict(glmnet_model, xreg_tbl_final) @@ -459,9 +471,17 @@ glmnet_multistep_predict_impl <- function(object, new_data, ...) { preds_glmnet <- rep(0, h_horizon) } + preds_glmnet <- preds_glmnet %>% + dplyr::mutate(Row_Num = xreg_tbl_final$Row_Num) + start_val <- as.numeric(lag_number) + 1 final_prediction <- rbind(final_prediction, preds_glmnet) } + # Ensure it's sorted correctly for global models + final_prediction <- final_prediction %>% + dplyr::arrange(Row_Num) %>% + dplyr::select(.pred) + return(final_prediction) } diff --git a/R/multistep_mars.R b/R/multistep_mars.R index b4f04f06..02e84e5b 100644 --- a/R/multistep_mars.R +++ b/R/multistep_mars.R @@ -450,6 +450,13 @@ predict.mars_multistep_fit_impl <- function(object, new_data, ...) { #' @export mars_multistep_predict_impl <- function(object, new_data, ...) { + # Date Mapping Table + date_tbl <- new_data %>% + dplyr::select(Date, Date_index.num) %>% + dplyr::distinct() %>% + dplyr::arrange(Date) %>% + dplyr::mutate(Run_Number = dplyr::row_number()) + # PREPARE INPUTS xreg_recipe <- object$extras$xreg_recipe h_horizon <- nrow(new_data) @@ -458,14 +465,16 @@ mars_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl <- modeltime::bake_xreg_recipe(xreg_recipe, new_data, format = "tbl" - ) + ) %>% + dplyr::left_join(date_tbl, by = "Date_index.num") %>% + dplyr::mutate(Row_Num = dplyr::row_number()) # PREDICTIONS final_prediction <- tibble::tibble() start_val <- 1 for (model_name in names(object$models)) { - if (start_val > nrow(xreg_tbl)) { + if (start_val > nrow(date_tbl)) { break } @@ -474,7 +483,10 @@ mars_multistep_predict_impl <- function(object, new_data, ...) { mars_model <- object$models[[model_name]] xreg_tbl_final <- xreg_tbl %>% - dplyr::slice(start_val:lag_number) + dplyr::filter( + Run_Number >= start_val, + Run_Number <= lag_number + ) if (!is.null(xreg_tbl)) { preds_mars <- predict(mars_model, xreg_tbl_final) @@ -482,9 +494,17 @@ mars_multistep_predict_impl <- function(object, new_data, ...) { preds_mars <- rep(0, h_horizon) } + preds_mars <- preds_mars %>% + dplyr::mutate(Row_Num = xreg_tbl_final$Row_Num) + start_val <- as.numeric(lag_number) + 1 final_prediction <- rbind(final_prediction, preds_mars) } + # Ensure it's sorted correctly for global models + final_prediction <- final_prediction %>% + dplyr::arrange(Row_Num) %>% + dplyr::select(.pred) + return(final_prediction) } diff --git a/R/multistep_svm_poly.R b/R/multistep_svm_poly.R index ac082a72..14ad5e21 100644 --- a/R/multistep_svm_poly.R +++ b/R/multistep_svm_poly.R @@ -476,6 +476,13 @@ predict.svm_poly_multistep_fit_impl <- function(object, new_data, ...) { #' @export svm_poly_multistep_predict_impl <- function(object, new_data, ...) { + # Date Mapping Table + date_tbl <- new_data %>% + dplyr::select(Date, Date_index.num) %>% + dplyr::distinct() %>% + dplyr::arrange(Date) %>% + dplyr::mutate(Run_Number = dplyr::row_number()) + # PREPARE INPUTS xreg_recipe <- object$extras$xreg_recipe h_horizon <- nrow(new_data) @@ -484,14 +491,16 @@ svm_poly_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl <- modeltime::bake_xreg_recipe(xreg_recipe, new_data, format = "tbl" - ) + ) %>% + dplyr::left_join(date_tbl, by = "Date_index.num") %>% + dplyr::mutate(Row_Num = dplyr::row_number()) # PREDICTIONS final_prediction <- tibble::tibble() start_val <- 1 for (model_name in names(object$models)) { - if (start_val > nrow(xreg_tbl)) { + if (start_val > nrow(date_tbl)) { break } @@ -500,7 +509,10 @@ svm_poly_multistep_predict_impl <- function(object, new_data, ...) { svm_poly_model <- object$models[[model_name]] xreg_tbl_final <- xreg_tbl %>% - dplyr::slice(start_val:lag_number) + dplyr::filter( + Run_Number >= start_val, + Run_Number <= lag_number + ) if (!is.null(xreg_tbl)) { preds_svm_poly <- predict(svm_poly_model, xreg_tbl_final) @@ -508,9 +520,17 @@ svm_poly_multistep_predict_impl <- function(object, new_data, ...) { preds_svm_poly <- rep(0, h_horizon) } + preds_svm_poly <- preds_svm_poly %>% + dplyr::mutate(Row_Num = xreg_tbl_final$Row_Num) + start_val <- as.numeric(lag_number) + 1 final_prediction <- rbind(final_prediction, preds_svm_poly) } + # Ensure it's sorted correctly for global models + final_prediction <- final_prediction %>% + dplyr::arrange(Row_Num) %>% + dplyr::select(.pred) + return(final_prediction) } diff --git a/R/multistep_svm_rbf.R b/R/multistep_svm_rbf.R index 17a4c621..6a7f779f 100644 --- a/R/multistep_svm_rbf.R +++ b/R/multistep_svm_rbf.R @@ -456,6 +456,13 @@ predict.svm_rbf_multistep_fit_impl <- function(object, new_data, ...) { #' @export svm_rbf_multistep_predict_impl <- function(object, new_data, ...) { + # Date Mapping Table + date_tbl <- new_data %>% + dplyr::select(Date, Date_index.num) %>% + dplyr::distinct() %>% + dplyr::arrange(Date) %>% + dplyr::mutate(Run_Number = dplyr::row_number()) + # PREPARE INPUTS xreg_recipe <- object$extras$xreg_recipe h_horizon <- nrow(new_data) @@ -464,14 +471,16 @@ svm_rbf_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl <- modeltime::bake_xreg_recipe(xreg_recipe, new_data, format = "tbl" - ) + ) %>% + dplyr::left_join(date_tbl, by = "Date_index.num") %>% + dplyr::mutate(Row_Num = dplyr::row_number()) # PREDICTIONS final_prediction <- tibble::tibble() start_val <- 1 for (model_name in names(object$models)) { - if (start_val > nrow(xreg_tbl)) { + if (start_val > nrow(date_tbl)) { break } @@ -480,7 +489,10 @@ svm_rbf_multistep_predict_impl <- function(object, new_data, ...) { svm_rbf_model <- object$models[[model_name]] xreg_tbl_final <- xreg_tbl %>% - dplyr::slice(start_val:lag_number) + dplyr::filter( + Run_Number >= start_val, + Run_Number <= lag_number + ) if (!is.null(xreg_tbl)) { preds_svm_rbf <- predict(svm_rbf_model, xreg_tbl_final) @@ -488,9 +500,17 @@ svm_rbf_multistep_predict_impl <- function(object, new_data, ...) { preds_svm_rbf <- rep(0, h_horizon) } + preds_svm_rbf <- preds_svm_rbf %>% + dplyr::mutate(Row_Num = xreg_tbl_final$Row_Num) + start_val <- as.numeric(lag_number) + 1 final_prediction <- rbind(final_prediction, preds_svm_rbf) } + # Ensure it's sorted correctly for global models + final_prediction <- final_prediction %>% + dplyr::arrange(Row_Num) %>% + dplyr::select(.pred) + return(final_prediction) } diff --git a/R/multistep_xgboost.R b/R/multistep_xgboost.R index a933635d..03b327f1 100644 --- a/R/multistep_xgboost.R +++ b/R/multistep_xgboost.R @@ -538,6 +538,13 @@ predict.xgboost_multistep_fit_impl <- function(object, new_data, ...) { #' @export xgboost_multistep_predict_impl <- function(object, new_data, ...) { + # Date Mapping Table + date_tbl <- new_data %>% + dplyr::select(Date, Date_index.num) %>% + dplyr::distinct() %>% + dplyr::arrange(Date) %>% + dplyr::mutate(Run_Number = dplyr::row_number()) + # PREPARE INPUTS xreg_recipe <- object$extras$xreg_recipe h_horizon <- nrow(new_data) @@ -546,14 +553,16 @@ xgboost_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl <- modeltime::bake_xreg_recipe(xreg_recipe, new_data, format = "tbl" - ) + ) %>% + dplyr::left_join(date_tbl, by = "Date_index.num") %>% + dplyr::mutate(Row_Num = dplyr::row_number()) # PREDICTIONS - final_prediction <- c() + final_prediction <- tibble::tibble() start_val <- 1 for (model_name in names(object$models)) { - if (start_val > nrow(xreg_tbl)) { + if (start_val > nrow(date_tbl)) { break } @@ -561,9 +570,14 @@ xgboost_multistep_predict_impl <- function(object, new_data, ...) { xgboost_model <- object$models[[model_name]] - xreg_tbl_final <- xreg_tbl %>% - dplyr::select(tidyselect::any_of(xgboost_model$feature_names)) %>% - dplyr::slice(start_val:lag_number) + xreg_tbl_temp <- xreg_tbl %>% + dplyr::filter( + Run_Number >= start_val, + Run_Number <= lag_number + ) + + xreg_tbl_final <- xreg_tbl_temp %>% + dplyr::select(tidyselect::any_of(xgboost_model$feature_names)) if (!is.null(xreg_tbl)) { preds_xgboost <- modeltime::xgboost_predict(xgboost_model, @@ -574,9 +588,17 @@ xgboost_multistep_predict_impl <- function(object, new_data, ...) { preds_xgboost <- rep(0, h_horizon) } + preds_xgboost <- tibble::tibble(.pred = preds_xgboost) %>% + dplyr::mutate(Row_Num = xreg_tbl_temp$Row_Num) + start_val <- as.numeric(lag_number) + 1 - final_prediction <- c(final_prediction, preds_xgboost) + final_prediction <- rbind(final_prediction, preds_xgboost) } + # Ensure it's sorted correctly for global models + final_prediction <- final_prediction %>% + dplyr::arrange(Row_Num) %>% + dplyr::pull(.pred) + return(final_prediction) } diff --git a/R/train_models.R b/R/train_models.R index 54521aa5..511c9383 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -98,8 +98,6 @@ train_models <- function(run_info, if (is.null(run_global_models) & date_type %in% c("day", "week")) { run_global_models <- FALSE - } else if (forecast_approach != "bottoms_up") { - run_global_models <- FALSE } else if (is.null(run_global_models)) { run_global_models <- TRUE } else { @@ -652,7 +650,7 @@ train_models <- function(run_info, tidyr::unnest(Forecast_Tbl) %>% dplyr::arrange(Train_Test_ID) %>% tidyr::unite(col = "Model_ID", c("Model_Name", "Model_Type", "Recipe_ID"), sep = "--", remove = FALSE) %>% - dplyr::group_by(Combo_ID, Model_ID, Train_Test_ID) %>% + dplyr::group_by(Combo, Model_ID, Train_Test_ID) %>% dplyr::mutate(Horizon = dplyr::row_number()) %>% dplyr::ungroup() diff --git a/R/utility.R b/R/utility.R index 1da63edf..cb5a57d5 100644 --- a/R/utility.R +++ b/R/utility.R @@ -15,7 +15,7 @@ utils::globalVariables(c( "Auto_Accept", "Feature", "Imp", "Importance", "LOFO_Var", "Var_RMSE", "Vote", "Votes", "desc", "term", "Column", "Box_Cox_Lambda", "get_recipie_configurable", "Agg", "Unique", "Var", "Var_Combo", "regressor", "regressor_tbl", "value_level_iter", ".actual", ".fitted", - "forecast_horizon", "lag", "new_data", "object", "fit" + "forecast_horizon", "lag", "new_data", "object", "fit", "Row_Num", "Run_Number" )) #' @importFrom magrittr %>% From 5b5bc0b06dfa92a5f7f3ebea977a7a60cbca97c5 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Thu, 30 May 2024 11:01:51 -0700 Subject: [PATCH 2/2] update r checks --- .github/workflows/R-CMD-check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 61589d1d..9e9894e7 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -51,7 +51,7 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::rcmdcheck, vip=?ignore-before-r=4.1.0, Boruta=?ignore-before-r=4.1.0 + extra-packages: any::rcmdcheck, vip=?ignore-before-r=4.1.0, Boruta=?ignore-before-r=4.1.0, corrr=?ignore-before-r=4.1.0 needs: check - uses: r-lib/actions/check-r-package@v2