Skip to content

Commit

Permalink
Merge pull request #163 from microsoft/mitokic/07172024/best-model-scaling
Browse files Browse the repository at this point in the history

Mitokic/07172024/best model scaling
  • Loading branch information
mitokic authored Jul 29, 2024
2 parents 03eabf3 + bc88b7b commit ab1c2a1
Show file tree
Hide file tree
Showing 36 changed files with 583 additions and 443 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: finnts
Title: Microsoft Finance Time Series Forecasting Framework
Version: 0.4.0.9004
Version: 0.4.0.9005
Authors@R:
c(person(given = "Mike",
family = "Tokic",
Expand All @@ -24,7 +24,7 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.3.1
Imports:
cli,
Cubist,
Expand Down
5 changes: 4 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# finnts 0.4.0.9004 (DEVELOPMENT VERSION)
# finnts 0.4.0.9005 (DEVELOPMENT VERSION)

## Improvements

- Added support for hierarchical forecasting with external regressors
- Allow global models for hierarchical forecasts
- Multistep horizon forecasts for R1 recipe, listed as `multistep_horizon` within `prep_data()`
- Always save the most accurate model average, regardless of whether it is selected as the best model. This allows for improved scaling with large data sets.
- Automatically condense large forecasts (10k+ time series) into a smaller number of files to make it easier to read forecast outputs
- Improved weighted MAPE calculation across all time series

## Bug Fixes

Expand Down
31 changes: 16 additions & 15 deletions R/ensemble_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,27 +181,29 @@ ensemble_models <- function(run_info,
# model forecasts
single_model_tbl <- NULL
if (run_local_models) {
suppressWarnings(try(single_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-single_models.", run_info$data_output
suppressWarnings(try(
single_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-single_models.", run_info$data_output
),
return_type = "df"
),
return_type = "df"
),
silent = TRUE
silent = TRUE
))
}

global_model_tbl <- NULL
if (run_global_models) {
suppressWarnings(try(global_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-global_models.", run_info$data_output
suppressWarnings(try(
global_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-global_models.", run_info$data_output
),
return_type = "df"
),
return_type = "df"
),
silent = TRUE
silent = TRUE
))
}

Expand Down Expand Up @@ -336,7 +338,6 @@ ensemble_models <- function(run_info,
.multicombine = TRUE,
.noexport = NULL
) %do% {

# get initial run info
model <- model_run %>%
dplyr::pull(Model_Name)
Expand Down
4 changes: 0 additions & 4 deletions R/feature_selection.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@ run_feature_selection <- function(input_data,
forecast_horizon,
external_regressors,
multistep_horizon = FALSE) {

# check for more than one unique target value
if (input_data %>% tidyr::drop_na(Target) %>% dplyr::pull(Target) %>% unique() %>% length() < 2) {

# just return the date features
fs_list <- input_data %>%
dplyr::select(tidyselect::contains("Date"))
Expand Down Expand Up @@ -83,7 +81,6 @@ run_feature_selection <- function(input_data,

# run feature selection
if (date_type %in% c("day", "week")) {

# number of votes needed for feature to be selected
votes_needed <- 3

Expand Down Expand Up @@ -410,7 +407,6 @@ lofo_fn <- function(run_info,
parallel_processing,
pca = FALSE,
seed = 123) {

# parallel run info
par_info <- par_start(
run_info = run_info,
Expand Down
133 changes: 96 additions & 37 deletions R/final_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
#' Select Best Models and Prep Final Outputs
#'
#' @param run_info run info using the [set_run_info()] function.
#' @param average_models If TRUE, create simple averages of individual models.
#' @param average_models If TRUE, create simple averages of individual models
#' and save the most accurate one.
#' @param max_model_average Max number of models to average together. Will
#' create model averages for 2 models up until input value or max number of
#' models ran.
Expand Down Expand Up @@ -124,7 +125,8 @@ final_models <- function(run_info,
current_combo_list_final <- setdiff(
current_combo_list,
prev_combo_list
)
) %>%
sample()

prev_log_df <- read_file(run_info,
path = paste0("logs/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), ".csv"),
Expand All @@ -138,8 +140,7 @@ final_models <- function(run_info,
run_local_models <- prev_log_df$run_local_models
run_ensemble_models <- prev_log_df$run_ensemble_models

if ((length(current_combo_list_final) == 0 & length(prev_combo_list) > 0) | sum(colnames(prev_log_df) %in% "weighted_mape")) {

if (sum(colnames(prev_log_df) %in% "weighted_mape")) {
# check if input values have changed
current_log_df <- tibble::tibble(
average_models = average_models,
Expand Down Expand Up @@ -175,7 +176,7 @@ final_models <- function(run_info,

# submit tasks
best_model_tbl <- foreach::foreach(
x = current_combo_list,
x = current_combo_list_final,
.combine = "rbind",
.packages = packages,
.errorhandling = "stop",
Expand Down Expand Up @@ -262,31 +263,7 @@ final_models <- function(run_info,

# check if model averaging already happened
if ("Best_Model" %in% colnames(local_model_tbl %>% rbind(global_model_tbl))) {
# see if average models file exists and add to model tbl
average_model_tbl <- tryCatch(
{
read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-average_models.", run_info$data_output
),
return_type = "df"
)
},
warning = function(w) {
# do nothing
},
error = function(e) {
NULL
}
)

local_model_tbl <- local_model_tbl %>%
rbind(average_model_tbl)

best_model_check <- TRUE
} else {
best_model_check <- FALSE
return(data.frame(Combo_Hash = combo))
}

# combine all forecasts
Expand Down Expand Up @@ -315,8 +292,7 @@ final_models <- function(run_info,
final_model_list <- c(local_model_list, global_model_list)

# simple model averaging
if (average_models & length(final_model_list) > 1 & !best_model_check) {

if (average_models & length(final_model_list) > 1) {
# create model combinations list
model_combinations <- tibble::tibble()

Expand Down Expand Up @@ -360,7 +336,6 @@ final_models <- function(run_info,
.noexport = NULL
) %op%
{

# get list of models to average
model_list <- strsplit(x, "_")[[1]]

Expand All @@ -385,7 +360,34 @@ final_models <- function(run_info,
averages_tbl <- NULL
}

# choose best model
# choose best average model
if (!is.null(averages_tbl)) {
avg_back_test_mape <- averages_tbl %>%
dplyr::mutate(
Train_Test_ID = as.numeric(Train_Test_ID),
Target = ifelse(Target == 0, 0.1, Target)
) %>%
dplyr::filter(Train_Test_ID != 1) %>%
dplyr::mutate(MAPE = round(abs((Forecast - Target) / Target), digits = 4))

avg_best_model_mape <- avg_back_test_mape %>%
dplyr::group_by(Model_ID, Combo) %>%
dplyr::mutate(
Combo_Total = sum(abs(Target), na.rm = TRUE),
weighted_MAPE = (abs(Target) / Combo_Total) * MAPE
) %>%
dplyr::summarise(Rolling_MAPE = sum(weighted_MAPE, na.rm = TRUE)) %>%
dplyr::arrange(Rolling_MAPE) %>%
dplyr::ungroup() %>%
dplyr::group_by(Combo) %>%
dplyr::slice(1) %>%
dplyr::ungroup()

avg_best_model_tbl <- avg_best_model_mape %>%
dplyr::select(Combo, Model_ID)
}

# choose best overall model
final_predictions_tbl <- predictions_tbl %>%
dplyr::select(Combo, Model_ID, Train_Test_ID, Date, Forecast, Target) %>%
rbind(averages_tbl)
Expand Down Expand Up @@ -513,7 +515,6 @@ final_models <- function(run_info,
)
}
} else { # choose the most accurate individual model and write outputs

final_model_tbl <- tibble::tibble(Model_ID = final_model_list) %>%
dplyr::left_join(
best_model_final_tbl %>%
Expand All @@ -522,6 +523,35 @@ final_models <- function(run_info,
) %>%
dplyr::mutate(Best_Model = ifelse(!is.na(Best_Model), "Yes", "No"))

if (!is.null(averages_tbl)) {
avg_model_final_tbl <- averages_tbl %>%
dplyr::right_join(avg_best_model_tbl,
by = c("Combo", "Model_ID")
) %>%
dplyr::mutate(
Combo_ID = Combo,
Model_Name = "NA",
Model_Type = "local",
Recipe_ID = "simple_average",
Hyperparameter_ID = "NA",
Best_Model = "No"
) %>%
dplyr::group_by(Combo_ID, Model_ID, Train_Test_ID) %>%
dplyr::mutate(Horizon = dplyr::row_number()) %>%
dplyr::ungroup() %>%
create_prediction_intervals(model_train_test_tbl) %>%
convert_weekly_to_daily(date_type, weekly_to_daily)

write_data(
x = avg_model_final_tbl,
combo = unique(avg_model_final_tbl$Combo),
run_info = run_info,
output_type = "data",
folder = "forecasts",
suffix = "-average_models"
)
}

if (!is.null(single_model_tbl)) {
single_model_final_tbl <- single_model_tbl %>%
remove_best_model() %>%
Expand Down Expand Up @@ -580,13 +610,24 @@ final_models <- function(run_info,
}
}

return(best_model_mape)
return(data.frame(Combo_Hash = combo))
} %>%
base::suppressPackageStartupMessages()

# clean up any parallel run process
par_end(cl)

# condense outputs into less files for larger runs
if (length(combo_list) > 10000) {
cli::cli_progress_step("Condensing Forecasts")

condense_data(
run_info,
parallel_processing,
num_cores
)
}

# reconcile hierarchical forecasts
if (forecast_approach != "bottoms_up") {
cli::cli_progress_step("Reconciling Hierarchical Forecasts")
Expand All @@ -600,6 +641,24 @@ final_models <- function(run_info,
)
}

# calculate weighted mape
weighted_mape <- get_forecast_data(run_info = run_info) %>%
dplyr::filter(
Run_Type == "Back_Test",
Best_Model == "Yes"
) %>%
dplyr::mutate(
Target = ifelse(Target == 0, 0.1, Target)
) %>%
dplyr::mutate(
MAPE = round(abs((Forecast - Target) / Target), digits = 4),
Total = sum(Target, na.rm = TRUE),
Weight = (MAPE * Target) / Total
) %>%
dplyr::pull(Weight) %>%
sum() %>%
round(digits = 4)

# update logging file
log_df <- read_file(run_info,
path = paste0("logs/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), ".csv"),
Expand All @@ -608,7 +667,7 @@ final_models <- function(run_info,
dplyr::mutate(
average_models = average_models,
max_model_average = max_model_average,
weighted_mape = base::mean(best_model_tbl$Rolling_MAPE, na.rm = TRUE)
weighted_mape = round(weighted_mape, digits = 4)
)

write_data(
Expand Down
1 change: 0 additions & 1 deletion R/forecast_time_series.R
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,6 @@ forecast_backwards_compatibility <- function(run_info,
dplyr::select(Combo, Model, Best_Model) %>%
dplyr::distinct()
} else {

# read in unreconciled results
best_model_tbl <- read_file(run_info,
path = paste0(
Expand Down
Loading

0 comments on commit ab1c2a1

Please sign in to comment.