From 700c7f7d3cc8ead6172fe7e7a3ceb676b0d9a930 Mon Sep 17 00:00:00 2001 From: Megan Coden Date: Fri, 7 Apr 2023 15:29:08 -0400 Subject: [PATCH 01/16] moved imputation code to a function --- R/impute.R | 16 ++++++++++++++++ R/preprocess.R | 23 +++++++---------------- 2 files changed, 23 insertions(+), 16 deletions(-) create mode 100644 R/impute.R diff --git a/R/impute.R b/R/impute.R new file mode 100644 index 00000000..b84336eb --- /dev/null +++ b/R/impute.R @@ -0,0 +1,16 @@ +impute <- function(transformed_cont, n_missing) { + transformed_cont <- sapply_fn(transformed_cont, function(x) { + if (class(x) %in% c("integer", "numeric")) { + m <- is.na(x) + x[m] <- stats::median(x, na.rm = TRUE) + } + return(x) + }) %>% dplyr::as_tibble() + message( + paste0( + n_missing, + " missing continuous value(s) were imputed using the median value of the feature." + ) + ) + return (transformed_cont) +} \ No newline at end of file diff --git a/R/preprocess.R b/R/preprocess.R index 4418e416..119ad8b3 100644 --- a/R/preprocess.R +++ b/R/preprocess.R @@ -60,7 +60,7 @@ preprocess_data <- function(dataset, outcome_colname, method = c("center", "scale"), remove_var = "nzv", collapse_corr_feats = TRUE, to_numeric = TRUE, group_neg_corr = TRUE, - prefilter_threshold = 1) { + prefilter_threshold = 1, impute_in_preprocessing = TRUE) { progbar <- NULL if (isTRUE(check_packages_installed("progressr"))) { progbar <- progressr::progressor(steps = 20, message = "preprocessing") @@ -88,7 +88,7 @@ preprocess_data <- function(dataset, outcome_colname, pbtick(progbar) split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) pbtick(progbar) - cont_feats <- process_cont_feats(split_feats$cont_feats, method) + cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) pbtick(progbar) # combine all processed features @@ -364,7 +364,7 @@ process_cat_feats <- function(features, progbar = NULL) { #' #' @examples #' process_cont_feats(mikropml::otu_small[, 2:ncol(otu_small)], c("center", "scale")) -process_cont_feats <- function(features, method) { +process_cont_feats <- function(features, method, impute_in_preprocessing) { transformed_cont <- NULL removed_cont <- NULL @@ -388,19 +388,10 @@ process_cont_feats <- function(features, method) { n_missing <- sum(missing) if (n_missing > 0) { # impute missing data using the median value - transformed_cont <- sapply_fn(transformed_cont, function(x) { - if (class(x) %in% c("integer", "numeric")) { - m <- is.na(x) - x[m] <- stats::median(x, na.rm = TRUE) - } - return(x) - }) %>% dplyr::as_tibble() - message( - paste0( - n_missing, - " missing continuous value(s) were imputed using the median value of the feature." - ) - ) + if (impute_in_preprocessing) { + source("impute.R") + transformed_cont <- impute(transformed_cont, n_missing) + } } } } From cd57a1d1027b1d511332eb00d41a1d3b1696377b Mon Sep 17 00:00:00 2001 From: Megan Coden Date: Fri, 7 Apr 2023 15:32:43 -0400 Subject: [PATCH 02/16] add impute param in func call --- R/run_ml.R | 106 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 19 deletions(-) diff --git a/R/run_ml.R b/R/run_ml.R index 7abe8b82..50ebf1b4 100644 --- a/R/run_ml.R +++ b/R/run_ml.R @@ -1,3 +1,5 @@ +# TODO: add call to the function, add the option +# TODO: figure out if there's a way to only specify option in one place #' Run the machine learning pipeline #' #' This function splits the data set into a train & test set, @@ -144,6 +146,7 @@ run_ml <- group_partitions = NULL, corr_thresh = 1, seed = NA, + impute_in_training = FALSE, ...) { check_all( dataset, @@ -162,7 +165,7 @@ run_ml <- if (!is.na(seed)) { set.seed(seed) } - + # `future.apply` is required for `find_feature_importance()`. # check it here to adhere to the fail fast principle. if (find_feature_importance) { @@ -173,20 +176,20 @@ run_ml <- if (find_feature_importance) { check_cat_feats(dataset %>% dplyr::select(-outcome_colname)) } - + dataset <- dataset %>% randomize_feature_order(outcome_colname) %>% # convert tibble to dataframe to silence warning from caret::train(): # "Warning: Setting row names on a tibble is deprecated.." as.data.frame() - + outcomes_vctr <- dataset %>% dplyr::pull(outcome_colname) - + if (length(training_frac) == 1) { training_inds <- get_partition_indices(outcomes_vctr, - training_frac = training_frac, - groups = groups, - group_partitions = group_partitions + training_frac = training_frac, + groups = groups, + group_partitions = group_partitions ) } else { training_inds <- training_frac @@ -201,30 +204,95 @@ run_ml <- } check_training_frac(training_frac) check_training_indices(training_inds, dataset) - + train_data <- dataset[training_inds, ] test_data <- dataset[-training_inds, ] + if (impute_in_training == TRUE) { + source("preprocess.R") + train_data[[outcome_colname]] <- replace_spaces(train_data[[outcome_colname]]) + train_data <- rm_missing_outcome(train_data, outcome_colname) + split_dat <- split_outcome_features(train_data, outcome_colname) + + features <- split_dat$features + removed_feats <- character(0) + if (to_numeric) { + feats <- change_to_num(features) %>% + remove_singleton_columns(threshold = prefilter_threshold) + removed_feats <- feats$removed_feats + features <- feats$dat + } + pbtick(progbar) + + nv_feats <- process_novar_feats(features, progbar = progbar) + pbtick(progbar) + split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) + pbtick(progbar) + cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) + pbtick(progbar) + # repeat with test data + test_data[[outcome_colname]] <- replace_spaces(test_data[[outcome_colname]]) + test_data <- rm_missing_outcome(test_data, outcome_colname) + split_dat <- split_outcome_features(test_data, outcome_colname) + + features <- split_dat$features + removed_feats <- character(0) + if (to_numeric) { + feats <- change_to_num(features) %>% + remove_singleton_columns(threshold = prefilter_threshold) + removed_feats <- feats$removed_feats + features <- feats$dat + } + pbtick(progbar) + + nv_feats <- process_novar_feats(features, progbar = progbar) + pbtick(progbar) + split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) + pbtick(progbar) + cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) + pbtick(progbar) + + + + + #source("impute.R") + #sapply_fn <- select_apply("sapply") + #cl_train <- sapply_fn(train_data, function(x) { + # class(x) + #}) + #cl_test <- sapply_fn(test_data, function(x) { + # class(x) + #}) + #missing_train <- + # is.na(train_data[, cl_train %in% c("integer", "numeric")]) + #missing_test <- + # is.na(test_data[, cl_test %in% c("integer", "numeric")]) + #n_missing_train <- sum(missing_train) + #n_missing_test <- sum(missing_test) + # do imputation + #impute_missing_vals(train_data, n_missing_train) + #impute_missing_vals(test_data, n_missing_test) + } # train_groups & test_groups will be NULL if groups is NULL train_groups <- groups[training_inds] test_groups <- groups[-training_inds] - + if (is.null(hyperparameters)) { hyperparameters <- get_hyperparams_list(dataset, method) } tune_grid <- get_tuning_grid(hyperparameters, method) - - + + outcome_type <- get_outcome_type(outcomes_vctr) class_probs <- outcome_type != "continuous" - + if (is.null(perf_metric_function)) { perf_metric_function <- get_perf_metric_fn(outcome_type) } - + if (is.null(perf_metric_name)) { perf_metric_name <- get_perf_metric_name(outcome_type) } - + if (is.null(cross_val)) { cross_val <- define_cv( train_data, @@ -238,8 +306,8 @@ run_ml <- group_partitions = group_partitions ) } - - + + message("Training the model...") trained_model_caret <- train_model( train_data = train_data, @@ -254,7 +322,7 @@ run_ml <- if (!is.na(seed)) { set.seed(seed) } - + if (calculate_performance) { performance_tbl <- get_performance_tbl( trained_model_caret, @@ -269,7 +337,7 @@ run_ml <- } else { performance_tbl <- "Skipped calculating performance" } - + if (find_feature_importance) { message("Finding feature importance...") feature_importance_tbl <- get_feature_importance( @@ -287,7 +355,7 @@ run_ml <- } else { feature_importance_tbl <- "Skipped feature importance" } - + return( list( trained_model = trained_model_caret, From 58afddb1f926cf73ec75e93685e6034ed7996b9c Mon Sep 17 00:00:00 2001 From: Megan Coden Date: Fri, 7 Apr 2023 15:33:45 -0400 Subject: [PATCH 03/16] add todo about fixing here stuff --- R/preprocess.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/preprocess.R b/R/preprocess.R index 119ad8b3..5dfa01f3 100644 --- a/R/preprocess.R +++ b/R/preprocess.R @@ -1,3 +1,6 @@ +# TODO: set this for a generic path (probably using here::here) +library(here) +here("R", "impute.R") #' Preprocess data prior to running machine learning #' #' Function to preprocess your data for input into [run_ml()]. From 105c22f028a6aa5ed9b7c56793d5aba941961710 Mon Sep 17 00:00:00 2001 From: Megan Coden Date: Fri, 7 Apr 2023 15:42:08 -0400 Subject: [PATCH 04/16] add tests for setting impute param to false in preprocess data --- tests/testthat/test-preprocess.R | 281 +++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) diff --git a/tests/testthat/test-preprocess.R b/tests/testthat/test-preprocess.R index ef1555bb..539f79e6 100644 --- a/tests/testthat/test-preprocess.R +++ b/tests/testthat/test-preprocess.R @@ -671,3 +671,284 @@ test_that("preprocess_data replaces spaces in outcome column values (class label dat_proc ) %>% suppressMessages() }) + + + +test_that("setting impute param to false doesn't impute data", { + expect_message( + expect_equal( + preprocess_data(test_df, "outcome", + prefilter_threshold = -1, impute_in_preprocessing = FALSE + ), + list( + dat_transformed = structure(list(outcome = c( + "normal", "normal", + "cancer" + ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c( + -1, + 0, 1 + ), grp4 = c(0, 0, 1), var8 = c( + -0.707106781186547, 0.707106781186547, + NA + )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(grp1 = c( + "var10_0", "var2_b", "var3_yes", + "var4_1", "var9_x" + ), grp2 = c("var10_1", "var2_a"), grp3 = c( + "var1", + "var12" + ), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8"), + removed_feats = c("var5", "var6", "var11") + ) + ), + "Removed " + ) %>% suppressMessages() + expect_message(expect_equal( + preprocess_data(test_df, "outcome", + prefilter_threshold = -1, + group_neg_corr = FALSE, + impute_in_preprocessing = FALSE + ), + list(dat_transformed = structure(list(outcome = c( + "normal", "normal", + "cancer" + ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c( + -1, + 0, 1 + ), grp4 = c(0, 0, 1), var7_1 = c(1, 1, 0), var8 = c( + -0.707106781186547, + 0.707106781186547, NA + )), row.names = c(NA, -3L), class = c( + "tbl_df", + "tbl", "data.frame" + )), grp_feats = list( + grp1 = c( + "var10_0", "var2_b", + "var3_yes", "var4_1", "var9_x" + ), grp2 = c("var10_1", "var2_a"), grp3 = c("var1", "var12"), grp4 = c("var2_c", "var9_y"), var7_1 = "var7_1", + var8 = "var8" + ), removed_feats = c("var5", "var6", "var11")) + )) %>% suppressMessages() + expect_equal( + preprocess_data(test_df[1:3, c("outcome", "var1")], "outcome"), + list( + dat_transformed = dplyr::tibble( + outcome = c("normal", "normal", "cancer"), + var1 = c(-1, 0, 1) + ), + grp_feats = NULL, + removed_feats = character(0) + ) + ) %>% suppressMessages() + expect_equal( + preprocess_data(test_df[1:3, c("outcome", "var2")], "outcome", impute_in_preprocessing = FALSE), + list( + dat_transformed = dplyr::tibble( + outcome = c("normal", "normal", "cancer"), + var2_a = c(1, 0, 0), + var2_b = c(0, 1, 0), + var2_c = c(0, 0, 1), + ), + grp_feats = NULL, + removed_feats = character(0) + ) + ) %>% suppressMessages() + expect_equal( + preprocess_data(test_df[1:3, c("outcome", "var3")], "outcome", impute_in_preprocessing = FALSE), + list( + dat_transformed = dplyr::tibble( + outcome = c("normal", "normal", "cancer"), + var3_yes = c(0, 1, 0), + ), + grp_feats = NULL, + removed_feats = character(0) + ) + ) %>% suppressMessages() + expect_equal( + preprocess_data(test_df[1:3, c("outcome", "var4")], "outcome", + prefilter_threshold = -1, + impute_in_preprocessing = FALSE + ), + list( + dat_transformed = dplyr::tibble( + outcome = c("normal", "normal", "cancer"), + var4_1 = c(0, 1, 0), + ), + grp_feats = NULL, + removed_feats = character(0) + ) + ) %>% suppressMessages() + expect_message(expect_equal( + preprocess_data(test_df[1:3, ], "outcome", + method = NULL, + prefilter_threshold = -1, + impute_in_preprocessing = FALSE + ), + list( + dat_transformed = structure(list(outcome = c( + "normal", "normal", + "cancer" + ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c( + 1, + 2, 3 + ), grp4 = c(0, 0, 1), var8 = c(5, 6, NA)), row.names = c( + NA, + -3L + ), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list( + grp1 = c("var10_0", "var2_b", "var3_yes", "var4_1", "var9_x"), grp2 = c("var10_1", "var2_a"), grp3 = c("var1", "var12"), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8" + ), + removed_feats = c("var5", "var6", "var11") + ) + )) %>% suppressMessages() + expect_error( + preprocess_data(test_df[1:3, c("outcome", "var5")], "outcome", impute_in_preprocessing = FALSE), + "All features have zero variance" + ) %>% suppressMessages() + expect_message(expect_equal( + preprocess_data(test_df[1:3, ], + "outcome", + method = c("range"), + prefilter_threshold = -1, + impute_in_preprocessing = FALSE + ), + list( + dat_transformed = structure(list(outcome = c( + "normal", "normal", + "cancer" + ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c( + 0, + 0.5, 1 + ), grp4 = c(0, 0, 1), var8 = c(0, 1, NA)), row.names = c( + NA, + -3L + ), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list( + grp1 = c("var10_0", "var2_b", "var3_yes", "var4_1", "var9_x"), grp2 = c("var10_1", "var2_a"), grp3 = c("var1", "var12"), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8" + ), + removed_feats = c("var5", "var6", "var11") + ) + )) %>% suppressMessages() + expect_message(expect_equal( + preprocess_data(test_df[1:3, ], + "outcome", + remove_var = "zv", + prefilter_threshold = -1, + impute_in_preprocessing = FALSE + ), + list( + dat_transformed = structure(list(outcome = c( + "normal", "normal", + "cancer" + ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c( + -1, + 0, 1 + ), grp4 = c(0, 0, 1), var8 = c( + -0.707106781186547, 0.707106781186547, + NA + )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(grp1 = c( + "var10_0", "var2_b", "var3_yes", + "var4_1", "var9_x" + ), grp2 = c("var10_1", "var2_a"), grp3 = c( + "var1", + "var12" + ), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8"), + removed_feats = c("var5", "var6", "var11") + ) + )) %>% suppressMessages() + expect_message( + expect_equal( + preprocess_data(test_df[1:3, ], "outcome", + remove_var = NULL, prefilter_threshold = -1, + impute_in_preprocessing = FALSE + ), + list( + dat_transformed = structure(list(outcome = c( + "normal", "normal", + "cancer" + ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c( + -1, + 0, 1 + ), grp4 = c(0, 0, 1), var8 = c( + -0.707106781186547, 0.707106781186547, + NA + )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(grp1 = c( + "var10_0", "var2_b", "var3_yes", + "var4_1", "var9_x" + ), grp2 = c("var10_1", "var2_a"), grp3 = c( + "var1", + "var12" + ), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8"), + removed_feats = c("var5", "var6", "var11") + ) + ), + "Removing" + ) %>% suppressMessages() + expect_message(expect_equal( + preprocess_data(test_df[1:3, ], + "outcome", + remove_var = NULL, + collapse_corr_feats = FALSE, + prefilter_threshold = -1, + impute_in_preprocessing = FALSE + ), + list( + dat_transformed = structure(list(outcome = c( + "normal", "normal", + "cancer" + ), var1 = c(-1, 0, 1), var8 = c( + -0.707106781186547, 0.707106781186547, + NA + ), var12 = c(-1, 0, 1), var3_yes = c(0, 1, 0), var4_1 = c( + 0, + 1, 0 + ), var7_1 = c(1, 1, 0), var2_a = c(1, 0, 0), var2_b = c( + 0, + 1, 0 + ), var2_c = c(0, 0, 1), var9_x = c(0, 1, 0), var9_y = c( + 0, + 0, 1 + ), var10_0 = c(0, 1, 0), var10_1 = c(1, 0, 0), var5 = c( + 0, + 0, 0 + ), var6 = c(0, 0, 0), var11 = c(1, 1, 1)), row.names = c( + NA, + -3L + ), class = c("tbl_df", "tbl", "data.frame")), grp_feats = NULL, + removed_feats = character(0) + ) + )) %>% suppressMessages() + expect_error(preprocess_data(test_df[1:3, ], + "outcome", + method = c("asdf") + )) %>% suppressMessages() + expect_message(expect_equal( + preprocess_data(test_df, + "outcome", + to_numeric = FALSE, + impute_in_preprocessing = FALSE + ), + list(dat_transformed = structure(list(outcome = c( + "normal", "normal", + "cancer" + ), var1 = c(-1, 0, 1), grp1 = c(0, 1, 0), grp2 = c( + 1, + 0, 0 + ), grp3 = c(0, 0, 1), var8 = c( + -0.707106781186547, 0.707106781186547, + NA + )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(var1 = "var1", grp1 = c( + "var10_0", "var12_2", + "var2_b", "var3_yes", "var4_1", "var9_x" + ), grp2 = c( + "var10_1", + "var12_1", "var2_a" + ), grp3 = c( + "var12_3", "var2_c", "var7_1", + "var9_y" + ), var8 = "var8"), removed_feats = c( + "var5", "var6", + "var11" + )) + )) %>% suppressMessages() +}) + + + From 596200629a5382dcbd88abf319287f1c053ca8d7 Mon Sep 17 00:00:00 2001 From: Megan Coden Date: Fri, 7 Apr 2023 15:44:51 -0400 Subject: [PATCH 05/16] adding todos for future --- R/run_ml.R | 5 +++-- tests/testthat/test-run_ml.R | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/run_ml.R b/R/run_ml.R index 50ebf1b4..0e618e0a 100644 --- a/R/run_ml.R +++ b/R/run_ml.R @@ -1,5 +1,6 @@ -# TODO: add call to the function, add the option -# TODO: figure out if there's a way to only specify option in one place +# TODO: test if calling these functions works +# TODO: combine calls into its own function and call it w train and test set (more readable) +# TODO: figure out if there's a way to only specify option in one place (for runml and for preprocess) #' Run the machine learning pipeline #' #' This function splits the data set into a train & test set, diff --git a/tests/testthat/test-run_ml.R b/tests/testthat/test-run_ml.R index e91a0611..c3fcba57 100644 --- a/tests/testthat/test-run_ml.R +++ b/tests/testthat/test-run_ml.R @@ -1,3 +1,4 @@ +# TODO: add tests for if impution param is set to true library(dplyr) options( warnPartialMatchArgs = FALSE, From 807903e43b494e19761e2010b94208d2121658c7 Mon Sep 17 00:00:00 2001 From: Megan Coden Date: Fri, 7 Apr 2023 16:11:06 -0400 Subject: [PATCH 06/16] added impution in run_ml by calling impute_during_training function --- R/impute_during_training.R | 24 +++++++++++++ R/run_ml.R | 69 +++----------------------------------- 2 files changed, 29 insertions(+), 64 deletions(-) create mode 100644 R/impute_during_training.R diff --git a/R/impute_during_training.R b/R/impute_during_training.R new file mode 100644 index 00000000..8a4ba434 --- /dev/null +++ b/R/impute_during_training.R @@ -0,0 +1,24 @@ +impute_during_training <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) { + source("preprocess.R") + dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]]) + dataset <- rm_missing_outcome(dataset, outcome_colname) + split_dat <- split_outcome_features(dataset, outcome_colname) + + features <- split_dat$features + removed_feats <- character(0) + if (to_numeric) { + feats <- change_to_num(features) %>% + remove_singleton_columns(threshold = prefilter_threshold) + removed_feats <- feats$removed_feats + features <- feats$dat + } + pbtick(progbar) + + nv_feats <- process_novar_feats(features, progbar = progbar) + pbtick(progbar) + split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) + pbtick(progbar) + cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) + pbtick(progbar) + return(cont_feats) +} \ No newline at end of file diff --git a/R/run_ml.R b/R/run_ml.R index 0e618e0a..64b70ff7 100644 --- a/R/run_ml.R +++ b/R/run_ml.R @@ -1,5 +1,4 @@ # TODO: test if calling these functions works -# TODO: combine calls into its own function and call it w train and test set (more readable) # TODO: figure out if there's a way to only specify option in one place (for runml and for preprocess) #' Run the machine learning pipeline #' @@ -209,69 +208,11 @@ run_ml <- train_data <- dataset[training_inds, ] test_data <- dataset[-training_inds, ] if (impute_in_training == TRUE) { - source("preprocess.R") - train_data[[outcome_colname]] <- replace_spaces(train_data[[outcome_colname]]) - train_data <- rm_missing_outcome(train_data, outcome_colname) - split_dat <- split_outcome_features(train_data, outcome_colname) - - features <- split_dat$features - removed_feats <- character(0) - if (to_numeric) { - feats <- change_to_num(features) %>% - remove_singleton_columns(threshold = prefilter_threshold) - removed_feats <- feats$removed_feats - features <- feats$dat - } - pbtick(progbar) - - nv_feats <- process_novar_feats(features, progbar = progbar) - pbtick(progbar) - split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) - pbtick(progbar) - cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) - pbtick(progbar) - # repeat with test data - test_data[[outcome_colname]] <- replace_spaces(test_data[[outcome_colname]]) - test_data <- rm_missing_outcome(test_data, outcome_colname) - split_dat <- split_outcome_features(test_data, outcome_colname) - - features <- split_dat$features - removed_feats <- character(0) - if (to_numeric) { - feats <- change_to_num(features) %>% - remove_singleton_columns(threshold = prefilter_threshold) - removed_feats <- feats$removed_feats - features <- feats$dat - } - pbtick(progbar) - - nv_feats <- process_novar_feats(features, progbar = progbar) - pbtick(progbar) - split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) - pbtick(progbar) - cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) - pbtick(progbar) - - - - - #source("impute.R") - #sapply_fn <- select_apply("sapply") - #cl_train <- sapply_fn(train_data, function(x) { - # class(x) - #}) - #cl_test <- sapply_fn(test_data, function(x) { - # class(x) - #}) - #missing_train <- - # is.na(train_data[, cl_train %in% c("integer", "numeric")]) - #missing_test <- - # is.na(test_data[, cl_test %in% c("integer", "numeric")]) - #n_missing_train <- sum(missing_train) - #n_missing_test <- sum(missing_test) - # do imputation - #impute_missing_vals(train_data, n_missing_train) - #impute_missing_vals(test_data, n_missing_test) + source("impute_during_training.R") + train_data <- impute_during_training(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE) + train_data <- train_data$transformed_cont + test_data <- impute_during_training(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE) + test_data <- test_data$transformed_cont } # train_groups & test_groups will be NULL if groups is NULL train_groups <- groups[training_inds] From 724cf60306edc6a687ca11139fd4230a98ddfa0e Mon Sep 17 00:00:00 2001 From: Megan Coden Date: Wed, 12 Apr 2023 16:55:17 -0400 Subject: [PATCH 07/16] refactor code and add tests --- R/impute.R | 33 +++++++++++++++ R/impute_during_training.R | 24 ----------- R/preprocess.R | 35 ++++------------ R/run_ml.R | 16 ++++--- tests/testthat/test-preprocess.R | 17 +++++++- tests/testthat/test-run_ml.R | 71 ++++++++++++++++++++++++++++++++ 6 files changed, 138 insertions(+), 58 deletions(-) delete mode 100644 R/impute_during_training.R diff --git a/R/impute.R b/R/impute.R index b84336eb..c7548fc4 100644 --- a/R/impute.R +++ b/R/impute.R @@ -13,4 +13,37 @@ impute <- function(transformed_cont, n_missing) { ) ) return (transformed_cont) +} + +prep_data <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) { + dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]]) + dataset <- rm_missing_outcome(dataset, outcome_colname) + split_dat <- split_outcome_features(dataset, outcome_colname) + + features <- split_dat$features + removed_feats <- character(0) + if (to_numeric) { + feats <- change_to_num(features) %>% + remove_singleton_columns(threshold = prefilter_threshold) + removed_feats <- feats$removed_feats + features <- feats$dat + } + pbtick(progbar) + + nv_feats <- process_novar_feats(features, progbar = progbar) + pbtick(progbar) + split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) + pbtick(progbar) + cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) + pbtick(progbar) + # combine all processed features + processed_feats <- dplyr::bind_cols( + cont_feats$transformed_cont, + split_feats$cat_feats, + nv_feats$novar_feats + ) + pbtick(progbar) + + processed_data <- list(cont_feats = cont_feats, removed_feats = removed_feats, split_dat = split_dat, processed_feats = processed_feats) + return(processed_data) } \ No newline at end of file diff --git a/R/impute_during_training.R b/R/impute_during_training.R deleted file mode 100644 index 8a4ba434..00000000 --- a/R/impute_during_training.R +++ /dev/null @@ -1,24 +0,0 @@ -impute_during_training <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) { - source("preprocess.R") - dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]]) - dataset <- rm_missing_outcome(dataset, outcome_colname) - split_dat <- split_outcome_features(dataset, outcome_colname) - - features <- split_dat$features - removed_feats <- character(0) - if (to_numeric) { - feats <- change_to_num(features) %>% - remove_singleton_columns(threshold = prefilter_threshold) - removed_feats <- feats$removed_feats - features <- feats$dat - } - pbtick(progbar) - - nv_feats <- process_novar_feats(features, progbar = progbar) - pbtick(progbar) - split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) - pbtick(progbar) - cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) - pbtick(progbar) - return(cont_feats) -} \ No newline at end of file diff --git a/R/preprocess.R b/R/preprocess.R index 5dfa01f3..ac5671d4 100644 --- a/R/preprocess.R +++ b/R/preprocess.R @@ -73,34 +73,15 @@ preprocess_data <- function(dataset, outcome_colname, check_outcome_column(dataset, outcome_colname, check_values = FALSE) check_remove_var(remove_var) pbtick(progbar) - dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]]) - dataset <- rm_missing_outcome(dataset, outcome_colname) - split_dat <- split_outcome_features(dataset, outcome_colname) - - features <- split_dat$features - removed_feats <- character(0) - if (to_numeric) { - feats <- change_to_num(features) %>% - remove_singleton_columns(threshold = prefilter_threshold) - removed_feats <- feats$removed_feats - features <- feats$dat - } - pbtick(progbar) - - nv_feats <- process_novar_feats(features, progbar = progbar) - pbtick(progbar) - split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) - pbtick(progbar) - cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) - pbtick(progbar) - + + processed_data <- prep_data(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) + removed_feats <- processed_data$removed_feats + processed_feats <- processed_data$processed_feats + split_dat <- processed_data$split_dat + cont_feats <- processed_data$cont_feats + # combine all processed features - processed_feats <- dplyr::bind_cols( - cont_feats$transformed_cont, - split_feats$cat_feats, - nv_feats$novar_feats - ) - pbtick(progbar) + # remove features with (near-)zero variance feats <- get_caret_processed_df(processed_feats, remove_var) diff --git a/R/run_ml.R b/R/run_ml.R index 64b70ff7..bfd20345 100644 --- a/R/run_ml.R +++ b/R/run_ml.R @@ -208,11 +208,17 @@ run_ml <- train_data <- dataset[training_inds, ] test_data <- dataset[-training_inds, ] if (impute_in_training == TRUE) { - source("impute_during_training.R") - train_data <- impute_during_training(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE) - train_data <- train_data$transformed_cont - test_data <- impute_during_training(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE) - test_data <- test_data$transformed_cont + + train_processed_data <- prep_data(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE) + train_processed_feats <- train_processed_data$processed_feats + split_dat <- train_processed_data$split_dat + train_data <- dplyr::bind_cols(split_dat$outcome, train_processed_feats) %>% + dplyr::as_tibble() + test_processed_data <- prep_data(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE) + test_processed_feats <- test_processed_data$processed_feats + split_dat <- test_processed_data$split_dat + test_data <- dplyr::bind_cols(split_dat$outcome, test_processed_feats) %>% + dplyr::as_tibble() } # train_groups & test_groups will be NULL if groups is NULL train_groups <- groups[training_inds] diff --git a/tests/testthat/test-preprocess.R b/tests/testthat/test-preprocess.R index 539f79e6..2a95e3e3 100644 --- a/tests/testthat/test-preprocess.R +++ b/tests/testthat/test-preprocess.R @@ -442,14 +442,14 @@ test_that("process_cat_feats works", { test_that("process_cont_feats works", { expect_equal( - process_cont_feats(dplyr::as_tibble(test_df[1:3, 2]), method = c("center", "scale")), + process_cont_feats(dplyr::as_tibble(test_df[1:3, 2]), method = c("center", "scale"), impute_in_preprocessing = TRUE), list(transformed_cont = structure(list(value = c(-1, 0, 1)), row.names = c( NA, -3L ), class = c("tbl_df", "tbl", "data.frame")), removed_cont = character(0)) ) %>% suppressMessages() expect_message(expect_equal( - process_cont_feats(test_df[1:3, c(2, 9)], method = c("center", "scale")), + process_cont_feats(test_df[1:3, c(2, 9)], method = c("center", "scale"), impute_in_preprocessing = TRUE), list(transformed_cont = structure(list(var1 = c(-1, 0, 1), var8 = c( -0.707106781186547, 0.707106781186547, 0 @@ -951,4 +951,17 @@ test_that("setting impute param to false doesn't impute data", { }) +test_that("default parameter for impute_in_preprocessing is TRUE", { + expect_message( + expect_equal( + preprocess_data(test_df, "outcome", + prefilter_threshold = -1 + ), + preprocess_data(test_df, "outcome", + prefilter_threshold = -1, impute_in_preprocessing = TRUE)) + , + "Removed " + ) %>% suppressMessages() +}) + diff --git a/tests/testthat/test-run_ml.R b/tests/testthat/test-run_ml.R index c3fcba57..b196260f 100644 --- a/tests/testthat/test-run_ml.R +++ b/tests/testthat/test-run_ml.R @@ -310,3 +310,74 @@ test_that("models use case weights when provided", { expect_true("weights" %in% colnames(results_custom_train$trained_model$pred)) expect_false("weights" %in% colnames(otu_mini_bin_results_glmnet$trained_model$pred)) }) + + + +# TODO: fix errors +test_that("make sure impute function on train data set works", { + train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), + var1 = 1:4, + var2 = c("a", "b", "c", "d"), + var3 = c("no", "yes", "no", "no"), + var4 = c(0, 1, 0, 0), + var5 = c(0, 0, 0, 0), + var6 = c("no", "no", "no", "no"), + var7 = c(1, 1, 0, 0), + var8 = c(5, 6, NA, 7), + var9 = c(NA, "x", "y", "z"), + var10 = c(1, 0, NA, NA)) + test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA), + var12 = c("1", "2", "3", "4")) + train_data_output <- data.frame( + outcome = c("normal", "normal", "cancer"), + var1 = c(-1, 0, 1), var8 = c(-0.70710678, 0.70710678, 0), var3_yes = c(0, 1, 0), var7_1=c(1, 1, 0), var2_a = c(1, 0, 0), var2_b=c(0, 1, 0), var2_c=c(0, 0, 1), var9_x=c(0, 1, 0), var9_y = c(0, 0, 1), var6=c(0, 0, 0)) + results_output <- prep_data(train_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE) + final_results <- results_output$processed_feats + split_dat <- results_output$split_dat + train_df <- dplyr::bind_cols(split_dat$outcome, final_results) %>% + dplyr::as_tibble() + expect_equal(train_df, train_data_output) +}) +test_that("make sure impute function on test data set works", { + train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), + var1 = 1:4, + var2 = c("a", "b", "c", "d"), + var3 = c("no", "yes", "no", "no"), + var4 = c(0, 1, 0, 0), + var5 = c(0, 0, 0, 0), + var6 = c("no", "no", "no", "no"), + var7 = c(1, 1, 0, 0), + var8 = c(5, 6, NA, 7), + var9 = c(NA, "x", "y", "z"), + var10 = c(1, 0, NA, NA)) + test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA), + var12 = c("1", "2", "3", "4")) + test_data_output <- data.frame( + outcome = c("normal", "normal", "cancer"), + var12 = c(-1, 0, 1), var11 = c(1, 1, 1)) + results_output <- prep_data(test_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE) + final_results <- results_output$processed_feats + split_dat <- results_output$split_dat + test_df <- dplyr::bind_cols(split_dat$outcome, final_results) %>% + dplyr::as_tibble() + expect_equal(test_df, test_data_output) +}) + +temp_df <- otu_mini_bin +Otu00011 <- c(6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, + 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, + 5, 5) +bin2 <- otu_mini_bin %>% mutate(Otu00011) +# TODO: how to test? DOES OUR FUNC GET CALLED IF FLAG SET TO TRUE, NOT CALLED WHEN SET TO FALSE + +#expect_equal(run_ml(test_df, "glmnet", 'outcome', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, NA), run_ml(test_df, "glmnet", 'outcome', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, NA, FALSE)) + +#run_ml(bin2, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE) +#run_ml(otu_mini_bin, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE) From a2b15edeb57e2912e2b214e16b85f9410acbc07b Mon Sep 17 00:00:00 2001 From: Megan Coden Date: Wed, 12 Apr 2023 17:54:38 -0400 Subject: [PATCH 08/16] include tests to validate that the flag for impute during training works --- tests/testthat/test-run_ml.R | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/testthat/test-run_ml.R b/tests/testthat/test-run_ml.R index b196260f..a188b8e1 100644 --- a/tests/testthat/test-run_ml.R +++ b/tests/testthat/test-run_ml.R @@ -311,9 +311,6 @@ test_that("models use case weights when provided", { expect_false("weights" %in% colnames(otu_mini_bin_results_glmnet$trained_model$pred)) }) - - -# TODO: fix errors test_that("make sure impute function on train data set works", { train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var1 = 1:4, @@ -328,7 +325,7 @@ test_that("make sure impute function on train data set works", { var10 = c(1, 0, NA, NA)) test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA), var12 = c("1", "2", "3", "4")) - train_data_output <- data.frame( + train_data_output <- dplyr::tibble( outcome = c("normal", "normal", "cancer"), var1 = c(-1, 0, 1), var8 = c(-0.70710678, 0.70710678, 0), var3_yes = c(0, 1, 0), var7_1=c(1, 1, 0), var2_a = c(1, 0, 0), var2_b=c(0, 1, 0), var2_c=c(0, 0, 1), var9_x=c(0, 1, 0), var9_y = c(0, 0, 1), var6=c(0, 0, 0)) results_output <- prep_data(train_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE) @@ -338,6 +335,7 @@ test_that("make sure impute function on train data set works", { dplyr::as_tibble() expect_equal(train_df, train_data_output) }) + test_that("make sure impute function on test data set works", { train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var1 = 1:4, @@ -352,7 +350,7 @@ test_that("make sure impute function on test data set works", { var10 = c(1, 0, NA, NA)) test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA), var12 = c("1", "2", "3", "4")) - test_data_output <- data.frame( + test_data_output <- dplyr::tibble( outcome = c("normal", "normal", "cancer"), var12 = c(-1, 0, 1), var11 = c(1, 1, 1)) results_output <- prep_data(test_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE) @@ -374,10 +372,22 @@ Otu00011 <- c(6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5) -bin2 <- otu_mini_bin %>% mutate(Otu00011) -# TODO: how to test? DOES OUR FUNC GET CALLED IF FLAG SET TO TRUE, NOT CALLED WHEN SET TO FALSE +mini_bin_with_nas <- otu_mini_bin %>% mutate(Otu00011) -#expect_equal(run_ml(test_df, "glmnet", 'outcome', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, NA), run_ml(test_df, "glmnet", 'outcome', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, NA, FALSE)) +test_that("data gets imputed when impute_in_training is set to TRUE", { + results <- run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE) + temp <- colSums(is.na(results$test_data)) + num_nas <- sum(temp) + expect_equal(0, num_nas) %>% suppressMessages() + temp <- colSums(is.na(results$trained_model$trainingData)) + num_nas <- sum(temp) + expect_equal(0, num_nas) %>% suppressMessages() +}) + +test_that("data is not imputed when impute_in_training is set to FALSE", { + expect_error(run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, FALSE),NULL) %>% suppressMessages() +}) -#run_ml(bin2, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE) -#run_ml(otu_mini_bin, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE) +test_that("data is not imputed when impute_in_training is not set", { + expect_error(run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019),NULL) %>% suppressMessages() +}) \ No newline at end of file From 998a158dab4d9b3322834df0d94b5ea6ec2358f6 Mon Sep 17 00:00:00 2001 From: Megan Coden <55669799+megancoden@users.noreply.github.com> Date: Wed, 12 Apr 2023 18:02:46 -0400 Subject: [PATCH 09/16] update with imputation changes --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index c73251cd..b4c5a8bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,9 @@ - New column `lower` and `upper` to report the bounds of the empirical 95% confidence interval from the permutation test. See `vignette('parallel')` for an example of plotting feature importance with confidence intervals. - Minor documentation improvements (#323, @kelly-sovacool). +- Added option to impute missing data during training rather than preprocessing (#301, @megancoden and @shah-priyal). + - Added impute_in_training option to `run_ml()`, which defaults to FALSE. + - Added impute_in_preprocessing option to `preprocess()`, which defaults to TRUE. # mikropml 1.5.0 From 43cdd4b89dd983dd1b20934341ec7da4955b73b9 Mon Sep 17 00:00:00 2001 From: shah-priyal Date: Tue, 18 Apr 2023 15:16:39 -0400 Subject: [PATCH 10/16] changed file so it only does imputation step --- R/impute.R | 70 +++++++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 46 deletions(-) diff --git a/R/impute.R b/R/impute.R index c7548fc4..9cd97886 100644 --- a/R/impute.R +++ b/R/impute.R @@ -1,49 +1,27 @@ -impute <- function(transformed_cont, n_missing) { - transformed_cont <- sapply_fn(transformed_cont, function(x) { - if (class(x) %in% c("integer", "numeric")) { - m <- is.na(x) - x[m] <- stats::median(x, na.rm = TRUE) - } - return(x) - }) %>% dplyr::as_tibble() - message( - paste0( - n_missing, - " missing continuous value(s) were imputed using the median value of the feature." +impute <- function(transformed_cont) { + sapply_fn <- select_apply("sapply") + cl <- sapply_fn(transformed_cont, function(x) { + class(x) + }) + missing <- + is.na(transformed_cont[, cl %in% c("integer", "numeric")]) + n_missing <- sum(missing) + if (n_missing > 0) { + transformed_cont <- sapply_fn(transformed_cont, function(x) { + if (class(x) %in% c("integer", "numeric")) { + m <- is.na(x) + x[m] <- stats::median(x, na.rm = TRUE) + } + message(typeof(x)) + message(class(x)) + return(x) + }) %>% dplyr::as_tibble() + message( + paste0( + n_missing, + " missing continuous value(s) were imputed using the median value of the feature." + ) ) - ) - return (transformed_cont) -} - -prep_data <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) { - dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]]) - dataset <- rm_missing_outcome(dataset, outcome_colname) - split_dat <- split_outcome_features(dataset, outcome_colname) - - features <- split_dat$features - removed_feats <- character(0) - if (to_numeric) { - feats <- change_to_num(features) %>% - remove_singleton_columns(threshold = prefilter_threshold) - removed_feats <- feats$removed_feats - features <- feats$dat } - pbtick(progbar) - - nv_feats <- process_novar_feats(features, progbar = progbar) - pbtick(progbar) - split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) - pbtick(progbar) - cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) - pbtick(progbar) - # combine all processed features - processed_feats <- dplyr::bind_cols( - cont_feats$transformed_cont, - split_feats$cat_feats, - nv_feats$novar_feats - ) - pbtick(progbar) - - processed_data <- list(cont_feats = cont_feats, removed_feats = removed_feats, split_dat = split_dat, processed_feats = processed_feats) - return(processed_data) + return (transformed_cont)git } \ No newline at end of file From cf5bb4e3bdf69e40697bb9893b9bcf30321c0b1d Mon Sep 17 00:00:00 2001 From: shah-priyal Date: Tue, 18 Apr 2023 15:17:22 -0400 Subject: [PATCH 11/16] removed extra lines and prep_data steps back to file --- R/preprocess.R | 75 +++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/R/preprocess.R b/R/preprocess.R index 52e80d20..20495aaf 100644 --- a/R/preprocess.R +++ b/R/preprocess.R @@ -1,6 +1,3 @@ -# TODO: set this for a generic path (probably using here::here) -library(here) -here("R", "impute.R") #' Preprocess data prior to running machine learning #' #' Function to preprocess your data for input into [run_ml()]. @@ -74,15 +71,35 @@ preprocess_data <- function(dataset, outcome_colname, check_remove_var(remove_var) pbtick(progbar) - processed_data <- prep_data(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) - removed_feats <- processed_data$removed_feats - processed_feats <- processed_data$processed_feats - split_dat <- processed_data$split_dat - cont_feats <- processed_data$cont_feats + dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]]) + dataset <- rm_missing_outcome(dataset, outcome_colname) + split_dat <- split_outcome_features(dataset, outcome_colname) + + features <- split_dat$features + removed_feats <- character(0) + if (to_numeric) { + feats <- change_to_num(features) %>% + remove_singleton_columns(threshold = prefilter_threshold) + removed_feats <- feats$removed_feats + features <- feats$dat + } + pbtick(progbar) + + nv_feats <- process_novar_feats(features, progbar = progbar) + pbtick(progbar) + split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar) + pbtick(progbar) + cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing) + pbtick(progbar) # combine all processed features + processed_feats <- dplyr::bind_cols( + cont_feats$transformed_cont, + split_feats$cat_feats, + nv_feats$novar_feats + ) + pbtick(progbar) - # remove features with (near-)zero variance feats <- get_caret_processed_df(processed_feats, remove_var) processed_feats <- feats$processed @@ -124,17 +141,15 @@ preprocess_data <- function(dataset, outcome_colname, #' @inheritParams run_ml #' #' @return dataset with no missing outcomes -#' @keywords internal +#' @noRd #' @author Zena Lapp, \email{zenalapp@@umich.edu} #' #' @examples -#' \dontrun{ #' rm_missing_outcome(mikropml::otu_mini_bin, "dx") #' #' test_df <- mikropml::otu_mini_bin #' test_df[1:100, "dx"] <- NA #' rm_missing_outcome(test_df, "dx") -#' } rm_missing_outcome <- function(dataset, outcome_colname) { n_outcome_na <- sum(is.na(dataset %>% dplyr::pull(outcome_colname))) total_outcomes <- nrow(dataset) @@ -152,13 +167,11 @@ rm_missing_outcome <- function(dataset, outcome_colname) { #' @param features dataframe of features for machine learning #' #' @return dataframe with numeric columns where possible -#' @keywords internal +#' @noRd #' @author Zena Lapp, \email{zenalapp@@umich.edu} #' #' @examples -#' \dontrun{ #' class(change_to_num(data.frame(val = c("1", "2", "3")))[[1]]) -#' } change_to_num <- function(features) { lapply_fn <- select_apply(fun = "lapply") check_features(features, check_missing = FALSE) @@ -212,13 +225,11 @@ remove_singleton_columns <- function(dat, threshold = 1) { #' @param progbar optional progress bar (default: `NULL`) #' #' @return list of two dataframes: features with variability (unprocessed) and without (processed) -#' @keywords internal +#' @noRd #' @author Zena Lapp, \email{zenalapp@@umich.edu} #' #' @examples -#' \dontrun{ #' process_novar_feats(mikropml::otu_small[, 2:ncol(otu_small)]) -#' } process_novar_feats <- function(features, progbar = NULL) { novar_feats <- NULL var_feats <- NULL @@ -281,13 +292,11 @@ process_novar_feats <- function(features, progbar = NULL) { #' @inheritParams process_novar_feats #' #' @return list of two dataframes: categorical (processed) and continuous features (unprocessed) -#' @keywords internal +#' @noRd #' @author Zena Lapp, \email{zenalapp@@umich.edu} #' #' @examples -#' \dontrun{ #' process_cat_feats(mikropml::otu_small[, 2:ncol(otu_small)]) -#' } process_cat_feats <- function(features, progbar = NULL) { feature_design_cat_mat <- NULL cont_feats <- NULL @@ -351,13 +360,11 @@ process_cat_feats <- function(features, progbar = NULL) { #' @inheritParams get_caret_processed_df #' #' @return dataframe of preprocessed features -#' @keywords internal +#' @noRd #' @author Zena Lapp, \email{zenalapp@@umich.edu} #' #' @examples -#' \dontrun{ #' process_cont_feats(mikropml::otu_small[, 2:ncol(otu_small)], c("center", "scale")) -#' } process_cont_feats <- function(features, method, impute_in_preprocessing) { transformed_cont <- NULL removed_cont <- NULL @@ -373,22 +380,12 @@ process_cont_feats <- function(features, method, impute_in_preprocessing) { transformed_cont <- feats$processed removed_cont <- feats$removed } - sapply_fn <- select_apply("sapply") - cl <- sapply_fn(transformed_cont, function(x) { - class(x) - }) - missing <- - is.na(transformed_cont[, cl %in% c("integer", "numeric")]) - n_missing <- sum(missing) - if (n_missing > 0) { # impute missing data using the median value if (impute_in_preprocessing) { - source("impute.R") - transformed_cont <- impute(transformed_cont, n_missing) + transformed_cont <- impute(transformed_cont) } } } - } return(list(transformed_cont = transformed_cont, removed_cont = removed_cont)) } @@ -425,11 +422,10 @@ get_caret_processed_df <- function(features, method) { #' @inheritParams process_novar_feats #' @param full_rank whether matrix should be full rank or not (see `[caret::dummyVars]) #' @return design matrix -#' @keywords internal +#' @noRd #' @author Zena Lapp, \email{zenalapp@@umich.edu} #' #' @examples -#' \dontrun{ #' df <- data.frame( #' outcome = c("normal", "normal", "cancer"), #' var1 = 1:3, @@ -438,7 +434,6 @@ get_caret_processed_df <- function(features, method) { #' var4 = c(0, 1, 0) #' ) #' get_caret_dummyvars_df(df, TRUE) -#' } get_caret_dummyvars_df <- function(features, full_rank = FALSE, progbar = NULL) { check_features(features, check_missing = FALSE) if (!is.null(process_novar_feats(features, progbar = progbar)$novar_feats)) { @@ -456,13 +451,11 @@ get_caret_dummyvars_df <- function(features, full_rank = FALSE, progbar = NULL) #' @inheritParams group_correlated_features #' #' @return features where perfectly correlated ones are collapsed -#' @keywords internal +#' @noRd #' @author Zena Lapp, \email{zenalapp@@umich.edu} #' #' @examples -#' \dontrun{ #' collapse_correlated_features(mikropml::otu_small[, 2:ncol(otu_small)]) -#' } collapse_correlated_features <- function(features, group_neg_corr = TRUE, progbar = NULL) { feats_nocorr <- features grp_feats <- NULL From eb81a6181d8d2a18288768945c55b35458a5cec8 Mon Sep 17 00:00:00 2001 From: shah-priyal Date: Tue, 18 Apr 2023 15:18:43 -0400 Subject: [PATCH 12/16] removed extra todos and update file to only include imputation step --- R/run_ml.R | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/R/run_ml.R b/R/run_ml.R index bfd20345..4526b199 100644 --- a/R/run_ml.R +++ b/R/run_ml.R @@ -1,5 +1,3 @@ -# TODO: test if calling these functions works -# TODO: figure out if there's a way to only specify option in one place (for runml and for preprocess) #' Run the machine learning pipeline #' #' This function splits the data set into a train & test set, @@ -146,7 +144,7 @@ run_ml <- group_partitions = NULL, corr_thresh = 1, seed = NA, - impute_in_training = FALSE, + impute_after_split = FALSE, ...) { check_all( dataset, @@ -207,18 +205,9 @@ run_ml <- train_data <- dataset[training_inds, ] test_data <- dataset[-training_inds, ] - if (impute_in_training == TRUE) { - - train_processed_data <- prep_data(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE) - train_processed_feats <- train_processed_data$processed_feats - split_dat <- train_processed_data$split_dat - train_data <- dplyr::bind_cols(split_dat$outcome, train_processed_feats) %>% - dplyr::as_tibble() - test_processed_data <- prep_data(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE) - test_processed_feats <- test_processed_data$processed_feats - split_dat <- test_processed_data$split_dat - test_data <- dplyr::bind_cols(split_dat$outcome, test_processed_feats) %>% - dplyr::as_tibble() + if (impute_after_split == TRUE) { + train_data <- impute(train_data) + test_data <- impute(test_data) } # train_groups & test_groups will be NULL if groups is NULL train_groups <- groups[training_inds] From e6726969f16a0d80fa936419b0f67f9ca777afa6 Mon Sep 17 00:00:00 2001 From: shah-priyal Date: Tue, 18 Apr 2023 15:19:53 -0400 Subject: [PATCH 13/16] changed tests to adjust expected output of imputed data --- tests/testthat/test-run_ml.R | 79 ++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/tests/testthat/test-run_ml.R b/tests/testthat/test-run_ml.R index a188b8e1..ee140858 100644 --- a/tests/testthat/test-run_ml.R +++ b/tests/testthat/test-run_ml.R @@ -1,4 +1,3 @@ -# TODO: add tests for if impution param is set to true library(dplyr) options( warnPartialMatchArgs = FALSE, @@ -312,7 +311,7 @@ test_that("models use case weights when provided", { }) test_that("make sure impute function on train data set works", { - train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), + train_data <- data.frame(outcome = c("normal", "normal", "cancer", "cancer"), var1 = 1:4, var2 = c("a", "b", "c", "d"), var3 = c("no", "yes", "no", "no"), @@ -321,45 +320,47 @@ test_that("make sure impute function on train data set works", { var6 = c("no", "no", "no", "no"), var7 = c(1, 1, 0, 0), var8 = c(5, 6, NA, 7), - var9 = c(NA, "x", "y", "z"), + var9 = c(NA, 1, 1, 0), var10 = c(1, 0, NA, NA)) - test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA), - var12 = c("1", "2", "3", "4")) + test_data <- data.frame(outcome = c("normal", "normal", "cancer", "cancer"), var11 = c(1, 1, NA, NA), + var12 = c(1, 2, NA, 4)) train_data_output <- dplyr::tibble( - outcome = c("normal", "normal", "cancer"), - var1 = c(-1, 0, 1), var8 = c(-0.70710678, 0.70710678, 0), var3_yes = c(0, 1, 0), var7_1=c(1, 1, 0), var2_a = c(1, 0, 0), var2_b=c(0, 1, 0), var2_c=c(0, 0, 1), var9_x=c(0, 1, 0), var9_y = c(0, 0, 1), var6=c(0, 0, 0)) - results_output <- prep_data(train_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE) - final_results <- results_output$processed_feats - split_dat <- results_output$split_dat - train_df <- dplyr::bind_cols(split_dat$outcome, final_results) %>% - dplyr::as_tibble() - expect_equal(train_df, train_data_output) + outcome = c("normal", "normal", "cancer", "cancer"), + var1 = c('1', '2', '3', '4'), + var2 = c("a", "b", "c", "d"), + var3 = c("no", "yes", "no", "no"), + var4 = c('0', '1', '0', '0'), + var5 = c('0', '0', '0', '0'), + var6 = c("no", "no", "no", "no"), + var7 = c('1', '1', '0', '0'), + var8 = c('5', '6', '6', '7'), + var9 = c('1', '1', '1', '0'), + var10 = c('1', '0', '0.5', '0.5')) + results_output <- impute(train_data) + expect_equal(train_data_output, results_output) }) test_that("make sure impute function on test data set works", { - train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), - var1 = 1:4, - var2 = c("a", "b", "c", "d"), - var3 = c("no", "yes", "no", "no"), - var4 = c(0, 1, 0, 0), - var5 = c(0, 0, 0, 0), - var6 = c("no", "no", "no", "no"), - var7 = c(1, 1, 0, 0), - var8 = c(5, 6, NA, 7), - var9 = c(NA, "x", "y", "z"), - var10 = c(1, 0, NA, NA)) - test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA), - var12 = c("1", "2", "3", "4")) - test_data_output <- dplyr::tibble( - outcome = c("normal", "normal", "cancer"), - var12 = c(-1, 0, 1), var11 = c(1, 1, 1)) - results_output <- prep_data(test_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE) - final_results <- results_output$processed_feats - split_dat <- results_output$split_dat - test_df <- dplyr::bind_cols(split_dat$outcome, final_results) %>% - dplyr::as_tibble() - expect_equal(test_df, test_data_output) -}) + train_data <- data.frame(outcome = c("normal", "normal", "cancer", "cancer"), + var1 = 1:4, + var2 = c("a", "b", "c", "d"), + var3 = c("no", "yes", "no", "no"), + var4 = c(0, 1, 0, 0), + var5 = c(0, 0, 0, 0), + var6 = c("no", "no", "no", "no"), + var7 = c(1, 1, 0, 0), + var8 = c(5, 6, NA, 7), + var9 = c(NA, 1, 1, 0), + var10 = c(1, 0, NA, NA)) + test_data <- data.frame(outcome = c("normal", "normal", "cancer", "cancer"), var11 = c(1, 1, NA, NA), + var12 = c(1, 2, NA, 4)) + test_data_output <- dplyr::tibble( + outcome = c("normal", "normal", "cancer", "cancer"), var11 = c('1', '1', '1', '1'), + var12 = c('1', '2', '2', '4')) + results_output <- impute(test_data) + expect_equal(test_data_output, results_output) + }) + temp_df <- otu_mini_bin Otu00011 <- c(6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, @@ -374,7 +375,7 @@ Otu00011 <- c(6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6 5, 5) mini_bin_with_nas <- otu_mini_bin %>% mutate(Otu00011) -test_that("data gets imputed when impute_in_training is set to TRUE", { +test_that("data gets imputed when impute_after_split is set to TRUE", { results <- run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE) temp <- colSums(is.na(results$test_data)) num_nas <- sum(temp) @@ -384,10 +385,10 @@ test_that("data gets imputed when impute_in_training is set to TRUE", { expect_equal(0, num_nas) %>% suppressMessages() }) -test_that("data is not imputed when impute_in_training is set to FALSE", { +test_that("data is not imputed when impute_after_split is set to FALSE", { expect_error(run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, FALSE),NULL) %>% suppressMessages() }) -test_that("data is not imputed when impute_in_training is not set", { +test_that("data is not imputed when impute_after_split is not set", { expect_error(run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019),NULL) %>% suppressMessages() }) \ No newline at end of file From 58d7f26b8ce141625c946bf8899d1feb1f521079 Mon Sep 17 00:00:00 2001 From: Megan Coden <55669799+megancoden@users.noreply.github.com> Date: Tue, 18 Apr 2023 15:26:12 -0400 Subject: [PATCH 14/16] include info about setting impute_in_preprocessing to FALSE --- vignettes/preprocess.Rmd | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vignettes/preprocess.Rmd b/vignettes/preprocess.Rmd index fe8ab009..0437bfde 100644 --- a/vignettes/preprocess.Rmd +++ b/vignettes/preprocess.Rmd @@ -370,6 +370,13 @@ going on (i.e. the median value is used): preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL) ``` +To delay this step until after the train/test split in run_ml.R, set the impute_in_preprocessing option to FALSE as shown here: + +```{r} +# preprocess raw dataset with missing value in continuous feature +preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL, impute_in_preprocessing=FALSE) +``` + ## Putting it all together Here's some more complicated example raw data that puts everything we discussed together: From 8a18c1d341533695bc6118eb5fb0a64c07cfbb84 Mon Sep 17 00:00:00 2001 From: shah-priyal Date: Tue, 18 Apr 2023 15:35:03 -0400 Subject: [PATCH 15/16] added roxygen docstring for impute function --- R/impute.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/R/impute.R b/R/impute.R index 9cd97886..149e5316 100644 --- a/R/impute.R +++ b/R/impute.R @@ -1,3 +1,13 @@ +#' Replace NA values with the median value of the column for continious variables in the dataset +#' +#' @param transformed_cont Data frame that may include NA values in one or more columns +#' +#' @return Data frame that has no NA values in continious numeric columns +#' +#' @examples +#' transformed_cont <- impute(transformed_cont) +#' train_data <- impute(train_data) +#' test_data <- impute(test_data) impute <- function(transformed_cont) { sapply_fn <- select_apply("sapply") cl <- sapply_fn(transformed_cont, function(x) { @@ -23,5 +33,5 @@ impute <- function(transformed_cont) { ) ) } - return (transformed_cont)git + return (transformed_cont) } \ No newline at end of file From 8c81b35d10ed35f84abbacead68d8e3de3068583 Mon Sep 17 00:00:00 2001 From: shah-priyal <55864447+shah-priyal@users.noreply.github.com> Date: Tue, 18 Apr 2023 15:47:09 -0400 Subject: [PATCH 16/16] Update preprocess.Rmd --- vignettes/preprocess.Rmd | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vignettes/preprocess.Rmd b/vignettes/preprocess.Rmd index 0437bfde..31e7ab1c 100644 --- a/vignettes/preprocess.Rmd +++ b/vignettes/preprocess.Rmd @@ -369,13 +369,21 @@ going on (i.e. the median value is used): # preprocess raw dataset with missing value in continuous feature preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL) ``` - +#### Impute after the train/test split in run_ml.R To delay this step until after the train/test split in run_ml.R, set the impute_in_preprocessing option to FALSE as shown here: ```{r} # preprocess raw dataset with missing value in continuous feature preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL, impute_in_preprocessing=FALSE) ``` +To impute the data after the train/test split in run_ml.R, set the impute_after_split option to TRUE as shown here: +```{r, eval = FALSE} +results <- run_ml(otu_mini_bin, + "glmnet", + outcome_colname = "dx", + seed = 2019, impute_after_split = TRUE +) +``` ## Putting it all together