From 700c7f7d3cc8ead6172fe7e7a3ceb676b0d9a930 Mon Sep 17 00:00:00 2001
From: Megan Coden <mlcoden@umich.edu>
Date: Fri, 7 Apr 2023 15:29:08 -0400
Subject: [PATCH 01/16] moved imputation code to a function

---
 R/impute.R     | 16 ++++++++++++++++
 R/preprocess.R | 23 +++++++----------------
 2 files changed, 23 insertions(+), 16 deletions(-)
 create mode 100644 R/impute.R

diff --git a/R/impute.R b/R/impute.R
new file mode 100644
index 00000000..b84336eb
--- /dev/null
+++ b/R/impute.R
@@ -0,0 +1,16 @@
+impute <- function(transformed_cont, n_missing) {
+  transformed_cont <- sapply_fn(transformed_cont, function(x) {
+    if (class(x) %in% c("integer", "numeric")) {
+      m <- is.na(x)
+      x[m] <- stats::median(x, na.rm = TRUE)
+    }
+    return(x)
+  }) %>% dplyr::as_tibble()
+  message(
+    paste0(
+      n_missing,
+      " missing continuous value(s) were imputed using the median value of the feature."
+    )
+  )
+  return (transformed_cont)
+}
\ No newline at end of file
diff --git a/R/preprocess.R b/R/preprocess.R
index 4418e416..119ad8b3 100644
--- a/R/preprocess.R
+++ b/R/preprocess.R
@@ -60,7 +60,7 @@ preprocess_data <- function(dataset, outcome_colname,
                             method = c("center", "scale"),
                             remove_var = "nzv", collapse_corr_feats = TRUE,
                             to_numeric = TRUE, group_neg_corr = TRUE,
-                            prefilter_threshold = 1) {
+                            prefilter_threshold = 1, impute_in_preprocessing = TRUE) {
   progbar <- NULL
   if (isTRUE(check_packages_installed("progressr"))) {
     progbar <- progressr::progressor(steps = 20, message = "preprocessing")
@@ -88,7 +88,7 @@ preprocess_data <- function(dataset, outcome_colname,
   pbtick(progbar)
   split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
   pbtick(progbar)
-  cont_feats <- process_cont_feats(split_feats$cont_feats, method)
+  cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
   pbtick(progbar)
 
   # combine all processed features
@@ -364,7 +364,7 @@ process_cat_feats <- function(features, progbar = NULL) {
 #'
 #' @examples
 #' process_cont_feats(mikropml::otu_small[, 2:ncol(otu_small)], c("center", "scale"))
-process_cont_feats <- function(features, method) {
+process_cont_feats <- function(features, method, impute_in_preprocessing) {
   transformed_cont <- NULL
   removed_cont <- NULL
 
@@ -388,19 +388,10 @@ process_cont_feats <- function(features, method) {
       n_missing <- sum(missing)
       if (n_missing > 0) {
         # impute missing data using the median value
-        transformed_cont <- sapply_fn(transformed_cont, function(x) {
-          if (class(x) %in% c("integer", "numeric")) {
-            m <- is.na(x)
-            x[m] <- stats::median(x, na.rm = TRUE)
-          }
-          return(x)
-        }) %>% dplyr::as_tibble()
-        message(
-          paste0(
-            n_missing,
-            " missing continuous value(s) were imputed using the median value of the feature."
-          )
-        )
+        if (impute_in_preprocessing) {
+          source("impute.R")
+          transformed_cont <- impute(transformed_cont, n_missing)
+        }
       }
     }
   }

From cd57a1d1027b1d511332eb00d41a1d3b1696377b Mon Sep 17 00:00:00 2001
From: Megan Coden <mlcoden@umich.edu>
Date: Fri, 7 Apr 2023 15:32:43 -0400
Subject: [PATCH 02/16] add impute param in func call

---
 R/run_ml.R | 106 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 87 insertions(+), 19 deletions(-)

diff --git a/R/run_ml.R b/R/run_ml.R
index 7abe8b82..50ebf1b4 100644
--- a/R/run_ml.R
+++ b/R/run_ml.R
@@ -1,3 +1,5 @@
+# TODO: add call to the function, add the option
+# TODO: figure out if there's a way to only specify option in one place
 #' Run the machine learning pipeline
 #'
 #' This function splits the data set into a train & test set,
@@ -144,6 +146,7 @@ run_ml <-
            group_partitions = NULL,
            corr_thresh = 1,
            seed = NA,
+           impute_in_training = FALSE,
            ...) {
     check_all(
       dataset,
@@ -162,7 +165,7 @@ run_ml <-
     if (!is.na(seed)) {
       set.seed(seed)
     }
-
+    
     # `future.apply` is required for `find_feature_importance()`.
     # check it here to adhere to the fail fast principle.
     if (find_feature_importance) {
@@ -173,20 +176,20 @@ run_ml <-
     if (find_feature_importance) {
       check_cat_feats(dataset %>% dplyr::select(-outcome_colname))
     }
-
+    
     dataset <- dataset %>%
       randomize_feature_order(outcome_colname) %>%
       # convert tibble to dataframe to silence warning from caret::train():
       # "Warning: Setting row names on a tibble is deprecated.."
       as.data.frame()
-
+    
     outcomes_vctr <- dataset %>% dplyr::pull(outcome_colname)
-
+    
     if (length(training_frac) == 1) {
       training_inds <- get_partition_indices(outcomes_vctr,
-        training_frac = training_frac,
-        groups = groups,
-        group_partitions = group_partitions
+                                             training_frac = training_frac,
+                                             groups = groups,
+                                             group_partitions = group_partitions
       )
     } else {
       training_inds <- training_frac
@@ -201,30 +204,95 @@ run_ml <-
     }
     check_training_frac(training_frac)
     check_training_indices(training_inds, dataset)
-
+    
     train_data <- dataset[training_inds, ]
     test_data <- dataset[-training_inds, ]
+    if (impute_in_training == TRUE) {
+      source("preprocess.R")
+      train_data[[outcome_colname]] <- replace_spaces(train_data[[outcome_colname]])
+      train_data <- rm_missing_outcome(train_data, outcome_colname)
+      split_dat <- split_outcome_features(train_data, outcome_colname)
+      
+      features <- split_dat$features
+      removed_feats <- character(0)
+      if (to_numeric) {
+        feats <- change_to_num(features) %>%
+          remove_singleton_columns(threshold = prefilter_threshold)
+        removed_feats <- feats$removed_feats
+        features <- feats$dat
+      }
+      pbtick(progbar)
+      
+      nv_feats <- process_novar_feats(features, progbar = progbar)
+      pbtick(progbar)
+      split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
+      pbtick(progbar)
+      cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
+      pbtick(progbar)
+      # repeat with test data
+      test_data[[outcome_colname]] <- replace_spaces(test_data[[outcome_colname]])
+      test_data <- rm_missing_outcome(test_data, outcome_colname)
+      split_dat <- split_outcome_features(test_data, outcome_colname)
+      
+      features <- split_dat$features
+      removed_feats <- character(0)
+      if (to_numeric) {
+        feats <- change_to_num(features) %>%
+          remove_singleton_columns(threshold = prefilter_threshold)
+        removed_feats <- feats$removed_feats
+        features <- feats$dat
+      }
+      pbtick(progbar)
+      
+      nv_feats <- process_novar_feats(features, progbar = progbar)
+      pbtick(progbar)
+      split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
+      pbtick(progbar)
+      cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
+      pbtick(progbar)
+      
+      
+      
+      
+      #source("impute.R")
+      #sapply_fn <- select_apply("sapply")
+      #cl_train <- sapply_fn(train_data, function(x) {
+      #  class(x)
+      #})
+      #cl_test <- sapply_fn(test_data, function(x) {
+      #  class(x)
+      #})
+      #missing_train <-
+      #  is.na(train_data[, cl_train %in% c("integer", "numeric")])
+      #missing_test <-
+      #  is.na(test_data[, cl_test %in% c("integer", "numeric")])
+      #n_missing_train <- sum(missing_train)
+      #n_missing_test <- sum(missing_test)
+      # do imputation
+      #impute_missing_vals(train_data, n_missing_train)
+      #impute_missing_vals(test_data, n_missing_test)
+    }
     # train_groups & test_groups will be NULL if groups is NULL
     train_groups <- groups[training_inds]
     test_groups <- groups[-training_inds]
-
+    
     if (is.null(hyperparameters)) {
       hyperparameters <- get_hyperparams_list(dataset, method)
     }
     tune_grid <- get_tuning_grid(hyperparameters, method)
-
-
+    
+    
     outcome_type <- get_outcome_type(outcomes_vctr)
     class_probs <- outcome_type != "continuous"
-
+    
     if (is.null(perf_metric_function)) {
       perf_metric_function <- get_perf_metric_fn(outcome_type)
     }
-
+    
     if (is.null(perf_metric_name)) {
       perf_metric_name <- get_perf_metric_name(outcome_type)
     }
-
+    
     if (is.null(cross_val)) {
       cross_val <- define_cv(
         train_data,
@@ -238,8 +306,8 @@ run_ml <-
         group_partitions = group_partitions
       )
     }
-
-
+    
+    
     message("Training the model...")
     trained_model_caret <- train_model(
       train_data = train_data,
@@ -254,7 +322,7 @@ run_ml <-
     if (!is.na(seed)) {
       set.seed(seed)
     }
-
+    
     if (calculate_performance) {
       performance_tbl <- get_performance_tbl(
         trained_model_caret,
@@ -269,7 +337,7 @@ run_ml <-
     } else {
       performance_tbl <- "Skipped calculating performance"
     }
-
+    
     if (find_feature_importance) {
       message("Finding feature importance...")
       feature_importance_tbl <- get_feature_importance(
@@ -287,7 +355,7 @@ run_ml <-
     } else {
       feature_importance_tbl <- "Skipped feature importance"
     }
-
+    
     return(
       list(
         trained_model = trained_model_caret,

From 58afddb1f926cf73ec75e93685e6034ed7996b9c Mon Sep 17 00:00:00 2001
From: Megan Coden <mlcoden@umich.edu>
Date: Fri, 7 Apr 2023 15:33:45 -0400
Subject: [PATCH 03/16] add todo about fixing here stuff

---
 R/preprocess.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/R/preprocess.R b/R/preprocess.R
index 119ad8b3..5dfa01f3 100644
--- a/R/preprocess.R
+++ b/R/preprocess.R
@@ -1,3 +1,6 @@
+# TODO: set this for a generic path (probably using here::here)
+library(here)
+here("R", "impute.R")
 #' Preprocess data prior to running machine learning
 #'
 #' Function to preprocess your data for input into [run_ml()].

From 105c22f028a6aa5ed9b7c56793d5aba941961710 Mon Sep 17 00:00:00 2001
From: Megan Coden <mlcoden@umich.edu>
Date: Fri, 7 Apr 2023 15:42:08 -0400
Subject: [PATCH 04/16] add tests for setting impute param to false in
 preprocess data

---
 tests/testthat/test-preprocess.R | 281 +++++++++++++++++++++++++++++++
 1 file changed, 281 insertions(+)

diff --git a/tests/testthat/test-preprocess.R b/tests/testthat/test-preprocess.R
index ef1555bb..539f79e6 100644
--- a/tests/testthat/test-preprocess.R
+++ b/tests/testthat/test-preprocess.R
@@ -671,3 +671,284 @@ test_that("preprocess_data replaces spaces in outcome column values (class label
     dat_proc
   ) %>% suppressMessages()
 })
+
+
+
+test_that("setting impute param to false doesn't impute data", {
+  expect_message(
+    expect_equal(
+      preprocess_data(test_df, "outcome",
+                      prefilter_threshold = -1, impute_in_preprocessing = FALSE
+      ),
+      list(
+        dat_transformed = structure(list(outcome = c(
+          "normal", "normal",
+          "cancer"
+        ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c(
+          -1,
+          0, 1
+        ), grp4 = c(0, 0, 1), var8 = c(
+          -0.707106781186547, 0.707106781186547,
+          NA
+        )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(grp1 = c(
+          "var10_0", "var2_b", "var3_yes",
+          "var4_1", "var9_x"
+        ), grp2 = c("var10_1", "var2_a"), grp3 = c(
+          "var1",
+          "var12"
+        ), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8"),
+        removed_feats = c("var5", "var6", "var11")
+      )
+    ),
+    "Removed "
+  ) %>% suppressMessages()
+  expect_message(expect_equal(
+    preprocess_data(test_df, "outcome",
+                    prefilter_threshold = -1,
+                    group_neg_corr = FALSE,
+                    impute_in_preprocessing = FALSE
+    ),
+    list(dat_transformed = structure(list(outcome = c(
+      "normal", "normal",
+      "cancer"
+    ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c(
+      -1,
+      0, 1
+    ), grp4 = c(0, 0, 1), var7_1 = c(1, 1, 0), var8 = c(
+      -0.707106781186547,
+      0.707106781186547, NA
+    )), row.names = c(NA, -3L), class = c(
+      "tbl_df",
+      "tbl", "data.frame"
+    )), grp_feats = list(
+      grp1 = c(
+        "var10_0", "var2_b",
+        "var3_yes", "var4_1", "var9_x"
+      ), grp2 = c("var10_1", "var2_a"), grp3 = c("var1", "var12"), grp4 = c("var2_c", "var9_y"), var7_1 = "var7_1",
+      var8 = "var8"
+    ), removed_feats = c("var5", "var6", "var11"))
+  )) %>% suppressMessages()
+  expect_equal(
+    preprocess_data(test_df[1:3, c("outcome", "var1")], "outcome"),
+    list(
+      dat_transformed = dplyr::tibble(
+        outcome = c("normal", "normal", "cancer"),
+        var1 = c(-1, 0, 1)
+      ),
+      grp_feats = NULL,
+      removed_feats = character(0)
+    )
+  ) %>% suppressMessages()
+  expect_equal(
+    preprocess_data(test_df[1:3, c("outcome", "var2")], "outcome", impute_in_preprocessing = FALSE),
+    list(
+      dat_transformed = dplyr::tibble(
+        outcome = c("normal", "normal", "cancer"),
+        var2_a = c(1, 0, 0),
+        var2_b = c(0, 1, 0),
+        var2_c = c(0, 0, 1),
+      ),
+      grp_feats = NULL,
+      removed_feats = character(0)
+    )
+  ) %>% suppressMessages()
+  expect_equal(
+    preprocess_data(test_df[1:3, c("outcome", "var3")], "outcome", impute_in_preprocessing = FALSE),
+    list(
+      dat_transformed = dplyr::tibble(
+        outcome = c("normal", "normal", "cancer"),
+        var3_yes = c(0, 1, 0),
+      ),
+      grp_feats = NULL,
+      removed_feats = character(0)
+    )
+  ) %>% suppressMessages()
+  expect_equal(
+    preprocess_data(test_df[1:3, c("outcome", "var4")], "outcome",
+                    prefilter_threshold = -1,
+                    impute_in_preprocessing = FALSE
+    ),
+    list(
+      dat_transformed = dplyr::tibble(
+        outcome = c("normal", "normal", "cancer"),
+        var4_1 = c(0, 1, 0),
+      ),
+      grp_feats = NULL,
+      removed_feats = character(0)
+    )
+  ) %>% suppressMessages()
+  expect_message(expect_equal(
+    preprocess_data(test_df[1:3, ], "outcome",
+                    method = NULL,
+                    prefilter_threshold = -1,
+                    impute_in_preprocessing = FALSE
+    ),
+    list(
+      dat_transformed = structure(list(outcome = c(
+        "normal", "normal",
+        "cancer"
+      ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c(
+        1,
+        2, 3
+      ), grp4 = c(0, 0, 1), var8 = c(5, 6, NA)), row.names = c(
+        NA,
+        -3L
+      ), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(
+        grp1 = c("var10_0", "var2_b", "var3_yes", "var4_1", "var9_x"), grp2 = c("var10_1", "var2_a"), grp3 = c("var1", "var12"), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8"
+      ),
+      removed_feats = c("var5", "var6", "var11")
+    )
+  )) %>% suppressMessages()
+  expect_error(
+    preprocess_data(test_df[1:3, c("outcome", "var5")], "outcome", impute_in_preprocessing = FALSE),
+    "All features have zero variance"
+  ) %>% suppressMessages()
+  expect_message(expect_equal(
+    preprocess_data(test_df[1:3, ],
+                    "outcome",
+                    method = c("range"),
+                    prefilter_threshold = -1,
+                    impute_in_preprocessing = FALSE
+    ),
+    list(
+      dat_transformed = structure(list(outcome = c(
+        "normal", "normal",
+        "cancer"
+      ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c(
+        0,
+        0.5, 1
+      ), grp4 = c(0, 0, 1), var8 = c(0, 1, NA)), row.names = c(
+        NA,
+        -3L
+      ), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(
+        grp1 = c("var10_0", "var2_b", "var3_yes", "var4_1", "var9_x"), grp2 = c("var10_1", "var2_a"), grp3 = c("var1", "var12"), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8"
+      ),
+      removed_feats = c("var5", "var6", "var11")
+    )
+  )) %>% suppressMessages()
+  expect_message(expect_equal(
+    preprocess_data(test_df[1:3, ],
+                    "outcome",
+                    remove_var = "zv",
+                    prefilter_threshold = -1,
+                    impute_in_preprocessing = FALSE
+    ),
+    list(
+      dat_transformed = structure(list(outcome = c(
+        "normal", "normal",
+        "cancer"
+      ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c(
+        -1,
+        0, 1
+      ), grp4 = c(0, 0, 1), var8 = c(
+        -0.707106781186547, 0.707106781186547,
+        NA
+      )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(grp1 = c(
+        "var10_0", "var2_b", "var3_yes",
+        "var4_1", "var9_x"
+      ), grp2 = c("var10_1", "var2_a"), grp3 = c(
+        "var1",
+        "var12"
+      ), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8"),
+      removed_feats = c("var5", "var6", "var11")
+    )
+  )) %>% suppressMessages()
+  expect_message(
+    expect_equal(
+      preprocess_data(test_df[1:3, ], "outcome",
+                      remove_var = NULL, prefilter_threshold = -1,
+                      impute_in_preprocessing = FALSE
+      ),
+      list(
+        dat_transformed = structure(list(outcome = c(
+          "normal", "normal",
+          "cancer"
+        ), grp1 = c(0, 1, 0), grp2 = c(1, 0, 0), grp3 = c(
+          -1,
+          0, 1
+        ), grp4 = c(0, 0, 1), var8 = c(
+          -0.707106781186547, 0.707106781186547,
+          NA
+        )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(grp1 = c(
+          "var10_0", "var2_b", "var3_yes",
+          "var4_1", "var9_x"
+        ), grp2 = c("var10_1", "var2_a"), grp3 = c(
+          "var1",
+          "var12"
+        ), grp4 = c("var2_c", "var7_1", "var9_y"), var8 = "var8"),
+        removed_feats = c("var5", "var6", "var11")
+      )
+    ),
+    "Removing"
+  ) %>% suppressMessages()
+  expect_message(expect_equal(
+    preprocess_data(test_df[1:3, ],
+                    "outcome",
+                    remove_var = NULL,
+                    collapse_corr_feats = FALSE,
+                    prefilter_threshold = -1,
+                    impute_in_preprocessing = FALSE
+    ),
+    list(
+      dat_transformed = structure(list(outcome = c(
+        "normal", "normal",
+        "cancer"
+      ), var1 = c(-1, 0, 1), var8 = c(
+        -0.707106781186547, 0.707106781186547,
+        NA
+      ), var12 = c(-1, 0, 1), var3_yes = c(0, 1, 0), var4_1 = c(
+        0,
+        1, 0
+      ), var7_1 = c(1, 1, 0), var2_a = c(1, 0, 0), var2_b = c(
+        0,
+        1, 0
+      ), var2_c = c(0, 0, 1), var9_x = c(0, 1, 0), var9_y = c(
+        0,
+        0, 1
+      ), var10_0 = c(0, 1, 0), var10_1 = c(1, 0, 0), var5 = c(
+        0,
+        0, 0
+      ), var6 = c(0, 0, 0), var11 = c(1, 1, 1)), row.names = c(
+        NA,
+        -3L
+      ), class = c("tbl_df", "tbl", "data.frame")), grp_feats = NULL,
+      removed_feats = character(0)
+    )
+  )) %>% suppressMessages()
+  expect_error(preprocess_data(test_df[1:3, ],
+                               "outcome",
+                               method = c("asdf")
+  )) %>% suppressMessages()
+  expect_message(expect_equal(
+    preprocess_data(test_df,
+                    "outcome",
+                    to_numeric = FALSE,
+                    impute_in_preprocessing = FALSE
+    ),
+    list(dat_transformed = structure(list(outcome = c(
+      "normal", "normal",
+      "cancer"
+    ), var1 = c(-1, 0, 1), grp1 = c(0, 1, 0), grp2 = c(
+      1,
+      0, 0
+    ), grp3 = c(0, 0, 1), var8 = c(
+      -0.707106781186547, 0.707106781186547,
+      NA
+    )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame")), grp_feats = list(var1 = "var1", grp1 = c(
+      "var10_0", "var12_2",
+      "var2_b", "var3_yes", "var4_1", "var9_x"
+    ), grp2 = c(
+      "var10_1",
+      "var12_1", "var2_a"
+    ), grp3 = c(
+      "var12_3", "var2_c", "var7_1",
+      "var9_y"
+    ), var8 = "var8"), removed_feats = c(
+      "var5", "var6",
+      "var11"
+    ))
+  )) %>% suppressMessages()
+})
+
+
+

From 596200629a5382dcbd88abf319287f1c053ca8d7 Mon Sep 17 00:00:00 2001
From: Megan Coden <mlcoden@umich.edu>
Date: Fri, 7 Apr 2023 15:44:51 -0400
Subject: [PATCH 05/16] adding todos for future

---
 R/run_ml.R                   | 5 +++--
 tests/testthat/test-run_ml.R | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/run_ml.R b/R/run_ml.R
index 50ebf1b4..0e618e0a 100644
--- a/R/run_ml.R
+++ b/R/run_ml.R
@@ -1,5 +1,6 @@
-# TODO: add call to the function, add the option
-# TODO: figure out if there's a way to only specify option in one place
+# TODO: test if calling these functions works
+# TODO: combine calls into its own function and call it w train and test set (more readable)
+# TODO: figure out if there's a way to only specify option in one place (for runml and for preprocess)
 #' Run the machine learning pipeline
 #'
 #' This function splits the data set into a train & test set,
diff --git a/tests/testthat/test-run_ml.R b/tests/testthat/test-run_ml.R
index e91a0611..c3fcba57 100644
--- a/tests/testthat/test-run_ml.R
+++ b/tests/testthat/test-run_ml.R
@@ -1,3 +1,4 @@
+# TODO: add tests for if impution param is set to true
 library(dplyr)
 options(
   warnPartialMatchArgs = FALSE,

From 807903e43b494e19761e2010b94208d2121658c7 Mon Sep 17 00:00:00 2001
From: Megan Coden <mlcoden@umich.edu>
Date: Fri, 7 Apr 2023 16:11:06 -0400
Subject: [PATCH 06/16] added impution in run_ml by calling
 impute_during_training function

---
 R/impute_during_training.R | 24 +++++++++++++
 R/run_ml.R                 | 69 +++-----------------------------------
 2 files changed, 29 insertions(+), 64 deletions(-)
 create mode 100644 R/impute_during_training.R

diff --git a/R/impute_during_training.R b/R/impute_during_training.R
new file mode 100644
index 00000000..8a4ba434
--- /dev/null
+++ b/R/impute_during_training.R
@@ -0,0 +1,24 @@
+impute_during_training <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) {
+  source("preprocess.R")
+  dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]])
+  dataset <- rm_missing_outcome(dataset, outcome_colname)
+  split_dat <- split_outcome_features(dataset, outcome_colname)
+  
+  features <- split_dat$features
+  removed_feats <- character(0)
+  if (to_numeric) {
+    feats <- change_to_num(features) %>%
+      remove_singleton_columns(threshold = prefilter_threshold)
+    removed_feats <- feats$removed_feats
+    features <- feats$dat
+  }
+  pbtick(progbar)
+  
+  nv_feats <- process_novar_feats(features, progbar = progbar)
+  pbtick(progbar)
+  split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
+  pbtick(progbar)
+  cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
+  pbtick(progbar)
+  return(cont_feats)
+}
\ No newline at end of file
diff --git a/R/run_ml.R b/R/run_ml.R
index 0e618e0a..64b70ff7 100644
--- a/R/run_ml.R
+++ b/R/run_ml.R
@@ -1,5 +1,4 @@
 # TODO: test if calling these functions works
-# TODO: combine calls into its own function and call it w train and test set (more readable)
 # TODO: figure out if there's a way to only specify option in one place (for runml and for preprocess)
 #' Run the machine learning pipeline
 #'
@@ -209,69 +208,11 @@ run_ml <-
     train_data <- dataset[training_inds, ]
     test_data <- dataset[-training_inds, ]
     if (impute_in_training == TRUE) {
-      source("preprocess.R")
-      train_data[[outcome_colname]] <- replace_spaces(train_data[[outcome_colname]])
-      train_data <- rm_missing_outcome(train_data, outcome_colname)
-      split_dat <- split_outcome_features(train_data, outcome_colname)
-      
-      features <- split_dat$features
-      removed_feats <- character(0)
-      if (to_numeric) {
-        feats <- change_to_num(features) %>%
-          remove_singleton_columns(threshold = prefilter_threshold)
-        removed_feats <- feats$removed_feats
-        features <- feats$dat
-      }
-      pbtick(progbar)
-      
-      nv_feats <- process_novar_feats(features, progbar = progbar)
-      pbtick(progbar)
-      split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
-      pbtick(progbar)
-      cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
-      pbtick(progbar)
-      # repeat with test data
-      test_data[[outcome_colname]] <- replace_spaces(test_data[[outcome_colname]])
-      test_data <- rm_missing_outcome(test_data, outcome_colname)
-      split_dat <- split_outcome_features(test_data, outcome_colname)
-      
-      features <- split_dat$features
-      removed_feats <- character(0)
-      if (to_numeric) {
-        feats <- change_to_num(features) %>%
-          remove_singleton_columns(threshold = prefilter_threshold)
-        removed_feats <- feats$removed_feats
-        features <- feats$dat
-      }
-      pbtick(progbar)
-      
-      nv_feats <- process_novar_feats(features, progbar = progbar)
-      pbtick(progbar)
-      split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
-      pbtick(progbar)
-      cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
-      pbtick(progbar)
-      
-      
-      
-      
-      #source("impute.R")
-      #sapply_fn <- select_apply("sapply")
-      #cl_train <- sapply_fn(train_data, function(x) {
-      #  class(x)
-      #})
-      #cl_test <- sapply_fn(test_data, function(x) {
-      #  class(x)
-      #})
-      #missing_train <-
-      #  is.na(train_data[, cl_train %in% c("integer", "numeric")])
-      #missing_test <-
-      #  is.na(test_data[, cl_test %in% c("integer", "numeric")])
-      #n_missing_train <- sum(missing_train)
-      #n_missing_test <- sum(missing_test)
-      # do imputation
-      #impute_missing_vals(train_data, n_missing_train)
-      #impute_missing_vals(test_data, n_missing_test)
+      source("impute_during_training.R")
+      train_data <- impute_during_training(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
+      train_data <- train_data$transformed_cont
+      test_data <- impute_during_training(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
+      test_data <- test_data$transformed_cont
     }
     # train_groups & test_groups will be NULL if groups is NULL
     train_groups <- groups[training_inds]

From 724cf60306edc6a687ca11139fd4230a98ddfa0e Mon Sep 17 00:00:00 2001
From: Megan Coden <mlcoden@umich.edu>
Date: Wed, 12 Apr 2023 16:55:17 -0400
Subject: [PATCH 07/16] refactor code and add tests

---
 R/impute.R                       | 33 +++++++++++++++
 R/impute_during_training.R       | 24 -----------
 R/preprocess.R                   | 35 ++++------------
 R/run_ml.R                       | 16 ++++---
 tests/testthat/test-preprocess.R | 17 +++++++-
 tests/testthat/test-run_ml.R     | 71 ++++++++++++++++++++++++++++++++
 6 files changed, 138 insertions(+), 58 deletions(-)
 delete mode 100644 R/impute_during_training.R

diff --git a/R/impute.R b/R/impute.R
index b84336eb..c7548fc4 100644
--- a/R/impute.R
+++ b/R/impute.R
@@ -13,4 +13,37 @@ impute <- function(transformed_cont, n_missing) {
     )
   )
   return (transformed_cont)
+}
+
+prep_data <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) {
+  dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]])
+  dataset <- rm_missing_outcome(dataset, outcome_colname)
+  split_dat <- split_outcome_features(dataset, outcome_colname)
+  
+  features <- split_dat$features
+  removed_feats <- character(0)
+  if (to_numeric) {
+    feats <- change_to_num(features) %>%
+      remove_singleton_columns(threshold = prefilter_threshold)
+    removed_feats <- feats$removed_feats
+    features <- feats$dat
+  }
+  pbtick(progbar)
+  
+  nv_feats <- process_novar_feats(features, progbar = progbar)
+  pbtick(progbar)
+  split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
+  pbtick(progbar)
+  cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
+  pbtick(progbar)
+  # combine all processed features
+  processed_feats <- dplyr::bind_cols(
+    cont_feats$transformed_cont,
+    split_feats$cat_feats,
+    nv_feats$novar_feats
+  )
+  pbtick(progbar)
+  
+  processed_data <- list(cont_feats = cont_feats, removed_feats = removed_feats, split_dat = split_dat, processed_feats = processed_feats)
+  return(processed_data)
 }
\ No newline at end of file
diff --git a/R/impute_during_training.R b/R/impute_during_training.R
deleted file mode 100644
index 8a4ba434..00000000
--- a/R/impute_during_training.R
+++ /dev/null
@@ -1,24 +0,0 @@
-impute_during_training <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) {
-  source("preprocess.R")
-  dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]])
-  dataset <- rm_missing_outcome(dataset, outcome_colname)
-  split_dat <- split_outcome_features(dataset, outcome_colname)
-  
-  features <- split_dat$features
-  removed_feats <- character(0)
-  if (to_numeric) {
-    feats <- change_to_num(features) %>%
-      remove_singleton_columns(threshold = prefilter_threshold)
-    removed_feats <- feats$removed_feats
-    features <- feats$dat
-  }
-  pbtick(progbar)
-  
-  nv_feats <- process_novar_feats(features, progbar = progbar)
-  pbtick(progbar)
-  split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
-  pbtick(progbar)
-  cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
-  pbtick(progbar)
-  return(cont_feats)
-}
\ No newline at end of file
diff --git a/R/preprocess.R b/R/preprocess.R
index 5dfa01f3..ac5671d4 100644
--- a/R/preprocess.R
+++ b/R/preprocess.R
@@ -73,34 +73,15 @@ preprocess_data <- function(dataset, outcome_colname,
   check_outcome_column(dataset, outcome_colname, check_values = FALSE)
   check_remove_var(remove_var)
   pbtick(progbar)
-  dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]])
-  dataset <- rm_missing_outcome(dataset, outcome_colname)
-  split_dat <- split_outcome_features(dataset, outcome_colname)
-
-  features <- split_dat$features
-  removed_feats <- character(0)
-  if (to_numeric) {
-    feats <- change_to_num(features) %>%
-      remove_singleton_columns(threshold = prefilter_threshold)
-    removed_feats <- feats$removed_feats
-    features <- feats$dat
-  }
-  pbtick(progbar)
-
-  nv_feats <- process_novar_feats(features, progbar = progbar)
-  pbtick(progbar)
-  split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
-  pbtick(progbar)
-  cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
-  pbtick(progbar)
-
+  
+  processed_data <- prep_data(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric)
+  removed_feats <- processed_data$removed_feats
+  processed_feats <- processed_data$processed_feats
+  split_dat <- processed_data$split_dat
+  cont_feats <- processed_data$cont_feats
+  
   # combine all processed features
-  processed_feats <- dplyr::bind_cols(
-    cont_feats$transformed_cont,
-    split_feats$cat_feats,
-    nv_feats$novar_feats
-  )
-  pbtick(progbar)
+  
 
   # remove features with (near-)zero variance
   feats <- get_caret_processed_df(processed_feats, remove_var)
diff --git a/R/run_ml.R b/R/run_ml.R
index 64b70ff7..bfd20345 100644
--- a/R/run_ml.R
+++ b/R/run_ml.R
@@ -208,11 +208,17 @@ run_ml <-
     train_data <- dataset[training_inds, ]
     test_data <- dataset[-training_inds, ]
     if (impute_in_training == TRUE) {
-      source("impute_during_training.R")
-      train_data <- impute_during_training(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
-      train_data <- train_data$transformed_cont
-      test_data <- impute_during_training(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
-      test_data <- test_data$transformed_cont
+      
+      train_processed_data <- prep_data(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
+      train_processed_feats <- train_processed_data$processed_feats
+      split_dat <- train_processed_data$split_dat
+      train_data <- dplyr::bind_cols(split_dat$outcome, train_processed_feats) %>%
+        dplyr::as_tibble()
+      test_processed_data <- prep_data(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
+      test_processed_feats <- test_processed_data$processed_feats
+      split_dat <- test_processed_data$split_dat
+      test_data <- dplyr::bind_cols(split_dat$outcome, test_processed_feats) %>%
+        dplyr::as_tibble()
     }
     # train_groups & test_groups will be NULL if groups is NULL
     train_groups <- groups[training_inds]
diff --git a/tests/testthat/test-preprocess.R b/tests/testthat/test-preprocess.R
index 539f79e6..2a95e3e3 100644
--- a/tests/testthat/test-preprocess.R
+++ b/tests/testthat/test-preprocess.R
@@ -442,14 +442,14 @@ test_that("process_cat_feats works", {
 
 test_that("process_cont_feats works", {
   expect_equal(
-    process_cont_feats(dplyr::as_tibble(test_df[1:3, 2]), method = c("center", "scale")),
+    process_cont_feats(dplyr::as_tibble(test_df[1:3, 2]), method = c("center", "scale"), impute_in_preprocessing = TRUE),
     list(transformed_cont = structure(list(value = c(-1, 0, 1)), row.names = c(
       NA,
       -3L
     ), class = c("tbl_df", "tbl", "data.frame")), removed_cont = character(0))
   ) %>% suppressMessages()
   expect_message(expect_equal(
-    process_cont_feats(test_df[1:3, c(2, 9)], method = c("center", "scale")),
+    process_cont_feats(test_df[1:3, c(2, 9)], method = c("center", "scale"), impute_in_preprocessing = TRUE),
     list(transformed_cont = structure(list(var1 = c(-1, 0, 1), var8 = c(
       -0.707106781186547,
       0.707106781186547, 0
@@ -951,4 +951,17 @@ test_that("setting impute param to false doesn't impute data", {
 })
 
 
+test_that("default parameter for impute_in_preprocessing is TRUE", {
+  expect_message(
+    expect_equal(
+      preprocess_data(test_df, "outcome",
+                      prefilter_threshold = -1
+      ),
+      preprocess_data(test_df, "outcome",
+                      prefilter_threshold = -1, impute_in_preprocessing = TRUE))
+    ,
+    "Removed "
+  ) %>% suppressMessages()
+})
+
 
diff --git a/tests/testthat/test-run_ml.R b/tests/testthat/test-run_ml.R
index c3fcba57..b196260f 100644
--- a/tests/testthat/test-run_ml.R
+++ b/tests/testthat/test-run_ml.R
@@ -310,3 +310,74 @@ test_that("models use case weights when provided", {
   expect_true("weights" %in% colnames(results_custom_train$trained_model$pred))
   expect_false("weights" %in% colnames(otu_mini_bin_results_glmnet$trained_model$pred))
 })
+
+
+
+# TODO: fix errors
+test_that("make sure impute function on train data set works", {
+  train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA),
+                           var1 = 1:4,
+                           var2 = c("a", "b", "c", "d"),
+                           var3 = c("no", "yes", "no", "no"),
+                           var4 = c(0, 1, 0, 0),
+                           var5 = c(0, 0, 0, 0),
+                           var6 = c("no", "no", "no", "no"),
+                           var7 = c(1, 1, 0, 0),
+                           var8 = c(5, 6, NA, 7),
+                           var9 = c(NA, "x", "y", "z"),
+                           var10 = c(1, 0, NA, NA))
+  test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA),
+                          var12 = c("1", "2", "3", "4"))
+  train_data_output <- data.frame(
+      outcome = c("normal", "normal", "cancer"),
+      var1 = c(-1, 0, 1), var8 = c(-0.70710678, 0.70710678, 0), var3_yes = c(0, 1, 0), var7_1=c(1, 1, 0), var2_a = c(1, 0, 0), var2_b=c(0, 1, 0), var2_c=c(0, 0, 1), var9_x=c(0, 1, 0), var9_y = c(0, 0, 1), var6=c(0, 0, 0))
+  results_output <- prep_data(train_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE)
+  final_results <- results_output$processed_feats
+  split_dat <- results_output$split_dat
+  train_df <- dplyr::bind_cols(split_dat$outcome, final_results) %>%
+    dplyr::as_tibble()
+  expect_equal(train_df, train_data_output)
+})
+test_that("make sure impute function on test data set works", {
+  train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA),
+                           var1 = 1:4,
+                           var2 = c("a", "b", "c", "d"),
+                           var3 = c("no", "yes", "no", "no"),
+                           var4 = c(0, 1, 0, 0),
+                           var5 = c(0, 0, 0, 0),
+                           var6 = c("no", "no", "no", "no"),
+                           var7 = c(1, 1, 0, 0),
+                           var8 = c(5, 6, NA, 7),
+                           var9 = c(NA, "x", "y", "z"),
+                           var10 = c(1, 0, NA, NA))
+  test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA),
+                          var12 = c("1", "2", "3", "4"))
+  test_data_output <- data.frame(
+    outcome = c("normal", "normal", "cancer"),
+    var12 = c(-1, 0, 1), var11 = c(1, 1, 1))
+  results_output <- prep_data(test_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE)
+  final_results <- results_output$processed_feats
+  split_dat <- results_output$split_dat
+  test_df <- dplyr::bind_cols(split_dat$outcome, final_results) %>%
+    dplyr::as_tibble()
+  expect_equal(test_df, test_data_output)
+})
+
+temp_df <- otu_mini_bin
+Otu00011 <- c(6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 
+              6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
+              6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 
+              6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
+              6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
+              6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 
+              6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
+              6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
+              6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 
+              5, 5)
+bin2 <- otu_mini_bin %>% mutate(Otu00011)
+# TODO: how to test? DOES OUR FUNC GET CALLED IF FLAG SET TO TRUE, NOT CALLED WHEN SET TO FALSE
+
+#expect_equal(run_ml(test_df, "glmnet", 'outcome', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, NA), run_ml(test_df, "glmnet", 'outcome', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, NA, FALSE))
+
+#run_ml(bin2, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE)
+#run_ml(otu_mini_bin, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE)

From a2b15edeb57e2912e2b214e16b85f9410acbc07b Mon Sep 17 00:00:00 2001
From: Megan Coden <mlcoden@umich.edu>
Date: Wed, 12 Apr 2023 17:54:38 -0400
Subject: [PATCH 08/16] include tests to validate that the flag for impute
 during training works

---
 tests/testthat/test-run_ml.R | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/tests/testthat/test-run_ml.R b/tests/testthat/test-run_ml.R
index b196260f..a188b8e1 100644
--- a/tests/testthat/test-run_ml.R
+++ b/tests/testthat/test-run_ml.R
@@ -311,9 +311,6 @@ test_that("models use case weights when provided", {
   expect_false("weights" %in% colnames(otu_mini_bin_results_glmnet$trained_model$pred))
 })
 
-
-
-# TODO: fix errors
 test_that("make sure impute function on train data set works", {
   train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA),
                            var1 = 1:4,
@@ -328,7 +325,7 @@ test_that("make sure impute function on train data set works", {
                            var10 = c(1, 0, NA, NA))
   test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA),
                           var12 = c("1", "2", "3", "4"))
-  train_data_output <- data.frame(
+  train_data_output <- dplyr::tibble(
       outcome = c("normal", "normal", "cancer"),
       var1 = c(-1, 0, 1), var8 = c(-0.70710678, 0.70710678, 0), var3_yes = c(0, 1, 0), var7_1=c(1, 1, 0), var2_a = c(1, 0, 0), var2_b=c(0, 1, 0), var2_c=c(0, 0, 1), var9_x=c(0, 1, 0), var9_y = c(0, 0, 1), var6=c(0, 0, 0))
   results_output <- prep_data(train_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE)
@@ -338,6 +335,7 @@ test_that("make sure impute function on train data set works", {
     dplyr::as_tibble()
   expect_equal(train_df, train_data_output)
 })
+
 test_that("make sure impute function on test data set works", {
   train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA),
                            var1 = 1:4,
@@ -352,7 +350,7 @@ test_that("make sure impute function on test data set works", {
                            var10 = c(1, 0, NA, NA))
   test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA),
                           var12 = c("1", "2", "3", "4"))
-  test_data_output <- data.frame(
+  test_data_output <- dplyr::tibble(
     outcome = c("normal", "normal", "cancer"),
     var12 = c(-1, 0, 1), var11 = c(1, 1, 1))
   results_output <- prep_data(test_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE)
@@ -374,10 +372,22 @@ Otu00011 <- c(6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6
               6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 
               6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 
               5, 5)
-bin2 <- otu_mini_bin %>% mutate(Otu00011)
-# TODO: how to test? DOES OUR FUNC GET CALLED IF FLAG SET TO TRUE, NOT CALLED WHEN SET TO FALSE
+mini_bin_with_nas <- otu_mini_bin %>% mutate(Otu00011)
 
-#expect_equal(run_ml(test_df, "glmnet", 'outcome', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, NA), run_ml(test_df, "glmnet", 'outcome', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, NA, FALSE))
+test_that("data gets imputed when impute_in_training is set to TRUE", {
+  results <- run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE)
+  temp <- colSums(is.na(results$test_data)) 
+  num_nas <- sum(temp)
+  expect_equal(0, num_nas) %>% suppressMessages() 
+  temp <- colSums(is.na(results$trained_model$trainingData)) 
+  num_nas <- sum(temp)
+  expect_equal(0, num_nas) %>% suppressMessages()
+})
+
+test_that("data is not imputed when impute_in_training is set to FALSE", {
+  expect_error(run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, FALSE),NULL) %>% suppressMessages()
+})
 
-#run_ml(bin2, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE)
-#run_ml(otu_mini_bin, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE)
+test_that("data is not imputed when impute_in_training is not set", {
+  expect_error(run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019),NULL) %>% suppressMessages()
+})
\ No newline at end of file

From 998a158dab4d9b3322834df0d94b5ea6ec2358f6 Mon Sep 17 00:00:00 2001
From: Megan Coden <55669799+megancoden@users.noreply.github.com>
Date: Wed, 12 Apr 2023 18:02:46 -0400
Subject: [PATCH 09/16] update with imputation changes

---
 NEWS.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index c73251cd..b4c5a8bf 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,6 +8,9 @@
     - New column `lower` and `upper` to report the bounds of the empirical 95% confidence interval from the permutation test.
       See `vignette('parallel')` for an example of plotting feature importance with confidence intervals.
 - Minor documentation improvements (#323, @kelly-sovacool).
+- Added option to impute missing data during training rather than preprocessing (#301, @megancoden and @shah-priyal).
+  - Added impute_in_training option to `run_ml()`, which defaults to FALSE.
+  - Added impute_in_preprocessing option to `preprocess()`, which defaults to TRUE.
 
 # mikropml 1.5.0
 

From 43cdd4b89dd983dd1b20934341ec7da4955b73b9 Mon Sep 17 00:00:00 2001
From: shah-priyal <shahpri@umich.edu>
Date: Tue, 18 Apr 2023 15:16:39 -0400
Subject: [PATCH 10/16] changed file so it only does imputation step

---
 R/impute.R | 70 +++++++++++++++++++-----------------------------------
 1 file changed, 24 insertions(+), 46 deletions(-)

diff --git a/R/impute.R b/R/impute.R
index c7548fc4..9cd97886 100644
--- a/R/impute.R
+++ b/R/impute.R
@@ -1,49 +1,27 @@
-impute <- function(transformed_cont, n_missing) {
-  transformed_cont <- sapply_fn(transformed_cont, function(x) {
-    if (class(x) %in% c("integer", "numeric")) {
-      m <- is.na(x)
-      x[m] <- stats::median(x, na.rm = TRUE)
-    }
-    return(x)
-  }) %>% dplyr::as_tibble()
-  message(
-    paste0(
-      n_missing,
-      " missing continuous value(s) were imputed using the median value of the feature."
+impute <- function(transformed_cont) {
+  sapply_fn <- select_apply("sapply")
+  cl <- sapply_fn(transformed_cont, function(x) {
+    class(x)
+  })
+  missing <-
+    is.na(transformed_cont[, cl %in% c("integer", "numeric")])
+  n_missing <- sum(missing)
+  if (n_missing > 0) {
+    transformed_cont <- sapply_fn(transformed_cont, function(x) {
+      if (class(x) %in% c("integer", "numeric")) {
+        m <- is.na(x)
+        x[m] <- stats::median(x, na.rm = TRUE)
+      }
+      message(typeof(x))
+      message(class(x))
+      return(x)
+    }) %>% dplyr::as_tibble()
+    message(
+      paste0(
+        n_missing,
+        " missing continuous value(s) were imputed using the median value of the feature."
+      )
     )
-  )
-  return (transformed_cont)
-}
-
-prep_data <- function(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric) {
-  dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]])
-  dataset <- rm_missing_outcome(dataset, outcome_colname)
-  split_dat <- split_outcome_features(dataset, outcome_colname)
-  
-  features <- split_dat$features
-  removed_feats <- character(0)
-  if (to_numeric) {
-    feats <- change_to_num(features) %>%
-      remove_singleton_columns(threshold = prefilter_threshold)
-    removed_feats <- feats$removed_feats
-    features <- feats$dat
   }
-  pbtick(progbar)
-  
-  nv_feats <- process_novar_feats(features, progbar = progbar)
-  pbtick(progbar)
-  split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
-  pbtick(progbar)
-  cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
-  pbtick(progbar)
-  # combine all processed features
-  processed_feats <- dplyr::bind_cols(
-    cont_feats$transformed_cont,
-    split_feats$cat_feats,
-    nv_feats$novar_feats
-  )
-  pbtick(progbar)
-  
-  processed_data <- list(cont_feats = cont_feats, removed_feats = removed_feats, split_dat = split_dat, processed_feats = processed_feats)
-  return(processed_data)
+  return (transformed_cont)git 
 }
\ No newline at end of file

From cf5bb4e3bdf69e40697bb9893b9bcf30321c0b1d Mon Sep 17 00:00:00 2001
From: shah-priyal <shahpri@umich.edu>
Date: Tue, 18 Apr 2023 15:17:22 -0400
Subject: [PATCH 11/16] removed extra lines and prep_data steps back to file

---
 R/preprocess.R | 75 +++++++++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/R/preprocess.R b/R/preprocess.R
index 52e80d20..20495aaf 100644
--- a/R/preprocess.R
+++ b/R/preprocess.R
@@ -1,6 +1,3 @@
-# TODO: set this for a generic path (probably using here::here)
-library(here)
-here("R", "impute.R")
 #' Preprocess data prior to running machine learning
 #'
 #' Function to preprocess your data for input into [run_ml()].
@@ -74,15 +71,35 @@ preprocess_data <- function(dataset, outcome_colname,
   check_remove_var(remove_var)
   pbtick(progbar)
   
-  processed_data <- prep_data(dataset, outcome_colname, prefilter_threshold, method, impute_in_preprocessing, to_numeric)
-  removed_feats <- processed_data$removed_feats
-  processed_feats <- processed_data$processed_feats
-  split_dat <- processed_data$split_dat
-  cont_feats <- processed_data$cont_feats
+  dataset[[outcome_colname]] <- replace_spaces(dataset[[outcome_colname]])
+  dataset <- rm_missing_outcome(dataset, outcome_colname)
+  split_dat <- split_outcome_features(dataset, outcome_colname)
+  
+  features <- split_dat$features
+  removed_feats <- character(0)
+  if (to_numeric) {
+    feats <- change_to_num(features) %>%
+      remove_singleton_columns(threshold = prefilter_threshold)
+    removed_feats <- feats$removed_feats
+    features <- feats$dat
+  }
+  pbtick(progbar)
+  
+  nv_feats <- process_novar_feats(features, progbar = progbar)
+  pbtick(progbar)
+  split_feats <- process_cat_feats(nv_feats$var_feats, progbar = progbar)
+  pbtick(progbar)
+  cont_feats <- process_cont_feats(split_feats$cont_feats, method, impute_in_preprocessing)
+  pbtick(progbar)
   
   # combine all processed features
+  processed_feats <- dplyr::bind_cols(
+    cont_feats$transformed_cont,
+    split_feats$cat_feats,
+    nv_feats$novar_feats
+  )
+  pbtick(progbar)
   
-
   # remove features with (near-)zero variance
   feats <- get_caret_processed_df(processed_feats, remove_var)
   processed_feats <- feats$processed
@@ -124,17 +141,15 @@ preprocess_data <- function(dataset, outcome_colname,
 #' @inheritParams run_ml
 #'
 #' @return dataset with no missing outcomes
-#' @keywords internal
+#' @noRd
 #' @author Zena Lapp, \email{zenalapp@@umich.edu}
 #'
 #' @examples
-#' \dontrun{
 #' rm_missing_outcome(mikropml::otu_mini_bin, "dx")
 #'
 #' test_df <- mikropml::otu_mini_bin
 #' test_df[1:100, "dx"] <- NA
 #' rm_missing_outcome(test_df, "dx")
-#' }
 rm_missing_outcome <- function(dataset, outcome_colname) {
   n_outcome_na <- sum(is.na(dataset %>% dplyr::pull(outcome_colname)))
   total_outcomes <- nrow(dataset)
@@ -152,13 +167,11 @@ rm_missing_outcome <- function(dataset, outcome_colname) {
 #' @param features dataframe of features for machine learning
 #'
 #' @return dataframe with numeric columns where possible
-#' @keywords internal
+#' @noRd
 #' @author Zena Lapp, \email{zenalapp@@umich.edu}
 #'
 #' @examples
-#' \dontrun{
 #' class(change_to_num(data.frame(val = c("1", "2", "3")))[[1]])
-#' }
 change_to_num <- function(features) {
   lapply_fn <- select_apply(fun = "lapply")
   check_features(features, check_missing = FALSE)
@@ -212,13 +225,11 @@ remove_singleton_columns <- function(dat, threshold = 1) {
 #' @param progbar optional progress bar (default: `NULL`)
 #'
 #' @return list of two dataframes: features with variability (unprocessed) and without (processed)
-#' @keywords internal
+#' @noRd
 #' @author Zena Lapp, \email{zenalapp@@umich.edu}
 #'
 #' @examples
-#' \dontrun{
 #' process_novar_feats(mikropml::otu_small[, 2:ncol(otu_small)])
-#' }
 process_novar_feats <- function(features, progbar = NULL) {
   novar_feats <- NULL
   var_feats <- NULL
@@ -281,13 +292,11 @@ process_novar_feats <- function(features, progbar = NULL) {
 #' @inheritParams process_novar_feats
 #'
 #' @return list of two dataframes: categorical (processed) and continuous features (unprocessed)
-#' @keywords internal
+#' @noRd
 #' @author Zena Lapp, \email{zenalapp@@umich.edu}
 #'
 #' @examples
-#' \dontrun{
 #' process_cat_feats(mikropml::otu_small[, 2:ncol(otu_small)])
-#' }
 process_cat_feats <- function(features, progbar = NULL) {
   feature_design_cat_mat <- NULL
   cont_feats <- NULL
@@ -351,13 +360,11 @@ process_cat_feats <- function(features, progbar = NULL) {
 #' @inheritParams get_caret_processed_df
 #'
 #' @return dataframe of preprocessed features
-#' @keywords internal
+#' @noRd
 #' @author Zena Lapp, \email{zenalapp@@umich.edu}
 #'
 #' @examples
-#' \dontrun{
 #' process_cont_feats(mikropml::otu_small[, 2:ncol(otu_small)], c("center", "scale"))
-#' }
 process_cont_feats <- function(features, method, impute_in_preprocessing) {
   transformed_cont <- NULL
   removed_cont <- NULL
@@ -373,22 +380,12 @@ process_cont_feats <- function(features, method, impute_in_preprocessing) {
         transformed_cont <- feats$processed
         removed_cont <- feats$removed
       }
-      sapply_fn <- select_apply("sapply")
-      cl <- sapply_fn(transformed_cont, function(x) {
-        class(x)
-      })
-      missing <-
-        is.na(transformed_cont[, cl %in% c("integer", "numeric")])
-      n_missing <- sum(missing)
-      if (n_missing > 0) {
         # impute missing data using the median value
         if (impute_in_preprocessing) {
-          source("impute.R")
-          transformed_cont <- impute(transformed_cont, n_missing)
+          transformed_cont <- impute(transformed_cont)
         }
       }
     }
-  }
   return(list(transformed_cont = transformed_cont, removed_cont = removed_cont))
 }
 
@@ -425,11 +422,10 @@ get_caret_processed_df <- function(features, method) {
 #' @inheritParams process_novar_feats
 #' @param full_rank whether matrix should be full rank or not (see `[caret::dummyVars])
 #' @return design matrix
-#' @keywords internal
+#' @noRd
 #' @author Zena Lapp, \email{zenalapp@@umich.edu}
 #'
 #' @examples
-#' \dontrun{
 #' df <- data.frame(
 #'   outcome = c("normal", "normal", "cancer"),
 #'   var1 = 1:3,
@@ -438,7 +434,6 @@ get_caret_processed_df <- function(features, method) {
 #'   var4 = c(0, 1, 0)
 #' )
 #' get_caret_dummyvars_df(df, TRUE)
-#' }
 get_caret_dummyvars_df <- function(features, full_rank = FALSE, progbar = NULL) {
   check_features(features, check_missing = FALSE)
   if (!is.null(process_novar_feats(features, progbar = progbar)$novar_feats)) {
@@ -456,13 +451,11 @@ get_caret_dummyvars_df <- function(features, full_rank = FALSE, progbar = NULL)
 #' @inheritParams group_correlated_features
 #'
 #' @return features where perfectly correlated ones are collapsed
-#' @keywords internal
+#' @noRd
 #' @author Zena Lapp, \email{zenalapp@@umich.edu}
 #'
 #' @examples
-#' \dontrun{
 #' collapse_correlated_features(mikropml::otu_small[, 2:ncol(otu_small)])
-#' }
 collapse_correlated_features <- function(features, group_neg_corr = TRUE, progbar = NULL) {
   feats_nocorr <- features
   grp_feats <- NULL

From eb81a6181d8d2a18288768945c55b35458a5cec8 Mon Sep 17 00:00:00 2001
From: shah-priyal <shahpri@umich.edu>
Date: Tue, 18 Apr 2023 15:18:43 -0400
Subject: [PATCH 12/16] removed extra todos and update file to only include
 imputation step

---
 R/run_ml.R | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/R/run_ml.R b/R/run_ml.R
index bfd20345..4526b199 100644
--- a/R/run_ml.R
+++ b/R/run_ml.R
@@ -1,5 +1,3 @@
-# TODO: test if calling these functions works
-# TODO: figure out if there's a way to only specify option in one place (for runml and for preprocess)
 #' Run the machine learning pipeline
 #'
 #' This function splits the data set into a train & test set,
@@ -146,7 +144,7 @@ run_ml <-
            group_partitions = NULL,
            corr_thresh = 1,
            seed = NA,
-           impute_in_training = FALSE,
+           impute_after_split = FALSE,
            ...) {
     check_all(
       dataset,
@@ -207,18 +205,9 @@ run_ml <-
     
     train_data <- dataset[training_inds, ]
     test_data <- dataset[-training_inds, ]
-    if (impute_in_training == TRUE) {
-      
-      train_processed_data <- prep_data(train_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
-      train_processed_feats <- train_processed_data$processed_feats
-      split_dat <- train_processed_data$split_dat
-      train_data <- dplyr::bind_cols(split_dat$outcome, train_processed_feats) %>%
-        dplyr::as_tibble()
-      test_processed_data <- prep_data(test_data, outcome_colname, prefilter_threshold=1, method=c("center", "scale"), impute_in_preprocessing=TRUE, to_numeric=TRUE)
-      test_processed_feats <- test_processed_data$processed_feats
-      split_dat <- test_processed_data$split_dat
-      test_data <- dplyr::bind_cols(split_dat$outcome, test_processed_feats) %>%
-        dplyr::as_tibble()
+    if (impute_after_split == TRUE) {
+      train_data <- impute(train_data)
+      test_data <- impute(test_data)
     }
     # train_groups & test_groups will be NULL if groups is NULL
     train_groups <- groups[training_inds]

From e6726969f16a0d80fa936419b0f67f9ca777afa6 Mon Sep 17 00:00:00 2001
From: shah-priyal <shahpri@umich.edu>
Date: Tue, 18 Apr 2023 15:19:53 -0400
Subject: [PATCH 13/16] changed tests to adjust expected output of imputed data

---
 tests/testthat/test-run_ml.R | 79 ++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 39 deletions(-)

diff --git a/tests/testthat/test-run_ml.R b/tests/testthat/test-run_ml.R
index a188b8e1..ee140858 100644
--- a/tests/testthat/test-run_ml.R
+++ b/tests/testthat/test-run_ml.R
@@ -1,4 +1,3 @@
-# TODO: add tests for if impution param is set to true
 library(dplyr)
 options(
   warnPartialMatchArgs = FALSE,
@@ -312,7 +311,7 @@ test_that("models use case weights when provided", {
 })
 
 test_that("make sure impute function on train data set works", {
-  train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA),
+  train_data <- data.frame(outcome = c("normal", "normal", "cancer", "cancer"),
                            var1 = 1:4,
                            var2 = c("a", "b", "c", "d"),
                            var3 = c("no", "yes", "no", "no"),
@@ -321,45 +320,47 @@ test_that("make sure impute function on train data set works", {
                            var6 = c("no", "no", "no", "no"),
                            var7 = c(1, 1, 0, 0),
                            var8 = c(5, 6, NA, 7),
-                           var9 = c(NA, "x", "y", "z"),
+                           var9 = c(NA, 1, 1, 0),
                            var10 = c(1, 0, NA, NA))
-  test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA),
-                          var12 = c("1", "2", "3", "4"))
+  test_data <- data.frame(outcome = c("normal", "normal", "cancer", "cancer"), var11 = c(1, 1, NA, NA),
+                          var12 = c(1, 2, NA, 4))
   train_data_output <- dplyr::tibble(
-      outcome = c("normal", "normal", "cancer"),
-      var1 = c(-1, 0, 1), var8 = c(-0.70710678, 0.70710678, 0), var3_yes = c(0, 1, 0), var7_1=c(1, 1, 0), var2_a = c(1, 0, 0), var2_b=c(0, 1, 0), var2_c=c(0, 0, 1), var9_x=c(0, 1, 0), var9_y = c(0, 0, 1), var6=c(0, 0, 0))
-  results_output <- prep_data(train_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE)
-  final_results <- results_output$processed_feats
-  split_dat <- results_output$split_dat
-  train_df <- dplyr::bind_cols(split_dat$outcome, final_results) %>%
-    dplyr::as_tibble()
-  expect_equal(train_df, train_data_output)
+    outcome = c("normal", "normal", "cancer", "cancer"),
+    var1 = c('1', '2', '3', '4'),
+    var2 = c("a", "b", "c", "d"),
+    var3 = c("no", "yes", "no", "no"),
+    var4 = c('0', '1', '0', '0'),
+    var5 = c('0', '0', '0', '0'),
+    var6 = c("no", "no", "no", "no"),
+    var7 = c('1', '1', '0', '0'),
+    var8 = c('5', '6', '6', '7'),
+    var9 = c('1', '1', '1', '0'),
+    var10 = c('1', '0', '0.5', '0.5'))
+  results_output <- impute(train_data)
+  expect_equal(train_data_output, results_output)
 })
 
 test_that("make sure impute function on test data set works", {
-  train_data <- data.frame(outcome = c("normal", "normal", "cancer", NA),
-                           var1 = 1:4,
-                           var2 = c("a", "b", "c", "d"),
-                           var3 = c("no", "yes", "no", "no"),
-                           var4 = c(0, 1, 0, 0),
-                           var5 = c(0, 0, 0, 0),
-                           var6 = c("no", "no", "no", "no"),
-                           var7 = c(1, 1, 0, 0),
-                           var8 = c(5, 6, NA, 7),
-                           var9 = c(NA, "x", "y", "z"),
-                           var10 = c(1, 0, NA, NA))
-  test_data <- data.frame(outcome = c("normal", "normal", "cancer", NA), var11 = c(1, 1, NA, NA),
-                          var12 = c("1", "2", "3", "4"))
-  test_data_output <- dplyr::tibble(
-    outcome = c("normal", "normal", "cancer"),
-    var12 = c(-1, 0, 1), var11 = c(1, 1, 1))
-  results_output <- prep_data(test_data, 'outcome', 1, c("center", "scale"), TRUE, TRUE)
-  final_results <- results_output$processed_feats
-  split_dat <- results_output$split_dat
-  test_df <- dplyr::bind_cols(split_dat$outcome, final_results) %>%
-    dplyr::as_tibble()
-  expect_equal(test_df, test_data_output)
-})
+    train_data <- data.frame(outcome = c("normal", "normal", "cancer", "cancer"),
+                             var1 = 1:4,
+                             var2 = c("a", "b", "c", "d"),
+                             var3 = c("no", "yes", "no", "no"),
+                             var4 = c(0, 1, 0, 0),
+                             var5 = c(0, 0, 0, 0),
+                             var6 = c("no", "no", "no", "no"),
+                             var7 = c(1, 1, 0, 0),
+                             var8 = c(5, 6, NA, 7),
+                             var9 = c(NA, 1, 1, 0),
+                             var10 = c(1, 0, NA, NA))
+    test_data <- data.frame(outcome = c("normal", "normal", "cancer", "cancer"), var11 = c(1, 1, NA, NA),
+                            var12 = c(1, 2, NA, 4))
+    test_data_output <- dplyr::tibble(
+      outcome = c("normal", "normal", "cancer", "cancer"), var11 = c('1', '1', '1', '1'),
+      var12 = c('1', '2', '2', '4'))
+    results_output <- impute(test_data)
+    expect_equal(test_data_output, results_output)
+  })
+  
 
 temp_df <- otu_mini_bin
 Otu00011 <- c(6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 
@@ -374,7 +375,7 @@ Otu00011 <- c(6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, NA, 6, 6, 6, 6, 6, 6, 6
               5, 5)
 mini_bin_with_nas <- otu_mini_bin %>% mutate(Otu00011)
 
-test_that("data gets imputed when impute_in_training is set to TRUE", {
+test_that("data gets imputed when impute_after_split is set to TRUE", {
   results <- run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, TRUE)
   temp <- colSums(is.na(results$test_data)) 
   num_nas <- sum(temp)
@@ -384,10 +385,10 @@ test_that("data gets imputed when impute_in_training is set to TRUE", {
   expect_equal(0, num_nas) %>% suppressMessages()
 })
 
-test_that("data is not imputed when impute_in_training is set to FALSE", {
+test_that("data is not imputed when impute_after_split is set to FALSE", {
   expect_error(run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019, FALSE),NULL) %>% suppressMessages()
 })
 
-test_that("data is not imputed when impute_in_training is not set", {
+test_that("data is not imputed when impute_after_split is not set", {
   expect_error(run_ml(mini_bin_with_nas, "glmnet", 'dx', NULL, FALSE, TRUE, 5, 100, NULL, 0.5, NULL, NULL, NULL, NULL, 1, 2019),NULL) %>% suppressMessages()
 })
\ No newline at end of file

From 58d7f26b8ce141625c946bf8899d1feb1f521079 Mon Sep 17 00:00:00 2001
From: Megan Coden <55669799+megancoden@users.noreply.github.com>
Date: Tue, 18 Apr 2023 15:26:12 -0400
Subject: [PATCH 14/16] include info about setting impute_in_preprocessing to
 FALSE

---
 vignettes/preprocess.Rmd | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vignettes/preprocess.Rmd b/vignettes/preprocess.Rmd
index fe8ab009..0437bfde 100644
--- a/vignettes/preprocess.Rmd
+++ b/vignettes/preprocess.Rmd
@@ -370,6 +370,13 @@ going on (i.e. the median value is used):
 preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL)
 ```
 
+To delay this step until after the train/test split in run_ml.R, set the impute_in_preprocessing option to FALSE as shown here:
+
+```{r}
+# preprocess raw dataset with missing value in continuous feature
+preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL, impute_in_preprocessing=FALSE)
+```
+
 ## Putting it all together
 
 Here's some more complicated example raw data that puts everything we discussed together:

From 8a18c1d341533695bc6118eb5fb0a64c07cfbb84 Mon Sep 17 00:00:00 2001
From: shah-priyal <shahpri@umich.edu>
Date: Tue, 18 Apr 2023 15:35:03 -0400
Subject: [PATCH 15/16] added roxygen docstring for impute function

---
 R/impute.R | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/R/impute.R b/R/impute.R
index 9cd97886..149e5316 100644
--- a/R/impute.R
+++ b/R/impute.R
@@ -1,3 +1,13 @@
+#' Replace NA values with the median value of the column for continious variables in the dataset
+#'
+#' @param transformed_cont Data frame that may include NA values in one or more columns
+#'
+#' @return Data frame that has no NA values in continious numeric columns
+#'
+#' @examples
+#' transformed_cont <- impute(transformed_cont)
+#' train_data <- impute(train_data)
+#' test_data <- impute(test_data)
 impute <- function(transformed_cont) {
   sapply_fn <- select_apply("sapply")
   cl <- sapply_fn(transformed_cont, function(x) {
@@ -23,5 +33,5 @@ impute <- function(transformed_cont) {
       )
     )
   }
-  return (transformed_cont)git 
+  return (transformed_cont) 
 }
\ No newline at end of file

From 8c81b35d10ed35f84abbacead68d8e3de3068583 Mon Sep 17 00:00:00 2001
From: shah-priyal <55864447+shah-priyal@users.noreply.github.com>
Date: Tue, 18 Apr 2023 15:47:09 -0400
Subject: [PATCH 16/16] Update preprocess.Rmd

---
 vignettes/preprocess.Rmd | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vignettes/preprocess.Rmd b/vignettes/preprocess.Rmd
index 0437bfde..31e7ab1c 100644
--- a/vignettes/preprocess.Rmd
+++ b/vignettes/preprocess.Rmd
@@ -369,13 +369,21 @@ going on (i.e. the median value is used):
 # preprocess raw dataset with missing value in continuous feature
 preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL)
 ```
-
+#### Impute after the train/test split in run_ml.R
 To delay this step until after the train/test split in run_ml.R, set the impute_in_preprocessing option to FALSE as shown here:
 
 ```{r}
 # preprocess raw dataset with missing value in continuous feature
 preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL, impute_in_preprocessing=FALSE)
 ```
+To impute the data after the train/test split in run_ml.R, set the impute_after_split option to TRUE as shown here:
+```{r, eval = FALSE}
+results <- run_ml(otu_mini_bin,
+  "glmnet",
+  outcome_colname = "dx",
+  seed = 2019, impute_after_split = TRUE
+)
+```
 
 ## Putting it all together