From 4af1eca0ab8060cf323a00869809e807578cde22 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 22 Feb 2022 10:32:06 -0500 Subject: [PATCH 1/3] Improve feature importance examples Strongly recommend using multiple cores. --- R/feature_importance.R | 61 ++++++++++++++++++++++++----------- man/get_feature_importance.Rd | 61 ++++++++++++++++++++++++----------- 2 files changed, 84 insertions(+), 38 deletions(-) diff --git a/R/feature_importance.R b/R/feature_importance.R index 8e18caa9..b1a434cd 100644 --- a/R/feature_importance.R +++ b/R/feature_importance.R @@ -34,23 +34,41 @@ #' the event that the null hypothesis is true, where the null hypothesis is that #' the feature is not important for model performance. #' +#' We strongly recommend providing multiple cores to speed up computation time. +#' See [our vignette on parallel processing](http://www.schlosslab.org/mikropml/articles/parallel.html) +#' for more details. +#' #' @examples #' \dontrun{ +#' # If you called `run_ml()` with `feature_importance = FALSE` (the default), +#' # you can use `get_feature_importance()` later as long as you have the +#' # trained model and test data. #' results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) #' names(results$trained_model$trainingData)[1] <- "dx" -#' get_feature_importance(results$trained_model, -#' results$trained_model$trainingData, results$test_data, +#' feat_imp <- get_feature_importance(results$trained_model, +#' results$trained_model$trainingData, +#' results$test_data, #' "dx", -#' multiClassSummary, "AUC", -#' class_probs = TRUE, method = "glmnet" +#' multiClassSummary, +#' "AUC", +#' class_probs = TRUE, +#' method = "glmnet" #' ) #' -#' # optionally, you can group features together with a custom grouping -#' get_feature_importance(results$trained_model, -#' results$trained_model$trainingData, results$test_data, +#' # We strongly recommend providing multiple cores to speed up computation time. 
+#' # Do this before calling `get_feature_importance()`. +#' doFuture::registerDoFuture() +#' future::plan(future::multicore, workers = 2) +#' +#' # Optionally, you can group features together with a custom grouping +#' feat_imp <- get_feature_importance(results$trained_model, +#' results$trained_model$trainingData, +#' results$test_data, #' "dx", -#' multiClassSummary, "AUC", -#' class_probs = TRUE, method = "glmnet", +#' multiClassSummary, +#' "AUC", +#' class_probs = TRUE, +#' method = "glmnet", #' groups = c( #' "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012", #' "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022", @@ -66,9 +84,8 @@ #' ) #' ) #' -#' # the function can show a progress bar if you have the progressr package installed -#' ## optionally, specify the progress bar format -#' +#' # the function can show a progress bar if you have the `progressr` package installed. +#' ## optionally, specify the progress bar format: #' progressr::handlers(progressr::handler_progress( #' format = ":message :bar :percent | elapsed: :elapsed | eta: :eta", #' clear = FALSE, @@ -78,18 +95,24 @@ #' progressr::handlers(global = TRUE) #' ## run the function and watch the live progress udpates #' feat_imp <- get_feature_importance(results$trained_model, -#' results$trained_model$trainingData, results$test_data, +#' results$trained_model$trainingData, +#' results$test_data, #' "dx", -#' multiClassSummary, "AUC", -#' class_probs = TRUE, method = "glmnet" +#' multiClassSummary, +#' "AUC", +#' class_probs = TRUE, +#' method = "glmnet" #' ) #' -#' # you can specify any correlation method supported by `stats::cor`: +#' # You can specify any correlation method supported by `stats::cor`: #' feat_imp <- get_feature_importance(results$trained_model, -#' results$trained_model$trainingData, results$test_data, +#' results$trained_model$trainingData, +#' results$test_data, #' "dx", -#' multiClassSummary, "AUC", -#' class_probs = TRUE, method = "glmnet", +#' 
multiClassSummary, +#' "AUC", +#' class_probs = TRUE, +#' method = "glmnet", #' corr_method = "pearson" #' ) #' } diff --git a/man/get_feature_importance.Rd b/man/get_feature_importance.Rd index e6b48868..6861d461 100644 --- a/man/get_feature_importance.Rd +++ b/man/get_feature_importance.Rd @@ -96,24 +96,42 @@ precision of estimating the null distribution, but also increases runtime. The p-value represents the probability of obtaining the actual performance in the event that the null hypothesis is true, where the null hypothesis is that the feature is not important for model performance. + +We strongly recommend providing multiple cores to speed up computation time. +See \href{http://www.schlosslab.org/mikropml/articles/parallel.html}{our vignette on parallel processing} +for more details. } \examples{ \dontrun{ +# If you called `run_ml()` with `feature_importance = FALSE` (the default), +# you can use `get_feature_importance()` later as long as you have the +# trained model and test data. results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) names(results$trained_model$trainingData)[1] <- "dx" -get_feature_importance(results$trained_model, - results$trained_model$trainingData, results$test_data, +feat_imp <- get_feature_importance(results$trained_model, + results$trained_model$trainingData, + results$test_data, "dx", - multiClassSummary, "AUC", - class_probs = TRUE, method = "glmnet" + multiClassSummary, + "AUC", + class_probs = TRUE, + method = "glmnet" ) -# optionally, you can group features together with a custom grouping -get_feature_importance(results$trained_model, - results$trained_model$trainingData, results$test_data, +# We strongly recommend providing multiple cores to speed up computation time. +# Do this before calling `get_feature_importance()`. 
+doFuture::registerDoFuture() +future::plan(future::multicore, workers = 2) + +# Optionally, you can group features together with a custom grouping +feat_imp <- get_feature_importance(results$trained_model, + results$trained_model$trainingData, + results$test_data, "dx", - multiClassSummary, "AUC", - class_probs = TRUE, method = "glmnet", + multiClassSummary, + "AUC", + class_probs = TRUE, + method = "glmnet", groups = c( "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012", "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022", @@ -129,9 +147,8 @@ get_feature_importance(results$trained_model, ) ) -# the function can show a progress bar if you have the progressr package installed -## optionally, specify the progress bar format - +# the function can show a progress bar if you have the `progressr` package installed. +## optionally, specify the progress bar format: progressr::handlers(progressr::handler_progress( format = ":message :bar :percent | elapsed: :elapsed | eta: :eta", clear = FALSE, @@ -141,18 +158,24 @@ progressr::handlers(progressr::handler_progress( progressr::handlers(global = TRUE) ## run the function and watch the live progress udpates feat_imp <- get_feature_importance(results$trained_model, - results$trained_model$trainingData, results$test_data, + results$trained_model$trainingData, + results$test_data, "dx", - multiClassSummary, "AUC", - class_probs = TRUE, method = "glmnet" + multiClassSummary, + "AUC", + class_probs = TRUE, + method = "glmnet" ) -# you can specify any correlation method supported by `stats::cor`: +# You can specify any correlation method supported by `stats::cor`: feat_imp <- get_feature_importance(results$trained_model, - results$trained_model$trainingData, results$test_data, + results$trained_model$trainingData, + results$test_data, "dx", - multiClassSummary, "AUC", - class_probs = TRUE, method = "glmnet", + multiClassSummary, + "AUC", + class_probs = TRUE, + method = "glmnet", corr_method = "pearson" ) 
} From 0e03eae8451f27d7a5db011c416e0cfee006e995 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Tue, 22 Feb 2022 10:36:11 -0500 Subject: [PATCH 2/3] Update NEWS for PR #293 --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 79494ebc..452c4b8e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,8 @@ # development version - mikropml now requires R version 4.1.0 or greater due to an update in the randomForest package (#292). -- Fix bug where `cv_times` had no effect on repeats for cross-validation (#291, @kelly-sovacool). +- Fix bug where `cv_times` had no effect on reported repeats for cross-validation (#291, @kelly-sovacool). +- Made minor documentation improvements (#293, @kelly-sovacool). # mikropml 1.2.2 From 181fa885a53354ad33d7ce875f345fe8f2ae381f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 24 Feb 2022 20:10:07 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=93=91=20Build=20docs=20site?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/pkgdown.yml | 2 +- docs/reference/get_feature_importance.html | 60 ++++++++++++++------- docs/reference/get_perf_metric_fn.html | 6 +-- docs/reference/randomize_feature_order.html | 8 +-- 4 files changed, 49 insertions(+), 27 deletions(-) diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index d6adef55..84b803e2 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -7,7 +7,7 @@ articles: parallel: parallel.html preprocess: preprocess.html tuning: tuning.html -last_built: 2022-02-16T16:38Z +last_built: 2022-02-24T19:44Z urls: reference: http://www.schlosslab.org/mikropml/reference article: http://www.schlosslab.org/mikropml/articles diff --git a/docs/reference/get_feature_importance.html b/docs/reference/get_feature_importance.html index 17fdb7b2..fc3ec17d 100644 --- a/docs/reference/get_feature_importance.html +++ b/docs/reference/get_feature_importance.html 
@@ -171,6 +171,9 @@

Details

The p-value represents the probability of obtaining the actual performance in the event that the null hypothesis is true, where the null hypothesis is that the feature is not important for model performance.

+

We strongly recommend providing multiple cores to speed up computation time. +See our vignette on parallel processing +for more details.

Author

@@ -182,21 +185,35 @@

Author

Examples

if (FALSE) {
+# If you called `run_ml()` with `feature_importance = FALSE` (the default),
+# you can use `get_feature_importance()` later as long as you have the
+# trained model and test data.
 results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
 names(results$trained_model$trainingData)[1] <- "dx"
-get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+feat_imp <- get_feature_importance(results$trained_model,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet"
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet"
 )
 
-# optionally, you can group features together with a custom grouping
-get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+# We strongly recommend providing multiple cores to speed up computation time.
+# Do this before calling `get_feature_importance()`.
+doFuture::registerDoFuture()
+future::plan(future::multicore, workers = 2)
+
+# Optionally, you can group features together with a custom grouping
+feat_imp <- get_feature_importance(results$trained_model,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet",
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet",
   groups = c(
     "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012",
     "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022",
@@ -212,9 +229,8 @@ 

Examples

) ) -# the function can show a progress bar if you have the progressr package installed -## optionally, specify the progress bar format - +# the function can show a progress bar if you have the `progressr` package installed. +## optionally, specify the progress bar format: progressr::handlers(progressr::handler_progress( format = ":message :bar :percent | elapsed: :elapsed | eta: :eta", clear = FALSE, @@ -224,18 +240,24 @@

Examples

progressr::handlers(global = TRUE) ## run the function and watch the live progress udpates feat_imp <- get_feature_importance(results$trained_model, - results$trained_model$trainingData, results$test_data, + results$trained_model$trainingData, + results$test_data, "dx", - multiClassSummary, "AUC", - class_probs = TRUE, method = "glmnet" + multiClassSummary, + "AUC", + class_probs = TRUE, + method = "glmnet" ) -# you can specify any correlation method supported by `stats::cor`: +# You can specify any correlation method supported by `stats::cor`: feat_imp <- get_feature_importance(results$trained_model, - results$trained_model$trainingData, results$test_data, + results$trained_model$trainingData, + results$test_data, "dx", - multiClassSummary, "AUC", - class_probs = TRUE, method = "glmnet", + multiClassSummary, + "AUC", + class_probs = TRUE, + method = "glmnet", corr_method = "pearson" ) } diff --git a/docs/reference/get_perf_metric_fn.html b/docs/reference/get_perf_metric_fn.html index fbda5846..1216b07c 100644 --- a/docs/reference/get_perf_metric_fn.html +++ b/docs/reference/get_perf_metric_fn.html @@ -105,7 +105,7 @@

Examples

#> data$obs <- factor(data$obs, levels = lev) #> postResample(data[, "pred"], data[, "obs"]) #> } -#> <bytecode: 0x7fa6d7021a18> +#> <bytecode: 0x7f80bb595430> #> <environment: namespace:caret> get_perf_metric_fn("binary") #> function (data, lev = NULL, model = NULL) @@ -163,7 +163,7 @@

Examples

#> stats <- stats[c(stat_list)] #> return(stats) #> } -#> <bytecode: 0x7fa6f3348cf8> +#> <bytecode: 0x7f80d7840cc0> #> <environment: namespace:caret> get_perf_metric_fn("multiclass") #> function (data, lev = NULL, model = NULL) @@ -221,7 +221,7 @@

Examples

#> stats <- stats[c(stat_list)] #> return(stats) #> } -#> <bytecode: 0x7fa6f3348cf8> +#> <bytecode: 0x7f80d7840cc0> #> <environment: namespace:caret>
diff --git a/docs/reference/randomize_feature_order.html b/docs/reference/randomize_feature_order.html index c2cbc7b9..0e396324 100644 --- a/docs/reference/randomize_feature_order.html +++ b/docs/reference/randomize_feature_order.html @@ -107,10 +107,10 @@

Examples

a = 4:6, b = 7:9, c = 10:12, d = 13:15 ) randomize_feature_order(dat, "outcome") -#> outcome c b d a -#> 1 1 10 7 13 4 -#> 2 2 11 8 14 5 -#> 3 3 12 9 15 6 +#> outcome c d b a +#> 1 1 10 13 7 4 +#> 2 2 11 14 8 5 +#> 3 3 12 15 9 6