From 4af1eca0ab8060cf323a00869809e807578cde22 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Tue, 22 Feb 2022 10:32:06 -0500
Subject: [PATCH 1/3] Improve feature importance examples
Strongly recommend using multiple cores.
---
R/feature_importance.R | 61 ++++++++++++++++++++++++-----------
man/get_feature_importance.Rd | 61 ++++++++++++++++++++++++-----------
2 files changed, 84 insertions(+), 38 deletions(-)
diff --git a/R/feature_importance.R b/R/feature_importance.R
index 8e18caa9..b1a434cd 100644
--- a/R/feature_importance.R
+++ b/R/feature_importance.R
@@ -34,23 +34,41 @@
#' the event that the null hypothesis is true, where the null hypothesis is that
#' the feature is not important for model performance.
#'
+#' We strongly recommend providing multiple cores to speed up computation time.
+#' See [our vignette on parallel processing](http://www.schlosslab.org/mikropml/articles/parallel.html)
+#' for more details.
+#'
#' @examples
#' \dontrun{
+#' # If you called `run_ml()` with `feature_importance = FALSE` (the default),
+#' # you can use `get_feature_importance()` later as long as you have the
+#' # trained model and test data.
#' results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
#' names(results$trained_model$trainingData)[1] <- "dx"
-#' get_feature_importance(results$trained_model,
-#' results$trained_model$trainingData, results$test_data,
+#' feat_imp <- get_feature_importance(results$trained_model,
+#' results$trained_model$trainingData,
+#' results$test_data,
#' "dx",
-#' multiClassSummary, "AUC",
-#' class_probs = TRUE, method = "glmnet"
+#' multiClassSummary,
+#' "AUC",
+#' class_probs = TRUE,
+#' method = "glmnet"
#' )
#'
-#' # optionally, you can group features together with a custom grouping
-#' get_feature_importance(results$trained_model,
-#' results$trained_model$trainingData, results$test_data,
+#' # We strongly recommend providing multiple cores to speed up computation time.
+#' # Do this before calling `get_feature_importance()`.
+#' doFuture::registerDoFuture()
+#' future::plan(future::multicore, workers = 2)
+#'
+#' # Optionally, you can group features together with a custom grouping
+#' feat_imp <- get_feature_importance(results$trained_model,
+#' results$trained_model$trainingData,
+#' results$test_data,
#' "dx",
-#' multiClassSummary, "AUC",
-#' class_probs = TRUE, method = "glmnet",
+#' multiClassSummary,
+#' "AUC",
+#' class_probs = TRUE,
+#' method = "glmnet",
#' groups = c(
#' "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012",
#' "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022",
@@ -66,9 +84,8 @@
#' )
#' )
#'
-#' # the function can show a progress bar if you have the progressr package installed
-#' ## optionally, specify the progress bar format
-#'
+#' # The function can show a progress bar if you have the `progressr` package installed.
+#' ## optionally, specify the progress bar format:
#' progressr::handlers(progressr::handler_progress(
#' format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
#' clear = FALSE,
@@ -78,18 +95,24 @@
#' progressr::handlers(global = TRUE)
#' ## run the function and watch the live progress udpates
#' feat_imp <- get_feature_importance(results$trained_model,
-#' results$trained_model$trainingData, results$test_data,
+#' results$trained_model$trainingData,
+#' results$test_data,
#' "dx",
-#' multiClassSummary, "AUC",
-#' class_probs = TRUE, method = "glmnet"
+#' multiClassSummary,
+#' "AUC",
+#' class_probs = TRUE,
+#' method = "glmnet"
#' )
#'
-#' # you can specify any correlation method supported by `stats::cor`:
+#' # You can specify any correlation method supported by `stats::cor`:
#' feat_imp <- get_feature_importance(results$trained_model,
-#' results$trained_model$trainingData, results$test_data,
+#' results$trained_model$trainingData,
+#' results$test_data,
#' "dx",
-#' multiClassSummary, "AUC",
-#' class_probs = TRUE, method = "glmnet",
+#' multiClassSummary,
+#' "AUC",
+#' class_probs = TRUE,
+#' method = "glmnet",
#' corr_method = "pearson"
#' )
#' }
diff --git a/man/get_feature_importance.Rd b/man/get_feature_importance.Rd
index e6b48868..6861d461 100644
--- a/man/get_feature_importance.Rd
+++ b/man/get_feature_importance.Rd
@@ -96,24 +96,42 @@ precision of estimating the null distribution, but also increases runtime.
The p-value represents the probability of obtaining the actual performance in
the event that the null hypothesis is true, where the null hypothesis is that
the feature is not important for model performance.
+
+We strongly recommend providing multiple cores to speed up computation time.
+See \href{http://www.schlosslab.org/mikropml/articles/parallel.html}{our vignette on parallel processing}
+for more details.
}
\examples{
\dontrun{
+# If you called `run_ml()` with `feature_importance = FALSE` (the default),
+# you can use `get_feature_importance()` later as long as you have the
+# trained model and test data.
results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
names(results$trained_model$trainingData)[1] <- "dx"
-get_feature_importance(results$trained_model,
- results$trained_model$trainingData, results$test_data,
+feat_imp <- get_feature_importance(results$trained_model,
+ results$trained_model$trainingData,
+ results$test_data,
"dx",
- multiClassSummary, "AUC",
- class_probs = TRUE, method = "glmnet"
+ multiClassSummary,
+ "AUC",
+ class_probs = TRUE,
+ method = "glmnet"
)
-# optionally, you can group features together with a custom grouping
-get_feature_importance(results$trained_model,
- results$trained_model$trainingData, results$test_data,
+# We strongly recommend providing multiple cores to speed up computation time.
+# Do this before calling `get_feature_importance()`.
+doFuture::registerDoFuture()
+future::plan(future::multicore, workers = 2)
+
+# Optionally, you can group features together with a custom grouping
+feat_imp <- get_feature_importance(results$trained_model,
+ results$trained_model$trainingData,
+ results$test_data,
"dx",
- multiClassSummary, "AUC",
- class_probs = TRUE, method = "glmnet",
+ multiClassSummary,
+ "AUC",
+ class_probs = TRUE,
+ method = "glmnet",
groups = c(
"Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012",
"Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022",
@@ -129,9 +147,8 @@ get_feature_importance(results$trained_model,
)
)
-# the function can show a progress bar if you have the progressr package installed
-## optionally, specify the progress bar format
-
+# The function can show a progress bar if you have the `progressr` package installed.
+## optionally, specify the progress bar format:
progressr::handlers(progressr::handler_progress(
format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
clear = FALSE,
@@ -141,18 +158,24 @@ progressr::handlers(progressr::handler_progress(
progressr::handlers(global = TRUE)
## run the function and watch the live progress udpates
feat_imp <- get_feature_importance(results$trained_model,
- results$trained_model$trainingData, results$test_data,
+ results$trained_model$trainingData,
+ results$test_data,
"dx",
- multiClassSummary, "AUC",
- class_probs = TRUE, method = "glmnet"
+ multiClassSummary,
+ "AUC",
+ class_probs = TRUE,
+ method = "glmnet"
)
-# you can specify any correlation method supported by `stats::cor`:
+# You can specify any correlation method supported by `stats::cor`:
feat_imp <- get_feature_importance(results$trained_model,
- results$trained_model$trainingData, results$test_data,
+ results$trained_model$trainingData,
+ results$test_data,
"dx",
- multiClassSummary, "AUC",
- class_probs = TRUE, method = "glmnet",
+ multiClassSummary,
+ "AUC",
+ class_probs = TRUE,
+ method = "glmnet",
corr_method = "pearson"
)
}
From 0e03eae8451f27d7a5db011c416e0cfee006e995 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool
Date: Tue, 22 Feb 2022 10:36:11 -0500
Subject: [PATCH 2/3] Update NEWS for PR #293
---
NEWS.md | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/NEWS.md b/NEWS.md
index 79494ebc..452c4b8e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,8 @@
# development version
- mikropml now requires R version 4.1.0 or greater due to an update in the randomForest package (#292).
-- Fix bug where `cv_times` had no effect on repeats for cross-validation (#291, @kelly-sovacool).
+- Fix bug where `cv_times` had no effect on reported repeats for cross-validation (#291, @kelly-sovacool).
+- Made minor documentation improvements (#293, @kelly-sovacool).
# mikropml 1.2.2
From 181fa885a53354ad33d7ce875f345fe8f2ae381f Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
<41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 24 Feb 2022 20:10:07 +0000
Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=93=91=20Build=20docs=20site?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
docs/pkgdown.yml | 2 +-
docs/reference/get_feature_importance.html | 60 ++++++++++++++-------
docs/reference/get_perf_metric_fn.html | 6 +--
docs/reference/randomize_feature_order.html | 8 +--
4 files changed, 49 insertions(+), 27 deletions(-)
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index d6adef55..84b803e2 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -7,7 +7,7 @@ articles:
parallel: parallel.html
preprocess: preprocess.html
tuning: tuning.html
-last_built: 2022-02-16T16:38Z
+last_built: 2022-02-24T19:44Z
urls:
reference: http://www.schlosslab.org/mikropml/reference
article: http://www.schlosslab.org/mikropml/articles
diff --git a/docs/reference/get_feature_importance.html b/docs/reference/get_feature_importance.html
index 17fdb7b2..fc3ec17d 100644
--- a/docs/reference/get_feature_importance.html
+++ b/docs/reference/get_feature_importance.html
@@ -171,6 +171,9 @@ Details
The p-value represents the probability of obtaining the actual performance in
the event that the null hypothesis is true, where the null hypothesis is that
the feature is not important for model performance.
+We strongly recommend providing multiple cores to speed up computation time.
+See our vignette on parallel processing
+for more details.
Author
@@ -182,21 +185,35 @@
Author
Examples
if (FALSE) {
+# If you called `run_ml()` with `feature_importance = FALSE` (the default),
+# you can use `get_feature_importance()` later as long as you have the
+# trained model and test data.
results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
names(results$trained_model$trainingData)[1] <- "dx"
-get_feature_importance(results$trained_model,
- results$trained_model$trainingData, results$test_data,
+feat_imp <- get_feature_importance(results$trained_model,
+ results$trained_model$trainingData,
+ results$test_data,
"dx",
- multiClassSummary, "AUC",
- class_probs = TRUE, method = "glmnet"
+ multiClassSummary,
+ "AUC",
+ class_probs = TRUE,
+ method = "glmnet"
)
-# optionally, you can group features together with a custom grouping
-get_feature_importance(results$trained_model,
- results$trained_model$trainingData, results$test_data,
+# We strongly recommend providing multiple cores to speed up computation time.
+# Do this before calling `get_feature_importance()`.
+doFuture::registerDoFuture()
+future::plan(future::multicore, workers = 2)
+
+# Optionally, you can group features together with a custom grouping
+feat_imp <- get_feature_importance(results$trained_model,
+ results$trained_model$trainingData,
+ results$test_data,
"dx",
- multiClassSummary, "AUC",
- class_probs = TRUE, method = "glmnet",
+ multiClassSummary,
+ "AUC",
+ class_probs = TRUE,
+ method = "glmnet",
groups = c(
"Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012",
"Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022",
@@ -212,9 +229,8 @@ Examples
)
)
-# the function can show a progress bar if you have the progressr package installed
-## optionally, specify the progress bar format
-
+# The function can show a progress bar if you have the `progressr` package installed.
+## optionally, specify the progress bar format:
progressr::handlers(progressr::handler_progress(
format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
clear = FALSE,
@@ -224,18 +240,24 @@ Examples
progressr::handlers(global = TRUE)
## run the function and watch the live progress udpates
feat_imp <- get_feature_importance(results$trained_model,
- results$trained_model$trainingData, results$test_data,
+ results$trained_model$trainingData,
+ results$test_data,
"dx",
- multiClassSummary, "AUC",
- class_probs = TRUE, method = "glmnet"
+ multiClassSummary,
+ "AUC",
+ class_probs = TRUE,
+ method = "glmnet"
)
-# you can specify any correlation method supported by `stats::cor`:
+# You can specify any correlation method supported by `stats::cor`:
feat_imp <- get_feature_importance(results$trained_model,
- results$trained_model$trainingData, results$test_data,
+ results$trained_model$trainingData,
+ results$test_data,
"dx",
- multiClassSummary, "AUC",
- class_probs = TRUE, method = "glmnet",
+ multiClassSummary,
+ "AUC",
+ class_probs = TRUE,
+ method = "glmnet",
corr_method = "pearson"
)
}
diff --git a/docs/reference/get_perf_metric_fn.html b/docs/reference/get_perf_metric_fn.html
index fbda5846..1216b07c 100644
--- a/docs/reference/get_perf_metric_fn.html
+++ b/docs/reference/get_perf_metric_fn.html
@@ -105,7 +105,7 @@ Examples
#> data$obs <- factor(data$obs, levels = lev)
#> postResample(data[, "pred"], data[, "obs"])
#> }
-#> <bytecode: 0x7fa6d7021a18>
+#> <bytecode: 0x7f80bb595430>
#> <environment: namespace:caret>
get_perf_metric_fn("binary")
#> function (data, lev = NULL, model = NULL)
@@ -163,7 +163,7 @@ Examples
#> stats <- stats[c(stat_list)]
#> return(stats)
#> }
-#> <bytecode: 0x7fa6f3348cf8>
+#> <bytecode: 0x7f80d7840cc0>
#> <environment: namespace:caret>
get_perf_metric_fn("multiclass")
#> function (data, lev = NULL, model = NULL)
@@ -221,7 +221,7 @@ Examples
#> stats <- stats[c(stat_list)]
#> return(stats)
#> }
-#> <bytecode: 0x7fa6f3348cf8>
+#> <bytecode: 0x7f80d7840cc0>
#> <environment: namespace:caret>
diff --git a/docs/reference/randomize_feature_order.html b/docs/reference/randomize_feature_order.html
index c2cbc7b9..0e396324 100644
--- a/docs/reference/randomize_feature_order.html
+++ b/docs/reference/randomize_feature_order.html
@@ -107,10 +107,10 @@
Examples
a = 4:6, b = 7:9, c = 10:12, d = 13:15
)
randomize_feature_order(dat, "outcome")
-
#> outcome c b d a
-
#> 1 1 10 7 13 4
-
#> 2 2 11 8 14 5
-
#> 3 3 12 9 15 6
+
#> outcome c d b a
+
#> 1 1 10 13 7 4
+
#> 2 2 11 14 8 5
+
#> 3 3 12 15 9 6