diff --git a/NEWS.md b/NEWS.md
index 79494ebc..452c4b8e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,8 @@
 # development version
 
 - mikropml now requires R version 4.1.0 or greater due to an update in the randomForest package (#292).
-- Fix bug where `cv_times` had no effect on repeats for cross-validation (#291, @kelly-sovacool).
+- Fix bug where `cv_times` had no effect on reported repeats for cross-validation (#291, @kelly-sovacool).
+- Made minor documentation improvements (#293, @kelly-sovacool)
 
 # mikropml 1.2.2
 
diff --git a/R/feature_importance.R b/R/feature_importance.R
index 8e18caa9..b1a434cd 100644
--- a/R/feature_importance.R
+++ b/R/feature_importance.R
@@ -34,23 +34,41 @@
 #' the event that the null hypothesis is true, where the null hypothesis is that
 #' the feature is not important for model performance.
 #'
+#' We strongly recommend providing multiple cores to speed up computation time.
+#' See [our vignette on parallel processing](http://www.schlosslab.org/mikropml/articles/parallel.html)
+#' for more details.
+#'
 #' @examples
 #' \dontrun{
+#' # If you called `run_ml()` with `feature_importance = FALSE` (the default),
+#' # you can use `get_feature_importance()` later as long as you have the
+#' # trained model and test data.
 #' results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
 #' names(results$trained_model$trainingData)[1] <- "dx"
-#' get_feature_importance(results$trained_model,
-#'   results$trained_model$trainingData, results$test_data,
+#' feat_imp <- get_feature_importance(results$trained_model,
+#'   results$trained_model$trainingData,
+#'   results$test_data,
 #'   "dx",
-#'   multiClassSummary, "AUC",
-#'   class_probs = TRUE, method = "glmnet"
+#'   multiClassSummary,
+#'   "AUC",
+#'   class_probs = TRUE,
+#'   method = "glmnet"
 #' )
 #'
-#' # optionally, you can group features together with a custom grouping
-#' get_feature_importance(results$trained_model,
-#'   results$trained_model$trainingData, results$test_data,
+#' # We strongly recommend providing multiple cores to speed up computation time.
+#' # Do this before calling `get_feature_importance()`.
+#' doFuture::registerDoFuture()
+#' future::plan(future::multicore, workers = 2)
+#'
+#' # Optionally, you can group features together with a custom grouping
+#' feat_imp <- get_feature_importance(results$trained_model,
+#'   results$trained_model$trainingData,
+#'   results$test_data,
 #'   "dx",
-#'   multiClassSummary, "AUC",
-#'   class_probs = TRUE, method = "glmnet",
+#'   multiClassSummary,
+#'   "AUC",
+#'   class_probs = TRUE,
+#'   method = "glmnet",
 #'   groups = c(
 #'     "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012",
 #'     "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022",
@@ -66,9 +84,8 @@
 #'   )
 #' )
 #'
-#' # the function can show a progress bar if you have the progressr package installed
-#' ## optionally, specify the progress bar format
-#'
+#' # the function can show a progress bar if you have the `progressr` package installed.
+#' ## optionally, specify the progress bar format:
 #' progressr::handlers(progressr::handler_progress(
 #'   format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
 #'   clear = FALSE,
@@ -78,18 +95,24 @@
 #' progressr::handlers(global = TRUE)
 #' ## run the function and watch the live progress udpates
 #' feat_imp <- get_feature_importance(results$trained_model,
-#'   results$trained_model$trainingData, results$test_data,
+#'   results$trained_model$trainingData,
+#'   results$test_data,
 #'   "dx",
-#'   multiClassSummary, "AUC",
-#'   class_probs = TRUE, method = "glmnet"
+#'   multiClassSummary,
+#'   "AUC",
+#'   class_probs = TRUE,
+#'   method = "glmnet"
 #' )
 #'
-#' # you can specify any correlation method supported by `stats::cor`:
+#' # You can specify any correlation method supported by `stats::cor`:
 #' feat_imp <- get_feature_importance(results$trained_model,
-#'   results$trained_model$trainingData, results$test_data,
+#'   results$trained_model$trainingData,
+#'   results$test_data,
 #'   "dx",
-#'   multiClassSummary, "AUC",
-#'   class_probs = TRUE, method = "glmnet",
+#'   multiClassSummary,
+#'   "AUC",
+#'   class_probs = TRUE,
+#'   method = "glmnet",
 #'   corr_method = "pearson"
 #' )
 #' }
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index d6adef55..84b803e2 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -7,7 +7,7 @@ articles:
   parallel: parallel.html
   preprocess: preprocess.html
   tuning: tuning.html
-last_built: 2022-02-16T16:38Z
+last_built: 2022-02-24T19:44Z
 urls:
   reference: http://www.schlosslab.org/mikropml/reference
   article: http://www.schlosslab.org/mikropml/articles
diff --git a/docs/reference/get_feature_importance.html b/docs/reference/get_feature_importance.html
index 17fdb7b2..fc3ec17d 100644
--- a/docs/reference/get_feature_importance.html
+++ b/docs/reference/get_feature_importance.html
@@ -171,6 +171,9 @@

Details

The p-value represents the probability of obtaining the actual performance in the event that the null hypothesis is true, where the null hypothesis is that the feature is not important for model performance.

+
+We strongly recommend providing multiple cores to speed up computation time.
+See our vignette on parallel processing
+for more details.

Author

@@ -182,21 +185,35 @@

Author

Examples

if (FALSE) {
+# If you called `run_ml()` with `feature_importance = FALSE` (the default),
+# you can use `get_feature_importance()` later as long as you have the
+# trained model and test data.
 results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
 names(results$trained_model$trainingData)[1] <- "dx"
-get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+feat_imp <- get_feature_importance(results$trained_model,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet"
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet"
 )
 
-# optionally, you can group features together with a custom grouping
-get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+# We strongly recommend providing multiple cores to speed up computation time.
+# Do this before calling `get_feature_importance()`.
+doFuture::registerDoFuture()
+future::plan(future::multicore, workers = 2)
+
+# Optionally, you can group features together with a custom grouping
+feat_imp <- get_feature_importance(results$trained_model,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet",
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet",
   groups = c(
     "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012",
     "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022",
@@ -212,9 +229,8 @@ 

Examples

   )
 )
 
-# the function can show a progress bar if you have the progressr package installed
-## optionally, specify the progress bar format
-
+# the function can show a progress bar if you have the `progressr` package installed.
+## optionally, specify the progress bar format:
 progressr::handlers(progressr::handler_progress(
   format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
   clear = FALSE,
@@ -224,18 +240,24 @@

Examples

 progressr::handlers(global = TRUE)
 ## run the function and watch the live progress udpates
 feat_imp <- get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet"
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet"
 )
 
-# you can specify any correlation method supported by `stats::cor`:
+# You can specify any correlation method supported by `stats::cor`:
 feat_imp <- get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet",
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet",
   corr_method = "pearson"
 )
 }
diff --git a/docs/reference/get_perf_metric_fn.html b/docs/reference/get_perf_metric_fn.html
index fbda5846..1216b07c 100644
--- a/docs/reference/get_perf_metric_fn.html
+++ b/docs/reference/get_perf_metric_fn.html
@@ -105,7 +105,7 @@

Examples

 #> data$obs <- factor(data$obs, levels = lev)
 #> postResample(data[, "pred"], data[, "obs"])
 #> }
-#> <bytecode: 0x7fa6d7021a18>
+#> <bytecode: 0x7f80bb595430>
 #> <environment: namespace:caret>
 get_perf_metric_fn("binary")
 #> function (data, lev = NULL, model = NULL)
@@ -163,7 +163,7 @@

Examples

 #> stats <- stats[c(stat_list)]
 #> return(stats)
 #> }
-#> <bytecode: 0x7fa6f3348cf8>
+#> <bytecode: 0x7f80d7840cc0>
 #> <environment: namespace:caret>
 get_perf_metric_fn("multiclass")
 #> function (data, lev = NULL, model = NULL)
@@ -221,7 +221,7 @@

Examples

 #> stats <- stats[c(stat_list)]
 #> return(stats)
 #> }
-#> <bytecode: 0x7fa6f3348cf8>
+#> <bytecode: 0x7f80d7840cc0>
 #> <environment: namespace:caret>
diff --git a/docs/reference/randomize_feature_order.html b/docs/reference/randomize_feature_order.html
index c2cbc7b9..0e396324 100644
--- a/docs/reference/randomize_feature_order.html
+++ b/docs/reference/randomize_feature_order.html
@@ -107,10 +107,10 @@

Examples

   a = 4:6, b = 7:9, c = 10:12, d = 13:15
 )
 randomize_feature_order(dat, "outcome")
-#>   outcome  c b  d a
-#> 1       1 10 7 13 4
-#> 2       2 11 8 14 5
-#> 3       3 12 9 15 6
+#>   outcome  c  d b a
+#> 1       1 10 13 7 4
+#> 2       2 11 14 8 5
+#> 3       3 12 15 9 6
diff --git a/man/get_feature_importance.Rd b/man/get_feature_importance.Rd
index e6b48868..6861d461 100644
--- a/man/get_feature_importance.Rd
+++ b/man/get_feature_importance.Rd
@@ -96,24 +96,42 @@ precision of estimating the null distribution, but also increases runtime.
 The p-value represents the probability of obtaining the actual performance in
 the event that the null hypothesis is true, where the null hypothesis is that
 the feature is not important for model performance.
+
+We strongly recommend providing multiple cores to speed up computation time.
+See \href{http://www.schlosslab.org/mikropml/articles/parallel.html}{our vignette on parallel processing}
+for more details.
 }
 \examples{
 \dontrun{
+# If you called `run_ml()` with `feature_importance = FALSE` (the default),
+# you can use `get_feature_importance()` later as long as you have the
+# trained model and test data.
 results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
 names(results$trained_model$trainingData)[1] <- "dx"
-get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+feat_imp <- get_feature_importance(results$trained_model,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet"
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet"
 )
 
-# optionally, you can group features together with a custom grouping
-get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+# We strongly recommend providing multiple cores to speed up computation time.
+# Do this before calling `get_feature_importance()`.
+doFuture::registerDoFuture()
+future::plan(future::multicore, workers = 2)
+
+# Optionally, you can group features together with a custom grouping
+feat_imp <- get_feature_importance(results$trained_model,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet",
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet",
   groups = c(
     "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012",
     "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022",
@@ -129,9 +147,8 @@ get_feature_importance(results$trained_model,
   )
 )
 
-# the function can show a progress bar if you have the progressr package installed
-## optionally, specify the progress bar format
-
+# the function can show a progress bar if you have the `progressr` package installed.
+## optionally, specify the progress bar format:
 progressr::handlers(progressr::handler_progress(
   format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
   clear = FALSE,
@@ -141,18 +158,24 @@ progressr::handlers(progressr::handler_progress(
 progressr::handlers(global = TRUE)
 ## run the function and watch the live progress udpates
 feat_imp <- get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet"
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet"
 )
 
-# you can specify any correlation method supported by `stats::cor`:
+# You can specify any correlation method supported by `stats::cor`:
 feat_imp <- get_feature_importance(results$trained_model,
-  results$trained_model$trainingData, results$test_data,
+  results$trained_model$trainingData,
+  results$test_data,
   "dx",
-  multiClassSummary, "AUC",
-  class_probs = TRUE, method = "glmnet",
+  multiClassSummary,
+  "AUC",
+  class_probs = TRUE,
+  method = "glmnet",
   corr_method = "pearson"
 )
 }
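
For reference, the end-to-end workflow these documentation changes describe looks roughly like the sketch below. It is illustrative only and mirrors the example in the patch: the worker count of 2 is arbitrary, `future::multicore` assumes a Unix-alike system (use `future::multisession` elsewhere), and `multiClassSummary` is qualified with `caret::` here for clarity.

# Register a parallel backend before computing permutation importance
# (illustrative sketch; adjust `workers` for your machine).
library(mikropml)
doFuture::registerDoFuture()
future::plan(future::multicore, workers = 2)

# Train a model first, then compute feature importance afterwards.
results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
# caret stores the outcome column as ".outcome"; rename it to match outcome_colname.
names(results$trained_model$trainingData)[1] <- "dx"
feat_imp <- get_feature_importance(
  results$trained_model,
  results$trained_model$trainingData,
  results$test_data,
  "dx",
  caret::multiClassSummary,
  "AUC",
  class_probs = TRUE,
  method = "glmnet"
)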