Merge pull request #288 from SchlossLab/iss-287

Report p-values in the feature importance dataframe
SchlossLab · Nov 30, 2021 · 9c4c317 · 9c4c317
2 parents 6f83e61 + a8173d8
commit 9c4c317
Show file tree

Hide file tree

Showing 23 changed files with 189 additions and 82 deletions.
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -10,6 +10,8 @@
 (~Strikethrough~ any points that are not applicable.)
 
 - [ ] Write unit tests for any new functionality or bug fixes.
-- [ ] Update roxygen comments & vignettes if there are any API changes.
+- [ ] Update docs if there are any API changes:
+  - [ ] roxygen comments
+  - [ ] vignettes
 - [ ] Update `NEWS.md` if this includes any user-facing changes. 
 - [ ] The check workflow succeeds on your most recent commit. **This is always required before the PR can be merged.**
diff --git a/.gitignore b/.gitignore
@@ -1,11 +1,9 @@
-Meta/
-doc/
 .DS_Store
 *.Rproj
 *.Rproj.user
 .Rproj.user
 .Rhistory
 inst/doc
 .snakemake
-doc
-Meta
+/doc/
+/Meta/
diff --git a/NEWS.md b/NEWS.md
@@ -1,7 +1,8 @@
 # development version 1.3.0
 
-- Allow `kfold >= length(groups)`. (#285, @kelly-sovacool)
+- Allow `kfold >= length(groups)` (#285, @kelly-sovacool).
     - When using the groups parameter, groups are kept together in cross-validation partitions when `kfold` <= the number of groups in the training set. Previously, an error was thrown if this condition was not met. Now, if there are not enough groups in the training set for groups to be kept together during CV, groups are allowed to be split up across CV partitions. 
+- Report p-values for permutation feature importance (#288, @kelly-sovacool).
 
 # mikropml 1.2.0
 

diff --git a/R/feature_importance.R b/R/feature_importance.R
@@ -13,12 +13,26 @@
 #'   character (`|`). If this is `NULL` (default), correlated features will be
 #'   grouped together based on `corr_thresh`.
 #'
-#' @return Dataframe with performance metrics for when each feature (or group of
-#'   correlated features; `names`) is permuted (`perf_metric`), and differences
-#'   between test performance metric and permuted performance metric
-#'   (`perf_metric_diff`; test minus permuted performance). Features with a
-#'   larger `perf_metric_diff` are more important. The performance metric name
-#'   (`perf_metric_name`) and seed (`seed`) are also returned.
+#' @return Data frame with performance metrics for when each feature (or group
+#'   of correlated features; `names`) is permuted (`perf_metric`), differences
+#'   between the actual test performance metric and the permuted performance
+#'   metric (`perf_metric_diff`; test minus permuted performance), and the
+#'   p-value (`pvalue`: the probability of obtaining the actual performance
+#'   value under the null hypothesis). Features with a larger `perf_metric_diff`
+#'   are more important. The performance metric name (`perf_metric_name`) and
+#'   seed (`seed`) are also returned.
+#'
+#' @details
+#' For permutation tests, the p-value is the number of permutation statistics
+#' that are greater than the test statistic, divided by the number of
+#' permutations. In our case, the permutation statistic is the model performance
+#' (e.g. AUROC) after randomizing the order of observations for one feature, and
+#' the test statistic is the actual performance on the test data. By default we
+#' perform 100 permutations per feature; increasing this will increase the
+#' precision of estimating the null distribution, but also increases runtime.
+#' The p-value estimates the probability of obtaining a performance greater than
+#' the actual performance if the null hypothesis is true, where the null
+#' hypothesis is that the feature is not important for model performance.
 #'
 #' @examples
 #' \dontrun{
@@ -83,6 +97,7 @@
 #' @export
 #' @author Begüm Topçuoğlu, \email{topcuoglu.begum@@gmail.com}
 #' @author Zena Lapp, \email{zenalapp@@umich.edu}
+#' @author Kelly Sovacool, \email{sovacool@@umich.edu}
 get_feature_importance <- function(trained_model, train_data, test_data,
                                    outcome_colname, perf_metric_function,
                                    perf_metric_name, class_probs, method,
@@ -195,9 +210,10 @@ find_permuted_perf_metric <- function(test_data, trained_model, outcome_colname,
       )[[perf_metric_name]]
     )
   })
-  mean_perm_perf <- sum(perm_perfs) / nperms
+  mean_perm_perf <- mean(perm_perfs)
   return(c(
     perf_metric = mean_perm_perf,
-    perf_metric_diff = test_perf_value - mean_perm_perf
+    perf_metric_diff = test_perf_value - mean_perm_perf,
+    pvalue = calc_pvalue(perm_perfs, test_perf_value)
   ))
 }
diff --git a/R/utils.R b/R/utils.R
@@ -231,3 +231,17 @@ radix_sort <- function(...) {
 is_whole_number <- function(x, tol = .Machine$double.eps^0.5) {
   abs(x - round(x)) < tol
 }
+
+#' Calculate the p-value for a permutation test
+#'
+#' @param vctr vector of statistics
+#' @param test_stat the test statistic
+#'
+#' @return the number of observations in `vctr` that are greater than
+#'   `test_stat` divided by the number of observations in `vctr`
+#'
+#' @noRd
+#' @author Kelly Sovacool \email{sovacool@@umich.edu}
+calc_pvalue <- function(vctr, test_stat) {
+  return(sum(vctr > test_stat) / length(vctr))
+}
diff --git a/data-raw/otu_mini_bin.R b/data-raw/otu_mini_bin.R
@@ -74,7 +74,7 @@ otu_mini_bin_results_rf <- mikropml::run_ml(otu_mini_bin,
   find_feature_importance = TRUE,
   seed = 2019,
   cv_times = 2,
-  group = otu_mini_group
+  groups = otu_mini_group
 )
 usethis::use_data(otu_mini_bin_results_rf, overwrite = TRUE)
 

diff --git a/data-raw/otu_mini_multi.R b/data-raw/otu_mini_multi.R
@@ -12,6 +12,6 @@ otu_mini_multi_results_glmnet <- mikropml::run_ml(otu_mini_multi, # use built-in
   find_feature_importance = TRUE,
   seed = 2019,
   cv_times = 2,
-  group = otu_mini_multi_group
+  groups = otu_mini_multi_group
 )
 usethis::use_data(otu_mini_multi_results_glmnet, overwrite = TRUE)
diff --git a/data/otu_mini_bin.rda b/data/otu_mini_bin.rda
diff --git a/data/otu_mini_bin_results_rf.rda b/data/otu_mini_bin_results_rf.rda
diff --git a/data/otu_mini_cont_results_glmnet.rda b/data/otu_mini_cont_results_glmnet.rda
diff --git a/data/otu_mini_multi.rda b/data/otu_mini_multi.rda
diff --git a/data/otu_mini_multi_group.rda b/data/otu_mini_multi_group.rda
diff --git a/data/otu_mini_multi_results_glmnet.rda b/data/otu_mini_multi_results_glmnet.rda
diff --git a/docs/articles/introduction.html b/docs/articles/introduction.html
diff --git a/docs/articles/introduction_files/header-attrs-2.11/header-attrs.js b/docs/articles/introduction_files/header-attrs-2.11/header-attrs.js
@@ -0,0 +1,12 @@
+// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
+// be compatible with the behavior of Pandoc < 2.8).
+document.addEventListener('DOMContentLoaded', function(e) {
+  var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");
+  var i, h, a;
+  for (i = 0; i < hs.length; i++) {
+    h = hs[i];
+    if (!/^h[1-6]$/i.test(h.tagName)) continue;  // it should be a header h1-h6
+    a = h.attributes;
+    while (a.length > 0) h.removeAttribute(a[0].name);
+  }
+});
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -7,7 +7,7 @@ articles:
   parallel: parallel.html
   preprocess: preprocess.html
   tuning: tuning.html
-last_built: 2021-11-22T15:57Z
+last_built: 2021-11-30T18:45Z
 urls:
   reference: http://www.schlosslab.org/mikropml/reference
   article: http://www.schlosslab.org/mikropml/articles

diff --git a/docs/pull_request_template.html b/docs/pull_request_template.html
diff --git a/docs/reference/get_feature_importance.html b/docs/reference/get_feature_importance.html
diff --git a/docs/reference/get_perf_metric_fn.html b/docs/reference/get_perf_metric_fn.html
diff --git a/man/get_feature_importance.Rd b/man/get_feature_importance.Rd