diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f1c204fb..745f3805 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -10,6 +10,8 @@ (~Strikethrough~ any points that are not applicable.) - [ ] Write unit tests for any new functionality or bug fixes. -- [ ] Update roxygen comments & vignettes if there are any API changes. +- [ ] Update docs if there are any API changes: + - [ ] roxygen comments + - [ ] vignettes - [ ] Update `NEWS.md` if this includes any user-facing changes. - [ ] The check workflow succeeds on your most recent commit. **This is always required before the PR can be merged.** diff --git a/.gitignore b/.gitignore index c030667b..c58a0f37 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,3 @@ -Meta/ -doc/ .DS_Store *.Rproj *.Rproj.user @@ -7,5 +5,5 @@ doc/ .Rhistory inst/doc .snakemake -doc -Meta +/doc/ +/Meta/ diff --git a/NEWS.md b/NEWS.md index 360d9264..6f23ac00 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,8 @@ # development version 1.3.0 -- Allow `kfold >= length(groups)`. (#285, @kelly-sovacool) +- Allow `kfold >= length(groups)` (#285, @kelly-sovacool). - When using the groups parameter, groups are kept together in cross-validation partitions when `kfold` <= the number of groups in the training set. Previously, an error was thrown if this condition was not met. Now, if there are not enough groups in the training set for groups to be kept together during CV, groups are allowed to be split up across CV partitions. +- Report p-values for permutation feature importance (#288, @kelly-sovacool) # mikropml 1.2.0 diff --git a/R/feature_importance.R b/R/feature_importance.R index 0692ac08..8e18caa9 100644 --- a/R/feature_importance.R +++ b/R/feature_importance.R @@ -13,12 +13,26 @@ #' character (`|`). If this is `NULL` (default), correlated features will be #' grouped together based on `corr_thresh`. #' -#' @return Dataframe with performance metrics for when each feature (or group of -#' correlated features; `names`) is permuted (`perf_metric`), and differences -#' between test performance metric and permuted performance metric -#' (`perf_metric_diff`; test minus permuted performance). Features with a -#' larger `perf_metric_diff` are more important. The performance metric name -#' (`perf_metric_name`) and seed (`seed`) are also returned. +#' @return Data frame with performance metrics for when each feature (or group +#' of correlated features; `names`) is permuted (`perf_metric`), differences +#' between the actual test performance metric and the permuted performance +#' metric (`perf_metric_diff`; test minus permuted performance), and the +#' p-value (`pvalue`: the probability of obtaining the actual performance +#' value under the null hypothesis). Features with a larger `perf_metric_diff` +#' are more important. The performance metric name (`perf_metric_name`) and +#' seed (`seed`) are also returned. +#' +#' @details +#' For permutation tests, the p-value is the number of permutation statistics +#' that are greater than the test statistic, divided by the number of +#' permutations. In our case, the permutation statistic is the model performance +#' (e.g. AUROC) after randomizing the order of observations for one feature, and +#' the test statistic is the actual performance on the test data. By default we +#' perform 100 permutations per feature; increasing this will increase the +#' precision of estimating the null distribution, but also increases runtime.
+#' The p-value represents the probability of obtaining the actual performance in +#' the event that the null hypothesis is true, where the null hypothesis is that +#' the feature is not important for model performance. #' #' @examples #' \dontrun{ @@ -83,6 +97,7 @@ #' @export #' @author Begüm Topçuoğlu, \email{topcuoglu.begum@@gmail.com} #' @author Zena Lapp, \email{zenalapp@@umich.edu} +#' @author Kelly Sovacool, \email{sovacool@@umich.edu} get_feature_importance <- function(trained_model, train_data, test_data, outcome_colname, perf_metric_function, perf_metric_name, class_probs, method, @@ -195,9 +210,10 @@ find_permuted_perf_metric <- function(test_data, trained_model, outcome_colname, )[[perf_metric_name]] ) }) - mean_perm_perf <- sum(perm_perfs) / nperms + mean_perm_perf <- mean(perm_perfs) return(c( perf_metric = mean_perm_perf, - perf_metric_diff = test_perf_value - mean_perm_perf + perf_metric_diff = test_perf_value - mean_perm_perf, + pvalue = calc_pvalue(perm_perfs, test_perf_value) )) } diff --git a/R/utils.R b/R/utils.R index c1fb9956..1b6bcb98 100644 --- a/R/utils.R +++ b/R/utils.R @@ -231,3 +231,17 @@ radix_sort <- function(...) { is_whole_number <- function(x, tol = .Machine$double.eps^0.5) { abs(x - round(x)) < tol } + +#' Calculate the p-value for a permutation test +#' +#' @param vctr vector of statistics +#' @param test_stat the test statistic +#' +#' @return the number of observations in `vctr` that are greater than +#' `test_stat` divided by the number of observations in `vctr` +#' +#' @noRd +#' @author Kelly Sovacool \email{sovacool@@umich.edu} +calc_pvalue <- function(vctr, test_stat) { + return(sum(vctr > test_stat) / length(vctr)) +} diff --git a/data-raw/otu_mini_bin.R b/data-raw/otu_mini_bin.R index e733cfd9..b48fd017 100644 --- a/data-raw/otu_mini_bin.R +++ b/data-raw/otu_mini_bin.R @@ -74,7 +74,7 @@ otu_mini_bin_results_rf <- mikropml::run_ml(otu_mini_bin, find_feature_importance = TRUE, seed = 2019, cv_times = 2, - group = otu_mini_group + groups = otu_mini_group ) usethis::use_data(otu_mini_bin_results_rf, overwrite = TRUE) diff --git a/data-raw/otu_mini_multi.R b/data-raw/otu_mini_multi.R index 37b43bab..1d13f23d 100644 --- a/data-raw/otu_mini_multi.R +++ b/data-raw/otu_mini_multi.R @@ -12,6 +12,6 @@ otu_mini_multi_results_glmnet <- mikropml::run_ml(otu_mini_multi, # use built-in find_feature_importance = TRUE, seed = 2019, cv_times = 2, - group = otu_mini_multi_group + groups = otu_mini_multi_group ) usethis::use_data(otu_mini_multi_results_glmnet, overwrite = TRUE) diff --git a/data/otu_mini_bin.rda b/data/otu_mini_bin.rda index 84ba2e2a..c8e69ca7 100644 Binary files a/data/otu_mini_bin.rda and b/data/otu_mini_bin.rda differ diff --git a/data/otu_mini_bin_results_rf.rda b/data/otu_mini_bin_results_rf.rda index d0f72c1e..27c0b828 100644 Binary files a/data/otu_mini_bin_results_rf.rda and b/data/otu_mini_bin_results_rf.rda differ diff --git a/data/otu_mini_cont_results_glmnet.rda b/data/otu_mini_cont_results_glmnet.rda index 65c0b6b7..386b81fd 100644 Binary files a/data/otu_mini_cont_results_glmnet.rda and b/data/otu_mini_cont_results_glmnet.rda differ diff --git a/data/otu_mini_multi.rda b/data/otu_mini_multi.rda index b28d66c1..9db7d23e 100644 Binary files a/data/otu_mini_multi.rda and b/data/otu_mini_multi.rda differ diff --git a/data/otu_mini_multi_group.rda b/data/otu_mini_multi_group.rda index 5db10426..747efbcc 100644 Binary files a/data/otu_mini_multi_group.rda and b/data/otu_mini_multi_group.rda differ diff --git 
a/data/otu_mini_multi_results_glmnet.rda b/data/otu_mini_multi_results_glmnet.rda index a0280f12..4109291f 100644 Binary files a/data/otu_mini_multi_results_glmnet.rda and b/data/otu_mini_multi_results_glmnet.rda differ diff --git a/docs/articles/introduction.html b/docs/articles/introduction.html index ab988cf6..769488ae 100644 --- a/docs/articles/introduction.html +++ b/docs/articles/introduction.html @@ -438,23 +438,25 @@
Now, we can check out the feature importances:
results_imp$feature_importance
-#> perf_metric perf_metric_diff names method perf_metric_name seed
-#> 1 0.5542375 0.0082625 Otu00001 rf AUC 2019
-#> 2 0.5731750 -0.0106750 Otu00002 rf AUC 2019
-#> 3 0.5548750 0.0076250 Otu00003 rf AUC 2019
-#> 4 0.6414750 -0.0789750 Otu00004 rf AUC 2019
-#> 5 0.5049625 0.0575375 Otu00005 rf AUC 2019
-#> 6 0.5444500 0.0180500 Otu00006 rf AUC 2019
-#> 7 0.5417125 0.0207875 Otu00007 rf AUC 2019
-#> 8 0.5257750 0.0367250 Otu00008 rf AUC 2019
-#> 9 0.5395750 0.0229250 Otu00009 rf AUC 2019
-#> 10 0.4977625 0.0647375 Otu00010 rf AUC 2019
There are several columns:
`perf_metric`: The performance value of the permuted feature.
-`perf_metric_diff`: The difference between the performance for the true and permuted data (i.e. test performance minus permuted performance). Features with a larger `perf_metric_diff` are more important.
+`perf_metric_diff`: The difference between the performance for the actual and permuted data (i.e. test performance minus permuted performance). Features with a larger `perf_metric_diff` are more important.
+`pvalue`: the probability of obtaining the actual performance value under the null hypothesis.
`names`: The feature that was permuted.
-Dataframe with performance metrics for when each feature (or group of correlated features; `names`) is permuted (`perf_metric`), and differences between test performance metric and permuted performance metric (`perf_metric_diff`; test minus permuted performance). Features with a larger `perf_metric_diff` are more important. The performance metric name (`perf_metric_name`) and seed (`seed`) are also returned.
+Data frame with performance metrics for when each feature (or group of correlated features; `names`) is permuted (`perf_metric`), differences between the actual test performance metric and the permuted performance metric (`perf_metric_diff`; test minus permuted performance), and the p-value (`pvalue`: the probability of obtaining the actual performance value under the null hypothesis). Features with a larger `perf_metric_diff` are more important. The performance metric name (`perf_metric_name`) and seed (`seed`) are also returned.
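To make the new column concrete, here is a minimal usage sketch (assuming `results_imp` was created with `find_feature_importance = TRUE` as in the vignette; the 0.05 cutoff is only an illustration, not a package default):

feat_imp <- results_imp$feature_importance
feat_imp <- feat_imp[order(-feat_imp$perf_metric_diff), ]  # most important features first
subset(feat_imp, pvalue < 0.05)                            # arbitrary illustrative cutoff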
For permutation tests, the p-value is the number of permutation statistics +that are greater than the test statistic, divided by the number of +permutations. In our case, the permutation statistic is the model performance +(e.g. AUROC) after randomizing the order of observations for one feature, and +the test statistic is the actual performance on the test data. By default we +perform 100 permutations per feature; increasing this will increase the +precision of estimating the null distribution, but also increases runtime. +The p-value represents the probability of obtaining the actual performance in +the event that the null hypothesis is true, where the null hypothesis is that +the feature is not important for model performance.
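As a toy illustration of that definition (hypothetical numbers, not package output), the arithmetic mirrors the new `calc_pvalue()` helper in R/utils.R:

perm_perfs <- c(0.61, 0.63, 0.60, 0.66, 0.62)  # e.g. AUROCs from 5 permutations of one feature
test_perf_value <- 0.65                        # actual AUROC on the unpermuted test data
sum(perm_perfs > test_perf_value) / length(perm_perfs)  # 1 of 5 permuted values exceeds it, so p = 0.2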
Begüm Topçuoğlu, topcuoglu.begum@gmail.com
Zena Lapp, zenalapp@umich.edu
+Kelly Sovacool, sovacool@umich.edu
if (FALSE) { diff --git a/docs/reference/get_perf_metric_fn.html b/docs/reference/get_perf_metric_fn.html index e285da89..82c5d2d0 100644 --- a/docs/reference/get_perf_metric_fn.html +++ b/docs/reference/get_perf_metric_fn.html @@ -182,7 +182,7 @@ diff --git a/man/get_feature_importance.Rd b/man/get_feature_importance.Rd index fda505bc..e6b48868 100644 --- a/man/get_feature_importance.Rd +++ b/man/get_feature_importance.Rd @@ -72,17 +72,31 @@ grouped together based on \code{corr_thresh}.} by \code{stats::cor}: spearman, pearson, kendall. (default: spearman)} } \value{ -Dataframe with performance metrics for when each feature (or group of -correlated features; \code{names}) is permuted (\code{perf_metric}), and differences -between test performance metric and permuted performance metric -(\code{perf_metric_diff}; test minus permuted performance). Features with a -larger \code{perf_metric_diff} are more important. The performance metric name -(\code{perf_metric_name}) and seed (\code{seed}) are also returned. +Data frame with performance metrics for when each feature (or group +of correlated features; \code{names}) is permuted (\code{perf_metric}), differences +between the actual test performance metric and the permuted performance +metric (\code{perf_metric_diff}; test minus permuted performance), and the +p-value (\code{pvalue}: the probability of obtaining the actual performance +value under the null hypothesis). Features with a larger \code{perf_metric_diff} +are more important. The performance metric name (\code{perf_metric_name}) and +seed (\code{seed}) are also returned. } \description{ Calculates feature importance using a trained model and test data. Requires the \code{future.apply} package. } +\details{ +For permutation tests, the p-value is the number of permutation statistics +that are greater than the test statistic, divided by the number of +permutations. In our case, the permutation statistic is the model performance +(e.g. AUROC) after randomizing the order of observations for one feature, and +the test statistic is the actual performance on the test data. By default we +perform 100 permutations per feature; increasing this will increase the +precision of estimating the null distribution, but also increases runtime.
+} \examples{ \dontrun{ results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) @@ -148,4 +162,6 @@ feat_imp <- get_feature_importance(results$trained_model, Begüm Topçuoğlu, \email{topcuoglu.begum@gmail.com} Zena Lapp, \email{zenalapp@umich.edu} + +Kelly Sovacool, \email{sovacool@umich.edu} } diff --git a/tests/testthat/test-feature_importance.R b/tests/testthat/test-feature_importance.R index 748af1f0..7db5805f 100644 --- a/tests/testthat/test-feature_importance.R +++ b/tests/testthat/test-feature_importance.R @@ -23,7 +23,8 @@ test_that("find_permuted_perf_metric works", { ), c( perf_metric = 0.647368421052632, - perf_metric_diff = -2.1052632526164e-08 + perf_metric_diff = -2.1052632526164e-08, + pvalue = 1 ), tol = 10e-5 ) @@ -38,7 +39,8 @@ test_that("find_permuted_perf_metric works", { ), c( perf_metric = 0.647368421052632, - perf_metric_diff = -2.1052632526164e-08 + perf_metric_diff = -2.1052632526164e-08, + pvalue = 1 ), tol = 10e-5 ) @@ -54,39 +56,50 @@ test_that("find_permuted_perf_metric works", { ), c( perf_metric = 0.639315789473684, - perf_metric_diff = 0.00805261052631601 + perf_metric_diff = 0.00805261052631601, + pvalue = 0.18 ), tol = 10e-5 ) }) -feat_imp <- structure(list(perf_metric = c( - 0.629157894736842, 0.605473684210526, - 0.638789473684211, 0.636763157894737, 0.629447368421053, 0.637868421052632, - 0.642552631578947, 0.592157894736842, 0.639684210526316, 0.637526315789474 -), perf_metric_diff = c( - 0.0182105263157896, 0.0418947368421053, - 0.00857894736842102, 0.0106052631578948, 0.0179210526315789, - 0.00950000000000006, 0.00481578947368433, 0.0552105263157895, - 0.00768421052631585, 0.00984210526315799 -), names = structure(1:10, .Label = c( - "Otu00001", - "Otu00002", "Otu00003", "Otu00004", "Otu00005", "Otu00006", "Otu00007", - "Otu00008", "Otu00009", "Otu00010" -), class = "factor"), method = c( - "glmnet", - "glmnet", "glmnet", "glmnet", "glmnet", "glmnet", "glmnet", "glmnet", - "glmnet", "glmnet" -), perf_metric_name = c( - "AUC", "AUC", "AUC", - "AUC", "AUC", "AUC", "AUC", "AUC", "AUC", "AUC" -), seed = c( - 2019, - 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019 -)), row.names = c( - NA, - -10L -), class = "data.frame") +feat_imp <- structure(list( + perf_metric = c( + 0.629157894736842, 0.605473684210526, + 0.63878947368421, 0.636763157894737, 0.629447368421053, 0.637868421052632, + 0.642552631578947, 0.592157894736842, 0.639684210526316, 0.637526315789474 + ), + perf_metric_diff = c( + 0.0182105263157895, 0.0418947368421053, + 0.00857894736842113, 0.0106052631578948, 0.0179210526315789, + 0.00950000000000006, 0.00481578947368422, 0.0552105263157895, + 0.00768421052631585, 0.00984210526315787 + ), + pvalue = c( + 0.17, 0.06, + 0.09, 0.28, 0.37, 0.28, 0.24, 0.09, 0.12, 0.41 + ), + names = structure(1:10, .Label = c( + "Otu00001", + "Otu00002", "Otu00003", "Otu00004", "Otu00005", "Otu00006", "Otu00007", + "Otu00008", "Otu00009", "Otu00010" + ), class = "factor"), + method = c( + "glmnet", + "glmnet", "glmnet", "glmnet", "glmnet", "glmnet", "glmnet", "glmnet", + "glmnet", "glmnet" + ), + perf_metric_name = c( + "AUC", "AUC", "AUC", + "AUC", "AUC", "AUC", "AUC", "AUC", "AUC", "AUC" + ), + seed = c( + 2019, + 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019 + ) +), +row.names = c(NA, -10L), class = "data.frame" +) test_that("feature importances are correct", { set.seed(2019) @@ -158,10 +171,13 @@ test_that("custom grouped features works", { 0.633605263157895, 0.639105263157895, 0.642421052631579, 0.596842105263158, 0.640289473684211, 
0.629868421052632 ), perf_metric_diff = c( - 0.0182105263157896, + 0.0182105263157895, 0.0504473684210527, 0.0137631578947369, 0.00826315789473686, - 0.00494736842105259, 0.0505263157894738, 0.00707894736842096, + 0.0049473684210527, 0.0505263157894738, 0.00707894736842107, 0.0175000000000001 + ), pvalue = c( + 0.17, 0.24, 0.27, 0.3, 0.26, + 0.08, 0.11, 0.3 ), names = structure(1:8, .Label = c( "Otu00001", "Otu00002|Otu00003|Otu00005", "Otu00004", "Otu00006", "Otu00007", diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index c547d401..338b792e 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -104,3 +104,9 @@ test_that("is_whole_number() checks for integer numbers regardless of class", { expect_true(all(is_whole_number(c(1.0, 2.0, 3.0)))) expect_false(is_whole_number(1.2)) }) + +test_that("calc_pvalue() works", { + expect_equal(calc_pvalue(1:20, 10), 0.5) + expect_equal(calc_pvalue(1, 1), 0) + expect_equal(calc_pvalue(1:3, 0), 1) +}) diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd index 17e0c5a1..9cb1d8a9 100644 --- a/vignettes/introduction.Rmd +++ b/vignettes/introduction.Rmd @@ -422,7 +422,8 @@ results_imp$feature_importance There are several columns: 1. `perf_metric`: The performance value of the permuted feature. -1. `perf_metric_diff`: The difference between the performance for the true and permuted data (i.e. test performance minus permuted performance). Features with a larger `perf_metric_diff` are more important. +1. `perf_metric_diff`: The difference between the performance for the actual and permuted data (i.e. test performance minus permuted performance). Features with a larger `perf_metric_diff` are more important. +1. `pvalue`: the probability of obtaining the actual performance value under the null hypothesis. 1. `names`: The feature that was permuted. 1. `method`: The ML method used. 1. `perf_metric_name`: The peformance metric used.Examp #> data$obs <- factor(data$obs, levels = lev) #> postResample(data[, "pred"], data[, "obs"]) #> } -#> <bytecode: 0x7f7fa0eaf3c0> +#> <bytecode: 0x7fc428237898> #> <environment: namespace:caret> get_perf_metric_fn("binary") #> function (data, lev = NULL, model = NULL) @@ -240,7 +240,7 @@
Examp #> stats <- stats[c(stat_list)] #> return(stats) #> } -#> <bytecode: 0x7f7f9f130158> +#> <bytecode: 0x7fc445fd1c50> #> <environment: namespace:caret> get_perf_metric_fn("multiclass") #> function (data, lev = NULL, model = NULL) @@ -298,7 +298,7 @@
Examp #> stats <- stats[c(stat_list)] #> return(stats) #> } -#> <bytecode: 0x7f7f9f130158> +#> <bytecode: 0x7fc445fd1c50> #> <environment: namespace:caret>
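For context, the permutation scheme this diff implements in `find_permuted_perf_metric()` boils down to the rough sketch below. `perf_fn` and the `predict()` call are hypothetical stand-ins (the real function dispatches through `perf_metric_function` and parallelizes with `future.apply`), but the returned triple matches the columns described above:

permute_importance <- function(trained_model, test_data, outcome_colname, feat,
                               perf_fn, test_perf_value, nperms = 100) {
  perm_perfs <- vapply(seq_len(nperms), function(i) {
    permuted <- test_data
    permuted[[feat]] <- sample(permuted[[feat]])  # shuffle one feature, breaking its link to the outcome
    perf_fn(predict(trained_model, permuted, type = "prob"), permuted[[outcome_colname]])  # e.g. AUROC
  }, numeric(1))
  c(
    perf_metric = mean(perm_perfs),
    perf_metric_diff = test_perf_value - mean(perm_perfs),
    pvalue = sum(perm_perfs > test_perf_value) / length(perm_perfs)
  )
}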