diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 26f60654..e12e34f9 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -19,6 +19,7 @@ jobs: fail-fast: false matrix: config: + - {os: macOS-latest, r: 'devel'} - {os: macOS-latest, r: 'release'} - {os: windows-latest, r: 'release'} - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} diff --git a/NEWS.md b/NEWS.md index 452c4b8e..af236574 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,8 @@ # development version - mikropml now requires R version 4.1.0 or greater due to an update in the randomForest package (#292). -- Fix bug where `cv_times` had no effect on reported repeats for cross-validation (#291, @kelly-sovacool). +- New function `compare_models()` compares the performance of two models with a permutation test (#295, @courtneyarmour). +- Fixed a bug where `cv_times` did not affect the reported repeats for cross-validation (#291, @kelly-sovacool). - Made minor documentation improvements (#293, @kelly-sovacool) # mikropml 1.2.2 diff --git a/R/checks.R b/R/checks.R index 0ce412af..08a2a67f 100644 --- a/R/checks.R +++ b/R/checks.R @@ -274,7 +274,7 @@ check_outcome_value <- function(dataset, outcome_colname) { stop( paste0( "A binary or multi-class outcome variable is required, but this dataset has ", - num_outcomes, " outcome(s): ", paste(outcomes, collapse = ", ") + num_outcomes, " outcome(s): ", paste(outcomes, collapse = ", ") ) ) } diff --git a/R/compare_models.R b/R/compare_models.R new file mode 100644 index 00000000..13ba1a19 --- /dev/null +++ b/R/compare_models.R @@ -0,0 +1,180 @@ +#' Average metric difference +#' +#' Calculate the difference in the mean of the metric for two groups +#' +#' @param sub_data subset of the merged performance data frame for two groups +#' @param group_name name of column with group variable +#' @param metric metric to compare +#' +#' @return numeric difference in the average metric between the two groups +#' +#' @export +#' @author Courtney Armour, \email{armourc@@umich.edu} +#' +#' @examples +#' df <- dplyr::tibble( +#' condition = c("a", "a", "b", "b"), +#' AUC = c(.2, 0.3, 0.8, 0.9) +#' ) +#' get_difference(df, "condition", "AUC") +#' +get_difference <- function(sub_data, group_name, metric) { + if (!is.numeric(sub_data %>% dplyr::pull(metric))) { + stop(paste0( + "The metric `", metric, + "` is not numeric, please check that you specified the right column." 
+ )) + } + means <- sub_data %>% + dplyr::group_by(.data[[group_name]]) %>% + dplyr::summarise(meanVal = mean(.data[[metric]]), .groups = "drop") %>% + dplyr::pull(meanVal) + abs(diff(means)) +} + +#' Shuffle the rows in a column +#' +#' @param dat a data frame containing `col_name` +#' @param col_name column name to shuffle +#' +#' @return `dat` with the rows of `col_name` shuffled +#' @export +#' @author Courtney R Armour, \email{armourc@@umich.edu} +#' +#' @examples +#' set.seed(123) +#' df <- dplyr::tibble( +#' condition = c("a", "a", "b", "b"), +#' AUC = c(.2, 0.3, 0.8, 0.9) +#' ) +#' shuffle_group(df, "condition") +shuffle_group <- function(dat, col_name) { + if (!(col_name %in% colnames(dat))) { + stop(paste0("The col_name `", col_name, "` does not exist in the data frame.")) + } + group_vals <- dat %>% + dplyr::pull({{ col_name }}) + group_vals_shuffled <- base::sample(group_vals) + + data_shuffled <- dat %>% + dplyr::mutate(!!col_name := group_vals_shuffled) + + return(data_shuffled) +} + + +#' Calculated a permuted p-value comparing two models +#' +#' @inheritParams compare_models +#' @param group_1 name of one group to compare +#' @param group_2 name of other group to compare +#' +#' @return numeric p-value comparing two models +#' @export +#' @author Begüm Topçuoğlu, \email{topcuoglu.begum@@gmail.com} +#' @author Courtney R Armour, \email{armourc@@umich.edu} +#' +#' @examples +#' df <- dplyr::tibble( +#' model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), +#' AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) +#' ) +#' set.seed(123) +#' permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 100) +permute_p_value <- function(merged_data, metric, group_name, group_1, group_2, nperm = 10000) { + # check that the metric and group exist in data + if (!(metric %in% colnames(merged_data))) { + stop(paste0("The metric `", metric, "` does not exist in the data.")) + } + if (!(group_name %in% colnames(merged_data))) { + stop(paste0("The group_name `", group_name, "` does not exist in the data.")) + } + # check that group_1 and group_2 exist in the data + if (!(group_1 %in% (merged_data %>% dplyr::pull(group_name)))) { + stop(paste0("group_1 `", group_1, "` does not exist in the data.")) + } + if (!(group_2 %in% (merged_data %>% dplyr::pull(group_name)))) { + stop(paste0("group_2 `", group_2, "` does not exist in the data.")) + } + + # subset results to select metric and group columns and + # filter to only the two groups of interest + sub_data <- merged_data %>% + dplyr::select({{ metric }}, {{ group_name }}) %>% + dplyr::filter(.data[[group_name]] == {{ group_1 }} | .data[[group_name]] == {{ group_2 }}) + + # observed difference: quantify the absolute value of the difference + # in metric between the two groups + metric_obs <- get_difference(sub_data, {{ group_name }}, {{ metric }}) + + # shuffled difference: quantify the absolute value of the difference + # in metric between the two groups after shuffling group labels + rep_fn <- select_apply("replicate") + metric_null <- rep_fn( + nperm, + get_difference( + shuffle_group(sub_data, group_name), + group_name, + metric + ) + ) + + p_value <- calc_pvalue(metric_null, metric_obs) + return(p_value) +} + + +#' Compute all pairs of comparisons +#' calculate permuted p-value across all pairs of group variable. 
+#' wrapper for `permute_p_value` +#' +#' @param merged_data the concatenated performance data from `run_ml` +#' @param metric metric to compare, must be numeric +#' @param group_name column with group variables to compare +#' @param nperm number of permutations, default=10000 +#' +#' @return a table of p-values for all pairs of group varible +#' @export +#' @author Courtney R Armour, \email{armourc@@umich.edu} +#' +#' @examples +#' df <- dplyr::tibble( +#' model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), +#' AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) +#' ) +#' set.seed(123) +#' compare_models(df, "AUC", "model", nperm = 10) +compare_models <- function(merged_data, metric, group_name, nperm = 10000) { + # check that the metric and group exist in data + if (!(metric %in% colnames(merged_data))) { + stop("The metric does not exist in the data.") + } + if (!(group_name %in% colnames(merged_data))) { + stop("The group_name does not exist in the data.") + } + + # identify all unique groups in group variable + groups <- merged_data %>% + dplyr::pull({{ group_name }}) %>% + unique() + + # create a table with all possible comparisons of groups + # without repeating pairings + p_table <- tidyr::expand_grid( + x = 1:length(groups), + y = 1:length(groups) + ) %>% + dplyr::filter(x < y) %>% + dplyr::mutate( + group1 = groups[x], + group2 = groups[y] + ) %>% + dplyr::select(-x, -y) %>% + dplyr::group_by(group1, group2) %>% + dplyr::summarize( + p_value = permute_p_value(merged_data, metric, group_name, group1, group2, nperm), + .groups = "drop" + ) + + return(as.data.frame(p_table)) +} diff --git a/R/utils.R b/R/utils.R index 1b6bcb98..74c98234 100644 --- a/R/utils.R +++ b/R/utils.R @@ -234,6 +234,10 @@ is_whole_number <- function(x, tol = .Machine$double.eps^0.5) { #' Calculate the p-value for a permutation test #' +#' compute Monte Carlo p-value with correction +#' based on formula from Page 158 of 'Bootstrap methods and their application' +#' By Davison & Hinkley 1997 +#' #' @param vctr vector of statistics #' @param test_stat the test statistic #' @@ -243,5 +247,5 @@ is_whole_number <- function(x, tol = .Machine$double.eps^0.5) { #' @noRd #' @author Kelly Sovacool \email{sovacool@@umich.edu} calc_pvalue <- function(vctr, test_stat) { - return(sum(vctr > test_stat) / length(vctr)) + return((sum(vctr >= test_stat) + 1) / (length(vctr) + 1)) } diff --git a/_pkgdown.yml b/_pkgdown.yml index 186d6a96..b126508f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -29,10 +29,11 @@ reference: - mikropml - preprocess_data - run_ml -- title: Plotting helpers +- title: Plotting & evalutation helpers desc: > - Visualize performance to help you tune hyperparameters and choose model methods. + Visualize & evalutate performance to help you tune hyperparameters and choose model methods. 
contents: + - compare_models - starts_with('plot') - tidy_perf_data - get_hp_performance diff --git a/data/otu_mini_bin.rda b/data/otu_mini_bin.rda index c3ef3843..ccb5cf7a 100644 Binary files a/data/otu_mini_bin.rda and b/data/otu_mini_bin.rda differ diff --git a/data/otu_mini_bin_results_glmnet.rda b/data/otu_mini_bin_results_glmnet.rda index 7a0f0b6d..2c85c788 100644 Binary files a/data/otu_mini_bin_results_glmnet.rda and b/data/otu_mini_bin_results_glmnet.rda differ diff --git a/data/otu_mini_bin_results_rf.rda b/data/otu_mini_bin_results_rf.rda index 6ba8ead6..ac6e8a1e 100644 Binary files a/data/otu_mini_bin_results_rf.rda and b/data/otu_mini_bin_results_rf.rda differ diff --git a/data/otu_mini_bin_results_rpart2.rda b/data/otu_mini_bin_results_rpart2.rda index 753c04aa..0e652ae6 100644 Binary files a/data/otu_mini_bin_results_rpart2.rda and b/data/otu_mini_bin_results_rpart2.rda differ diff --git a/data/otu_mini_bin_results_svmRadial.rda b/data/otu_mini_bin_results_svmRadial.rda index 46b3d262..8a3d16bf 100644 Binary files a/data/otu_mini_bin_results_svmRadial.rda and b/data/otu_mini_bin_results_svmRadial.rda differ diff --git a/data/otu_mini_bin_results_xgbTree.rda b/data/otu_mini_bin_results_xgbTree.rda index 1d923fe7..8b21b0bb 100644 Binary files a/data/otu_mini_bin_results_xgbTree.rda and b/data/otu_mini_bin_results_xgbTree.rda differ diff --git a/data/otu_mini_cont_results_glmnet.rda b/data/otu_mini_cont_results_glmnet.rda index ab97fcc3..44eaa16e 100644 Binary files a/data/otu_mini_cont_results_glmnet.rda and b/data/otu_mini_cont_results_glmnet.rda differ diff --git a/data/otu_mini_cont_results_nocv.rda b/data/otu_mini_cont_results_nocv.rda index 01c9ffdd..b613687c 100644 Binary files a/data/otu_mini_cont_results_nocv.rda and b/data/otu_mini_cont_results_nocv.rda differ diff --git a/data/otu_mini_cv.rda b/data/otu_mini_cv.rda index 9cdbd31b..b8e54749 100644 Binary files a/data/otu_mini_cv.rda and b/data/otu_mini_cv.rda differ diff --git a/data/otu_mini_multi.rda b/data/otu_mini_multi.rda index fb6f1af1..2c773fac 100644 Binary files a/data/otu_mini_multi.rda and b/data/otu_mini_multi.rda differ diff --git a/data/otu_mini_multi_group.rda b/data/otu_mini_multi_group.rda index 4b0a1521..07f7eff5 100644 Binary files a/data/otu_mini_multi_group.rda and b/data/otu_mini_multi_group.rda differ diff --git a/data/otu_mini_multi_results_glmnet.rda b/data/otu_mini_multi_results_glmnet.rda index 79d8dca5..9d506a8b 100644 Binary files a/data/otu_mini_multi_results_glmnet.rda and b/data/otu_mini_multi_results_glmnet.rda differ diff --git a/docs/404.html b/docs/404.html index 0f7568a8..986332d0 100644 --- a/docs/404.html +++ b/docs/404.html @@ -49,7 +49,7 @@ Reference
[Rebuilt pkgdown site (docs/): every regenerated page carries the same footer change, "Site built with pkgdown 2.0.2." becoming "Site built with pkgdown 2.0.3." For docs/404.html (header above) and the code of conduct, contributing, license, and support pages, that footer bump is the only visible change to the rendered text.]
results_imp$feature_importance
-#> perf_metric perf_metric_diff pvalue names method perf_metric_name seed
-#> 1 0.5542375 0.0082625 0.37 Otu00001 rf AUC 2019
-#> 2 0.5731750 -0.0106750 0.57 Otu00002 rf AUC 2019
-#> 3 0.5548750 0.0076250 0.38 Otu00003 rf AUC 2019
-#> 4 0.6414750 -0.0789750 0.99 Otu00004 rf AUC 2019
-#> 5 0.5049625 0.0575375 0.05 Otu00005 rf AUC 2019
-#> 6 0.5444500 0.0180500 0.18 Otu00006 rf AUC 2019
-#> 7 0.5417125 0.0207875 0.21 Otu00007 rf AUC 2019
-#> 8 0.5257750 0.0367250 0.05 Otu00008 rf AUC 2019
-#> 9 0.5395750 0.0229250 0.02 Otu00009 rf AUC 2019
-#> 10 0.4977625 0.0647375 0.05 Otu00010 rf AUC 2019
There are several columns:
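One way this output might be used (a sketch, not part of the rendered vignette; it assumes, from the column names, that perf_metric is the performance after permuting a feature, perf_metric_diff is the drop relative to the unpermuted model, and pvalue is the permutation p-value):
library(dplyr)

results_imp$feature_importance %>%
  filter(pvalue < 0.05) %>%         # keep features whose permutation p-value is small
  arrange(desc(perf_metric_diff))   # biggest performance drop when permuted = most important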
[docs/articles/paper.html: footer version bump only.]
diff --git a/docs/articles/parallel.html b/docs/articles/parallel.html index 4d9c6fe8..a20fdfa3 100644
[Rendered parallel-processing vignette, markup stripped; the opening paragraph changes to mention compare_models():]
-By default, preprocess_data() and run_ml() use only one process in series. If you'd like to parallelize various steps of the pipeline to make them run faster, install foreach, future, future.apply, and doFuture. Then, register a future plan prior to calling preprocess_data() and run_ml():
+By default, preprocess_data(), run_ml(), and compare_models() use only one process in series. If you'd like to parallelize various steps of the pipeline to make them run faster, install foreach, future, future.apply, and doFuture. Then, register a future plan prior to calling these functions:
-doFuture::registerDoFuture()
+doFuture::registerDoFuture()
 future::plan(future::multicore, workers = 2)
 Above, we used the multicore plan to split the work across 2 cores. See the future documentation for more about picking the best plan for your use case. Notably, multicore does not work inside RStudio or on Windows; you will need to use multisession instead in those cases.
 After registering a future plan, you can call preprocess_data() and run_ml() as usual, and they will run certain tasks in parallel.
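A minimal end-to-end sketch of that workflow with the new compare_models() (illustrative only, not taken from the vignette; it assumes the otu_mini_bin example data with its dx outcome column, uses multisession so it also runs on Windows and inside RStudio, and uses small kfold, cv_times, and nperm values to keep the sketch fast):
library(mikropml)

doFuture::registerDoFuture()
future::plan(future::multisession, workers = 2)

# train two model types over a few seeds; with a plan registered,
# run_ml() runs certain internal steps in parallel
param_grid <- expand.grid(
  seed = 100:102,
  method = c("glmnet", "rf"),
  stringsAsFactors = FALSE
)
results <- mapply(
  function(seed, method) {
    run_ml(otu_mini_bin, method,
           outcome_colname = "dx",
           kfold = 2, cv_times = 5, seed = seed)
  },
  param_grid$seed,
  param_grid$method,
  SIMPLIFY = FALSE
)

# concatenate the per-run performance rows and test whether the two
# methods differ with the new permutation test
perf_df <- dplyr::bind_rows(lapply(results, function(result) result$performance))
compare_models(perf_df, metric = "AUC", group_name = "method", nperm = 100)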
[Rendered home page and changelog pages, markup stripped: the "meek-ROPE em el" pronunciation, the package description, the pull-request checklist text, and the changelog entries for earlier releases (kfold/group handling during CV, cross_val, calculate_performance, group_partitions, training_frac, group_correlated_features(), corr_method, the preprocess_data() outcome-column fix, prefilter_threshold, remove_singleton_columns(), and the groups argument of get_feature_importance(), through the 1.0.0 release notes listing glmnet, rf, and xgbTree) appear as duplicated removed/re-added pairs with no apparent wording changes, followed by the usual footer bump.]
diff --git a/docs/reference/calc_perf_metrics.html b/docs/reference/calc_perf_metrics.html index 446a67f1..cb3f3f8e 100644
[Rendered reference pages, markup stripped. On the reference index, below the "Run the machine learning pipeline" section, the helper-section description mirrors the _pkgdown.yml change above:]
-Visualize performance to help you tune hyperparameters and choose model methods.
+Visualize & evalutate performance to help you tune hyperparameters and choose model methods.
diff --git a/docs/reference/mikropml.html b/docs/reference/mikropml.html index 4c02d933..033b0f39 100644
--- a/docs/reference/mikropml.html
+++ b/docs/reference/mikropml.html
@@ -29,7 +29,7 @@
 This function runs machine learning (ML), evaluates the best model, and optionally calculates feature importance using the framework
-outlined in Topçuoğlu et al. 2020 (doi: 10.1128/mBio.00434-20
+outlined in Topçuoğlu et al. 2020 (doi:10.1128/mBio.00434-20
 ). Required inputs are a dataframe with an outcome variable and other columns as features, as well as the ML method.
@@ -251,7 +251,7 @@
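The corrected Monte Carlo p-value that calc_pvalue() now returns (the R/utils.R hunk earlier in this diff, following Davison & Hinkley 1997) can be illustrated with made-up numbers:
# p = (count of permuted statistics >= observed, plus 1) / (number of permutations + 1)
set.seed(2019)
obs_diff  <- 0.15                        # hypothetical observed |difference in mean AUC|
null_diff <- abs(rnorm(10000, 0, 0.05))  # hypothetical permuted differences
p_old <- sum(null_diff > obs_diff) / length(null_diff)               # previous behaviour: can be exactly 0
p_new <- (sum(null_diff >= obs_diff) + 1) / (length(null_diff) + 1)  # corrected: always greater than 0
c(p_old = p_old, p_new = p_new)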