From 9b47faeaa1136ead696cec2308e29792e609c7e6 Mon Sep 17 00:00:00 2001 From: Courtney Armour Date: Thu, 28 Apr 2022 11:19:57 -0400 Subject: [PATCH 01/26] initial compare_models script --- R/compare_models.R | 111 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 R/compare_models.R diff --git a/R/compare_models.R b/R/compare_models.R new file mode 100644 index 00000000..203ca853 --- /dev/null +++ b/R/compare_models.R @@ -0,0 +1,111 @@ +#' Get Difference + +# calculate the difference in the mean of the metric for the two groups +# sub_data: subset of the performance data for just two groups +# group_name: name of column with group variable +# metric: metric to compare +get_difference <- function(sub_data,group_name,metric){ + # get the mean metric value for each group + means <- sub_data %>% + dplyr::group_by(.data[[group_name]]) %>% + dplyr::summarise(meanVal = mean(.data[[metric]]),.groups="drop") %>% + dplyr::pull(meanVal) + # find the difference in the mean between the two groups + abs(diff(means)) +} + +#' Shuffle Group + +# shuffle the groups across the data +# sub_data: subset of the performance data for just two groups +# group_name: column name to shuffle +shuffle_group <- function(sub_data,group_name){ + # get teh group labels + group_vals <- sub_data %>% + dplyr::pull( {{ group_name }} ) + # shuffle the group labels + group_vals_shuffled <- base::sample(group_vals) + + #assign shuffled groups to group column + data_shuffled <- sub_data %>% + dplyr::mutate( !!group_name := group_vals_shuffled) + + return(data_shuffled) +} + +#' Permute P-Value + +# data: the concatenated performance (from what output?) +# metric: metric to compare, either AUC or cv_metric_AUC +# group_name: column with group variables to compare +# group_1: name of one group to compare +# group_2: name of other group to compare +permute_p_value <- function(data, metric, group_name, group_1, group_2, nperm){ + # check that the metric and group exist in data + assertthat::has_name(data,metric) + assertthat::has_name(data,group_name) + # check that group_1 and group_2 exist in the data + #assertthat::has_name(data %>% distinct({{group_name}}),{{ group_1 }}) + + # subset results to select metric and group columns and + # filter to only the two groups of interest + sub_data <- data %>% + dplyr::select({{ metric }},{{ group_name }}) %>% + dplyr::filter( .data[[group_name]] == {{group_1}} | .data[[group_name]] == {{group_2}}) + + # observed difference: quantify the absolute value of the difference + # in metric between the two groups + metric_obs <- get_difference(sub_data,{{group_name}},{{metric}}) + + # shuffled difference: quantify the absolute value of the difference + # in metric between the two groups after shuffling group labels + metric_null <- replicate(nperm,get_difference(shuffle_group(sub_data,group_name),group_name,metric)) + + # n: number of shuffled calculations + n <- length(metric_null) + # r: replications at least as extreme as observed effect + r <- sum(abs(metric_null) >= metric_obs) + + # compute Monte Carlo p-value with correction (Davison & Hinkley, 1997) + p_value=(r+1)/(n+1) + return(p_value) +} + +# Wrapper to do all comparisons +# data: the concatenated performance (from what output?) +# metric: metric to compare, either AUC or cv_metric_AUC +# group_name: column with group variables to compare +compare_models <- function(merged_data,metric,group_name,nperm=10000){ + + # identify all unique groups in group variable + groups <- merged_data %>% + dplyr::pull( {{group_name}} ) %>% + unique() + + # create a table with all possible comparisons of groups + # without repeating pairings + p_table <- tidyr::expand_grid(x=1:length(groups), + y=1:length(groups)) %>% + dplyr::filter(x < y) %>% + dplyr::mutate(group1 = groups[x], + group2 = groups[y]) %>% + select(-x, -y) %>% + group_by(group1,group2) %>% + summarize(p_value = permute_p_value(merged_data,metric,group_name,group1,group2,nperm), + .groups = "drop") + + return(p_table) + +} + +df <- tibble(samp=c("a","a","b","b"),val=c(.2,0.3,0.8,0.9)) +get_difference(df,"samp","val") +shuffle_group(df,"samp") +permute_p_value(df,"val","samp","a","b",nperm=10) +compare_models(df,"val","samp",nperm=10) + +#### TESTS #### + +testthat::expect_equal(get_difference(tibble(AUC=c(0.5,0.8), + type=c("a","b")), + "type","AUC"),0.3) From 2634dc710a3acdf2ea1c1bcd994aae0cba6da25e Mon Sep 17 00:00:00 2001 From: Courtney Armour Date: Thu, 28 Apr 2022 13:19:32 -0400 Subject: [PATCH 02/26] typo --- R/compare_models.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/compare_models.R b/R/compare_models.R index 203ca853..c9d9df4e 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -1,4 +1,4 @@ -#' Get Difference +# Get Difference # calculate the difference in the mean of the metric for the two groups # sub_data: subset of the performance data for just two groups @@ -14,8 +14,7 @@ get_difference <- function(sub_data,group_name,metric){ abs(diff(means)) } -#' Shuffle Group - +# Shuffle Group # shuffle the groups across the data # sub_data: subset of the performance data for just two groups # group_name: column name to shuffle @@ -33,8 +32,7 @@ shuffle_group <- function(sub_data,group_name){ return(data_shuffled) } -#' Permute P-Value - +# Permute P-Value # data: the concatenated performance (from what output?) # metric: metric to compare, either AUC or cv_metric_AUC # group_name: column with group variables to compare @@ -98,14 +96,16 @@ compare_models <- function(merged_data,metric,group_name,nperm=10000){ } +#### TESTS #### df <- tibble(samp=c("a","a","b","b"),val=c(.2,0.3,0.8,0.9)) get_difference(df,"samp","val") shuffle_group(df,"samp") permute_p_value(df,"val","samp","a","b",nperm=10) compare_models(df,"val","samp",nperm=10) -#### TESTS #### - +#### testthat::expect_equal(get_difference(tibble(AUC=c(0.5,0.8), type=c("a","b")), "type","AUC"),0.3) + + From d580d4d9d6f27aa8490568883c644dcd368cb1dc Mon Sep 17 00:00:00 2001 From: Courtney Armour Date: Fri, 29 Apr 2022 19:01:28 -0400 Subject: [PATCH 03/26] add compare_model tests --- R/compare_models.R | 150 ++++++++++++++++++++------- tests/testthat/test-compare_models.R | 50 +++++++++ 2 files changed, 162 insertions(+), 38 deletions(-) create mode 100644 tests/testthat/test-compare_models.R diff --git a/R/compare_models.R b/R/compare_models.R index c9d9df4e..77d6218d 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -1,10 +1,27 @@ -# Get Difference -# calculate the difference in the mean of the metric for the two groups -# sub_data: subset of the performance data for just two groups -# group_name: name of column with group variable -# metric: metric to compare +#' Average metric difference +#' calculate the difference in the mean of the metric for the two groups +#' +#' @param sub_data subset of the merged performance data for just two groups +#' @param group_name name of column with group variable +#' @param metric metric to compare +#' +#' @return numeric difference in the average metric between the two groups +#' +#' @export +#' @author Courtney Armour, \email{armourc@@umich.edu} +#' +#' @examples +#' \dontrun{ +#' df <- tibble(condition=c("a","a","b","b","c","c"), +#' AUC=c(.2,0.3,0.8,0.9,0.85,0.95)) +#' sub_df <- df %>% filter(condition %in% c("a","b")) +#' get_difference(sub_df,"condition","AUC") +#' } get_difference <- function(sub_data,group_name,metric){ + if(!is.numeric(sub_data %>% dplyr::pull(metric))){ + stop("The specified metric is not numeric, please check that you specified the right column.") + } # get the mean metric value for each group means <- sub_data %>% dplyr::group_by(.data[[group_name]]) %>% @@ -14,12 +31,28 @@ get_difference <- function(sub_data,group_name,metric){ abs(diff(means)) } -# Shuffle Group -# shuffle the groups across the data -# sub_data: subset of the performance data for just two groups -# group_name: column name to shuffle +#' Shuffle the values in the groups column +#' +#' @param sub_data subset of the performance data for just two groups +#' @param group_name column name to shuffle +#' +#' @return `sub_data` with the `group_name` column values shuffled +#' @export +#' @author Courtney R Armour, \email{armourc@@umich.edu} +#' +#' @examples +#' \dontrun{ +#' set.seed(123) +#' df <- tibble(condition=c("a","a","b","b","c","c"), +#' AUC=c(.2,0.3,0.8,0.9,0.85,0.95)) +#' sub_df <- df %>% filter(condition %in% c("a","b")) +#' shuffle_group(sub_df,"condition") +#' } shuffle_group <- function(sub_data,group_name){ - # get teh group labels + if(!(group_name %in% colnames(sub_data))){ + stop("The group_name does not exist in the data.") + } + # get the group labels group_vals <- sub_data %>% dplyr::pull( {{ group_name }} ) # shuffle the group labels @@ -32,22 +65,44 @@ shuffle_group <- function(sub_data,group_name){ return(data_shuffled) } -# Permute P-Value -# data: the concatenated performance (from what output?) -# metric: metric to compare, either AUC or cv_metric_AUC -# group_name: column with group variables to compare -# group_1: name of one group to compare -# group_2: name of other group to compare -permute_p_value <- function(data, metric, group_name, group_1, group_2, nperm){ + +#' Calculated a permuted p-value comparing two models +#' +#' @inheritParams compare_models +#' @param group_1 name of one group to compare +#' @param group_2 name of other group to compare +#' +#' @return numeric p-value comparing two models +#' @export +#' @author Begüm Topçuoğlu, \email{topcuoglu.begum@@gmail.com} +#' @author Courtney R Armour, \email{armourc@@umich.edu} +#' +#' @examples +#' \dontrun{ +#' df <- tibble(model=c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"), +#' AUC=c(.2,0.3,0.8,0.9,0.85,0.95)) +#' set.seed(123) +#' permute_p_value(df,"AUC","model","rf","glmnet",nperm=100) +#' } +permute_p_value <- function(merged_data, metric, group_name, group_1, group_2, nperm=10000){ # check that the metric and group exist in data - assertthat::has_name(data,metric) - assertthat::has_name(data,group_name) + if(!(metric %in% colnames(merged_data))){ + stop("The metric does not exist in the data.") + } + if(!(group_name %in% colnames(merged_data))){ + stop("The group_name does not exist in the data.") + } # check that group_1 and group_2 exist in the data - #assertthat::has_name(data %>% distinct({{group_name}}),{{ group_1 }}) - + if(!(group_1 %in% (merged_data %>% dplyr::pull(group_name)))){ + stop("group_1 does not exist in the data.") + } + if(!(group_2 %in% (merged_data %>% dplyr::pull(group_name)))){ + stop("group_2 does not exist in the data.") + } + # subset results to select metric and group columns and # filter to only the two groups of interest - sub_data <- data %>% + sub_data <- merged_data %>% dplyr::select({{ metric }},{{ group_name }}) %>% dplyr::filter( .data[[group_name]] == {{group_1}} | .data[[group_name]] == {{group_2}}) @@ -57,7 +112,8 @@ permute_p_value <- function(data, metric, group_name, group_1, group_2, nperm){ # shuffled difference: quantify the absolute value of the difference # in metric between the two groups after shuffling group labels - metric_null <- replicate(nperm,get_difference(shuffle_group(sub_data,group_name),group_name,metric)) + rep_fn <- select_apply("replicate") + metric_null <- rep_fn(nperm,get_difference(shuffle_group(sub_data,group_name),group_name,metric)) # n: number of shuffled calculations n <- length(metric_null) @@ -69,11 +125,35 @@ permute_p_value <- function(data, metric, group_name, group_1, group_2, nperm){ return(p_value) } -# Wrapper to do all comparisons -# data: the concatenated performance (from what output?) -# metric: metric to compare, either AUC or cv_metric_AUC -# group_name: column with group variables to compare + +#' Compute all pairs of comparisons +#' calculate permuted p-value across all pairs of group variable. +#' wrapper for `permute_p_value` +#' +#' @param merged_data the concatenated performance data from `run_ml` +#' @param metric metric to compare, must be numeric +#' @param group_name column with group variables to compare +#' @param nperm number of permutations, default=10000 +#' +#' @return a table of p-values for all pairs of group varible +#' @export +#' @author Courtney R Armour, \email{armourc@@umich.edu} +#' +#' @examples +#' \dontrun{ +#' df <- tibble(model=c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"), +#' AUC=c(.2,0.3,0.8,0.9,0.85,0.95)) +#' set.seed(123) +#' compare_models(df,"AUC","model",nperm=100) +#' } compare_models <- function(merged_data,metric,group_name,nperm=10000){ + # check that the metric and group exist in data + if(!(metric %in% colnames(merged_data))){ + stop("The metric does not exist in the data.") + } + if(!(group_name %in% colnames(merged_data))){ + stop("The group_name does not exist in the data.") + } # identify all unique groups in group variable groups <- merged_data %>% @@ -97,15 +177,9 @@ compare_models <- function(merged_data,metric,group_name,nperm=10000){ } #### TESTS #### -df <- tibble(samp=c("a","a","b","b"),val=c(.2,0.3,0.8,0.9)) -get_difference(df,"samp","val") -shuffle_group(df,"samp") -permute_p_value(df,"val","samp","a","b",nperm=10) -compare_models(df,"val","samp",nperm=10) - -#### -testthat::expect_equal(get_difference(tibble(AUC=c(0.5,0.8), - type=c("a","b")), - "type","AUC"),0.3) - +# df <- tibble(samp=c("a","a","b","b"),val=c(.2,0.3,0.8,0.9)) +# get_difference(df,"samp","val") +# shuffle_group(df,"samp") +# permute_p_value(df,"val","samp","a","b",nperm=10) +# compare_models(df,"val","samp",nperm=10) diff --git a/tests/testthat/test-compare_models.R b/tests/testthat/test-compare_models.R new file mode 100644 index 00000000..933e8dbe --- /dev/null +++ b/tests/testthat/test-compare_models.R @@ -0,0 +1,50 @@ + +# get_difference +test_that("get_difference works",{ + expect_equal(get_difference(data.frame(AUC=c(0.5,0.8),type=c("a","b")), + "type","AUC"), + 0.3) + expect_error(get_difference(data.frame(AUC=c(0.5,0.8),type=c("a","b")), + "type","type"), + "The specified metric is not numeric, please check that you specified the right column.") +}) + +# shuffle_group +test_that("shuffle_group works",{ + set.seed(2022) + df <- tibble(condition=c("a","a","b","b"),AUC=c(0.2,0.3,0.8,0.9)) + expect_equal(shuffle_group(df,"condition"), + tibble(condition=c("b","b","a","a"),AUC=c(0.2,0.3,0.8,0.9))) + expect_error(shuffle_group(df,"group"), + "The group_name does not exist in the data.") +}) + +# permute_p_value +test_that("permute_p_value works", { + set.seed(2022) + df <- tibble(model=c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"),AUC=c(0.2,0.2,0.2,0.2,0.2,0.2)) + expect_equal(permute_p_value(df,"AUC","model","rf","glmnet",nperm=10), + 1) + expect_error(permute_p_value(df,"auc","model","rf","glmnet",nperm=10), + "The metric does not exist in the data.") + expect_error(permute_p_value(df,"AUC","group","rf","glmnet",nperm=10), + "The group_name does not exist in the data.") + expect_error(permute_p_value(df,"AUC","model","RF","glmnet",nperm=10), + "group_1 does not exist in the data.") + expect_error(permute_p_value(df,"AUC","model","rf","logreg",nperm=10), + "group_2 does not exist in the data.") +}) + +# compare_models +test_that("compare_models works",{ + set.seed(2022) + df <- tibble(model=c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"),AUC=c(0.2,0.2,0.2,0.2,0.2,0.2)) + expect_equal(compare_models(df,"AUC","model",10), + tibble(group1=c("glmnet","rf","rf"), + group2=c("svmRadial","glmnet","svmRadial"), + p_value=c(1,1,1))) + expect_error(compare_models(df,"auc","model",100), + "The metric does not exist in the data.") + expect_error(compare_models(df,"AUC","group",100), + "The group_name does not exist in the data.") +}) From 7ad628f94d7949b593a1f407bba22721d20420fc Mon Sep 17 00:00:00 2001 From: Courtney Armour Date: Mon, 2 May 2022 12:05:56 -0400 Subject: [PATCH 04/26] fix failed tests --- R/compare_models.R | 8 +++---- tests/testthat/test-compare_models.R | 31 +++++++++++++++++++++------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/R/compare_models.R b/R/compare_models.R index 77d6218d..44effd80 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -167,12 +167,12 @@ compare_models <- function(merged_data,metric,group_name,nperm=10000){ dplyr::filter(x < y) %>% dplyr::mutate(group1 = groups[x], group2 = groups[y]) %>% - select(-x, -y) %>% - group_by(group1,group2) %>% - summarize(p_value = permute_p_value(merged_data,metric,group_name,group1,group2,nperm), + dplyr::select(-x, -y) %>% + dplyr::group_by(group1,group2) %>% + dplyr::summarize(p_value = permute_p_value(merged_data,metric,group_name,group1,group2,nperm), .groups = "drop") - return(p_table) + return(as.data.frame(p_table)) } diff --git a/tests/testthat/test-compare_models.R b/tests/testthat/test-compare_models.R index 933e8dbe..e5c0a867 100644 --- a/tests/testthat/test-compare_models.R +++ b/tests/testthat/test-compare_models.R @@ -12,9 +12,13 @@ test_that("get_difference works",{ # shuffle_group test_that("shuffle_group works",{ set.seed(2022) - df <- tibble(condition=c("a","a","b","b"),AUC=c(0.2,0.3,0.8,0.9)) + df <- structure(list(condition=c("a","a","b","b"), + AUC=c(0.2,0.3,0.8,0.9)), + .Names = c("condition","AUC"), + row.names = c(NA,-4L), + class = "data.frame") expect_equal(shuffle_group(df,"condition"), - tibble(condition=c("b","b","a","a"),AUC=c(0.2,0.3,0.8,0.9))) + data.frame(condition=c("b","b","a","a"),AUC=c(0.2,0.3,0.8,0.9))) expect_error(shuffle_group(df,"group"), "The group_name does not exist in the data.") }) @@ -22,7 +26,11 @@ test_that("shuffle_group works",{ # permute_p_value test_that("permute_p_value works", { set.seed(2022) - df <- tibble(model=c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"),AUC=c(0.2,0.2,0.2,0.2,0.2,0.2)) + df <- structure(list(model = c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"), + AUC = c(0.2,0.2,0.2,0.2,0.2,0.2)), + .Names = c("model","AUC"), + row.names = c(NA,-6L), + class = "data.frame") expect_equal(permute_p_value(df,"AUC","model","rf","glmnet",nperm=10), 1) expect_error(permute_p_value(df,"auc","model","rf","glmnet",nperm=10), @@ -38,13 +46,20 @@ test_that("permute_p_value works", { # compare_models test_that("compare_models works",{ set.seed(2022) - df <- tibble(model=c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"),AUC=c(0.2,0.2,0.2,0.2,0.2,0.2)) + df <- structure(list(model = c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"), + AUC = c(0.2,0.2,0.2,0.2,0.2,0.2)), + .Names = c("model","AUC"), + row.names = c(NA,-6L), + class = "data.frame") expect_equal(compare_models(df,"AUC","model",10), - tibble(group1=c("glmnet","rf","rf"), - group2=c("svmRadial","glmnet","svmRadial"), - p_value=c(1,1,1))) + structure(list(group1=c("glmnet","rf","rf"), + group2=c("svmRadial","glmnet","svmRadial"), + p_value=c(1,1,1)), + .Names = c("group1","group2","p_value"), + row.names = c(NA,-3L), + class = "data.frame")) expect_error(compare_models(df,"auc","model",100), "The metric does not exist in the data.") expect_error(compare_models(df,"AUC","group",100), "The group_name does not exist in the data.") -}) +}) \ No newline at end of file From 37a822b0eab1de6d17ec211baea72768aeb8ee72 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 2 May 2022 16:18:51 +0000 Subject: [PATCH 05/26] =?UTF-8?q?=F0=9F=8E=A8=20Style=20R=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/checks.R | 2 +- R/compare_models.R | 142 +++++++++++++++------------ tests/testthat/test-compare_models.R | 140 ++++++++++++++++---------- 3 files changed, 168 insertions(+), 116 deletions(-) diff --git a/R/checks.R b/R/checks.R index 0ce412af..08a2a67f 100644 --- a/R/checks.R +++ b/R/checks.R @@ -274,7 +274,7 @@ check_outcome_value <- function(dataset, outcome_colname) { stop( paste0( "A binary or multi-class outcome variable is required, but this dataset has ", - num_outcomes, " outcome(s): ", paste(outcomes, collapse = ", ") + num_outcomes, " outcome(s): ", paste(outcomes, collapse = ", ") ) ) } diff --git a/R/compare_models.R b/R/compare_models.R index 44effd80..c949a598 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -7,25 +7,27 @@ #' @param metric metric to compare #' #' @return numeric difference in the average metric between the two groups -#' +#' #' @export #' @author Courtney Armour, \email{armourc@@umich.edu} -#' +#' #' @examples #' \dontrun{ -#' df <- tibble(condition=c("a","a","b","b","c","c"), -#' AUC=c(.2,0.3,0.8,0.9,0.85,0.95)) -#' sub_df <- df %>% filter(condition %in% c("a","b")) -#' get_difference(sub_df,"condition","AUC") +#' df <- tibble( +#' condition = c("a", "a", "b", "b", "c", "c"), +#' AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) +#' ) +#' sub_df <- df %>% filter(condition %in% c("a", "b")) +#' get_difference(sub_df, "condition", "AUC") #' } -get_difference <- function(sub_data,group_name,metric){ - if(!is.numeric(sub_data %>% dplyr::pull(metric))){ +get_difference <- function(sub_data, group_name, metric) { + if (!is.numeric(sub_data %>% dplyr::pull(metric))) { stop("The specified metric is not numeric, please check that you specified the right column.") } # get the mean metric value for each group means <- sub_data %>% dplyr::group_by(.data[[group_name]]) %>% - dplyr::summarise(meanVal = mean(.data[[metric]]),.groups="drop") %>% + dplyr::summarise(meanVal = mean(.data[[metric]]), .groups = "drop") %>% dplyr::pull(meanVal) # find the difference in the mean between the two groups abs(diff(means)) @@ -43,25 +45,27 @@ get_difference <- function(sub_data,group_name,metric){ #' @examples #' \dontrun{ #' set.seed(123) -#' df <- tibble(condition=c("a","a","b","b","c","c"), -#' AUC=c(.2,0.3,0.8,0.9,0.85,0.95)) -#' sub_df <- df %>% filter(condition %in% c("a","b")) -#' shuffle_group(sub_df,"condition") +#' df <- tibble( +#' condition = c("a", "a", "b", "b", "c", "c"), +#' AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) +#' ) +#' sub_df <- df %>% filter(condition %in% c("a", "b")) +#' shuffle_group(sub_df, "condition") #' } -shuffle_group <- function(sub_data,group_name){ - if(!(group_name %in% colnames(sub_data))){ +shuffle_group <- function(sub_data, group_name) { + if (!(group_name %in% colnames(sub_data))) { stop("The group_name does not exist in the data.") } # get the group labels - group_vals <- sub_data %>% - dplyr::pull( {{ group_name }} ) + group_vals <- sub_data %>% + dplyr::pull({{ group_name }}) # shuffle the group labels group_vals_shuffled <- base::sample(group_vals) - - #assign shuffled groups to group column - data_shuffled <- sub_data %>% - dplyr::mutate( !!group_name := group_vals_shuffled) - + + # assign shuffled groups to group column + data_shuffled <- sub_data %>% + dplyr::mutate(!!group_name := group_vals_shuffled) + return(data_shuffled) } @@ -79,55 +83,57 @@ shuffle_group <- function(sub_data,group_name){ #' #' @examples #' \dontrun{ -#' df <- tibble(model=c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"), -#' AUC=c(.2,0.3,0.8,0.9,0.85,0.95)) +#' df <- tibble( +#' model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), +#' AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) +#' ) #' set.seed(123) -#' permute_p_value(df,"AUC","model","rf","glmnet",nperm=100) +#' permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 100) #' } -permute_p_value <- function(merged_data, metric, group_name, group_1, group_2, nperm=10000){ +permute_p_value <- function(merged_data, metric, group_name, group_1, group_2, nperm = 10000) { # check that the metric and group exist in data - if(!(metric %in% colnames(merged_data))){ + if (!(metric %in% colnames(merged_data))) { stop("The metric does not exist in the data.") } - if(!(group_name %in% colnames(merged_data))){ + if (!(group_name %in% colnames(merged_data))) { stop("The group_name does not exist in the data.") } # check that group_1 and group_2 exist in the data - if(!(group_1 %in% (merged_data %>% dplyr::pull(group_name)))){ + if (!(group_1 %in% (merged_data %>% dplyr::pull(group_name)))) { stop("group_1 does not exist in the data.") } - if(!(group_2 %in% (merged_data %>% dplyr::pull(group_name)))){ + if (!(group_2 %in% (merged_data %>% dplyr::pull(group_name)))) { stop("group_2 does not exist in the data.") } # subset results to select metric and group columns and # filter to only the two groups of interest sub_data <- merged_data %>% - dplyr::select({{ metric }},{{ group_name }}) %>% - dplyr::filter( .data[[group_name]] == {{group_1}} | .data[[group_name]] == {{group_2}}) - - # observed difference: quantify the absolute value of the difference - # in metric between the two groups - metric_obs <- get_difference(sub_data,{{group_name}},{{metric}}) - - # shuffled difference: quantify the absolute value of the difference + dplyr::select({{ metric }}, {{ group_name }}) %>% + dplyr::filter(.data[[group_name]] == {{ group_1 }} | .data[[group_name]] == {{ group_2 }}) + + # observed difference: quantify the absolute value of the difference + # in metric between the two groups + metric_obs <- get_difference(sub_data, {{ group_name }}, {{ metric }}) + + # shuffled difference: quantify the absolute value of the difference # in metric between the two groups after shuffling group labels rep_fn <- select_apply("replicate") - metric_null <- rep_fn(nperm,get_difference(shuffle_group(sub_data,group_name),group_name,metric)) - + metric_null <- rep_fn(nperm, get_difference(shuffle_group(sub_data, group_name), group_name, metric)) + # n: number of shuffled calculations n <- length(metric_null) # r: replications at least as extreme as observed effect r <- sum(abs(metric_null) >= metric_obs) - + # compute Monte Carlo p-value with correction (Davison & Hinkley, 1997) - p_value=(r+1)/(n+1) + p_value <- (r + 1) / (n + 1) return(p_value) } #' Compute all pairs of comparisons -#' calculate permuted p-value across all pairs of group variable. +#' calculate permuted p-value across all pairs of group variable. #' wrapper for `permute_p_value` #' #' @param merged_data the concatenated performance data from `run_ml` @@ -141,39 +147,46 @@ permute_p_value <- function(merged_data, metric, group_name, group_1, group_2, n #' #' @examples #' \dontrun{ -#' df <- tibble(model=c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"), -#' AUC=c(.2,0.3,0.8,0.9,0.85,0.95)) +#' df <- tibble( +#' model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), +#' AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) +#' ) #' set.seed(123) -#' compare_models(df,"AUC","model",nperm=100) +#' compare_models(df, "AUC", "model", nperm = 100) #' } -compare_models <- function(merged_data,metric,group_name,nperm=10000){ +compare_models <- function(merged_data, metric, group_name, nperm = 10000) { # check that the metric and group exist in data - if(!(metric %in% colnames(merged_data))){ + if (!(metric %in% colnames(merged_data))) { stop("The metric does not exist in the data.") } - if(!(group_name %in% colnames(merged_data))){ + if (!(group_name %in% colnames(merged_data))) { stop("The group_name does not exist in the data.") } - + # identify all unique groups in group variable - groups <- merged_data %>% - dplyr::pull( {{group_name}} ) %>% + groups <- merged_data %>% + dplyr::pull({{ group_name }}) %>% unique() - + # create a table with all possible comparisons of groups # without repeating pairings - p_table <- tidyr::expand_grid(x=1:length(groups), - y=1:length(groups)) %>% - dplyr::filter(x < y) %>% - dplyr::mutate(group1 = groups[x], - group2 = groups[y]) %>% - dplyr::select(-x, -y) %>% - dplyr::group_by(group1,group2) %>% - dplyr::summarize(p_value = permute_p_value(merged_data,metric,group_name,group1,group2,nperm), - .groups = "drop") - - return(as.data.frame(p_table)) + p_table <- tidyr::expand_grid( + x = 1:length(groups), + y = 1:length(groups) + ) %>% + dplyr::filter(x < y) %>% + dplyr::mutate( + group1 = groups[x], + group2 = groups[y] + ) %>% + dplyr::select(-x, -y) %>% + dplyr::group_by(group1, group2) %>% + dplyr::summarize( + p_value = permute_p_value(merged_data, metric, group_name, group1, group2, nperm), + .groups = "drop" + ) + return(as.data.frame(p_table)) } #### TESTS #### @@ -182,4 +195,3 @@ compare_models <- function(merged_data,metric,group_name,nperm=10000){ # shuffle_group(df,"samp") # permute_p_value(df,"val","samp","a","b",nperm=10) # compare_models(df,"val","samp",nperm=10) - diff --git a/tests/testthat/test-compare_models.R b/tests/testthat/test-compare_models.R index e5c0a867..e5c30e7f 100644 --- a/tests/testthat/test-compare_models.R +++ b/tests/testthat/test-compare_models.R @@ -1,65 +1,105 @@ # get_difference -test_that("get_difference works",{ - expect_equal(get_difference(data.frame(AUC=c(0.5,0.8),type=c("a","b")), - "type","AUC"), - 0.3) - expect_error(get_difference(data.frame(AUC=c(0.5,0.8),type=c("a","b")), - "type","type"), - "The specified metric is not numeric, please check that you specified the right column.") +test_that("get_difference works", { + expect_equal( + get_difference( + data.frame(AUC = c(0.5, 0.8), type = c("a", "b")), + "type", "AUC" + ), + 0.3 + ) + expect_error( + get_difference( + data.frame(AUC = c(0.5, 0.8), type = c("a", "b")), + "type", "type" + ), + "The specified metric is not numeric, please check that you specified the right column." + ) }) # shuffle_group -test_that("shuffle_group works",{ +test_that("shuffle_group works", { set.seed(2022) - df <- structure(list(condition=c("a","a","b","b"), - AUC=c(0.2,0.3,0.8,0.9)), - .Names = c("condition","AUC"), - row.names = c(NA,-4L), - class = "data.frame") - expect_equal(shuffle_group(df,"condition"), - data.frame(condition=c("b","b","a","a"),AUC=c(0.2,0.3,0.8,0.9))) - expect_error(shuffle_group(df,"group"), - "The group_name does not exist in the data.") + df <- structure(list( + condition = c("a", "a", "b", "b"), + AUC = c(0.2, 0.3, 0.8, 0.9) + ), + .Names = c("condition", "AUC"), + row.names = c(NA, -4L), + class = "data.frame" + ) + expect_equal( + shuffle_group(df, "condition"), + data.frame(condition = c("b", "b", "a", "a"), AUC = c(0.2, 0.3, 0.8, 0.9)) + ) + expect_error( + shuffle_group(df, "group"), + "The group_name does not exist in the data." + ) }) # permute_p_value test_that("permute_p_value works", { set.seed(2022) - df <- structure(list(model = c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"), - AUC = c(0.2,0.2,0.2,0.2,0.2,0.2)), - .Names = c("model","AUC"), - row.names = c(NA,-6L), - class = "data.frame") - expect_equal(permute_p_value(df,"AUC","model","rf","glmnet",nperm=10), - 1) - expect_error(permute_p_value(df,"auc","model","rf","glmnet",nperm=10), - "The metric does not exist in the data.") - expect_error(permute_p_value(df,"AUC","group","rf","glmnet",nperm=10), - "The group_name does not exist in the data.") - expect_error(permute_p_value(df,"AUC","model","RF","glmnet",nperm=10), - "group_1 does not exist in the data.") - expect_error(permute_p_value(df,"AUC","model","rf","logreg",nperm=10), - "group_2 does not exist in the data.") + df <- structure(list( + model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), + AUC = c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2) + ), + .Names = c("model", "AUC"), + row.names = c(NA, -6L), + class = "data.frame" + ) + expect_equal( + permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 10), + 1 + ) + expect_error( + permute_p_value(df, "auc", "model", "rf", "glmnet", nperm = 10), + "The metric does not exist in the data." + ) + expect_error( + permute_p_value(df, "AUC", "group", "rf", "glmnet", nperm = 10), + "The group_name does not exist in the data." + ) + expect_error( + permute_p_value(df, "AUC", "model", "RF", "glmnet", nperm = 10), + "group_1 does not exist in the data." + ) + expect_error( + permute_p_value(df, "AUC", "model", "rf", "logreg", nperm = 10), + "group_2 does not exist in the data." + ) }) # compare_models -test_that("compare_models works",{ +test_that("compare_models works", { set.seed(2022) - df <- structure(list(model = c("rf","rf","glmnet","glmnet","svmRadial","svmRadial"), - AUC = c(0.2,0.2,0.2,0.2,0.2,0.2)), - .Names = c("model","AUC"), - row.names = c(NA,-6L), - class = "data.frame") - expect_equal(compare_models(df,"AUC","model",10), - structure(list(group1=c("glmnet","rf","rf"), - group2=c("svmRadial","glmnet","svmRadial"), - p_value=c(1,1,1)), - .Names = c("group1","group2","p_value"), - row.names = c(NA,-3L), - class = "data.frame")) - expect_error(compare_models(df,"auc","model",100), - "The metric does not exist in the data.") - expect_error(compare_models(df,"AUC","group",100), - "The group_name does not exist in the data.") -}) \ No newline at end of file + df <- structure(list( + model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), + AUC = c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2) + ), + .Names = c("model", "AUC"), + row.names = c(NA, -6L), + class = "data.frame" + ) + expect_equal( + compare_models(df, "AUC", "model", 10), + structure(list( + group1 = c("glmnet", "rf", "rf"), + group2 = c("svmRadial", "glmnet", "svmRadial"), + p_value = c(1, 1, 1) + ), + .Names = c("group1", "group2", "p_value"), + row.names = c(NA, -3L), + class = "data.frame" + ) + ) + expect_error( + compare_models(df, "auc", "model", 100), + "The metric does not exist in the data." + ) + expect_error( + compare_models(df, "AUC", "group", 100), + "The group_name does not exist in the data." + ) +}) From dfe817d11bd5c0087f91cd34dbe82a53fe5b6066 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 2 May 2022 16:45:11 +0000 Subject: [PATCH 06/26] =?UTF-8?q?=F0=9F=93=91=20Build=20docs=20site?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/404.html | 4 +- docs/CODE_OF_CONDUCT.html | 5 +-- docs/CONTRIBUTING.html | 5 +-- docs/LICENSE-text.html | 4 +- docs/LICENSE.html | 5 +-- docs/SUPPORT.html | 5 +-- docs/articles/index.html | 4 +- docs/articles/introduction.html | 4 +- docs/articles/paper.html | 4 +- docs/articles/parallel.html | 6 +-- .../figure-html/customize_plot-1.png | Bin 26642 -> 26656 bytes .../figure-html/plot_perf-1.png | Bin 29976 -> 29919 bytes docs/articles/preprocess.html | 4 +- docs/articles/tuning.html | 4 +- docs/authors.html | 4 +- docs/index.html | 9 ++-- docs/news/index.html | 41 ++++++------------ docs/pkgdown.yml | 4 +- docs/pull_request_template.html | 28 +++++------- docs/reference/calc_perf_metrics.html | 4 +- docs/reference/combine_hp_performance.html | 4 +- docs/reference/define_cv.html | 4 +- docs/reference/get_caret_processed_df.html | 4 +- docs/reference/get_feature_importance.html | 6 +-- docs/reference/get_hp_performance.html | 4 +- docs/reference/get_hyperparams_list.html | 4 +- docs/reference/get_outcome_type.html | 4 +- docs/reference/get_partition_indices.html | 4 +- docs/reference/get_perf_metric_fn.html | 10 ++--- docs/reference/get_perf_metric_name.html | 4 +- docs/reference/get_performance_tbl.html | 4 +- docs/reference/get_tuning_grid.html | 4 +- docs/reference/group_correlated_features.html | 4 +- docs/reference/index.html | 4 +- docs/reference/mikropml.html | 4 +- docs/reference/otu_mini_bin.html | 4 +- .../otu_mini_bin_results_glmnet.html | 4 +- docs/reference/otu_mini_bin_results_rf.html | 4 +- .../otu_mini_bin_results_rpart2.html | 4 +- .../otu_mini_bin_results_svmRadial.html | 4 +- .../otu_mini_bin_results_xgbTree.html | 4 +- .../otu_mini_cont_results_glmnet.html | 4 +- .../reference/otu_mini_cont_results_nocv.html | 4 +- docs/reference/otu_mini_cv.html | 4 +- docs/reference/otu_mini_multi.html | 4 +- docs/reference/otu_mini_multi_group.html | 4 +- .../otu_mini_multi_results_glmnet.html | 4 +- docs/reference/otu_small.html | 4 +- docs/reference/plot_hp_performance.html | 4 +- docs/reference/plot_model_performance.html | 4 +- docs/reference/preprocess_data.html | 4 +- docs/reference/randomize_feature_order.html | 12 ++--- docs/reference/reexports.html | 4 +- docs/reference/remove_singleton_columns.html | 4 +- docs/reference/replace_spaces.html | 4 +- docs/reference/run_ml.html | 8 ++-- docs/reference/tidy_perf_data.html | 4 +- docs/reference/train_model.html | 4 +- 58 files changed, 143 insertions(+), 173 deletions(-) diff --git a/docs/404.html b/docs/404.html index 0f7568a8..986332d0 100644 --- a/docs/404.html +++ b/docs/404.html @@ -49,7 +49,7 @@ Reference