diff --git a/.Rbuildignore b/.Rbuildignore index c0900ea1..96912ad0 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,4 @@ ^cran-comments\.md$ ^revdep$ ^CRAN-RELEASE$ +^CRAN-SUBMISSION$ diff --git a/DESCRIPTION b/DESCRIPTION index b94c54f8..b8ea3e32 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mikropml Title: User-Friendly R Package for Supervised Machine Learning Pipelines -Version: 1.2.2.9000 -Date: 2022-02-03 +Version: 1.3.0 +Date: 2022-05-19 Authors@R: c(person(given = "Begüm", family = "Topçuoğlu", @@ -91,4 +91,4 @@ VignetteBuilder: Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 diff --git a/NAMESPACE b/NAMESPACE index 81dd1dd4..68575aad 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(":=") export(.data) export(calc_perf_metrics) export(combine_hp_performance) +export(compare_models) export(contr.ltfr) export(define_cv) export(get_caret_processed_df) @@ -19,6 +20,7 @@ export(get_perf_metric_name) export(get_performance_tbl) export(get_tuning_grid) export(group_correlated_features) +export(permute_p_value) export(plot_hp_performance) export(plot_model_performance) export(preprocess_data) diff --git a/NEWS.md b/NEWS.md index af236574..aebff2c4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# development version +# mikropml 1.3.0 - mikropml now requires R version 4.1.0 or greater due to an update in the randomForest package (#292). - New function `compare_models()` compares the performance of two models with a permutation test (#295, @courtneyarmour). diff --git a/R/compare_models.R b/R/compare_models.R index 13ba1a19..5d4844e7 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -1,5 +1,3 @@ -#' Average metric difference -#' #' Calculate the difference in the mean of the metric for two groups #' #' @param sub_data subset of the merged performance data frame for two groups @@ -8,7 +6,7 @@ #' #' @return numeric difference in the average metric between the two groups #' -#' @export +#' @noRd #' @author Courtney Armour, \email{armourc@@umich.edu} #' #' @examples @@ -38,7 +36,7 @@ get_difference <- function(sub_data, group_name, metric) { #' @param col_name column name to shuffle #' #' @return `dat` with the rows of `col_name` shuffled -#' @export +#' @noRd #' @author Courtney R Armour, \email{armourc@@umich.edu} #' #' @examples @@ -62,7 +60,6 @@ shuffle_group <- function(dat, col_name) { return(data_shuffled) } - #' Calculated a permuted p-value comparing two models #' #' @inheritParams compare_models @@ -124,16 +121,17 @@ permute_p_value <- function(merged_data, metric, group_name, group_1, group_2, n } -#' Compute all pairs of comparisons -#' calculate permuted p-value across all pairs of group variable. -#' wrapper for `permute_p_value` +#' Perform permutation tests to compare the performance metric +#' across all pairs of a group variable. +#' +#' A wrapper for `permute_p_value()`. 
#' #' @param merged_data the concatenated performance data from `run_ml` #' @param metric metric to compare, must be numeric #' @param group_name column with group variables to compare #' @param nperm number of permutations, default=10000 #' -#' @return a table of p-values for all pairs of group varible +#' @return a table of p-values for all pairs of group variable #' @export #' @author Courtney R Armour, \email{armourc@@umich.edu} #' diff --git a/R/data.R b/R/data.R index 8df06165..9f12e9b0 100644 --- a/R/data.R +++ b/R/data.R @@ -32,19 +32,19 @@ #' Cross validation on `train_data_mini` with grouped features. "otu_mini_cv" -#' Results from running the pipline with L2 logistic regression on `otu_mini_bin` with feature importance and grouping +#' Results from running the pipeline with L2 logistic regression on `otu_mini_bin` with feature importance and grouping "otu_mini_bin_results_glmnet" -#' Results from running the pipline with random forest on `otu_mini_bin` +#' Results from running the pipeline with random forest on `otu_mini_bin` "otu_mini_bin_results_rf" -#' Results from running the pipline with svmRadial on `otu_mini_bin` +#' Results from running the pipeline with svmRadial on `otu_mini_bin` "otu_mini_bin_results_svmRadial" -#' Results from running the pipline with xbgTree on `otu_mini_bin` +#' Results from running the pipeline with xbgTree on `otu_mini_bin` "otu_mini_bin_results_xgbTree" -#' Results from running the pipline with rpart2 on `otu_mini_bin` +#' Results from running the pipeline with rpart2 on `otu_mini_bin` "otu_mini_bin_results_rpart2" #' Results from running the pipeline with glmnet on `otu_mini_bin` with `Otu00001` diff --git a/_pkgdown.yml b/_pkgdown.yml index b126508f..dc00676f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -29,15 +29,22 @@ reference: - mikropml - preprocess_data - run_ml -- title: Plotting & evalutation helpers +- title: Plotting helpers desc: > - Visualize & evalutate performance to help you tune hyperparameters and choose model methods. + Visualize results to help you tune hyperparameters and choose model methods. contents: - - compare_models - starts_with('plot') - tidy_perf_data - get_hp_performance - combine_hp_performance +- title: Model evaluation + desc: > + Evaluate and interpret models. + contents: + - get_feature_importance + - get_performance_tbl + - compare_models + - permute_p_value - title: Package Data - subtitle: datasets contents: @@ -54,9 +61,8 @@ reference: - replace_spaces - title: Pipeline customization desc: > - These are functions called by preprocess_data() or run_ml(). - We make them available in case you would like to customize various steps - of the pipeline beyond the arguments provided by the main functions. + Customize various steps of the pipeline beyond the arguments provided by + run_ml() and preprocess_data(). contents: - remove_singleton_columns - get_caret_processed_df @@ -70,6 +76,4 @@ reference: - get_perf_metric_fn - train_model - calc_perf_metrics - - get_performance_tbl - - get_feature_importance - group_correlated_features diff --git a/cran-comments.md b/cran-comments.md index a6e1dfd3..7078366e 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,4 +1,3 @@ -This patch fixes a test failure on the no long doubles platform. ## Test environments diff --git a/docs/404.html b/docs/404.html index 986332d0..50b47da1 100644 --- a/docs/404.html +++ b/docs/404.html @@ -39,7 +39,7 @@
diff --git a/docs/CODE_OF_CONDUCT.html b/docs/CODE_OF_CONDUCT.html index ccf732ce..356d405e 100644 --- a/docs/CODE_OF_CONDUCT.html +++ b/docs/CODE_OF_CONDUCT.html @@ -17,7 +17,7 @@ diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html index 632efc65..cfcb2766 100644 --- a/docs/CONTRIBUTING.html +++ b/docs/CONTRIBUTING.html @@ -17,7 +17,7 @@ diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index c35487fd..8c20f191 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 757d2260..a38ff1d8 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -17,7 +17,7 @@ diff --git a/docs/SUPPORT.html b/docs/SUPPORT.html index a297054e..31683945 100644 --- a/docs/SUPPORT.html +++ b/docs/SUPPORT.html @@ -17,7 +17,7 @@ diff --git a/docs/articles/index.html b/docs/articles/index.html index 11263a96..9e13e8a1 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ diff --git a/docs/articles/introduction.html b/docs/articles/introduction.html index 0b546ce5..8466437c 100644 --- a/docs/articles/introduction.html +++ b/docs/articles/introduction.html @@ -40,7 +40,7 @@ @@ -280,6 +280,11 @@kfold
,
#> Training the model...
#> Loading required package: ggplot2
#> Loading required package: lattice
+#>
+#> Attaching package: 'caret'
+#> The following object is masked from 'package:mikropml':
+#>
+#> compare_models
#> Warning in (function (w) : `caret::train()` issued the following warning:
#>
#> simpleWarning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing values in resampled performance measures.
@@ -374,7 +379,7 @@ The one difference here is run_ml()
will report how much of the data is in the training set if you run the above code chunk. This can be a little finicky depending on how many samples and groups you have. This is because it won’t be exactly what you specify with training_frac
, since you have to include all of one group in either the training set or the test set.
When you use the groups
parameter as above, by default run_ml()
will assume that you want all of the observations from each group to be placed in the same partition of the train/test split. This makes sense when you want to use groups to control for batch effects. However, in some cases you might prefer to control exactly which groups end up in which partition, and you might even be okay with some observations from the same group being assigned to different partitions.
For example, say you want groups A and B to be used for training, C and D for testing, and you don’t have a preference for what happens to the other groups. You can give the group_partitions
parameter a named list to specify which groups should go in the training set and which should go in the testing set.
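For example, a minimal sketch (the group labels and seed here are arbitrary; otu_mini_bin ships with the package):
set.seed(2019)
sample_groups <- sample(LETTERS[1:8], nrow(otu_mini_bin), replace = TRUE)
results_grp <- run_ml(
  otu_mini_bin,
  "glmnet",
  outcome_colname = "dx",
  kfold = 2,
  training_frac = 0.8,
  groups = sample_groups,
  group_partitions = list(
    train = c("A", "B"), # always placed in the training set
    test = c("C", "D")   # always placed in the testing set
  ),
  seed = 2019
)
Groups not named in either element may end up in whichever partition best satisfies training_frac.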
method
: The ML method used.
perf_metric_name
: The peformance metric used.perf_metric_name
: The performance metric used.
seed
: The seed (if set).Machine learning (ML) for classification and prediction based on a set of features is used to make decisions in healthcare, economics, criminal justice and more. However, implementing an ML pipeline including preprocessing, model selection, and evaluation can be time-consuming, confusing, and difficult. Here, we present mikropml
(prononced “meek-ROPE em el”), an easy-to-use R package that implements ML pipelines using regression, support vector machines, decision trees, random forest, or gradient-boosted trees. The package is available on GitHub, CRAN, and conda.
Machine learning (ML) for classification and prediction based on a set of features is used to make decisions in healthcare, economics, criminal justice and more. However, implementing an ML pipeline including preprocessing, model selection, and evaluation can be time-consuming, confusing, and difficult. Here, we present mikropml
(pronounced “meek-ROPE em el”), an easy-to-use R package that implements ML pipelines using regression, support vector machines, decision trees, random forest, or gradient-boosted trees. The package is available on GitHub, CRAN, and conda.
By default, preprocess_data()
, run_ml()
, and compare_models()
use only one process in series. If you’d like to parallelize various steps of the pipeline to make them run faster, install foreach
, future
, future.apply
, and doFuture
. Then, register a future plan prior to calling these functions:
By default, preprocess_data()
, run_ml()
, and compare_models()
use only one process in series. If you’d like to parallelize various steps of the pipeline to make them run faster, install foreach
, future
, future.apply
, and doFuture
. Then, register a future plan prior to calling these functions:
doFuture::registerDoFuture()
future::plan(future::multicore, workers = 2)
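For instance, after registering the plan as above, several run_ml() calls with different seeds can be dispatched to the workers (a sketch using the bundled otu_small dataset; future.seed = TRUE keeps the random numbers reproducible):
results_multi <- future.apply::future_lapply(seq(100, 102), function(seed) {
  run_ml(otu_small, "glmnet", seed = seed)
}, future.seed = TRUE)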
The output is a list: dat_transformed
which has the transformed data, grp_feats
which is a list of grouped features, and removed_feats
which is a list of featuures that were removed. Here, grp_feats
is NULL
because there are no perfectly correlated features (e.g. c(0,1,0)
and c(0,1,0)
, or c(0,1,0)
and c(1,0,1)
- see below for more details).
The output is a list: dat_transformed
which has the transformed data, grp_feats
which is a list of grouped features, and removed_feats
which is a list of features that were removed. Here, grp_feats
is NULL
because there are no perfectly correlated features (e.g. c(0,1,0)
and c(0,1,0)
, or c(0,1,0)
and c(1,0,1)
- see below for more details).
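A minimal sketch of inspecting that output, using the bundled otu_small dataset:
prep <- preprocess_data(dataset = otu_small, outcome_colname = "dx")
names(prep)     # "dat_transformed" "grp_feats" "removed_feats"
prep$grp_feats  # NULL unless some features were perfectly correlated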
The first column (var1
) in dat_transformed
is a character and is changed to var1_yes
that has zeros (no) and ones (yes). The values in the second column (var2
) stay the same because it’s already binary, but the name changes to var2_1
. The third column (var3
) is a factor and is also changed to binary where b is 1 and a is 0, as denoted by the new column name var3_b
.
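A toy sketch of those conversions (the values below are made up):
df_raw <- data.frame(
  outcome = c("normal", "normal", "cancer", "cancer"),
  var1 = c("no", "yes", "no", "yes"),  # character -> var1_yes (0/1)
  var2 = c(0, 1, 1, 0),                # already binary -> renamed var2_1
  var3 = factor(c("a", "a", "b", "b")) # factor -> var3_b (b = 1, a = 0)
)
preprocess_data(df_raw, outcome_colname = "outcome")$dat_transformed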
run_ml()
#> Training the model...
#> Loading required package: ggplot2
#> Loading required package: lattice
+#>
+#> Attaching package: 'caret'
+#> The following object is masked from 'package:mikropml':
+#>
+#> compare_models
#> Training complete.
You’ll probably get a warning when you run this because the dataset is very small. If you want to learn more about that, check out the introductory vignette about training and evaluating an ML model: vignette("introduction")
.
By default, run_ml()
selects hyperparameters depending on the dataset and method used.
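If you prefer, you can supply your own grid through the hyperparameters argument; a hedged sketch for glmnet (the alpha and lambda values are arbitrary):
results_tuned <- run_ml(
  otu_mini_bin,
  "glmnet",
  outcome_colname = "dx",
  hyperparameters = list(alpha = 0, lambda = c(0.1, 1, 10)),
  seed = 2019
)
results_tuned$trained_model$results  # cross-validation performance for each candidate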
NEWS.md
+ compare_models()
compares the performance of two models with a permutation test (#295, @courtneyarmour).cv_times
did not affect the reported repeats for cross-validation (#291, @kelly-sovacool).This minor patch fixes a test failure on platforms with no long doubles. The actual package code remains unchanged.
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index e9094896..d50db519 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -7,7 +7,7 @@ articles: parallel: parallel.html preprocess: preprocess.html tuning: tuning.html -last_built: 2022-05-18T17:33Z +last_built: 2022-05-19T19:39Z urls: reference: http://www.schlosslab.org/mikropml/reference article: http://www.schlosslab.org/mikropml/articles diff --git a/docs/pull_request_template.html b/docs/pull_request_template.html index cc58413a..4a26cae7 100644 --- a/docs/pull_request_template.html +++ b/docs/pull_request_template.html @@ -17,7 +17,7 @@R/compare_models.R
+ compare_models.Rd
A wrapper for permute_p_value()
.
compare_models(merged_data, metric, group_name, nperm = 10000)
the concatenated performance data from run_ml
metric to compare, must be numeric
column with group variables to compare
number of permutations, default=10000
a table of p-values for all pairs of group variable
+df <- dplyr::tibble(
+ model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"),
+ AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95)
+)
+set.seed(123)
+compare_models(df, "AUC", "model", nperm = 10)
+#> group1 group2 p_value
+#> 1 glmnet svmRadial 0.7272727
+#> 2 rf glmnet 0.2727273
+#> 3 rf svmRadial 0.5454545
+
Calculate the difference in the mean of the metric for two groups
+get_difference(sub_data, group_name, metric)
subset of the merged performance data frame for two groups
name of column with group variable
metric to compare
numeric difference in the average metric between the two groups
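A sketch of what it computes (get_difference() is internal as of this release, hence the ::: accessor; the values are made up):
sub_data <- dplyr::tibble(
  model = c("rf", "rf", "glmnet", "glmnet"),
  AUC = c(0.6, 0.7, 0.8, 0.9)
)
# difference between mean AUC for "rf" (0.65) and for "glmnet" (0.85)
mikropml:::get_difference(sub_data, "model", "AUC")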
+Run the machine learning pipeline
Visualize & evalutate performance to help you tune hyperparameters and choose model methods.
+Visualize results to help you tune hyperparameters and choose model methods.
combine_hp_performance()
Combine hyperparameter performance metrics for multiple train/test splits
Evaluate and interpret models.
+Get feature importance using the permutation method
Get model performance metrics as a one-row tibble
Perform permutation tests to compare the performance metric +across all pairs of a group variable.
Calculate a permuted p-value comparing two models
Results from running the pipline with L2 logistic regression on otu_mini_bin
with feature importance and grouping
Results from running the pipeline with L2 logistic regression on otu_mini_bin
with feature importance and grouping
Results from running the pipline with random forest on otu_mini_bin
Results from running the pipeline with random forest on otu_mini_bin
Results from running the pipline with rpart2 on otu_mini_bin
Results from running the pipeline with rpart2 on otu_mini_bin
Results from running the pipline with svmRadial on otu_mini_bin
Results from running the pipeline with svmRadial on otu_mini_bin
Results from running the pipline with xbgTree on otu_mini_bin
Results from running the pipeline with xgbTree on otu_mini_bin
Replace spaces in all elements of a character vector with underscores
These are functions called by preprocess_data() or run_ml(). We make them available in case you would like to customize various steps of the pipeline beyond the arguments provided by the main functions.
+Customize various steps of the pipeline beyond the arguments provided by run_ml() and preprocess_data().
Get performance metrics for test data
Get model performance metrics as a one-row tibble
Get feature importance using the permutation method
otu_mini_bin
with feature importance and groupingotu_mini_bin
with feature importance and groupingR/data.R
otu_mini_bin_results_glmnet.Rd
Results from running the pipline with L2 logistic regression on otu_mini_bin
with feature importance and grouping
Results from running the pipeline with L2 logistic regression on otu_mini_bin
with feature importance and grouping
otu_mini_bin
otu_mini_bin
R/data.R
otu_mini_bin_results_rf.Rd
Results from running the pipline with random forest on otu_mini_bin
Results from running the pipeline with random forest on otu_mini_bin
otu_mini_bin
otu_mini_bin
R/data.R
otu_mini_bin_results_rpart2.Rd
Results from running the pipline with rpart2 on otu_mini_bin
Results from running the pipeline with rpart2 on otu_mini_bin
otu_mini_bin
otu_mini_bin
R/data.R
otu_mini_bin_results_svmRadial.Rd
Results from running the pipline with svmRadial on otu_mini_bin
Results from running the pipeline with svmRadial on otu_mini_bin
otu_mini_bin
otu_mini_bin
R/data.R
otu_mini_bin_results_xgbTree.Rd
Results from running the pipline with xbgTree on otu_mini_bin
Results from running the pipeline with xgbTree on otu_mini_bin
R/compare_models.R
+ permute_p_value.Rd
Calculate a permuted p-value comparing two models
+permute_p_value(
+ merged_data,
+ metric,
+ group_name,
+ group_1,
+ group_2,
+ nperm = 10000
+)
the concatenated performance data from run_ml
metric to compare, must be numeric
column with group variables to compare
name of one group to compare
name of other group to compare
number of permutations, default=10000
numeric p-value comparing two models
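For example, mirroring the compare_models() example above (with only 100 permutations the p-value is coarse, so use more in practice):
df <- dplyr::tibble(
  model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"),
  AUC = c(0.2, 0.3, 0.8, 0.9, 0.85, 0.95)
)
set.seed(123)
permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 100)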
+Shuffle the rows in a column
+shuffle_group(dat, col_name)
a data frame containing col_name
column name to shuffle
dat
with the rows of col_name
shuffled
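A small sketch (shuffle_group() is internal as of this release, hence the ::: accessor; the data frame is hypothetical):
set.seed(123)
dat <- dplyr::tibble(
  group = c("a", "a", "b", "b"),
  value = 1:4
)
mikropml:::shuffle_group(dat, "group")  # same rows, with `group` values permuted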