From 6944510ca9e62fd33a576580dd227ea94b11d1ea Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 12:34:17 -0400 Subject: [PATCH 01/10] Tweak doc title for compare_models() --- R/compare_models.R | 7 ++++--- man/compare_models.Rd | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 man/compare_models.Rd diff --git a/R/compare_models.R b/R/compare_models.R index 13ba1a19..e6ef622e 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -124,9 +124,10 @@ permute_p_value <- function(merged_data, metric, group_name, group_1, group_2, n } -#' Compute all pairs of comparisons -#' calculate permuted p-value across all pairs of group variable. -#' wrapper for `permute_p_value` +#' Perform permutation tests to compare the performance metric +#' across all pairs of a group variable. +#' +#' A wrapper for `permute_p_value()`. #' #' @param merged_data the concatenated performance data from `run_ml` #' @param metric metric to compare, must be numeric diff --git a/man/compare_models.Rd b/man/compare_models.Rd new file mode 100644 index 00000000..d2ccab3e --- /dev/null +++ b/man/compare_models.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_models.R +\name{compare_models} +\alias{compare_models} +\title{Perform permutation tests to compare the performance metric +across all pairs of a group variable.} +\usage{ +compare_models(merged_data, metric, group_name, nperm = 10000) +} +\arguments{ +\item{merged_data}{the concatenated performance data from \code{run_ml}} + +\item{metric}{metric to compare, must be numeric} + +\item{group_name}{column with group variables to compare} + +\item{nperm}{number of permutations, default=10000} +} +\value{ +a table of p-values for all pairs of group varible +} +\description{ +A wrapper for \code{permute_p_value()}. +} +\examples{ +df <- dplyr::tibble( + model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), + AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) +) +set.seed(123) +compare_models(df, "AUC", "model", nperm = 10) +} +\author{ +Courtney R Armour, \email{armourc@umich.edu} +} From 81d1605fecf63ed3feb543a784262c4fa8bb8ff3 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 12:34:37 -0400 Subject: [PATCH 02/10] Prepare to release 1.3.0 --- DESCRIPTION | 6 +++--- NEWS.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b94c54f8..b8ea3e32 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mikropml Title: User-Friendly R Package for Supervised Machine Learning Pipelines -Version: 1.2.2.9000 -Date: 2022-02-03 +Version: 1.3.0 +Date: 2022-05-19 Authors@R: c(person(given = "Begüm", family = "Topçuoğlu", @@ -91,4 +91,4 @@ VignetteBuilder: Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 diff --git a/NEWS.md b/NEWS.md index af236574..aebff2c4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# development version +# mikropml 1.3.0 - mikropml now requires R version 4.1.0 or greater due to an update in the randomForest package (#292). - New function `compare_models()` compares the performance of two models with a permutation test (#295, @courtneyarmour). 
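To make the relationship between the two new functions concrete, here is a rough sketch of calling permute_p_value() directly for a single pair of models. The argument names follow the signature visible in the R/compare_models.R hunk above; the performance tibble is made up for illustration, and the permutation-count argument is assumed to be nperm, matching compare_models().

 library(mikropml)
 # concatenated performance results, e.g. row-bound output of several run_ml() calls
 perf_df <- dplyr::tibble(
   model = c("rf", "rf", "glmnet", "glmnet"),
   AUC = c(0.2, 0.3, 0.8, 0.9)
 )
 set.seed(123)
 # permuted p-value for one pair of groups; compare_models() wraps this over all pairs
 permute_p_value(perf_df,
                 metric = "AUC",
                 group_name = "model",
                 group_1 = "rf",
                 group_2 = "glmnet",
                 nperm = 100)

compare_models(perf_df, "AUC", "model", nperm = 100) would apply the same test to every pairwise combination of models, as in the \examples block added above.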
From ca1df249a9b76019f29c479bc69c258da95f8476 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 12:34:55 -0400 Subject: [PATCH 03/10] document() & build_site() --- NAMESPACE | 4 + docs/404.html | 2 +- docs/CODE_OF_CONDUCT.html | 3 +- docs/CONTRIBUTING.html | 3 +- docs/LICENSE-text.html | 2 +- docs/LICENSE.html | 3 +- docs/SUPPORT.html | 3 +- docs/articles/index.html | 2 +- docs/articles/introduction.html | 359 +++++++++++---- docs/articles/paper.html | 417 +++++++++++++----- docs/articles/parallel.html | 161 ++++--- docs/articles/preprocess.html | 216 ++++++--- docs/articles/tuning.html | 169 +++++-- docs/authors.html | 2 +- docs/index.html | 9 +- docs/news/index.html | 46 +- docs/pkgdown.yml | 4 +- docs/pull_request_template.html | 28 +- docs/reference/calc_perf_metrics.html | 2 +- docs/reference/combine_hp_performance.html | 2 +- docs/reference/compare_models.html | 145 ++++++ docs/reference/define_cv.html | 2 +- docs/reference/get_caret_processed_df.html | 2 +- docs/reference/get_difference.html | 137 ++++++ docs/reference/get_feature_importance.html | 4 +- docs/reference/get_hp_performance.html | 2 +- docs/reference/get_hyperparams_list.html | 2 +- docs/reference/get_outcome_type.html | 2 +- docs/reference/get_partition_indices.html | 2 +- docs/reference/get_perf_metric_fn.html | 8 +- docs/reference/get_perf_metric_name.html | 2 +- docs/reference/get_performance_tbl.html | 2 +- docs/reference/get_tuning_grid.html | 2 +- docs/reference/group_correlated_features.html | 2 +- docs/reference/index.html | 7 +- docs/reference/mikropml.html | 2 +- docs/reference/otu_mini_bin.html | 2 +- .../otu_mini_bin_results_glmnet.html | 2 +- docs/reference/otu_mini_bin_results_rf.html | 2 +- .../otu_mini_bin_results_rpart2.html | 2 +- .../otu_mini_bin_results_svmRadial.html | 2 +- .../otu_mini_bin_results_xgbTree.html | 2 +- .../otu_mini_cont_results_glmnet.html | 2 +- .../reference/otu_mini_cont_results_nocv.html | 2 +- docs/reference/otu_mini_cv.html | 2 +- docs/reference/otu_mini_multi.html | 2 +- docs/reference/otu_mini_multi_group.html | 2 +- .../otu_mini_multi_results_glmnet.html | 2 +- docs/reference/otu_small.html | 2 +- docs/reference/permute_p_value.html | 151 +++++++ docs/reference/plot_hp_performance.html | 2 +- docs/reference/plot_model_performance.html | 2 +- docs/reference/preprocess_data.html | 2 +- docs/reference/randomize_feature_order.html | 10 +- docs/reference/reexports.html | 4 +- docs/reference/remove_singleton_columns.html | 2 +- docs/reference/replace_spaces.html | 2 +- docs/reference/run_ml.html | 2 +- docs/reference/shuffle_group.html | 141 ++++++ docs/reference/tidy_perf_data.html | 2 +- docs/reference/train_model.html | 2 +- docs/sitemap.xml | 12 + man/get_difference.Rd | 32 ++ man/permute_p_value.Rd | 47 ++ man/reexports.Rd | 2 +- man/shuffle_group.Rd | 30 ++ 66 files changed, 1799 insertions(+), 432 deletions(-) create mode 100644 docs/reference/compare_models.html create mode 100644 docs/reference/get_difference.html create mode 100644 docs/reference/permute_p_value.html create mode 100644 docs/reference/shuffle_group.html create mode 100644 man/get_difference.Rd create mode 100644 man/permute_p_value.Rd create mode 100644 man/shuffle_group.Rd diff --git a/NAMESPACE b/NAMESPACE index 81dd1dd4..8410e1c2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,9 +6,11 @@ export(":=") export(.data) export(calc_perf_metrics) export(combine_hp_performance) +export(compare_models) export(contr.ltfr) export(define_cv) export(get_caret_processed_df) +export(get_difference) 
export(get_feature_importance) export(get_hp_performance) export(get_hyperparams_list) @@ -19,6 +21,7 @@ export(get_perf_metric_name) export(get_performance_tbl) export(get_tuning_grid) export(group_correlated_features) +export(permute_p_value) export(plot_hp_performance) export(plot_model_performance) export(preprocess_data) @@ -26,6 +29,7 @@ export(randomize_feature_order) export(remove_singleton_columns) export(replace_spaces) export(run_ml) +export(shuffle_group) export(tidy_perf_data) export(train_model) importFrom(MLmetrics,AUC) diff --git a/docs/404.html b/docs/404.html index 986332d0..50b47da1 100644 --- a/docs/404.html +++ b/docs/404.html @@ -39,7 +39,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/CODE_OF_CONDUCT.html b/docs/CODE_OF_CONDUCT.html index ccf732ce..5bb0afd6 100644 --- a/docs/CODE_OF_CONDUCT.html +++ b/docs/CODE_OF_CONDUCT.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 @@ -73,6 +73,7 @@

Contributor Covenant Code of Conduct

+

This document was adapted from the Tidyverse Code of Conduct.

Our Pledge

diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html index 632efc65..4cc444a7 100644 --- a/docs/CONTRIBUTING.html +++ b/docs/CONTRIBUTING.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0
@@ -73,6 +73,7 @@

Contributing to mikropml

+

This document was adapted from the Tidyverse Contributing guide.

Fixing typos

diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index c35487fd..8c20f191 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0
diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 757d2260..2d2d71ff 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0
@@ -73,6 +73,7 @@

MIT License

+

Copyright (c) 2019-2021 Begüm D. Topçuoğlu, Zena Lapp, Kelly L. Sovacool, Evan Snitkin, Jenna Wiens, and Patrick D. Schloss

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

diff --git a/docs/SUPPORT.html b/docs/SUPPORT.html index a297054e..e80f09e2 100644 --- a/docs/SUPPORT.html +++ b/docs/SUPPORT.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0
@@ -73,6 +73,7 @@

Getting help with mikropml

+

Thanks for using mikropml! Before filing an issue, there are a few places to explore and pieces to put together to make the process as smooth as possible.

Make a reprex

diff --git a/docs/articles/index.html b/docs/articles/index.html index 11263a96..9e13e8a1 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0
diff --git a/docs/articles/introduction.html b/docs/articles/introduction.html index 0b546ce5..57ba736e 100644 --- a/docs/articles/introduction.html +++ b/docs/articles/introduction.html @@ -40,7 +40,7 @@ mikropml - 1.2.2.9000 + 1.3.0
@@ -100,7 +100,7 @@ -
+

Before running ML

-

Before you execute run_ml(), you should consider preprocessing your data, either on your own or with the preprocess_data() function. You can learn more about this in the preprocessing vignette: vignette("preprocess").

+

Before you execute run_ml(), you should consider +preprocessing your data, either on your own or with the +preprocess_data() function. You can learn more about this +in the preprocessing vignette: vignette("preprocess").

The simplest way to run_ml()

-

As mentioned above, the minimal input is your dataset (dataset) and the machine learning model you want to use (method).

+

As mentioned above, the minimal input is your dataset +(dataset) and the machine learning model you want to use +(method).

You may also want to provide:

    -
  • The outcome column name. By default run_ml() will pick the first column, but it’s best practice to specify the column name explicitly.
  • -
  • A seed so that the results will be reproducible, and so that you get the same results as those you see here (i.e have the same train/test split).
  • +
  • The outcome column name. By default run_ml() will pick +the first column, but it’s best practice to specify the column name +explicitly.
  • +
  • A seed so that the results will be reproducible, and so that you get +the same results as those you see here (i.e have the same train/test +split).
-

Say we want to use logistic regression, then the method we will use is glmnet. To do so, run the ML pipeline with:

+

Say we want to use logistic regression, then the method we will use +is glmnet. To do so, run the ML pipeline with:

 results <- run_ml(otu_mini_bin,
                   'glmnet',
@@ -200,16 +253,25 @@ 

The simplest way to run_ml()= 2019)

You’ll notice a few things:

    -
  1. It takes a little while to run. This is because of some of the parameters we use.
  2. -
  3. There is a message stating that ‘dx’ is being used as the outcome column. This is what we want, but it’s a nice sanity check!
  4. -
  5. There was a warning. Don’t worry about this warning right now - it just means that some of the hyperparameters aren’t a good fit - but if you’re interested in learning more, see vignette("tuning").
  6. +
  7. It takes a little while to run. This is because of some of the +parameters we use.
  8. +
  9. There is a message stating that ‘dx’ is being used as the outcome +column. This is what we want, but it’s a nice sanity check!
  10. +
  11. There was a warning. Don’t worry about this warning right now - it +just means that some of the hyperparameters aren’t a good fit - but if +you’re interested in learning more, see +vignette("tuning").
-

Now, let’s dig into the output a bit. The results is a list of 4 things:

+

Now, let’s dig into the output a bit. The results is a list of 4 +things:

 names(results)
 #> [1] "trained_model"      "test_data"          "performance"       
 #> [4] "feature_importance"
-

trained_model is the trained model from caret. There is a bunch of info in this that we won’t get into, because you can learn more from the caret::train() documentation.

+

trained_model is the trained model from +caret. There is a bunch of info in this that we won’t get +into, because you can learn more from the caret::train() +documentation.

 names(results$trained_model)
 #>  [1] "method"       "modelInfo"    "modelType"    "results"      "pred"        
@@ -217,7 +279,13 @@ 

The simplest way to run_ml()#> [11] "finalModel" "preProcess" "trainingData" "ptype" "resample" #> [16] "resampledCM" "perfNames" "maximize" "yLimits" "times" #> [21] "levels" "terms" "coefnames" "xlevels"

-

test_data is the partition of the dataset that was used for testing. In machine learning, it’s always important to have a held-out test dataset that is not used in the training stage. In this pipeline we do that using run_ml() where we split your data into training and testing sets. The training data are used to build the model (e.g. tune hyperparameters, learn the data) and the test data are used to evaluate how well the model performs.

+

test_data is the partition of the dataset that was used +for testing. In machine learning, it’s always important to have a +held-out test dataset that is not used in the training stage. In this +pipeline we do that using run_ml() where we split your data +into training and testing sets. The training data are used to build the +model (e.g. tune hyperparameters, learn the data) and the test data are +used to evaluate how well the model performs.

 head(results$test_data)
 #>        dx Otu00009 Otu00005 Otu00010 Otu00001 Otu00008 Otu00004 Otu00003
@@ -234,7 +302,10 @@ 

The simplest way to run_ml()#> 17 357 253 341 #> 27 25 322 5 #> 30 179 6 30

-

performance is a dataframe of (mainly) performance metrics (1 column for cross-validation performance metric, several for test performance metrics, and 2 columns at the end with ML method and seed):

+

performance is a dataframe of (mainly) performance +metrics (1 column for cross-validation performance metric, several for +test performance metrics, and 2 columns at the end with ML method and +seed):

-

When using logistic regression for binary classification, area under the receiver-operator characteristic curve (AUC) is a useful metric to evaluate model performance. Because of that, it’s the default that we use for mikropml. However, it is crucial to evaluate your model performance using multiple metrics. Below you can find more information about other performance metrics and how to use them in our package.

-

cv_metric_AUC is the AUC for the cross-validation folds for the training data. This gives us a sense of how well the model performs on the training data.

-

Most of the other columns are performance metrics for the test data — the data that wasn’t used to build the model. Here, you can see that the AUC for the test data is not much above 0.5, suggesting that this model does not predict much better than chance, and that the model is overfit because the cross-validation AUC (cv_metric_AUC, measured during training) is much higher than the testing AUC. This isn’t too surprising since we’re using so few features with this example dataset, so don’t be discouraged. The default option also provides a number of other performance metrics that you might be interested in, including area under the precision-recall curve (prAUC).

-

The last columns of results$performance are the method and seed (if you set one) to help with combining results from multiple runs (see vignette("parallel")).

-

feature_importance has information about feature importance values if find_feature_importance = TRUE (the default is FALSE). Since we used the defaults, there’s nothing here:

+

When using logistic regression for binary classification, area under +the receiver-operator characteristic curve (AUC) is a useful metric to +evaluate model performance. Because of that, it’s the default that we +use for mikropml. However, it is crucial to evaluate your +model performance using multiple metrics. Below you can find more +information about other performance metrics and how to use them in our +package.

+

cv_metric_AUC is the AUC for the cross-validation folds +for the training data. This gives us a sense of how well the model +performs on the training data.

+

Most of the other columns are performance metrics for the test data — +the data that wasn’t used to build the model. Here, you can see that the +AUC for the test data is not much above 0.5, suggesting that this model +does not predict much better than chance, and that the model is overfit +because the cross-validation AUC (cv_metric_AUC, measured +during training) is much higher than the testing AUC. This isn’t too +surprising since we’re using so few features with this example dataset, +so don’t be discouraged. The default option also provides a number of +other performance metrics that you might be interested in, including +area under the precision-recall curve (prAUC).

+

The last columns of results$performance are the method +and seed (if you set one) to help with combining results from multiple +runs (see vignette("parallel")).
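As a rough sketch of what combining results from multiple runs can look like (the looping code below is illustrative rather than taken from the package; vignette("parallel") covers this properly):

 # train the same model on a few different train/test splits by varying the seed
 results_list <- lapply(c(1, 2, 3), function(seed) {
   run_ml(otu_mini_bin, "glmnet", outcome_colname = "dx",
          cv_times = 5, seed = seed) # cv_times kept small so the sketch runs quickly
 })
 # pool the performance rows; the method and seed columns identify each run
 perf_df <- dplyr::bind_rows(lapply(results_list, function(x) x[["performance"]]))
 plot_model_performance(perf_df)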

+

feature_importance has information about feature +importance values if find_feature_importance = TRUE (the +default is FALSE). Since we used the defaults, there’s +nothing here:

 results$feature_importance
 #> [1] "Skipped feature importance"
@@ -256,17 +349,24 @@

The simplest way to run_ml()

Customizing parameters

-

There are a few arguments that allow you to change how you execute run_ml(). We’ve chosen reasonable defaults for you, but we encourage you to change these if you think something else would be better for your data.

+

There are a few arguments that allow you to change how you execute +run_ml(). We’ve chosen reasonable defaults for you, but we +encourage you to change these if you think something else would be +better for your data.

-

Changing kfold, cv_times, and training_frac +

Changing kfold, cv_times, and +training_frac

  • -kfold: The number of folds to run for cross-validation (default: 5).
  • +kfold: The number of folds to run for cross-validation +(default: 5).
  • -cv_times: The number of times to run repeated cross-validation (default: 100).
  • +cv_times: The number of times to run repeated +cross-validation (default: 100).
  • -training_frac: The fraction of data for the training set (default: 0.8). The rest of the data is used for testing.
  • +training_frac: The fraction of data for the training +set (default: 0.8). The rest of the data is used for testing.

Here’s an example where we change some of the default parameters:

@@ -280,17 +380,32 @@ 

Changing kfold, #> Training the model... #> Loading required package: ggplot2 #> Loading required package: lattice +#> +#> Attaching package: 'caret' +#> The following object is masked from 'package:mikropml': +#> +#> compare_models #> Warning in (function (w) : `caret::train()` issued the following warning: #> #> simpleWarning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing values in resampled performance measures. #> #> This warning usually means that the model didn't converge in some cross-validation folds because it is predicting something close to a constant. As a result, certain performance metrics can't be calculated. This suggests that some of the hyperparameters chosen are doing very poorly. #> Training complete.

-

You might have noticed that this one ran faster — that’s because we reduced kfold and cv_times. This is okay for testing things out and may even be necessary for smaller datasets. But in general it may be better to have larger numbers for these parameters; we think the defaults are a good starting point (Topçuoğlu et al. 2020).

+

You might have noticed that this one ran faster — that’s because we +reduced kfold and cv_times. This is okay for +testing things out and may even be necessary for smaller datasets. But +in general it may be better to have larger numbers for these parameters; +we think the defaults are a good starting point (Topçuoğlu et al. 2020).

Custom training indices

-

When training_frac is a fraction between 0 and 1, a random sample of observations in the dataset are chosen for the training set to satisfy the training_frac. However, in some cases you might wish to control exactly which observations are in the training set. You can instead assign training_frac a vector of indices that correspond to which rows of the dataset should go in the training set (all remaining sequences will go in the testing set).

+

When training_frac is a fraction between 0 and 1, a +random sample of observations in the dataset are chosen for the training +set to satisfy the training_frac. However, in some cases +you might wish to control exactly which observations are in the training +set. You can instead assign training_frac a vector of +indices that correspond to which rows of the dataset should go in the +training set (all remaining sequences will go in the testing set).

 n_obs <- otu_mini_bin %>% nrow()
 training_size <- 0.8 * n_obs
@@ -312,10 +427,20 @@ 

Custom training indices

Changing the performance metric

-

There are two arguments that allow you to change what performance metric to use for model evaluation, and what performance metrics to calculate using the test data.

-

perf_metric_function is the function used to calculate the performance metrics.

-

The default for classification is caret::multiClassSummary() and the default for regression is caret::defaultSummary(). We’d suggest not changing this unless you really know what you’re doing.

-

perf_metric_name is the column name from the output of perf_metric_function. We chose reasonable defaults (AUC for binary, logLoss for multiclass, and RMSE for continuous), but the default functions calculate a bunch of different performance metrics, so you can choose a different one if you’d like.

+

There are two arguments that allow you to change what performance +metric to use for model evaluation, and what performance metrics to +calculate using the test data.

+

perf_metric_function is the function used to calculate +the performance metrics.

+

The default for classification is +caret::multiClassSummary() and the default for regression +is caret::defaultSummary(). We’d suggest not changing this +unless you really know what you’re doing.

+

perf_metric_name is the column name from the output of +perf_metric_function. We chose reasonable defaults (AUC for +binary, logLoss for multiclass, and RMSE for continuous), but the +default functions calculate a bunch of different performance metrics, so +you can choose a different one if you’d like.

The default performance metrics available for classification are:

#>  [1] "logLoss"                "AUC"                    "prAUC"                 
 #>  [4] "Accuracy"               "Kappa"                  "Mean_F1"               
@@ -339,7 +464,8 @@ 

Changing the performance metric#> #> This warning usually means that the model didn't converge in some cross-validation folds because it is predicting something close to a constant. As a result, certain performance metrics can't be calculated. This suggests that some of the hyperparameters chosen are doing very poorly. #> Training complete.

-

You’ll see that the cross-validation metric is prAUC, instead of the default AUC:

+

You’ll see that the cross-validation metric is prAUC, instead of the +default AUC:

 results_pr$performance
 #> # A tibble: 1 × 17
@@ -353,8 +479,16 @@ 

Changing the performance metric

Using groups

-

The optional groups is a vector of groups to keep together when splitting the data into train and test sets and for cross-validation. Sometimes it’s important to split up the data based on a grouping instead of just randomly. This allows you to control for similarities within groups that you don’t want to skew your predictions (i.e. batch effects). For example, with biological data you may have samples collected from multiple hospitals, and you might like to keep observations from the same hospital in the same partition.

-

Here’s an example where we split the data into train/test sets based on groups:

+

The optional groups is a vector of groups to keep +together when splitting the data into train and test sets and for +cross-validation. Sometimes it’s important to split up the data based on +a grouping instead of just randomly. This allows you to control for +similarities within groups that you don’t want to skew your predictions +(i.e. batch effects). For example, with biological data you may have +samples collected from multiple hospitals, and you might like to keep +observations from the same hospital in the same partition.

+

Here’s an example where we split the data into train/test sets based +on groups:
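That example code chunk is unchanged by this patch, so it does not appear in the diff; roughly, it looks like the sketch below, where the grouping vector is made up to stand in for something like hospital of origin.

 # assign each observation to an illustrative group
 set.seed(2019)
 grps <- sample(LETTERS[1:8], nrow(otu_mini_bin), replace = TRUE)
 results_grp <- run_ml(otu_mini_bin,
                       "glmnet",
                       cv_times = 2,
                       training_frac = 0.8,
                       groups = grps,
                       seed = 2019)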

-

The one difference here is run_ml() will report how much of the data is in the training set if you run the above code chunk. This can be a little finicky depending on how many samples and groups you have. This is because it won’t be exactly what you specify with training_frac, since you have to include all of one group in either the training set or the test set.

+

The one difference here is run_ml() will report how much +of the data is in the training set if you run the above code chunk. This +can be a little finicky depending on how many samples and groups you +have. This is because it won’t be exactly what you specify with +training_frac, since you have to include all of one group +in either the training set or the test set.

Controling how groups are assigned to partitions

-

When you use the groups parameter as above, by default run_ml() will assume that you want all of the observations from each group to be placed in the same partition of the train/test split. This makes sense when you want to use groups to control for batch effects. However, in some cases you might prefer to control exactly which groups end up in which partition, and you might even be okay with some observations from the same group being assigned to different partitions.

-

For example, say you want groups A and B to be used for training, C and D for testing, and you don’t have a preference for what happens to the other groups. You can give the group_partitions parameter a named list to specify which groups should go in the training set and which should go in the testing set.

+

When you use the groups parameter as above, by default +run_ml() will assume that you want all of the observations +from each group to be placed in the same partition of the train/test +split. This makes sense when you want to use groups to control for batch +effects. However, in some cases you might prefer to control exactly +which groups end up in which partition, and you might even be okay with +some observations from the same group being assigned to different +partitions.

+

For example, say you want groups A and B to be used for training, C +and D for testing, and you don’t have a preference for what happens to +the other groups. You can give the group_partitions +parameter a named list to specify which groups should go in the training +set and which should go in the testing set.

 results_grp_part <- run_ml(otu_mini_bin, 
                       'glmnet', 
@@ -395,8 +545,13 @@ 

Controling how groups #> Groups will not be kept together in CV partitions because the number of groups in the training set is not larger than `kfold` #> Training the model... #> Training complete.

-

In the above case, all observations from A & B will be used for training, all from C & D will be used for testing, and the remaining groups will be randomly assigned to one or the other to satisfy the training_frac as closely as possible.

-

In another scenario, maybe you want only groups A through F to be used for training, but you also want to allow other observations not selected for training from A through F to be used for testing:

+

In the above case, all observations from A & B will be used for +training, all from C & D will be used for testing, and the remaining +groups will be randomly assigned to one or the other to satisfy the +training_frac as closely as possible.

+

In another scenario, maybe you want only groups A through F to be +used for training, but you also want to allow other observations not +selected for training from A through F to be used for testing:

 results_grp_trainA <- run_ml(otu_mini_bin, 
                       'glmnet', 
@@ -415,14 +570,24 @@ 

Controling how groups #> Groups will be kept together in CV partitions #> Training the model... #> Training complete.

-

If you need even more control than this, take a look at setting custom training indices. You might also prefer to provide your own train control scheme with the cross_val parameter in run_ml().

+

If you need even more control than this, take a look at setting custom training indices. You +might also prefer to provide your own train control scheme with the +cross_val parameter in run_ml().

Finding feature importance

-

To find which features are contributing to predictive power, you can use find_feature_importance = TRUE. How we use permutation importance to determine feature importance is described in (Topçuoğlu et al. 2020). Briefly, it permutes each of the features individually (or correlated ones together) and evaluates how much the performance metric decreases. The more performance decreases when the feature is randomly shuffled, the more important that feature is. The default is FALSE because it takes a while to run and is only useful if you want to know what features are important in predicting your outcome.

+

To find which features are contributing to predictive power, you can +use find_feature_importance = TRUE. How we use permutation +importance to determine feature importance is described in (Topçuoğlu et al. 2020). Briefly, it permutes +each of the features individually (or correlated ones together) and +evaluates how much the performance metric decreases. The more +performance decreases when the feature is randomly shuffled, the more +important that feature is. The default is FALSE because it +takes a while to run and is only useful if you want to know what +features are important in predicting your outcome.

Let’s look at some feature importance results:

 results_imp <- run_ml(otu_mini_bin,
@@ -459,11 +624,16 @@ 

Finding feature importanceThere are several columns:

  1. -perf_metric: The performance value of the permuted feature.
  2. +perf_metric: The performance value of the permuted +feature.
  3. -perf_metric_diff: The difference between the performance for the actual and permuted data (i.e. test performance minus permuted performance). Features with a larger perf_metric_diff are more important.
  4. +perf_metric_diff: The difference between the +performance for the actual and permuted data (i.e. test performance +minus permuted performance). Features with a larger +perf_metric_diff are more important.
  5. -pvalue: the probability of obtaining the actual performance value under the null hypothesis.
  6. +pvalue: the probability of obtaining the actual +performance value under the null hypothesis.
  7. names: The feature that was permuted.
  8. @@ -473,8 +643,15 @@

    Finding feature importance seed: The seed (if set).

-

As you can see here, the differences are negligible (close to zero), which makes sense since our model isn’t great. If you’re interested in feature importance, it’s especially useful to run multiple different train/test splits, as shown in our example snakemake workflow.

-

You can also choose to permute correlated features together using corr_thresh (default: 1). Any features that are above the correlation threshold are permuted together; i.e. perfectly correlated features are permuted together when using the default value.

+

As you can see here, the differences are negligible (close to zero), +which makes sense since our model isn’t great. If you’re interested in +feature importance, it’s especially useful to run multiple different +train/test splits, as shown in our example +snakemake workflow.

+

You can also choose to permute correlated features together using +corr_thresh (default: 1). Any features that are above the +correlation threshold are permuted together; i.e. perfectly correlated +features are permuted together when using the default value.

 results_imp_corr <- run_ml(otu_mini_bin,
                            'glmnet',
@@ -505,19 +682,32 @@ 

Finding feature importance#> 1 glmnet AUC 2019 #> 2 glmnet AUC 2019 #> 3 glmnet AUC 2019

-

You can see which features were permuted together in the names column. Here all 3 features were permuted together (which doesn’t really make sense, but it’s just an example).

-

If you previously executed run_ml() without feature importance but now wish to find feature importance after the fact, see the example code in the get_feature_importance() documentation.

-

get_feature_importance() can show a live progress bar, see vignette("parallel") for examples.

+

You can see which features were permuted together in the +names column. Here all 3 features were permuted together +(which doesn’t really make sense, but it’s just an example).

+

If you previously executed run_ml() without feature +importance but now wish to find feature importance after the fact, see +the example code in the get_feature_importance() +documentation.

+

get_feature_importance() can show a live progress bar, +see vignette("parallel") for examples.
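A minimal sketch of enabling that progress bar, assuming the progressr-based reporting described in vignette("parallel") (the handler setup is an assumption, not part of this diff):

 # turn on a global progress handler so the permutation step reports progress
 progressr::handlers(global = TRUE)
 results_imp <- run_ml(otu_mini_bin,
                       "glmnet",
                       find_feature_importance = TRUE,
                       seed = 2019)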

-

Tuning hyperparameters (using the hyperparameter argument) +

Tuning hyperparameters (using the hyperparameter +argument)

-

This is important, so we have a whole vignette about them. The bottom line is we provide default hyperparameters that you can start with, but it’s important to tune your hyperparameters. For more information about what the default hyperparameters are, and how to tune hyperparameters, see vignette("tuning").

+

This is important, so we have a whole vignette about them. The bottom +line is we provide default hyperparameters that you can start with, but +it’s important to tune your hyperparameters. For more information about +what the default hyperparameters are, and how to tune hyperparameters, +see vignette("tuning").

Other models

-

Here are examples of how to train and evaluate other models. The output for all of them is very similar, so we won’t go into those details.

+

Here are examples of how to train and evaluate other models. The +output for all of them is very similar, so we won’t go into those +details.

Random forest

@@ -526,7 +716,11 @@

Random forest'rf', cv_times = 5, seed = 2019)

-

You can also change the number of trees to use for random forest (ntree; default: 1000). This can’t be tuned using rf package implementation of random forest. Please refer to caret documentation if you are interested in other packages with random forest implementations.

+

You can also change the number of trees to use for random forest +(ntree; default: 1000). This can’t be tuned using +rf package implementation of random forest. Please refer to +caret documentation if you are interested in other packages +with random forest implementations.

 results_rf_nt <- run_ml(otu_mini_bin,
                         'rf',
@@ -551,7 +745,8 @@ 

SVM 'svmRadial', cv_times = 5, seed = 2019)

-

If you get a message “maximum number of iterations reached”, see this issue in caret.

+

If you get a message “maximum number of iterations reached”, see this issue in +caret.

@@ -560,7 +755,8 @@

Other data

Multiclass data

-

We provide otu_mini_multi with a multiclass outcome (three or more outcomes):

+

We provide otu_mini_multi with a multiclass outcome +(three or more outcomes):

 otu_mini_multi %>% dplyr::pull('dx') %>% unique()
 #> [1] "adenoma"   "carcinoma" "normal"
@@ -570,7 +766,8 @@

Multiclass data= "dx", seed = 2019 )

-

The performance metrics are slightly different, but the format of everything else is the same:

+

The performance metrics are slightly different, but the format of +everything else is the same:

 results_multi$performance
 #> # A tibble: 1 × 17
@@ -585,13 +782,15 @@ 

Multiclass data

Continuous data

-

And here’s an example for running continuous data, where the outcome column is numerical:

+

And here’s an example for running continuous data, where the outcome +column is numerical:

 results_cont <- run_ml(otu_mini_bin[, 2:11],
                        'glmnet',
                        outcome_colname = 'Otu00001',
                        seed = 2019)
-

Again, the performance metrics are slightly different, but the format of the rest is the same:

+

Again, the performance metrics are slightly different, but the format +of the rest is the same:

 results_cont$performance
 #> # A tibble: 1 × 6
@@ -601,14 +800,22 @@ 

Continuous data

-

References +

References

-
-
-

Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR Analyses with FIDDLE: A Flexible Data-Driven Preprocessing Pipeline for Structured Clinical Data.” J Am Med Inform Assoc, October. https://doi.org/10.1093/jamia/ocaa139.

-
-
-

Topçuoğlu, Begüm D., Nicholas A. Lesniak, Mack T. Ruffin, Jenna Wiens, and Patrick D. Schloss. 2020. “A Framework for Effective Application of Machine Learning to Microbiome-Based Classification Problems.” mBio 11 (3). https://doi.org/10.1128/mBio.00434-20.

+
+
+Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael +W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR +Analyses with FIDDLE: A Flexible Data-Driven Preprocessing +Pipeline for Structured Clinical Data.” J Am Med Inform +Assoc, October. https://doi.org/10.1093/jamia/ocaa139. +
+
+Topçuoğlu, Begüm D., Nicholas A. Lesniak, Mack T. Ruffin, Jenna Wiens, +and Patrick D. Schloss. 2020. “A Framework for +Effective Application of Machine Learning to +Microbiome-Based Classification +Problems.” mBio 11 (3). https://doi.org/10.1128/mBio.00434-20.
diff --git a/docs/articles/paper.html b/docs/articles/paper.html index 4f4af106..22fa9401 100644 --- a/docs/articles/paper.html +++ b/docs/articles/paper.html @@ -40,7 +40,7 @@ mikropml - 1.2.2.9000 + 1.3.0
@@ -100,11 +100,14 @@ -
+

Statement of need

-

Most applications of machine learning (ML) require reproducible steps for data pre-processing, cross-validation, testing, model evaluation, and often interpretation of why the model makes particular predictions. Performing these steps is important, as failure to implement them can result in incorrect and misleading results (Teschendorff 2019; Wiens et al. 2019).

-

Supervised ML is widely used to recognize patterns in large datasets and to make predictions about outcomes of interest. Several packages including caret (Kuhn 2008) and tidymodels (Kuhn, Wickham, and RStudio 2020) in R, scikitlearn (Pedregosa et al. 2011) in Python, and the H2O autoML platform (H2O.ai 2020) allow scientists to train ML models with a variety of algorithms. While these packages provide the tools necessary for each ML step, they do not implement a complete ML pipeline according to good practices in the literature. This makes it difficult for practitioners new to ML to easily begin to perform ML analyses.

-

To enable a broader range of researchers to apply ML to their problem domains, we created mikropml, an easy-to-use R package (R Core Team 2020) that implements the ML pipeline created by Topçuoğlu et al. (Topçuoğlu et al. 2020) in a single function that returns a trained model, model performance metrics and feature importance. mikropml leverages the caret package to support several ML algorithms: linear regression, logistic regression, support vector machines with a radial basis kernel, decision trees, random forest, and gradient boosted trees. It incorporates good practices in ML training, testing, and model evaluation (Topçuoğlu et al. 2020; Teschendorff 2019). Furthermore, it provides data preprocessing steps based on the FIDDLE (FlexIble Data-Driven pipeLinE) framework outlined in Tang et al. (Tang et al. 2020) and post-training permutation importance steps to estimate the importance of each feature in the models trained (Breiman 2001; Fisher, Rudin, and Dominici 2018).

-

mikropml can be used as a starting point in the application of ML to datasets from many different fields. It has already been applied to microbiome data to categorize patients with colorectal cancer (Topçuoğlu et al. 2020), to identify differences in genomic and clinical features associated with bacterial infections (Lapp et al. 2020), and to predict gender-based biases in academic publishing (Hagan et al. 2020).

+

Most applications of machine learning (ML) require reproducible steps +for data pre-processing, cross-validation, testing, model evaluation, +and often interpretation of why the model makes particular predictions. +Performing these steps is important, as failure to implement them can +result in incorrect and misleading results (Teschendorff 2019; Wiens et al. 2019).

+

Supervised ML is widely used to recognize patterns in large datasets +and to make predictions about outcomes of interest. Several packages +including caret (Kuhn 2008) +and tidymodels (Kuhn, Wickham, and +RStudio 2020) in R, scikitlearn (Pedregosa et al. 2011) in Python, and the H2O +autoML platform (H2O.ai 2020) +allow scientists to train ML models with a variety of algorithms. While +these packages provide the tools necessary for each ML step, they do not +implement a complete ML pipeline according to good practices in the +literature. This makes it difficult for practitioners new to ML to +easily begin to perform ML analyses.

+

To enable a broader range of researchers to apply ML to their problem +domains, we created mikropml, +an easy-to-use R package (R Core Team +2020) that implements the ML pipeline created by Topçuoğlu et +al. (Topçuoğlu et al. 2020) in a +single function that returns a trained model, model performance metrics +and feature importance. mikropml leverages the +caret package to support several ML algorithms: linear +regression, logistic regression, support vector machines with a radial +basis kernel, decision trees, random forest, and gradient boosted trees. +It incorporates good practices in ML training, testing, and model +evaluation (Topçuoğlu et al. 2020; Teschendorff +2019). Furthermore, it provides data preprocessing steps based on +the FIDDLE (FlexIble Data-Driven pipeLinE) framework outlined in Tang +et al. (Tang et al. 2020) and +post-training permutation importance steps to estimate the importance of +each feature in the models trained (Breiman 2001; +Fisher, Rudin, and Dominici 2018).

+

mikropml can be used as a starting point in the +application of ML to datasets from many different fields. It has already +been applied to microbiome data to categorize patients with colorectal +cancer (Topçuoğlu et al. 2020), to +identify differences in genomic and clinical features associated with +bacterial infections (Lapp et al. 2020), +and to predict gender-based biases in academic publishing (Hagan et al. 2020).

mikropml package

-

The mikropml package includes functionality to preprocess the data, train ML models, evaluate model performance, and quantify feature importance (Figure 1). We also provide vignettes and an example Snakemake workflow (Köster and Rahmann 2012) to showcase how to run an ideal ML pipeline with multiple different train/test data splits. The results can be visualized using helper functions that use ggplot2 (Wickham 2016).

-

While mikropml allows users to get started quickly and facilitates reproducibility, it is not a replacement for understanding the ML workflow which is still necessary when interpreting results (Pollard et al. 2019). To facilitate understanding and enable one to tailor the code to their application, we have heavily commented the code and have provided supporting documentation which can be read online.

+

The mikropml package includes functionality to +preprocess the data, train ML models, evaluate model performance, and +quantify feature importance (Figure 1). We also provide vignettes +and an example +Snakemake workflow (Köster and Rahmann +2012) to showcase how to run an ideal ML pipeline with multiple +different train/test data splits. The results can be visualized using +helper functions that use ggplot2 (Wickham 2016).

+

While mikropml allows users to get started quickly and facilitates +reproducibility, it is not a replacement for understanding the ML +workflow which is still necessary when interpreting results (Pollard et al. 2019). To facilitate +understanding and enable one to tailor the code to their application, we +have heavily commented the code and have provided supporting +documentation which can be read online.

Preprocessing data

-

We provide the function preprocess_data() to preprocess features using several different functions from the caret package. preprocess_data() takes continuous and categorical data, re-factors categorical data into binary features, and provides options to normalize continuous data, remove features with near-zero variance, and keep only one instance of perfectly correlated features. We set the default options based on those implemented in FIDDLE (Tang et al. 2020). More details on how to use preprocess_data() can be found in the accompanying vignette.

+

We provide the function preprocess_data() to preprocess +features using several different functions from the caret +package. preprocess_data() takes continuous and categorical +data, re-factors categorical data into binary features, and provides +options to normalize continuous data, remove features with near-zero +variance, and keep only one instance of perfectly correlated features. +We set the default options based on those implemented in FIDDLE (Tang et al. 2020). More details on how to use +preprocess_data() can be found in the accompanying vignette.
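As a brief illustration of that interface, a sketch using the bundled otu_small dataset with all preprocessing options left at their defaults:

 library(mikropml)
 # returns a list: the transformed data plus records of grouped and removed features
 prep <- preprocess_data(dataset = otu_small, outcome_colname = "dx")
 dat_ready <- prep$dat_transformed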

Running ML

-

The main function in mikropml, run_ml(), minimally takes in the model choice and a data frame with an outcome column and feature columns. For model choice, mikropml currently supports logistic and linear regression (glmnet: Friedman, Hastie, and Tibshirani 2010), support vector machines with a radial basis kernel (kernlab: Karatzoglou et al. 2004), decision trees (rpart: Therneau et al. 2019), random forest (randomForest: Liaw and Wiener 2002), and gradient-boosted trees (xgboost: Chen et al. 2020). run_ml() randomly splits the data into train and test sets while maintaining the distribution of the outcomes found in the full dataset. It also provides the option to split the data into train and test sets based on categorical variables (e.g. batch, geographic location, etc.). mikropml uses the caret package (Kuhn 2008) to train and evaluate the models, and optionally quantifies feature importance. The output includes the best model built based on tuning hyperparameters in an internal and repeated cross-validation step, model evaluation metrics, and optional feature importances. Feature importances are calculated using a permutation test, which breaks the relationship between the feature and the true outcome in the test data, and measures the change in model performance. This provides an intuitive metric of how individual features influence model performance and is comparable across model types, which is particularly useful for model interpretation (Topçuoğlu et al. 2020). Our introductory vignette contains a comprehensive tutorial on how to use run_ml().

+

The main function in mikropml, run_ml(), minimally takes +in the model choice and a data frame with an outcome column and feature +columns. For model choice, mikropml currently supports +logistic and linear regression (glmnet: Friedman, Hastie, and Tibshirani +2010), support vector machines with a radial basis kernel (kernlab: Karatzoglou et al. 2004), +decision trees (rpart: Therneau et +al. 2019), random forest (randomForest: Liaw and Wiener +2002), and gradient-boosted trees (xgboost: Chen et al. 2020). +run_ml() randomly splits the data into train and test sets +while maintaining the distribution of the outcomes found in the full +dataset. It also provides the option to split the data into train and +test sets based on categorical variables (e.g. batch, geographic +location, etc.). mikropml uses the caret +package (Kuhn 2008) to train and evaluate +the models, and optionally quantifies feature importance. The output +includes the best model built based on tuning hyperparameters in an +internal and repeated cross-validation step, model evaluation metrics, +and optional feature importances. Feature importances are calculated +using a permutation test, which breaks the relationship between the +feature and the true outcome in the test data, and measures the change +in model performance. This provides an intuitive metric of how +individual features influence model performance and is comparable across +model types, which is particularly useful for model interpretation (Topçuoğlu et al. 2020). Our introductory +vignette contains a comprehensive tutorial on how to use +run_ml().
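A minimal call illustrating the interface described above (the dataset and options here are a sketch, not a recommendation):

 library(mikropml)
 # train and evaluate a regularized logistic regression model
 ml_results <- run_ml(otu_small,
                      method = "glmnet",
                      outcome_colname = "dx",
                      find_feature_importance = TRUE,
                      seed = 2019)
 ml_results$performance
 ml_results$feature_importance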

mikropml pipeline

-

Ideal workflow for running mikropml with many different train/test splits +

Ideal workflow for running mikropml with many different train/test +splits

-

To investigate the variation in model performance depending on the train and test set used (Topçuoğlu et al. 2020; Lapp et al. 2020), we provide examples of how to run_ml() many times with different train/test splits and how to get summary information about model performance on a local computer or on a high-performance computing cluster using a Snakemake workflow.

+

To investigate the variation in model performance depending on the +train and test set used (Topçuoğlu et al. 2020; +Lapp et al. 2020), we provide examples of how to +run_ml() many times with different train/test splits and +how to get summary information about model performance on a local +computer or on a high-performance computing cluster using a Snakemake +workflow.

Tuning & visualization

-

One particularly important aspect of ML is hyperparameter tuning. We provide a reasonable range of default hyperparameters for each model type. However practitioners should explore whether that range is appropriate for their data, or if they should customize the hyperparameter range. Therefore, we provide a function plot_hp_performance() to plot the cross-validation performance metric of a single model or models built using different train/test splits. This helps evaluate if the hyperparameter range is being searched exhaustively and allows the user to pick the ideal set. We also provide summary plots of test performance metrics for the many train/test splits with different models using plot_model_performance(). Examples are described in the accompanying vignette on hyperparameter tuning.

+

One particularly important aspect of ML is hyperparameter tuning. We +provide a reasonable range of default hyperparameters for each model +type. However practitioners should explore whether that range is +appropriate for their data, or if they should customize the +hyperparameter range. Therefore, we provide a function +plot_hp_performance() to plot the cross-validation +performance metric of a single model or models built using different +train/test splits. This helps evaluate if the hyperparameter range is +being searched exhaustively and allows the user to pick the ideal set. +We also provide summary plots of test performance metrics for the many +train/test splits with different models using +plot_model_performance(). Examples are described in the +accompanying vignette +on hyperparameter tuning.
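A rough sketch of that workflow; the get_hp_performance() return structure (a list with dat, params, and metric) and the bare-name arguments to plot_hp_performance() are assumptions based on the tuning vignette:

 res <- run_ml(otu_small, "glmnet", outcome_colname = "dx", seed = 2019)
 # per-hyperparameter cross-validation results for the trained model
 hp_metrics <- get_hp_performance(res$trained_model)
 plot_hp_performance(hp_metrics$dat, lambda, AUC)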

Dependencies

-

mikropml is written in R (R Core Team 2020) and depends on several packages: dplyr (Wickham et al. 2020), rlang (Henry, Wickham, and RStudio 2020) and caret (Kuhn 2008). The ML algorithms supported by mikropml require: glmnet (Friedman, Hastie, and Tibshirani 2010), e1071 (Meyer et al. 2020), and MLmetrics (Yan 2016) for logistic regression, rpart2 (Therneau et al. 2019) for decision trees, randomForest (Liaw and Wiener 2002) for random forest, xgboost (Chen et al. 2020) for xgboost, and kernlab (Karatzoglou et al. 2004) for support vector machines. We also allow for parallelization of cross-validation and other steps using the foreach, doFuture, future.apply, and future packages (Bengtsson and Team 2020). Finally, we use ggplot2 for plotting (Wickham 2016).

+

mikropml is written in R (R Core Team +2020) and depends on several packages: dplyr (Wickham et al. 2020), rlang (Henry, Wickham, and RStudio 2020) and +caret (Kuhn 2008). The ML +algorithms supported by mikropml require: +glmnet (Friedman, Hastie, and +Tibshirani 2010), e1071 (Meyer et al. 2020), and MLmetrics +(Yan 2016) for logistic regression, +rpart2 (Therneau et al. 2019) +for decision trees, randomForest (Liaw and Wiener 2002) for random forest, +xgboost (Chen et al. 2020) +for xgboost, and kernlab (Karatzoglou et al. 2004) for support vector +machines. We also allow for parallelization of cross-validation and +other steps using the foreach, doFuture, +future.apply, and future packages (Bengtsson and Team 2020). Finally, we use +ggplot2 for plotting (Wickham +2016).
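For the parallelization mentioned here, a minimal sketch of registering a future backend before calling run_ml(); the plan and worker count are illustrative (see vignette("parallel") for the supported setup):

 # register a parallel backend so the foreach-based steps can use multiple workers
 doFuture::registerDoFuture()
 future::plan(future::multisession, workers = 2)
 results <- run_ml(otu_mini_bin, "glmnet", seed = 2019)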

Acknowledgments

-

We thank members of the Schloss Lab who participated in code clubs related to the initial development of the pipeline, made documentation improvements, and provided general feedback. We also thank Nick Lesniak for designing the mikropml logo.

-

We thank the US Research Software Sustainability Institute (NSF #1743188) for providing training to KLS at the Winter School in Research Software Engineering.

+

We thank members of the Schloss Lab who participated in code clubs +related to the initial development of the pipeline, made documentation +improvements, and provided general feedback. We also thank Nick Lesniak +for designing the mikropml logo.

+

We thank the US Research Software Sustainability Institute (NSF +#1743188) for providing training to KLS at the Winter School in Research +Software Engineering.

Funding

-

Salary support for PDS came from NIH grant 1R01CA215574. KLS received support from the NIH Training Program in Bioinformatics (T32 GM070449). ZL received support from the National Science Foundation Graduate Research Fellowship Program under Grant No. DGE 1256260. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the National Science Foundation.

+

Salary support for PDS came from NIH grant 1R01CA215574. KLS received +support from the NIH Training Program in Bioinformatics (T32 GM070449). +ZL received support from the National Science Foundation Graduate +Research Fellowship Program under Grant No. DGE 1256260. Any opinions, +findings, and conclusions or recommendations expressed in this material +are those of the authors and do not necessarily reflect the views of the +National Science Foundation.

Author contributions

-

BDT, ZL, and KLS contributed equally. Author order among the co-first authors was determined by time since joining the project.

-

BDT, ZL, and KLS conceptualized the study and wrote the code. KLS structured the code in R package form. BDT, ZL, JW, and PDS developed methodology. PDS, ES, and JW supervised the project. BDT, ZL, and KLS wrote the original draft. All authors reviewed and edited the manuscript.

+

BDT, ZL, and KLS contributed equally. Author order among the co-first +authors was determined by time since joining the project.

+

BDT, ZL, and KLS conceptualized the study and wrote the code. KLS +structured the code in R package form. BDT, ZL, JW, and PDS developed +methodology. PDS, ES, and JW supervised the project. BDT, ZL, and KLS +wrote the original draft. All authors reviewed and edited the +manuscript.

Conflicts of interest @@ -186,90 +327,156 @@

Conflicts of interestNone.

-

References +

References

-
-
-

Bengtsson, Henrik, and R Core Team. 2020. “Future.Apply: Apply Function to Elements in Parallel Using Futures,” July.

-
-
-

Breiman, Leo. 2001. “Random Forests.” Machine Learning 45 (1): 5–32. https://doi.org/10.1023/A:1010933404324.

-
-
-

Chen, Tianqi, Tong He, Michael Benesty, Vadim Khotilovich, Yuan Tang, Hyunsu Cho, Kailong Chen, et al. 2020. “Xgboost: Extreme Gradient Boosting,” June.

-
-
-

Fisher, Aaron, Cynthia Rudin, and Francesca Dominici. 2018. “All Models Are Wrong, but Many Are Useful: Learning a Variable’s Importance by Studying an Entire Class of Prediction Models Simultaneously.”

-
-
-

Friedman, Jerome H., Trevor Hastie, and Rob Tibshirani. 2010. “Regularization Paths for Generalized Linear Models via Coordinate Descent.” Journal of Statistical Software 33 (1): 1–22. https://doi.org/10.18637/jss.v033.i01.

-
-
-

H2O.ai. 2020. H2O: Scalable Machine Learning Platform. Manual.

-
-
-

Hagan, Ada K., Begüm D. Topçuoğlu, Mia E. Gregory, Hazel A. Barton, and Patrick D. Schloss. 2020. “Women Are Underrepresented and Receive Differential Outcomes at ASM Journals: A Six-Year Retrospective Analysis.” mBio 11 (6). https://doi.org/10.1128/mBio.01680-20.

-
-
-

Henry, Lionel, Hadley Wickham, and RStudio. 2020. “Rlang: Functions for Base Types and Core R and ’Tidyverse’ Features,” July.

-
-
-

Karatzoglou, Alexandros, Alexandros Smola, Kurt Hornik, and Achim Zeileis. 2004. “Kernlab - an S4 Package for Kernel Methods in R.” Journal of Statistical Software 11 (1): 1–20. https://doi.org/10.18637/jss.v011.i09.

-
-
-

Köster, Johannes, and Sven Rahmann. 2012. “Snakemake - a Scalable Bioinformatics Workflow Engine.” Bioinformatics 28 (19): 2520–2. https://doi.org/10.1093/bioinformatics/bts480.

-
-
-

Kuhn, Max. 2008. “Building Predictive Models in R Using the Caret Package.” Journal of Statistical Software 28 (1): 1–26. https://doi.org/10.18637/jss.v028.i05.

-
-
-

Kuhn, Max, Hadley Wickham, and RStudio. 2020. “Tidymodels: Easily Install and Load the ’Tidymodels’ Packages,” July.

-
-
-

Lapp, Zena, Jennifer Han, Jenna Wiens, Ellie JC Goldstein, Ebbing Lautenbach, and Evan Snitkin. 2020. “Machine Learning Models to Identify Patient and Microbial Genetic Factors Associated with Carbapenem-Resistant Klebsiella Pneumoniae Infection.” medRxiv, July, 2020.07.06.20147306. https://doi.org/10.1101/2020.07.06.20147306.

-
-
-

Liaw, Andy, and Matthew Wiener. 2002. “Classification and Regression by randomForest” 2: 5.

-
-
-

Meyer, David, Evgenia Dimitriadou, Kurt Hornik, Andreas Weingessel, Friedrich Leisch, Chih-Chung Chang (libsvm C++-code), and Chih-Chen Lin (libsvm C++-code). 2020. “E1071: Misc Functions of the Department of Statistics, Probability Theory Group (Formerly: E1071), TU Wien.”

-
-
-

Pedregosa, Fabian, Gaël Varoquaux, Alexandre Gramfort, Vincent Michel, Bertrand Thirion, Olivier Grisel, Mathieu Blondel, et al. 2011. “Scikit-Learn: Machine Learning in Python.” Journal of Machine Learning Research 12 (85): 2825–30.

-
-
-

Pollard, Tom J., Irene Chen, Jenna Wiens, Steven Horng, Danny Wong, Marzyeh Ghassemi, Heather Mattie, Emily Lindemer, and Trishan Panch. 2019. “Turning the Crank for Machine Learning: Ease, at What Expense?” The Lancet Digital Health 1 (5): e198–e199. https://doi.org/10.1016/S2589-7500(19)30112-8.

-
-
-

R Core Team. 2020. “R: A Language and Environment for Statistical Computing.”

-
-
-

Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR Analyses with FIDDLE: A Flexible Data-Driven Preprocessing Pipeline for Structured Clinical Data.” J Am Med Inform Assoc, October. https://doi.org/10.1093/jamia/ocaa139.

-
-
-

Teschendorff, Andrew E. 2019. “Avoiding Common Pitfalls in Machine Learning Omic Data Science.” Nature Materials 18 (5): 422–27. https://doi.org/10.1038/s41563-018-0241-z.

-
-
-

Therneau, Terry, Beth Atkinson, Brian Ripley (producer of the initial R. port, and maintainer 1999-2017). 2019. “Rpart: Recursive Partitioning and Regression Trees,” April.

-
-
-

Topçuoğlu, Begüm D., Nicholas A. Lesniak, Mack T. Ruffin, Jenna Wiens, and Patrick D. Schloss. 2020. “A Framework for Effective Application of Machine Learning to Microbiome-Based Classification Problems.” mBio 11 (3). https://doi.org/10.1128/mBio.00434-20.

-
-
-

Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for Data Analysis. Use R! Cham: Springer International Publishing. https://doi.org/10.1007/978-3-319-24277-4.

-
-
-

Wickham, Hadley, Romain François, Lionel Henry, Kirill Müller, and RStudio. 2020. “Dplyr: A Grammar of Data Manipulation,” August.

-
-
-

Wiens, Jenna, Suchi Saria, Mark Sendak, Marzyeh Ghassemi, Vincent X. Liu, Finale Doshi-Velez, Kenneth Jung, et al. 2019. “Do No Harm: A Roadmap for Responsible Machine Learning for Health Care.” Nat. Med. 25 (9): 1337–40. https://doi.org/10.1038/s41591-019-0548-6.

-
-
-

Yan, Yachen. 2016. “MLmetrics: Machine Learning Evaluation Metrics.”

-
-
-
-
+
+
+Bengtsson, Henrik, and R Core Team. 2020. “Future.apply: +Apply Function to Elements in +Parallel Using Futures,” July. +
+
+Breiman, Leo. 2001. “Random Forests.” Machine +Learning 45 (1): 5–32. https://doi.org/10.1023/A:1010933404324. +
+
+Chen, Tianqi, Tong He, Michael Benesty, Vadim Khotilovich, Yuan Tang, +Hyunsu Cho, Kailong Chen, et al. 2020. “Xgboost: Extreme +Gradient Boosting,” June. +
+
+Fisher, Aaron, Cynthia Rudin, and Francesca Dominici. 2018. “All +Models Are Wrong, but Many Are Useful: Learning a +Variable’s Importance by Studying an Entire Class of Prediction Models +Simultaneously.” +
+
+Friedman, Jerome H., Trevor Hastie, and Rob Tibshirani. 2010. +“Regularization Paths for Generalized Linear +Models via Coordinate Descent.” Journal +of Statistical Software 33 (1): 1–22. https://doi.org/10.18637/jss.v033.i01. +
+
+H2O.ai. 2020. H2o: Scalable Machine +Learning Platform. Manual. +
+
+Hagan, Ada K., Begüm D. Topçuoğlu, Mia E. Gregory, Hazel A. Barton, and +Patrick D. Schloss. 2020. “Women Are Underrepresented +and Receive Differential Outcomes at ASM +Journals: A Six-Year Retrospective +Analysis.” mBio 11 (6). https://doi.org/10.1128/mBio.01680-20. +
+
+Henry, Lionel, Hadley Wickham, and RStudio. 2020. “Rlang: +Functions for Base Types and Core +R and ’TidyverseFeatures,” +July. +
+
+Karatzoglou, Alexandros, Alexandros Smola, Kurt Hornik, and Achim +Zeileis. 2004. “Kernlab - An S4 Package for +Kernel Methods in R.” Journal of +Statistical Software 11 (1): 1–20. https://doi.org/10.18637/jss.v011.i09. +
+
+Köster, Johannes, and Sven Rahmann. 2012. “Snakemake - a Scalable Bioinformatics Workflow Engine.” Bioinformatics 28 (19): 2520–22. https://doi.org/10.1093/bioinformatics/bts480.
+
+Kuhn, Max. 2008. “Building Predictive Models in +R Using the Caret Package.” Journal +of Statistical Software 28 (1): 1–26. https://doi.org/10.18637/jss.v028.i05. +
+
+Kuhn, Max, Hadley Wickham, and RStudio. 2020. “Tidymodels: +Easily Install and Load the +’TidymodelsPackages,” July. +
+
+Lapp, Zena, Jennifer Han, Jenna Wiens, Ellie JC Goldstein, Ebbing +Lautenbach, and Evan Snitkin. 2020. “Machine Learning Models to +Identify Patient and Microbial Genetic Factors Associated with +Carbapenem-Resistant Klebsiella Pneumoniae +Infection.” medRxiv, July, 2020.07.06.20147306. https://doi.org/10.1101/2020.07.06.20147306. +
+
+Liaw, Andy, and Matthew Wiener. 2002. “Classification and +Regression by randomForest 2: 5. +
+
+Meyer, David, Evgenia Dimitriadou, Kurt Hornik, Andreas Weingessel, +Friedrich Leisch, Chih-Chung Chang (libsvm C++-code), and Chih-Chen Lin +(libsvm C++-code). 2020. “E1071: Misc Functions of +the Department of Statistics, +Probability Theory Group (Formerly: +E1071), TU Wien.” +
+
+Pedregosa, Fabian, Gaël Varoquaux, Alexandre Gramfort, Vincent Michel, +Bertrand Thirion, Olivier Grisel, Mathieu Blondel, et al. 2011. +“Scikit-Learn: Machine Learning in +Python.” Journal of Machine Learning +Research 12 (85): 2825–30. +
+
+Pollard, Tom J., Irene Chen, Jenna Wiens, Steven Horng, Danny Wong, +Marzyeh Ghassemi, Heather Mattie, Emily Lindemer, and Trishan Panch. +2019. “Turning the Crank for Machine Learning: Ease, at What +Expense?” The Lancet Digital Health 1 (5): e198–99. https://doi.org/10.1016/S2589-7500(19)30112-8. +
+
+R Core Team. 2020. “R: A Language and +Environment for Statistical Computing.” +
+
+Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael +W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR +Analyses with FIDDLE: A Flexible Data-Driven Preprocessing +Pipeline for Structured Clinical Data.” J Am Med Inform +Assoc, October. https://doi.org/10.1093/jamia/ocaa139. +
+
+Teschendorff, Andrew E. 2019. “Avoiding Common Pitfalls in Machine +Learning Omic Data Science.” Nature Materials 18 (5): +422–27. https://doi.org/10.1038/s41563-018-0241-z. +
+
+Therneau, Terry, Beth Atkinson, Brian Ripley (producer of the initial R. +port, and maintainer 1999-2017). 2019. “Rpart: Recursive +Partitioning and Regression Trees,” April. +
+
+Topçuoğlu, Begüm D., Nicholas A. Lesniak, Mack T. Ruffin, Jenna Wiens, +and Patrick D. Schloss. 2020. “A Framework for +Effective Application of Machine Learning to +Microbiome-Based Classification +Problems.” mBio 11 (3). https://doi.org/10.1128/mBio.00434-20. +
+
+Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for +Data Analysis. Use R! Cham: +Springer International Publishing. https://doi.org/10.1007/978-3-319-24277-4. +
+
+Wickham, Hadley, Romain François, Lionel Henry, Kirill Müller, and +RStudio. 2020. “Dplyr: A Grammar of Data +Manipulation,” August. +
+
+Wiens, Jenna, Suchi Saria, Mark Sendak, Marzyeh Ghassemi, Vincent X. +Liu, Finale Doshi-Velez, Kenneth Jung, et al. 2019. “Do No Harm: A +Roadmap for Responsible Machine Learning for Health Care.” +Nat. Med. 25 (9): 1337–40. https://doi.org/10.1038/s41591-019-0548-6. +
+
+Yan, Yachen. 2016. MLmetrics: Machine Learning +Evaluation Metrics.” +
+
+
+

  1. co-first author↩︎

  2. diff --git a/docs/articles/parallel.html b/docs/articles/parallel.html index a20fdfa3..e1ac7594 100644 --- a/docs/articles/parallel.html +++ b/docs/articles/parallel.html @@ -40,7 +40,7 @@ mikropml - 1.2.2.9000 + 1.3.0
@@ -100,11 +100,12 @@ -
+

Speed up single runs

-

By default, preprocess_data(), run_ml(), and compare_models() use only one process in series. If you’d like to parallelize various steps of the pipeline to make them run faster, install foreach, future, future.apply, and doFuture. Then, register a future plan prior to calling these functions:

+

By default, preprocess_data(), run_ml(), +and compare_models() use only one process in series. If +you’d like to parallelize various steps of the pipeline to make them run +faster, install foreach, future, +future.apply, and doFuture. Then, register a +future plan prior to calling these functions:

-doFuture::registerDoFuture()
+doFuture::registerDoFuture()
 future::plan(future::multicore, workers = 2)
-

Above, we used the multicore plan to split the work across 2 cores. See the future documentation for more about picking the best plan for your use case. Notably, multicore does not work inside RStudio or on Windows; you will need to use multisession instead in those cases.

-

After registering a future plan, you can call preprocess_data() and run_ml() as usual, and they will run certain tasks in parallel.

+

Above, we used the multicore plan to split the work +across 2 cores. See the future +documentation for more about picking the best plan for your use +case. Notably, multicore does not work inside RStudio or on +Windows; you will need to use multisession instead in those +cases.
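For example, on Windows or inside RStudio you could register a multisession plan instead. This is only a minimal sketch, still assuming you want two workers:

# multisession launches separate background R sessions, so it also works on Windows and in RStudio
doFuture::registerDoFuture()
future::plan(future::multisession, workers = 2)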

+

After registering a future plan, you can call +preprocess_data() and run_ml() as usual, and +they will run certain tasks in parallel.

 otu_data_preproc <- preprocess_data(otu_mini_bin, 'dx')$dat_transformed
-#> Using 'dx' as the outcome column.
-result1 <- run_ml(otu_data_preproc, 'glmnet')
-#> Using 'dx' as the outcome column.
-#> Training the model...
-#> Loading required package: ggplot2
-#> Loading required package: lattice
-#> Training complete.
+result1 <- run_ml(otu_data_preproc, 'glmnet')

Call run_ml() multiple times in parallel in R

-

You can use functions from the future.apply package to call run_ml() multiple times in parallel with different parameters. You will first need to run future::plan() as above if you haven’t already. Then, call run_ml() with multiple seeds using future_lapply():

+

You can use functions from the future.apply package to +call run_ml() multiple times in parallel with different +parameters. You will first need to run future::plan() as +above if you haven’t already. Then, call run_ml() with +multiple seeds using future_lapply():

 # NOTE: use more seeds for real-world data
 results_multi <- future.apply::future_lapply(seq(100, 102), function(seed) {
   run_ml(otu_data_preproc, 'glmnet', seed = seed)
-  }, future.seed = TRUE)
-#> Using 'dx' as the outcome column.
-#> Training the model...
-#> Training complete.
-#> Using 'dx' as the outcome column.
-#> Training the model...
-#> Training complete.
-#> Using 'dx' as the outcome column.
-#> Training the model...
-#> Training complete.
-

Each call to run_ml() with a different seed uses a different random split of the data into training and testing sets. Since we are using seeds, we must set future.seed to TRUE (see the future.apply documentation and this blog post for details on parallel-safe random seeds). This example uses only a few seeds for speed and simplicity, but for real data we recommend using many more seeds to get a better estimate of model performance.

-

In these examples, we used functions from the future.apply package to run_ml() in parallel, but you can accomplish the same thing with parallel versions of the purrr::map() functions using the furrr package (e.g. furrr::future_map_dfr()).

-

Extract the performance results and combine into one dataframe for all seeds:

+ }, future.seed = TRUE)
+

Each call to run_ml() with a different seed uses a +different random split of the data into training and testing sets. Since +we are using seeds, we must set future.seed to +TRUE (see the future.apply +documentation and this +blog post for details on parallel-safe random seeds). This example +uses only a few seeds for speed and simplicity, but for real data we +recommend using many more seeds to get a better estimate of model +performance.

+

In these examples, we used functions from the +future.apply package to run_ml() in parallel, +but you can accomplish the same thing with parallel versions of the +purrr::map() functions using the furrr package +(e.g. furrr::future_map_dfr()).
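For reference, a rough furrr-based sketch of the same loop might look like the following (assuming furrr is installed and a future plan is already registered; the object name perf_df_furrr is just illustrative):

# hypothetical furrr equivalent of the future_lapply() call above
perf_df_furrr <- furrr::future_map_dfr(
  seq(100, 102),
  function(seed) {
    # keep only the performance metrics from each run_ml() result
    run_ml(otu_data_preproc, "glmnet", seed = seed)$performance %>%
      dplyr::select(cv_metric_AUC, AUC, method)
  },
  .options = furrr::furrr_options(seed = TRUE) # parallel-safe random seeds
)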

+

Extract the performance results and combine into one dataframe for +all seeds:

 perf_df <- future.apply::future_lapply(results_multi, 
                                        function(result) {
@@ -173,17 +180,13 @@ 

Call run_ml() }, future.seed = TRUE) %>% dplyr::bind_rows() -perf_df -#> # A tibble: 3 × 3 -#> cv_metric_AUC AUC method -#> <dbl> <dbl> <chr> -#> 1 0.630 0.634 glmnet -#> 2 0.591 0.608 glmnet -#> 3 0.671 0.471 glmnet

+perf_df

Multiple ML methods

-

You may also wish to compare performance for different ML methods. mapply() can iterate over multiple lists or vectors, and future_mapply() works the same way:

+

You may also wish to compare performance for different ML methods. +mapply() can iterate over multiple lists or vectors, and +future_mapply() works the same way:

 # NOTE: use more seeds for real-world data
 param_grid <- expand.grid(seeds = seq(100, 102),
@@ -195,61 +198,39 @@ 

Multiple ML methodsparam_grid$seeds, param_grid$methods %>% as.character(), future.seed = TRUE - ) -#> Using 'dx' as the outcome column. -#> Training the model... -#> Training complete. -#> Using 'dx' as the outcome column. -#> Training the model... -#> Training complete. -#> Using 'dx' as the outcome column. -#> Training the model... -#> Training complete. -#> Using 'dx' as the outcome column. -#> Training the model... -#> Training complete. -#> Using 'dx' as the outcome column. -#> Training the model... -#> Training complete. -#> Using 'dx' as the outcome column. -#> Training the model... -#> Training complete.

-

Extract and combine the performance results for all seeds and methods:

+ )
+

Extract and combine the performance results for all seeds and +methods:

 perf_df2 <- lapply(results_mtx['performance',], 
                    function(x) {
                      x %>% select(cv_metric_AUC, AUC, method)
                    }) %>% 
   dplyr::bind_rows()
-perf_df2
-#> # A tibble: 6 × 3
-#>   cv_metric_AUC   AUC method
-#>           <dbl> <dbl> <chr> 
-#> 1         0.630 0.634 glmnet
-#> 2         0.591 0.608 glmnet
-#> 3         0.671 0.471 glmnet
-#> 4         0.665 0.708 rf    
-#> 5         0.651 0.697 rf    
-#> 6         0.701 0.592 rf
-

Visualize the performance results (ggplot2 is required):

+perf_df2
+

Visualize the performance results (ggplot2 is +required):

 perf_boxplot <- plot_model_performance(perf_df2)
 perf_boxplot
-

-

plot_model_performance() returns a ggplot2 object. You can add layers to customize the plot:

+

plot_model_performance() returns a ggplot2 object. You +can add layers to customize the plot:

 perf_boxplot +
    theme_classic() +
    scale_color_brewer(palette = "Dark2") +
    coord_flip()
-

-

You can also create your own plots however you like using the performance results.

+

You can also create your own plots however you like using the +performance results.

Live progress updates

-

preprocess_data() and get_feature_importance() support reporting live progress updates using the progressr package. The format is up to you, but we recommend using a progress bar like this:

+

preprocess_data() and +get_feature_importance() support reporting live progress +updates using the progressr package. The format is up to +you, but we recommend using a progress bar like this:

 # optionally, specify the progress bar format with the `progress` package.
 progressr::handlers(progressr::handler_progress(
@@ -270,12 +251,26 @@ 

Live progress updates#> Training the model... #> Training complete. #> Feature importance =========================== 100% | elapsed: 37s | eta: 0s

-

Note that some future backends support “near-live” progress updates, meaning the progress may not be reported immediately when parallel processing with futures. Read more on that in the progressr vignette. For more on progressr and how to customize the format of progress updates, see the progressr docs.

+

Note that some future backends support “near-live” progress updates, +meaning the progress may not be reported immediately when parallel +processing with futures. Read more on that in +the progressr vignette. For more on +progressr and how to customize the format of progress +updates, see the progressr +docs.

Parallelizing with Snakemake

-

When parallelizing multiple calls to run_ml() in R as in the examples above, all of the results objects are held in memory. This isn’t a big deal for a small dataset run with only a few seeds. However, for large datasets run in parallel with, say, 100 seeds (recommended), you may run into problems trying to store all of those objects in memory at once. One solution is to write the results files of each run_ml() call, then concatenate them at the end. We show one way to accomplish this with Snakemake in an example Snakemake workflow here.

+

When parallelizing multiple calls to run_ml() in R as in +the examples above, all of the results objects are held in memory. This +isn’t a big deal for a small dataset run with only a few seeds. However, +for large datasets run in parallel with, say, 100 seeds (recommended), +you may run into problems trying to store all of those objects in memory +at once. One solution is to write the results files of each +run_ml() call, then concatenate them at the end. We show +one way to accomplish this with Snakemake in an +example Snakemake workflow here.
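A bare-bones sketch of that idea in plain R is shown below; the results/ directory and file names are just placeholders, and it assumes readr and purrr are available:

# inside each job: keep only the performance table and write it to its own file
result <- run_ml(otu_data_preproc, "glmnet", seed = 100)
readr::write_csv(result$performance, "results/performance_seed_100.csv")

# in a final combining step: read every per-seed file back in and row-bind them
perf_all <- list.files("results", pattern = "performance_seed_.*\\.csv$", full.names = TRUE) %>%
  purrr::map_dfr(readr::read_csv)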

diff --git a/docs/articles/preprocess.html b/docs/articles/preprocess.html index 5ccf1133..d6e21856 100644 --- a/docs/articles/preprocess.html +++ b/docs/articles/preprocess.html @@ -40,7 +40,7 @@ mikropml - 1.2.2.9000 + 1.3.0 @@ -100,7 +100,7 @@ -
+

Categorical data @@ -209,7 +245,9 @@

Categorical data#> #> $removed_feats #> character(0)

-

As you can see, this variable was split into 3 different columns - one for each type (a, b, and c). And again, grp_feats is NULL.

+

As you can see, this variable was split into 3 different columns - +one for each type (a, b, and c). And again, grp_feats is +NULL.

Continuous data @@ -243,12 +281,23 @@

Continuous data#> #> $removed_feats #> character(0)

-

Wow! Why did the numbers change? This is because the default is to normalize the data using "center" and "scale". While this is often best practice, you may not want to normalize the data, or you may want to normalize the data in a different way. If you don’t want to normalize the data, you can use method=NULL:

+

Wow! Why did the numbers change? This is because the default is to +normalize the data using "center" and "scale". +While this is often best practice, you may not want to normalize the +data, or you may want to normalize the data in a different way. If you +don’t want to normalize the data, you can use +method=NULL:

 # preprocess raw continuous data, no normalization
 preprocess_data(dataset = cont_df, outcome_colname = "outcome", method = NULL)
-

You can also normalize the data in different ways. You can choose any method supported by the method argument of caret::preProcess() (see the caret::preProcess() docs for details). Note that these methods are only applied to continuous variables.

-

Another feature of preprocess_data() is that if you provide continuous variables as characters, they will be converted to numeric:

+

You can also normalize the data in different ways. You can choose any +method supported by the method argument of +caret::preProcess() (see the +caret::preProcess() docs for details). Note that these +methods are only applied to continuous variables.
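For example, to rescale each continuous feature to the range 0 to 1 instead of centering and scaling, you could pass "range" (one of the methods caret::preProcess() accepts). This is just a sketch reusing the cont_df example from above:

# rescale continuous features to [0, 1] instead of the default center/scale
preprocess_data(dataset = cont_df, outcome_colname = "outcome", method = c("range"))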

+

Another feature of preprocess_data() is that if you +provide continuous variables as characters, they will be converted to +numeric:

 # raw continuous dataset as characters
 cont_char_df <- data.frame(
@@ -263,7 +312,10 @@ 

Continuous data
 # preprocess raw continuous character data as numeric
 preprocess_data(dataset = cont_char_df, outcome_colname = "outcome")

-

If you don’t want this to happen, and you want character data to remain character data even if it can be converted to numeric, you can use to_numeric=FALSE and they will be kept as categorical:

+

If you don’t want this to happen, and you want character data to +remain character data even if it can be converted to numeric, you can +use to_numeric=FALSE and they will be kept as +categorical:

 # preprocess raw continuous character data as characters
 preprocess_data(dataset = cont_char_df, outcome_colname = "outcome", to_numeric = FALSE)
@@ -281,12 +333,17 @@ 

Continuous data#> #> $removed_feats #> character(0)

-

As you can see from this output, in this case the features are treated as groups rather than numbers (e.g. they are not normalized).

+

As you can see from this output, in this case the features are +treated as groups rather than numbers (e.g. they are not +normalized).

Collapse perfectly correlated features

-

By default, preprocess_data() collapses features that are perfectly positively or negatively correlated. This is because having multiple copies of those features does not add information to machine learning, and it makes run_ml faster.

+

By default, preprocess_data() collapses features that +are perfectly positively or negatively correlated. This is because +having multiple copies of those features does not add information to +machine learning, and it makes run_ml faster.

 # raw correlated dataset
 corr_df <- data.frame(
@@ -319,8 +376,14 @@ 

Collapse perfectly correlated fe #> #> $removed_feats #> [1] "var2"

-

As you can see, we end up with only one variable, as all 3 are grouped together. Also, the second element in the list is no longer NULL. Instead, it tells you that grp1 contains var1, var2, and var3.

-

If you want to group positively correlated features, but not negatively correlated features (e.g. for interpretability, or another downstream application), you can do that by using group_neg_corr=FALSE:

+

As you can see, we end up with only one variable, as all 3 are +grouped together. Also, the second element in the list is no longer +NULL. Instead, it tells you that grp1 contains +var1, var2, and var3.

+

If you want to group positively correlated features, but not +negatively correlated features (e.g. for interpretability, or another +downstream application), you can do that by using +group_neg_corr=FALSE:

 # preprocess raw correlated dataset; don't group negatively correlated features
 preprocess_data(dataset = corr_df, outcome_colname = "outcome", group_neg_corr = FALSE)
@@ -338,7 +401,10 @@ 

Collapse perfectly correlated fe #> #> $removed_feats #> [1] "var2"

-

Here, var3 is kept on its own because it's negatively correlated with var1 and var2. You can also choose to keep all features separate, even if they are perfectly correlated, by using collapse_corr_feats=FALSE:

+

Here, var3 is kept on its own because it's negatively correlated with var1 and var2. You can also choose to keep all features separate, even if they are perfectly correlated, by using collapse_corr_feats=FALSE:

 # preprocess raw correlated dataset; don't group negatively correlated features
 preprocess_data(dataset = corr_df, outcome_colname = "outcome", collapse_corr_feats = FALSE)
@@ -356,12 +422,14 @@ 

Collapse perfectly correlated fe #> #> $removed_feats #> [1] "var2"

-

In this case, grp_feats will always be NULL.

+

In this case, grp_feats will always be +NULL.

Data with near-zero variance

-

What if we have variables that are all zero, or all “no”? Those ones won’t contribute any information, so we remove them:

+

What if we have variables that are all zero, or all “no”? Those ones +won’t contribute any information, so we remove them:

 # raw dataset with non-variable features
 nonvar_df <- data.frame(
@@ -377,7 +445,9 @@ 

Data with near-zero variance#> 1 normal no 0 no 0 12 #> 2 normal yes 1 no 0 12 #> 3 cancer no 1 no 0 12

-

Here, var3, var4, and var5 all have no variability, so these variables are removed during preprocessing:

+

Here, var3, var4, and var5 all +have no variability, so these variables are removed during +preprocessing:

 # remove features with near-zero variance
 preprocess_data(dataset = nonvar_df, outcome_colname = "outcome")
@@ -395,7 +465,13 @@ 

Data with near-zero variance#> #> $removed_feats #> [1] "var4" "var3" "var5"

-

You can read the caret::preProcess() documentation for more information. By default, we remove features with “near-zero variance” (remove_var='nzv'). This uses the default arguments from caret::nearZeroVar(). However, particularly with smaller datasets, you might not want to remove features with near-zero variance. If you want to remove only features with zero variance, you can use remove_var='zv':

+

You can read the caret::preProcess() documentation for +more information. By default, we remove features with “near-zero +variance” (remove_var='nzv'). This uses the default +arguments from caret::nearZeroVar(). However, particularly +with smaller datasets, you might not want to remove features with +near-zero variance. If you want to remove only features with zero +variance, you can use remove_var='zv':

 # remove features with zero variance
 preprocess_data(dataset = nonvar_df, outcome_colname = "outcome", remove_var = 'zv')
@@ -413,7 +489,10 @@ 

Data with near-zero variance#> #> $removed_feats #> [1] "var4" "var3" "var5"

-

If you want to include all features, you can use the argument remove_zv=NULL. For this to work, you cannot collapse correlated features (otherwise it errors out because of the underlying caret function we use).

+

If you want to include all features, you can use the argument +remove_zv=NULL. For this to work, you cannot collapse +correlated features (otherwise it errors out because of the underlying +caret function we use).

 # don't remove features with near-zero or zero variance
 preprocess_data(dataset = nonvar_df, outcome_colname = "outcome", remove_var = NULL, collapse_corr_feats = FALSE)
@@ -431,7 +510,12 @@ 

Data with near-zero variance#> #> $removed_feats #> [1] "var4"

-

If you want to be more nuanced in how you remove near-zero variance features (e.g. change the default 10% cutoff for the percentage of distinct values out of the total number of samples), you can use the caret::preProcess() function after running preprocess_data with remove_var=NULL (see the caret::nearZeroVar() function for more information).

+

If you want to be more nuanced in how you remove near-zero variance +features (e.g. change the default 10% cutoff for the percentage of +distinct values out of the total number of samples), you can use the +caret::preProcess() function after running +preprocess_data with remove_var=NULL (see the +caret::nearZeroVar() function for more information).
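One way to sketch that workflow (the uniqueCut = 20 value below is an arbitrary example cutoff, not a recommendation) is to keep all features during preprocessing and then filter them yourself with caret::nearZeroVar():

# keep everything during preprocessing, then apply a custom near-zero variance filter
dat_proc <- preprocess_data(dataset = nonvar_df, outcome_colname = "outcome",
                            remove_var = NULL, collapse_corr_feats = FALSE)$dat_transformed
nzv_cols <- caret::nearZeroVar(dat_proc, uniqueCut = 20)
dat_filtered <- if (length(nzv_cols) > 0) dat_proc[, -nzv_cols, drop = FALSE] else dat_proc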

Missing data @@ -439,11 +523,16 @@

Missing datapreprocess_data() also deals with missing data. It:

  • Removes missing outcome variables.
  • -
  • Maintains zero variability in a feature if it already has no variability (i.e. the feature is removed if removing features with near-zero variance).
  • -
  • Replaces missing binary and categorical variables with zero (after splitting into multiple columns).
  • -
  • Replaces missing continuous data with the median value of that feature.
  • +
  • Maintains zero variability in a feature if it already has no +variability (i.e. the feature is removed if removing features with +near-zero variance).
  • +
  • Replaces missing binary and categorical variables with zero (after +splitting into multiple columns).
  • +
  • Replaces missing continuous data with the median value of that +feature.
-

If you’d like to deal with missing data in a different way, please do that prior to inputting the data to preprocess_data().

+

If you’d like to deal with missing data in a different way, please do +that prior to inputting the data to preprocess_data().
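For instance, here is a minimal sketch of doing your own imputation first (the data frame and column names are made up for illustration):

# impute a continuous feature with its mean instead of the median, then preprocess
my_raw_data <- data.frame(
  outcome = c("normal", "normal", "cancer"),
  var1 = c(1, NA, 3)
)
my_raw_data$var1[is.na(my_raw_data$var1)] <- mean(my_raw_data$var1, na.rm = TRUE)
preprocess_data(dataset = my_raw_data, outcome_colname = "outcome")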

Remove missing outcome variables

@@ -480,7 +569,8 @@

Remove missing outcome variables#> character(0)

-

Maintain zero variability in a feature if it already has no variability +

Maintain zero variability in a feature if it already has no +variability

 # raw dataset with missing value in non-variable feature
@@ -512,7 +602,9 @@ 

#> #> $removed_feats #> [1] "var2"

-

Here, the non-variable feature with missing data is removed because we removed features with near-zero variance. If we maintained that feature, it’d be all ones:

+

Here, the non-variable feature with missing data is removed because +we removed features with near-zero variance. If we maintained that +feature, it’d be all ones:

 # preprocess raw dataset with missing value in non-variable feature
 preprocess_data(dataset = miss_nonvar_df, outcome_colname = "outcome", remove_var = NULL, collapse_corr_feats = FALSE)
@@ -565,10 +657,12 @@ 

Replace miss #> #> $removed_feats #> [1] "var2"

-

Here each binary variable is split into two, and the missing value is considered zero for both of them.

+

Here each binary variable is split into two, and the missing value is +considered zero for both of them.

-

Replace missing continuous data with the median value of that feature +

Replace missing continuous data with the median value of that +feature

 # raw dataset with missing value in continuous feature
@@ -583,7 +677,8 @@ 

R #> 2 normal 2 2 #> 3 cancer 2 3 #> 4 normal NA NA

-

Here we’re not normalizing continuous features so it’s easier to see what’s going on (i.e. the median value is used):

+

Here we’re not normalizing continuous features so it’s easier to see +what’s going on (i.e. the median value is used):

 # preprocess raw dataset with missing value in continuous feature
 preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL)
@@ -608,7 +703,8 @@ 

R

Putting it all together

-

Here’s some more complicated example raw data that puts everything we discussed together:

+

Here’s some more complicated example raw data that puts everything we +discussed together:

 test_df <- data.frame(
   outcome = c("normal", "normal", "cancer", NA),
@@ -631,7 +727,8 @@ 

Putting it all together#> 2 normal 2 b yes 1 0 no 1 6 x 0 1 2 #> 3 cancer 3 c no 0 0 no 0 NA y NA NA 3 #> 4 <NA> 4 d no 0 0 no 0 7 z NA NA 4

-

Let’s throw this into the preprocessing function with the default values:

+

Let’s throw this into the preprocessing function with the default +values:

 preprocess_data(dataset = test_df, outcome_colname = "outcome")
 #> Using 'outcome' as the outcome column.
@@ -668,20 +765,37 @@ 

Putting it all together#> [1] "var4" "var5" "var10" "var6" "var11"

As you can see, we got several messages:

    -
  • One of the samples (row 4) was removed because the outcome value was missing.
  • -
  • One of the variables in a feature with no variation had a missing value that was replaced with the non-varying value (var11).
  • -
  • Four categorical missing values were replaced with zero (var9). There are 4 missing rather than just 1 (like in the raw data) because we split the categorical variable into 4 different columns first.
  • -
  • One missing continuous value was imputed using the median value of that feature (var8).
  • +
  • One of the samples (row 4) was removed because the outcome value was +missing.
  • +
  • One of the variables in a feature with no variation had a missing value that was replaced with the non-varying value (var11).
  • +
  • Four categorical missing values were replaced with zero +(var9). There are 4 missing rather than just 1 (like in the +raw data) because we split the categorical variable into 4 different +columns first.
  • +
  • One missing continuous value was imputed using the median value of +that feature (var8).
-

Additionally, you can see that the continuous variables were normalized, the categorical variables were all changed to binary, and several features were grouped together. The variables in each group can be found in grp_feats.

+

Additionally, you can see that the continuous variables were +normalized, the categorical variables were all changed to binary, and +several features were grouped together. The variables in each group can +be found in grp_feats.

Next step: train and evaluate your model!

-

After you preprocess your data (either using preprocess_data() or by preprocessing the data on your own), you're ready to train and evaluate machine learning models! Please see run_ml() for information about training models.

-
-
-

Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR Analyses with FIDDLE: A Flexible Data-Driven Preprocessing Pipeline for Structured Clinical Data.” J Am Med Inform Assoc, October. https://doi.org/10.1093/jamia/ocaa139.

+

After you preprocess your data (either using preprocess_data() or by preprocessing the data on your own), you're ready to train and evaluate machine learning models! Please see run_ml() for information about training models.
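As a minimal sketch (using the otu_mini_bin example dataset that ships with mikropml rather than the toy data above):

# preprocess, then train and evaluate a model with run_ml()
dat_preproc <- preprocess_data(dataset = otu_mini_bin, outcome_colname = "dx")$dat_transformed
model_results <- run_ml(dat_preproc, "glmnet", seed = 2019)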

+
+
+Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael +W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR +Analyses with FIDDLE: A Flexible Data-Driven Preprocessing +Pipeline for Structured Clinical Data.” J Am Med Inform +Assoc, October. https://doi.org/10.1093/jamia/ocaa139.
diff --git a/docs/articles/tuning.html b/docs/articles/tuning.html index 2092dc92..1cc971d6 100644 --- a/docs/articles/tuning.html +++ b/docs/articles/tuning.html @@ -40,7 +40,7 @@ mikropml - 1.2.2.9000 + 1.3.0
@@ -100,11 +100,12 @@ -
+
+
+ +
  • mikropml now requires R version 4.1.0 or greater due to an update in the randomForest package (#292).
  • +
  • New function compare_models() compares the performance of two models with a permutation test (#295, @courtneyarmour).
  • +
  • Fixed a bug where cv_times did not affect the reported repeats for cross-validation (#291, @kelly-sovacool).
  • +
  • Made minor documentation improvements (#293, @kelly-sovacool)
  • +

This minor patch fixes a test failure on platforms with no long doubles. The actual package code remains unchanged.

-
  • Allow kfold >= length(groups) (#285, @kelly-sovacool).
    • When using the groups parameter, groups are kept together in cross-validation partitions when kfold <= the number of groups in the training set. Previously, an error was thrown if this condition was not met. Now, if there are not enough groups in the training set for groups to be kept together during CV, groups are allowed to be split up across CV partitions.
  • +
    • Allow kfold >= length(groups) (#285, @kelly-sovacool). +
      • When using the groups parameter, groups are kept together in cross-validation partitions when kfold <= the number of groups in the training set. Previously, an error was thrown if this condition was not met. Now, if there are not enough groups in the training set for groups to be kept together during CV, groups are allowed to be split up across CV partitions.
      • +
    • Report p-values for permutation feature importance (#288, @kelly-sovacool).
-
  • New parameter cross_val added to run_ml() allows users to define their own custom cross-validation scheme (#278, @kelly-sovacool).
    • Also added a new parameter calculate_performance, which controls whether performance metrics are calculated (default: TRUE). Users may wish to skip performance calculations when training models with no cross-validation.
  • +
    • New parameter cross_val added to run_ml() allows users to define their own custom cross-validation scheme (#278, @kelly-sovacool). +
      • Also added a new parameter calculate_performance, which controls whether performance metrics are calculated (default: TRUE). Users may wish to skip performance calculations when training models with no cross-validation.
      • +
    • New parameter group_partitions added to run_ml() allows users to control which groups should go to which partition of the train/test split (#281, @kelly-sovacool).
    • -
    • Modified the training_frac parameter in run_ml() (#281, @kelly-sovacool).
      • By default, training_frac is a fraction between 0 and 1 that specifies how much of the dataset should be used in the training fraction of the train/test split.
      • +
      • Modified the training_frac parameter in run_ml() (#281, @kelly-sovacool). +
        • By default, training_frac is a fraction between 0 and 1 that specifies how much of the dataset should be used in the training fraction of the train/test split.
        • Users can instead give training_frac a vector of indices that correspond to which rows of the dataset should go in the training fraction of the train/test split. This gives users direct control over exactly which observations are in the training fraction if desired.
-
+
-
  • New correlation method option for feature importance (#267, @courtneyarmour).
    • The default is still “spearman”, and now you can use other methods supported by stats::cor with the corr_method parameter: get_feature_importance(corr_method = "pearson") -
  • +
@@ -104,10 +121,12 @@
  • mikropml now has a logo created by @NLesniak!
  • Made documentation improvements (#238, #231 @kelly-sovacool; #256 @BTopcuoglu).
  • -
  • New option in preprocess_data(): prefilter_threshold (#240, @kelly-sovacool, @courtneyarmour).
    • Remove any features that appear in N=prefilter_threshold or fewer rows in the data.
    • +
    • New option in preprocess_data(): prefilter_threshold (#240, @kelly-sovacool, @courtneyarmour). +
    • -
    • New option in get_feature_importance(): groups (#246, @kelly-sovacool).
      • Provide custom groups of features to permute together during permutation importance.
      • +
      • New option in get_feature_importance(): groups (#246, @kelly-sovacool). +
        • Provide custom groups of features to permute together during permutation importance.
        • groups is NULL by default; in this case, correlated features above corr_thresh are grouped together.
      • @@ -128,12 +147,14 @@

        This is the first release version of mikropml! 🎉

        • Added a NEWS.md file to track changes to the package.
        • -
        • Major new functions:
          • run_ml()
          • +
          • Major new functions: +
          • -
          • Support for ML methods in run_ml():
            • +
            • Support for ML methods in run_ml(): +
              • glmnet: logistic and linear regression
              • rf: random forest
              • @@ -144,7 +165,8 @@
            • -
            • New vignettes:
              • Introduction
              • +
              • New vignettes: +
                • Introduction
                • Preprocess data
                • Hyperparameter tuning
                • Parallel processing
                • diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index e9094896..43e8f4a9 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,4 +1,4 @@ -pandoc: 2.7.3 +pandoc: 2.17.1.1 pkgdown: 2.0.3 pkgdown_sha: ~ articles: @@ -7,7 +7,7 @@ articles: parallel: parallel.html preprocess: preprocess.html tuning: tuning.html -last_built: 2022-05-18T17:33Z +last_built: 2022-05-19T16:08Z urls: reference: http://www.schlosslab.org/mikropml/reference article: http://www.schlosslab.org/mikropml/articles diff --git a/docs/pull_request_template.html b/docs/pull_request_template.html index cc58413a..1a14b90b 100644 --- a/docs/pull_request_template.html +++ b/docs/pull_request_template.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0
@@ -75,25 +75,33 @@

NA

Issues

-
  • Resolves # .
+
  • Resolves # .
  • +

## Change(s) made

-
+
  • +

Checklist

(Strikethrough any points that are not applicable.)

-
  • -Write unit tests for any new functionality or bug fixes.
  • +
    • + +Write unit tests for any new functionality or bug fixes.
    • -Update docs if there are any API changes:
      • -roxygen comments
      • + +Update docs if there are any API changes:
        • + +roxygen comments
        • -vignettes
        • + +vignettes
      • -Update NEWS.md if this includes any user-facing changes.
      • + +Update NEWS.md if this includes any user-facing changes.
      • -The check workflow succeeds on your most recent commit. This is always required before the PR can be merged. + +The check workflow succeeds on your most recent commit. This is always required before the PR can be merged.
diff --git a/docs/reference/calc_perf_metrics.html b/docs/reference/calc_perf_metrics.html index cb3f3f8e..8fea238d 100644 --- a/docs/reference/calc_perf_metrics.html +++ b/docs/reference/calc_perf_metrics.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0

diff --git a/docs/reference/combine_hp_performance.html b/docs/reference/combine_hp_performance.html index b3c69730..dca720af 100644 --- a/docs/reference/combine_hp_performance.html +++ b/docs/reference/combine_hp_performance.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0
diff --git a/docs/reference/compare_models.html b/docs/reference/compare_models.html new file mode 100644 index 00000000..e5c84f0d --- /dev/null +++ b/docs/reference/compare_models.html @@ -0,0 +1,145 @@ + +Perform permutation tests to compare the performance metric +across all pairs of a group variable. — compare_models • mikropml + + +
+
+ + + +
+
+ + +
+

A wrapper for permute_p_value().

+
+ +
+
compare_models(merged_data, metric, group_name, nperm = 10000)
+
+ +
+

Arguments

+
merged_data
+

the concatenated performance data from run_ml

+
metric
+

metric to compare, must be numeric

+
group_name
+

column with group variables to compare

+
nperm
+

number of permutations, default=10000

+
+
+

Value

+

a table of p-values for all pairs of the group variable

+
+
+

Author

+

Courtney R Armour, armourc@umich.edu

+
+ +
+

Examples

+
df <- dplyr::tibble(
+  model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"),
+  AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95)
+)
+set.seed(123)
+compare_models(df, "AUC", "model", nperm = 10)
+#>   group1    group2   p_value
+#> 1 glmnet svmRadial 0.7272727
+#> 2     rf    glmnet 0.2727273
+#> 3     rf svmRadial 0.5454545
+
+
+
+ +
+ + +
+ + + + + + + + diff --git a/docs/reference/define_cv.html b/docs/reference/define_cv.html index d88d7eef..c26bc116 100644 --- a/docs/reference/define_cv.html +++ b/docs/reference/define_cv.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0
diff --git a/docs/reference/get_caret_processed_df.html b/docs/reference/get_caret_processed_df.html index 1da76d45..dfbed020 100644 --- a/docs/reference/get_caret_processed_df.html +++ b/docs/reference/get_caret_processed_df.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/get_difference.html b/docs/reference/get_difference.html new file mode 100644 index 00000000..e493205a --- /dev/null +++ b/docs/reference/get_difference.html @@ -0,0 +1,137 @@ + +Average metric difference — get_difference • mikropml + + +
+
+ + + +
+
+ + +
+

Calculate the difference in the mean of the metric for two groups

+
+ +
+
get_difference(sub_data, group_name, metric)
+
+ +
+

Arguments

+
sub_data
+

subset of the merged performance data frame for two groups

+
group_name
+

name of column with group variable

+
metric
+

metric to compare

+
+
+

Value

+

numeric difference in the average metric between the two groups

+
+
+

Author

+

Courtney Armour, armourc@umich.edu

+
+ +
+

Examples

+
df <- dplyr::tibble(
+  condition = c("a", "a", "b", "b"),
+  AUC = c(.2, 0.3, 0.8, 0.9)
+)
+get_difference(df, "condition", "AUC")
+#> [1] 0.6
+
+
+
+
+ +
+ + +
+ + + + + + + + diff --git a/docs/reference/get_feature_importance.html b/docs/reference/get_feature_importance.html index 7b38b2f3..e12f97f8 100644 --- a/docs/reference/get_feature_importance.html +++ b/docs/reference/get_feature_importance.html @@ -18,7 +18,7 @@ mikropml - 1.2.2.9000 + 1.3.0 @@ -202,7 +202,7 @@

Examples

# We strongly recommend providing multiple cores to speed up computation time. # Do this before calling `get_feature_importance()`. -doFuture::registerDoFuture() +doFuture::registerDoFuture() future::plan(future::multicore, workers = 2) # Optionally, you can group features together with a custom grouping diff --git a/docs/reference/get_hp_performance.html b/docs/reference/get_hp_performance.html index 4725b431..d95f2c20 100644 --- a/docs/reference/get_hp_performance.html +++ b/docs/reference/get_hp_performance.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/get_hyperparams_list.html b/docs/reference/get_hyperparams_list.html index cffb9d2c..e073866a 100644 --- a/docs/reference/get_hyperparams_list.html +++ b/docs/reference/get_hyperparams_list.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/get_outcome_type.html b/docs/reference/get_outcome_type.html index 9932003f..ef79172a 100644 --- a/docs/reference/get_outcome_type.html +++ b/docs/reference/get_outcome_type.html @@ -19,7 +19,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/get_partition_indices.html b/docs/reference/get_partition_indices.html index c3c9c641..a33ebb3f 100644 --- a/docs/reference/get_partition_indices.html +++ b/docs/reference/get_partition_indices.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/get_perf_metric_fn.html b/docs/reference/get_perf_metric_fn.html index 2eb2e30c..11b8718c 100644 --- a/docs/reference/get_perf_metric_fn.html +++ b/docs/reference/get_perf_metric_fn.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 @@ -105,7 +105,7 @@

Examples

#> data$obs <- factor(data$obs, levels = lev) #> postResample(data[, "pred"], data[, "obs"]) #> } -#> <bytecode: 0x7fa3774c1120> +#> <bytecode: 0x7fdd8f312490> #> <environment: namespace:caret> get_perf_metric_fn("binary") #> function (data, lev = NULL, model = NULL) @@ -163,7 +163,7 @@

Examples

#> stats <- stats[c(stat_list)] #> return(stats) #> } -#> <bytecode: 0x7fa389e58f88> +#> <bytecode: 0x7fdd88c84980> #> <environment: namespace:caret> get_perf_metric_fn("multiclass") #> function (data, lev = NULL, model = NULL) @@ -221,7 +221,7 @@

Examples

#> stats <- stats[c(stat_list)] #> return(stats) #> } -#> <bytecode: 0x7fa389e58f88> +#> <bytecode: 0x7fdd88c84980> #> <environment: namespace:caret>
diff --git a/docs/reference/get_perf_metric_name.html b/docs/reference/get_perf_metric_name.html index 16afe4eb..d96ec201 100644 --- a/docs/reference/get_perf_metric_name.html +++ b/docs/reference/get_perf_metric_name.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/get_performance_tbl.html b/docs/reference/get_performance_tbl.html index cb071b7b..cce8a298 100644 --- a/docs/reference/get_performance_tbl.html +++ b/docs/reference/get_performance_tbl.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/get_tuning_grid.html b/docs/reference/get_tuning_grid.html index 13497e90..a8d3faee 100644 --- a/docs/reference/get_tuning_grid.html +++ b/docs/reference/get_tuning_grid.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/group_correlated_features.html b/docs/reference/group_correlated_features.html index e91c8f69..ec9f0acb 100644 --- a/docs/reference/group_correlated_features.html +++ b/docs/reference/group_correlated_features.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/index.html b/docs/reference/index.html index 78413d2e..048d1f36 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 @@ -93,6 +93,11 @@

Plotting & evaluation helpers

Visualize & evaluate performance to help you tune hyperparameters and choose model methods.

+

compare_models()

+ +

Perform permutation tests to compare the performance metric +across all pairs of a group variable.

+

plot_hp_performance()

Plot hyperparameter performance metrics

diff --git a/docs/reference/mikropml.html b/docs/reference/mikropml.html index 033b0f39..53a58e4d 100644 --- a/docs/reference/mikropml.html +++ b/docs/reference/mikropml.html @@ -20,7 +20,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_bin.html b/docs/reference/otu_mini_bin.html index 80b5eec8..f839e6da 100644 --- a/docs/reference/otu_mini_bin.html +++ b/docs/reference/otu_mini_bin.html @@ -19,7 +19,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_bin_results_glmnet.html b/docs/reference/otu_mini_bin_results_glmnet.html index ca7233ac..5ae3499d 100644 --- a/docs/reference/otu_mini_bin_results_glmnet.html +++ b/docs/reference/otu_mini_bin_results_glmnet.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_bin_results_rf.html b/docs/reference/otu_mini_bin_results_rf.html index 7400b058..aaf32bab 100644 --- a/docs/reference/otu_mini_bin_results_rf.html +++ b/docs/reference/otu_mini_bin_results_rf.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_bin_results_rpart2.html b/docs/reference/otu_mini_bin_results_rpart2.html index adf384e5..f73bffd5 100644 --- a/docs/reference/otu_mini_bin_results_rpart2.html +++ b/docs/reference/otu_mini_bin_results_rpart2.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_bin_results_svmRadial.html b/docs/reference/otu_mini_bin_results_svmRadial.html index 895b21d9..165e2009 100644 --- a/docs/reference/otu_mini_bin_results_svmRadial.html +++ b/docs/reference/otu_mini_bin_results_svmRadial.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_bin_results_xgbTree.html b/docs/reference/otu_mini_bin_results_xgbTree.html index 4a593f16..bda6634d 100644 --- a/docs/reference/otu_mini_bin_results_xgbTree.html +++ b/docs/reference/otu_mini_bin_results_xgbTree.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_cont_results_glmnet.html b/docs/reference/otu_mini_cont_results_glmnet.html index 36c38de6..5c5dba44 100644 --- a/docs/reference/otu_mini_cont_results_glmnet.html +++ b/docs/reference/otu_mini_cont_results_glmnet.html @@ -20,7 +20,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_cont_results_nocv.html b/docs/reference/otu_mini_cont_results_nocv.html index f7a2e203..dda5c1ff 100644 --- a/docs/reference/otu_mini_cont_results_nocv.html +++ b/docs/reference/otu_mini_cont_results_nocv.html @@ -23,7 +23,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_cv.html b/docs/reference/otu_mini_cv.html index ecf1bba4..d35a6be4 100644 --- a/docs/reference/otu_mini_cv.html +++ b/docs/reference/otu_mini_cv.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_multi.html b/docs/reference/otu_mini_multi.html index 61380779..916e2f6c 100644 --- a/docs/reference/otu_mini_multi.html +++ b/docs/reference/otu_mini_multi.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_multi_group.html b/docs/reference/otu_mini_multi_group.html index 3430838a..627812b4 100644 --- a/docs/reference/otu_mini_multi_group.html +++ b/docs/reference/otu_mini_multi_group.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_mini_multi_results_glmnet.html b/docs/reference/otu_mini_multi_results_glmnet.html index c2cb8ddb..eceaa22e 100644 --- a/docs/reference/otu_mini_multi_results_glmnet.html +++ 
b/docs/reference/otu_mini_multi_results_glmnet.html @@ -20,7 +20,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/otu_small.html b/docs/reference/otu_small.html index c00d3a75..6689a854 100644 --- a/docs/reference/otu_small.html +++ b/docs/reference/otu_small.html @@ -19,7 +19,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/permute_p_value.html b/docs/reference/permute_p_value.html new file mode 100644 index 00000000..74a8c3f0 --- /dev/null +++ b/docs/reference/permute_p_value.html @@ -0,0 +1,151 @@ + +Calculated a permuted p-value comparing two models — permute_p_value • mikropml + + +
+
+ + + +
+
+ + +
+

Calculate a permuted p-value comparing two models

+
+ +
+
permute_p_value(
+  merged_data,
+  metric,
+  group_name,
+  group_1,
+  group_2,
+  nperm = 10000
+)
+
+ +
+

Arguments

+
merged_data
+

the concatenated performance data from run_ml

+
metric
+

metric to compare, must be numeric

+
group_name
+

column with group variables to compare

+
group_1
+

name of one group to compare

+
group_2
+

name of other group to compare

+
nperm
+

number of permutations, default=10000

+
+
+

Value

+

numeric p-value comparing two models

+
+
+

Author

+

Begüm Topçuoğlu, topcuoglu.begum@gmail.com

+

Courtney R Armour, armourc@umich.edu

+
+ +
+

Examples

+
df <- dplyr::tibble(
+  model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"),
+  AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95)
+)
+set.seed(123)
+permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 100)
+#> [1] 0.3663366
+
+
+
+ +
+ + +
+ + + + + + + + diff --git a/docs/reference/plot_hp_performance.html b/docs/reference/plot_hp_performance.html index 3b4cd628..55a03389 100644 --- a/docs/reference/plot_hp_performance.html +++ b/docs/reference/plot_hp_performance.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/plot_model_performance.html b/docs/reference/plot_model_performance.html index 33e8da4f..d60c03be 100644 --- a/docs/reference/plot_model_performance.html +++ b/docs/reference/plot_model_performance.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/preprocess_data.html b/docs/reference/preprocess_data.html index cf768f1b..bd8f45e1 100644 --- a/docs/reference/preprocess_data.html +++ b/docs/reference/preprocess_data.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/randomize_feature_order.html b/docs/reference/randomize_feature_order.html index 6167eb39..ecb5c6dd 100644 --- a/docs/reference/randomize_feature_order.html +++ b/docs/reference/randomize_feature_order.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 @@ -107,10 +107,10 @@

Examples

a = 4:6,
b = 7:9,
c = 10:12,
d = 13:15
)
randomize_feature_order(dat, "outcome")
-#> outcome a d c b
-#> 1 1  4 13 10  7
-#> 2 2  5 14 11  8
-#> 3 3  6 15 12  9
+#> outcome c b a d
+#> 1 1 10  7  4 13
+#> 2 2 11  8  5 14
+#> 3 3 12  9  6 15
diff --git a/docs/reference/reexports.html b/docs/reference/reexports.html index 956d56db..7a0dbcb1 100644 --- a/docs/reference/reexports.html +++ b/docs/reference/reexports.html @@ -32,7 +32,7 @@ mikropml - 1.2.2.9000 + 1.3.0 @@ -101,7 +101,7 @@

dplyr pipe

rlang
-

!!, .data, :=

+

!!, .data, :=

diff --git a/docs/reference/remove_singleton_columns.html b/docs/reference/remove_singleton_columns.html index 4b21208d..778d750d 100644 --- a/docs/reference/remove_singleton_columns.html +++ b/docs/reference/remove_singleton_columns.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/replace_spaces.html b/docs/reference/replace_spaces.html index d6558c95..0cd9e165 100644 --- a/docs/reference/replace_spaces.html +++ b/docs/reference/replace_spaces.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/run_ml.html b/docs/reference/run_ml.html index b47c7a7e..a6df61b5 100644 --- a/docs/reference/run_ml.html +++ b/docs/reference/run_ml.html @@ -23,7 +23,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/shuffle_group.html b/docs/reference/shuffle_group.html new file mode 100644 index 00000000..99aa43ef --- /dev/null +++ b/docs/reference/shuffle_group.html @@ -0,0 +1,141 @@ + +Shuffle the rows in a column — shuffle_group • mikropml + + +
+
+ + + +
+
+ + +
+

Shuffle the rows in a column

+
+ +
+
shuffle_group(dat, col_name)
+
+ +
+

Arguments

+
dat
+

a data frame containing col_name

+
col_name
+

column name to shuffle

+
+
+

Value

+

dat with the rows of col_name shuffled

+
+
+

Author

+

Courtney R Armour, armourc@umich.edu

+
+ +
+

Examples

+
set.seed(123)
+df <- dplyr::tibble(
+  condition = c("a", "a", "b", "b"),
+  AUC = c(.2, 0.3, 0.8, 0.9)
+)
+shuffle_group(df, "condition")
+#> # A tibble: 4 × 2
+#>   condition   AUC
+#>   <chr>     <dbl>
+#> 1 b           0.2
+#> 2 b           0.3
+#> 3 a           0.8
+#> 4 a           0.9
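For intuition (an illustrative sketch, not code from this patch): repeatedly shuffling the group labels is what builds the null distribution that a permuted p-value compares the observed difference against. Continuing from the `df` defined above:

set.seed(123)
null_diffs <- replicate(100, {
  shuffled <- shuffle_group(df, "condition")
  # difference in mean AUC between the two (shuffled) groups
  mean(shuffled$AUC[shuffled$condition == "a"]) -
    mean(shuffled$AUC[shuffled$condition == "b"])
})
obs_diff <- mean(df$AUC[df$condition == "a"]) - mean(df$AUC[df$condition == "b"])
# fraction of shuffles at least as extreme as the observed difference
mean(abs(null_diffs) >= abs(obs_diff))

The package's own p-value calculation may differ in detail (e.g. how the observed statistic and ties are counted).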
+
+
+
+ +
+ + +
+ + + + + + + + diff --git a/docs/reference/tidy_perf_data.html b/docs/reference/tidy_perf_data.html index 73a145fe..3d97b53d 100644 --- a/docs/reference/tidy_perf_data.html +++ b/docs/reference/tidy_perf_data.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/reference/train_model.html b/docs/reference/train_model.html index dd9d4372..8a9766e1 100644 --- a/docs/reference/train_model.html +++ b/docs/reference/train_model.html @@ -17,7 +17,7 @@ mikropml - 1.2.2.9000 + 1.3.0 diff --git a/docs/sitemap.xml b/docs/sitemap.xml index c203233c..6ed7b302 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -78,6 +78,9 @@ http://www.schlosslab.org/mikropml/reference/combine_hp_performance.html + + http://www.schlosslab.org/mikropml/reference/compare_models.html + http://www.schlosslab.org/mikropml/reference/createGroupedDataPartition.html @@ -108,6 +111,9 @@ http://www.schlosslab.org/mikropml/reference/get_corr_feats.html + + http://www.schlosslab.org/mikropml/reference/get_difference.html + http://www.schlosslab.org/mikropml/reference/get_feature_importance.html @@ -234,6 +240,9 @@ http://www.schlosslab.org/mikropml/reference/otu_small.html + + http://www.schlosslab.org/mikropml/reference/permute_p_value.html + http://www.schlosslab.org/mikropml/reference/plot_hp_performance.html @@ -282,6 +291,9 @@ http://www.schlosslab.org/mikropml/reference/setup_parallel.html + + http://www.schlosslab.org/mikropml/reference/shuffle_group.html + http://www.schlosslab.org/mikropml/reference/split_outcome_features.html diff --git a/man/get_difference.Rd b/man/get_difference.Rd new file mode 100644 index 00000000..158e682f --- /dev/null +++ b/man/get_difference.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_models.R +\name{get_difference} +\alias{get_difference} +\title{Average metric difference} +\usage{ +get_difference(sub_data, group_name, metric) +} +\arguments{ +\item{sub_data}{subset of the merged performance data frame for two groups} + +\item{group_name}{name of column with group variable} + +\item{metric}{metric to compare} +} +\value{ +numeric difference in the average metric between the two groups +} +\description{ +Calculate the difference in the mean of the metric for two groups +} +\examples{ +df <- dplyr::tibble( + condition = c("a", "a", "b", "b"), + AUC = c(.2, 0.3, 0.8, 0.9) +) +get_difference(df, "condition", "AUC") + +} +\author{ +Courtney Armour, \email{armourc@umich.edu} +} diff --git a/man/permute_p_value.Rd b/man/permute_p_value.Rd new file mode 100644 index 00000000..c3bc7350 --- /dev/null +++ b/man/permute_p_value.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_models.R +\name{permute_p_value} +\alias{permute_p_value} +\title{Calculated a permuted p-value comparing two models} +\usage{ +permute_p_value( + merged_data, + metric, + group_name, + group_1, + group_2, + nperm = 10000 +) +} +\arguments{ +\item{merged_data}{the concatenated performance data from \code{run_ml}} + +\item{metric}{metric to compare, must be numeric} + +\item{group_name}{column with group variables to compare} + +\item{group_1}{name of one group to compare} + +\item{group_2}{name of other group to compare} + +\item{nperm}{number of permutations, default=10000} +} +\value{ +numeric p-value comparing two models +} +\description{ +Calculated a permuted p-value comparing two models +} +\examples{ +df <- dplyr::tibble( + model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", 
"svmRadial"), + AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) +) +set.seed(123) +permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 100) +} +\author{ +Begüm Topçuoğlu, \email{topcuoglu.begum@gmail.com} + +Courtney R Armour, \email{armourc@umich.edu} +} diff --git a/man/reexports.Rd b/man/reexports.Rd index 43708a49..b07f3030 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -19,6 +19,6 @@ below to see their documentation. \item{dplyr}{\code{\link[dplyr:reexports]{\%>\%}}} - \item{rlang}{\code{\link[rlang:nse-force]{!!}}, \code{\link[rlang:tidyeval-data]{.data}}, \code{\link[rlang:nse-force]{:=}}} + \item{rlang}{\code{\link[rlang:injection-operator]{!!}}, \code{\link[rlang:dot-data]{.data}}, \code{\link[rlang:dyn-dots]{:=}}} }} diff --git a/man/shuffle_group.Rd b/man/shuffle_group.Rd new file mode 100644 index 00000000..313f4f04 --- /dev/null +++ b/man/shuffle_group.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_models.R +\name{shuffle_group} +\alias{shuffle_group} +\title{Shuffle the rows in a column} +\usage{ +shuffle_group(dat, col_name) +} +\arguments{ +\item{dat}{a data frame containing \code{col_name}} + +\item{col_name}{column name to shuffle} +} +\value{ +\code{dat} with the rows of \code{col_name} shuffled +} +\description{ +Shuffle the rows in a column +} +\examples{ +set.seed(123) +df <- dplyr::tibble( + condition = c("a", "a", "b", "b"), + AUC = c(.2, 0.3, 0.8, 0.9) +) +shuffle_group(df, "condition") +} +\author{ +Courtney R Armour, \email{armourc@umich.edu} +} From e31c72f86f238e477f3abd2927d2baad1798cb2f Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 12:40:56 -0400 Subject: [PATCH 04/10] Fix typos --- R/compare_models.R | 2 +- R/data.R | 10 +++++----- vignettes/introduction.Rmd | 4 ++-- vignettes/paper.Rmd | 2 +- vignettes/preprocess.Rmd | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/R/compare_models.R b/R/compare_models.R index e6ef622e..8cc66b08 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -134,7 +134,7 @@ permute_p_value <- function(merged_data, metric, group_name, group_1, group_2, n #' @param group_name column with group variables to compare #' @param nperm number of permutations, default=10000 #' -#' @return a table of p-values for all pairs of group varible +#' @return a table of p-values for all pairs of group variable #' @export #' @author Courtney R Armour, \email{armourc@@umich.edu} #' diff --git a/R/data.R b/R/data.R index 8df06165..9f12e9b0 100644 --- a/R/data.R +++ b/R/data.R @@ -32,19 +32,19 @@ #' Cross validation on `train_data_mini` with grouped features. 
"otu_mini_cv" -#' Results from running the pipline with L2 logistic regression on `otu_mini_bin` with feature importance and grouping +#' Results from running the pipeline with L2 logistic regression on `otu_mini_bin` with feature importance and grouping "otu_mini_bin_results_glmnet" -#' Results from running the pipline with random forest on `otu_mini_bin` +#' Results from running the pipeline with random forest on `otu_mini_bin` "otu_mini_bin_results_rf" -#' Results from running the pipline with svmRadial on `otu_mini_bin` +#' Results from running the pipeline with svmRadial on `otu_mini_bin` "otu_mini_bin_results_svmRadial" -#' Results from running the pipline with xbgTree on `otu_mini_bin` +#' Results from running the pipeline with xbgTree on `otu_mini_bin` "otu_mini_bin_results_xgbTree" -#' Results from running the pipline with rpart2 on `otu_mini_bin` +#' Results from running the pipeline with rpart2 on `otu_mini_bin` "otu_mini_bin_results_rpart2" #' Results from running the pipeline with glmnet on `otu_mini_bin` with `Otu00001` diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd index 9cb1d8a9..faacf965 100644 --- a/vignettes/introduction.Rmd +++ b/vignettes/introduction.Rmd @@ -335,7 +335,7 @@ depending on how many samples and groups you have. This is because it won't be exactly what you specify with `training_frac`, since you have to include all of one group in either the training set _or_ the test set. -### Controling how groups are assigned to partitions +### Controlling how groups are assigned to partitions When you use the `groups` parameter as above, by default `run_ml()` will assume that you want all of the observations from each group to be placed in the same @@ -426,7 +426,7 @@ There are several columns: 1. `pvalue`: the probability of obtaining the actual performance value under the null hypothesis. 1. `names`: The feature that was permuted. 1. `method`: The ML method used. -1. `perf_metric_name`: The peformance metric used. +1. `perf_metric_name`: The performance metric used. 1. `seed`: The seed (if set). As you can see here, the differences are negligible (close to zero), which makes diff --git a/vignettes/paper.Rmd b/vignettes/paper.Rmd index 0aafe5aa..11637300 100644 --- a/vignettes/paper.Rmd +++ b/vignettes/paper.Rmd @@ -72,7 +72,7 @@ Machine learning (ML) for classification and prediction based on a set of features is used to make decisions in healthcare, economics, criminal justice and more. However, implementing an ML pipeline including preprocessing, model selection, and evaluation can be time-consuming, confusing, and difficult. Here, -we present [`mikropml`](http://www.schlosslab.org/mikropml/) (prononced +we present [`mikropml`](http://www.schlosslab.org/mikropml/) (pronounced "meek-ROPE em el"), an easy-to-use R package that implements ML pipelines using regression, support vector machines, decision trees, random forest, or gradient-boosted trees. The package is available on diff --git a/vignettes/preprocess.Rmd b/vignettes/preprocess.Rmd index 3b7051c7..8eebee7a 100644 --- a/vignettes/preprocess.Rmd +++ b/vignettes/preprocess.Rmd @@ -81,7 +81,7 @@ preprocess_data(dataset = bin_df, outcome_colname = "outcome") The output is a list: `dat_transformed` which has the transformed data, `grp_feats` which is a list of grouped features, and `removed_feats` which is a -list of featuures that were removed. Here, `grp_feats` is `NULL` because there +list of features that were removed. 
Here, `grp_feats` is `NULL` because there are no perfectly correlated features (e.g. `c(0,1,0)` and `c(0,1,0)`, or `c(0,1,0)` and `c(1,0,1)` - see below for more details). From f8bcec96f473a234c689c9b28df7503759ff0778 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 15:19:22 -0400 Subject: [PATCH 05/10] Make shuffle_groups() & get_difference() internal functions --- R/compare_models.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/R/compare_models.R b/R/compare_models.R index 8cc66b08..5d4844e7 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -1,5 +1,3 @@ -#' Average metric difference -#' #' Calculate the difference in the mean of the metric for two groups #' #' @param sub_data subset of the merged performance data frame for two groups @@ -8,7 +6,7 @@ #' #' @return numeric difference in the average metric between the two groups #' -#' @export +#' @noRd #' @author Courtney Armour, \email{armourc@@umich.edu} #' #' @examples @@ -38,7 +36,7 @@ get_difference <- function(sub_data, group_name, metric) { #' @param col_name column name to shuffle #' #' @return `dat` with the rows of `col_name` shuffled -#' @export +#' @noRd #' @author Courtney R Armour, \email{armourc@@umich.edu} #' #' @examples @@ -62,7 +60,6 @@ shuffle_group <- function(dat, col_name) { return(data_shuffled) } - #' Calculated a permuted p-value comparing two models #' #' @inheritParams compare_models From d147a9960b550ea090cb11701e7371cc62553ad0 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 15:19:35 -0400 Subject: [PATCH 06/10] document() --- NAMESPACE | 2 -- man/compare_models.Rd | 2 +- man/get_difference.Rd | 32 --------------------------- man/otu_mini_bin_results_glmnet.Rd | 4 ++-- man/otu_mini_bin_results_rf.Rd | 4 ++-- man/otu_mini_bin_results_rpart2.Rd | 4 ++-- man/otu_mini_bin_results_svmRadial.Rd | 4 ++-- man/otu_mini_bin_results_xgbTree.Rd | 4 ++-- man/shuffle_group.Rd | 30 ------------------------- 9 files changed, 11 insertions(+), 75 deletions(-) delete mode 100644 man/get_difference.Rd delete mode 100644 man/shuffle_group.Rd diff --git a/NAMESPACE b/NAMESPACE index 8410e1c2..68575aad 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,7 +10,6 @@ export(compare_models) export(contr.ltfr) export(define_cv) export(get_caret_processed_df) -export(get_difference) export(get_feature_importance) export(get_hp_performance) export(get_hyperparams_list) @@ -29,7 +28,6 @@ export(randomize_feature_order) export(remove_singleton_columns) export(replace_spaces) export(run_ml) -export(shuffle_group) export(tidy_perf_data) export(train_model) importFrom(MLmetrics,AUC) diff --git a/man/compare_models.Rd b/man/compare_models.Rd index d2ccab3e..3f821954 100644 --- a/man/compare_models.Rd +++ b/man/compare_models.Rd @@ -17,7 +17,7 @@ compare_models(merged_data, metric, group_name, nperm = 10000) \item{nperm}{number of permutations, default=10000} } \value{ -a table of p-values for all pairs of group varible +a table of p-values for all pairs of group variable } \description{ A wrapper for \code{permute_p_value()}. 
diff --git a/man/get_difference.Rd b/man/get_difference.Rd deleted file mode 100644 index 158e682f..00000000 --- a/man/get_difference.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compare_models.R -\name{get_difference} -\alias{get_difference} -\title{Average metric difference} -\usage{ -get_difference(sub_data, group_name, metric) -} -\arguments{ -\item{sub_data}{subset of the merged performance data frame for two groups} - -\item{group_name}{name of column with group variable} - -\item{metric}{metric to compare} -} -\value{ -numeric difference in the average metric between the two groups -} -\description{ -Calculate the difference in the mean of the metric for two groups -} -\examples{ -df <- dplyr::tibble( - condition = c("a", "a", "b", "b"), - AUC = c(.2, 0.3, 0.8, 0.9) -) -get_difference(df, "condition", "AUC") - -} -\author{ -Courtney Armour, \email{armourc@umich.edu} -} diff --git a/man/otu_mini_bin_results_glmnet.Rd b/man/otu_mini_bin_results_glmnet.Rd index 41c4eb93..2af95b37 100644 --- a/man/otu_mini_bin_results_glmnet.Rd +++ b/man/otu_mini_bin_results_glmnet.Rd @@ -3,7 +3,7 @@ \docType{data} \name{otu_mini_bin_results_glmnet} \alias{otu_mini_bin_results_glmnet} -\title{Results from running the pipline with L2 logistic regression on \code{otu_mini_bin} with feature importance and grouping} +\title{Results from running the pipeline with L2 logistic regression on \code{otu_mini_bin} with feature importance and grouping} \format{ An object of class \code{list} of length 4. } @@ -11,6 +11,6 @@ An object of class \code{list} of length 4. otu_mini_bin_results_glmnet } \description{ -Results from running the pipline with L2 logistic regression on \code{otu_mini_bin} with feature importance and grouping +Results from running the pipeline with L2 logistic regression on \code{otu_mini_bin} with feature importance and grouping } \keyword{datasets} diff --git a/man/otu_mini_bin_results_rf.Rd b/man/otu_mini_bin_results_rf.Rd index 9ea47cac..302a0d6c 100644 --- a/man/otu_mini_bin_results_rf.Rd +++ b/man/otu_mini_bin_results_rf.Rd @@ -3,7 +3,7 @@ \docType{data} \name{otu_mini_bin_results_rf} \alias{otu_mini_bin_results_rf} -\title{Results from running the pipline with random forest on \code{otu_mini_bin}} +\title{Results from running the pipeline with random forest on \code{otu_mini_bin}} \format{ An object of class \code{list} of length 4. } @@ -11,6 +11,6 @@ An object of class \code{list} of length 4. otu_mini_bin_results_rf } \description{ -Results from running the pipline with random forest on \code{otu_mini_bin} +Results from running the pipeline with random forest on \code{otu_mini_bin} } \keyword{datasets} diff --git a/man/otu_mini_bin_results_rpart2.Rd b/man/otu_mini_bin_results_rpart2.Rd index a4ad66ec..72748945 100644 --- a/man/otu_mini_bin_results_rpart2.Rd +++ b/man/otu_mini_bin_results_rpart2.Rd @@ -3,7 +3,7 @@ \docType{data} \name{otu_mini_bin_results_rpart2} \alias{otu_mini_bin_results_rpart2} -\title{Results from running the pipline with rpart2 on \code{otu_mini_bin}} +\title{Results from running the pipeline with rpart2 on \code{otu_mini_bin}} \format{ An object of class \code{list} of length 4. } @@ -11,6 +11,6 @@ An object of class \code{list} of length 4. 
otu_mini_bin_results_rpart2 } \description{ -Results from running the pipline with rpart2 on \code{otu_mini_bin} +Results from running the pipeline with rpart2 on \code{otu_mini_bin} } \keyword{datasets} diff --git a/man/otu_mini_bin_results_svmRadial.Rd b/man/otu_mini_bin_results_svmRadial.Rd index 1180e950..66194ad5 100644 --- a/man/otu_mini_bin_results_svmRadial.Rd +++ b/man/otu_mini_bin_results_svmRadial.Rd @@ -3,7 +3,7 @@ \docType{data} \name{otu_mini_bin_results_svmRadial} \alias{otu_mini_bin_results_svmRadial} -\title{Results from running the pipline with svmRadial on \code{otu_mini_bin}} +\title{Results from running the pipeline with svmRadial on \code{otu_mini_bin}} \format{ An object of class \code{list} of length 4. } @@ -11,6 +11,6 @@ An object of class \code{list} of length 4. otu_mini_bin_results_svmRadial } \description{ -Results from running the pipline with svmRadial on \code{otu_mini_bin} +Results from running the pipeline with svmRadial on \code{otu_mini_bin} } \keyword{datasets} diff --git a/man/otu_mini_bin_results_xgbTree.Rd b/man/otu_mini_bin_results_xgbTree.Rd index a509b3a2..3b193cd9 100644 --- a/man/otu_mini_bin_results_xgbTree.Rd +++ b/man/otu_mini_bin_results_xgbTree.Rd @@ -3,7 +3,7 @@ \docType{data} \name{otu_mini_bin_results_xgbTree} \alias{otu_mini_bin_results_xgbTree} -\title{Results from running the pipline with xbgTree on \code{otu_mini_bin}} +\title{Results from running the pipeline with xbgTree on \code{otu_mini_bin}} \format{ An object of class \code{list} of length 4. } @@ -11,6 +11,6 @@ An object of class \code{list} of length 4. otu_mini_bin_results_xgbTree } \description{ -Results from running the pipline with xbgTree on \code{otu_mini_bin} +Results from running the pipeline with xbgTree on \code{otu_mini_bin} } \keyword{datasets} diff --git a/man/shuffle_group.Rd b/man/shuffle_group.Rd deleted file mode 100644 index 313f4f04..00000000 --- a/man/shuffle_group.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compare_models.R -\name{shuffle_group} -\alias{shuffle_group} -\title{Shuffle the rows in a column} -\usage{ -shuffle_group(dat, col_name) -} -\arguments{ -\item{dat}{a data frame containing \code{col_name}} - -\item{col_name}{column name to shuffle} -} -\value{ -\code{dat} with the rows of \code{col_name} shuffled -} -\description{ -Shuffle the rows in a column -} -\examples{ -set.seed(123) -df <- dplyr::tibble( - condition = c("a", "a", "b", "b"), - AUC = c(.2, 0.3, 0.8, 0.9) -) -shuffle_group(df, "condition") -} -\author{ -Courtney R Armour, \email{armourc@umich.edu} -} From 518bbc8228e6f070671ef1e304cd7d9d32dfe8fc Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 15:24:52 -0400 Subject: [PATCH 07/10] Create model evaluation section of reference page --- _pkgdown.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index b126508f..dc00676f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -29,15 +29,22 @@ reference: - mikropml - preprocess_data - run_ml -- title: Plotting & evalutation helpers +- title: Plotting helpers desc: > - Visualize & evalutate performance to help you tune hyperparameters and choose model methods. + Visualize results to help you tune hyperparameters and choose model methods. contents: - - compare_models - starts_with('plot') - tidy_perf_data - get_hp_performance - combine_hp_performance +- title: Model evaluation + desc: > + Evaluate and interpret models. 
+ contents: + - get_feature_importance + - get_performance_tbl + - compare_models + - permute_p_value - title: Package Data - subtitle: datasets contents: @@ -54,9 +61,8 @@ reference: - replace_spaces - title: Pipeline customization desc: > - These are functions called by preprocess_data() or run_ml(). - We make them available in case you would like to customize various steps - of the pipeline beyond the arguments provided by the main functions. + Customize various steps of the pipeline beyond the arguments provided by + run_ml() and preprocess_data(). contents: - remove_singleton_columns - get_caret_processed_df @@ -70,6 +76,4 @@ reference: - get_perf_metric_fn - train_model - calc_perf_metrics - - get_performance_tbl - - get_feature_importance - group_correlated_features From 5334183cba7c64e31d0f51676514f6b60edcd21e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 19 May 2022 20:05:52 +0000 Subject: [PATCH 08/10] =?UTF-8?q?=F0=9F=93=91=20Build=20docs=20site?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/CODE_OF_CONDUCT.html | 1 - docs/CONTRIBUTING.html | 1 - docs/LICENSE.html | 1 - docs/SUPPORT.html | 1 - docs/articles/introduction.html | 356 ++++----------- docs/articles/paper.html | 415 +++++------------- docs/articles/parallel.html | 164 +++---- docs/articles/preprocess.html | 214 +++------ docs/articles/tuning.html | 162 ++----- docs/index.html | 7 +- docs/news/index.html | 37 +- docs/pkgdown.yml | 4 +- docs/pull_request_template.html | 26 +- docs/reference/compare_models.html | 2 +- docs/reference/get_feature_importance.html | 2 +- docs/reference/get_perf_metric_fn.html | 6 +- docs/reference/index.html | 50 ++- .../otu_mini_bin_results_glmnet.html | 6 +- docs/reference/otu_mini_bin_results_rf.html | 6 +- .../otu_mini_bin_results_rpart2.html | 6 +- .../otu_mini_bin_results_svmRadial.html | 6 +- .../otu_mini_bin_results_xgbTree.html | 6 +- 22 files changed, 423 insertions(+), 1056 deletions(-) diff --git a/docs/CODE_OF_CONDUCT.html b/docs/CODE_OF_CONDUCT.html index 5bb0afd6..356d405e 100644 --- a/docs/CODE_OF_CONDUCT.html +++ b/docs/CODE_OF_CONDUCT.html @@ -73,7 +73,6 @@

Contributor Covenant Code of Conduct

-

This document was adapted from the Tidyverse Code of Conduct.

Our Pledge

diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html index 4cc444a7..cfcb2766 100644 --- a/docs/CONTRIBUTING.html +++ b/docs/CONTRIBUTING.html @@ -73,7 +73,6 @@

Contributing to mikropml

-

This document was adapted from the Tidyverse Contributing guide.

Fixing typos

diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 2d2d71ff..a38ff1d8 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -73,7 +73,6 @@

MIT License

-

Copyright (c) 2019-2021 Begüm D. Topçuoğlu, Zena Lapp, Kelly L. Sovacool, Evan Snitkin, Jenna Wiens, and Patrick D. Schloss

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

diff --git a/docs/SUPPORT.html b/docs/SUPPORT.html index e80f09e2..31683945 100644 --- a/docs/SUPPORT.html +++ b/docs/SUPPORT.html @@ -73,7 +73,6 @@

Getting help with mikropml

-

Thanks for using mikropml! Before filing an issue, there are a few places to explore and pieces to put together to make the process as smooth as possible.

Make a reprex

diff --git a/docs/articles/introduction.html b/docs/articles/introduction.html index 57ba736e..8466437c 100644 --- a/docs/articles/introduction.html +++ b/docs/articles/introduction.html @@ -100,7 +100,7 @@ -
+

Before running ML

-

Before you execute run_ml(), you should consider -preprocessing your data, either on your own or with the -preprocess_data() function. You can learn more about this -in the preprocessing vignette: vignette("preprocess").

+

Before you execute run_ml(), you should consider preprocessing your data, either on your own or with the preprocess_data() function. You can learn more about this in the preprocessing vignette: vignette("preprocess").
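A hedged sketch of that workflow (the object name `prep` is illustrative; see the preprocessing vignette for the available options):

prep <- preprocess_data(dataset = otu_mini_bin, outcome_colname = "dx")
# dat_transformed holds the processed features; grp_feats and removed_feats
# record any grouped or removed columns
dat_processed <- prep$dat_transformed
results <- run_ml(dat_processed, "glmnet", outcome_colname = "dx", seed = 2019)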

The simplest way to run_ml()

-

As mentioned above, the minimal input is your dataset -(dataset) and the machine learning model you want to use -(method).

+

As mentioned above, the minimal input is your dataset (dataset) and the machine learning model you want to use (method).

You may also want to provide:

    -
  • The outcome column name. By default run_ml() will pick -the first column, but it’s best practice to specify the column name -explicitly.
  • -
  • A seed so that the results will be reproducible, and so that you get -the same results as those you see here (i.e have the same train/test -split).
  • +
  • The outcome column name. By default run_ml() will pick the first column, but it’s best practice to specify the column name explicitly.
  • +
  • A seed so that the results will be reproducible, and so that you get the same results as those you see here (i.e have the same train/test split).
-

Say we want to use logistic regression, then the method we will use -is glmnet. To do so, run the ML pipeline with:

+

Say we want to use logistic regression, then the method we will use is glmnet. To do so, run the ML pipeline with:

 results <- run_ml(otu_mini_bin,
                   'glmnet',
@@ -253,25 +200,16 @@ 

The simplest way to run_ml()= 2019)

You’ll notice a few things:

    -
  1. It takes a little while to run. This is because of some of the -parameters we use.
  2. -
  3. There is a message stating that ‘dx’ is being used as the outcome -column. This is what we want, but it’s a nice sanity check!
  4. -
  5. There was a warning. Don’t worry about this warning right now - it -just means that some of the hyperparameters aren’t a good fit - but if -you’re interested in learning more, see -vignette("tuning").
  6. +
  7. It takes a little while to run. This is because of some of the parameters we use.
  8. +
  9. There is a message stating that ‘dx’ is being used as the outcome column. This is what we want, but it’s a nice sanity check!
  10. +
  11. There was a warning. Don’t worry about this warning right now - it just means that some of the hyperparameters aren’t a good fit - but if you’re interested in learning more, see vignette("tuning").
-

Now, let’s dig into the output a bit. The results is a list of 4 -things:

+

Now, let’s dig into the output a bit. The results is a list of 4 things:

 names(results)
 #> [1] "trained_model"      "test_data"          "performance"       
 #> [4] "feature_importance"
-

trained_model is the trained model from -caret. There is a bunch of info in this that we won’t get -into, because you can learn more from the caret::train() -documentation.

+

trained_model is the trained model from caret. There is a bunch of info in this that we won’t get into, because you can learn more from the caret::train() documentation.

 names(results$trained_model)
 #>  [1] "method"       "modelInfo"    "modelType"    "results"      "pred"        
@@ -279,13 +217,7 @@ 

The simplest way to run_ml()#> [11] "finalModel" "preProcess" "trainingData" "ptype" "resample" #> [16] "resampledCM" "perfNames" "maximize" "yLimits" "times" #> [21] "levels" "terms" "coefnames" "xlevels"

-

test_data is the partition of the dataset that was used -for testing. In machine learning, it’s always important to have a -held-out test dataset that is not used in the training stage. In this -pipeline we do that using run_ml() where we split your data -into training and testing sets. The training data are used to build the -model (e.g. tune hyperparameters, learn the data) and the test data are -used to evaluate how well the model performs.

+

test_data is the partition of the dataset that was used for testing. In machine learning, it’s always important to have a held-out test dataset that is not used in the training stage. In this pipeline we do that using run_ml() where we split your data into training and testing sets. The training data are used to build the model (e.g. tune hyperparameters, learn the data) and the test data are used to evaluate how well the model performs.

 head(results$test_data)
 #>        dx Otu00009 Otu00005 Otu00010 Otu00001 Otu00008 Otu00004 Otu00003
@@ -302,10 +234,7 @@ 

The simplest way to run_ml()#> 17 357 253 341 #> 27 25 322 5 #> 30 179 6 30

-

performance is a dataframe of (mainly) performance -metrics (1 column for cross-validation performance metric, several for -test performance metrics, and 2 columns at the end with ML method and -seed):

+

performance is a dataframe of (mainly) performance metrics (1 column for cross-validation performance metric, several for test performance metrics, and 2 columns at the end with ML method and seed):

-

When using logistic regression for binary classification, area under -the receiver-operator characteristic curve (AUC) is a useful metric to -evaluate model performance. Because of that, it’s the default that we -use for mikropml. However, it is crucial to evaluate your -model performance using multiple metrics. Below you can find more -information about other performance metrics and how to use them in our -package.

-

cv_metric_AUC is the AUC for the cross-validation folds -for the training data. This gives us a sense of how well the model -performs on the training data.

-

Most of the other columns are performance metrics for the test data — -the data that wasn’t used to build the model. Here, you can see that the -AUC for the test data is not much above 0.5, suggesting that this model -does not predict much better than chance, and that the model is overfit -because the cross-validation AUC (cv_metric_AUC, measured -during training) is much higher than the testing AUC. This isn’t too -surprising since we’re using so few features with this example dataset, -so don’t be discouraged. The default option also provides a number of -other performance metrics that you might be interested in, including -area under the precision-recall curve (prAUC).

-

The last columns of results$performance are the method -and seed (if you set one) to help with combining results from multiple -runs (see vignette("parallel")).

-

feature_importance has information about feature -importance values if find_feature_importance = TRUE (the -default is FALSE). Since we used the defaults, there’s -nothing here:

+

When using logistic regression for binary classification, area under the receiver-operator characteristic curve (AUC) is a useful metric to evaluate model performance. Because of that, it’s the default that we use for mikropml. However, it is crucial to evaluate your model performance using multiple metrics. Below you can find more information about other performance metrics and how to use them in our package.

+

cv_metric_AUC is the AUC for the cross-validation folds for the training data. This gives us a sense of how well the model performs on the training data.

+

Most of the other columns are performance metrics for the test data — the data that wasn’t used to build the model. Here, you can see that the AUC for the test data is not much above 0.5, suggesting that this model does not predict much better than chance, and that the model is overfit because the cross-validation AUC (cv_metric_AUC, measured during training) is much higher than the testing AUC. This isn’t too surprising since we’re using so few features with this example dataset, so don’t be discouraged. The default option also provides a number of other performance metrics that you might be interested in, including area under the precision-recall curve (prAUC).

+

The last columns of results$performance are the method and seed (if you set one) to help with combining results from multiple runs (see vignette("parallel")).
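For example (an illustrative sketch, not part of this changeset), performance tables from several seeds can be row-bound before plotting or comparing models:

# repeat the pipeline over several seeds and combine the performance rows
perf_all <- dplyr::bind_rows(
  lapply(1:5, function(s) {
    run_ml(otu_mini_bin, "glmnet", outcome_colname = "dx", seed = s)$performance
  })
)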

+

feature_importance has information about feature importance values if find_feature_importance = TRUE (the default is FALSE). Since we used the defaults, there’s nothing here:

 results$feature_importance
 #> [1] "Skipped feature importance"
@@ -349,24 +256,17 @@

The simplest way to run_ml()

Customizing parameters

-

There are a few arguments that allow you to change how you execute -run_ml(). We’ve chosen reasonable defaults for you, but we -encourage you to change these if you think something else would be -better for your data.

+

There are a few arguments that allow you to change how you execute run_ml(). We’ve chosen reasonable defaults for you, but we encourage you to change these if you think something else would be better for your data.

-

Changing kfold, cv_times, and -training_frac +

Changing kfold, cv_times, and training_frac

  • -kfold: The number of folds to run for cross-validation -(default: 5).
  • +kfold: The number of folds to run for cross-validation (default: 5).
  • -cv_times: The number of times to run repeated -cross-validation (default: 100).
  • +cv_times: The number of times to run repeated cross-validation (default: 100).
  • -training_frac: The fraction of data for the training -set (default: 0.8). The rest of the data is used for testing.
  • +training_frac: The fraction of data for the training set (default: 0.8). The rest of the data is used for testing.

Here’s an example where we change some of the default parameters:

@@ -391,21 +291,11 @@ 

Changing kfold, #> #> This warning usually means that the model didn't converge in some cross-validation folds because it is predicting something close to a constant. As a result, certain performance metrics can't be calculated. This suggests that some of the hyperparameters chosen are doing very poorly. #> Training complete.

-

You might have noticed that this one ran faster — that’s because we -reduced kfold and cv_times. This is okay for -testing things out and may even be necessary for smaller datasets. But -in general it may be better to have larger numbers for these parameters; -we think the defaults are a good starting point (Topçuoğlu et al. 2020).

+

You might have noticed that this one ran faster — that’s because we reduced kfold and cv_times. This is okay for testing things out and may even be necessary for smaller datasets. But in general it may be better to have larger numbers for these parameters; we think the defaults are a good starting point (Topçuoğlu et al. 2020).

Custom training indices

-

When training_frac is a fraction between 0 and 1, a -random sample of observations in the dataset are chosen for the training -set to satisfy the training_frac. However, in some cases -you might wish to control exactly which observations are in the training -set. You can instead assign training_frac a vector of -indices that correspond to which rows of the dataset should go in the -training set (all remaining sequences will go in the testing set).

+

When training_frac is a fraction between 0 and 1, a random sample of observations in the dataset are chosen for the training set to satisfy the training_frac. However, in some cases you might wish to control exactly which observations are in the training set. You can instead assign training_frac a vector of indices that correspond to which rows of the dataset should go in the training set (all remaining sequences will go in the testing set).

 n_obs <- otu_mini_bin %>% nrow()
 training_size <- 0.8 * n_obs
@@ -427,20 +317,10 @@ 

Custom training indices

Changing the performance metric

-

There are two arguments that allow you to change what performance -metric to use for model evaluation, and what performance metrics to -calculate using the test data.

-

perf_metric_function is the function used to calculate -the performance metrics.

-

The default for classification is -caret::multiClassSummary() and the default for regression -is caret::defaultSummary(). We’d suggest not changing this -unless you really know what you’re doing.

-

perf_metric_name is the column name from the output of -perf_metric_function. We chose reasonable defaults (AUC for -binary, logLoss for multiclass, and RMSE for continuous), but the -default functions calculate a bunch of different performance metrics, so -you can choose a different one if you’d like.

+

There are two arguments that allow you to change what performance metric to use for model evaluation, and what performance metrics to calculate using the test data.

+

perf_metric_function is the function used to calculate the performance metrics.

+

The default for classification is caret::multiClassSummary() and the default for regression is caret::defaultSummary(). We’d suggest not changing this unless you really know what you’re doing.

+

perf_metric_name is the column name from the output of perf_metric_function. We chose reasonable defaults (AUC for binary, logLoss for multiclass, and RMSE for continuous), but the default functions calculate a bunch of different performance metrics, so you can choose a different one if you’d like.
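For instance (a hedged sketch; the corresponding code chunk is not shown in this diff), optimizing precision-recall AUC instead of AUC might look like:

results_pr <- run_ml(otu_mini_bin, "glmnet",
                     outcome_colname = "dx",
                     cv_times = 5,
                     perf_metric_name = "prAUC",
                     seed = 2019)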

The default performance metrics available for classification are:

#>  [1] "logLoss"                "AUC"                    "prAUC"                 
 #>  [4] "Accuracy"               "Kappa"                  "Mean_F1"               
@@ -464,8 +344,7 @@ 

Changing the performance metric#> #> This warning usually means that the model didn't converge in some cross-validation folds because it is predicting something close to a constant. As a result, certain performance metrics can't be calculated. This suggests that some of the hyperparameters chosen are doing very poorly. #> Training complete.

-

You’ll see that the cross-validation metric is prAUC, instead of the -default AUC:

+

You’ll see that the cross-validation metric is prAUC, instead of the default AUC:

 results_pr$performance
 #> # A tibble: 1 × 17
@@ -479,16 +358,8 @@ 

Changing the performance metric

Using groups

-

The optional groups is a vector of groups to keep -together when splitting the data into train and test sets and for -cross-validation. Sometimes it’s important to split up the data based on -a grouping instead of just randomly. This allows you to control for -similarities within groups that you don’t want to skew your predictions -(i.e. batch effects). For example, with biological data you may have -samples collected from multiple hospitals, and you might like to keep -observations from the same hospital in the same partition.

-

Here’s an example where we split the data into train/test sets based -on groups:

+

The optional groups is a vector of groups to keep together when splitting the data into train and test sets and for cross-validation. Sometimes it’s important to split up the data based on a grouping instead of just randomly. This allows you to control for similarities within groups that you don’t want to skew your predictions (i.e. batch effects). For example, with biological data you may have samples collected from multiple hospitals, and you might like to keep observations from the same hospital in the same partition.

+

Here’s an example where we split the data into train/test sets based on groups:
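A hedged sketch of such a call (not shown in this diff; the group labels below are made up for illustration):

# assign each observation to one of eight made-up batches, then keep
# observations from the same batch together across the train/test split
set.seed(2019)
grps <- sample(LETTERS[1:8], nrow(otu_mini_bin), replace = TRUE)
results_grp <- run_ml(otu_mini_bin, "glmnet",
                      outcome_colname = "dx",
                      groups = grps,
                      seed = 2019)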

-

The one difference here is run_ml() will report how much -of the data is in the training set if you run the above code chunk. This -can be a little finicky depending on how many samples and groups you -have. This is because it won’t be exactly what you specify with -training_frac, since you have to include all of one group -in either the training set or the test set.

+

The one difference here is run_ml() will report how much of the data is in the training set if you run the above code chunk. This can be a little finicky depending on how many samples and groups you have. This is because it won’t be exactly what you specify with training_frac, since you have to include all of one group in either the training set or the test set.

-

Controling how groups are assigned to partitions +

Controlling how groups are assigned to partitions

-

When you use the groups parameter as above, by default -run_ml() will assume that you want all of the observations -from each group to be placed in the same partition of the train/test -split. This makes sense when you want to use groups to control for batch -effects. However, in some cases you might prefer to control exactly -which groups end up in which partition, and you might even be okay with -some observations from the same group being assigned to different -partitions.

-

For example, say you want groups A and B to be used for training, C -and D for testing, and you don’t have a preference for what happens to -the other groups. You can give the group_partitions -parameter a named list to specify which groups should go in the training -set and which should go in the testing set.

+

When you use the groups parameter as above, by default run_ml() will assume that you want all of the observations from each group to be placed in the same partition of the train/test split. This makes sense when you want to use groups to control for batch effects. However, in some cases you might prefer to control exactly which groups end up in which partition, and you might even be okay with some observations from the same group being assigned to different partitions.

+

For example, say you want groups A and B to be used for training, C and D for testing, and you don’t have a preference for what happens to the other groups. You can give the group_partitions parameter a named list to specify which groups should go in the training set and which should go in the testing set.

 results_grp_part <- run_ml(otu_mini_bin, 
                       'glmnet', 
@@ -545,13 +400,8 @@ 

Controling how groups #> Groups will not be kept together in CV partitions because the number of groups in the training set is not larger than `kfold` #> Training the model... #> Training complete.

-

In the above case, all observations from A & B will be used for -training, all from C & D will be used for testing, and the remaining -groups will be randomly assigned to one or the other to satisfy the -training_frac as closely as possible.

-

In another scenario, maybe you want only groups A through F to be -used for training, but you also want to allow other observations not -selected for training from A through F to be used for testing:

+

In the above case, all observations from A & B will be used for training, all from C & D will be used for testing, and the remaining groups will be randomly assigned to one or the other to satisfy the training_frac as closely as possible.

+

In another scenario, maybe you want only groups A through F to be used for training, but you also want to allow other observations not selected for training from A through F to be used for testing:

 results_grp_trainA <- run_ml(otu_mini_bin, 
                       'glmnet', 
@@ -570,24 +420,14 @@ 

Controling how groups #> Groups will be kept together in CV partitions #> Training the model... #> Training complete.

-

If you need even more control than this, take a look at setting custom training indices. You -might also prefer to provide your own train control scheme with the -cross_val parameter in run_ml().

+

If you need even more control than this, take a look at setting custom training indices. You might also prefer to provide your own train control scheme with the cross_val parameter in run_ml().

Finding feature importance

-

To find which features are contributing to predictive power, you can -use find_feature_importance = TRUE. How we use permutation -importance to determine feature importance is described in (Topçuoğlu et al. 2020). Briefly, it permutes -each of the features individually (or correlated ones together) and -evaluates how much the performance metric decreases. The more -performance decreases when the feature is randomly shuffled, the more -important that feature is. The default is FALSE because it -takes a while to run and is only useful if you want to know what -features are important in predicting your outcome.

+

To find which features are contributing to predictive power, you can use find_feature_importance = TRUE. How we use permutation importance to determine feature importance is described in (Topçuoğlu et al. 2020). Briefly, it permutes each of the features individually (or correlated ones together) and evaluates how much the performance metric decreases. The more performance decreases when the feature is randomly shuffled, the more important that feature is. The default is FALSE because it takes a while to run and is only useful if you want to know what features are important in predicting your outcome.

Let’s look at some feature importance results:

 results_imp <- run_ml(otu_mini_bin,
@@ -624,34 +464,22 @@ 

Finding feature importanceThere are several columns:

  1. -perf_metric: The performance value of the permuted -feature.
  2. +perf_metric: The performance value of the permuted feature.
  3. -perf_metric_diff: The difference between the -performance for the actual and permuted data (i.e. test performance -minus permuted performance). Features with a larger -perf_metric_diff are more important.
  4. +perf_metric_diff: The difference between the performance for the actual and permuted data (i.e. test performance minus permuted performance). Features with a larger perf_metric_diff are more important.
  5. -pvalue: the probability of obtaining the actual -performance value under the null hypothesis.
  6. +pvalue: the probability of obtaining the actual performance value under the null hypothesis.
  7. names: The feature that was permuted.
  8. method: The ML method used.
  9. -perf_metric_name: The peformance metric used.
  10. +perf_metric_name: The performance metric used.
  11. seed: The seed (if set).
-

As you can see here, the differences are negligible (close to zero), -which makes sense since our model isn’t great. If you’re interested in -feature importance, it’s especially useful to run multiple different -train/test splits, as shown in our example -snakemake workflow.

-

You can also choose to permute correlated features together using -corr_thresh (default: 1). Any features that are above the -correlation threshold are permuted together; i.e. perfectly correlated -features are permuted together when using the default value.

+

As you can see here, the differences are negligible (close to zero), which makes sense since our model isn’t great. If you’re interested in feature importance, it’s especially useful to run multiple different train/test splits, as shown in our example snakemake workflow.

+

You can also choose to permute correlated features together using corr_thresh (default: 1). Any features that are above the correlation threshold are permuted together; i.e. perfectly correlated features are permuted together when using the default value.

 results_imp_corr <- run_ml(otu_mini_bin,
                            'glmnet',
@@ -682,32 +510,19 @@ 

Finding feature importance#> 1 glmnet AUC 2019 #> 2 glmnet AUC 2019 #> 3 glmnet AUC 2019

-

You can see which features were permuted together in the -names column. Here all 3 features were permuted together -(which doesn’t really make sense, but it’s just an example).

-

If you previously executed run_ml() without feature -importance but now wish to find feature importance after the fact, see -the example code in the get_feature_importance() -documentation.

-

get_feature_importance() can show a live progress bar, -see vignette("parallel") for examples.

+

You can see which features were permuted together in the names column. Here all 3 features were permuted together (which doesn’t really make sense, but it’s just an example).

+

If you previously executed run_ml() without feature importance but now wish to find feature importance after the fact, see the example code in the get_feature_importance() documentation.

+

get_feature_importance() can show a live progress bar, see vignette("parallel") for examples.

-

Tuning hyperparameters (using the hyperparameter -argument) +

Tuning hyperparameters (using the hyperparameter argument)

-

This is important, so we have a whole vignette about them. The bottom -line is we provide default hyperparameters that you can start with, but -it’s important to tune your hyperparameters. For more information about -what the default hyperparameters are, and how to tune hyperparameters, -see vignette("tuning").

+

This is important, so we have a whole vignette about them. The bottom line is we provide default hyperparameters that you can start with, but it’s important to tune your hyperparameters. For more information about what the default hyperparameters are, and how to tune hyperparameters, see vignette("tuning").
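As a quick orientation before turning to the tuning vignette (a hedged sketch, not part of this diff), a custom grid can be supplied through the hyperparameters argument:

results_hp <- run_ml(otu_mini_bin, "glmnet",
                     outcome_colname = "dx",
                     hyperparameters = list(alpha = 0, lambda = c(1e-3, 1e-2, 1e-1)),
                     seed = 2019)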

Other models

-

Here are examples of how to train and evaluate other models. The -output for all of them is very similar, so we won’t go into those -details.

+

Here are examples of how to train and evaluate other models. The output for all of them is very similar, so we won’t go into those details.

Random forest

@@ -716,11 +531,7 @@

Random forest'rf', cv_times = 5, seed = 2019)

-

You can also change the number of trees to use for random forest -(ntree; default: 1000). This can’t be tuned using -rf package implementation of random forest. Please refer to -caret documentation if you are interested in other packages -with random forest implementations.

+

You can also change the number of trees to use for random forest (ntree; default: 1000). This can’t be tuned using rf package implementation of random forest. Please refer to caret documentation if you are interested in other packages with random forest implementations.

 results_rf_nt <- run_ml(otu_mini_bin,
                         'rf',
@@ -745,8 +556,7 @@ 

SVM 'svmRadial', cv_times = 5, seed = 2019)

-

If you get a message “maximum number of iterations reached”, see this issue in -caret.

+

If you get a message “maximum number of iterations reached”, see this issue in caret.

@@ -755,8 +565,7 @@

Other data

Multiclass data

-

We provide otu_mini_multi with a multiclass outcome -(three or more outcomes):

+

We provide otu_mini_multi with a multiclass outcome (three or more outcomes):

 otu_mini_multi %>% dplyr::pull('dx') %>% unique()
 #> [1] "adenoma"   "carcinoma" "normal"
@@ -766,8 +575,7 @@

Multiclass data= "dx", seed = 2019 )

-

The performance metrics are slightly different, but the format of -everything else is the same:

+

The performance metrics are slightly different, but the format of everything else is the same:

 results_multi$performance
 #> # A tibble: 1 × 17
@@ -782,15 +590,13 @@ 

Multiclass data

Continuous data

-

And here’s an example for running continuous data, where the outcome -column is numerical:

+

And here’s an example for running continuous data, where the outcome column is numerical:

 results_cont <- run_ml(otu_mini_bin[, 2:11],
                        'glmnet',
                        outcome_colname = 'Otu00001',
                        seed = 2019)
-

Again, the performance metrics are slightly different, but the format -of the rest is the same:

+

Again, the performance metrics are slightly different, but the format of the rest is the same:

 results_cont$performance
 #> # A tibble: 1 × 6
@@ -800,22 +606,14 @@ 

Continuous data

-

References +

References

-
-
-Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael -W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR -Analyses with FIDDLE: A Flexible Data-Driven Preprocessing -Pipeline for Structured Clinical Data.” J Am Med Inform -Assoc, October. https://doi.org/10.1093/jamia/ocaa139. -
-
-Topçuoğlu, Begüm D., Nicholas A. Lesniak, Mack T. Ruffin, Jenna Wiens, -and Patrick D. Schloss. 2020. “A Framework for -Effective Application of Machine Learning to -Microbiome-Based Classification -Problems.” mBio 11 (3). https://doi.org/10.1128/mBio.00434-20. +
+
+

Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR Analyses with FIDDLE: A Flexible Data-Driven Preprocessing Pipeline for Structured Clinical Data.” J Am Med Inform Assoc, October. https://doi.org/10.1093/jamia/ocaa139.

+
+
+

Topçuoğlu, Begüm D., Nicholas A. Lesniak, Mack T. Ruffin, Jenna Wiens, and Patrick D. Schloss. 2020. “A Framework for Effective Application of Machine Learning to Microbiome-Based Classification Problems.” mBio 11 (3). https://doi.org/10.1128/mBio.00434-20.

diff --git a/docs/articles/paper.html b/docs/articles/paper.html index 22fa9401..3dccf33d 100644 --- a/docs/articles/paper.html +++ b/docs/articles/paper.html @@ -100,14 +100,11 @@ -
+

Statement of need

-

Most applications of machine learning (ML) require reproducible steps -for data pre-processing, cross-validation, testing, model evaluation, -and often interpretation of why the model makes particular predictions. -Performing these steps is important, as failure to implement them can -result in incorrect and misleading results (Teschendorff 2019; Wiens et al. 2019).

-

Supervised ML is widely used to recognize patterns in large datasets -and to make predictions about outcomes of interest. Several packages -including caret (Kuhn 2008) -and tidymodels (Kuhn, Wickham, and -RStudio 2020) in R, scikitlearn (Pedregosa et al. 2011) in Python, and the H2O -autoML platform (H2O.ai 2020) -allow scientists to train ML models with a variety of algorithms. While -these packages provide the tools necessary for each ML step, they do not -implement a complete ML pipeline according to good practices in the -literature. This makes it difficult for practitioners new to ML to -easily begin to perform ML analyses.

-

To enable a broader range of researchers to apply ML to their problem -domains, we created mikropml, -an easy-to-use R package (R Core Team -2020) that implements the ML pipeline created by Topçuoğlu et -al. (Topçuoğlu et al. 2020) in a -single function that returns a trained model, model performance metrics -and feature importance. mikropml leverages the -caret package to support several ML algorithms: linear -regression, logistic regression, support vector machines with a radial -basis kernel, decision trees, random forest, and gradient boosted trees. -It incorporates good practices in ML training, testing, and model -evaluation (Topçuoğlu et al. 2020; Teschendorff -2019). Furthermore, it provides data preprocessing steps based on -the FIDDLE (FlexIble Data-Driven pipeLinE) framework outlined in Tang -et al. (Tang et al. 2020) and -post-training permutation importance steps to estimate the importance of -each feature in the models trained (Breiman 2001; -Fisher, Rudin, and Dominici 2018).

-

mikropml can be used as a starting point in the -application of ML to datasets from many different fields. It has already -been applied to microbiome data to categorize patients with colorectal -cancer (Topçuoğlu et al. 2020), to -identify differences in genomic and clinical features associated with -bacterial infections (Lapp et al. 2020), -and to predict gender-based biases in academic publishing (Hagan et al. 2020).

+

Most applications of machine learning (ML) require reproducible steps for data pre-processing, cross-validation, testing, model evaluation, and often interpretation of why the model makes particular predictions. Performing these steps is important, as failure to implement them can result in incorrect and misleading results (Teschendorff 2019; Wiens et al. 2019).

+

Supervised ML is widely used to recognize patterns in large datasets and to make predictions about outcomes of interest. Several packages including caret (Kuhn 2008) and tidymodels (Kuhn, Wickham, and RStudio 2020) in R, scikitlearn (Pedregosa et al. 2011) in Python, and the H2O autoML platform (H2O.ai 2020) allow scientists to train ML models with a variety of algorithms. While these packages provide the tools necessary for each ML step, they do not implement a complete ML pipeline according to good practices in the literature. This makes it difficult for practitioners new to ML to easily begin to perform ML analyses.

+

To enable a broader range of researchers to apply ML to their problem domains, we created mikropml, an easy-to-use R package (R Core Team 2020) that implements the ML pipeline created by Topçuoğlu et al. (Topçuoğlu et al. 2020) in a single function that returns a trained model, model performance metrics and feature importance. mikropml leverages the caret package to support several ML algorithms: linear regression, logistic regression, support vector machines with a radial basis kernel, decision trees, random forest, and gradient boosted trees. It incorporates good practices in ML training, testing, and model evaluation (Topçuoğlu et al. 2020; Teschendorff 2019). Furthermore, it provides data preprocessing steps based on the FIDDLE (FlexIble Data-Driven pipeLinE) framework outlined in Tang et al. (Tang et al. 2020) and post-training permutation importance steps to estimate the importance of each feature in the models trained (Breiman 2001; Fisher, Rudin, and Dominici 2018).

+

mikropml can be used as a starting point in the application of ML to datasets from many different fields. It has already been applied to microbiome data to categorize patients with colorectal cancer (Topçuoğlu et al. 2020), to identify differences in genomic and clinical features associated with bacterial infections (Lapp et al. 2020), and to predict gender-based biases in academic publishing (Hagan et al. 2020).

mikropml package

-

The mikropml package includes functionality to -preprocess the data, train ML models, evaluate model performance, and -quantify feature importance (Figure 1). We also provide vignettes -and an example -Snakemake workflow (Köster and Rahmann -2012) to showcase how to run an ideal ML pipeline with multiple -different train/test data splits. The results can be visualized using -helper functions that use ggplot2 (Wickham 2016).

-

While mikropml allows users to get started quickly and facilitates -reproducibility, it is not a replacement for understanding the ML -workflow which is still necessary when interpreting results (Pollard et al. 2019). To facilitate -understanding and enable one to tailor the code to their application, we -have heavily commented the code and have provided supporting -documentation which can be read online.

+

The mikropml package includes functionality to preprocess the data, train ML models, evaluate model performance, and quantify feature importance (Figure 1). We also provide vignettes and an example Snakemake workflow (Köster and Rahmann 2012) to showcase how to run an ideal ML pipeline with multiple different train/test data splits. The results can be visualized using helper functions that use ggplot2 (Wickham 2016).

+

While mikropml allows users to get started quickly and facilitates reproducibility, it is not a replacement for understanding the ML workflow which is still necessary when interpreting results (Pollard et al. 2019). To facilitate understanding and enable one to tailor the code to their application, we have heavily commented the code and have provided supporting documentation which can be read online.

Preprocessing data

-

We provide the function preprocess_data() to preprocess -features using several different functions from the caret -package. preprocess_data() takes continuous and categorical -data, re-factors categorical data into binary features, and provides -options to normalize continuous data, remove features with near-zero -variance, and keep only one instance of perfectly correlated features. -We set the default options based on those implemented in FIDDLE (Tang et al. 2020). More details on how to use -preprocess_data() can be found in the accompanying vignette.

+

We provide the function preprocess_data() to preprocess features using several different functions from the caret package. preprocess_data() takes continuous and categorical data, re-factors categorical data into binary features, and provides options to normalize continuous data, remove features with near-zero variance, and keep only one instance of perfectly correlated features. We set the default options based on those implemented in FIDDLE (Tang et al. 2020). More details on how to use preprocess_data() can be found in the accompanying vignette.
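A minimal sketch of a typical call, using the package's small example dataset otu_mini_bin (outcome column dx) as in the vignettes; preprocess_data() returns a list, and the processed features live in the dat_transformed element:

library(mikropml)
# re-factor categorical features, normalize continuous ones, drop near-zero-variance columns
prep <- preprocess_data(dataset = otu_mini_bin, outcome_colname = "dx")
dat_processed <- prep$dat_transformed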

Running ML

-

The main function in mikropml, run_ml(), minimally takes -in the model choice and a data frame with an outcome column and feature -columns. For model choice, mikropml currently supports -logistic and linear regression (glmnet: Friedman, Hastie, and Tibshirani -2010), support vector machines with a radial basis kernel (kernlab: Karatzoglou et al. 2004), -decision trees (rpart: Therneau et -al. 2019), random forest (randomForest: Liaw and Wiener -2002), and gradient-boosted trees (xgboost: Chen et al. 2020). -run_ml() randomly splits the data into train and test sets -while maintaining the distribution of the outcomes found in the full -dataset. It also provides the option to split the data into train and -test sets based on categorical variables (e.g. batch, geographic -location, etc.). mikropml uses the caret -package (Kuhn 2008) to train and evaluate -the models, and optionally quantifies feature importance. The output -includes the best model built based on tuning hyperparameters in an -internal and repeated cross-validation step, model evaluation metrics, -and optional feature importances. Feature importances are calculated -using a permutation test, which breaks the relationship between the -feature and the true outcome in the test data, and measures the change -in model performance. This provides an intuitive metric of how -individual features influence model performance and is comparable across -model types, which is particularly useful for model interpretation (Topçuoğlu et al. 2020). Our introductory -vignette contains a comprehensive tutorial on how to use -run_ml().

+

The main function in mikropml, run_ml(), minimally takes in the model choice and a data frame with an outcome column and feature columns. For model choice, mikropml currently supports logistic and linear regression (glmnet: Friedman, Hastie, and Tibshirani 2010), support vector machines with a radial basis kernel (kernlab: Karatzoglou et al. 2004), decision trees (rpart: Therneau et al. 2019), random forest (randomForest: Liaw and Wiener 2002), and gradient-boosted trees (xgboost: Chen et al. 2020). run_ml() randomly splits the data into train and test sets while maintaining the distribution of the outcomes found in the full dataset. It also provides the option to split the data into train and test sets based on categorical variables (e.g. batch, geographic location, etc.). mikropml uses the caret package (Kuhn 2008) to train and evaluate the models, and optionally quantifies feature importance. The output includes the best model built based on tuning hyperparameters in an internal and repeated cross-validation step, model evaluation metrics, and optional feature importances. Feature importances are calculated using a permutation test, which breaks the relationship between the feature and the true outcome in the test data, and measures the change in model performance. This provides an intuitive metric of how individual features influence model performance and is comparable across model types, which is particularly useful for model interpretation (Topçuoğlu et al. 2020). Our introductory vignette contains a comprehensive tutorial on how to use run_ml().
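As a rough sketch of that minimal interface (using the package's example data and a fixed seed; the element names follow the package documentation):

library(mikropml)
# split the data, tune hyperparameters with cross-validation, and evaluate on the held-out set
results <- run_ml(otu_mini_bin, "glmnet", seed = 2019)
results$performance    # test-set performance metrics as a one-row tibble
results$trained_model  # best model from the internal cross-validation
# feature importances are returned as well when importance is requested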

mikropml pipeline

-

Ideal workflow for running mikropml with many different train/test -splits +

Ideal workflow for running mikropml with many different train/test splits

-

To investigate the variation in model performance depending on the -train and test set used (Topçuoğlu et al. 2020; -Lapp et al. 2020), we provide examples of how to -run_ml() many times with different train/test splits and -how to get summary information about model performance on a local -computer or on a high-performance computing cluster using a Snakemake -workflow.

+

To investigate the variation in model performance depending on the train and test set used (Topçuoğlu et al. 2020; Lapp et al. 2020), we provide examples of how to run_ml() many times with different train/test splits and how to get summary information about model performance on a local computer or on a high-performance computing cluster using a Snakemake workflow.

Tuning & visualization

-

One particularly important aspect of ML is hyperparameter tuning. We -provide a reasonable range of default hyperparameters for each model -type. However practitioners should explore whether that range is -appropriate for their data, or if they should customize the -hyperparameter range. Therefore, we provide a function -plot_hp_performance() to plot the cross-validation -performance metric of a single model or models built using different -train/test splits. This helps evaluate if the hyperparameter range is -being searched exhaustively and allows the user to pick the ideal set. -We also provide summary plots of test performance metrics for the many -train/test splits with different models using -plot_model_performance(). Examples are described in the -accompanying vignette -on hyperparameter tuning.

+

One particularly important aspect of ML is hyperparameter tuning. We provide a reasonable range of default hyperparameters for each model type. However, practitioners should explore whether that range is appropriate for their data, or if they should customize the hyperparameter range. Therefore, we provide a function plot_hp_performance() to plot the cross-validation performance metric of a single model or models built using different train/test splits. This helps evaluate if the hyperparameter range is being searched exhaustively and allows the user to pick the ideal set. We also provide summary plots of test performance metrics for the many train/test splits with different models using plot_model_performance(). Examples are described in the accompanying vignette on hyperparameter tuning.
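For example, a sketch of how these helpers might be called on run_ml() output; the column names passed to plot_hp_performance() (lambda, AUC) are illustrative, and perf_df is assumed to hold one row of performance metrics per train/test split — see the tuning vignette for the exact interface:

# cross-validation AUC across the lambda values tried for one glmnet model
results <- run_ml(otu_mini_bin, "glmnet", seed = 2019)
plot_hp_performance(results$trained_model$results, lambda, AUC)

# test-set performance summarized across many train/test splits
plot_model_performance(perf_df)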

Dependencies

-

mikropml is written in R (R Core Team -2020) and depends on several packages: dplyr (Wickham et al. 2020), rlang (Henry, Wickham, and RStudio 2020) and -caret (Kuhn 2008). The ML -algorithms supported by mikropml require: -glmnet (Friedman, Hastie, and -Tibshirani 2010), e1071 (Meyer et al. 2020), and MLmetrics -(Yan 2016) for logistic regression, -rpart2 (Therneau et al. 2019) -for decision trees, randomForest (Liaw and Wiener 2002) for random forest, -xgboost (Chen et al. 2020) -for xgboost, and kernlab (Karatzoglou et al. 2004) for support vector -machines. We also allow for parallelization of cross-validation and -other steps using the foreach, doFuture, -future.apply, and future packages (Bengtsson and Team 2020). Finally, we use -ggplot2 for plotting (Wickham -2016).

+

mikropml is written in R (R Core Team 2020) and depends on several packages: dplyr (Wickham et al. 2020), rlang (Henry, Wickham, and RStudio 2020) and caret (Kuhn 2008). The ML algorithms supported by mikropml require: glmnet (Friedman, Hastie, and Tibshirani 2010), e1071 (Meyer et al. 2020), and MLmetrics (Yan 2016) for logistic regression, rpart2 (Therneau et al. 2019) for decision trees, randomForest (Liaw and Wiener 2002) for random forest, xgboost (Chen et al. 2020) for xgboost, and kernlab (Karatzoglou et al. 2004) for support vector machines. We also allow for parallelization of cross-validation and other steps using the foreach, doFuture, future.apply, and future packages (Bengtsson and Team 2020). Finally, we use ggplot2 for plotting (Wickham 2016).

Acknowledgments

-

We thank members of the Schloss Lab who participated in code clubs -related to the initial development of the pipeline, made documentation -improvements, and provided general feedback. We also thank Nick Lesniak -for designing the mikropml logo.

-

We thank the US Research Software Sustainability Institute (NSF -#1743188) for providing training to KLS at the Winter School in Research -Software Engineering.

+

We thank members of the Schloss Lab who participated in code clubs related to the initial development of the pipeline, made documentation improvements, and provided general feedback. We also thank Nick Lesniak for designing the mikropml logo.

+

We thank the US Research Software Sustainability Institute (NSF #1743188) for providing training to KLS at the Winter School in Research Software Engineering.

Funding

-

Salary support for PDS came from NIH grant 1R01CA215574. KLS received -support from the NIH Training Program in Bioinformatics (T32 GM070449). -ZL received support from the National Science Foundation Graduate -Research Fellowship Program under Grant No. DGE 1256260. Any opinions, -findings, and conclusions or recommendations expressed in this material -are those of the authors and do not necessarily reflect the views of the -National Science Foundation.

+

Salary support for PDS came from NIH grant 1R01CA215574. KLS received support from the NIH Training Program in Bioinformatics (T32 GM070449). ZL received support from the National Science Foundation Graduate Research Fellowship Program under Grant No. DGE 1256260. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the National Science Foundation.

Author contributions

-

BDT, ZL, and KLS contributed equally. Author order among the co-first -authors was determined by time since joining the project.

-

BDT, ZL, and KLS conceptualized the study and wrote the code. KLS -structured the code in R package form. BDT, ZL, JW, and PDS developed -methodology. PDS, ES, and JW supervised the project. BDT, ZL, and KLS -wrote the original draft. All authors reviewed and edited the -manuscript.

+

BDT, ZL, and KLS contributed equally. Author order among the co-first authors was determined by time since joining the project.

+

BDT, ZL, and KLS conceptualized the study and wrote the code. KLS structured the code in R package form. BDT, ZL, JW, and PDS developed methodology. PDS, ES, and JW supervised the project. BDT, ZL, and KLS wrote the original draft. All authors reviewed and edited the manuscript.

Conflicts of interest @@ -327,156 +186,90 @@

Conflicts of interestNone.

-

References +

References

-
-
-Bengtsson, Henrik, and R Core Team. 2020. “Future.apply: -Apply Function to Elements in -Parallel Using Futures,” July. -
-
-Breiman, Leo. 2001. “Random Forests.” Machine -Learning 45 (1): 5–32. https://doi.org/10.1023/A:1010933404324. -
-
-Chen, Tianqi, Tong He, Michael Benesty, Vadim Khotilovich, Yuan Tang, -Hyunsu Cho, Kailong Chen, et al. 2020. “Xgboost: Extreme -Gradient Boosting,” June. -
-
-Fisher, Aaron, Cynthia Rudin, and Francesca Dominici. 2018. “All -Models Are Wrong, but Many Are Useful: Learning a -Variable’s Importance by Studying an Entire Class of Prediction Models -Simultaneously.” -
-
-Friedman, Jerome H., Trevor Hastie, and Rob Tibshirani. 2010. -“Regularization Paths for Generalized Linear -Models via Coordinate Descent.” Journal -of Statistical Software 33 (1): 1–22. https://doi.org/10.18637/jss.v033.i01. -
-
-H2O.ai. 2020. H2o: Scalable Machine -Learning Platform. Manual. -
-
-Hagan, Ada K., Begüm D. Topçuoğlu, Mia E. Gregory, Hazel A. Barton, and -Patrick D. Schloss. 2020. “Women Are Underrepresented -and Receive Differential Outcomes at ASM -Journals: A Six-Year Retrospective -Analysis.” mBio 11 (6). https://doi.org/10.1128/mBio.01680-20. -
-
-Henry, Lionel, Hadley Wickham, and RStudio. 2020. “Rlang: -Functions for Base Types and Core -R and ’TidyverseFeatures,” -July. -
-
-Karatzoglou, Alexandros, Alexandros Smola, Kurt Hornik, and Achim -Zeileis. 2004. “Kernlab - An S4 Package for -Kernel Methods in R.” Journal of -Statistical Software 11 (1): 1–20. https://doi.org/10.18637/jss.v011.i09. -
-
-Köster, Johannes, and Sven Rahmann. 2012. “Snakemakea Scalable -Bioinformatics Workflow Engine.” Bioinformatics 28 (19): -2520–22. https://doi.org/10.1093/bioinformatics/bts480. -
-
-Kuhn, Max. 2008. “Building Predictive Models in -R Using the Caret Package.” Journal -of Statistical Software 28 (1): 1–26. https://doi.org/10.18637/jss.v028.i05. -
-
-Kuhn, Max, Hadley Wickham, and RStudio. 2020. “Tidymodels: -Easily Install and Load the -’TidymodelsPackages,” July. -
-
-Lapp, Zena, Jennifer Han, Jenna Wiens, Ellie JC Goldstein, Ebbing -Lautenbach, and Evan Snitkin. 2020. “Machine Learning Models to -Identify Patient and Microbial Genetic Factors Associated with -Carbapenem-Resistant Klebsiella Pneumoniae -Infection.” medRxiv, July, 2020.07.06.20147306. https://doi.org/10.1101/2020.07.06.20147306. -
-
-Liaw, Andy, and Matthew Wiener. 2002. “Classification and -Regression by randomForest 2: 5. -
-
-Meyer, David, Evgenia Dimitriadou, Kurt Hornik, Andreas Weingessel, -Friedrich Leisch, Chih-Chung Chang (libsvm C++-code), and Chih-Chen Lin -(libsvm C++-code). 2020. “E1071: Misc Functions of -the Department of Statistics, -Probability Theory Group (Formerly: -E1071), TU Wien.” -
-
-Pedregosa, Fabian, Gaël Varoquaux, Alexandre Gramfort, Vincent Michel, -Bertrand Thirion, Olivier Grisel, Mathieu Blondel, et al. 2011. -“Scikit-Learn: Machine Learning in -Python.” Journal of Machine Learning -Research 12 (85): 2825–30. -
-
-Pollard, Tom J., Irene Chen, Jenna Wiens, Steven Horng, Danny Wong, -Marzyeh Ghassemi, Heather Mattie, Emily Lindemer, and Trishan Panch. -2019. “Turning the Crank for Machine Learning: Ease, at What -Expense?” The Lancet Digital Health 1 (5): e198–99. https://doi.org/10.1016/S2589-7500(19)30112-8. -
-
-R Core Team. 2020. “R: A Language and -Environment for Statistical Computing.” -
-
-Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael -W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR -Analyses with FIDDLE: A Flexible Data-Driven Preprocessing -Pipeline for Structured Clinical Data.” J Am Med Inform -Assoc, October. https://doi.org/10.1093/jamia/ocaa139. -
-
-Teschendorff, Andrew E. 2019. “Avoiding Common Pitfalls in Machine -Learning Omic Data Science.” Nature Materials 18 (5): -422–27. https://doi.org/10.1038/s41563-018-0241-z. -
-
-Therneau, Terry, Beth Atkinson, Brian Ripley (producer of the initial R. -port, and maintainer 1999-2017). 2019. “Rpart: Recursive -Partitioning and Regression Trees,” April. -
-
-Topçuoğlu, Begüm D., Nicholas A. Lesniak, Mack T. Ruffin, Jenna Wiens, -and Patrick D. Schloss. 2020. “A Framework for -Effective Application of Machine Learning to -Microbiome-Based Classification -Problems.” mBio 11 (3). https://doi.org/10.1128/mBio.00434-20. -
-
-Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for -Data Analysis. Use R! Cham: -Springer International Publishing. https://doi.org/10.1007/978-3-319-24277-4. -
-
-Wickham, Hadley, Romain François, Lionel Henry, Kirill Müller, and -RStudio. 2020. “Dplyr: A Grammar of Data -Manipulation,” August. -
-
-Wiens, Jenna, Suchi Saria, Mark Sendak, Marzyeh Ghassemi, Vincent X. -Liu, Finale Doshi-Velez, Kenneth Jung, et al. 2019. “Do No Harm: A -Roadmap for Responsible Machine Learning for Health Care.” -Nat. Med. 25 (9): 1337–40. https://doi.org/10.1038/s41591-019-0548-6. -
-
-Yan, Yachen. 2016. MLmetrics: Machine Learning -Evaluation Metrics.” -
-
-
-
+
+
+

Bengtsson, Henrik, and R Core Team. 2020. “Future.Apply: Apply Function to Elements in Parallel Using Futures,” July.

+
+
+

Breiman, Leo. 2001. “Random Forests.” Machine Learning 45 (1): 5–32. https://doi.org/10.1023/A:1010933404324.

+
+
+

Chen, Tianqi, Tong He, Michael Benesty, Vadim Khotilovich, Yuan Tang, Hyunsu Cho, Kailong Chen, et al. 2020. “Xgboost: Extreme Gradient Boosting,” June.

+
+
+

Fisher, Aaron, Cynthia Rudin, and Francesca Dominici. 2018. “All Models Are Wrong, but Many Are Useful: Learning a Variable’s Importance by Studying an Entire Class of Prediction Models Simultaneously.”

+
+
+

Friedman, Jerome H., Trevor Hastie, and Rob Tibshirani. 2010. “Regularization Paths for Generalized Linear Models via Coordinate Descent.” Journal of Statistical Software 33 (1): 1–22. https://doi.org/10.18637/jss.v033.i01.

+
+
+

H2O.ai. 2020. H2O: Scalable Machine Learning Platform. Manual.

+
+
+

Hagan, Ada K., Begüm D. Topçuoğlu, Mia E. Gregory, Hazel A. Barton, and Patrick D. Schloss. 2020. “Women Are Underrepresented and Receive Differential Outcomes at ASM Journals: A Six-Year Retrospective Analysis.” mBio 11 (6). https://doi.org/10.1128/mBio.01680-20.

+
+
+

Henry, Lionel, Hadley Wickham, and RStudio. 2020. “Rlang: Functions for Base Types and Core R and ’Tidyverse’ Features,” July.

+
+
+

Karatzoglou, Alexandros, Alexandros Smola, Kurt Hornik, and Achim Zeileis. 2004. “Kernlab - an S4 Package for Kernel Methods in R.” Journal of Statistical Software 11 (1): 1–20. https://doi.org/10.18637/jss.v011.i09.

+
+
+

Köster, Johannes, and Sven Rahmann. 2012. “Snakemakea Scalable Bioinformatics Workflow Engine.” Bioinformatics 28 (19): 2520–2. https://doi.org/10.1093/bioinformatics/bts480.

+
+
+

Kuhn, Max. 2008. “Building Predictive Models in R Using the Caret Package.” Journal of Statistical Software 28 (1): 1–26. https://doi.org/10.18637/jss.v028.i05.

+
+
+

Kuhn, Max, Hadley Wickham, and RStudio. 2020. “Tidymodels: Easily Install and Load the ’Tidymodels’ Packages,” July.

+
+
+

Lapp, Zena, Jennifer Han, Jenna Wiens, Ellie JC Goldstein, Ebbing Lautenbach, and Evan Snitkin. 2020. “Machine Learning Models to Identify Patient and Microbial Genetic Factors Associated with Carbapenem-Resistant Klebsiella Pneumoniae Infection.” medRxiv, July, 2020.07.06.20147306. https://doi.org/10.1101/2020.07.06.20147306.

+
+
+

Liaw, Andy, and Matthew Wiener. 2002. “Classification and Regression by randomForest” 2: 5.

+
+
+

Meyer, David, Evgenia Dimitriadou, Kurt Hornik, Andreas Weingessel, Friedrich Leisch, Chih-Chung Chang (libsvm C++-code), and Chih-Chen Lin (libsvm C++-code). 2020. “E1071: Misc Functions of the Department of Statistics, Probability Theory Group (Formerly: E1071), TU Wien.”

+
+
+

Pedregosa, Fabian, Gaël Varoquaux, Alexandre Gramfort, Vincent Michel, Bertrand Thirion, Olivier Grisel, Mathieu Blondel, et al. 2011. “Scikit-Learn: Machine Learning in Python.” Journal of Machine Learning Research 12 (85): 2825–30.

+
+
+

Pollard, Tom J., Irene Chen, Jenna Wiens, Steven Horng, Danny Wong, Marzyeh Ghassemi, Heather Mattie, Emily Lindemer, and Trishan Panch. 2019. “Turning the Crank for Machine Learning: Ease, at What Expense?” The Lancet Digital Health 1 (5): e198–e199. https://doi.org/10.1016/S2589-7500(19)30112-8.

+
+
+

R Core Team. 2020. “R: A Language and Environment for Statistical Computing.”

+
+
+

Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR Analyses with FIDDLE: A Flexible Data-Driven Preprocessing Pipeline for Structured Clinical Data.” J Am Med Inform Assoc, October. https://doi.org/10.1093/jamia/ocaa139.

+
+
+

Teschendorff, Andrew E. 2019. “Avoiding Common Pitfalls in Machine Learning Omic Data Science.” Nature Materials 18 (5): 422–27. https://doi.org/10.1038/s41563-018-0241-z.

+
+
+

Therneau, Terry, Beth Atkinson, Brian Ripley (producer of the initial R. port, and maintainer 1999-2017). 2019. “Rpart: Recursive Partitioning and Regression Trees,” April.

+
+
+

Topçuoğlu, Begüm D., Nicholas A. Lesniak, Mack T. Ruffin, Jenna Wiens, and Patrick D. Schloss. 2020. “A Framework for Effective Application of Machine Learning to Microbiome-Based Classification Problems.” mBio 11 (3). https://doi.org/10.1128/mBio.00434-20.

+
+
+

Wickham, Hadley. 2016. Ggplot2: Elegant Graphics for Data Analysis. Use R! Cham: Springer International Publishing. https://doi.org/10.1007/978-3-319-24277-4.

+
+
+

Wickham, Hadley, Romain François, Lionel Henry, Kirill Müller, and RStudio. 2020. “Dplyr: A Grammar of Data Manipulation,” August.

+
+
+

Wiens, Jenna, Suchi Saria, Mark Sendak, Marzyeh Ghassemi, Vincent X. Liu, Finale Doshi-Velez, Kenneth Jung, et al. 2019. “Do No Harm: A Roadmap for Responsible Machine Learning for Health Care.” Nat. Med. 25 (9): 1337–40. https://doi.org/10.1038/s41591-019-0548-6.

+
+
+

Yan, Yachen. 2016. “MLmetrics: Machine Learning Evaluation Metrics.”

+
+
+
+

  1. co-first author↩︎

  2. diff --git a/docs/articles/parallel.html b/docs/articles/parallel.html index e1ac7594..373dfe7e 100644 --- a/docs/articles/parallel.html +++ b/docs/articles/parallel.html @@ -100,12 +100,11 @@ -
    +

Speed up single runs

-

By default, preprocess_data(), run_ml(), -and compare_models() use only one process in series. If -you’d like to parallelize various steps of the pipeline to make them run -faster, install foreach, future, -future.apply, and doFuture. Then, register a -future plan prior to calling these functions:

+

By default, preprocess_data(), run_ml(), and compare_models() use only one process in series. If you’d like to parallelize various steps of the pipeline to make them run faster, install foreach, future, future.apply, and doFuture. Then, register a future plan prior to calling these functions:

-doFuture::registerDoFuture()
+doFuture::registerDoFuture()
 future::plan(future::multicore, workers = 2)
-

Above, we used the multicore plan to split the work -across 2 cores. See the future -documentation for more about picking the best plan for your use -case. Notably, multicore does not work inside RStudio or on -Windows; you will need to use multisession instead in those -cases.

-

After registering a future plan, you can call -preprocess_data() and run_ml() as usual, and -they will run certain tasks in parallel.

+

Above, we used the multicore plan to split the work across 2 cores. See the future documentation for more about picking the best plan for your use case. Notably, multicore does not work inside RStudio or on Windows; you will need to use multisession instead in those cases.
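For instance, on Windows or inside RStudio you might register the plan like this (a sketch; adjust workers to what your machine has available):

doFuture::registerDoFuture()
future::plan(future::multisession, workers = 2)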

+

After registering a future plan, you can call preprocess_data() and run_ml() as usual, and they will run certain tasks in parallel.

 otu_data_preproc <- preprocess_data(otu_mini_bin, 'dx')$dat_transformed
-result1 <- run_ml(otu_data_preproc, 'glmnet')
+#> Using 'dx' as the outcome column. +result1 <- run_ml(otu_data_preproc, 'glmnet') +#> Using 'dx' as the outcome column. +#> Training the model... +#> Loading required package: ggplot2 +#> Loading required package: lattice +#> +#> Attaching package: 'caret' +#> The following object is masked from 'package:mikropml': +#> +#> compare_models +#> Training complete.

Call run_ml() multiple times in parallel in R

-

You can use functions from the future.apply package to -call run_ml() multiple times in parallel with different -parameters. You will first need to run future::plan() as -above if you haven’t already. Then, call run_ml() with -multiple seeds using future_lapply():

+

You can use functions from the future.apply package to call run_ml() multiple times in parallel with different parameters. You will first need to run future::plan() as above if you haven’t already. Then, call run_ml() with multiple seeds using future_lapply():

 # NOTE: use more seeds for real-world data
 results_multi <- future.apply::future_lapply(seq(100, 102), function(seed) {
   run_ml(otu_data_preproc, 'glmnet', seed = seed)
-  }, future.seed = TRUE)
-

Each call to run_ml() with a different seed uses a -different random split of the data into training and testing sets. Since -we are using seeds, we must set future.seed to -TRUE (see the future.apply -documentation and this -blog post for details on parallel-safe random seeds). This example -uses only a few seeds for speed and simplicity, but for real data we -recommend using many more seeds to get a better estimate of model -performance.

-

In these examples, we used functions from the -future.apply package to run_ml() in parallel, -but you can accomplish the same thing with parallel versions of the -purrr::map() functions using the furrr package -(e.g. furrr::future_map_dfr()).

-

Extract the performance results and combine into one dataframe for -all seeds:

+ }, future.seed = TRUE) +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete. +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete. +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete.
+

Each call to run_ml() with a different seed uses a different random split of the data into training and testing sets. Since we are using seeds, we must set future.seed to TRUE (see the future.apply documentation and this blog post for details on parallel-safe random seeds). This example uses only a few seeds for speed and simplicity, but for real data we recommend using many more seeds to get a better estimate of model performance.

+

In these examples, we used functions from the future.apply package to run_ml() in parallel, but you can accomplish the same thing with parallel versions of the purrr::map() functions using the furrr package (e.g. furrr::future_map_dfr()).
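For example, a rough furrr equivalent of the loop above, assuming furrr is installed; here furrr::furrr_options(seed = TRUE) plays the same role as future.seed = TRUE:

# NOTE: use more seeds for real-world data
perf_furrr <- furrr::future_map_dfr(
  seq(100, 102),
  ~ run_ml(otu_data_preproc, "glmnet", seed = .x)$performance,
  .options = furrr::furrr_options(seed = TRUE)
)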

+

Extract the performance results and combine into one dataframe for all seeds:

 perf_df <- future.apply::future_lapply(results_multi, 
                                        function(result) {
@@ -180,13 +178,17 @@ 

Call run_ml() }, future.seed = TRUE) %>% dplyr::bind_rows() -perf_df

+perf_df +#> # A tibble: 3 × 3 +#> cv_metric_AUC AUC method +#> <dbl> <dbl> <chr> +#> 1 0.630 0.634 glmnet +#> 2 0.591 0.608 glmnet +#> 3 0.671 0.471 glmnet

Multiple ML methods

-

You may also wish to compare performance for different ML methods. -mapply() can iterate over multiple lists or vectors, and -future_mapply() works the same way:

+

You may also wish to compare performance for different ML methods. mapply() can iterate over multiple lists or vectors, and future_mapply() works the same way:

 # NOTE: use more seeds for real-world data
 param_grid <- expand.grid(seeds = seq(100, 102),
@@ -198,39 +200,61 @@ 

Multiple ML methodsparam_grid$seeds, param_grid$methods %>% as.character(), future.seed = TRUE - )

-

Extract and combine the performance results for all seeds and -methods:

+ ) +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete. +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete. +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete. +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete. +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete. +#> Using 'dx' as the outcome column. +#> Training the model... +#> Training complete.
+

Extract and combine the performance results for all seeds and methods:

 perf_df2 <- lapply(results_mtx['performance',], 
                    function(x) {
                      x %>% select(cv_metric_AUC, AUC, method)
                    }) %>% 
   dplyr::bind_rows()
-perf_df2
-

Visualize the performance results (ggplot2 is -required):

+perf_df2 +#> # A tibble: 6 × 3 +#> cv_metric_AUC AUC method +#> <dbl> <dbl> <chr> +#> 1 0.630 0.634 glmnet +#> 2 0.591 0.608 glmnet +#> 3 0.671 0.471 glmnet +#> 4 0.665 0.708 rf +#> 5 0.651 0.697 rf +#> 6 0.701 0.592 rf
+

Visualize the performance results (ggplot2 is required):

 perf_boxplot <- plot_model_performance(perf_df2)
 perf_boxplot
-

plot_model_performance() returns a ggplot2 object. You -can add layers to customize the plot:

+

+

plot_model_performance() returns a ggplot2 object. You can add layers to customize the plot:

 perf_boxplot +
    theme_classic() +
    scale_color_brewer(palette = "Dark2") +
    coord_flip()
-

You can also create your own plots however you like using the -performance results.

+

+

You can also create your own plots however you like using the performance results.

Live progress updates

-

preprocess_data() and -get_feature_importance() support reporting live progress -updates using the progressr package. The format is up to -you, but we recommend using a progress bar like this:

+

preprocess_data() and get_feature_importance() support reporting live progress updates using the progressr package. The format is up to you, but we recommend using a progress bar like this:

 # optionally, specify the progress bar format with the `progress` package.
 progressr::handlers(progressr::handler_progress(
@@ -251,26 +275,12 @@ 

Live progress updates#> Training the model... #> Training complete. #> Feature importance =========================== 100% | elapsed: 37s | eta: 0s

-

Note that some future backends support “near-live” progress updates, -meaning the progress may not be reported immediately when parallel -processing with futures. Read more on that in -the progressr vignette. For more on -progressr and how to customize the format of progress -updates, see the progressr -docs.

+

Note that some future backends support “near-live” progress updates, meaning the progress may not be reported immediately when parallel processing with futures. Read more on that in the progressr vignette. For more on progressr and how to customize the format of progress updates, see the progressr docs.

Parallelizing with Snakemake

-

When parallelizing multiple calls to run_ml() in R as in -the examples above, all of the results objects are held in memory. This -isn’t a big deal for a small dataset run with only a few seeds. However, -for large datasets run in parallel with, say, 100 seeds (recommended), -you may run into problems trying to store all of those objects in memory -at once. One solution is to write the results files of each -run_ml() call, then concatenate them at the end. We show -one way to accomplish this with Snakemake in an -example Snakemake workflow here.

+

When parallelizing multiple calls to run_ml() in R as in the examples above, all of the results objects are held in memory. This isn’t a big deal for a small dataset run with only a few seeds. However, for large datasets run in parallel with, say, 100 seeds (recommended), you may run into problems trying to store all of those objects in memory at once. One solution is to write the results files of each run_ml() call, then concatenate them at the end. We show one way to accomplish this with Snakemake in an example Snakemake workflow here.
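Outside of Snakemake, a bare-bones version of the same idea looks something like this (a sketch; the file names are arbitrary):

# write each run's small performance table to disk instead of keeping every model in memory
for (seed in seq(100, 102)) {
  fit <- run_ml(otu_data_preproc, "glmnet", seed = seed)
  saveRDS(fit$performance, paste0("performance_seed", seed, ".rds"))
  rm(fit)
}
# later, read the performance files back in and combine them
perf_all <- list.files(pattern = "^performance_seed.*\\.rds$") %>%
  lapply(readRDS) %>%
  dplyr::bind_rows()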

diff --git a/docs/articles/preprocess.html b/docs/articles/preprocess.html index d6e21856..7d7bdb8d 100644 --- a/docs/articles/preprocess.html +++ b/docs/articles/preprocess.html @@ -100,7 +100,7 @@ -
+

Categorical data @@ -245,9 +209,7 @@

Categorical data#> #> $removed_feats #> character(0)

-

As you can see, this variable was split into 3 different columns - -one for each type (a, b, and c). And again, grp_feats is -NULL.

+

As you can see, this variable was split into 3 different columns - one for each type (a, b, and c). And again, grp_feats is NULL.

Continuous data @@ -281,23 +243,12 @@

Continuous data#> #> $removed_feats #> character(0)

-

Wow! Why did the numbers change? This is because the default is to -normalize the data using "center" and "scale". -While this is often best practice, you may not want to normalize the -data, or you may want to normalize the data in a different way. If you -don’t want to normalize the data, you can use -method=NULL:

+

Wow! Why did the numbers change? This is because the default is to normalize the data using "center" and "scale". While this is often best practice, you may not want to normalize the data, or you may want to normalize the data in a different way. If you don’t want to normalize the data, you can use method=NULL:

 # preprocess raw continuous data, no normalization
 preprocess_data(dataset = cont_df, outcome_colname = "outcome", method = NULL)
-

You can also normalize the data in different ways. You can choose any -method supported by the method argument of -caret::preProcess() (see the -caret::preProcess() docs for details). Note that these -methods are only applied to continuous variables.

-

Another feature of preprocess_data() is that if you -provide continuous variables as characters, they will be converted to -numeric:

+

You can also normalize the data in different ways. You can choose any method supported by the method argument of caret::preProcess() (see the caret::preProcess() docs for details). Note that these methods are only applied to continuous variables.
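For example, to rescale continuous features to the range 0 to 1 instead of centering and scaling (assuming the method vector is passed through to caret::preProcess() as described above):

# rescale continuous features to [0, 1]
preprocess_data(dataset = cont_df, outcome_colname = "outcome", method = c("range"))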

+

Another feature of preprocess_data() is that if you provide continuous variables as characters, they will be converted to numeric:

 # raw continuous dataset as characters
 cont_char_df <- data.frame(
@@ -312,10 +263,7 @@ 

Continuous data
 # preprocess raw continuous character data as numeric
 preprocess_data(dataset = cont_char_df, outcome_colname = "outcome")

-

If you don’t want this to happen, and you want character data to -remain character data even if it can be converted to numeric, you can -use to_numeric=FALSE and they will be kept as -categorical:

+

If you don’t want this to happen, and you want character data to remain character data even if it can be converted to numeric, you can use to_numeric=FALSE and they will be kept as categorical:

 # preprocess raw continuous character data as characters
 preprocess_data(dataset = cont_char_df, outcome_colname = "outcome", to_numeric = FALSE)
@@ -333,17 +281,12 @@ 

Continuous data#> #> $removed_feats #> character(0)

-

As you can see from this output, in this case the features are -treated as groups rather than numbers (e.g. they are not -normalized).

+

As you can see from this output, in this case the features are treated as groups rather than numbers (e.g. they are not normalized).

Collapse perfectly correlated features

-

By default, preprocess_data() collapses features that -are perfectly positively or negatively correlated. This is because -having multiple copies of those features does not add information to -machine learning, and it makes run_ml faster.

+

By default, preprocess_data() collapses features that are perfectly positively or negatively correlated. This is because having multiple copies of those features does not add information to machine learning, and it makes run_ml faster.

 # raw correlated dataset
 corr_df <- data.frame(
@@ -376,14 +319,8 @@ 

Collapse perfectly correlated fe #> #> $removed_feats #> [1] "var2"

-

As you can see, we end up with only one variable, as all 3 are -grouped together. Also, the second element in the list is no longer -NULL. Instead, it tells you that grp1 contains -var1, var2, and var3.

-

If you want to group positively correlated features, but not -negatively correlated features (e.g. for interpretability, or another -downstream application), you can do that by using -group_neg_corr=FALSE:

+

As you can see, we end up with only one variable, as all 3 are grouped together. Also, the second element in the list is no longer NULL. Instead, it tells you that grp1 contains var1, var2, and var3.

+

If you want to group positively correlated features, but not negatively correlated features (e.g. for interpretability, or another downstream application), you can do that by using group_neg_corr=FALSE:

 # preprocess raw correlated dataset; don't group negatively correlated features
 preprocess_data(dataset = corr_df, outcome_colname = "outcome", group_neg_corr = FALSE)
@@ -401,10 +338,7 @@ 

Collapse perfectly correlated fe #> #> $removed_feats #> [1] "var2"

-

Here, var3 is kept on it’s own because it’s negatively -correlated with var1 and var2. You can also -choose to keep all features separate, even if they are perfectly -correlated, by using collapse_corr_feats=FALSE:

+

Here, var3 is kept on its own because it's negatively correlated with var1 and var2. You can also choose to keep all features separate, even if they are perfectly correlated, by using collapse_corr_feats=FALSE:

 # preprocess raw correlated dataset; don't group negatively correlated features
 preprocess_data(dataset = corr_df, outcome_colname = "outcome", collapse_corr_feats = FALSE)
@@ -422,14 +356,12 @@ 

Collapse perfectly correlated fe #> #> $removed_feats #> [1] "var2"

-

In this case, grp_feats will always be -NULL.

+

In this case, grp_feats will always be NULL.

Data with near-zero variance

-

What if we have variables that are all zero, or all “no”? Those ones -won’t contribute any information, so we remove them:

+

What if we have variables that are all zero, or all “no”? Those ones won’t contribute any information, so we remove them:

 # raw dataset with non-variable features
 nonvar_df <- data.frame(
@@ -445,9 +377,7 @@ 

Data with near-zero variance#> 1 normal no 0 no 0 12 #> 2 normal yes 1 no 0 12 #> 3 cancer no 1 no 0 12

-

Here, var3, var4, and var5 all -have no variability, so these variables are removed during -preprocessing:

+

Here, var3, var4, and var5 all have no variability, so these variables are removed during preprocessing:

 # remove features with near-zero variance
 preprocess_data(dataset = nonvar_df, outcome_colname = "outcome")
@@ -465,13 +395,7 @@ 

Data with near-zero variance#> #> $removed_feats #> [1] "var4" "var3" "var5"

-

You can read the caret::preProcess() documentation for -more information. By default, we remove features with “near-zero -variance” (remove_var='nzv'). This uses the default -arguments from caret::nearZeroVar(). However, particularly -with smaller datasets, you might not want to remove features with -near-zero variance. If you want to remove only features with zero -variance, you can use remove_var='zv':

+

You can read the caret::preProcess() documentation for more information. By default, we remove features with “near-zero variance” (remove_var='nzv'). This uses the default arguments from caret::nearZeroVar(). However, particularly with smaller datasets, you might not want to remove features with near-zero variance. If you want to remove only features with zero variance, you can use remove_var='zv':

 # remove features with zero variance
 preprocess_data(dataset = nonvar_df, outcome_colname = "outcome", remove_var = 'zv')
@@ -489,10 +413,7 @@ 

Data with near-zero variance#> #> $removed_feats #> [1] "var4" "var3" "var5"

-

If you want to include all features, you can use the argument -remove_zv=NULL. For this to work, you cannot collapse -correlated features (otherwise it errors out because of the underlying -caret function we use).

+

If you want to include all features, you can use the argument remove_zv=NULL. For this to work, you cannot collapse correlated features (otherwise it errors out because of the underlying caret function we use).

 # don't remove features with near-zero or zero variance
 preprocess_data(dataset = nonvar_df, outcome_colname = "outcome", remove_var = NULL, collapse_corr_feats = FALSE)
@@ -510,12 +431,7 @@ 

Data with near-zero variance#> #> $removed_feats #> [1] "var4"

-

If you want to be more nuanced in how you remove near-zero variance -features (e.g. change the default 10% cutoff for the percentage of -distinct values out of the total number of samples), you can use the -caret::preProcess() function after running -preprocess_data with remove_var=NULL (see the -caret::nearZeroVar() function for more information).

+

If you want to be more nuanced in how you remove near-zero variance features (e.g. change the default 10% cutoff for the percentage of distinct values out of the total number of samples), you can use the caret::preProcess() function after running preprocess_data with remove_var=NULL (see the caret::nearZeroVar() function for more information).
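A sketch of that approach, where uniqueCut is caret's percent-of-distinct-values cutoff (10 by default):

# keep everything in preprocess_data(), then flag near-zero variance features yourself
dat_proc <- preprocess_data(dataset = nonvar_df, outcome_colname = "outcome",
                            remove_var = NULL, collapse_corr_feats = FALSE)$dat_transformed
# use a stricter cutoff than the default uniqueCut = 10 and inspect the metrics
caret::nearZeroVar(dat_proc, uniqueCut = 5, saveMetrics = TRUE)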

Missing data @@ -523,16 +439,11 @@

Missing datapreprocess_data() also deals with missing data. It:

  • Removes missing outcome variables.
  • -
  • Maintains zero variability in a feature if it already has no -variability (i.e. the feature is removed if removing features with -near-zero variance).
  • -
  • Replaces missing binary and categorical variables with zero (after -splitting into multiple columns).
  • -
  • Replaces missing continuous data with the median value of that -feature.
  • +
  • Maintains zero variability in a feature if it already has no variability (i.e. the feature is removed if removing features with near-zero variance).
  • +
  • Replaces missing binary and categorical variables with zero (after splitting into multiple columns).
  • +
  • Replaces missing continuous data with the median value of that feature.
-

If you’d like to deal with missing data in a different way, please do -that prior to inputting the data to preprocess_data().

+

If you’d like to deal with missing data in a different way, please do that prior to inputting the data to preprocess_data().
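For example, a sketch of imputing a continuous feature with the mean instead of the median before preprocessing (dat and var1 are hypothetical placeholders here):

# impute missing values in a continuous feature with the mean, then preprocess as usual
dat$var1[is.na(dat$var1)] <- mean(dat$var1, na.rm = TRUE)
preprocess_data(dataset = dat, outcome_colname = "outcome")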

Remove missing outcome variables

@@ -569,8 +480,7 @@

Remove missing outcome variables#> character(0)

-

Maintain zero variability in a feature if it already has no -variability +

Maintain zero variability in a feature if it already has no variability

 # raw dataset with missing value in non-variable feature
@@ -602,9 +512,7 @@ 

#> #> $removed_feats #> [1] "var2"

-

Here, the non-variable feature with missing data is removed because -we removed features with near-zero variance. If we maintained that -feature, it’d be all ones:

+

Here, the non-variable feature with missing data is removed because we removed features with near-zero variance. If we maintained that feature, it’d be all ones:

 # preprocess raw dataset with missing value in non-variable feature
 preprocess_data(dataset = miss_nonvar_df, outcome_colname = "outcome", remove_var = NULL, collapse_corr_feats = FALSE)
@@ -657,12 +565,10 @@ 

Replace miss #> #> $removed_feats #> [1] "var2"

-

Here each binary variable is split into two, and the missing value is -considered zero for both of them.

+

Here each binary variable is split into two, and the missing value is considered zero for both of them.

-

Replace missing continuous data with the median value of that -feature +

Replace missing continuous data with the median value of that feature

 # raw dataset with missing value in continuous feature
@@ -677,8 +583,7 @@ 

R #> 2 normal 2 2 #> 3 cancer 2 3 #> 4 normal NA NA

-

Here we’re not normalizing continuous features so it’s easier to see -what’s going on (i.e. the median value is used):

+

Here we’re not normalizing continuous features so it’s easier to see what’s going on (i.e. the median value is used):

 # preprocess raw dataset with missing value in continuous feature
 preprocess_data(dataset = miss_cont_df, outcome_colname = "outcome", method = NULL)
@@ -703,8 +608,7 @@ 

R

Putting it all together

-

Here’s some more complicated example raw data that puts everything we -discussed together:

+

Here’s some more complicated example raw data that puts everything we discussed together:

 test_df <- data.frame(
   outcome = c("normal", "normal", "cancer", NA),
@@ -727,8 +631,7 @@ 

Putting it all together#> 2 normal 2 b yes 1 0 no 1 6 x 0 1 2 #> 3 cancer 3 c no 0 0 no 0 NA y NA NA 3 #> 4 <NA> 4 d no 0 0 no 0 7 z NA NA 4

-

Let’s throw this into the preprocessing function with the default -values:

+

Let’s throw this into the preprocessing function with the default values:

 preprocess_data(dataset = test_df, outcome_colname = "outcome")
 #> Using 'outcome' as the outcome column.
@@ -765,37 +668,20 @@ 

Putting it all together#> [1] "var4" "var5" "var10" "var6" "var11"

As you can see, we got several messages:

    -
  • One of the samples (row 4) was removed because the outcome value was -missing.
  • -
  • One of the variables in a feature with no variation had a missing -value that was replaced with the the non-varying value -(var11).
  • -
  • Four categorical missing values were replaced with zero -(var9). There are 4 missing rather than just 1 (like in the -raw data) because we split the categorical variable into 4 different -columns first.
  • -
  • One missing continuous value was imputed using the median value of -that feature (var8).
  • +
  • One of the samples (row 4) was removed because the outcome value was missing.
  • +
  • One of the variables in a feature with no variation had a missing value that was replaced with the non-varying value (var11).
  • +
  • Four categorical missing values were replaced with zero (var9). There are 4 missing rather than just 1 (like in the raw data) because we split the categorical variable into 4 different columns first.
  • +
  • One missing continuous value was imputed using the median value of that feature (var8).
-

Additionally, you can see that the continuous variables were -normalized, the categorical variables were all changed to binary, and -several features were grouped together. The variables in each group can -be found in grp_feats.

+

Additionally, you can see that the continuous variables were normalized, the categorical variables were all changed to binary, and several features were grouped together. The variables in each group can be found in grp_feats.

Next step: train and evaluate your model!

-

After you preprocess your data (either using -preprocess_data() or by preprocessing the data on your -own), you’re ready to train and evaluate machine learning models! Please -see run_ml() information about training models.

-
-
-Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael -W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR -Analyses with FIDDLE: A Flexible Data-Driven Preprocessing -Pipeline for Structured Clinical Data.” J Am Med Inform -Assoc, October. https://doi.org/10.1093/jamia/ocaa139. +

After you preprocess your data (either using preprocess_data() or by preprocessing the data on your own), you're ready to train and evaluate machine learning models! Please see run_ml() for information about training models.
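For instance, a minimal hand-off from preprocessing to training might look like this (a sketch; the tiny toy data frame above is far too small to actually train on, so substitute your full dataset):

dat_ready <- preprocess_data(dataset = test_df, outcome_colname = "outcome")$dat_transformed
results <- run_ml(dat_ready, "glmnet", seed = 2019)
results$performance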

+
+
+

Tang, Shengpu, Parmida Davarmanesh, Yanmeng Song, Danai Koutra, Michael W. Sjoding, and Jenna Wiens. 2020. “Democratizing EHR Analyses with FIDDLE: A Flexible Data-Driven Preprocessing Pipeline for Structured Clinical Data.” J Am Med Inform Assoc, October. https://doi.org/10.1093/jamia/ocaa139.

diff --git a/docs/articles/tuning.html b/docs/articles/tuning.html index 1cc971d6..13d99588 100644 --- a/docs/articles/tuning.html +++ b/docs/articles/tuning.html @@ -100,12 +100,11 @@ -
+

diff --git a/docs/reference/index.html b/docs/reference/index.html index 048d1f36..6303fba6 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -89,15 +89,10 @@

Main

Run the machine learning pipeline

-

Plotting & evalutation helpers

-

Visualize & evalutate performance to help you tune hyperparameters and choose model methods.

+

Plotting helpers

+

Visualize results to help you tune hyperparameters and choose model methods.

-

compare_models()

- -

Perform permutation tests to compare the performance metric -across all pairs of a group variable.

-

plot_hp_performance()

Plot hyperparameter performance metrics

@@ -117,6 +112,27 @@

Plotting & evalutation helpers combine_hp_performance()

Combine hyperparameter performance metrics for multiple train/test splits

+ +

Model evaluation

+

Evaluate and interpret models.

+ + +

get_feature_importance()

+ +

Get feature importance using the permutation method

+ +

get_performance_tbl()

+ +

Get model performance metrics as a one-row tibble

+ +

compare_models()

+ +

Perform permutation tests to compare the performance metric +across all pairs of a group variable.

+ +

permute_p_value()

+ +

Calculated a permuted p-value comparing two models

Package Data

@@ -148,23 +164,23 @@

ML results

otu_mini_bin_results_glmnet

-

Results from running the pipline with L2 logistic regression on otu_mini_bin with feature importance and grouping

+

Results from running the pipeline with L2 logistic regression on otu_mini_bin with feature importance and grouping

otu_mini_bin_results_rf

-

Results from running the pipline with random forest on otu_mini_bin

+

Results from running the pipeline with random forest on otu_mini_bin

otu_mini_bin_results_rpart2

-

Results from running the pipline with rpart2 on otu_mini_bin

+

Results from running the pipeline with rpart2 on otu_mini_bin

otu_mini_bin_results_svmRadial

-

Results from running the pipline with svmRadial on otu_mini_bin

+

Results from running the pipeline with svmRadial on otu_mini_bin

otu_mini_bin_results_xgbTree

-

Results from running the pipline with xbgTree on otu_mini_bin

+

Results from running the pipeline with xgbTree on otu_mini_bin

otu_mini_cont_results_glmnet

@@ -195,7 +211,7 @@

misc

Replace spaces in all elements of a character vector with underscores

Pipeline customization

-

These are functions called by preprocess_data() or run_ml(). We make them available in case you would like to customize various steps of the pipeline beyond the arguments provided by the main functions.

+

Customize various steps of the pipeline beyond the arguments provided by run_ml() and preprocess_data().

remove_singleton_columns()

@@ -245,14 +261,6 @@

Pipeline customization calc_perf_metrics()

Get performance metrics for test data

- -

get_performance_tbl()

- -

Get model performance metrics as a one-row tibble

- -

get_feature_importance()

- -

Get feature importance using the permutation method

group_correlated_features()

diff --git a/docs/reference/otu_mini_bin_results_glmnet.html b/docs/reference/otu_mini_bin_results_glmnet.html index 5ae3499d..4e42c52e 100644 --- a/docs/reference/otu_mini_bin_results_glmnet.html +++ b/docs/reference/otu_mini_bin_results_glmnet.html @@ -1,5 +1,5 @@ -Results from running the pipline with L2 logistic regression on otu_mini_bin with feature importance and grouping — otu_mini_bin_results_glmnet • mikropmlResults from running the pipeline with L2 logistic regression on otu_mini_bin with feature importance and grouping — otu_mini_bin_results_glmnet • mikropml @@ -69,13 +69,13 @@
-

Results from running the pipline with L2 logistic regression on otu_mini_bin with feature importance and grouping

+

Results from running the pipeline with L2 logistic regression on otu_mini_bin with feature importance and grouping

diff --git a/docs/reference/otu_mini_bin_results_rf.html b/docs/reference/otu_mini_bin_results_rf.html index aaf32bab..9952d73e 100644 --- a/docs/reference/otu_mini_bin_results_rf.html +++ b/docs/reference/otu_mini_bin_results_rf.html @@ -1,5 +1,5 @@ -Results from running the pipline with random forest on otu_mini_bin — otu_mini_bin_results_rf • mikropmlResults from running the pipeline with random forest on otu_mini_bin — otu_mini_bin_results_rf • mikropml @@ -69,13 +69,13 @@
-

Results from running the pipline with random forest on otu_mini_bin

+

Results from running the pipeline with random forest on otu_mini_bin

diff --git a/docs/reference/otu_mini_bin_results_rpart2.html b/docs/reference/otu_mini_bin_results_rpart2.html index f73bffd5..8e49390e 100644 --- a/docs/reference/otu_mini_bin_results_rpart2.html +++ b/docs/reference/otu_mini_bin_results_rpart2.html @@ -1,5 +1,5 @@ -Results from running the pipline with rpart2 on otu_mini_bin — otu_mini_bin_results_rpart2 • mikropmlResults from running the pipeline with rpart2 on otu_mini_bin — otu_mini_bin_results_rpart2 • mikropml @@ -69,13 +69,13 @@
-

Results from running the pipline with rpart2 on otu_mini_bin

+

Results from running the pipeline with rpart2 on otu_mini_bin

diff --git a/docs/reference/otu_mini_bin_results_svmRadial.html b/docs/reference/otu_mini_bin_results_svmRadial.html index 165e2009..34b5c1b8 100644 --- a/docs/reference/otu_mini_bin_results_svmRadial.html +++ b/docs/reference/otu_mini_bin_results_svmRadial.html @@ -1,5 +1,5 @@ -Results from running the pipline with svmRadial on otu_mini_bin — otu_mini_bin_results_svmRadial • mikropmlResults from running the pipeline with svmRadial on otu_mini_bin — otu_mini_bin_results_svmRadial • mikropml @@ -69,13 +69,13 @@
-

Results from running the pipline with svmRadial on otu_mini_bin

+

Results from running the pipeline with svmRadial on otu_mini_bin

diff --git a/docs/reference/otu_mini_bin_results_xgbTree.html b/docs/reference/otu_mini_bin_results_xgbTree.html index bda6634d..4a8ff944 100644 --- a/docs/reference/otu_mini_bin_results_xgbTree.html +++ b/docs/reference/otu_mini_bin_results_xgbTree.html @@ -1,5 +1,5 @@ -Results from running the pipline with xbgTree on otu_mini_bin — otu_mini_bin_results_xgbTree • mikropmlResults from running the pipeline with xbgTree on otu_mini_bin — otu_mini_bin_results_xgbTree • mikropml @@ -69,13 +69,13 @@
-

Results from running the pipline with xbgTree on otu_mini_bin

+

Results from running the pipeline with xgbTree on otu_mini_bin

From e0f3fbec133f6826bcb51f2622e75a644dea78ee Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 21:57:38 -0400 Subject: [PATCH 09/10] Update comments for CRAN --- cran-comments.md | 1 - 1 file changed, 1 deletion(-) diff --git a/cran-comments.md b/cran-comments.md index a6e1dfd3..7078366e 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,4 +1,3 @@ -This patch fixes a test failure on the no long doubles platform. ## Test environments From 7c17eb768e3f9832bf3c2b04cda4bfc7d8eb7594 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Thu, 19 May 2022 22:07:05 -0400 Subject: [PATCH 10/10] Ignore CRAN-SUBMISSION I guess they changed the name from CRAN-RELEASE --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index c0900ea1..96912ad0 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,4 @@ ^cran-comments\.md$ ^revdep$ ^CRAN-RELEASE$ +^CRAN-SUBMISSION$