From e439ba1ee778d57a8c3d49217b60d23460d16e3b Mon Sep 17 00:00:00 2001
From: Chris Kennedy
Date: Thu, 9 Jun 2016 19:27:39 -0700
Subject: [PATCH] Support XGBoost

---
 DESCRIPTION                  |  46 +++++++++++++--
 NAMESPACE                    |   2 +
 R/SL.xgboost.R               | 108 +++++++++++++++++++++++++++++++++++
 man/SL.xgboost.Rd            |  39 +++++++++++++
 man/create.SL.xgboost.Rd     |  40 +++++++++++++
 man/predict.SL.xgboost.Rd    |  12 ++++
 tests/testthat/testXGBoost.R |  42 ++++++++++++++
 7 files changed, 284 insertions(+), 5 deletions(-)
 mode change 100755 => 100644 DESCRIPTION
 create mode 100644 R/SL.xgboost.R
 create mode 100644 man/SL.xgboost.Rd
 create mode 100644 man/create.SL.xgboost.Rd
 create mode 100644 man/predict.SL.xgboost.Rd
 create mode 100644 tests/testthat/testXGBoost.R

diff --git a/DESCRIPTION b/DESCRIPTION
old mode 100755
new mode 100644
index 9186621..4627a56
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -5,10 +5,46 @@ Version: 2.0-20
 Date: 2016-04-06
 Author: Eric Polley, Erin LeDell, Mark van der Laan
 Maintainer: Eric Polley
-Description: Implements the super learner prediction method and contains a library of prediction algorithms to be used in the super learner.
+Description: Implements the super learner prediction method and contains a
+    library of prediction algorithms to be used in the super learner.
 License: GPL-3
-URL: https://github.com/ecpolley/SuperLearner
-Depends: R (>= 2.14.0), nnls
-Imports: cvAUC
-Suggests: arm, caret, class, e1071, earth, gam, gbm, genefilter, ggplot2, glmnet, Hmisc, ipred, lattice, LogicReg, MASS, mda, mlbench, nloptr, nnet, party, polspline, quadprog, randomForest, ROCR, rpart, SIS, spls, stepPlr, sva
+URL: https://github.com/ecpolley/SuperLearner
+Depends:
+    R (>= 2.14.0),
+    nnls
+Imports:
+    cvAUC
+Suggests:
+    arm,
+    caret,
+    class,
+    e1071,
+    earth,
+    gam,
+    gbm,
+    genefilter,
+    ggplot2,
+    glmnet,
+    Hmisc,
+    ipred,
+    lattice,
+    LogicReg,
+    MASS,
+    mda,
+    mlbench,
+    nloptr,
+    nnet,
+    party,
+    polspline,
+    quadprog,
+    randomForest,
+    ROCR,
+    rpart,
+    SIS,
+    spls,
+    stepPlr,
+    sva,
+    testthat,
+    xgboost
 LazyLoad: yes
+RoxygenNote: 5.0.1

diff --git a/NAMESPACE b/NAMESPACE
index 42a52ad..e276567 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ importFrom(graphics, plot)
 importFrom("stats", "as.formula", "binomial", "cor.test", "dlogis", "formula", "gaussian", "glm", "lm", "loess", "loess.control", "model.matrix", "optim",
   "plogis", "predict.lm", "qnorm", "reorder", "sd", "step", "var", "weighted.mean")
 importFrom("utils", "RShowDoc")
+import("xgboost")

 S3method(print, SuperLearner)
 S3method(coef, SuperLearner)
@@ -42,4 +43,5 @@ S3method(predict, SL.step)
 S3method(predict, SL.stepAIC)
 S3method(predict, SL.svm)
 S3method(predict, SL.template)
+S3method(predict, SL.xgboost)
 S3method(predict, SuperLearner)

diff --git a/R/SL.xgboost.R b/R/SL.xgboost.R
new file mode 100644
index 0000000..80f728a
--- /dev/null
+++ b/R/SL.xgboost.R
@@ -0,0 +1,108 @@
+#' XGBoost SuperLearner wrapper
+#'
+#' Supports the Extreme Gradient Boosting package (XGBoost), a variant of
+#' gradient boosted machines (GBM), for use with SuperLearner.
+#'
+#' The performance of XGBoost, like GBM, is sensitive to the configuration settings.
+#' Therefore it is best to create multiple configurations using create.SL.xgboost
+#' and allow the SuperLearner to choose the best weights based on cross-validated
+#' performance.
+#'
+#' @param family "gaussian" for regression, "binomial" for binary classification,
+#'   "multinomial" for multiclass classification.
+#' @param ntrees How many trees to fit. Low numbers may underfit but high numbers
+#'   may overfit, depending also on the shrinkage.
+#' @param max_depth How deep each tree can be. 1 means no interactions, i.e.,
+#'   decision stumps.
+#' @param shrinkage How much to shrink the predictions (the learning rate), in
+#'   order to reduce overfitting.
+#' @param minobspernode Minimum observations allowed per tree node, after which
+#'   no more splitting will occur.
+#' @param params Many other parameters can be customized. See
+#'   \url{https://github.com/dmlc/xgboost/blob/master/doc/parameter.md}
+#' @param nthread How many threads (cores) xgboost should use. Generally we want
+#'   to keep this at 1 so that XGBoost does not compete with SuperLearner
+#'   parallelization.
+#' @param verbose Verbosity of XGBoost fitting.
+#' @export
+SL.xgboost = function(Y, X, newX, family, obsWeights, id, ntrees = 1000,
+                      max_depth = 4, shrinkage = 0.1, minobspernode = 10,
+                      params = list(), nthread = 1, verbose = 0, ...) {
+  .SL.require("xgboost")
+
+  # Convert to an xgboost-compatible data matrix, using the sample weights.
+  xgmat = xgboost::xgb.DMatrix(data = as.matrix(X), label = Y, weight = obsWeights)
+
+  # TODO: support early stopping, which requires a "watchlist". See ?xgb.train
+
+  if (family$family == "gaussian") {
+    model = xgboost::xgboost(data = xgmat, objective = "reg:linear",
+      nrounds = ntrees, max_depth = max_depth, min_child_weight = minobspernode,
+      eta = shrinkage, verbose = verbose, nthread = nthread, params = params)
+  }
+  if (family$family == "binomial") {
+    model = xgboost::xgboost(data = xgmat, objective = "binary:logistic",
+      nrounds = ntrees, max_depth = max_depth, min_child_weight = minobspernode,
+      eta = shrinkage, verbose = verbose, nthread = nthread, params = params)
+  }
+  if (family$family == "multinomial") {
+    # TODO: test this.
+    model = xgboost::xgboost(data = xgmat, objective = "multi:softmax",
+      nrounds = ntrees, max_depth = max_depth, min_child_weight = minobspernode,
+      eta = shrinkage, verbose = verbose, num_class = length(unique(Y)),
+      nthread = nthread, params = params)
+  }
+  pred = predict(model, newdata = data.matrix(newX))
+  fit = list(object = model)
+  class(fit) = c("SL.xgboost")
+  out = list(pred = pred, fit = fit)
+  return(out)
+}
+
+#' XGBoost prediction on new data
+predict.SL.xgboost <- function(object, newdata, family, ...) {
+  .SL.require("xgboost")
+  # xgb.DMatrix requires a matrix, so coerce newdata (often a data frame) first.
+  pred <- predict(object$object, xgboost::xgb.DMatrix(as.matrix(newdata)))
+  return(pred)
+}
+
+#' Factory for XGBoost SL wrappers
+#'
+#' Create multiple configurations of XGBoost learners based on the desired
+#' combinations of hyperparameters.
+#'
+#' @param tune List of hyperparameter settings to test. If specified, a value (or
+#'   vector of values) must be supplied for every hyperparameter.
+#' @param detailed_names Set to TRUE to have the function names include the
+#'   parameter configurations.
+#' @param env Environment in which to create the SL.xgboost functions. Defaults
+#'   to the global environment.
+#' @param name_prefix The prefix string for the name of each function that is
+#'   generated.
+#'
+#' @examples
+#'
+#' # Create a new environment to store the learner functions.
+#' # This keeps the global environment organized.
+#' sl_env = new.env()
+#' # Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters.
+#' tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10,
+#'             shrinkage = c(0.1, 0.01, 0.001))
+#' # Generate a separate learner for each combination.
+#' xgb_grid = create.SL.xgboost(tune = tune, env = sl_env)
+#' # Review the function configurations.
+#' xgb_grid
+#' # Attach the environment so that the custom learner functions can be accessed.
+#' attach(sl_env)
+#' sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names)
+#' detach(sl_env)
+#' @export
+create.SL.xgboost = function(tune = list(ntrees = c(1000), max_depth = c(4),
+    shrinkage = c(0.1), minobspernode = c(10)), detailed_names = FALSE,
+    env = .GlobalEnv, name_prefix = "SL.xgb") {
+  # Create all combinations of hyperparameters, for grid-like search.
+  tuneGrid = expand.grid(tune, stringsAsFactors = FALSE)
+
+  names = rep("", nrow(tuneGrid))
+
+  for (i in seq_len(nrow(tuneGrid))) {
+    g = tuneGrid[i, ]
+    if (detailed_names) {
+      name = paste(name_prefix, g$ntrees, g$max_depth, g$shrinkage,
+                   g$minobspernode, sep = ".")
+    } else {
+      name = paste(name_prefix, i, sep = ".")
+    }
+    names[i] = name
+    # Define a custom learner with this configuration as its defaults.
+    eval(parse(text = paste0(name, " = function(..., ntrees = ", g$ntrees,
+      ", max_depth = ", g$max_depth, ", shrinkage = ", g$shrinkage,
+      ", minobspernode = ", g$minobspernode,
+      ") SL.xgboost(..., ntrees = ntrees, max_depth = max_depth, ",
+      "shrinkage = shrinkage, minobspernode = minobspernode)")), envir = env)
+  }
+  results = list(grid = tuneGrid, names = names)
+  invisible(results)
+}

diff --git a/man/SL.xgboost.Rd b/man/SL.xgboost.Rd
new file mode 100644
index 0000000..4f17148
--- /dev/null
+++ b/man/SL.xgboost.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SL.xgboost.R
+\name{SL.xgboost}
+\alias{SL.xgboost}
+\title{XGBoost SuperLearner wrapper}
+\usage{
+SL.xgboost(Y, X, newX, family, obsWeights, id, ntrees = 1000, max_depth = 4,
+  shrinkage = 0.1, minobspernode = 10, params = list(), nthread = 1,
+  verbose = 0, ...)
+}
+\arguments{
+\item{family}{"gaussian" for regression, "binomial" for binary classification,
+"multinomial" for multiclass classification.}
+
+\item{ntrees}{How many trees to fit. Low numbers may underfit but high numbers
+may overfit, depending also on the shrinkage.}
+
+\item{max_depth}{How deep each tree can be. 1 means no interactions, i.e.,
+decision stumps.}
+
+\item{shrinkage}{How much to shrink the predictions (the learning rate), in
+order to reduce overfitting.}
+
+\item{minobspernode}{Minimum observations allowed per tree node, after which no
+more splitting will occur.}
+
+\item{params}{Many other parameters can be customized. See
+\url{https://github.com/dmlc/xgboost/blob/master/doc/parameter.md}}
+
+\item{nthread}{How many threads (cores) xgboost should use. Generally we want
+to keep this at 1 so that XGBoost does not compete with SuperLearner
+parallelization.}
+
+\item{verbose}{Verbosity of XGBoost fitting.}
+}
+\description{
+Supports the Extreme Gradient Boosting package (XGBoost), a variant of gradient
+boosted machines (GBM), for use with SuperLearner.
+}
+\details{
+The performance of XGBoost, like GBM, is sensitive to the configuration settings.
+Therefore it is best to create multiple configurations using create.SL.xgboost
+and allow the SuperLearner to choose the best weights based on cross-validated
+performance.
+}
+

diff --git a/man/create.SL.xgboost.Rd b/man/create.SL.xgboost.Rd
new file mode 100644
index 0000000..338e32f
--- /dev/null
+++ b/man/create.SL.xgboost.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SL.xgboost.R
+\name{create.SL.xgboost}
+\alias{create.SL.xgboost}
+\title{Factory for XGBoost SL wrappers}
+\usage{
+create.SL.xgboost(tune = list(ntrees = c(1000), max_depth = c(4), shrinkage =
+  c(0.1), minobspernode = c(10)), detailed_names = FALSE, env = .GlobalEnv,
+  name_prefix = "SL.xgb")
+}
+\arguments{
+\item{tune}{List of hyperparameter settings to test. If specified, a value (or
+vector of values) must be supplied for every hyperparameter.}
+
+\item{detailed_names}{Set to TRUE to have the function names include the
+parameter configurations.}
+
+\item{env}{Environment in which to create the SL.xgboost functions. Defaults
+to the global environment.}
+
+\item{name_prefix}{The prefix string for the name of each function that is
+generated.}
+}
+\description{
+Create multiple configurations of XGBoost learners based on the desired
+combinations of hyperparameters.
+}
+\examples{
+
+# Create a new environment to store the learner functions.
+# This keeps the global environment organized.
+sl_env = new.env()
+# Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters.
+tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10,
+            shrinkage = c(0.1, 0.01, 0.001))
+# Generate a separate learner for each combination.
+xgb_grid = create.SL.xgboost(tune = tune, env = sl_env)
+# Review the function configurations.
+xgb_grid
+# Attach the environment so that the custom learner functions can be accessed.
+attach(sl_env)
+sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names)
+detach(sl_env)
+}
+

diff --git a/man/predict.SL.xgboost.Rd b/man/predict.SL.xgboost.Rd
new file mode 100644
index 0000000..2d2b711
--- /dev/null
+++ b/man/predict.SL.xgboost.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SL.xgboost.R
+\name{predict.SL.xgboost}
+\alias{predict.SL.xgboost}
+\title{XGBoost prediction on new data}
+\usage{
+predict.SL.xgboost(object, newdata, family, ...)
+}
+\description{
+XGBoost prediction on new data
+}
+

diff --git a/tests/testthat/testXGBoost.R b/tests/testthat/testXGBoost.R
new file mode 100644
index 0000000..236dbf3
--- /dev/null
+++ b/tests/testthat/testXGBoost.R
@@ -0,0 +1,42 @@
+library(testthat)
+library(xgboost)
+
+context("Learner: XGBoost")
+
+# Create a sample dataset for testing.
+set.seed(1)
+N <- 200
+X <- matrix(rnorm(N * 10), N, 10)
+X <- as.data.frame(X)
+Y_bin <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] +
+                             .1*X[, 3]*X[, 4] - .2*abs(X[, 4])))
+table(Y_bin)
+
+SL.library <- c("SL.glmnet", "SL.stepAIC", "SL.xgboost")
+
+# Test xgboost - binary classification.
+sl <- SuperLearner(Y = Y_bin, X = X, SL.library = SL.library, family = binomial())
+sl
+
+# Test xgboost - regression.
+Y_reg <- .2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] -
+  .2*abs(X[, 4]) + rnorm(N)
+summary(Y_reg)
+sl <- SuperLearner(Y = Y_reg, X = X, SL.library = SL.library, family = gaussian())
+sl
+
+# Test xgboost - multiclass classification.
+# TODO: add test here.
+
+test_that("Test create.SL.xgboost", {
+  # Create a new environment to hold the functions.
+  sl_env = new.env()
+  xgb_grid = create.SL.xgboost(tune = list(ntrees = c(100, 500),
+    max_depth = c(1, 2), minobspernode = 10, shrinkage = c(0.1, 0.01, 0.001)),
+    env = sl_env)
+  xgb_grid
+  xgb_functions = ls(sl_env)
+  expect_equal(length(xgb_functions), 12)
+  # Load the functions for use in the SuperLearner call.
+  attach(sl_env)
+  sl <- SuperLearner(Y = Y_reg, X = X,
+    SL.library = c(SL.library, xgb_grid$names), family = gaussian())
+  print(sl)
+  detach(sl_env)
+})
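
For reviewers who want to exercise the new wrapper directly, outside of a SuperLearner call, here is a minimal sketch. It is not part of the patch: it reuses the X and Y_bin objects defined in testXGBoost.R above and assumes the patched package is loaded.

# Fit the wrapper directly. obsWeights and id have no defaults in the SL
# wrapper API, so they must be supplied when calling it by hand.
fit <- SL.xgboost(Y = Y_bin, X = X, newX = X, family = binomial(),
                  obsWeights = rep(1, nrow(X)), id = seq_len(nrow(X)),
                  ntrees = 50)
summary(fit$pred)  # in-sample predicted probabilities for newX

# New-data prediction dispatches to predict.SL.xgboost, which coerces
# newdata to a matrix before building the xgb.DMatrix.
pred_new <- predict(fit$fit, newdata = X)
stopifnot(length(pred_new) == nrow(X))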