Merge pull request #35 from ck37/xgb
Add xgboost support
ecpolley authored Jun 16, 2016
2 parents fd1f0fa + e439ba1 commit e164242
Showing 7 changed files with 284 additions and 5 deletions.
46 changes: 41 additions & 5 deletions DESCRIPTION
File mode changed: 100755 → 100644
@@ -5,10 +5,46 @@ Version: 2.0-20
 Date: 2016-04-06
 Author: Eric Polley, Erin LeDell, Mark van der Laan
 Maintainer: Eric Polley <[email protected]>
-Description: Implements the super learner prediction method and contains a library of prediction algorithms to be used in the super learner.
+Description: Implements the super learner prediction method and contains a
+    library of prediction algorithms to be used in the super learner.
 License: GPL-3
-Depends: R (>= 2.14.0), nnls
-Imports: cvAUC
-Suggests: arm, caret, class, e1071, earth, gam, gbm, genefilter, ggplot2, glmnet, Hmisc, ipred, lattice, LogicReg, MASS, mda, mlbench, nloptr, nnet, party, polspline, quadprog, randomForest, ROCR, rpart, SIS, spls, stepPlr, sva
-URL: https://github.com/ecpolley/SuperLearner
+URL: https://github.com/ecpolley/SuperLearner
+Depends:
+    R (>= 2.14.0),
+    nnls
+Imports:
+    cvAUC
+Suggests:
+    arm,
+    caret,
+    class,
+    e1071,
+    earth,
+    gam,
+    gbm,
+    genefilter,
+    ggplot2,
+    glmnet,
+    Hmisc,
+    ipred,
+    lattice,
+    LogicReg,
+    MASS,
+    mda,
+    mlbench,
+    nloptr,
+    nnet,
+    party,
+    polspline,
+    quadprog,
+    randomForest,
+    ROCR,
+    rpart,
+    SIS,
+    spls,
+    stepPlr,
+    sva,
+    testthat,
+    xgboost
 LazyLoad: yes
+RoxygenNote: 5.0.1
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -7,6 +7,7 @@ importFrom(graphics, plot)
 
 importFrom("stats", "as.formula", "binomial", "cor.test", "dlogis", "formula", "gaussian", "glm", "lm", "loess", "loess.control", "model.matrix", "optim", "plogis", "predict.lm", "qnorm", "reorder", "sd", "step", "var", "weighted.mean")
 importFrom("utils", "RShowDoc")
+import("xgboost")
 
 S3method(print, SuperLearner)
 S3method(coef, SuperLearner)
@@ -42,4 +43,5 @@ S3method(predict, SL.step)
 S3method(predict, SL.stepAIC)
 S3method(predict, SL.svm)
 S3method(predict, SL.template)
+S3method(predict, SL.xgboost)
 S3method(predict, SuperLearner)
108 changes: 108 additions & 0 deletions R/SL.xgboost.R
@@ -0,0 +1,108 @@
#' XGBoost SuperLearner wrapper
#'
#' Supports the Extreme Gradient Boosting (XGBoost) package for use with
#' SuperLearner. XGBoost is a variant of gradient boosted machines (GBM).
#'
#' The performance of XGBoost, like GBM, is sensitive to the configuration settings.
#' Therefore it is best to create multiple configurations using create.SL.xgboost
#' and allow the SuperLearner to choose the best weights based on cross-validated
#' performance.
#'
#' @param family "gaussian" for regression, "binomial" for binary classification,
#'   "multinomial" for multiclass classification.
#' @param ntrees How many trees to fit. Low numbers may underfit but high numbers may overfit, depending also on the shrinkage.
#' @param max_depth How deep each tree can be. 1 means no interactions, aka tree stumps.
#' @param shrinkage How much to shrink the predictions, in order to reduce overfitting.
#' @param minobspernode Minimum number of observations per tree node; a node with fewer observations will not be split further.
#' @param params List of additional parameters to pass to xgboost. See \url{https://github.com/dmlc/xgboost/blob/master/doc/parameter.md}
#' @param nthread How many threads (cores) xgboost should use. Generally keep this at 1 so that xgboost does not compete with SuperLearner's own parallelization.
#' @param verbose Verbosity of the xgboost fitting.
#' @export
SL.xgboost = function(Y, X, newX, family, obsWeights, id, ntrees = 1000,
                      max_depth = 4, shrinkage = 0.1, minobspernode = 10,
                      params = list(), nthread = 1, verbose = 0, ...) {
  .SL.require("xgboost")

  # Convert to an xgboost-compatible data matrix, using the sample weights.
  xgmat = xgboost::xgb.DMatrix(data = as.matrix(X), label = Y, weight = obsWeights)

  # TODO: support early stopping, which requires a "watchlist". See ?xgb.train
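  # A rough sketch of how that might look (untested; assumes a recent xgboost
  # release where xgb.train() accepts watchlist and early_stopping_rounds):
  #   watchlist = list(train = xgmat)
  #   model = xgboost::xgb.train(params = params, data = xgmat, nrounds = ntrees,
  #                              watchlist = watchlist, early_stopping_rounds = 10)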

  if (family$family == "gaussian") {
    model = xgboost::xgboost(data = xgmat, objective = "reg:linear", nrounds = ntrees,
                             max_depth = max_depth, min_child_weight = minobspernode,
                             eta = shrinkage, verbose = verbose, nthread = nthread,
                             params = params)
  }
  if (family$family == "binomial") {
    model = xgboost::xgboost(data = xgmat, objective = "binary:logistic", nrounds = ntrees,
                             max_depth = max_depth, min_child_weight = minobspernode,
                             eta = shrinkage, verbose = verbose, nthread = nthread,
                             params = params)
  }
  if (family$family == "multinomial") {
    # TODO: test this. Note that multi:softmax returns the predicted class,
    # not class probabilities.
    model = xgboost::xgboost(data = xgmat, objective = "multi:softmax", nrounds = ntrees,
                             max_depth = max_depth, min_child_weight = minobspernode,
                             eta = shrinkage, num_class = length(unique(Y)),
                             verbose = verbose, nthread = nthread, params = params)
  }
  pred = predict(model, newdata = data.matrix(newX))
  fit = list(object = model)
  class(fit) = c("SL.xgboost")
  out = list(pred = pred, fit = fit)
  return(out)
}

#' XGBoost prediction on new data
predict.SL.xgboost <- function(object, newdata, family, ...) {
  .SL.require("xgboost")
  # xgb.DMatrix does not accept data frames, so convert newdata to a matrix first.
  pred <- predict(object$object, xgboost::xgb.DMatrix(as.matrix(newdata)))
  return(pred)
}
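
# A sketch of the full fit/predict round trip (illustrative only; assumes X and
# newX are data frames of numeric covariates and Y is a numeric outcome):
#   fit = SL.xgboost(Y = Y, X = X, newX = newX, family = gaussian(),
#                    obsWeights = rep(1, length(Y)), id = NULL)
#   fit$pred                          # predictions on newX from the fit
#   predict(fit$fit, newdata = newX)  # dispatches to predict.SL.xgboost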

#' Factory for XGBoost SL wrappers
#'
#' Create multiple configurations of XGBoost learners based on the desired combinations of hyperparameters.
#'
#' @param tune List of hyperparameter settings to test. If specified, a value
#'   must be supplied for each of ntrees, max_depth, shrinkage, and minobspernode.
#' @param detailed_names Set to TRUE to have the generated function names include
#'   the parameter configurations.
#' @param env Environment in which to create the SL.xgboost functions. Defaults to the global environment.
#' @param name_prefix The prefix string for the name of each function that is generated.
#'
#' @examples
#'
#' # Create a new environment to store the learner functions.
#' # This keeps the global environment organized.
#' sl_env = new.env()
#' # Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters.
#' tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10,
#' shrinkage = c(0.1, 0.01, 0.001))
#' # Generate a separate learner for each combination.
#' xgb_grid = create.SL.xgboost(tune = tune, env = sl_env)
#' # Review the function configurations.
#' xgb_grid
#' # Attach the environment so that the custom learner functions can be accessed.
#' attach(sl_env)
#' # Y and X are assumed to already be defined (outcome vector, covariate data frame).
#' sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names)
#' detach(sl_env)
#' @export
create.SL.xgboost = function(tune = list(ntrees = c(1000), max_depth = c(4),
                                         shrinkage = c(0.1), minobspernode = c(10)),
                             detailed_names = FALSE, env = .GlobalEnv,
                             name_prefix = "SL.xgb") {
  # Create all combinations of hyperparameters, for grid-like search.
  tuneGrid = expand.grid(tune, stringsAsFactors = FALSE)

  names = rep("", nrow(tuneGrid))

  for (i in seq(nrow(tuneGrid))) {
    g = tuneGrid[i, ]
    if (detailed_names) {
      name = paste(name_prefix, g$ntrees, g$max_depth, g$shrinkage, g$minobspernode, sep = ".")
    } else {
      name = paste(name_prefix, i, sep = ".")
    }
    names[i] = name
    # Define each learner as a thin wrapper around SL.xgboost with the
    # hyperparameters baked in as defaults, then create it in env.
    fn = paste0(name, " = function(..., ntrees = ", g$ntrees,
                ", max_depth = ", g$max_depth,
                ", shrinkage = ", g$shrinkage,
                ", minobspernode = ", g$minobspernode,
                ") SL.xgboost(..., ntrees = ntrees, max_depth = max_depth,",
                " shrinkage = shrinkage, minobspernode = minobspernode)")
    eval(parse(text = fn), envir = env)
  }
  results = list(grid = tuneGrid, names = names)
  invisible(results)
}
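
For a concrete sense of what the factory produces: with the default tune list there is a single hyperparameter combination, so create.SL.xgboost() writes one wrapper named SL.xgb.1 into env. The generated code is equivalent to this sketch:

SL.xgb.1 = function(..., ntrees = 1000, max_depth = 4, shrinkage = 0.1,
                    minobspernode = 10) {
  SL.xgboost(..., ntrees = ntrees, max_depth = max_depth,
             shrinkage = shrinkage, minobspernode = minobspernode)
}

The generated names are returned in the names element, so they can be passed directly as (part of) the SL.library argument to SuperLearner.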
39 changes: 39 additions & 0 deletions man/SL.xgboost.Rd


40 changes: 40 additions & 0 deletions man/create.SL.xgboost.Rd


12 changes: 12 additions & 0 deletions man/predict.SL.xgboost.Rd


42 changes: 42 additions & 0 deletions tests/testthat/testXGBoost.R
@@ -0,0 +1,42 @@
library(testthat)
library(SuperLearner)
library(xgboost)

context("Learner: XGBoost")

# Create sample dataset for testing.
set.seed(1)
N <- 200
X <- matrix(rnorm(N*10), N, 10)
X <- as.data.frame(X)
Y_bin <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4])))
table(Y_bin)

SL.library <- c("SL.glmnet", "SL.stepAIC", "SL.xgboost")

# Test xgboost - binary classification
sl <- SuperLearner(Y = Y_bin, X = X, SL.library = SL.library, family = binomial())
sl

# Test xgboost - regression
Y_reg <- .2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]) + rnorm(N)
summary(Y_reg)
sl <- SuperLearner(Y = Y_reg, X = X, SL.library = SL.library, family = gaussian())
sl

# Test xgboost - multiclass classification
# TODO: add test here.
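# One possible starting point (untested sketch; note that SL.xgboost's
# multinomial branch expects family$family == "multinomial", and multi:softmax
# returns class labels rather than probabilities):
#   Y_multi <- sample(0:2, N, replace = TRUE)
#   fit <- SL.xgboost(Y = Y_multi, X = X, newX = X,
#                     family = list(family = "multinomial"),
#                     obsWeights = rep(1, N), id = NULL)
#   table(fit$pred, Y_multi)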

test_that("Test create.SL.xgboost", {
  # Create a new environment to hold the functions.
  sl_env = new.env()
  xgb_grid = create.SL.xgboost(tune = list(ntrees = c(100, 500), max_depth = c(1, 2),
                                           minobspernode = 10,
                                           shrinkage = c(0.1, 0.01, 0.001)),
                               env = sl_env)
  xgb_grid
  xgb_functions = ls(sl_env)
  expect_equal(length(xgb_functions), 12)
  # Load the functions for use in the SuperLearner call.
  attach(sl_env)
  sl <- SuperLearner(Y = Y_reg, X = X, SL.library = c(SL.library, xgb_grid$names),
                     family = gaussian())
  print(sl)
  detach(sl_env)
})
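
To run just this file during development, one option (assuming the working directory is the package root and SuperLearner is installed) is:

testthat::test_file("tests/testthat/testXGBoost.R")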
