Add xgboost support #35

Merged · 1 commit · Jun 16, 2016
46 changes: 41 additions & 5 deletions DESCRIPTION
100755 → 100644
@@ -5,10 +5,46 @@
 Version: 2.0-20
 Date: 2016-04-06
 Author: Eric Polley, Erin LeDell, Mark van der Laan
 Maintainer: Eric Polley <[email protected]>
-Description: Implements the super learner prediction method and contains a library of prediction algorithms to be used in the super learner.
+Description: Implements the super learner prediction method and contains a
+    library of prediction algorithms to be used in the super learner.
 License: GPL-3
-URL: https://github.com/ecpolley/SuperLearner
-Depends: R (>= 2.14.0), nnls
-Imports: cvAUC
-Suggests: arm, caret, class, e1071, earth, gam, gbm, genefilter, ggplot2, glmnet, Hmisc, ipred, lattice, LogicReg, MASS, mda, mlbench, nloptr, nnet, party, polspline, quadprog, randomForest, ROCR, rpart, SIS, spls, stepPlr, sva
+URL: https://github.com/ecpolley/SuperLearner
+Depends:
+    R (>= 2.14.0),
+    nnls
+Imports:
+    cvAUC
+Suggests:
+    arm,
+    caret,
+    class,
+    e1071,
+    earth,
+    gam,
+    gbm,
+    genefilter,
+    ggplot2,
+    glmnet,
+    Hmisc,
+    ipred,
+    lattice,
+    LogicReg,
+    MASS,
+    mda,
+    mlbench,
+    nloptr,
+    nnet,
+    party,
+    polspline,
+    quadprog,
+    randomForest,
+    ROCR,
+    rpart,
+    SIS,
+    spls,
+    stepPlr,
+    sva,
+    testthat,
+    xgboost
 LazyLoad: yes
 RoxygenNote: 5.0.1
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -7,6 +7,7 @@ importFrom(graphics, plot)
 
 importFrom("stats", "as.formula", "binomial", "cor.test", "dlogis", "formula", "gaussian", "glm", "lm", "loess", "loess.control", "model.matrix", "optim", "plogis", "predict.lm", "qnorm", "reorder", "sd", "step", "var", "weighted.mean")
 importFrom("utils", "RShowDoc")
+import("xgboost")
 
 S3method(print, SuperLearner)
 S3method(coef, SuperLearner)
@@ -42,4 +43,5 @@ S3method(predict, SL.step)
 S3method(predict, SL.stepAIC)
 S3method(predict, SL.svm)
 S3method(predict, SL.template)
+S3method(predict, SL.xgboost)
 S3method(predict, SuperLearner)
108 changes: 108 additions & 0 deletions R/SL.xgboost.R
@@ -0,0 +1,108 @@
#' XGBoost SuperLearner wrapper
#'
#' Wrapper for the Extreme Gradient Boosting (XGBoost) package, a variant of
#' gradient boosted machines (GBM), for use with SuperLearner.
#'
#' The performance of XGBoost, like GBM, is sensitive to the configuration settings.
#' Therefore it is best to create multiple configurations using create.SL.xgboost
#' and allow the SuperLearner to choose the best weights based on cross-validated
#' performance.
#'
#' @param family "gaussian" for regression, "binomial" for binary classification,
#'   "multinomial" for multiclass classification.
#' @param ntrees Number of trees to fit. Low values may underfit and high values may
#'   overfit, depending also on the shrinkage.
#' @param max_depth Maximum depth of each tree. 1 means no interactions, i.e. tree stumps.
#' @param shrinkage Learning rate: how much each tree's contribution is shrunk, in order
#'   to reduce overfitting.
#' @param minobspernode Minimum number of observations per tree node; nodes with fewer
#'   observations are not split further.
#' @param params List of additional xgboost parameters; see
#'   \url{https://github.com/dmlc/xgboost/blob/master/doc/parameter.md}
#' @param nthread Number of threads (cores) for xgboost to use. Generally this should be
#'   kept at 1 so that xgboost does not compete with SuperLearner's own parallelization.
#' @param verbose Verbosity of the xgboost fitting (0 = silent).
#' @export
SL.xgboost = function(Y, X, newX, family, obsWeights, id, ntrees = 1000,
                      max_depth = 4, shrinkage = 0.1, minobspernode = 10,
                      params = list(), nthread = 1, verbose = 0, ...) {
.SL.require("xgboost")

# Convert to an xgboost compatible data matrix, using the sample weights.
xgmat = xgboost::xgb.DMatrix(data=as.matrix(X), label=Y, weight = obsWeights)

# TODO: support early stopping, which requires a "watchlist". See ?xgb.train

if (family$family == "gaussian") {
model = xgboost::xgboost(data=xgmat, objective="reg:linear", nround = ntrees,
max_depth = max_depth, minchildweight = minobspernode, eta = shrinkage, verbose=verbose,
nthread = nthread, params = params)
}
if (family$family == "binomial") {
model = xgboost::xgboost(data=xgmat, objective="binary:logistic", nround = ntrees,
max_depth = max_depth, minchildweight = minobspernode, eta = shrinkage, verbose=verbose,
nthread = nthread, params = params)
}
if (family$family == "multinomial") {
# TODO: test this.
model = xgboost::xgboost(data=xgmat, objective="multi:softmax", nround = ntrees,
max_depth = max_depth, minchildweight = minobspernode, eta = shrinkage, verbose=verbose,
num_class=length(unique(Y)), nthread = nthread, params = params)
}
pred = predict(model, newdata=data.matrix(newX))
fit = list(object = model)
class(fit) = c("SL.xgboost")
out = list(pred = pred, fit = fit)
return(out)
}

#' XGBoost prediction on new data
predict.SL.xgboost <- function(object, newdata, family, ...) {
.SL.require("xgboost")
  # Convert newdata (typically a data frame) to a matrix, as required by xgb.DMatrix.
  pred <- predict(object$object, xgboost::xgb.DMatrix(as.matrix(newdata)))
return(pred)
}

#' Factory for XGBoost SL wrappers
#'
#' Create multiple configurations of XGBoost learners based on the desired combinations of hyperparameters.
#'
#' @param tune List of hyperparameter settings to test. Each hyperparameter in the list
#'   must be given at least one value.
#' @param detailed_names Set to TRUE to have the generated function names include the
#'   parameter configuration.
#' @param env Environment in which to create the SL.xgboost functions. Defaults to the
#'   global environment.
#' @param name_prefix The prefix string for the name of each generated function.
#'
#' @examples
#'
#' # Create a new environment to store the learner functions.
#' # This keeps the global environment organized.
#' sl_env = new.env()
#' # Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters.
#' tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10,
#' shrinkage = c(0.1, 0.01, 0.001))
#' # Generate a separate learner for each combination.
#' xgb_grid = create.SL.xgboost(tune = tune, env = sl_env)
#' # Review the function configurations.
#' xgb_grid
#' # Attach the environment so that the custom learner functions can be accessed.
#' attach(sl_env)
#' # Simulated training data, for illustration only.
#' X = data.frame(x1 = rnorm(100), x2 = rnorm(100))
#' Y = X$x1 + rnorm(100)
#' sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names)
#' detach(sl_env)
#' @export
create.SL.xgboost = function(tune = list(ntrees = c(1000), max_depth = c(4), shrinkage = c(0.1),
                             minobspernode = c(10)), detailed_names = FALSE, env = .GlobalEnv,
                             name_prefix = "SL.xgb") {
  # Create all combinations of hyperparameters, for grid-like search.
  tuneGrid = expand.grid(tune, stringsAsFactors = FALSE)

names = rep("", nrow(tuneGrid))

for (i in seq(nrow(tuneGrid))) {
g = tuneGrid[i,]
if (detailed_names) {
name = paste(name_prefix, g$ntrees, g$max_depth, g$shrinkage, g$minobspernode, sep=".")
} else {
name = paste(name_prefix, i, sep=".")
}
names[i] = name
    code = paste0(name, " = function(..., ntrees = ", g$ntrees,
                  ", max_depth = ", g$max_depth, ", shrinkage = ", g$shrinkage,
                  ", minobspernode = ", g$minobspernode,
                  ") SL.xgboost(..., ntrees = ntrees, max_depth = max_depth,",
                  " shrinkage = shrinkage, minobspernode = minobspernode)")
    eval(parse(text = code), envir = env)
}
results = list(grid = tuneGrid, names = names)
invisible(results)
}
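
Reviewer note: a minimal sketch for exercising the new wrapper directly, outside of a SuperLearner call. The simulated data and object names below are illustrative and not part of this diff:

    library(SuperLearner)
    set.seed(1)
    n <- 100
    X <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
    Y <- X$x1 + rnorm(n)
    # family, obsWeights, and id are normally supplied by SuperLearner itself.
    fit <- SL.xgboost(Y = Y, X = X, newX = X, family = gaussian(),
                      obsWeights = rep(1, n), id = seq(n), ntrees = 50)
    head(fit$pred)  # in-sample predictions returned by the wrapper
    # New-data prediction dispatches to predict.SL.xgboost via the S3 registration above.
    head(predict(fit$fit, newdata = X))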
39 changes: 39 additions & 0 deletions man/SL.xgboost.Rd


40 changes: 40 additions & 0 deletions man/create.SL.xgboost.Rd


12 changes: 12 additions & 0 deletions man/predict.SL.xgboost.Rd


42 changes: 42 additions & 0 deletions tests/testthat/testXGBoost.R
@@ -0,0 +1,42 @@
library(testthat)
library(SuperLearner)
library(xgboost)

context("Learner: XGBoost")

# Create sample dataset for testing.
set.seed(1)
N <- 200
X <- matrix(rnorm(N*10), N, 10)
X <- as.data.frame(X)
Y_bin <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4])))
table(Y_bin)

SL.library <- c("SL.glmnet", "SL.stepAIC", "SL.xgboost")

# Test xgboost - binary classification
sl <- SuperLearner(Y = Y_bin, X = X, SL.library = SL.library, family = binomial())
sl

# Test xgboost - regression
Y_reg <- .2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]) + rnorm(N)
summary(Y_reg)
sl <- SuperLearner(Y = Y_reg, X = X, SL.library = SL.library, family = gaussian())
sl

# Test xgboost - multi-classification
# TODO: add test here.
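# A hedged sketch for the TODO above (not part of the original PR): stats
# provides no multinomial() family object, so a bare list with a $family
# element is passed straight to the wrapper.
Y_multi <- sample(0:2, N, replace = TRUE)
fit_multi <- SL.xgboost(Y = Y_multi, X = X, newX = X,
                        family = list(family = "multinomial"),
                        obsWeights = rep(1, N), id = seq(N), ntrees = 20)
table(fit_multi$pred, Y_multi)  # multi:softmax returns class labels, not probabilities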

test_that("Test create.SL.xgboost", {
# Create a new environment to hold the functions.
sl_env = new.env()
xgb_grid = create.SL.xgboost(tune = list(ntrees = c(100, 500), max_depth = c(1, 2),
minobspernode = 10, shrinkage = c(0.1, 0.01, 0.001)), env = sl_env)
xgb_grid
xgb_functions = ls(sl_env)
expect_equal(length(xgb_functions), 12)
# Load the functions for use in the SuperLearner call.
attach(sl_env)
sl <- SuperLearner(Y = Y_reg, X = X, SL.library = c(SL.library, xgb_grid$names), family = gaussian())
print(sl)
detach(sl_env)
})
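
Reviewer note on the factory design: create.SL.xgboost builds each learner with eval(parse()). A sketch of an equivalent closure-based alternative (make_learner is a hypothetical helper, not part of this PR); both approaches yield named functions that SuperLearner can look up:

    make_learner = function(ntrees, max_depth, shrinkage, minobspernode) {
      # force() pins the current grid values instead of leaving lazy promises.
      force(ntrees); force(max_depth); force(shrinkage); force(minobspernode)
      function(...) SL.xgboost(..., ntrees = ntrees, max_depth = max_depth,
                               shrinkage = shrinkage, minobspernode = minobspernode)
    }
    # e.g. assign("SL.xgb.1", make_learner(100, 1, 0.1, 10), envir = sl_env)

The string-building version used in the PR has one advantage: the generated functions print with their literal default arguments, which makes the grid easy to inspect.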