From e439ba1ee778d57a8c3d49217b60d23460d16e3b Mon Sep 17 00:00:00 2001
From: Chris Kennedy
Date: Thu, 9 Jun 2016 19:27:39 -0700
Subject: [PATCH] Support XGBoost

---
 DESCRIPTION                  |  46 +++++++++++++--
 NAMESPACE                    |   2 +
 R/SL.xgboost.R               | 108 +++++++++++++++++++++++++++++++++++
 man/SL.xgboost.Rd            |  39 +++++++++++++
 man/create.SL.xgboost.Rd     |  40 +++++++++++++
 man/predict.SL.xgboost.Rd    |  12 ++++
 tests/testthat/testXGBoost.R |  42 ++++++++++++++
 7 files changed, 284 insertions(+), 5 deletions(-)
 mode change 100755 => 100644 DESCRIPTION
 create mode 100644 R/SL.xgboost.R
 create mode 100644 man/SL.xgboost.Rd
 create mode 100644 man/create.SL.xgboost.Rd
 create mode 100644 man/predict.SL.xgboost.Rd
 create mode 100644 tests/testthat/testXGBoost.R

diff --git a/DESCRIPTION b/DESCRIPTION
old mode 100755
new mode 100644
index 9186621..4627a56
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -5,10 +5,46 @@ Version: 2.0-20
 Date: 2016-04-06
 Author: Eric Polley, Erin LeDell, Mark van der Laan
 Maintainer: Eric Polley
-Description: Implements the super learner prediction method and contains a library of prediction algorithms to be used in the super learner.
+Description: Implements the super learner prediction method and contains a
+    library of prediction algorithms to be used in the super learner.
 License: GPL-3
-URL: https://github.com/ecpolley/SuperLearner
-Depends: R (>= 2.14.0), nnls
-Imports: cvAUC
-Suggests: arm, caret, class, e1071, earth, gam, gbm, genefilter, ggplot2, glmnet, Hmisc, ipred, lattice, LogicReg, MASS, mda, mlbench, nloptr, nnet, party, polspline, quadprog, randomForest, ROCR, rpart, SIS, spls, stepPlr, sva
+URL: https://github.com/ecpolley/SuperLearner
+Depends:
+    R (>= 2.14.0),
+    nnls
+Imports:
+    cvAUC
+Suggests:
+    arm,
+    caret,
+    class,
+    e1071,
+    earth,
+    gam,
+    gbm,
+    genefilter,
+    ggplot2,
+    glmnet,
+    Hmisc,
+    ipred,
+    lattice,
+    LogicReg,
+    MASS,
+    mda,
+    mlbench,
+    nloptr,
+    nnet,
+    party,
+    polspline,
+    quadprog,
+    randomForest,
+    ROCR,
+    rpart,
+    SIS,
+    spls,
+    stepPlr,
+    sva,
+    testthat,
+    xgboost
 LazyLoad: yes
+RoxygenNote: 5.0.1

diff --git a/NAMESPACE b/NAMESPACE
index 42a52ad..e276567 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ importFrom(graphics, plot)
 importFrom("stats", "as.formula", "binomial", "cor.test", "dlogis", "formula", "gaussian", "glm", "lm", "loess", "loess.control", "model.matrix", "optim",
   "plogis", "predict.lm", "qnorm", "reorder", "sd", "step", "var", "weighted.mean")
 importFrom("utils", "RShowDoc")
+import("xgboost")

 S3method(print, SuperLearner)
 S3method(coef, SuperLearner)
@@ -42,4 +43,5 @@ S3method(predict, SL.step)
 S3method(predict, SL.stepAIC)
 S3method(predict, SL.svm)
 S3method(predict, SL.template)
+S3method(predict, SL.xgboost)
 S3method(predict, SuperLearner)

diff --git a/R/SL.xgboost.R b/R/SL.xgboost.R
new file mode 100644
index 0000000..80f728a
--- /dev/null
+++ b/R/SL.xgboost.R
@@ -0,0 +1,108 @@
+#' XGBoost SuperLearner wrapper
+#'
+#' Supports the Extreme Gradient Boosting package (XGBoost), a variant of
+#' gradient boosted machines (GBM), for use with SuperLearner.
+#'
+#' The performance of XGBoost, like GBM, is sensitive to the configuration settings.
+#' Therefore it is best to create multiple configurations using create.SL.xgboost
+#' and allow the SuperLearner to choose the best weights based on cross-validated
+#' performance.
+#'
+#' @param family "gaussian" for regression, "binomial" for binary classification,
+#'   "multinomial" for multiclass classification.
+#' @param ntrees How many trees to fit. Low numbers may underfit but high numbers
+#'   may overfit, depending also on the shrinkage.
+#' @param max_depth How deep each tree can be. 1 means no interactions, i.e.,
+#'   decision stumps.
+#' @param shrinkage How much to shrink the predictions (the learning rate), in
+#'   order to reduce overfitting.
+#' @param minobspernode Minimum observations allowed per tree node, after which
+#'   no more splitting will occur.
+#' @param params Many other parameters can be customized. See
+#'   \url{https://github.com/dmlc/xgboost/blob/master/doc/parameter.md}
+#' @param nthread How many threads (cores) xgboost should use. Generally we want
+#'   to keep this at 1 so that XGBoost does not compete with SuperLearner
+#'   parallelization.
+#' @param verbose Verbosity of XGBoost fitting.
+#' @export
+SL.xgboost = function(Y, X, newX, family, obsWeights, id, ntrees = 1000,
+                      max_depth = 4, shrinkage = 0.1, minobspernode = 10,
+                      params = list(), nthread = 1, verbose = 0, ...) {
+  .SL.require("xgboost")
+
+  # Convert to an xgboost-compatible data matrix, using the sample weights.
+  xgmat = xgboost::xgb.DMatrix(data = as.matrix(X), label = Y, weight = obsWeights)
+
+  # TODO: support early stopping, which requires a "watchlist". See ?xgb.train
+
+  if (family$family == "gaussian") {
+    model = xgboost::xgboost(data = xgmat, objective = "reg:linear",
+      nrounds = ntrees, max_depth = max_depth, min_child_weight = minobspernode,
+      eta = shrinkage, verbose = verbose, nthread = nthread, params = params)
+  }
+  if (family$family == "binomial") {
+    model = xgboost::xgboost(data = xgmat, objective = "binary:logistic",
+      nrounds = ntrees, max_depth = max_depth, min_child_weight = minobspernode,
+      eta = shrinkage, verbose = verbose, nthread = nthread, params = params)
+  }
+  if (family$family == "multinomial") {
+    # TODO: test this.
+    model = xgboost::xgboost(data = xgmat, objective = "multi:softmax",
+      nrounds = ntrees, max_depth = max_depth, min_child_weight = minobspernode,
+      eta = shrinkage, verbose = verbose, num_class = length(unique(Y)),
+      nthread = nthread, params = params)
+  }
+  pred = predict(model, newdata = data.matrix(newX))
+  fit = list(object = model)
+  class(fit) = c("SL.xgboost")
+  out = list(pred = pred, fit = fit)
+  return(out)
+}
+
+#' XGBoost prediction on new data
+predict.SL.xgboost <- function(object, newdata, family, ...) {
+  .SL.require("xgboost")
+  # xgb.DMatrix requires a matrix, so coerce newdata (often a data frame) first.
+  pred <- predict(object$object, xgboost::xgb.DMatrix(as.matrix(newdata)))
+  return(pred)
+}
+
+#' Factory for XGBoost SL wrappers
+#'
+#' Create multiple configurations of XGBoost learners based on the desired
+#' combinations of hyperparameters.
+#'
+#' @param tune List of hyperparameter settings to test. If specified, a value (or
+#'   vector of values) must be supplied for every hyperparameter.
+#' @param detailed_names Set to TRUE to have the function names include the
+#'   parameter configurations.
+#' @param env Environment in which to create the SL.xgboost functions. Defaults
+#'   to the global environment.
+#' @param name_prefix The prefix string for the name of each function that is
+#'   generated.
+#'
+#' @examples
+#'
+#' # Create a new environment to store the learner functions.
+#' # This keeps the global environment organized.
+#' sl_env = new.env()
+#' # Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters.
+#' tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10,
+#'             shrinkage = c(0.1, 0.01, 0.001))
+#' # Generate a separate learner for each combination.
+#' xgb_grid = create.SL.xgboost(tune = tune, env = sl_env)
+#' # Review the function configurations.
+#' xgb_grid
+#' # Attach the environment so that the custom learner functions can be accessed.
+#' attach(sl_env)
+#' sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names)
+#' detach(sl_env)
+#' @export
+create.SL.xgboost = function(tune = list(ntrees = c(1000), max_depth = c(4),
+    shrinkage = c(0.1), minobspernode = c(10)), detailed_names = FALSE,
+    env = .GlobalEnv, name_prefix = "SL.xgb") {
+  # Create all combinations of hyperparameters, for grid-like search.
+  tuneGrid = expand.grid(tune, stringsAsFactors = FALSE)
+
+  names = rep("", nrow(tuneGrid))
+
+  for (i in seq_len(nrow(tuneGrid))) {
+    g = tuneGrid[i, ]
+    if (detailed_names) {
+      name = paste(name_prefix, g$ntrees, g$max_depth, g$shrinkage,
+                   g$minobspernode, sep = ".")
+    } else {
+      name = paste(name_prefix, i, sep = ".")
+    }
+    names[i] = name
+    # Define a custom learner with this configuration as its defaults.
+    eval(parse(text = paste0(name, " = function(..., ntrees = ", g$ntrees,
+      ", max_depth = ", g$max_depth, ", shrinkage = ", g$shrinkage,
+      ", minobspernode = ", g$minobspernode,
+      ") SL.xgboost(..., ntrees = ntrees, max_depth = max_depth, ",
+      "shrinkage = shrinkage, minobspernode = minobspernode)")), envir = env)
+  }
+  results = list(grid = tuneGrid, names = names)
+  invisible(results)
+}

diff --git a/man/SL.xgboost.Rd b/man/SL.xgboost.Rd
new file mode 100644
index 0000000..4f17148
--- /dev/null
+++ b/man/SL.xgboost.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SL.xgboost.R
+\name{SL.xgboost}
+\alias{SL.xgboost}
+\title{XGBoost SuperLearner wrapper}
+\usage{
+SL.xgboost(Y, X, newX, family, obsWeights, id, ntrees = 1000, max_depth = 4,
+  shrinkage = 0.1, minobspernode = 10, params = list(), nthread = 1,
+  verbose = 0, ...)
+}
+\arguments{
+\item{family}{"gaussian" for regression, "binomial" for binary classification,
+"multinomial" for multiclass classification.}
+
+\item{ntrees}{How many trees to fit. Low numbers may underfit but high numbers
+may overfit, depending also on the shrinkage.}
+
+\item{max_depth}{How deep each tree can be. 1 means no interactions, i.e.,
+decision stumps.}
+
+\item{shrinkage}{How much to shrink the predictions (the learning rate), in
+order to reduce overfitting.}
+
+\item{minobspernode}{Minimum observations allowed per tree node, after which no
+more splitting will occur.}
+
+\item{params}{Many other parameters can be customized. See
+\url{https://github.com/dmlc/xgboost/blob/master/doc/parameter.md}}
+
+\item{nthread}{How many threads (cores) xgboost should use. Generally we want
+to keep this at 1 so that XGBoost does not compete with SuperLearner
+parallelization.}
+
+\item{verbose}{Verbosity of XGBoost fitting.}
+}
+\description{
+Supports the Extreme Gradient Boosting package (XGBoost), a variant of gradient
+boosted machines (GBM), for use with SuperLearner.
+}
+\details{
+The performance of XGBoost, like GBM, is sensitive to the configuration settings.
+Therefore it is best to create multiple configurations using create.SL.xgboost
+and allow the SuperLearner to choose the best weights based on cross-validated
+performance.
+}
+

diff --git a/man/create.SL.xgboost.Rd b/man/create.SL.xgboost.Rd
new file mode 100644
index 0000000..338e32f
--- /dev/null
+++ b/man/create.SL.xgboost.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SL.xgboost.R
+\name{create.SL.xgboost}
+\alias{create.SL.xgboost}
+\title{Factory for XGBoost SL wrappers}
+\usage{
+create.SL.xgboost(tune = list(ntrees = c(1000), max_depth = c(4), shrinkage =
+  c(0.1), minobspernode = c(10)), detailed_names = FALSE, env = .GlobalEnv,
+  name_prefix = "SL.xgb")
+}
+\arguments{
+\item{tune}{List of hyperparameter settings to test. If specified, a value (or
+vector of values) must be supplied for every hyperparameter.}
+
+\item{detailed_names}{Set to TRUE to have the function names include the
+parameter configurations.}
+
+\item{env}{Environment in which to create the SL.xgboost functions. Defaults
+to the global environment.}
+
+\item{name_prefix}{The prefix string for the name of each function that is
+generated.}
+}
+\description{
+Create multiple configurations of XGBoost learners based on the desired
+combinations of hyperparameters.
+}
+\examples{
+
+# Create a new environment to store the learner functions.
+# This keeps the global environment organized.
+sl_env = new.env()
+# Create 2 * 2 * 1 * 3 = 12 combinations of hyperparameters.
+tune = list(ntrees = c(100, 500), max_depth = c(1, 2), minobspernode = 10,
+            shrinkage = c(0.1, 0.01, 0.001))
+# Generate a separate learner for each combination.
+xgb_grid = create.SL.xgboost(tune = tune, env = sl_env)
+# Review the function configurations.
+xgb_grid
+# Attach the environment so that the custom learner functions can be accessed.
+attach(sl_env)
+sl = SuperLearner(Y = Y, X = X, SL.library = xgb_grid$names)
+detach(sl_env)
+}
+

diff --git a/man/predict.SL.xgboost.Rd b/man/predict.SL.xgboost.Rd
new file mode 100644
index 0000000..2d2b711
--- /dev/null
+++ b/man/predict.SL.xgboost.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SL.xgboost.R
+\name{predict.SL.xgboost}
+\alias{predict.SL.xgboost}
+\title{XGBoost prediction on new data}
+\usage{
+predict.SL.xgboost(object, newdata, family, ...)
+}
+\description{
+XGBoost prediction on new data
+}
+

diff --git a/tests/testthat/testXGBoost.R b/tests/testthat/testXGBoost.R
new file mode 100644
index 0000000..236dbf3
--- /dev/null
+++ b/tests/testthat/testXGBoost.R
@@ -0,0 +1,42 @@
+library(testthat)
+library(xgboost)
+
+context("Learner: XGBoost")
+
+# Create a sample dataset for testing.
+set.seed(1)
+N <- 200
+X <- matrix(rnorm(N * 10), N, 10)
+X <- as.data.frame(X)
+Y_bin <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] +
+                             .1*X[, 3]*X[, 4] - .2*abs(X[, 4])))
+table(Y_bin)
+
+SL.library <- c("SL.glmnet", "SL.stepAIC", "SL.xgboost")
+
+# Test xgboost - binary classification.
+sl <- SuperLearner(Y = Y_bin, X = X, SL.library = SL.library, family = binomial())
+sl
+
+# Test xgboost - regression.
+Y_reg <- .2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] -
+  .2*abs(X[, 4]) + rnorm(N)
+summary(Y_reg)
+sl <- SuperLearner(Y = Y_reg, X = X, SL.library = SL.library, family = gaussian())
+sl
+
+# Test xgboost - multiclass classification.
+# TODO: add test here.
+
+test_that("Test create.SL.xgboost", {
+  # Create a new environment to hold the functions.
+  sl_env = new.env()
+  xgb_grid = create.SL.xgboost(tune = list(ntrees = c(100, 500),
+    max_depth = c(1, 2), minobspernode = 10, shrinkage = c(0.1, 0.01, 0.001)),
+    env = sl_env)
+  xgb_grid
+  xgb_functions = ls(sl_env)
+  expect_equal(length(xgb_functions), 12)
+  # Load the functions for use in the SuperLearner call.
+  attach(sl_env)
+  sl <- SuperLearner(Y = Y_reg, X = X,
+    SL.library = c(SL.library, xgb_grid$names), family = gaussian())
+  print(sl)
+  detach(sl_env)
+})
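
For reviewers who want to exercise the new wrapper directly, outside of a SuperLearner call, here is a minimal sketch. It is not part of the patch: it reuses the X and Y_bin objects defined in testXGBoost.R above and assumes the patched package is loaded.

# Fit the wrapper directly. obsWeights and id have no defaults in the SL
# wrapper API, so they must be supplied when calling it by hand.
fit <- SL.xgboost(Y = Y_bin, X = X, newX = X, family = binomial(),
                  obsWeights = rep(1, nrow(X)), id = seq_len(nrow(X)),
                  ntrees = 50)
summary(fit$pred)  # in-sample predicted probabilities for newX

# New-data prediction dispatches to predict.SL.xgboost, which coerces
# newdata to a matrix before building the xgb.DMatrix.
pred_new <- predict(fit$fit, newdata = X)
stopifnot(length(pred_new) == nrow(X))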