
[R] Make xgb.cv work with xgb.DMatrix only, adding support for survival and ranking fields (#10031)



---------

Co-authored-by: Philip Hyunsu Cho <[email protected]>
david-cortes and hcho3 authored Mar 31, 2024
1 parent 8bad677 commit bc9ea62
Showing 12 changed files with 283 additions and 90 deletions.
68 changes: 53 additions & 15 deletions R-package/R/utils.R
@@ -26,6 +26,11 @@ NVL <- function(x, val) {
'multi:softprob', 'rank:pairwise', 'rank:ndcg', 'rank:map'))
}

.RANKING_OBJECTIVES <- function() {
return(c('rank:pairwise', 'rank:ndcg', 'rank:map'))
}


#
# Low-level functions for boosting --------------------------------------------
@@ -235,33 +240,43 @@ convert.labels <- function(labels, objective_name) {
}

# Generates random (stratified if needed) CV folds
generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
if (NROW(group)) {
if (stratified) {
warning(
paste0(
"Stratified splitting is not supported when using 'group' attribute.",
" Will use unstratified splitting."
)
)
}
return(generate.group.folds(nfold, group))
}
objective <- params$objective
if (!is.character(objective)) {
warning("Will use unstratified splitting (custom objective used)")
stratified <- FALSE
}
# cannot stratify if label is NULL
if (stratified && is.null(label)) {
warning("Will use unstratified splitting (no 'labels' available)")
stratified <- FALSE
}

# cannot do it for rank
objective <- params$objective
if (is.character(objective) && strtrim(objective, 5) == 'rank:') {
stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n",
stop("\n\tAutomatic generation of CV-folds is not implemented for ranking without 'group' field!\n",
"\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")
}
# shuffle
rnd_idx <- sample.int(nrows)
if (stratified &&
length(label) == length(rnd_idx)) {
if (stratified && length(label) == length(rnd_idx)) {
y <- label[rnd_idx]
# WARNING: some heuristic logic is employed to identify classification setting!
# - For classification, need to convert y labels to factor before making the folds,
# and then do stratification by factor levels.
# - For regression, leave y numeric and do stratification by quantiles.
if (is.character(objective)) {
y <- convert.labels(y, params$objective)
} else {
# If no 'objective' given in params, it means that user either wants to
# use the default 'reg:squarederror' objective or has provided a custom
# obj function. Here, assume classification setting when y has 5 or less
# unique values:
if (length(unique(y)) <= 5) {
y <- factor(y)
}
y <- convert.labels(y, objective)
}
folds <- xgb.createFolds(y = y, k = nfold)
} else {
@@ -277,6 +292,29 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
return(folds)
}

generate.group.folds <- function(nfold, group) {
ngroups <- length(group) - 1
if (ngroups < nfold) {
stop("DMatrix has fewer groups than folds.")
}
seq_groups <- seq_len(ngroups)
indices <- lapply(seq_groups, function(gr) seq(group[gr] + 1, group[gr + 1]))
assignments <- base::split(seq_groups, as.integer(seq_groups %% nfold))
assignments <- unname(assignments)

out <- vector("list", nfold)
randomized_groups <- sample(ngroups)
for (idx in seq_len(nfold)) {
groups_idx_test <- randomized_groups[assignments[[idx]]]
groups_test <- indices[groups_idx_test]
idx_test <- unlist(groups_test)
attributes(idx_test)$group_test <- lengths(groups_test)
attributes(idx_test)$group_train <- lengths(indices[-groups_idx_test])
out[[idx]] <- idx_test
}
return(out)
}

# Creates CV folds stratified by the values of y.
# It was borrowed from caret::createFolds and simplified
# by always returning an unnamed list of fold indices.
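As an aside, below is a minimal standalone R sketch of the group-aware splitting idea implemented by generate.group.folds above. It assumes the group argument is the cumulative group pointer as returned by getinfo(dmat, "group"); the group sizes are made up for illustration, and the snippet mirrors rather than calls the package-internal function.

# Hypothetical example: 5 query groups with sizes 3, 2, 4, 3, 2
group_ptr <- c(0L, 3L, 5L, 9L, 12L, 14L)  # cumulative boundaries, as stored in a DMatrix 'group' field
nfold <- 2L
ngroups <- length(group_ptr) - 1L

# Base-1 row indices belonging to each group
indices <- lapply(seq_len(ngroups), function(gr) seq(group_ptr[gr] + 1L, group_ptr[gr + 1L]))

# Shuffle groups and assign them to folds round-robin, mirroring the code above
set.seed(1)
randomized_groups <- sample(ngroups)
assignments <- unname(split(seq_len(ngroups), as.integer(seq_len(ngroups) %% nfold)))

folds <- lapply(seq_len(nfold), function(k) {
  groups_idx_test <- randomized_groups[assignments[[k]]]
  idx_test <- unlist(indices[groups_idx_test])
  # Per-fold group sizes, later re-attached to the sliced DMatrices via setinfo()
  attr(idx_test, "group_test") <- lengths(indices[groups_idx_test])
  attr(idx_test, "group_train") <- lengths(indices[-groups_idx_test])
  idx_test
})
str(folds)
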
31 changes: 21 additions & 10 deletions R-package/R/xgb.DMatrix.R
@@ -1259,8 +1259,11 @@ xgb.get.DMatrix.data <- function(dmat) {
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param idxset a integer vector of indices of rows needed
#' @param object Object of class "xgb.DMatrix".
#' @param idxset An integer vector of indices of rows needed (base-1 indexing).
#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or
#' equivalently `qid`) field. Note that in such case, the result will not have
#' the groups anymore - they need to be set manually through `setinfo`.
#' @param colset currently not used (columns subsetting is not available)
#'
#' @examples
@@ -1275,11 +1278,11 @@ xgb.get.DMatrix.data <- function(dmat) {
#'
#' @rdname xgb.slice.DMatrix
#' @export
xgb.slice.DMatrix <- function(object, idxset) {
xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) {
if (!inherits(object, "xgb.DMatrix")) {
stop("object must be xgb.DMatrix")
}
ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset)
ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset, allow_groups)

attr_list <- attributes(object)
nr <- nrow(object)
@@ -1296,7 +1299,15 @@ xgb.slice.DMatrix <- function(object, idxset) {
}
}
}
return(structure(ret, class = "xgb.DMatrix"))

out <- structure(ret, class = "xgb.DMatrix")
parent_fields <- as.list(attributes(object)$fields)
if (NROW(parent_fields)) {
child_fields <- parent_fields[!(names(parent_fields) %in% c("group", "qid"))]
child_fields <- as.environment(child_fields)
attributes(out)$fields <- child_fields
}
return(out)
}

#' @rdname xgb.slice.DMatrix
@@ -1340,11 +1351,11 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
}

cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
infos <- character(0)
if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label'
if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight')
if (xgb.DMatrix.hasinfo(x, 'base_margin')) infos <- c(infos, 'base_margin')
if (length(infos) == 0) infos <- 'NA'
infos <- names(attributes(x)$fields)
infos <- infos[infos != "feature_name"]
if (!NROW(infos)) infos <- "NA"
infos <- infos[order(infos)]
infos <- paste(infos, collapse = ", ")
cat(infos)
cnames <- colnames(x)
cat(' colnames:')
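A brief usage sketch of the new allow_groups argument to xgb.slice.DMatrix: slicing drops the group/qid field, so it has to be re-attached with setinfo on the result. The data and group sizes below are made up, and the sketch assumes a package build that already contains this change.

library(xgboost)

# Toy ranking data: 10 rows in 3 query groups of sizes 4, 3, 3 (made-up numbers)
x <- matrix(rnorm(10 * 5), nrow = 10)
y <- sample(0:1, 10, replace = TRUE)
dm <- xgb.DMatrix(x, label = y, group = c(4L, 3L, 3L))

# Slice out the first group; the 'group' field is intentionally not carried over
dtest <- xgb.slice.DMatrix(dm, 1:4, allow_groups = TRUE)
setinfo(dtest, "group", 4L)  # re-attach the group sizes matching the sliced rows
print(dtest)
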
92 changes: 59 additions & 33 deletions R-package/R/xgb.cv.R
@@ -1,6 +1,6 @@
#' Cross Validation
#'
#' The cross validation function of xgboost
#' The cross validation function of xgboost.
#'
#' @param params the list of parameters. The complete list of parameters is
#' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
@@ -19,13 +19,17 @@
#'
#' See \code{\link{xgb.train}} for further details.
#' See also demo/ for walkthrough example in R.
#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
#'
#' Note that, while `params` accepts a `seed` entry and will use such parameter for model training if
#' supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG
#' system - thus, for reproducible results, one needs to call the `set.seed` function beforehand.
#' @param data An `xgb.DMatrix` object, with corresponding fields like `label` or bounds as required
#' for model training by the objective.
#'
#' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix`
#' or `xgb.ExternalDMatrix` are not supported here.
#' @param nrounds the max number of iterations
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param missing is only used when input is a dense matrix. By default is set to NA, which means
#' that NA values should be considered as 'missing' by the algorithm.
#' Sometimes, 0 or other extreme value might be used to represent missing values.
#' @param prediction A logical value indicating whether to return the test fold predictions
#' from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
@@ -47,13 +51,30 @@
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain.
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
#' by the values of outcome labels.
#' @param stratified A \code{boolean} indicating whether sampling of folds should be stratified
#' by the values of outcome labels. For real-valued labels in regression objectives,
#' stratification will be done by discretizing the labels into up to 5 buckets beforehand.
#'
#' If passing "auto", will be set to `TRUE` if the objective in `params` is a classification
#' objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to
#' `FALSE` otherwise.
#'
#' This parameter is ignored when `data` has a `group` field - in such case, the splitting
#' will be based on whole groups (note that this might make the folds have different sizes).
#'
#' Value `TRUE` here is \bold{not} supported for custom objectives.
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored.
#'
#' If `data` has a `group` field and the objective requires this field, each fold (list element)
#' must additionally have two attributes (retrievable through \link{attributes}) named `group_test`
#' and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to
#' the resulting DMatrices.
#' @param train_folds \code{list} list specifying which indices to use for training. If \code{NULL}
#' (the default) all indices not specified in \code{folds} will be used for training.
#'
#' This is not supported when `data` has `group` field.
#' @param verbose \code{boolean}, print the statistics during the process
#' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
#' Default is 1 which means all messages are printed. This parameter is passed to the
@@ -118,13 +139,14 @@
#' print(cv, verbose=TRUE)
#'
#' @export
xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
xgb.cv <- function(params = list(), data, nrounds, nfold,
prediction = FALSE, showsd = TRUE, metrics = list(),
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL,
obj = NULL, feval = NULL, stratified = "auto", folds = NULL, train_folds = NULL,
verbose = TRUE, print_every_n = 1L,
early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) {

check.deprecation(...)
stopifnot(inherits(data, "xgb.DMatrix"))
if (inherits(data, "xgb.DMatrix") && .Call(XGCheckNullPtr_R, data)) {
stop("'data' is an invalid 'xgb.DMatrix' object. Must be constructed again.")
}
@@ -137,16 +159,22 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
check.custom.obj()
check.custom.eval()

# Check the labels
if ((inherits(data, 'xgb.DMatrix') && !xgb.DMatrix.hasinfo(data, 'label')) ||
(!inherits(data, 'xgb.DMatrix') && is.null(label))) {
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
} else if (inherits(data, 'xgb.DMatrix')) {
if (!is.null(label))
warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix")
cv_label <- getinfo(data, 'label')
} else {
cv_label <- label
if (stratified == "auto") {
if (is.character(params$objective)) {
stratified <- (
(params$objective %in% .CLASSIFICATION_OBJECTIVES())
&& !(params$objective %in% .RANKING_OBJECTIVES())
)
} else {
stratified <- FALSE
}
}

# Check the labels and groups
cv_label <- getinfo(data, "label")
cv_group <- getinfo(data, "group")
if (!is.null(train_folds) && NROW(cv_group)) {
stop("'train_folds' is not supported for DMatrix object with 'group' field.")
}

# CV folds
Expand All @@ -157,7 +185,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
} else {
if (nfold <= 1)
stop("'nfold' must be > 1")
folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params)
folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, cv_group, params)
}

# Callbacks
Expand Down Expand Up @@ -195,20 +223,18 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing

# create the booster-folds
# train_folds
dall <- xgb.get.DMatrix(
data = data,
label = label,
missing = missing,
weight = NULL,
nthread = params$nthread
)
dall <- data
bst_folds <- lapply(seq_along(folds), function(k) {
dtest <- xgb.slice.DMatrix(dall, folds[[k]])
dtest <- xgb.slice.DMatrix(dall, folds[[k]], allow_groups = TRUE)
# code originally contributed by @RolandASc on stackoverflow
if (is.null(train_folds))
dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]))
dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]), allow_groups = TRUE)
else
dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]])
dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]], allow_groups = TRUE)
if (!is.null(attributes(folds[[k]])$group_test)) {
setinfo(dtest, "group", attributes(folds[[k]])$group_test)
setinfo(dtrain, "group", attributes(folds[[k]])$group_train)
}
bst <- xgb.Booster(
params = params,
cachelist = list(dtrain, dtest),
@@ -312,8 +338,8 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' print(cv)
#' print(cv, verbose=TRUE)
#'
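Finally, a hedged end-to-end sketch of running cross-validation on a ranking DMatrix with the revised interface: data must now be an xgb.DMatrix, stratified = "auto" resolves to FALSE for ranking objectives, and a 'group' field triggers the group-aware folds. Group counts, labels, and parameters are invented for illustration; fold assignment uses R's RNG, hence the set.seed call.

library(xgboost)

set.seed(123)  # train/test splits rely on R's RNG, so seed for reproducible folds

# Made-up learning-to-rank data: 12 query groups of 8 documents each
ngroups <- 12L
x <- matrix(rnorm(ngroups * 8 * 6), ncol = 6)
y <- sample(0:3, ngroups * 8, replace = TRUE)  # graded relevance labels
dtrain <- xgb.DMatrix(x, label = y, group = rep(8L, ngroups))

cv <- xgb.cv(
  params = list(objective = "rank:ndcg", eta = 0.1, max_depth = 3, nthread = 1),
  data = dtrain,
  nrounds = 10,
  nfold = 4,
  stratified = "auto",  # ignored here anyway, since the DMatrix carries a 'group' field
  verbose = FALSE
)
print(cv)
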
4 changes: 2 additions & 2 deletions R-package/man/print.xgb.cv.Rd
