Skip to content

Commit

Permalink
[R] On-demand serialization + standardization of attributes (#9924)
Browse files Browse the repository at this point in the history

---------

Co-authored-by: Jiaming Yuan <[email protected]>
  • Loading branch information
david-cortes and trivialfis authored Jan 10, 2024
1 parent 01c4711 commit d3a8d28
Show file tree
Hide file tree
Showing 64 changed files with 1,773 additions and 1,281 deletions.
13 changes: 9 additions & 4 deletions R-package/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@

S3method("[",xgb.DMatrix)
S3method("dimnames<-",xgb.DMatrix)
S3method(coef,xgb.Booster)
S3method(dim,xgb.DMatrix)
S3method(dimnames,xgb.DMatrix)
S3method(getinfo,xgb.Booster)
S3method(getinfo,xgb.DMatrix)
S3method(predict,xgb.Booster)
S3method(predict,xgb.Booster.handle)
S3method(print,xgb.Booster)
S3method(print,xgb.DMatrix)
S3method(print,xgb.cv.synchronous)
S3method(setinfo,xgb.Booster)
S3method(setinfo,xgb.DMatrix)
S3method(slice,xgb.DMatrix)
S3method(variable.names,xgb.Booster)
export("xgb.attr<-")
export("xgb.attributes<-")
export("xgb.config<-")
Expand All @@ -26,13 +29,13 @@ export(cb.save.model)
export(getinfo)
export(setinfo)
export(slice)
export(xgb.Booster.complete)
export(xgb.DMatrix)
export(xgb.DMatrix.hasinfo)
export(xgb.DMatrix.save)
export(xgb.attr)
export(xgb.attributes)
export(xgb.config)
export(xgb.copy.Booster)
export(xgb.create.features)
export(xgb.cv)
export(xgb.dump)
Expand All @@ -41,10 +44,12 @@ export(xgb.get.DMatrix.data)
export(xgb.get.DMatrix.num.non.missing)
export(xgb.get.DMatrix.qcut)
export(xgb.get.config)
export(xgb.get.num.boosted.rounds)
export(xgb.ggplot.deepness)
export(xgb.ggplot.importance)
export(xgb.ggplot.shap.summary)
export(xgb.importance)
export(xgb.is.same.Booster)
export(xgb.load)
export(xgb.load.raw)
export(xgb.model.dt.tree)
Expand All @@ -56,10 +61,8 @@ export(xgb.plot.shap.summary)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)
export(xgb.serialize)
export(xgb.set.config)
export(xgb.train)
export(xgb.unserialize)
export(xgboost)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
Expand Down Expand Up @@ -88,8 +91,10 @@ importFrom(graphics,title)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(methods,new)
importFrom(stats,coef)
importFrom(stats,median)
importFrom(stats,predict)
importFrom(stats,variable.names)
importFrom(utils,head)
importFrom(utils,object.size)
importFrom(utils,str)
Expand Down
81 changes: 46 additions & 35 deletions R-package/R/callbacks.R
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ cb.reset.parameters <- function(new_params) {
})

if (!is.null(env$bst)) {
xgb.parameters(env$bst$handle) <- pars
xgb.parameters(env$bst) <- pars
} else {
for (fd in env$bst_folds)
xgb.parameters(fd$bst) <- pars
Expand Down Expand Up @@ -333,13 +333,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
if (!is.null(env$bst)) {
if (!inherits(env$bst, 'xgb.Booster'))
stop("'bst' in the parent frame must be an 'xgb.Booster'")
if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) {
best_score <<- as.numeric(best_score)
best_iteration <<- as.numeric(xgb.attr(env$bst$handle, 'best_iteration')) + 1
best_msg <<- as.numeric(xgb.attr(env$bst$handle, 'best_msg'))
best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1
best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg'))
} else {
xgb.attributes(env$bst$handle) <- list(best_iteration = best_iteration - 1,
best_score = best_score)
xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1,
best_score = best_score)
}
} else if (is.null(env$bst_folds) || is.null(env$basket)) {
stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')")
Expand All @@ -348,7 +348,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,

finalizer <- function(env) {
if (!is.null(env$bst)) {
attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score'))
attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score'))
if (best_score != attr_best_score) {
# If the difference is too big, throw an error
if (abs(best_score - attr_best_score) >= 1e-14) {
Expand All @@ -358,9 +358,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
# If the difference is due to floating-point truncation, update best_score
best_score <- attr_best_score
}
env$bst$best_iteration <- best_iteration
env$bst$best_ntreelimit <- best_ntreelimit
env$bst$best_score <- best_score
xgb.attr(env$bst, "best_iteration") <- best_iteration
xgb.attr(env$bst, "best_ntreelimit") <- best_ntreelimit
xgb.attr(env$bst, "best_score") <- best_score
} else {
env$basket$best_iteration <- best_iteration
env$basket$best_ntreelimit <- best_ntreelimit
Expand Down Expand Up @@ -412,11 +412,15 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
#' @param save_period save the model to disk after every
#' \code{save_period} iterations; 0 means save the model at the end.
#' @param save_name the name or path for the saved model file.
#'
#' Note that the format of the model being saved is determined by the file
#' extension specified here (see \link{xgb.save} for details about how it works).
#'
#' It can contain a \code{\link[base]{sprintf}} formatting specifier
#' to include the integer iteration number in the file name.
#' E.g., with \code{save_name} = 'xgboost_%04d.model',
#' the file saved at iteration 50 would be named "xgboost_0050.model".
#'
#' E.g., with \code{save_name} = 'xgboost_%04d.ubj',
#' the file saved at iteration 50 would be named "xgboost_0050.ubj".
#' @seealso \link{xgb.save}
#' @details
#' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
#'
Expand All @@ -430,7 +434,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
#' \code{\link{callbacks}}
#'
#' @export
cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {

if (save_period < 0)
stop("'save_period' cannot be negative")
Expand All @@ -440,8 +444,13 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
stop("'save_model' callback requires the 'bst' booster object in its calling frame")

if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
(save_period == 0 && env$iteration == env$end_iteration))
xgb.save(env$bst, sprintf(save_name, env$iteration))
(save_period == 0 && env$iteration == env$end_iteration)) {
# Note: this throws a warning if the name doesn't have anything to format through 'sprintf'
suppressWarnings({
save_name <- sprintf(save_name, env$iteration)
})
xgb.save(env$bst, save_name)
}
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.save.model'
Expand Down Expand Up @@ -512,8 +521,7 @@ cb.cv.predict <- function(save_models = FALSE) {
env$basket$pred <- pred
if (save_models) {
env$basket$models <- lapply(env$bst_folds, function(fd) {
xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1
xgb.Booster.complete(xgb.handleToBooster(handle = fd$bst, raw = NULL), saveraw = TRUE)
return(fd$bst)
})
}
}
Expand Down Expand Up @@ -665,7 +673,7 @@ cb.gblinear.history <- function(sparse = FALSE) {
} else { # xgb.cv:
cf <- vector("list", length(env$bst_folds))
for (i in seq_along(env$bst_folds)) {
dmp <- xgb.dump(xgb.handleToBooster(handle = env$bst_folds[[i]]$bst, raw = NULL))
dmp <- xgb.dump(env$bst_folds[[i]]$bst)
cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE))
if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector")
}
Expand All @@ -685,14 +693,19 @@ cb.gblinear.history <- function(sparse = FALSE) {
callback
}

#' Extract gblinear coefficients history.
#'
#' A helper function to extract the matrix of linear coefficients' history
#' @title Extract gblinear coefficients history.
#' @description A helper function to extract the matrix of linear coefficients' history
#' from a gblinear model created while using the \code{cb.gblinear.history()}
#' callback.
#' @details Note that this is an R-specific function that relies on R attributes that
#' are not saved when using xgboost's own serialization functions like \link{xgb.load}
#' or \link{xgb.load.raw}.
#'
#' In order for a serialized model to be accepted by tgis function, one must use R
#' serializers such as \link{saveRDS}.
#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
#' using the \code{cb.gblinear.history()} callback.
#' using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
#' loaded from \link{xgb.load} or \link{xgb.load.raw}.
#' @param class_index zero-based class index to extract the coefficients for only that
#' specific class in a multinomial multiclass model. When it is NULL, all the
#' coefficients are returned. Has no effect in non-multiclass models.
Expand All @@ -713,20 +726,18 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class")
is_cv <- inherits(model, "xgb.cv.synchronous")

if (is.null(model[["callbacks"]]) || is.null(model$callbacks[["cb.gblinear.history"]]))
if (is_cv) {
callbacks <- model$callbacks
} else {
callbacks <- attributes(model)$callbacks
}

if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history))
stop("model must be trained while using the cb.gblinear.history() callback")

if (!is_cv) {
# extract num_class & num_feat from the internal model
dmp <- xgb.dump(model)
if (length(dmp) < 2 || dmp[2] != "bias:")
stop("It does not appear to be a gblinear model")
dmp <- dmp[-c(1, 2)]
n <- which(dmp == 'weight:')
if (length(n) != 1)
stop("It does not appear to be a gblinear model")
num_class <- n - 1
num_feat <- (length(dmp) - 4) / num_class
num_class <- xgb.num_class(model)
num_feat <- xgb.num_feature(model)
} else {
# in case of CV, the object is expected to have this info
if (model$params$booster != "gblinear")
Expand All @@ -742,7 +753,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
(class_index[1] < 0 || class_index[1] >= num_class))
stop("class_index has to be within [0,", num_class - 1, "]")

coef_path <- environment(model$callbacks$cb.gblinear.history)[["coefs"]]
coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]]
if (!is.null(class_index) && num_class > 1) {
coef_path <- if (is.list(coef_path)) {
lapply(coef_path,
Expand Down
75 changes: 51 additions & 24 deletions R-package/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -148,19 +148,17 @@ check.custom.eval <- function(env = parent.frame()) {


# Update a booster handle for an iteration with dtrain data
xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
if (!identical(class(booster_handle), "xgb.Booster.handle")) {
stop("booster_handle must be of xgb.Booster.handle class")
}
xgb.iter.update <- function(bst, dtrain, iter, obj) {
if (!inherits(dtrain, "xgb.DMatrix")) {
stop("dtrain must be of xgb.DMatrix class")
}
handle <- xgb.get.handle(bst)

if (is.null(obj)) {
.Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
.Call(XGBoosterUpdateOneIter_R, handle, as.integer(iter), dtrain)
} else {
pred <- predict(
booster_handle,
bst,
dtrain,
outputmargin = TRUE,
training = TRUE,
Expand All @@ -185,7 +183,7 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
}

.Call(
XGBoosterTrainOneIter_R, booster_handle, dtrain, iter, grad, hess
XGBoosterTrainOneIter_R, handle, dtrain, iter, grad, hess
)
}
return(TRUE)
Expand All @@ -195,23 +193,22 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
# Evaluate one iteration.
# Returns a named vector of evaluation metrics
# with the names in a 'datasetname-metricname' format.
xgb.iter.eval <- function(booster_handle, watchlist, iter, feval) {
if (!identical(class(booster_handle), "xgb.Booster.handle"))
stop("class of booster_handle must be xgb.Booster.handle")
xgb.iter.eval <- function(bst, watchlist, iter, feval) {
handle <- xgb.get.handle(bst)

if (length(watchlist) == 0)
return(NULL)

evnames <- names(watchlist)
if (is.null(feval)) {
msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), watchlist, as.list(evnames))
mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2)
res <- structure(as.numeric(mat[2, ]), names = mat[1, ])
} else {
res <- sapply(seq_along(watchlist), function(j) {
w <- watchlist[[j]]
## predict using all trees
preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
preds <- predict(bst, w, outputmargin = TRUE, iterationrange = c(1, 1))
eval_res <- feval(preds, w)
out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric)
Expand Down Expand Up @@ -352,16 +349,45 @@ xgb.createFolds <- function(y, k) {
#' @name xgboost-deprecated
NULL

#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.
#' @title Model Serialization and Compatibility
#' @description
#'
#' When it comes to serializing XGBoost models, it's possible to use R serializers such as
#' \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
#' its own serializers with better compatibility guarantees, which allow loading
#' said models in other language bindings of XGBoost.
#'
#' Note that an `xgb.Booster` object, outside of its core components, might also keep:\itemize{
#' \item Additional model configuration (accessible through \link{xgb.config}),
#' which includes model fitting parameters like `max_depth` and runtime parameters like `nthread`.
#' These are not necessarily useful for prediction/importance/plotting.
#' \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
#' which are kept as a `data.table` object, accessible through `attributes(model)$evaluation_log`
#' if present.
#' }
#'
#' The first one (configurations) does not have the same compatibility guarantees as
#' the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration
#' might be lost after loading the booster in a different XGBoost version, regardless of the
#' serializer that was used. These are saved when using \link{saveRDS}, but will be discarded
#' if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's
#' serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}.
#'
#' The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are
#' not saved when using XGBoost's own serializers. These attributes are only used for informational
#' purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
#' call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
#' These R attributes are only preserved when using R's serializers.
#'
#' Note that XGBoost models in R starting from version `2.1.0` and onwards, and XGBoost models
#' before version `2.1.0`; have a very different R object structure and are incompatible with
#' each other. Hence, models that were saved with R serializers live `saveRDS` or `save` before
#' version `2.1.0` will not work with latter `xgboost` versions and vice versa. Be aware that
#' the structure of R model objects could in theory change again in the future, so XGBoost's serializers
#' should be preferred for long-term storage.
#'
#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
#' the model is to be accessed in the future. If you train a model with the current version of
#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
#' Furthermore, note that using the package `qs` for serialization will require version 0.26 or
#' higher of said package, and will have the same compatibility restrictions as R serializers.
#'
#' @details
#' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
Expand All @@ -374,9 +400,10 @@ NULL
#' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
#' as part of another R object.
#'
#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
#' model but also internal configurations and parameters, and its format is not stable across
#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
#' Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
#' as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
#' control as it relies on R's serialization format (see e.g. the details section in
#' \link{serialize} and \link{save} from base R).
#'
#' For more details and explanation about model persistence and archival, consult the page
#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
Expand Down
Loading

0 comments on commit d3a8d28

Please sign in to comment.