[R] On-demand serialization + standardization of attributes (#9924)

--------- Co-authored-by: Jiaming Yuan <[email protected]>
dmlc · Jan 10, 2024 · d3a8d28 · d3a8d28
1 parent 01c4711
commit d3a8d28
Show file tree

Hide file tree

Showing 64 changed files with 1,773 additions and 1,281 deletions.
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
@@ -2,16 +2,19 @@
 
 S3method("[",xgb.DMatrix)
 S3method("dimnames<-",xgb.DMatrix)
+S3method(coef,xgb.Booster)
 S3method(dim,xgb.DMatrix)
 S3method(dimnames,xgb.DMatrix)
+S3method(getinfo,xgb.Booster)
 S3method(getinfo,xgb.DMatrix)
 S3method(predict,xgb.Booster)
-S3method(predict,xgb.Booster.handle)
 S3method(print,xgb.Booster)
 S3method(print,xgb.DMatrix)
 S3method(print,xgb.cv.synchronous)
+S3method(setinfo,xgb.Booster)
 S3method(setinfo,xgb.DMatrix)
 S3method(slice,xgb.DMatrix)
+S3method(variable.names,xgb.Booster)
 export("xgb.attr<-")
 export("xgb.attributes<-")
 export("xgb.config<-")
@@ -26,13 +29,13 @@ export(cb.save.model)
 export(getinfo)
 export(setinfo)
 export(slice)
-export(xgb.Booster.complete)
 export(xgb.DMatrix)
 export(xgb.DMatrix.hasinfo)
 export(xgb.DMatrix.save)
 export(xgb.attr)
 export(xgb.attributes)
 export(xgb.config)
+export(xgb.copy.Booster)
 export(xgb.create.features)
 export(xgb.cv)
 export(xgb.dump)
@@ -41,10 +44,12 @@ export(xgb.get.DMatrix.data)
 export(xgb.get.DMatrix.num.non.missing)
 export(xgb.get.DMatrix.qcut)
 export(xgb.get.config)
+export(xgb.get.num.boosted.rounds)
 export(xgb.ggplot.deepness)
 export(xgb.ggplot.importance)
 export(xgb.ggplot.shap.summary)
 export(xgb.importance)
+export(xgb.is.same.Booster)
 export(xgb.load)
 export(xgb.load.raw)
 export(xgb.model.dt.tree)
@@ -56,10 +61,8 @@ export(xgb.plot.shap.summary)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)
-export(xgb.serialize)
 export(xgb.set.config)
 export(xgb.train)
-export(xgb.unserialize)
 export(xgboost)
 import(methods)
 importClassesFrom(Matrix,dgCMatrix)
@@ -88,8 +91,10 @@ importFrom(graphics,title)
 importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,toJSON)
 importFrom(methods,new)
+importFrom(stats,coef)
 importFrom(stats,median)
 importFrom(stats,predict)
+importFrom(stats,variable.names)
 importFrom(utils,head)
 importFrom(utils,object.size)
 importFrom(utils,str)

diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
@@ -228,7 +228,7 @@ cb.reset.parameters <- function(new_params) {
     })
 
     if (!is.null(env$bst)) {
-      xgb.parameters(env$bst$handle) <- pars
+      xgb.parameters(env$bst) <- pars
     } else {
       for (fd in env$bst_folds)
         xgb.parameters(fd$bst) <- pars
@@ -333,13 +333,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
     if (!is.null(env$bst)) {
       if (!inherits(env$bst, 'xgb.Booster'))
         stop("'bst' in the parent frame must be an 'xgb.Booster'")
-      if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
+      if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) {
         best_score <<- as.numeric(best_score)
-        best_iteration <<- as.numeric(xgb.attr(env$bst$handle, 'best_iteration')) + 1
-        best_msg <<- as.numeric(xgb.attr(env$bst$handle, 'best_msg'))
+        best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1
+        best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg'))
       } else {
-        xgb.attributes(env$bst$handle) <- list(best_iteration = best_iteration - 1,
-                                               best_score = best_score)
+        xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1,
+                                        best_score = best_score)
       }
     } else if (is.null(env$bst_folds) || is.null(env$basket)) {
       stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')")
@@ -348,7 +348,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
 
   finalizer <- function(env) {
     if (!is.null(env$bst)) {
-      attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score'))
+      attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score'))
       if (best_score != attr_best_score) {
         # If the difference is too big, throw an error
         if (abs(best_score - attr_best_score) >= 1e-14) {
@@ -358,9 +358,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
         # If the difference is due to floating-point truncation, update best_score
         best_score <- attr_best_score
       }
-      env$bst$best_iteration <- best_iteration
-      env$bst$best_ntreelimit <- best_ntreelimit
-      env$bst$best_score <- best_score
+      xgb.attr(env$bst, "best_iteration") <- best_iteration
+      xgb.attr(env$bst, "best_ntreelimit") <- best_ntreelimit
+      xgb.attr(env$bst, "best_score") <- best_score
     } else {
       env$basket$best_iteration <- best_iteration
       env$basket$best_ntreelimit <- best_ntreelimit
@@ -412,11 +412,15 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
 #' @param save_period save the model to disk after every
 #'        \code{save_period} iterations; 0 means save the model at the end.
 #' @param save_name the name or path for the saved model file.
+#'
+#'        Note that the format of the model being saved is determined by the file
+#'        extension specified here (see \link{xgb.save} for details about how it works).
+#'
 #'        It can contain a \code{\link[base]{sprintf}} formatting specifier
 #'        to include the integer iteration number in the file name.
-#'        E.g., with \code{save_name} = 'xgboost_%04d.model',
-#'        the file saved at iteration 50 would be named "xgboost_0050.model".
-#'
+#'        E.g., with \code{save_name} = 'xgboost_%04d.ubj',
+#'        the file saved at iteration 50 would be named "xgboost_0050.ubj".
+#' @seealso \link{xgb.save}
 #' @details
 #' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
 #'
@@ -430,7 +434,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
 #' \code{\link{callbacks}}
 #'
 #' @export
-cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
+cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
 
   if (save_period < 0)
     stop("'save_period' cannot be negative")
@@ -440,8 +444,13 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
       stop("'save_model' callback requires the 'bst' booster object in its calling frame")
 
     if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
-        (save_period == 0 && env$iteration == env$end_iteration))
-      xgb.save(env$bst, sprintf(save_name, env$iteration))
+        (save_period == 0 && env$iteration == env$end_iteration)) {
+      # Note: this throws a warning if the name doesn't have anything to format through 'sprintf'
+      suppressWarnings({
+        save_name <- sprintf(save_name, env$iteration)
+      })
+      xgb.save(env$bst, save_name)
+    }
   }
   attr(callback, 'call') <- match.call()
   attr(callback, 'name') <- 'cb.save.model'
@@ -512,8 +521,7 @@ cb.cv.predict <- function(save_models = FALSE) {
     env$basket$pred <- pred
     if (save_models) {
       env$basket$models <- lapply(env$bst_folds, function(fd) {
-        xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1
-        xgb.Booster.complete(xgb.handleToBooster(handle = fd$bst, raw = NULL), saveraw = TRUE)
+        return(fd$bst)
       })
     }
   }
@@ -665,7 +673,7 @@ cb.gblinear.history <- function(sparse = FALSE) {
     } else { # xgb.cv:
       cf <- vector("list", length(env$bst_folds))
       for (i in seq_along(env$bst_folds)) {
-        dmp <- xgb.dump(xgb.handleToBooster(handle = env$bst_folds[[i]]$bst, raw = NULL))
+        dmp <- xgb.dump(env$bst_folds[[i]]$bst)
         cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE))
         if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector")
       }
@@ -685,14 +693,19 @@ cb.gblinear.history <- function(sparse = FALSE) {
   callback
 }
 
-#' Extract gblinear coefficients history.
-#'
-#' A helper function to extract the matrix of linear coefficients' history
+#' @title Extract gblinear coefficients history.
+#' @description A helper function to extract the matrix of linear coefficients' history
 #' from a gblinear model created while using the \code{cb.gblinear.history()}
 #' callback.
+#' @details Note that this is an R-specific function that relies on R attributes that
+#' are not saved when using xgboost's own serialization functions like \link{xgb.load}
+#' or \link{xgb.load.raw}.
 #'
+#' In order for a serialized model to be accepted by this function, one must use R
+#' serializers such as \link{saveRDS}.
 #' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
-#'        using the \code{cb.gblinear.history()} callback.
+#'        using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
+#'        loaded from \link{xgb.load} or \link{xgb.load.raw}.
 #' @param class_index zero-based class index to extract the coefficients for only that
 #'        specific class in a multinomial multiclass model. When it is NULL, all the
 #'        coefficients are returned. Has no effect in non-multiclass models.
@@ -713,20 +726,18 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
     stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class")
   is_cv <- inherits(model, "xgb.cv.synchronous")
 
-  if (is.null(model[["callbacks"]]) || is.null(model$callbacks[["cb.gblinear.history"]]))
+  if (is_cv) {
+    callbacks <- model$callbacks
+  } else {
+    callbacks <- attributes(model)$callbacks
+  }
+
+  if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history))
     stop("model must be trained while using the cb.gblinear.history() callback")
 
   if (!is_cv) {
-    # extract num_class & num_feat from the internal model
-    dmp <- xgb.dump(model)
-    if (length(dmp) < 2 || dmp[2] != "bias:")
-      stop("It does not appear to be a gblinear model")
-    dmp <- dmp[-c(1, 2)]
-    n <- which(dmp == 'weight:')
-    if (length(n) != 1)
-      stop("It does not appear to be a gblinear model")
-    num_class <- n - 1
-    num_feat <- (length(dmp) - 4) / num_class
+    num_class <- xgb.num_class(model)
+    num_feat <- xgb.num_feature(model)
   } else {
     # in case of CV, the object is expected to have this info
     if (model$params$booster != "gblinear")
@@ -742,7 +753,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
       (class_index[1] < 0 || class_index[1] >= num_class))
     stop("class_index has to be within [0,", num_class - 1, "]")
 
-  coef_path <- environment(model$callbacks$cb.gblinear.history)[["coefs"]]
+  coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]]
   if (!is.null(class_index) && num_class > 1) {
     coef_path <- if (is.list(coef_path)) {
       lapply(coef_path,

diff --git a/R-package/R/utils.R b/R-package/R/utils.R
@@ -148,19 +148,17 @@ check.custom.eval <- function(env = parent.frame()) {
 
 
 # Update a booster handle for an iteration with dtrain data
-xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
-  if (!identical(class(booster_handle), "xgb.Booster.handle")) {
-    stop("booster_handle must be of xgb.Booster.handle class")
-  }
+xgb.iter.update <- function(bst, dtrain, iter, obj) {
   if (!inherits(dtrain, "xgb.DMatrix")) {
     stop("dtrain must be of xgb.DMatrix class")
   }
+  handle <- xgb.get.handle(bst)
 
   if (is.null(obj)) {
-    .Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
+    .Call(XGBoosterUpdateOneIter_R, handle, as.integer(iter), dtrain)
   } else {
     pred <- predict(
-      booster_handle,
+      bst,
       dtrain,
       outputmargin = TRUE,
       training = TRUE,
@@ -185,7 +183,7 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
     }
 
     .Call(
-      XGBoosterTrainOneIter_R, booster_handle, dtrain, iter, grad, hess
+      XGBoosterTrainOneIter_R, handle, dtrain, iter, grad, hess
     )
   }
   return(TRUE)
@@ -195,23 +193,22 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
 # Evaluate one iteration.
 # Returns a named vector of evaluation metrics
 # with the names in a 'datasetname-metricname' format.
-xgb.iter.eval <- function(booster_handle, watchlist, iter, feval) {
-  if (!identical(class(booster_handle), "xgb.Booster.handle"))
-    stop("class of booster_handle must be xgb.Booster.handle")
+xgb.iter.eval <- function(bst, watchlist, iter, feval) {
+  handle <- xgb.get.handle(bst)
 
   if (length(watchlist) == 0)
     return(NULL)
 
   evnames <- names(watchlist)
   if (is.null(feval)) {
-    msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
+    msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), watchlist, as.list(evnames))
     mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2)
     res <- structure(as.numeric(mat[2, ]), names = mat[1, ])
   } else {
     res <- sapply(seq_along(watchlist), function(j) {
       w <- watchlist[[j]]
       ## predict using all trees
-      preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
+      preds <- predict(bst, w, outputmargin = TRUE, iterationrange = c(1, 1))
       eval_res <- feval(preds, w)
       out <- eval_res$value
       names(out) <- paste0(evnames[j], "-", eval_res$metric)
@@ -352,16 +349,45 @@ xgb.createFolds <- function(y, k) {
 #' @name xgboost-deprecated
 NULL
 
-#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
-#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.
+#' @title Model Serialization and Compatibility
+#' @description
+#'
+#' When it comes to serializing XGBoost models, it's possible to use R serializers such as
+#' \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
+#' its own serializers with better compatibility guarantees, which allow loading
+#' said models in other language bindings of XGBoost.
+#'
+#' Note that an `xgb.Booster` object, outside of its core components, might also keep:\itemize{
+#' \item Additional model configuration (accessible through \link{xgb.config}),
+#' which includes model fitting parameters like `max_depth` and runtime parameters like `nthread`.
+#' These are not necessarily useful for prediction/importance/plotting.
+#' \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
+#' which are kept as a `data.table` object, accessible through `attributes(model)$evaluation_log`
+#' if present.
+#' }
+#'
+#' The first one (configurations) does not have the same compatibility guarantees as
+#' the model itself (which includes attributes that are set and accessed through \link{xgb.attributes}) - that is, such configuration
+#' might be lost after loading the booster in a different XGBoost version, regardless of the
+#' serializer that was used. These are saved when using \link{saveRDS}, but will be discarded
+#' if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's
+#' serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}.
+#'
+#' The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are
+#' not saved when using XGBoost's own serializers. These attributes are only used for informational
+#' purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
+#' call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
+#' These R attributes are only preserved when using R's serializers.
+#'
+#' Note that XGBoost models in R starting from version `2.1.0` and onwards, and XGBoost models
+#' before version `2.1.0`, have a very different R object structure and are incompatible with
+#' each other. Hence, models that were saved with R serializers like `saveRDS` or `save` before
+#' version `2.1.0` will not work with later `xgboost` versions and vice versa. Be aware that
+#' the structure of R model objects could in theory change again in the future, so XGBoost's serializers
+#' should be preferred for long-term storage.
 #'
-#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
-#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
-#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
-#' the model is to be accessed in the future. If you train a model with the current version of
-#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
-#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
-#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
+#' Furthermore, note that using the package `qs` for serialization will require version 0.26 or
+#' higher of said package, and will have the same compatibility restrictions as R serializers.
 #'
 #' @details
 #' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
@@ -374,9 +400,10 @@ NULL
 #' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
 #' as part of another R object.
 #'
-#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
-#' model but also internal configurations and parameters, and its format is not stable across
-#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
+#' Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
+#' as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
+#' control as it relies on R's serialization format (see e.g. the details section in
+#' \link{serialize} and \link{save} from base R).
 #'
 #' For more details and explanation about model persistence and archival, consult the page
 #' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.