From 09694ac6f0b61a5601b4fda53bbf844498f4fd6f Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 24 Dec 2023 21:55:51 +0100 Subject: [PATCH 01/47] on-demand serialization, refactor of attributes --- R-package/NAMESPACE | 7 +- R-package/R/callbacks.R | 66 +- R-package/R/utils.R | 29 +- R-package/R/xgb.Booster.R | 646 ++++++++++-------- R-package/R/xgb.DMatrix.R | 59 +- R-package/R/xgb.cv.R | 12 +- R-package/R/xgb.dump.R | 10 +- R-package/R/xgb.importance.R | 20 +- R-package/R/xgb.load.R | 10 +- R-package/R/xgb.load.raw.R | 17 +- R-package/R/xgb.model.dt.tree.R | 50 +- R-package/R/xgb.plot.shap.R | 8 +- R-package/R/xgb.save.R | 5 +- R-package/R/xgb.save.raw.R | 4 +- R-package/R/xgb.serialize.R | 8 +- R-package/R/xgb.train.R | 155 +++-- R-package/R/xgb.unserialize.R | 22 +- R-package/demo/predict_leaf_indices.R | 2 +- .../a-compatibility-note-for-saveRDS-save.Rd | 8 + R-package/man/coef.xgb.Booster.Rd | 50 ++ R-package/man/getinfo.Rd | 48 +- R-package/man/predict.xgb.Booster.Rd | 5 +- R-package/man/print.xgb.Booster.Rd | 10 +- R-package/man/setinfo.Rd | 44 -- R-package/man/xgb.Booster.complete.Rd | 61 -- R-package/man/xgb.attr.Rd | 15 +- R-package/man/xgb.config.Rd | 13 +- R-package/man/xgb.copy.Booster.Rd | 53 ++ R-package/man/xgb.gblinear.history.Rd | 11 +- R-package/man/xgb.is.same.Booster.Rd | 59 ++ R-package/man/xgb.load.Rd | 2 +- R-package/man/xgb.load.raw.Rd | 2 - R-package/man/xgb.model.dt.tree.Rd | 4 +- R-package/man/xgb.parameters.Rd | 16 +- R-package/man/xgb.save.Rd | 2 +- R-package/man/xgb.save.raw.Rd | 6 +- R-package/man/xgb.serialize.Rd | 10 +- R-package/man/xgb.train.Rd | 68 +- R-package/man/xgb.unserialize.Rd | 8 +- R-package/src/init.c | 20 +- R-package/src/xgboost_R.cc | 255 ++++++- R-package/src/xgboost_R.h | 60 +- R-package/tests/testthat/test_basic.R | 204 +++++- R-package/tests/testthat/test_callbacks.R | 78 ++- .../tests/testthat/test_custom_objective.R | 8 +- R-package/tests/testthat/test_glm.R | 16 +- R-package/tests/testthat/test_helpers.R | 47 +- .../tests/testthat/test_model_compatibility.R | 8 +- .../tests/testthat/test_parameter_exposure.R | 6 +- R-package/tests/testthat/test_update.R | 2 +- 50 files changed, 1526 insertions(+), 803 deletions(-) create mode 100644 R-package/man/coef.xgb.Booster.Rd delete mode 100644 R-package/man/setinfo.Rd delete mode 100644 R-package/man/xgb.Booster.complete.Rd create mode 100644 R-package/man/xgb.copy.Booster.Rd create mode 100644 R-package/man/xgb.is.same.Booster.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 40ede23a537a..f82ea9ef427e 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -2,14 +2,16 @@ S3method("[",xgb.DMatrix) S3method("dimnames<-",xgb.DMatrix) +S3method(coef,xgb.Booster) S3method(dim,xgb.DMatrix) S3method(dimnames,xgb.DMatrix) +S3method(getinfo,xgb.Booster) S3method(getinfo,xgb.DMatrix) S3method(predict,xgb.Booster) -S3method(predict,xgb.Booster.handle) S3method(print,xgb.Booster) S3method(print,xgb.DMatrix) S3method(print,xgb.cv.synchronous) +S3method(setinfo,xgb.Booster) S3method(setinfo,xgb.DMatrix) S3method(slice,xgb.DMatrix) export("xgb.attr<-") @@ -26,13 +28,13 @@ export(cb.save.model) export(getinfo) export(setinfo) export(slice) -export(xgb.Booster.complete) export(xgb.DMatrix) export(xgb.DMatrix.hasinfo) export(xgb.DMatrix.save) export(xgb.attr) export(xgb.attributes) export(xgb.config) +export(xgb.copy.Booster) export(xgb.create.features) export(xgb.cv) export(xgb.dump) @@ -42,6 +44,7 @@ export(xgb.ggplot.deepness) export(xgb.ggplot.importance) 
export(xgb.ggplot.shap.summary) export(xgb.importance) +export(xgb.is.same.Booster) export(xgb.load) export(xgb.load.raw) export(xgb.model.dt.tree) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 54f821a795cb..4839f71fa799 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -228,7 +228,7 @@ cb.reset.parameters <- function(new_params) { }) if (!is.null(env$bst)) { - xgb.parameters(env$bst$handle) <- pars + xgb.parameters(env$bst) <- pars } else { for (fd in env$bst_folds) xgb.parameters(fd$bst) <- pars @@ -333,13 +333,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE, if (!is.null(env$bst)) { if (!inherits(env$bst, 'xgb.Booster')) stop("'bst' in the parent frame must be an 'xgb.Booster'") - if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) { + if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) { best_score <<- as.numeric(best_score) - best_iteration <<- as.numeric(xgb.attr(env$bst$handle, 'best_iteration')) + 1 - best_msg <<- as.numeric(xgb.attr(env$bst$handle, 'best_msg')) + best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1 + best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg')) } else { - xgb.attributes(env$bst$handle) <- list(best_iteration = best_iteration - 1, - best_score = best_score) + xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1, + best_score = best_score) } } else if (is.null(env$bst_folds) || is.null(env$basket)) { stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')") @@ -348,7 +348,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE, finalizer <- function(env) { if (!is.null(env$bst)) { - attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score')) + attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score')) if (best_score != attr_best_score) { # If the difference is too big, throw an error if (abs(best_score - attr_best_score) >= 1e-14) { @@ -358,9 +358,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE, # If the difference is due to floating-point truncation, update best_score best_score <- attr_best_score } - env$bst$best_iteration <- best_iteration - env$bst$best_ntreelimit <- best_ntreelimit - env$bst$best_score <- best_score + xgb.attr(env$bst, "best_iteration") <- best_iteration + xgb.attr(env$bst, "best_ntreelimit") <- best_ntreelimit + xgb.attr(env$bst, "best_score") <- best_score } else { env$basket$best_iteration <- best_iteration env$basket$best_ntreelimit <- best_ntreelimit @@ -440,8 +440,10 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") { stop("'save_model' callback requires the 'bst' booster object in its calling frame") if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) || - (save_period == 0 && env$iteration == env$end_iteration)) - xgb.save(env$bst, sprintf(save_name, env$iteration)) + (save_period == 0 && env$iteration == env$end_iteration)) { + suppressWarnings({save_name <- sprintf(save_name, env$iteration)}) + xgb.save(env$bst, save_name) + } } attr(callback, 'call') <- match.call() attr(callback, 'name') <- 'cb.save.model' @@ -512,8 +514,7 @@ cb.cv.predict <- function(save_models = FALSE) { env$basket$pred <- pred if (save_models) { env$basket$models <- lapply(env$bst_folds, function(fd) { - xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1 - xgb.Booster.complete(xgb.handleToBooster(handle = fd$bst, raw = NULL), saveraw = TRUE) + return(fd$bst) }) } } @@ -665,7 +666,7 @@ cb.gblinear.history <- function(sparse = 
FALSE) { } else { # xgb.cv: cf <- vector("list", length(env$bst_folds)) for (i in seq_along(env$bst_folds)) { - dmp <- xgb.dump(xgb.handleToBooster(handle = env$bst_folds[[i]]$bst, raw = NULL)) + dmp <- xgb.dump(env$bst_folds[[i]]$bst) cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE)) if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector") } @@ -685,14 +686,19 @@ cb.gblinear.history <- function(sparse = FALSE) { callback } -#' Extract gblinear coefficients history. -#' -#' A helper function to extract the matrix of linear coefficients' history +#' @title Extract gblinear coefficients history. +#' @description A helper function to extract the matrix of linear coefficients' history #' from a gblinear model created while using the \code{cb.gblinear.history()} #' callback. +#' @details Note that this is an R-specific function that relies on R attributes that +#' are not saved when using xgboost's own serialization functions like \link{xgb.load} +#' or \link{xgb.load.raw}. #' +#' In order for a serialized model to be accepted by this function, one must use R +#' serializers such as \link{saveRDS}. #' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained -#' using the \code{cb.gblinear.history()} callback. +#' using the \code{cb.gblinear.history()} callback, but \bold{not} a booster +#' loaded from \link{xgb.load} or \link{xgb.load.raw}. #' @param class_index zero-based class index to extract the coefficients for only that #' specific class in a multinomial multiclass model. When it is NULL, all the #' coefficients are returned. Has no effect in non-multiclass models. @@ -713,20 +719,18 @@ xgb.gblinear.history <- function(model, class_index = NULL) { stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class") is_cv <- inherits(model, "xgb.cv.synchronous") - if (is.null(model[["callbacks"]]) || is.null(model$callbacks[["cb.gblinear.history"]])) + if (is_cv) { + callbacks <- model$callbacks + } else { + callbacks <- attributes(model)$callbacks + } + + if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history)) stop("model must be trained while using the cb.gblinear.history() callback") if (!is_cv) { - # extract num_class & num_feat from the internal model - dmp <- xgb.dump(model) - if (length(dmp) < 2 || dmp[2] != "bias:") - stop("It does not appear to be a gblinear model") - dmp <- dmp[-c(1, 2)] - n <- which(dmp == 'weight:') - if (length(n) != 1) - stop("It does not appear to be a gblinear model") - num_class <- n - 1 - num_feat <- (length(dmp) - 4) / num_class + num_class <- xgb.num_class(model) + num_feat <- .Call(XGBoosterGetNumFeature_R, xgb.get.handle(model)) } else { # in case of CV, the object is expected to have this info if (model$params$booster != "gblinear") @@ -742,7 +746,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) { (class_index[1] < 0 || class_index[1] >= num_class)) stop("class_index has to be within [0,", num_class - 1, "]") - coef_path <- environment(model$callbacks$cb.gblinear.history)[["coefs"]] + coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]] if (!is.null(class_index) && num_class > 1) { coef_path <- if (is.list(coef_path)) { lapply(coef_path, diff --git a/R-package/R/utils.R b/R-package/R/utils.R index bf08c481d118..e0683b84555a 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -148,19 +148,17 @@ check.custom.eval <- function(env = parent.frame()) { # Update a booster handle for an iteration with dtrain data -xgb.iter.update <-
function(booster_handle, dtrain, iter, obj) { - if (!identical(class(booster_handle), "xgb.Booster.handle")) { - stop("booster_handle must be of xgb.Booster.handle class") - } +xgb.iter.update <- function(bst, dtrain, iter, obj) { if (!inherits(dtrain, "xgb.DMatrix")) { stop("dtrain must be of xgb.DMatrix class") } + handle <- xgb.get.handle(bst) if (is.null(obj)) { - .Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain) + .Call(XGBoosterUpdateOneIter_R, handle, as.integer(iter), dtrain) } else { pred <- predict( - booster_handle, + bst, dtrain, outputmargin = TRUE, training = TRUE, @@ -185,7 +183,7 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) { } .Call( - XGBoosterTrainOneIter_R, booster_handle, dtrain, iter, grad, hess + XGBoosterTrainOneIter_R, handle, dtrain, iter, grad, hess ) } return(TRUE) @@ -195,23 +193,22 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) { # Evaluate one iteration. # Returns a named vector of evaluation metrics # with the names in a 'datasetname-metricname' format. -xgb.iter.eval <- function(booster_handle, watchlist, iter, feval) { - if (!identical(class(booster_handle), "xgb.Booster.handle")) - stop("class of booster_handle must be xgb.Booster.handle") +xgb.iter.eval <- function(bst, watchlist, iter, feval) { + handle <- xgb.get.handle(bst) if (length(watchlist) == 0) return(NULL) evnames <- names(watchlist) if (is.null(feval)) { - msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames)) + msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), watchlist, as.list(evnames)) mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2) res <- structure(as.numeric(mat[2, ]), names = mat[1, ]) } else { res <- sapply(seq_along(watchlist), function(j) { w <- watchlist[[j]] ## predict using all trees - preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1)) + preds <- predict(bst, w, outputmargin = TRUE, iterationrange = c(1, 1)) eval_res <- feval(preds, w) out <- eval_res$value names(out) <- paste0(evnames[j], "-", eval_res$metric) @@ -363,6 +360,14 @@ NULL #' accessible in later releases of XGBoost. To ensure that your model can be accessed in future #' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead. #' +#' Currently, it is not possible to use R serializers like `readRDS` to load an XGBoost model +#' saved with an XGBoost version lower than 2.1.0, and it's not possible to load an XGBoost model +#' saved with R serializers like `readRDS` under XGBoost version 2.1.0 when using an older version +#' of XGBoost. +#' +#' Furthermore, note that using the package `qs` for serialization will require version 0.26 or +#' higher of said package, and will have the same compatibility restrictions as R serializers. +#' #' @details #' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into #' the JSON format by specifying the JSON extension. To read the model back, use diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 4e980641a17d..33a0adb594ca 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -1,180 +1,81 @@ -# Construct an internal xgboost Booster and return a handle to it. +# Construct an internal xgboost Booster.
# internal utility function -xgb.Booster.handle <- function(params, cachelist, modelfile, handle) { +xgb.Booster <- function(params, cachelist, modelfile, training_continuation) { if (typeof(cachelist) != "list" || !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) { stop("cachelist must be a list of xgb.DMatrix objects") } ## Load existing model, dispatch for on disk model file and in memory buffer if (!is.null(modelfile)) { - if (typeof(modelfile) == "character") { + if (is.character(modelfile)) { ## A filename - handle <- .Call(XGBoosterCreate_R, cachelist) + bst <- .Call(XGBoosterCreate_R, cachelist) modelfile <- path.expand(modelfile) - .Call(XGBoosterLoadModel_R, handle, enc2utf8(modelfile[1])) - class(handle) <- "xgb.Booster.handle" + .Call(XGBoosterLoadModel_R, xgb.get.handle(bst), enc2utf8(modelfile[1])) if (length(params) > 0) { - xgb.parameters(handle) <- params + xgb.parameters(bst) <- params } - return(handle) - } else if (typeof(modelfile) == "raw") { + return(bst) + } else if (is.raw(modelfile)) { ## A memory buffer - bst <- xgb.unserialize(modelfile, handle) + bst <- xgb.unserialize(modelfile) xgb.parameters(bst) <- params return(bst) } else if (inherits(modelfile, "xgb.Booster")) { ## A booster object - bst <- xgb.Booster.complete(modelfile, saveraw = TRUE) - bst <- xgb.unserialize(bst$raw) - xgb.parameters(bst) <- params - return(bst) + if (training_continuation == "copy") { + bst <- .Call(XGDuplicate_R, modelfile) + xgb.parameters(bst) <- params + return(bst) + } else { + xgb.parameters(modelfile) <- params + return(modelfile) + } } else { stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object") } } ## Create new model - handle <- .Call(XGBoosterCreate_R, cachelist) - class(handle) <- "xgb.Booster.handle" + bst <- .Call(XGBoosterCreate_R, cachelist) if (length(params) > 0) { - xgb.parameters(handle) <- params + xgb.parameters(bst) <- params } - return(handle) -} - -# Convert xgb.Booster.handle to xgb.Booster -# internal utility function -xgb.handleToBooster <- function(handle, raw) { - bst <- list(handle = handle, raw = raw) - class(bst) <- "xgb.Booster" return(bst) } -# Check whether xgb.Booster.handle is null +# Check whether xgb.Booster handle is null # internal utility function is.null.handle <- function(handle) { if (is.null(handle)) return(TRUE) - if (!identical(class(handle), "xgb.Booster.handle")) - stop("argument type must be xgb.Booster.handle") + if (!inherits(handle, "externalptr")) + stop("argument type must be 'externalptr'") - if (.Call(XGCheckNullPtr_R, handle)) - return(TRUE) - - return(FALSE) + return(.Call(XGCheckNullPtr_R, handle)) } -# Return a verified to be valid handle out of either xgb.Booster.handle or -# xgb.Booster internal utility function +# Return a verified to be valid handle out of xgb.Booster +# internal utility function xgb.get.handle <- function(object) { if (inherits(object, "xgb.Booster")) { - handle <- object$handle - } else if (inherits(object, "xgb.Booster.handle")) { - handle <- object + handle <- object$ptr + if (is.null(handle) || !inherits(handle, "externalptr")) { + stop("'xgb.Booster' object is corrupted or is from an incompatible xgboost version.") + } } else { - stop("argument must be of either xgb.Booster or xgb.Booster.handle class") + stop("argument must be an 'xgb.Booster' object.") } if (is.null.handle(handle)) { - stop("invalid xgb.Booster.handle") - } - handle -} - -#' Restore missing parts of an incomplete xgb.Booster object -#' -#' It attempts to complete an 
`xgb.Booster` object by restoring either its missing -#' raw model memory dump (when it has no `raw` data but its `xgb.Booster.handle` is valid) -#' or its missing internal handle (when its `xgb.Booster.handle` is not valid -#' but it has a raw Booster memory dump). -#' -#' @param object Object of class `xgb.Booster`. -#' @param saveraw A flag indicating whether to append `raw` Booster memory dump data -#' when it doesn't already exist. -#' -#' @details -#' -#' While this method is primarily for internal use, it might be useful in some practical situations. -#' -#' E.g., when an `xgb.Booster` model is saved as an R object and then is loaded as an R object, -#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods -#' should still work for such a model object since those methods would be using -#' `xgb.Booster.complete()` internally. However, one might find it to be more efficient to call the -#' `xgb.Booster.complete()` function explicitly once after loading a model as an R-object. -#' That would prevent further repeated implicit reconstruction of an internal booster model. -#' -#' @return -#' An object of `xgb.Booster` class. -#' -#' @examples -#' -#' data(agaricus.train, package = "xgboost") -#' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, -#' max_depth = 2, -#' eta = 1, -#' nthread = 2, -#' nrounds = 2, -#' objective = "binary:logistic" -#' ) -#' -#' saveRDS(bst, "xgb.model.rds") -#' -#' # Warning: The resulting RDS file is only compatible with the current XGBoost version. -#' # Refer to the section titled "a-compatibility-note-for-saveRDS-save". -#' bst1 <- readRDS("xgb.model.rds") -#' if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds") -#' # the handle is invalid: -#' print(bst1$handle) -#' -#' bst1 <- xgb.Booster.complete(bst1) -#' # now the handle points to a valid internal booster model: -#' print(bst1$handle) -#' -#' @export -xgb.Booster.complete <- function(object, saveraw = TRUE) { - if (!inherits(object, "xgb.Booster")) - stop("argument type must be xgb.Booster") - - if (is.null.handle(object$handle)) { - object$handle <- xgb.Booster.handle( - params = list(), - cachelist = list(), - modelfile = object$raw, - handle = object$handle - ) - } else { - if (is.null(object$raw) && saveraw) { - object$raw <- xgb.serialize(object$handle) - } - } - - attrs <- xgb.attributes(object) - if (!is.null(attrs$best_ntreelimit)) { - object$best_ntreelimit <- as.integer(attrs$best_ntreelimit) - } - if (!is.null(attrs$best_iteration)) { - ## Convert from 0 based back to 1 based. - object$best_iteration <- as.integer(attrs$best_iteration) + 1 - } - if (!is.null(attrs$best_score)) { - object$best_score <- as.numeric(attrs$best_score) + stop("invalid 'xgb.Booster' (blank 'externalptr').") } - if (!is.null(attrs$best_msg)) { - object$best_msg <- attrs$best_msg - } - if (!is.null(attrs$niter)) { - object$niter <- as.integer(attrs$niter) - } - - return(object) + return(handle) } #' Predict method for XGBoost model #' #' Predicted values based on either xgboost model or model handle object. #' -#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`. +#' @param object Object of class `xgb.Booster`. #' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`, #' local data file, or `xgb.DMatrix`. #' For single-row predictions on sparse data, it is recommended to use the CSR format. 
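The CSR recommendation above can be exercised with a small sketch like the following (not part of the patch; it assumes a fitted booster `bst` and the `agaricus.test` data used throughout the package examples):

    library(xgboost)
    library(Matrix)
    x_csc <- agaricus.test$data                              # dgCMatrix (CSC layout)
    x_row <- as(x_csc[7L, , drop = FALSE], "RsparseMatrix")  # a single row kept in CSR layout
    predict(bst, x_row)                                      # avoids a per-call CSC -> CSR conversion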
@@ -361,27 +262,19 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6)) #' sum(pred5 != lb) / length(lb) #' -#' @rdname predict.xgb.Booster #' @export predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE, reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) { - object <- xgb.Booster.complete(object, saveraw = FALSE) - if (!inherits(newdata, "xgb.DMatrix")) { - config <- jsonlite::fromJSON(xgb.config(object)) - nthread <- strtoi(config$learner$generic_param$nthread) + nthread <- xgb.nthread(object) newdata <- xgb.DMatrix( newdata, missing = missing, nthread = NVL(nthread, -1) ) } - if (!is.null(object[["feature_names"]]) && - !is.null(colnames(newdata)) && - !identical(object[["feature_names"]], colnames(newdata))) - stop("Feature names stored in `object` and `newdata` are different!") - if (NVL(object$params[['booster']], '') == 'gblinear' || is.null(ntreelimit)) + if (NVL(xgb.booster_type(object), '') == 'gblinear' || is.null(ntreelimit)) ntreelimit <- 0 if (ntreelimit != 0 && is.null(iterationrange)) { @@ -394,11 +287,12 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA ## both are specified, let libgxgboost throw an error } else { ## no limit is supplied, use best - if (is.null(object$best_iteration)) { + best_iteration <- xgb.best_iteration(object) + if (is.null(best_iteration)) { iterationrange <- c(0, 0) } else { ## We don't need to + 1 as R is 1-based index. - iterationrange <- c(0, as.integer(object$best_iteration)) + iterationrange <- c(0, as.integer(best_iteration)) } } ## Handle the 0 length values. @@ -441,7 +335,10 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA } predts <- .Call( - XGBoosterPredictFromDMatrix_R, object$handle, newdata, jsonlite::toJSON(args, auto_unbox = TRUE) + XGBoosterPredictFromDMatrix_R, + xgb.get.handle(object), + newdata, + jsonlite::toJSON(args, auto_unbox = TRUE) ) names(predts) <- c("shape", "results") shape <- predts$shape @@ -512,22 +409,12 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA return(arr) } -#' @rdname predict.xgb.Booster -#' @export -predict.xgb.Booster.handle <- function(object, ...) { - - bst <- xgb.handleToBooster(handle = object, raw = NULL) - ret <- predict(bst, ...) - return(ret) -} - - -#' Accessors for serializable attributes of a model +#' @title Accessors for serializable attributes of a model #' -#' These methods allow to manipulate the key-value attribute strings of an xgboost model. +#' @description These methods allow to manipulate the key-value attribute strings of an xgboost model. #' -#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`. +#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it. #' @param name A non-empty character string specifying which attribute is to be accessed. #' @param value For `xgb.attr<-`, a value of an attribute; for `xgb.attributes<-`, #' it is a list (or an object coercible to a list) with the names of attributes to set @@ -549,16 +436,15 @@ predict.xgb.Booster.handle <- function(object, ...) { #' change the value of that parameter for a model. #' Use [xgb.parameters<-()] to set or change model parameters. 
#' -#' The attribute setters would usually work more efficiently for `xgb.Booster.handle` -#' than for `xgb.Booster`, since only just a handle (pointer) would need to be copied. -#' That would only matter if attributes need to be set many times. -#' Note, however, that when feeding a handle of an `xgb.Booster` object to the attribute setters, -#' the raw model cache of an `xgb.Booster` object would not be automatically updated, -#' and it would be the user's responsibility to call [xgb.serialize()] to update it. -#' #' The `xgb.attributes<-` setter either updates the existing or adds one or several attributes, #' but it doesn't delete the other existing attributes. #' +#' Important: since this modifies the booster's C object, semantics for assignment here +#' will differ from R's, as any object reference to the same booster will be modified +#' too, while assignment of R attributes through `attributes(model)$field <- value` +#' will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an +#' example of these behaviors). +#' #' @return #' - `xgb.attr()` returns either a string value of an attribute #' or `NULL` if an attribute wasn't stored in a model. @@ -600,14 +486,25 @@ predict.xgb.Booster.handle <- function(object, ...) { xgb.attr <- function(object, name) { if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name") handle <- xgb.get.handle(object) - .Call(XGBoosterGetAttr_R, handle, as.character(name[1])) + out <- .Call(XGBoosterGetAttr_R, handle, as.character(name[1])) + if (!NROW(out) || !nchar(out)) { + return(NULL) + } + if (!is.null(out)) { + if (name %in% c("best_iteration", "best_ntreelimit", "best_score")) { + out <- as.numeric(out) + } + } + return(out) } #' @rdname xgb.attr #' @export `xgb.attr<-` <- function(object, name, value) { - if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name") + name <- as.character(name[1]) + if (!NROW(name) || !nchar(name)) stop("invalid attribute name") handle <- xgb.get.handle(object) + if (!is.null(value)) { # Coerce the elements to be scalar strings. # Q: should we warn user about non-scalar elements? @@ -617,11 +514,8 @@ xgb.attr <- function(object, name) { value <- as.character(value[1]) } } - .Call(XGBoosterSetAttr_R, handle, as.character(name[1]), value) - if (is(object, 'xgb.Booster') && !is.null(object$raw)) { - object$raw <- xgb.serialize(object$handle) - } - object + .Call(XGBoosterSetAttr_R, handle, name, value) + return(object) } #' @rdname xgb.attr #' @export xgb.attributes <- function(object) { handle <- xgb.get.handle(object) attr_names <- .Call(XGBoosterGetAttrNames_R, handle) - if (is.null(attr_names)) return(NULL) - res <- lapply(attr_names, function(x) { - .Call(XGBoosterGetAttr_R, handle, x) - }) - names(res) <- attr_names - res + if (!NROW(attr_names)) return(list()) + out <- lapply(attr_names, function(name) xgb.attr(object, name)) + names(out) <- attr_names + return(out) } #' @rdname xgb.attr #' @export @@ -644,31 +536,22 @@ xgb.attributes <- function(object) { if (is.null(names(a)) || any(nchar(names(a)) == 0)) { stop("attribute names cannot be empty strings") } - # Coerce the elements to be scalar strings. - # Q: should we warn a user about non-scalar elements?
- a <- lapply(a, function(x) { - if (is.null(x)) return(NULL) - if (is.numeric(x[1])) { - format(x[1], digits = 17) - } else { - as.character(x[1]) - } - }) - handle <- xgb.get.handle(object) for (i in seq_along(a)) { - .Call(XGBoosterSetAttr_R, handle, names(a[i]), a[[i]]) - } - if (is(object, 'xgb.Booster') && !is.null(object$raw)) { - object$raw <- xgb.serialize(object$handle) + xgb.attr(object, names(a[i])) <- a[[i]] } - object + return(object) } -#' Accessors for model parameters as JSON string +#' @title Accessors for model parameters as JSON string +#' @details Note that assignment is performed in-place on the booster C object, which, unlike assignment +#' of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references +#' to the same booster will also get updated. #' -#' @param object Object of class `xgb.Booster`. +#' See \link{xgb.copy.Booster} for an example of this behavior. +#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it. #' @param value A JSON string. -#' +#' @return `xgb.config` will return the parameters as JSON text, which can be parsed through +#' e.g. `jsonlite::fromJSON`. #' @examples #' data(agaricus.train, package = "xgboost") #' @@ -701,23 +584,24 @@ xgb.config <- function(object) { `xgb.config<-` <- function(object, value) { handle <- xgb.get.handle(object) .Call(XGBoosterLoadJsonConfig_R, handle, value) - object$raw <- NULL # force renew the raw buffer - object <- xgb.Booster.complete(object) - object + return(object) } -#' Accessors for model parameters +#' @title Accessors for model parameters +#' @description Only the setter for xgboost parameters is currently implemented. +#' @details Just like \link{xgb.attr}, this function will make in-place modifications +#' on the booster object which do not follow typical R assignment semantics - that is, +#' all references to the same booster will also be updated, unlike assignment of R +#' attributes which follow copy-on-write semantics. #' -#' Only the setter for xgboost parameters is currently implemented. +#' See \link{xgb.copy.Booster} for an example of this behavior. #' -#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`. +#' Be aware that setting the parameters of a fitted booster will reset its number of rounds +#' indicator to zero. +#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place}. #' @param value A list (or an object coercible to a list) with the names of parameters to set #' and the elements corresponding to parameter values. -#' -#' @details -#' Note that the setter would usually work more efficiently for `xgb.Booster.handle` -#' than for `xgb.Booster`, since only just a handle would need to be copied. -#' +#' @return The same booster `object`, which gets modified in-place. #' @examples #' data(agaricus.train, package = "xgboost") #' train <- agaricus.train @@ -754,28 +638,283 @@ xgb.config <- function(object) { for (i in seq_along(p)) { .Call(XGBoosterSetParam_R, handle, names(p[i]), p[[i]]) } - if (is(object, 'xgb.Booster') && !is.null(object$raw)) { - object$raw <- xgb.serialize(object$handle) + return(object) } +#' @rdname getinfo +#' @export +getinfo.xgb.Booster <- function(object, name, ...)
{ + name <- as.character(head(name, 1L)) + allowed_fields <- c("feature_name", "feature_type") + if (!(name %in% allowed_fields)) { + stop("getinfo: name must be one of the following: ", paste(allowed_fields, collapse = ", ")) } - object + handle <- xgb.get.handle(object) + out <- .Call( + XGBoosterGetStrFeatureInfo_R, + handle, + name + ) + if (!NROW(out)) { + return(NULL) + } + return(out) +} + +#' @rdname getinfo +#' @export +setinfo.xgb.Booster <- function(object, name, info, ...) { + name <- as.character(head(name, 1L)) + allowed_fields <- c("feature_name", "feature_type") + if (!(name %in% allowed_fields)) { + stop("setinfo: unknown info name ", name) + } + if (!is.character(info)) { + stop("'info' must be a character vector.") + } + handle <- xgb.get.handle(object) + .Call( + XGBoosterSetStrFeatureInfo_R, + handle, + name, + info + ) + return(TRUE) } # Extract the number of trees in a model. -# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration. # internal utility function +xgb.nrounds <- function(bst) { + return(.Call(XGBoosterBoostedRounds_R, xgb.get.handle(bst))) +} + xgb.ntree <- function(bst) { - length(grep('^booster', xgb.dump(bst))) + config <- jsonlite::fromJSON(xgb.config(bst)) + out <- strtoi(config$learner$gradient_booster$gbtree_model_param$num_trees) + return(out) +} + +xgb.nthread <- function(bst) { + config <- jsonlite::fromJSON(xgb.config(bst)) + out <- strtoi(config$learner$generic_param$nthread) + return(out) +} + +xgb.booster_type <- function(bst) { + config <- jsonlite::fromJSON(xgb.config(bst)) + out <- config$learner$learner_train_param$booster + return(out) +} + +xgb.num_class <- function(bst) { + config <- jsonlite::fromJSON(xgb.config(bst)) + out <- strtoi(config$learner$learner_model_param$num_class) + return(out) +} + +xgb.feature_names <- function(bst) { + return(getinfo(bst, "feature_name")) +} + +xgb.feature_types <- function(bst) { + return(getinfo(bst, "feature_type")) +} + +xgb.num_feature <- function(bst) { + handle <- xgb.get.handle(bst) + return(.Call(XGBoosterGetNumFeature_R, handle)) +} + +xgb.best_iteration <- function(bst) { + out <- xgb.attr(bst, "best_iteration") + if (!NROW(out) || !nchar(out)) { + out <- NULL + } + return(out) } +#' @title Extract coefficients from linear booster +#' @description Extracts the coefficients from a 'gblinear' booster object, +#' as produced by \code{xgb.train} when using parameter `booster="gblinear"`. +#' +#' Note: this function will error out if passing a booster model +#' which is not of "gblinear" type. +#' @param object A fitted booster of 'gblinear' type. +#' @param ... Not used. +#' @return The extracted coefficients:\itemize{ +#' \item If there's only one coefficient per column in the data, will be returned as a +#' vector, potentially containing the feature names if available, with the intercept +#' as first column. +#' \item If there's more than one coefficient per column in the data (e.g. when using +#' `objective="multi:softmax"`), will be returned as a matrix with dimensions equal +#' to [num_features, num_cols], with the intercepts as first row. Note that the column +#' (classes in multi-class classification) dimension will not be named. 
+#' } +#' +#' The intercept returned here will include the 'base_score' parameter (unlike the 'bias' +#' or the last coefficient in the model dump, which doesn't have 'base_score' added to it), +#' hence one should get the same values from calling `predict(..., outputmargin = TRUE)` and +#' from performing a matrix multiplication with `model.matrix(~., ...)`. +#' +#' Be aware that the coefficients are obtained by first converting them to strings and +#' back, so there will always be some very small loss of precision compared to the actual +#' coefficients as used by \link{predict.xgb.Booster}. +#' @examples +#' library(xgboost) +#' data(mtcars) +#' y <- mtcars[, 1] +#' x <- as.matrix(mtcars[, -1]) +#' dm <- xgb.DMatrix(data = x, label = y, nthread = 1) +#' params <- list(booster = "gblinear", nthread = 1) +#' model <- xgb.train(data = dm, params = params, nrounds = 2) +#' coef(model) +#' @export +coef.xgb.Booster <- function(object, ...) { + booster_type <- xgb.booster_type(object) + if (booster_type != "gblinear") { + stop("Coefficients are not defined for Booster type ", booster_type) + } + model_json <- jsonlite::fromJSON(rawToChar(xgb.save.raw(object, raw_format = "json"))) + base_score <- model_json$learner$learner_model_param$base_score + num_feature <- as.numeric(model_json$learner$learner_model_param$num_feature) + + weights <- model_json$learner$gradient_booster$model$weights + n_cols <- length(weights) / (num_feature + 1) + if (n_cols != floor(n_cols) || n_cols < 1) { + stop("Internal error: could not determine shape of coefficients.") + } + sep <- num_feature * n_cols + coefs <- weights[seq(1, sep)] + intercepts <- weights[seq(sep + 1, length(weights))] + intercepts <- intercepts + as.numeric(base_score) + + feature_names <- xgb.feature_names(object) + if (!NROW(feature_names)) { + # This mimics the default naming in R which names columns as "V1..N" + # when names are needed but not available + feature_names <- paste0("V", seq(1L, num_feature)) + } + feature_names <- c("(Intercept)", feature_names) + if (n_cols == 1L) { + out <- c(intercepts, coefs) + names(out) <- feature_names + } else { + coefs <- matrix(coefs, nrow = num_feature, byrow = TRUE) + dim(intercepts) <- c(1L, n_cols) + out <- rbind(intercepts, coefs) + row.names(out) <- feature_names + # TODO: if a class names attribute is added, + # should use those names here. + } + return(out) } -#' Print xgb.Booster +#' @title Deep-copies a Booster Object +#' @description Creates a deep copy of an 'xgb.Booster' object, such that the +#' C object pointer contained will be a different object, and hence functions +#' like \link{xgb.attr} will not affect the object from which it was copied. +#' @param model An 'xgb.Booster' object. +#' @return A deep copy of `model` - it will be identical in every way, but C-level +#' functions called on that copy will not affect the `model` variable. +#' @examples +#' library(xgboost) +#' data(mtcars) +#' y <- mtcars$mpg +#' x <- mtcars[, -1] +#' dm <- xgb.DMatrix(x, label = y, nthread = 1) +#' model <- xgb.train( +#' data = dm, +#' params = list(nthread = 1), +#' nround = 3 +#' )
+#' # Set an arbitrary attribute kept at the C level +#' xgb.attr(model, "my_attr") <- 100 +#' print(xgb.attr(model, "my_attr")) +#' +#' # Just assigning to a new variable will not create +#' # a deep copy - C object pointer is shared, and in-place +#' # modifications will affect both objects +#' model_shallow_copy <- model +#' xgb.attr(model_shallow_copy, "my_attr") <- 333 +#' # 'model' was also affected by this change: +#' print(xgb.attr(model, "my_attr")) +#' +#' model_deep_copy <- xgb.copy.Booster(model) +#' xgb.attr(model_deep_copy, "my_attr") <- 444 +#' # 'model' was NOT affected by this change +#' # (keeps previous value that was assigned before) +#' print(xgb.attr(model, "my_attr")) +#' +#' # Verify that the new object was actually modified +#' print(xgb.attr(model_deep_copy, "my_attr")) +#' @export +xgb.copy.Booster <- function(model) { + if (!inherits(model, "xgb.Booster")) { + stop("'model' must be an 'xgb.Booster' object.") + } + return(.Call(XGDuplicate_R, model)) +} + +#' @title Check if two boosters share the same C object +#' @description Checks whether two booster objects refer to the same underlying C object. +#' @details As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr' +#' object, they don't follow typical copy-on-write semantics of other R objects - that is, if +#' one assigns a booster to a different variable and modifies that new variable through in-place +#' methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new +#' variable, unlike typical R assignments which would only modify the latter. +#' +#' This function allows checking whether two booster objects share the same 'externalptr', +#' regardless of the R attributes that they might have. +#' +#' In order to duplicate a booster in such a way that the copy wouldn't share the same +#' 'externalptr', one can use function \link{xgb.copy.Booster}. +#' @param obj1 Booster model to compare with `obj2`. +#' @param obj2 Booster model to compare with `obj1`. +#' @return Either `TRUE` or `FALSE` according to whether the two boosters share +#' the underlying C object. +#' @seealso \link{xgb.copy.Booster} +#' @examples +#' library(xgboost) +#' data(mtcars) +#' y <- mtcars$mpg +#' x <- as.matrix(mtcars[, -1]) +#' model <- xgb.train( +#' params = list(nthread = 1), +#' data = xgb.DMatrix(x, label = y, nthread = 1), +#' nround = 3 +#' ) #' -#' @param x An `xgb.Booster` object. -#' @param verbose Whether to print detailed data (e.g., attribute values). -#' @param ... Not currently used. +#' model_shallow_copy <- model +#' xgb.is.same.Booster(model, model_shallow_copy) # same C object +#' +#' model_deep_copy <- xgb.copy.Booster(model) +#' xgb.is.same.Booster(model, model_deep_copy) # different C objects #' +#' # In-place assignments modify all references, +#' # but not full/deep copies of the booster +#' xgb.attr(model_shallow_copy, "my_attr") <- 111 +#' xgb.attr(model, "my_attr") # gets modified +#' xgb.attr(model_deep_copy, "my_attr") # doesn't get modified +#' @export +xgb.is.same.Booster <- function(obj1, obj2) { + if (!inherits(obj1, "xgb.Booster") || !inherits(obj2, "xgb.Booster")) { + stop("'xgb.is.same.Booster' is only applicable to 'xgb.Booster' objects.") + } + return( + .Call( + XGPointerEqComparison, + xgb.get.handle(obj1), + xgb.get.handle(obj2) + ) + ) +} + +#' @title Print xgb.Booster +#' @description Print information about `xgb.Booster`. +#' @param x An `xgb.Booster` object. +#' @param ... Not used. 
+#' @return The same `x` object, returned invisibly. #' @examples #' data(agaricus.train, package = "xgboost") #' train <- agaricus.train @@ -793,79 +932,40 @@ xgb.ntree <- function(bst) { #' attr(bst, "myattr") <- "memo" #' #' print(bst) -#' print(bst, verbose = TRUE) #' #' @export -print.xgb.Booster <- function(x, verbose = FALSE, ...) { +print.xgb.Booster <- function(x, ...) { + # this lets it error out when the object comes from an earlier R xgboost version + handle <- xgb.get.handle(x) cat('##### xgb.Booster\n') - valid_handle <- !is.null.handle(x$handle) - if (!valid_handle) - cat("Handle is invalid! Suggest using xgb.Booster.complete\n") - - cat('raw: ') - if (!is.null(x$raw)) { - cat(format(object.size(x$raw), units = "auto"), '\n') - } else { - cat('NULL\n') - } - if (!is.null(x$call)) { + R_attrs <- attributes(x) + if (!is.null(R_attrs$call)) { cat('call:\n ') - print(x$call) + print(R_attrs$call) } - if (!is.null(x$params)) { - cat('params (as set within xgb.train):\n') - cat(' ', - paste(names(x$params), - paste0('"', unlist(x$params), '"'), - sep = ' = ', collapse = ', '), '\n', sep = '') - } - # TODO: need an interface to access all the xgboosts parameters + cat('# of features:', xgb.num_feature(x), '\n') + cat('# of rounds: ', xgb.nrounds(x), '\n') - attrs <- character(0) - if (valid_handle) - attrs <- xgb.attributes(x) - if (length(attrs) > 0) { + attr_names <- .Call(XGBoosterGetAttrNames_R, handle) + if (NROW(attr_names)) { cat('xgb.attributes:\n') - if (verbose) { - cat(paste(paste0(' ', names(attrs)), - paste0('"', unlist(attrs), '"'), - sep = ' = ', collapse = '\n'), '\n', sep = '') - } else { - cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '') - } + cat(" ", paste(attr_names, collapse = ", "), "\n") } - if (!is.null(x$callbacks) && length(x$callbacks) > 0) { + if (!is.null(R_attrs$callbacks) && length(R_attrs$callbacks) > 0) { cat('callbacks:\n') - lapply(callback.calls(x$callbacks), function(x) { + lapply(callback.calls(R_attrs$callbacks), function(x) { cat(' ') print(x) }) } - if (!is.null(x$feature_names)) - cat('# of features:', length(x$feature_names), '\n') - - cat('niter: ', x$niter, '\n', sep = '') - # TODO: uncomment when faster xgb.ntree is implemented - #cat('ntree: ', xgb.ntree(x), '\n', sep='') - - for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks', - 'evaluation_log', 'niter', 'feature_names'))) { - if (is.atomic(x[[n]])) { - cat(n, ':', x[[n]], '\n', sep = ' ') - } else { - cat(n, ':\n\t', sep = ' ') - print(x[[n]]) - } - } - - if (!is.null(x$evaluation_log)) { + if (!is.null(R_attrs$evaluation_log)) { cat('evaluation_log:\n') - print(x$evaluation_log, row.names = FALSE, topn = 2) + print(R_attrs$evaluation_log, row.names = FALSE, topn = 2) } - invisible(x) + return(invisible(x)) } diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 11d1105e6b5d..ce4c1a302a3a 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -335,15 +335,14 @@ dimnames.xgb.DMatrix <- function(x) { } -#' Get information of an xgb.DMatrix object -#' -#' Get information of an xgb.DMatrix object -#' @param object Object of class \code{xgb.DMatrix} +#' @title Get or set information of xgb.DMatrix and xgb.Booster objects +#' @param object Object of class \code{xgb.DMatrix} or `xgb.Booster`. #' @param name the name of the information field to get (see details) -#' @param ... other parameters -#' +#' @param ... Not used. +#' @return For `getinfo`, will return the requested field.
For `setinfo`, will always return value `TRUE` +#' if it succeeds. #' @details -#' The \code{name} field can be one of the following: +#' The \code{name} field can be one of the following for `xgb.DMatrix`: #' #' \itemize{ #' \item \code{label} @@ -358,9 +357,22 @@ dimnames.xgb.DMatrix <- function(x) { #' } #' See the documentation for \link{xgb.DMatrix} for more information about these fields. #' +#' For `xgb.Booster`, can be one of the following: +#' \itemize{ +#' \item \code{feature_type} +#' \item \code{feature_name} +#' } +#' #' Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group' #' for a DMatrix that had 'qid' assigned. #' +#' \bold{Important}: when calling `setinfo`, the objects are modified in-place. See +#' \link{xgb.copy.Booster} for an idea of this in-place assignment works. +#' +#' Be aware that, when a booster object is serialized to disk through functions like +#' \link{xgb.save}, information that was set on it will be lost, but such information +#' would be kept as part of the serialization when using to-bytes serializers like +#' \link{xgb.save.raw}, and when using R serializers like \link{saveRDS}. #' @examples #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) @@ -413,41 +425,12 @@ getinfo.xgb.DMatrix <- function(object, name, ...) { return(ret) } - -#' Set information of an xgb.DMatrix object -#' -#' Set information of an xgb.DMatrix object -#' -#' @param object Object of class "xgb.DMatrix" -#' @param name the name of the field to get +#' @rdname getinfo #' @param info the specific field of information to set -#' @param ... Not used. -#' -#' @details -#' See the documentation for \link{xgb.DMatrix} for possible fields that can be set -#' (which correspond to arguments in that function). -#' -#' Note that the following fields are allowed in the construction of an \code{xgb.DMatrix} -#' but \bold{aren't} allowed here:\itemize{ -#' \item data -#' \item missing -#' \item silent -#' \item nthread -#' } -#' -#' @examples -#' data(agaricus.train, package='xgboost') -#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) -#' -#' labels <- getinfo(dtrain, 'label') -#' setinfo(dtrain, 'label', 1-labels) -#' labels2 <- getinfo(dtrain, 'label') -#' stopifnot(all.equal(labels2, 1-labels)) -#' @rdname setinfo #' @export setinfo <- function(object, ...) UseMethod("setinfo") -#' @rdname setinfo +#' @rdname getinfo #' @export setinfo.xgb.DMatrix <- function(object, name, info, ...) { .internal.setinfo.xgb.DMatrix(object, name, info, ...) 
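As a usage sketch of the booster getinfo/setinfo methods documented above (not part of the patch; the feature names are made up and `bst` is assumed to be a booster fitted on a three-column numeric matrix):

    setinfo(bst, "feature_name", c("f_age", "f_income", "f_score"))
    getinfo(bst, "feature_name")   # returns the names that were just set
    getinfo(bst, "feature_type")   # NULL until feature types are assigned
    # note the in-place semantics: other R variables referencing the same booster see the new names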
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 1c17d86f042f..f1edf184d1d9 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -204,13 +204,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing dtrain <- slice(dall, unlist(folds[-k])) else dtrain <- slice(dall, train_folds[[k]]) - handle <- xgb.Booster.handle( + bst <- xgb.Booster( params = params, cachelist = list(dtrain, dtest), modelfile = NULL, - handle = NULL + training_continuation = "copy" # <- not used ) - list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test = dtest), index = folds[[k]]) + list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]]) }) rm(dall) # a "basket" to collect some results from callbacks @@ -231,13 +231,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing msg <- lapply(bst_folds, function(fd) { xgb.iter.update( - booster_handle = fd$bst, + bst = fd$bst, dtrain = fd$dtrain, iter = iteration - 1, obj = obj ) xgb.iter.eval( - booster_handle = fd$bst, + bst = fd$bst, watchlist = fd$watchlist, iter = iteration - 1, feval = feval @@ -266,7 +266,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing ret <- c(ret, basket) class(ret) <- 'xgb.cv.synchronous' - invisible(ret) + return(invisible(ret)) } diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index a2de26c26d3b..a5d25d221570 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -49,9 +49,13 @@ xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE, if (!(is.null(fmap) || is.character(fmap))) stop("fmap: argument must be a character string (when provided)") - model <- xgb.Booster.complete(model) - model_dump <- .Call(XGBoosterDumpModel_R, model$handle, NVL(fmap, "")[1], as.integer(with_stats), - as.character(dump_format)) + model_dump <- .Call( + XGBoosterDumpModel_R, + xgb.get.handle(model), + NVL(fmap, "")[1], + as.integer(with_stats), + as.character(dump_format) + ) if (is.null(fname)) model_dump <- gsub('\t', '', model_dump, fixed = TRUE) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 844e36cdf839..4cbdf472c393 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -87,21 +87,21 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL, if (!(is.null(data) && is.null(label) && is.null(target))) warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated") - if (!inherits(model, "xgb.Booster")) - stop("model: must be an object of class xgb.Booster") - - if (is.null(feature_names) && !is.null(model$feature_names)) - feature_names <- model$feature_names + if (is.null(feature_names)) { + model_feature_names <- xgb.feature_names(model) + if (NROW(model_feature_names)) { + feature_names <- model_feature_names + } + } if (!(is.null(feature_names) || is.character(feature_names))) stop("feature_names: Has to be a character vector") - model <- xgb.Booster.complete(model) - config <- jsonlite::fromJSON(xgb.config(model)) - if (config$learner$gradient_booster$name == "gblinear") { + handle <- xgb.get.handle(model) + if (xgb.booster_type(model) == "gblinear") { args <- list(importance_type = "weight", feature_names = feature_names) results <- .Call( - XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null") + XGBoosterFeatureScore_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null") ) names(results) 
<- c("features", "shape", "weight") if (length(results$shape) == 2) { @@ -122,7 +122,7 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL, for (importance_type in c("weight", "total_gain", "total_cover")) { args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees) results <- .Call( - XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null") + XGBoosterFeatureScore_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null") ) names(results) <- c("features", "shape", importance_type) concatenated[ diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index cbdbdacc35f2..ba446e50a483 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -17,7 +17,7 @@ #' An object of \code{xgb.Booster} class. #' #' @seealso -#' \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}. +#' \code{\link{xgb.save}} #' #' @examples #' data(agaricus.train, package='xgboost') @@ -44,11 +44,11 @@ xgb.load <- function(modelfile) { if (is.null(modelfile)) stop("xgb.load: modelfile cannot be NULL") - handle <- xgb.Booster.handle( + bst <- xgb.Booster( params = list(), cachelist = list(), modelfile = modelfile, - handle = NULL + training_continuation = "copy" ) # re-use modelfile if it is raw so we do not need to serialize if (typeof(modelfile) == "raw") { @@ -59,10 +59,6 @@ xgb.load <- function(modelfile) { " `xgb.unserialize` instead. " ) ) - bst <- xgb.handleToBooster(handle = handle, raw = modelfile) - } else { - bst <- xgb.handleToBooster(handle = handle, raw = NULL) } - bst <- xgb.Booster.complete(bst, saveraw = TRUE) return(bst) } diff --git a/R-package/R/xgb.load.raw.R b/R-package/R/xgb.load.raw.R index b159e9de1d3f..a19febae55d2 100644 --- a/R-package/R/xgb.load.raw.R +++ b/R-package/R/xgb.load.raw.R @@ -3,21 +3,10 @@ #' User can generate raw memory buffer by calling xgb.save.raw #' #' @param buffer the buffer returned by xgb.save.raw -#' @param as_booster Return the loaded model as xgb.Booster instead of xgb.Booster.handle. -#' #' @export xgb.load.raw <- function(buffer, as_booster = FALSE) { cachelist <- list() - handle <- .Call(XGBoosterCreate_R, cachelist) - .Call(XGBoosterLoadModelFromRaw_R, handle, buffer) - class(handle) <- "xgb.Booster.handle" - - if (as_booster) { - booster <- list(handle = handle, raw = NULL) - class(booster) <- "xgb.Booster" - booster <- xgb.Booster.complete(booster, saveraw = TRUE) - return(booster) - } else { - return(handle) - } + bst <- .Call(XGBoosterCreate_R, cachelist) + .Call(XGBoosterLoadModelFromRaw_R, xgb.get.handle(bst), buffer) + return(bst) } diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index d69169b89e96..73783f857863 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -4,7 +4,8 @@ #' #' @param feature_names character vector of feature names. If the model already #' contains feature names, those would be used when \code{feature_names=NULL} (default value). -#' Non-null \code{feature_names} could be provided to override those in the model. +#' +#' Note that, if the model already contains feature names, it's \bold{not} possible to override them here. #' @param model object of class \code{xgb.Booster} #' @param text \code{character} vector previously generated by the \code{xgb.dump} #' function (where parameter \code{with_stats = TRUE} should have been set). 
@@ -74,8 +75,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, " (or NULL if 'model' was provided).") } - if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names)) - feature_names <- model$feature_names + model_feature_names <- NULL + if (inherits(model, "xgb.Booster")) { + model_feature_names <- xgb.feature_names(model) + if (NROW(model_feature_names) && !is.null(feature_names)) { + stop("'model' contains feature names. Cannot override them.") + } + } + if (is.null(feature_names) && !is.null(model) && !is.null(model_feature_names)) + feature_names <- model_feature_names if (!(is.null(feature_names) || is.character(feature_names))) { stop("feature_names: must be a character vector") @@ -85,8 +93,10 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, stop("trees: must be a vector of integers.") } + from_text <- TRUE if (is.null(text)) { text <- xgb.dump(model = model, with_stats = TRUE) + from_text <- FALSE } if (length(text) < 2 || !any(grepl('leaf=(\\d+)', text))) { @@ -115,8 +125,28 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, td[, isLeaf := grepl("leaf", t, fixed = TRUE)] # parse branch lines - branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),", - "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")") + branch_rx_nonames <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),", + "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")") + branch_rx_w_names <- paste0("\\d+:\\[(.+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),", + "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")") + text_has_feature_names <- FALSE + if (NROW(model_feature_names)) { + branch_rx <- branch_rx_w_names + text_has_feature_names <- TRUE + } else { + # Note: when passing a text dump, it might or might not have feature names, + # but that aspect is unknown from just the text attributes + branch_rx <- branch_rx_nonames + if (from_text) { + if (sum(grepl(branch_rx_w_names, text)) > sum(grepl(branch_rx_nonames, text))) { + branch_rx <- branch_rx_w_names + text_has_feature_names <- TRUE + } + } + } + if (text_has_feature_names && is.null(model) && !is.null(feature_names)) { + stop("'text' contains feature names. 
Cannot override them.") + } branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") td[ isLeaf == FALSE, @@ -139,10 +169,12 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, is_stump <- function() { return(length(td$Feature) == 1 && is.na(td$Feature)) } - if (!is.null(feature_names) && !is_stump()) { - if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE)) - stop("feature_names has less elements than there are features used in the model") - td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]] + if (!text_has_feature_names) { + if (!is.null(feature_names) && !is_stump()) { + if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE)) + stop("feature_names has less elements than there are features used in the model") + td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]] + } } # parse leaf lines diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index d61bd23d4684..68153b5ec1c9 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -241,7 +241,11 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, if (is.character(features) && is.null(colnames(data))) stop("either provide `data` with column names or provide `features` as column indices") - if (is.null(model$feature_names) && model$nfeatures != ncol(data)) + model_feature_names <- NULL + if (is.null(features) && !is.null(model)) { + model_feature_names <- xgb.feature_names(model) + } + if (is.null(model_feature_names) && .Call(model, XGBoosterGetNumFeature_R) != ncol(data)) stop("if model has no feature_names, columns in `data` must match features in model") if (!is.null(subsample)) { @@ -270,7 +274,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, } if (is.null(features)) { - if (!is.null(model$feature_names)) { + if (!is.null(model_feature_names)) { imp <- xgb.importance(model = model, trees = trees) } else { imp <- xgb.importance(model = model, trees = trees, feature_names = colnames(data)) diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index ab55bc4a9699..9f0bdd1d662d 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -20,7 +20,7 @@ #' releases of XGBoost. #' #' @seealso -#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}. +#' \code{\link{xgb.load}} #' #' @examples #' data(agaricus.train, package='xgboost') @@ -49,8 +49,7 @@ xgb.save <- function(model, fname) { stop("model must be xgb.Booster.", if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "") } - model <- xgb.Booster.complete(model, saveraw = FALSE) fname <- path.expand(fname) - .Call(XGBoosterSaveModel_R, model$handle, enc2utf8(fname[1])) + .Call(XGBoosterSaveModel_R, xgb.get.handle(model), enc2utf8(fname[1])) return(TRUE) } diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index cad0fb0e01c2..57052325a072 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -11,8 +11,6 @@ #' \item \code{deprecated}: Encode the booster into old customized binary format. #' } #' -#' Right now the default is \code{deprecated} but will be changed to \code{ubj} in upcoming release. 
-#' #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') @@ -30,7 +28,7 @@ #' bst <- xgb.load.raw(raw) #' #' @export -xgb.save.raw <- function(model, raw_format = "deprecated") { +xgb.save.raw <- function(model, raw_format = "ubj") { handle <- xgb.get.handle(model) args <- list(format = raw_format) .Call(XGBoosterSaveModelToRaw_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE)) diff --git a/R-package/R/xgb.serialize.R b/R-package/R/xgb.serialize.R index 00bbb429320c..90304af7a4c2 100644 --- a/R-package/R/xgb.serialize.R +++ b/R-package/R/xgb.serialize.R @@ -1,8 +1,10 @@ -#' Serialize the booster instance into R's raw vector. The serialization method differs +#' Serialize the booster instance into R's raw vector. The serialization method differs #' from \code{\link{xgb.save.raw}} as the latter one saves only the model but not -#' parameters. This serialization format is not stable across different xgboost versions. +#' parameters. #' -#' @param booster the booster instance +#' \bold{IMPORTANT:} this serialization format is not stable across different xgboost versions. +#' +#' @param booster A booster instance #' #' @examples #' data(agaricus.train, package='xgboost') diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index d93a0643d1b3..df3c79c025df 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -148,10 +148,25 @@ #' @param xgb_model a previously built model to continue the training from. #' Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a #' file with a previously saved model. +#' @param training_continuation when passing `xgb_model`, whether to update the previous model by +#' creating a copy of it which will contain the new boosted rounds (meaning: the original object +#' is kept as it was before the call to `xgb.train`, and there will be two booster objects), or +#' by updating the previous object in-place (meaning: the object passed under `xgb_model` will +#' be updated, and nothing will be returned from this function). +#' +#' Note that, if passing "update" here, the object in `xgb_model` +#' \bold{will get updated regardless of whether this function succeeds or not} +#' (for example, the parameters will be set on the existing `xgb_model`, overwriting previous ones, +#' even if the training fails because of some error). #' @param callbacks a list of callback functions to perform various task during boosting. #' See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the #' parameters' values. User can provide either existing or their own callback methods in order #' to customize the training process. +#' +#' Note that some callbacks might try to set an evaluation log - in order to keep such logs, +#' it's necessary to pass `keep_extra_attributes = TRUE`. Be aware that these evaluation logs +#' are kept as R attributes, and thus do not get saved when using non-R serializaters like +#' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}). #' @param ... other parameters to pass to \code{params}. #' @param label vector of response values. Should not be provided when data is #' a local data file name or an \code{xgb.DMatrix}. @@ -159,6 +174,17 @@ #' by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values. #' This parameter is only used when input is a dense matrix. #' @param weight a vector indicating the weight for each row of the input. 
+#' @param keep_extra_attributes Whether to keep extra R attributes in the booster object which +#' are specific to the R interface and which do not get saved along when calling functions +#' like `xgb.save`, but which get saved with R-specific serializers such as `saveRDS`. +#' +#' These attributes include, for example, the function call that was used to produce the model, +#' evaluation logs from callbacks, among others. +#' +#' @return +#' An object of class \code{xgb.Booster}, unless passing a previous `xgb_model` and passing +#' `training_continuation="update"`, in which case will update that object and return NULL +#' (invisibly) from this function. #' #' @details #' These are the training functions for \code{xgboost}. @@ -201,28 +227,19 @@ #' \item \code{cb.save.model}: when \code{save_period > 0} is set. #' } #' -#' @return -#' An object of class \code{xgb.Booster} with the following elements: -#' \itemize{ -#' \item \code{handle} a handle (pointer) to the xgboost model in memory. -#' \item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type. -#' \item \code{niter} number of boosting iterations. -#' \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the -#' first column corresponding to iteration number and the rest corresponding to evaluation -#' metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback. -#' \item \code{call} a function call. -#' \item \code{params} parameters that were passed to the xgboost library. Note that it does not -#' capture parameters changed by the \code{\link{cb.reset.parameters}} callback. -#' \item \code{callbacks} callback functions that were either automatically assigned or -#' explicitly passed. -#' \item \code{best_iteration} iteration number with the best evaluation metric value -#' (only available with early stopping). -#' \item \code{best_score} the best evaluation metric value during early stopping. -#' (only available with early stopping). -#' \item \code{feature_names} names of the training dataset features -#' (only when column names were defined in training data). -#' \item \code{nfeatures} number of features in training data. -#' } +#' Note that objects of type `xgb.Booster` as returned by this function behave a bit differently +#' from typical R objects (it's an 'altrep' list class), and it makes a separation between +#' internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr} +#' and shared between interfaces through serialization functions like \link{xgb.save}; and +#' R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise +#' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and +#' not anyhow used by functions like \link{predict.xgb.Booster}. +#' +#' If passing `keep_extra_attributes=TRUE`, note that the parameters passed here will be kept +#' in the R-specific attributes, but since functions like \link{xgb.parameters} allow changing +#' parameters in the C-level object after it has been fitted, be aware that there's no guarantee +#' that these R parameters would be synchronized with the internal booster parameters as +#' return by \link{xgb.parameters} or \link{xgb.config}. 
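# [Editor's sketch, not part of this patch] Intended usage of the new 'training_continuation'
# argument documented above; 'params' and 'dtrain' are assumed to already exist.
bst <- xgb.train(params = params, data = dtrain, nrounds = 2)

# "copy" (default): 'bst' is left as-is and a second booster with the extra rounds is returned
bst_more <- xgb.train(params = params, data = dtrain, nrounds = 2,
                      xgb_model = bst, training_continuation = "copy")

# "update": 'bst' itself receives the extra rounds; NULL is returned invisibly
xgb.train(params = params, data = dtrain, nrounds = 2,
          xgb_model = bst, training_continuation = "update")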
#' #' @seealso #' \code{\link{callbacks}}, @@ -309,7 +326,8 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, save_name = "xgboost.model", - xgb_model = NULL, callbacks = list(), ...) { + xgb_model = NULL, training_continuation = c("copy", "update"), + callbacks = list(), keep_extra_attributes = TRUE, ...) { check.deprecation(...) @@ -318,6 +336,16 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), check.custom.obj() check.custom.eval() + if (is.null(xgb_model)) { + training_continuation <- "copy" + } else { + training_continuation <- head(training_continuation, 1L) + training_continuation <- as.character(training_continuation) + if (!(training_continuation %in% c("copy", "update"))) { + stop("'training_continuation' must be one of 'copy' or 'update'.") + } + } + # data & watchlist checks dtrain <- data if (!inherits(dtrain, "xgb.DMatrix")) @@ -371,23 +399,36 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), # The tree updating process would need slightly different handling is_update <- NVL(params[['process_type']], '.') == 'update' + past_evaluation_log <- NULL + if (inherits(xgb_model, "xgb.Booster")) { + past_evaluation_log <- attributes(xgb_model)$evaluation_log + } + + niter_init <- 0 + if (inherits(xgb_model, "xgb.Booster")) { + # Note: when assigning 'xgb.params', the number of rounds in the object + # gets reset to zero, hence this piece of code. + niter_init <- xgb.nrounds(xgb_model) + } + # Construct a booster (either a new one or load from xgb_model) - handle <- xgb.Booster.handle( + bst <- xgb.Booster( params = params, cachelist = append(watchlist, dtrain), modelfile = xgb_model, - handle = NULL + training_continuation = training_continuation ) - bst <- xgb.handleToBooster(handle = handle, raw = NULL) # extract parameters that can affect the relationship b/w #trees and #iterations + # Note: it might look like these aren't used, but they need to be defined in this + # environment for the callbacks for work correctly. 
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # When the 'xgb_model' was set, find out how many boosting iterations it has - niter_init <- 0 - if (!is.null(xgb_model)) { - niter_init <- as.numeric(xgb.attr(bst, 'niter')) + 1 + # TODO: improve this kind of logic by leveraging C-level attributes + if (!is.null(xgb_model) && !inherits(xgb_model, "xgb.Booster")) { + niter_init <- xgb.nrounds(bst) if (length(niter_init) == 0) { niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class) } @@ -405,7 +446,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), for (f in cb$pre_iter) f() xgb.iter.update( - booster_handle = bst$handle, + bst = bst, dtrain = dtrain, iter = iteration - 1, obj = obj @@ -413,46 +454,58 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), if (length(watchlist) > 0) { bst_evaluation <- xgb.iter.eval( # nolint: object_usage_linter - booster_handle = bst$handle, + bst = bst, watchlist = watchlist, iter = iteration - 1, feval = feval ) } - xgb.attr(bst$handle, 'niter') <- iteration - 1 - for (f in cb$post_iter) f() if (stop_condition) break } for (f in cb$finalize) f(finalize = TRUE) - bst <- xgb.Booster.complete(bst, saveraw = TRUE) - - # store the total number of boosting iterations - bst$niter <- end_iteration - # store the evaluation results - if (length(evaluation_log) > 0 && - nrow(evaluation_log) > 0) { + keep_evaluation_log <- FALSE + if (length(evaluation_log) > 0 && nrow(evaluation_log) > 0) { + keep_evaluation_log <- TRUE # include the previous compatible history when available if (inherits(xgb_model, 'xgb.Booster') && !is_update && - !is.null(xgb_model$evaluation_log) && + !is.null(past_evaluation_log) && isTRUE(all.equal(colnames(evaluation_log), - colnames(xgb_model$evaluation_log)))) { - evaluation_log <- rbindlist(list(xgb_model$evaluation_log, evaluation_log)) + colnames(past_evaluation_log)))) { + evaluation_log <- rbindlist(list(past_evaluation_log, evaluation_log)) } - bst$evaluation_log <- evaluation_log } - bst$call <- match.call() - bst$params <- params - bst$callbacks <- callbacks - if (!is.null(colnames(dtrain))) - bst$feature_names <- colnames(dtrain) - bst$nfeatures <- ncol(dtrain) + .Call( + XGBoosterCopyInfoFromDMatrix, + xgb.get.handle(bst), + dtrain + ) - return(bst) + if (keep_extra_attributes) { + extra_attrs <- list( + call = match.call(), + params = params, + callbacks = callbacks + ) + if (keep_evaluation_log) { + extra_attrs$evaluation_log <- evaluation_log + } + curr_attrs <- attributes(bst) + attributes(bst) <- c(curr_attrs, extra_attrs) + if (training_continuation == "update") { + .Call(XGDuplicateAttrib, bst, xgb_model) + } + } + + if (training_continuation == "update") { + return(invisible(NULL)) + } else { + return(bst) + } } diff --git a/R-package/R/xgb.unserialize.R b/R-package/R/xgb.unserialize.R index 291d3e7dac6c..28e3e9087148 100644 --- a/R-package/R/xgb.unserialize.R +++ b/R-package/R/xgb.unserialize.R @@ -1,23 +1,14 @@ #' Load the instance back from \code{\link{xgb.serialize}} #' #' @param buffer the buffer containing booster instance saved by \code{\link{xgb.serialize}} -#' @param handle An \code{xgb.Booster.handle} object which will be overwritten with -#' the new deserialized object. Must be a null handle (e.g. when loading the model through -#' `readRDS`). If not provided, a new handle will be created. -#' @return An \code{xgb.Booster.handle} object. 
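# [Editor's sketch, not part of this patch] Round trip through xgb.serialize()/xgb.unserialize(),
# which after this change returns an 'xgb.Booster' rather than a handle; 'bst' is assumed fitted.
buf <- xgb.serialize(bst)
bst2 <- xgb.unserialize(buf)
class(bst2)  # "xgb.Booster"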
+#' @return An \code{xgb.Booster} object. #' #' @export -xgb.unserialize <- function(buffer, handle = NULL) { +xgb.unserialize <- function(buffer) { cachelist <- list() - if (is.null(handle)) { - handle <- .Call(XGBoosterCreate_R, cachelist) - } else { - if (!is.null.handle(handle)) - stop("'handle' is not null/empty. Cannot overwrite existing handle.") - .Call(XGBoosterCreateInEmptyObj_R, cachelist, handle) - } + bst <- .Call(XGBoosterCreate_R, cachelist) tryCatch( - .Call(XGBoosterUnserializeFromBuffer_R, handle, buffer), + .Call(XGBoosterUnserializeFromBuffer_R, xgb.get.handle(bst), buffer), error = function(e) { error_msg <- conditionMessage(e) m <- regexec("(src[\\\\/]learner.cc:[0-9]+): Check failed: (header == serialisation_header_)", @@ -31,11 +22,10 @@ xgb.unserialize <- function(buffer, handle = NULL) { "long term. For more details and explanation, see ", "https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html", sep = "")) - .Call(XGBoosterLoadModelFromRaw_R, handle, buffer) + .Call(XGBoosterLoadModelFromRaw_R, xgb.get.handle(bst), buffer) } else { stop(e) } }) - class(handle) <- "xgb.Booster.handle" - return(handle) + return(bst) } diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R index 5ef9372acbcc..25be31f178e5 100644 --- a/R-package/demo/predict_leaf_indices.R +++ b/R-package/demo/predict_leaf_indices.R @@ -27,7 +27,7 @@ head(pred_with_leaf) create.new.tree.features <- function(model, original.features) { pred_with_leaf <- predict(model, original.features, predleaf = TRUE) cols <- list() - for (i in 1:model$niter) { + for (i in 1:xgb.attr(model, "niter")) { # max is not the real max but it s not important for the purpose of adding features leaf.id <- sort(unique(pred_with_leaf[, i])) cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id) diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index 85b52243c1b9..9b1bddeba4e3 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -14,6 +14,14 @@ accessible in later releases of XGBoost. To ensure that your model can be access releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead. } \details{ +Currently, it is not possible to use R serializers like \code{readRDS} to load an XGBoost. model +saved with an XGBoost. version lower than 2.1.0, and it's not possible to load an XGBoost. model +saved with R serializers like \code{readRDS} under XGBoost. version 2.1.0 when using an older version +of XGBoost. + +Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or +higher of said package, and will have the same compatibility restrictions as R serializers. + Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into the JSON format by specifying the JSON extension. To read the model back, use \code{\link{xgb.load}}. diff --git a/R-package/man/coef.xgb.Booster.Rd b/R-package/man/coef.xgb.Booster.Rd new file mode 100644 index 000000000000..3924987f0142 --- /dev/null +++ b/R-package/man/coef.xgb.Booster.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.Booster.R +\name{coef.xgb.Booster} +\alias{coef.xgb.Booster} +\title{Extract coefficients from linear booster} +\usage{ +\method{coef}{xgb.Booster}(object, ...) 
+} +\arguments{ +\item{object}{A fitted booster of 'gblinear' type.} + +\item{...}{Not used.} +} +\value{ +The extracted coefficients:\itemize{ +\item If there's only one coefficient per column in the data, will be returned as a +vector, potentially containing the feature names if available, with the intercept +as first column. +\item If there's more than one coefficient per column in the data (e.g. when using +\code{objective="multi:softmax"}), will be returned as a matrix with dimensions equal +to \link{num_features, num_cols}, with the intercepts as first row. Note that the column +(classes in multi-class classification) dimension will not be named. +} + +The intercept returned here will include the 'base_score' parameter (unlike the 'bias' +or the last coefficient in the model dump, which doesn't have 'base_score' added to it), +hence one should get the same values from calling \code{predict(..., outputmargin = TRUE)} and +from performing a matrix multiplication with \code{model.matrix(~., ...)}. + +Be aware that the coefficients are obtained by first converting them to strings and +back, so there will always be some very small lose of precision compared to the actual +coefficients as used by \link{predict.xgb.Booster}. +} +\description{ +Extracts the coefficients from a 'gblinear' booster object, +as produced by \code{xgb.train} when using parameter \code{booster="gblinear"}. + +Note: this function will error out if passing a booster model +which is not of "gblinear" type. +} +\examples{ +library(xgboost) +data(mtcars) +y <- mtcars[, 1] +x <- as.matrix(mtcars[, -1]) +dm <- xgb.DMatrix(data = x, label = y, nthread = 1) +params <- list(booster = "gblinear", nthread = 1) +model <- xgb.train(data = dm, params = params, nrounds = 2) +coef(model) +} diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 71f855d8a756..7db900d4b4c6 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,26 +1,44 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xgb.DMatrix.R -\name{getinfo} +% Please edit documentation in R/xgb.Booster.R, R/xgb.DMatrix.R +\name{getinfo.xgb.Booster} +\alias{getinfo.xgb.Booster} +\alias{setinfo.xgb.Booster} \alias{getinfo} \alias{getinfo.xgb.DMatrix} -\title{Get information of an xgb.DMatrix object} +\alias{setinfo} +\alias{setinfo.xgb.DMatrix} +\title{Get or set information of xgb.DMatrix and xgb.Booster objects} \usage{ +\method{getinfo}{xgb.Booster}(object, name, ...) + +\method{setinfo}{xgb.Booster}(object, name, info, ...) + getinfo(object, ...) \method{getinfo}{xgb.DMatrix}(object, name, ...) + +setinfo(object, ...) + +\method{setinfo}{xgb.DMatrix}(object, name, info, ...) } \arguments{ -\item{object}{Object of class \code{xgb.DMatrix}} - -\item{...}{other parameters} +\item{object}{Object of class \code{xgb.DMatrix} of \code{xgb.Booster}.} \item{name}{the name of the information field to get (see details)} + +\item{...}{Not used.} + +\item{info}{the specific field of information to set} +} +\value{ +For \code{getinfo}, will return the requested field. For \code{setinfo}, will always return value \code{TRUE} +if it succeeds. 
} \description{ -Get information of an xgb.DMatrix object +Get or set information of xgb.DMatrix and xgb.Booster objects } \details{ -The \code{name} field can be one of the following: +The \code{name} field can be one of the following for \code{xgb.DMatrix}: \itemize{ \item \code{label} @@ -35,8 +53,22 @@ The \code{name} field can be one of the following: } See the documentation for \link{xgb.DMatrix} for more information about these fields. +For \code{xgb.Booster}, can be one of the following: +\itemize{ +\item \code{feature_type} +\item \code{feature_name} +} + Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group' for a DMatrix that had 'qid' assigned. + +\bold{Important}: when calling \code{setinfo}, the objects are modified in-place. See +\link{xgb.copy.Booster} for an idea of this in-place assignment works. + +Be aware that, when a booster object is serialized to disk through functions like +\link{xgb.save}, information that was set on it will be lost, but such information +would be kept as part of the serialization when using to-bytes serializers like +\link{xgb.save.raw}, and when using R serializers like \link{saveRDS}. } \examples{ data(agaricus.train, package='xgboost') diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index 135177dda99b..80c879157c63 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/xgb.Booster.R \name{predict.xgb.Booster} \alias{predict.xgb.Booster} -\alias{predict.xgb.Booster.handle} \title{Predict method for XGBoost model} \usage{ \method{predict}{xgb.Booster}( @@ -21,11 +20,9 @@ strict_shape = FALSE, ... ) - -\method{predict}{xgb.Booster.handle}(object, ...) } \arguments{ -\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.} +\item{object}{Object of class \code{xgb.Booster}.} \item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector}, local data file, or \code{xgb.DMatrix}. diff --git a/R-package/man/print.xgb.Booster.Rd b/R-package/man/print.xgb.Booster.Rd index 4d09bb5ec00c..9a783efaff27 100644 --- a/R-package/man/print.xgb.Booster.Rd +++ b/R-package/man/print.xgb.Booster.Rd @@ -4,14 +4,15 @@ \alias{print.xgb.Booster} \title{Print xgb.Booster} \usage{ -\method{print}{xgb.Booster}(x, verbose = FALSE, ...) +\method{print}{xgb.Booster}(x, ...) } \arguments{ \item{x}{An \code{xgb.Booster} object.} -\item{verbose}{Whether to print detailed data (e.g., attribute values).} - -\item{...}{Not currently used.} +\item{...}{Not used.} +} +\value{ +The same \code{x} object, returned invisibly } \description{ Print information about \code{xgb.Booster}. @@ -33,6 +34,5 @@ bst <- xgboost( attr(bst, "myattr") <- "memo" print(bst) -print(bst, verbose = TRUE) } diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd deleted file mode 100644 index 299e7267532a..000000000000 --- a/R-package/man/setinfo.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xgb.DMatrix.R -\name{setinfo} -\alias{setinfo} -\alias{setinfo.xgb.DMatrix} -\title{Set information of an xgb.DMatrix object} -\usage{ -setinfo(object, ...) - -\method{setinfo}{xgb.DMatrix}(object, name, info, ...) 
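# [Editor's sketch, not part of this patch] The booster getinfo()/setinfo() methods described
# above, restricted to the 'feature_name'/'feature_type' fields; assumes 'bst' was fitted on
# a matrix 'x' that has column names.
setinfo(bst, "feature_name", colnames(x))  # modifies 'bst' in place
getinfo(bst, "feature_name")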
-} -\arguments{ -\item{object}{Object of class "xgb.DMatrix"} - -\item{...}{Not used.} - -\item{name}{the name of the field to get} - -\item{info}{the specific field of information to set} -} -\description{ -Set information of an xgb.DMatrix object -} -\details{ -See the documentation for \link{xgb.DMatrix} for possible fields that can be set -(which correspond to arguments in that function). - -Note that the following fields are allowed in the construction of an \code{xgb.DMatrix} -but \bold{aren't} allowed here:\itemize{ -\item data -\item missing -\item silent -\item nthread -} -} -\examples{ -data(agaricus.train, package='xgboost') -dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) - -labels <- getinfo(dtrain, 'label') -setinfo(dtrain, 'label', 1-labels) -labels2 <- getinfo(dtrain, 'label') -stopifnot(all.equal(labels2, 1-labels)) -} diff --git a/R-package/man/xgb.Booster.complete.Rd b/R-package/man/xgb.Booster.complete.Rd deleted file mode 100644 index 0adb1a69f6c7..000000000000 --- a/R-package/man/xgb.Booster.complete.Rd +++ /dev/null @@ -1,61 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xgb.Booster.R -\name{xgb.Booster.complete} -\alias{xgb.Booster.complete} -\title{Restore missing parts of an incomplete xgb.Booster object} -\usage{ -xgb.Booster.complete(object, saveraw = TRUE) -} -\arguments{ -\item{object}{Object of class \code{xgb.Booster}.} - -\item{saveraw}{A flag indicating whether to append \code{raw} Booster memory dump data -when it doesn't already exist.} -} -\value{ -An object of \code{xgb.Booster} class. -} -\description{ -It attempts to complete an \code{xgb.Booster} object by restoring either its missing -raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid) -or its missing internal handle (when its \code{xgb.Booster.handle} is not valid -but it has a raw Booster memory dump). -} -\details{ -While this method is primarily for internal use, it might be useful in some practical situations. - -E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object, -its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods -should still work for such a model object since those methods would be using -\code{xgb.Booster.complete()} internally. However, one might find it to be more efficient to call the -\code{xgb.Booster.complete()} function explicitly once after loading a model as an R-object. -That would prevent further repeated implicit reconstruction of an internal booster model. -} -\examples{ - -data(agaricus.train, package = "xgboost") - -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, - max_depth = 2, - eta = 1, - nthread = 2, - nrounds = 2, - objective = "binary:logistic" -) - -saveRDS(bst, "xgb.model.rds") - -# Warning: The resulting RDS file is only compatible with the current XGBoost version. -# Refer to the section titled "a-compatibility-note-for-saveRDS-save". 
-bst1 <- readRDS("xgb.model.rds") -if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds") -# the handle is invalid: -print(bst1$handle) - -bst1 <- xgb.Booster.complete(bst1) -# now the handle points to a valid internal booster model: -print(bst1$handle) - -} diff --git a/R-package/man/xgb.attr.Rd b/R-package/man/xgb.attr.Rd index 9203e6281561..e84dc749d4bc 100644 --- a/R-package/man/xgb.attr.Rd +++ b/R-package/man/xgb.attr.Rd @@ -16,7 +16,7 @@ xgb.attributes(object) xgb.attributes(object) <- value } \arguments{ -\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.} +\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.} \item{name}{A non-empty character string specifying which attribute is to be accessed.} @@ -51,15 +51,14 @@ Also, setting an attribute that has the same name as one of xgboost's parameters change the value of that parameter for a model. Use \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} to set or change model parameters. -The attribute setters would usually work more efficiently for \code{xgb.Booster.handle} -than for \code{xgb.Booster}, since only just a handle (pointer) would need to be copied. -That would only matter if attributes need to be set many times. -Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters, -the raw model cache of an \code{xgb.Booster} object would not be automatically updated, -and it would be the user's responsibility to call \code{\link[=xgb.serialize]{xgb.serialize()}} to update it. - The \verb{xgb.attributes<-} setter either updates the existing or adds one or several attributes, but it doesn't delete the other existing attributes. + +Important: since this modifies the booster's C object, semantics for assignment here +will differ from R's, as any object reference to the same booster will be modified +too, while assignment of R attributes through \verb{attributes(model)$ <- } +will follow the usual copy-on-write R semantics (see \link{xgb.xopy.Booster} for an +example of these behaviors). } \examples{ data(agaricus.train, package = "xgboost") diff --git a/R-package/man/xgb.config.Rd b/R-package/man/xgb.config.Rd index 83040b877396..4657df3185a5 100644 --- a/R-package/man/xgb.config.Rd +++ b/R-package/man/xgb.config.Rd @@ -10,13 +10,24 @@ xgb.config(object) xgb.config(object) <- value } \arguments{ -\item{object}{Object of class \code{xgb.Booster}.} +\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.} \item{value}{A JSON string.} } +\value{ +\code{xgb.config} will return the parameters as JSON text, which can be parsed through +e.g. \code{jsonlite::fromJSON}. +} \description{ Accessors for model parameters as JSON string } +\details{ +Note that assignment is performed in-place on the booster C object, which unlike assignment +of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references +to the same booster will also get updated. + +See \link{xgb.copy.Booster} for an example of this behavior. 
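# [Editor's sketch, not part of this patch] Parsing the JSON returned by xgb.config() as
# suggested above; assumes 'bst' is a fitted booster and that jsonlite is available.
cfg <- jsonlite::fromJSON(xgb.config(bst))
str(cfg, max.level = 2)  # inspect the stored training configuration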
+} \examples{ data(agaricus.train, package = "xgboost") diff --git a/R-package/man/xgb.copy.Booster.Rd b/R-package/man/xgb.copy.Booster.Rd new file mode 100644 index 000000000000..8426d039e5a0 --- /dev/null +++ b/R-package/man/xgb.copy.Booster.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.Booster.R +\name{xgb.copy.Booster} +\alias{xgb.copy.Booster} +\title{Deep-copies a Booster Object} +\usage{ +xgb.copy.Booster(model) +} +\arguments{ +\item{model}{An 'xgb.Booster' object.} +} +\value{ +A deep copy of \code{model} - it will be identical in every way, but C-level +functions called on that copy will not affect the \code{model} variable. +} +\description{ +Creates a deep copy of an 'xgb.Booster' object, such that the +C object pointer contained will be a different object, and hence functions +like \link{xgb.attr} will not affect the object from which it was copied. +} +\examples{ +library(xgboost) +data(mtcars) +y <- mtcars$mpg +x <- mtcars[, -1] +dm <- xgb.DMatrix(x, label = y, nthread = 1) +model <- xgb.train( + data = dm, + params = list(nthread = 1), + nround = 3 +) + +# Set an arbitrary attribute kept at the C level +xgb.attr(model, "my_attr") <- 100 +print(xgb.attr(model, "my_attr")) + +# Just assigning to a new variable will not create +# a deep copy - C object pointer is shared, and in-place +# modifications will affect both objects +model_shallow_copy <- model +xgb.attr(model_shallow_copy, "my_attr") <- 333 +# 'model' was also affected by this change: +print(xgb.attr(model, "my_attr")) + +model_deep_copy <- xgb.copy.Booster(model) +xgb.attr(model_deep_copy, "my_attr") <- 444 +# 'model' was NOT affected by this change +# (keeps previous value that was assigned before) +print(xgb.attr(model, "my_attr")) + +# Verify that the new object was actually modified +print(xgb.attr(model_deep_copy, "my_attr")) +} diff --git a/R-package/man/xgb.gblinear.history.Rd b/R-package/man/xgb.gblinear.history.Rd index bc8d46747279..103be16f11a9 100644 --- a/R-package/man/xgb.gblinear.history.Rd +++ b/R-package/man/xgb.gblinear.history.Rd @@ -8,7 +8,8 @@ xgb.gblinear.history(model, class_index = NULL) } \arguments{ \item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained -using the \code{cb.gblinear.history()} callback.} +using the \code{cb.gblinear.history()} callback, but \bold{not} a booster +loaded from \link{xgb.load} or \link{xgb.load.raw}.} \item{class_index}{zero-based class index to extract the coefficients for only that specific class in a multinomial multiclass model. When it is NULL, all the @@ -27,3 +28,11 @@ A helper function to extract the matrix of linear coefficients' history from a gblinear model created while using the \code{cb.gblinear.history()} callback. } +\details{ +Note that this is an R-specific function that relies on R attributes that +are not saved when using xgboost's own serialization functions like \link{xgb.load} +or \link{xgb.load.raw}. + +In order for a serialized model to be accepted by tgis function, one must use R +serializers such as \link{saveRDS}. 
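# [Editor's sketch, not part of this patch] Producing and reading the coefficient history
# described above; 'dm' is assumed to be an xgb.DMatrix, and 'keep_extra_attributes' is the
# new argument from this patch (the history is stored as an R attribute).
model <- xgb.train(
  params = list(booster = "gblinear", updater = "coord_descent", nthread = 1),
  data = dm, nrounds = 5,
  callbacks = list(cb.gblinear.history()),
  keep_extra_attributes = TRUE
)
coef_path <- xgb.gblinear.history(model)  # matrix of coefficients across boosting rounds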
+} diff --git a/R-package/man/xgb.is.same.Booster.Rd b/R-package/man/xgb.is.same.Booster.Rd new file mode 100644 index 000000000000..d2a2f4d179d8 --- /dev/null +++ b/R-package/man/xgb.is.same.Booster.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.Booster.R +\name{xgb.is.same.Booster} +\alias{xgb.is.same.Booster} +\title{Check if two boosters share the same C object} +\usage{ +xgb.is.same.Booster(obj1, obj2) +} +\arguments{ +\item{obj1}{Booster model to compare with \code{obj2}.} + +\item{obj2}{Booster model to compare with \code{obj1}.} +} +\value{ +Either \code{TRUE} or \code{FALSE} according to whether the two boosters share +the underlying C object. +} +\description{ +Checks whether two booster objects refer to the same underlying C object. +} +\details{ +As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr' +object, they don't follow typical copy-on-write semantics of other R objects - that is, if +one assigns a booster to a different variable and modifies that new variable through in-place +methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new +variable, unlike typical R assignments which would only modify the latter. + +This function allows checking whether two booster objects share the same 'externalptr', +regardless of the R attributes that they might have. + +In order to duplicate a booster in such a way that the copy wouldn't share the same +'externalptr', one can use function \link{xgb.copy.Booster}. +} +\examples{ +library(xgboost) +data(mtcars) +y <- mtcars$mpg +x <- as.matrix(mtcars[, -1]) +model <- xgb.train( + params = list(nthread = 1), + data = xgb.DMatrix(x, label = y, nthread = 1), + nround = 3 +) + +model_shallow_copy <- model +xgb.is.same.Booster(model, model_shallow_copy) # same C object + +model_deep_copy <- xgb.copy.Booster(model) +xgb.is.same.Booster(model, model_deep_copy) # different C objects + +# In-place assignments modify all references, +# but not full/deep copies of the booster +xgb.attr(model_shallow_copy, "my_attr") <- 111 +xgb.attr(model, "my_attr") # gets modified +xgb.attr(model_deep_copy, "my_attr") # doesn't get modified +} +\seealso{ +\link{xgb.copy.Booster} +} diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1a406cc21d0e..ab72b74b15e0 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -46,5 +46,5 @@ bst <- xgb.load('xgb.model') if (file.exists('xgb.model')) file.remove('xgb.model') } \seealso{ -\code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}. +\code{\link{xgb.save}} } diff --git a/R-package/man/xgb.load.raw.Rd b/R-package/man/xgb.load.raw.Rd index 0af890e6977d..d1da607375bd 100644 --- a/R-package/man/xgb.load.raw.Rd +++ b/R-package/man/xgb.load.raw.Rd @@ -8,8 +8,6 @@ xgb.load.raw(buffer, as_booster = FALSE) } \arguments{ \item{buffer}{the buffer returned by xgb.save.raw} - -\item{as_booster}{Return the loaded model as xgb.Booster instead of xgb.Booster.handle.} } \description{ User can generate raw memory buffer by calling xgb.save.raw diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 131830bde686..0cc2a6b9cbfc 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -16,7 +16,9 @@ xgb.model.dt.tree( \arguments{ \item{feature_names}{character vector of feature names. If the model already contains feature names, those would be used when \code{feature_names=NULL} (default value). 
-Non-null \code{feature_names} could be provided to override those in the model.} + +\if{html}{\out{
}}\preformatted{ Note that, if the model already contains feature names, it's \\bold\{not\} possible to override them here. +}\if{html}{\out{
}}} \item{model}{object of class \code{xgb.Booster}} diff --git a/R-package/man/xgb.parameters.Rd b/R-package/man/xgb.parameters.Rd index 5305afa51248..682e23ef5be8 100644 --- a/R-package/man/xgb.parameters.Rd +++ b/R-package/man/xgb.parameters.Rd @@ -7,17 +7,27 @@ xgb.parameters(object) <- value } \arguments{ -\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.} +\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place}.} \item{value}{A list (or an object coercible to a list) with the names of parameters to set and the elements corresponding to parameter values.} } +\value{ +The same booster \code{object}, which gets modified in-place. +} \description{ Only the setter for xgboost parameters is currently implemented. } \details{ -Note that the setter would usually work more efficiently for \code{xgb.Booster.handle} -than for \code{xgb.Booster}, since only just a handle would need to be copied. +Just like \link{xgb.attr}, this function will make in-place modifications +on the booster object which do not follow typical R assignment semantics - that is, +all references to the same booster will also be updated, unlike assingment of R +attributes which follow copy-on-write semantics. + +See \link{xgb.copy.Booster} for an example of this behavior. + +Be aware that setting the parameters of a fitted booster will reset its number of rounds +indicator to zero. } \examples{ data(agaricus.train, package = "xgboost") diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index a7e160a12a9b..387d46d59c4b 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -49,5 +49,5 @@ bst <- xgb.load('xgb.model') if (file.exists('xgb.model')) file.remove('xgb.model') } \seealso{ -\code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}. +\code{\link{xgb.load}} } diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 0835519336a0..824784448a5e 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -5,7 +5,7 @@ \title{Save xgboost model to R's raw vector, user can call xgb.load.raw to load the model back from raw vector} \usage{ -xgb.save.raw(model, raw_format = "deprecated") +xgb.save.raw(model, raw_format = "ubj") } \arguments{ \item{model}{the model object.} @@ -15,9 +15,7 @@ xgb.save.raw(model, raw_format = "deprecated") \item \code{json}: Encode the booster into JSON text document. \item \code{ubj}: Encode the booster into Universal Binary JSON. \item \code{deprecated}: Encode the booster into old customized binary format. -} - -Right now the default is \code{deprecated} but will be changed to \code{ubj} in upcoming release.} +}} } \description{ Save xgboost model from xgboost or xgb.train diff --git a/R-package/man/xgb.serialize.Rd b/R-package/man/xgb.serialize.Rd index 952441d98bea..358aff4d4906 100644 --- a/R-package/man/xgb.serialize.Rd +++ b/R-package/man/xgb.serialize.Rd @@ -2,19 +2,17 @@ % Please edit documentation in R/xgb.serialize.R \name{xgb.serialize} \alias{xgb.serialize} -\title{Serialize the booster instance into R's raw vector. The serialization method differs +\title{Serialize the booster instance into R's raw vector. The serialization method differs from \code{\link{xgb.save.raw}} as the latter one saves only the model but not -parameters. 
This serialization format is not stable across different xgboost versions.} +parameters.} \usage{ xgb.serialize(booster) } \arguments{ -\item{booster}{the booster instance} +\item{booster}{A booster instance} } \description{ -Serialize the booster instance into R's raw vector. The serialization method differs -from \code{\link{xgb.save.raw}} as the latter one saves only the model but not -parameters. This serialization format is not stable across different xgboost versions. +\bold{IMPORTANT:} this serialization format is not stable across different xgboost versions. } \examples{ data(agaricus.train, package='xgboost') diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 0ef2e2216d66..f768ad5f0261 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -19,7 +19,9 @@ xgb.train( save_period = NULL, save_name = "xgboost.model", xgb_model = NULL, + training_continuation = c("copy", "update"), callbacks = list(), + keep_extra_attributes = TRUE, ... ) @@ -202,10 +204,36 @@ This parameter is passed to the \code{\link{cb.early.stop}} callback.} Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a file with a previously saved model.} +\item{training_continuation}{when passing \code{xgb_model}, whether to update the previous model by +creating a copy of it which will contain the new boosted rounds (meaning: the original object +is kept as it was before the call to \code{xgb.train}, and there will be two booster objects), or +by updating the previous object in-place (meaning: the object passed under \code{xgb_model} will +be updated, and nothing will be returned from this function). + +\if{html}{\out{
}}\preformatted{ Note that, if passing "update" here, the object in `xgb_model` + \\bold\{will get updated regardless of whether this function succeeds or not\} + (for example, the parameters will be set on the existing `xgb_model`, overwriting previous ones, + even if the training fails because of some error). +}\if{html}{\out{
}}} + \item{callbacks}{a list of callback functions to perform various task during boosting. See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the parameters' values. User can provide either existing or their own callback methods in order -to customize the training process.} +to customize the training process. + +\if{html}{\out{
}}\preformatted{ Note that some callbacks might try to set an evaluation log - in order to keep such logs,
+ it's necessary to pass `keep_extra_attributes = TRUE`. Be aware that these evaluation logs
+ are kept as R attributes, and thus do not get saved when using non-R serializers like
+ \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
+}\if{html}{\out{
}}} + +\item{keep_extra_attributes}{Whether to keep extra R attributes in the booster object which +are specific to the R interface and which do not get saved along when calling functions +like \code{xgb.save}, but which get saved with R-specific serializers such as \code{saveRDS}. + +\if{html}{\out{
}}\preformatted{ These attributes include, for example, the function call that was used to produce the model, + evaluation logs from callbacks, among others. +}\if{html}{\out{
}}} \item{...}{other parameters to pass to \code{params}.} @@ -219,27 +247,9 @@ This parameter is only used when input is a dense matrix.} \item{weight}{a vector indicating the weight for each row of the input.} } \value{ -An object of class \code{xgb.Booster} with the following elements: -\itemize{ -\item \code{handle} a handle (pointer) to the xgboost model in memory. -\item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type. -\item \code{niter} number of boosting iterations. -\item \code{evaluation_log} evaluation history stored as a \code{data.table} with the -first column corresponding to iteration number and the rest corresponding to evaluation -metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback. -\item \code{call} a function call. -\item \code{params} parameters that were passed to the xgboost library. Note that it does not -capture parameters changed by the \code{\link{cb.reset.parameters}} callback. -\item \code{callbacks} callback functions that were either automatically assigned or -explicitly passed. -\item \code{best_iteration} iteration number with the best evaluation metric value -(only available with early stopping). -\item \code{best_score} the best evaluation metric value during early stopping. -(only available with early stopping). -\item \code{feature_names} names of the training dataset features -(only when column names were defined in training data). -\item \code{nfeatures} number of features in training data. -} +An object of class \code{xgb.Booster}, unless passing a previous \code{xgb_model} and passing +\code{training_continuation="update"}, in which case will update that object and return NULL +(invisibly) from this function. } \description{ \code{xgb.train} is an advanced interface for training an xgboost model. @@ -285,6 +295,20 @@ and the \code{print_every_n} parameter is passed to it. \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set. \item \code{cb.save.model}: when \code{save_period > 0} is set. } + +Note that objects of type \code{xgb.Booster} as returned by this function behave a bit differently +from typical R objects (it's an 'altrep' list class), and it makes a separation between +internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr} +and shared between interfaces through serialization functions like \link{xgb.save}; and +R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise +only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and +not anyhow used by functions like \link{predict.xgb.Booster}. + +If passing \code{keep_extra_attributes=TRUE}, note that the parameters passed here will be kept +in the R-specific attributes, but since functions like \link{xgb.parameters} allow changing +parameters in the C-level object after it has been fitted, be aware that there's no guarantee +that these R parameters would be synchronized with the internal booster parameters as +return by \link{xgb.parameters} or \link{xgb.config}. 
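# [Editor's sketch, not part of this patch] The two attribute layers described above;
# assumes 'bst' was trained with a watchlist and keep_extra_attributes = TRUE.
attributes(bst)$evaluation_log  # R-only attribute: kept by saveRDS(), dropped by xgb.save()
xgb.attr(bst, "note") <- "abc"  # C-level attribute: shared across interfaces, kept by xgb.save()
xgb.attr(bst, "note")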
} \examples{ data(agaricus.train, package='xgboost') diff --git a/R-package/man/xgb.unserialize.Rd b/R-package/man/xgb.unserialize.Rd index f83ee635dfb5..d07fa5d1f58b 100644 --- a/R-package/man/xgb.unserialize.Rd +++ b/R-package/man/xgb.unserialize.Rd @@ -4,17 +4,13 @@ \alias{xgb.unserialize} \title{Load the instance back from \code{\link{xgb.serialize}}} \usage{ -xgb.unserialize(buffer, handle = NULL) +xgb.unserialize(buffer) } \arguments{ \item{buffer}{the buffer containing booster instance saved by \code{\link{xgb.serialize}}} - -\item{handle}{An \code{xgb.Booster.handle} object which will be overwritten with -the new deserialized object. Must be a null handle (e.g. when loading the model through -\code{readRDS}). If not provided, a new handle will be created.} } \value{ -An \code{xgb.Booster.handle} object. +An \code{xgb.Booster} object. } \description{ Load the instance back from \code{\link{xgb.serialize}} diff --git a/R-package/src/init.c b/R-package/src/init.c index f957229af236..81cf73f6dac7 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -15,9 +15,17 @@ Check these declarations against the C/Fortran source code. */ /* .Call calls */ +extern void XGBInitializeAltrepClass(DllInfo *info); +static SEXP XGDuplicate_R(SEXP); +static SEXP XGDuplicateAttrib(SEXP, SEXP); +static SEXP XGPointerEqComparison(SEXP, SEXP); extern SEXP XGBoosterTrainOneIter_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterCreate_R(SEXP); -extern SEXP XGBoosterCreateInEmptyObj_R(SEXP, SEXP); +extern SEXP XGBoosterCopyInfoFromDMatrix(SEXP, SEXP); +extern SEXP XGBoosterSetStrFeatureInfo_R(SEXP, SEXP, SEXP); +extern SEXP XGBoosterGetStrFeatureInfo_R(SEXP, SEXP); +extern SEXP XGBoosterBoostedRounds_R(SEXP); +extern SEXP XGBoosterGetNumFeature_R(SEXP); extern SEXP XGBoosterDumpModel_R(SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterEvalOneIter_R(SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterGetAttrNames_R(SEXP); @@ -54,9 +62,16 @@ extern SEXP XGBGetGlobalConfig_R(void); extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { + {"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1}, + {"XGDuplicateAttrib", (DL_FUNC) &XGDuplicateAttrib, 2}, + {"XGPointerEqComparison", (DL_FUNC) &XGPointerEqComparison, 2}, {"XGBoosterTrainOneIter_R", (DL_FUNC) &XGBoosterTrainOneIter_R, 5}, {"XGBoosterCreate_R", (DL_FUNC) &XGBoosterCreate_R, 1}, - {"XGBoosterCreateInEmptyObj_R", (DL_FUNC) &XGBoosterCreateInEmptyObj_R, 2}, + {"XGBoosterCopyInfoFromDMatrix",(DL_FUNC) &XGBoosterCopyInfoFromDMatrix,2}, // NOLINT + {"XGBoosterSetStrFeatureInfo_R",(DL_FUNC) &XGBoosterSetStrFeatureInfo_R,3}, // NOLINT + {"XGBoosterGetStrFeatureInfo_R",(DL_FUNC) &XGBoosterGetStrFeatureInfo_R,2}, // NOLINT + {"XGBoosterBoostedRounds_R", (DL_FUNC) &XGBoosterBoostedRounds_R, 1}, + {"XGBoosterGetNumFeature_R", (DL_FUNC) &XGBoosterGetNumFeature_R, 1}, {"XGBoosterDumpModel_R", (DL_FUNC) &XGBoosterDumpModel_R, 4}, {"XGBoosterEvalOneIter_R", (DL_FUNC) &XGBoosterEvalOneIter_R, 4}, {"XGBoosterGetAttrNames_R", (DL_FUNC) &XGBoosterGetAttrNames_R, 1}, @@ -100,4 +115,5 @@ __declspec(dllexport) void attribute_visible R_init_xgboost(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); + XGBInitializeAltrepClass(dll); } diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 79b9e60374b2..a30bc4101c5b 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -188,7 +188,7 @@ char cpp_ex_msg[256]; using dmlc::BeginPtr; XGB_DLL SEXP 
XGCheckNullPtr_R(SEXP handle) { - return ScalarLogical(R_ExternalPtrAddr(handle) == NULL); + return Rf_ScalarLogical(R_ExternalPtrAddr(handle) == nullptr); } XGB_DLL void _DMatrixFinalizer(SEXP ext) { @@ -455,8 +455,14 @@ XGB_DLL SEXP XGDMatrixSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP array) { } SEXP str_info_holder = PROTECT(Rf_allocVector(VECSXP, len)); - for (size_t i = 0; i < len; ++i) { - SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i))); + if (TYPEOF(array) == STRSXP) { + for (size_t i = 0; i < len; ++i) { + SET_VECTOR_ELT(str_info_holder, i, STRING_ELT(array, i)); + } + } else { + for (size_t i = 0; i < len; ++i) { + SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i))); + } } SEXP field_ = PROTECT(Rf_asChar(field)); @@ -542,15 +548,147 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) { return ScalarInteger(static_cast(ncol)); } +XGB_DLL SEXP XGDuplicate_R(SEXP obj) { + return Rf_duplicate(obj); +} + +XGB_DLL SEXP XGDuplicateAttrib(SEXP src, SEXP dest) { + DUPLICATE_ATTRIB(dest, src); + return R_NilValue; +} + +XGB_DLL SEXP XGPointerEqComparison(SEXP obj1, SEXP obj2) { + return Rf_ScalarLogical(R_ExternalPtrAddr(obj1) == R_ExternalPtrAddr(obj2)); +} + // functions related to booster -void _BoosterFinalizer(SEXP ext) { - if (R_ExternalPtrAddr(ext) == NULL) return; - CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(ext))); - R_ClearExternalPtr(ext); +static void _BoosterFinalizer(SEXP R_ptr) { + if (R_ExternalPtrAddr(R_ptr) == NULL) return; + CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(R_ptr))); + R_ClearExternalPtr(R_ptr); +} + +/* Booster is represented as an altrep list with one element which +corresponds to an 'externalptr' holding the C object, forbidding +modification by not implementing setters, and adding custom serialization. */ +static R_altrep_class_t XGBAltrepPointerClass; + +static R_xlen_t XGBAltrepPointerLength_R(SEXP R_altrepped_obj) { + return 1; +} + +static SEXP XGBAltrepPointerGetElt_R(SEXP R_altrepped_obj, R_xlen_t idx) { + return R_altrep_data1(R_altrepped_obj); +} + +static SEXP XGBMakeEmptyAltrep() { + SEXP class_name = Rf_protect(Rf_mkString("xgb.Booster")); + SEXP elt_names = Rf_protect(Rf_mkString("ptr")); + SEXP R_ptr = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + SEXP R_altrepped_obj = Rf_protect(R_new_altrep(XGBAltrepPointerClass, R_ptr, R_NilValue)); + Rf_setAttrib(R_altrepped_obj, R_NamesSymbol, elt_names); + Rf_setAttrib(R_altrepped_obj, R_ClassSymbol, class_name); + Rf_unprotect(4); + return R_altrepped_obj; +} + +/* Note: the idea for separating this function from the one above is to be +able to trigger all R allocations first before doing non-R allocations. 
*/ +static void XGBAltrepSetPointer(SEXP R_altrepped_obj, BoosterHandle handle) { + SEXP R_ptr = R_altrep_data1(R_altrepped_obj); + R_SetExternalPtrAddr(R_ptr, handle); + R_RegisterCFinalizerEx(R_ptr, _BoosterFinalizer, TRUE); +} + +const char *ubj_json_format_str = "{\"format\": \"ubj\"}"; + +static SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) { + R_API_BEGIN(); + BoosterHandle handle = R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj)); + char const *serialized_bytes; + bst_ulong serialized_length; + CHECK_CALL(XGBoosterSaveModelToBuffer( + handle, ubj_json_format_str, &serialized_length, &serialized_bytes)); + SEXP R_state = Rf_protect(Rf_allocVector(RAWSXP, serialized_length)); + if (serialized_length != 0) { + std::memcpy(RAW(R_state), serialized_bytes, serialized_length); + } + Rf_unprotect(1); + return R_state; + R_API_END(); + return R_NilValue; /* <- should not be reached */ +} + +static SEXP XGBAltrepDeserializer_R(SEXP unused, SEXP R_state) { + SEXP R_altrepped_obj = Rf_protect(XGBMakeEmptyAltrep()); + R_API_BEGIN(); + BoosterHandle handle = nullptr; + CHECK_CALL(XGBoosterCreate(nullptr, 0, &handle)); + int res_code = XGBoosterLoadModelFromBuffer(handle, + RAW(R_state), + Rf_xlength(R_state)); + if (res_code != 0) { + XGBoosterFree(handle); + } + CHECK_CALL(res_code); + XGBAltrepSetPointer(R_altrepped_obj, handle); + R_API_END(); + Rf_unprotect(1); + return R_altrepped_obj; +} + +// https://purrple.cat/blog/2018/10/14/altrep-and-cpp/ +static Rboolean XGBAltrepInspector_R( + SEXP x, int pre, int deep, int pvec, + void (*inspect_subtree)(SEXP, int, int, int)) { + Rprintf("Altrepped external pointer [address:%p]\n", + R_ExternalPtrAddr(R_altrep_data1(x))); + return TRUE; +} + +static SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) { + R_API_BEGIN(); + if (!deep) { + SEXP out = Rf_protect(XGBMakeEmptyAltrep()); + R_set_altrep_data1(out, R_altrep_data1(R_altrepped_obj)); + Rf_unprotect(1); + return out; + } else { + SEXP out = Rf_protect(XGBMakeEmptyAltrep()); + char const *serialized_bytes; + bst_ulong serialized_length; + CHECK_CALL(XGBoosterSaveModelToBuffer( + R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj)), + ubj_json_format_str, &serialized_length, &serialized_bytes)); + BoosterHandle new_handle = nullptr; + CHECK_CALL(XGBoosterCreate(nullptr, 0, &new_handle)); + int res_code = XGBoosterLoadModelFromBuffer(new_handle, + serialized_bytes, + serialized_length); + if (res_code != 0) { + XGBoosterFree(new_handle); + } + CHECK_CALL(res_code); + XGBAltrepSetPointer(out, new_handle); + Rf_unprotect(1); + return out; + } + R_API_END(); + return R_NilValue; /* <- should not be reached */ +} + +XGB_DLL void XGBInitializeAltrepClass(DllInfo *dll) { + XGBAltrepPointerClass = R_make_altlist_class("XGBAltrepPointerClass", "xgboost", dll); + R_set_altrep_Length_method(XGBAltrepPointerClass, XGBAltrepPointerLength_R); + R_set_altlist_Elt_method(XGBAltrepPointerClass, XGBAltrepPointerGetElt_R); + R_set_altrep_Inspect_method(XGBAltrepPointerClass, XGBAltrepInspector_R); + R_set_altrep_Serialized_state_method(XGBAltrepPointerClass, XGBAltrepSerializer_R); + R_set_altrep_Unserialize_method(XGBAltrepPointerClass, XGBAltrepDeserializer_R); + R_set_altrep_Duplicate_method(XGBAltrepPointerClass, XGBAltrepDuplicate_R); } XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) { - SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + SEXP out = Rf_protect(XGBMakeEmptyAltrep()); R_API_BEGIN(); R_xlen_t len = Rf_xlength(dmats); BoosterHandle handle; @@ -564,33 +702,104 @@ 
XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) { res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle); } CHECK_CALL(res_code); - R_SetExternalPtrAddr(ret, handle); - R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE); + XGBAltrepSetPointer(out, handle); R_API_END(); - UNPROTECT(1); - return ret; + Rf_unprotect(1); + return out; } -XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) { +XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix(SEXP booster, SEXP dmat) { R_API_BEGIN(); - R_xlen_t len = Rf_xlength(dmats); - BoosterHandle handle; + char const **feature_names; + bst_ulong len_feature_names = 0; + CHECK_CALL(XGDMatrixGetStrFeatureInfo(R_ExternalPtrAddr(dmat), + "feature_name", + &len_feature_names, + &feature_names)); + if (len_feature_names) { + CHECK_CALL(XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(booster), + "feature_name", + feature_names, + len_feature_names)); + } + + char const **feature_types; + bst_ulong len_feature_types = 0; + CHECK_CALL(XGDMatrixGetStrFeatureInfo(R_ExternalPtrAddr(dmat), + "feature_type", + &len_feature_types, + &feature_types)); + if (len_feature_types) { + CHECK_CALL(XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(booster), + "feature_type", + feature_types, + len_feature_types)); + } + R_API_END(); + return R_NilValue; +} + +XGB_DLL SEXP XGBoosterSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP features) { + R_API_BEGIN(); + SEXP field_char = Rf_protect(Rf_asChar(field)); + bst_ulong len_features = Rf_xlength(features); int res_code; { - std::vector dvec(len); - for (R_xlen_t i = 0; i < len; ++i) { - dvec[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i)); + std::vector str_arr(len_features); + for (bst_ulong idx = 0; idx < len_features; idx++) { + str_arr[idx] = CHAR(STRING_ELT(features, idx)); } - res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle); + res_code = XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(handle), + CHAR(field_char), + str_arr.data(), + len_features); } CHECK_CALL(res_code); - R_SetExternalPtrAddr(R_handle, handle); - R_RegisterCFinalizerEx(R_handle, _BoosterFinalizer, TRUE); + Rf_unprotect(1); R_API_END(); return R_NilValue; } +XGB_DLL SEXP XGBoosterGetStrFeatureInfo_R(SEXP handle, SEXP field) { + R_API_BEGIN(); + bst_ulong len; + const char **out_features; + SEXP field_char = Rf_protect(Rf_asChar(field)); + CHECK_CALL(XGBoosterGetStrFeatureInfo(R_ExternalPtrAddr(handle), + CHAR(field_char), &len, &out_features)); + SEXP out = Rf_protect(Rf_allocVector(STRSXP, len)); + for (bst_ulong idx = 0; idx < len; idx++) { + SET_STRING_ELT(out, idx, Rf_mkChar(out_features[idx])); + } + Rf_unprotect(2); + return out; + R_API_END(); + return R_NilValue; /* <- should not be reached */ +} + +XGB_DLL SEXP XGBoosterBoostedRounds_R(SEXP handle) { + SEXP out = Rf_protect(Rf_allocVector(INTSXP, 1)); + R_API_BEGIN(); + CHECK_CALL(XGBoosterBoostedRounds(R_ExternalPtrAddr(handle), INTEGER(out))); + R_API_END(); + Rf_unprotect(1); + return out; +} + +/* Note: R's integer class is 32-bit-and-signed only, while xgboost +supports more, so it returns it as a floating point instead */ +XGB_DLL SEXP XGBoosterGetNumFeature_R(SEXP handle) { + SEXP out = Rf_protect(Rf_allocVector(REALSXP, 1)); + R_API_BEGIN(); + bst_ulong res; + CHECK_CALL(XGBoosterGetNumFeature(R_ExternalPtrAddr(handle), &res)); + REAL(out)[0] = static_cast(res); + R_API_END(); + Rf_unprotect(1); + return out; +} + XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) { R_API_BEGIN(); SEXP name_ = PROTECT(Rf_asChar(name)); @@ -606,8 +815,8 @@ XGB_DLL SEXP 
XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) { XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) { R_API_BEGIN(); CHECK_CALL(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle), - asInteger(iter), - R_ExternalPtrAddr(dtrain))); + Rf_asInteger(iter), + R_ExternalPtrAddr(dtrain))); R_API_END(); return R_NilValue; } diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 2e874e3a6a2a..b88a0ac7b832 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -8,7 +8,9 @@ #define XGBOOST_R_H_ // NOLINT(*) +#include #include +#include #include #include @@ -143,19 +145,69 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle); */ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle); +/*! + * \brief Call R C-level function 'duplicate' + * \param obj Object to duplicate + */ +XGB_DLL SEXP XGDuplicate_R(SEXP obj); + +/*! + * \brief Call R C-level function 'DUPLICATE_ATTRIB' + * \param src Object from which to take the attributes + * \param dest Object for which to set the attributes + */ +XGB_DLL SEXP XGDuplicateAttrib(SEXP src, SEXP dest); + +/*! + * \brief Equality comparison for two pointers + * \param obj1 R 'externalptr' + * \param obj2 R 'externalptr' + */ +XGB_DLL SEXP XGPointerEqComparison(SEXP obj1, SEXP obj2); + +/*! + * \brief Register the Altrep class used for the booster + * \param dll DLL info as provided by R_init + */ +XGB_DLL void XGBInitializeAltrepClass(DllInfo *dll); + /*! * \brief create xgboost learner * \param dmats a list of dmatrix handles that will be cached */ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats); +/*! + * \brief copy information about features from a DMatrix into a Booster + * \param booster R 'externalptr' pointing to a booster object + * \param dmat R 'externalptr' pointing to a DMatrix object + */ +XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix(SEXP booster, SEXP dmat); + +/*! + * \brief handle R 'externalptr' holding the booster object + * \param field field name + * \param features features to set for the field + */ +XGB_DLL SEXP XGBoosterSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP features); /*! - * \brief create xgboost learner, saving the pointer into an existing R object - * \param dmats a list of dmatrix handles that will be cached - * \param R_handle a clean R external pointer (not holding any object) + * \brief handle R 'externalptr' holding the booster object + * \param field field name + */ +XGB_DLL SEXP XGBoosterGetStrFeatureInfo_R(SEXP handle, SEXP field); + +/*! + * \brief Get the number of boosted rounds from a model + * \param handle R 'externalptr' holding the booster object + */ +XGB_DLL SEXP XGBoosterBoostedRounds_R(SEXP handle); + +/*! + * \brief Get the number of features to which the model was fitted + * \param handle R 'externalptr' holding the booster object */ -XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle); +XGB_DLL SEXP XGBoosterGetNumFeature_R(SEXP handle); /*! 
* \brief set parameters diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 8ecf86e87178..5cae4aba9321 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -24,10 +24,10 @@ test_that("train and predict binary classification", { "train-error" ) expect_equal(class(bst), "xgb.Booster") - expect_equal(bst$niter, nrounds) - expect_false(is.null(bst$evaluation_log)) - expect_equal(nrow(bst$evaluation_log), nrounds) - expect_lt(bst$evaluation_log[, min(train_error)], 0.03) + expect_equal(xgb.nrounds(bst), nrounds) + expect_false(is.null(attributes(bst)$evaluation_log)) + expect_equal(nrow(attributes(bst)$evaluation_log), nrounds) + expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.03) pred <- predict(bst, test$data) expect_length(pred, 1611) @@ -35,7 +35,7 @@ test_that("train and predict binary classification", { pred1 <- predict(bst, train$data, ntreelimit = 1) expect_length(pred1, 6513) err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) - err_log <- bst$evaluation_log[1, train_error] + err_log <- attributes(bst)$evaluation_log[1, train_error] expect_lt(abs(err_pred1 - err_log), 10e-6) pred2 <- predict(bst, train$data, iterationrange = c(1, 2)) @@ -158,9 +158,9 @@ test_that("train and predict softprob", { ), "train-merror" ) - expect_false(is.null(bst$evaluation_log)) - expect_lt(bst$evaluation_log[, min(train_merror)], 0.025) - expect_equal(bst$niter * 3, xgb.ntree(bst)) + expect_false(is.null(attributes(bst)$evaluation_log)) + expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025) + expect_equal(xgb.nrounds(bst) * 3, xgb.ntree(bst)) pred <- predict(bst, as.matrix(iris[, -5])) expect_length(pred, nrow(iris) * 3) # row sums add up to total probability of 1: @@ -170,12 +170,12 @@ test_that("train and predict softprob", { expect_equal(as.numeric(t(mpred)), pred) pred_labels <- max.col(mpred) - 1 err <- sum(pred_labels != lb) / length(lb) - expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6) + expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6) # manually calculate error at the 1st iteration: mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1) pred_labels <- max.col(mpred) - 1 err <- sum(pred_labels != lb) / length(lb) - expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6) + expect_equal(attributes(bst)$evaluation_log[1, train_merror], err, tolerance = 5e-6) mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2)) expect_equal(mpred, mpred1) @@ -208,14 +208,14 @@ test_that("train and predict softmax", { ), "train-merror" ) - expect_false(is.null(bst$evaluation_log)) - expect_lt(bst$evaluation_log[, min(train_merror)], 0.025) - expect_equal(bst$niter * 3, xgb.ntree(bst)) + expect_false(is.null(attributes(bst)$evaluation_log)) + expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025) + expect_equal(xgb.nrounds(bst) * 3, xgb.ntree(bst)) pred <- predict(bst, as.matrix(iris[, -5])) expect_length(pred, nrow(iris)) err <- sum(pred != lb) / length(lb) - expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6) + expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6) }) test_that("train and predict RF", { @@ -228,12 +228,12 @@ test_that("train and predict RF", { nrounds = 1, objective = "binary:logistic", eval_metric = "error", num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 
0.1 ) - expect_equal(bst$niter, 1) + expect_equal(xgb.nrounds(bst), 1) expect_equal(xgb.ntree(bst), 20) pred <- predict(bst, train$data) pred_err <- sum((pred > 0.5) != lb) / length(lb) - expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6) + expect_lt(abs(attributes(bst)$evaluation_log[1, train_error] - pred_err), 10e-6) # expect_lt(pred_err, 0.03) pred <- predict(bst, train$data, ntreelimit = 20) @@ -255,18 +255,18 @@ test_that("train and predict RF with softprob", { num_class = 3, verbose = 0, num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5 ) - expect_equal(bst$niter, 15) + expect_equal(xgb.nrounds(bst), 15) expect_equal(xgb.ntree(bst), 15 * 3 * 4) # predict for all iterations: pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE) expect_equal(dim(pred), c(nrow(iris), 3)) pred_labels <- max.col(pred) - 1 err <- sum(pred_labels != lb) / length(lb) - expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6) + expect_equal(attributes(bst)$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6) # predict for 7 iterations and adjust for 4 parallel trees per iteration pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4) err <- sum((max.col(pred) - 1) != lb) / length(lb) - expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6) + expect_equal(attributes(bst)$evaluation_log[7, train_merror], err, tolerance = 5e-6) }) test_that("use of multiple eval metrics works", { @@ -278,9 +278,9 @@ test_that("use of multiple eval metrics works", { ), "train-error.*train-auc.*train-logloss" ) - expect_false(is.null(bst$evaluation_log)) - expect_equal(dim(bst$evaluation_log), c(2, 4)) - expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss")) + expect_false(is.null(attributes(bst)$evaluation_log)) + expect_equal(dim(attributes(bst)$evaluation_log), c(2, 4)) + expect_equal(colnames(attributes(bst)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss")) expect_output( bst2 <- xgboost( data = train$data, label = train$label, max_depth = 2, @@ -289,9 +289,9 @@ test_that("use of multiple eval metrics works", { ), "train-error.*train-auc.*train-logloss" ) - expect_false(is.null(bst2$evaluation_log)) - expect_equal(dim(bst2$evaluation_log), c(2, 4)) - expect_equal(colnames(bst2$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss")) + expect_false(is.null(attributes(bst2)$evaluation_log)) + expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 4)) + expect_equal(colnames(attributes(bst2)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss")) }) @@ -311,25 +311,34 @@ test_that("training continuation works", { # continue for two more: bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1) if (!windows_flag && !solaris_flag) { - expect_equal(bst$raw, bst2$raw) + expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } - expect_false(is.null(bst2$evaluation_log)) - expect_equal(dim(bst2$evaluation_log), c(4, 2)) - expect_equal(bst2$evaluation_log, bst$evaluation_log) + expect_false(is.null(attributes(bst2)$evaluation_log)) + expect_equal(dim(attributes(bst2)$evaluation_log), c(4, 2)) + expect_equal(attributes(bst2)$evaluation_log, attributes(bst)$evaluation_log) # test continuing from raw model data - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw) + bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = 
xgb.save.raw(bst1)) if (!windows_flag && !solaris_flag) { - expect_equal(bst$raw, bst2$raw) + expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } - expect_equal(dim(bst2$evaluation_log), c(2, 2)) + expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2)) # test continuing from a model in file xgb.save(bst1, "xgboost.json") bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json") if (!windows_flag && !solaris_flag) { - expect_equal(bst$raw, bst2$raw) + expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } - expect_equal(dim(bst2$evaluation_log), c(2, 2)) + expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2)) file.remove("xgboost.json") + + set.seed(11) + bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) + # continue for two more, this time in-place: + xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, + xgb_model = bst1, training_continuation = "update") + if (!windows_flag && !solaris_flag) { + expect_equal(xgb.save.raw(bst), xgb.save.raw(bst1)) + } }) test_that("model serialization works", { @@ -448,8 +457,8 @@ test_that("max_delta_step works", { # model with restricted max_delta_step bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1) # the no-restriction model is expected to have consistently lower loss during the initial iterations - expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss)) - expect_lt(mean(bst1$evaluation_log$train_logloss) / mean(bst2$evaluation_log$train_logloss), 0.8) + expect_true(all(attributes(bst1)$evaluation_log$train_logloss < attributes(bst2)$evaluation_log$train_logloss)) + expect_lt(mean(attributes(bst1)$evaluation_log$train_logloss) / mean(attributes(bst2)$evaluation_log$train_logloss), 0.8) }) test_that("colsample_bytree works", { @@ -668,3 +677,126 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", { pred_gr <- predict(model_gr, x) expect_equal(pred_qid, pred_gr) }) + +test_that("Coefficients from gblinear have the expected shape and names", { + # Single-column coefficients + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + mm <- model.matrix(~., data = mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = 1) + model <- xgb.train( + data = dm, + params = list( + booster = "gblinear", + nthread = 1 + ), + nrounds = 3 + ) + coefs <- coef(model) + expect_equal(length(coefs), ncol(x) + 1) + expect_equal(names(coefs), c("(Intercept)", colnames(x))) + pred_auto <- predict(model, x) + pred_manual <- as.numeric(mm %*% coefs) + expect_equal(pred_manual, pred_auto, tolerance = 1e-5) + + # Multi-column coefficients + data(iris) + y <- as.numeric(iris$Species) - 1 + x <- as.matrix(iris[, -5]) + dm <- xgb.DMatrix(x, label = y, nthread = 1) + mm <- model.matrix(~., data = iris[, -5]) + model <- xgb.train( + data = dm, + params = list( + booster = "gblinear", + objective = "multi:softprob", + num_class = 3, + nthread = 1 + ), + nrounds = 3 + ) + coefs <- coef(model) + expect_equal(nrow(coefs), ncol(x) + 1) + expect_equal(ncol(coefs), 3) + expect_equal(row.names(coefs), c("(Intercept)", colnames(x))) + pred_auto <- predict(model, x, outputmargin = TRUE, reshape = TRUE) + pred_manual <- unname(mm %*% coefs) + expect_equal(pred_manual, pred_auto, tolerance = 1e-7) +}) + +test_that("Deep copies work as expected", { + data(mtcars) + y <- mtcars$mpg + x <- mtcars[, -1] + dm <- xgb.DMatrix(x, label = y, nthread = 1) + model <- xgb.train( + data = dm, + params = list(nthread = 1), + nrounds 
= 3 + ) + + xgb.attr(model, "my_attr") <- 100 + model_shallow_copy <- model + xgb.attr(model_shallow_copy, "my_attr") <- 333 + attr_orig <- xgb.attr(model, "my_attr") + attr_shallow <- xgb.attr(model_shallow_copy, "my_attr") + expect_equal(attr_orig, attr_shallow) + + model_deep_copy <- xgb.copy.Booster(model) + xgb.attr(model_deep_copy, "my_attr") <- 444 + attr_orig <- xgb.attr(model, "my_attr") + attr_deep <- xgb.attr(model_deep_copy, "my_attr") + expect_false(attr_orig == attr_deep) +}) + +test_that("Pointer comparison works as expected", { + library(xgboost) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + model <- xgb.train( + params = list(nthread = 1), + data = xgb.DMatrix(x, label = y, nthread = 1), + nrounds = 3 + ) + + model_shallow_copy <- model + expect_true(xgb.is.same.Booster(model, model_shallow_copy)) + + model_deep_copy <- xgb.copy.Booster(model) + expect_false(xgb.is.same.Booster(model, model_deep_copy)) + + xgb.attr(model_shallow_copy, "my_attr") <- 111 + expect_equal(xgb.attr(model, "my_attr"), "111") + expect_null(xgb.attr(model_deep_copy, "my_attr")) +}) + +test_that("DMatrix field are set to booster when training", { + set.seed(123) + y <- rnorm(100) + x <- matrix(rnorm(100 * 3), nrow = 100) + x[, 2] <- abs(as.integer(x[, 2])) + + dm_unnamed <- xgb.DMatrix(x, label = y, nthread = 1) + dm_feature_names <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1) + dm_feature_types <- xgb.DMatrix(x, label = y) + setinfo(dm_feature_types, "feature_type", c("q", "c", "q")) + dm_both <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1) + setinfo(dm_both, "feature_type", c("q", "c", "q")) + + params <- list(nthread = 1) + model_unnamed <- xgb.train(data = dm_unnamed, params = params, nrounds = 3) + model_feature_names <- xgb.train(data = dm_feature_names, params = params, nrounds = 3) + model_feature_types <- xgb.train(data = dm_feature_types, params = params, nrounds = 3) + model_both <- xgb.train(data = dm_both, params = params, nrounds = 3) + + expect_null(getinfo(model_unnamed, "feature_name")) + expect_equal(getinfo(model_feature_names, "feature_name"), c("a", "b", "c")) + expect_null(getinfo(model_feature_types, "feature_name")) + expect_equal(getinfo(model_both, "feature_name"), c("a", "b", "c")) + + expect_null(getinfo(model_unnamed, "feature_type")) + expect_null(getinfo(model_feature_names, "feature_type")) + expect_equal(getinfo(model_feature_types, "feature_type"), c("q", "c", "q")) + expect_equal(getinfo(model_both, "feature_type"), c("q", "c", "q")) +}) diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index b5d3c5310f00..7a8d4f53743f 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -111,9 +111,9 @@ test_that("can store evaluation_log without printing", { expect_silent( bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0) ) - expect_false(is.null(bst$evaluation_log)) - expect_false(is.null(bst$evaluation_log$train_error)) - expect_lt(bst$evaluation_log[, min(train_error)], 0.2) + expect_false(is.null(attributes(bst)$evaluation_log)) + expect_false(is.null(attributes(bst)$evaluation_log$train_error)) + expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.2) }) test_that("cb.reset.parameters works as expected", { @@ -121,34 +121,34 @@ test_that("cb.reset.parameters works as expected", { # fixed eta set.seed(111) bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, 
verbose = 0) - expect_false(is.null(bst0$evaluation_log)) - expect_false(is.null(bst0$evaluation_log$train_error)) + expect_false(is.null(attributes(bst0)$evaluation_log)) + expect_false(is.null(attributes(bst0)$evaluation_log$train_error)) # same eta but re-set as a vector parameter in the callback set.seed(111) my_par <- list(eta = c(0.9, 0.9)) bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) - expect_false(is.null(bst1$evaluation_log$train_error)) - expect_equal(bst0$evaluation_log$train_error, - bst1$evaluation_log$train_error) + expect_false(is.null(attributes(bst1)$evaluation_log$train_error)) + expect_equal(attributes(bst0)$evaluation_log$train_error, + attributes(bst1)$evaluation_log$train_error) # same eta but re-set via a function in the callback set.seed(111) my_par <- list(eta = function(itr, itr_end) 0.9) bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) - expect_false(is.null(bst2$evaluation_log$train_error)) - expect_equal(bst0$evaluation_log$train_error, - bst2$evaluation_log$train_error) + expect_false(is.null(attributes(bst2)$evaluation_log$train_error)) + expect_equal(attributes(bst0)$evaluation_log$train_error, + attributes(bst2)$evaluation_log$train_error) # different eta re-set as a vector parameter in the callback set.seed(111) my_par <- list(eta = c(0.6, 0.5)) bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) - expect_false(is.null(bst3$evaluation_log$train_error)) - expect_false(all(bst0$evaluation_log$train_error == bst3$evaluation_log$train_error)) + expect_false(is.null(attributes(bst3)$evaluation_log$train_error)) + expect_false(all(attributes(bst0)$evaluation_log$train_error == attributes(bst3)$evaluation_log$train_error)) # resetting multiple parameters at the same time runs with no error my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8)) @@ -166,8 +166,8 @@ test_that("cb.reset.parameters works as expected", { my_par <- list(eta = c(0., 0.)) bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) - expect_false(is.null(bstX$evaluation_log$train_error)) - er <- unique(bstX$evaluation_log$train_error) + expect_false(is.null(attributes(bstX)$evaluation_log$train_error)) + er <- unique(attributes(bstX)$evaluation_log$train_error) expect_length(er, 1) expect_gt(er, 0.4) }) @@ -182,14 +182,21 @@ test_that("cb.save.model works as expected", { expect_true(file.exists('xgboost_02.json')) b1 <- xgb.load('xgboost_01.json') xgb.parameters(b1) <- list(nthread = 2) - expect_equal(xgb.ntree(b1), 1) + expect_equal(xgb.nrounds(b1), 1) b2 <- xgb.load('xgboost_02.json') xgb.parameters(b2) <- list(nthread = 2) - expect_equal(xgb.ntree(b2), 2) + expect_equal(xgb.nrounds(b2), 2) xgb.config(b2) <- xgb.config(bst) expect_equal(xgb.config(bst), xgb.config(b2)) - expect_equal(bst$raw, b2$raw) + # TODO: remove this workaround once serialization to disk is able to take more attributes + # expect_equal(xgb.save.raw(bst), xgb.save.raw(b2)) + # workaround below: + s1 <- jsonlite::fromJSON(rawToChar(xgb.save.raw(bst, raw_format = "json"))) + s2 <- jsonlite::fromJSON(rawToChar(xgb.save.raw(b2, raw_format = "json"))) + s1$learner$feature_names <- NULL + s2$learner$feature_names <- NULL + expect_equal(s1, s2) # save_period = 0 saves the last iteration's model bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, 
eta = 1, verbose = 0, @@ -197,7 +204,14 @@ test_that("cb.save.model works as expected", { expect_true(file.exists('xgboost.json')) b2 <- xgb.load('xgboost.json') xgb.config(b2) <- xgb.config(bst) - expect_equal(bst$raw, b2$raw) + # TODO: remove this workaround once serialization to disk is able to take more attributes + # expect_equal(xgb.save.raw(bst), xgb.save.raw(b2)) + # workaround below: + s1 <- jsonlite::fromJSON(rawToChar(xgb.save.raw(bst, raw_format = "json"))) + s2 <- jsonlite::fromJSON(rawToChar(xgb.save.raw(b2, raw_format = "json"))) + s1$learner$feature_names <- NULL + s2$learner$feature_names <- NULL + expect_equal(s1, s2) for (f in files) if (file.exists(f)) file.remove(f) }) @@ -208,14 +222,14 @@ test_that("early stopping xgb.train works", { bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3, early_stopping_rounds = 3, maximize = FALSE) , "Stopping. Best iteration") - expect_false(is.null(bst$best_iteration)) - expect_lt(bst$best_iteration, 19) - expect_equal(bst$best_iteration, bst$best_ntreelimit) + expect_false(is.null(xgb.attr(bst, "best_iteration"))) + expect_lt(xgb.attr(bst, "best_iteration"), 19) + expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit")) pred <- predict(bst, dtest) expect_equal(length(pred), 1611) err_pred <- err(ltest, pred) - err_log <- bst$evaluation_log[bst$best_iteration, test_error] + err_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_error] expect_equal(err_log, err_pred, tolerance = 5e-6) set.seed(11) @@ -223,14 +237,14 @@ test_that("early stopping xgb.train works", { bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3, early_stopping_rounds = 3, maximize = FALSE, verbose = 0) ) - expect_equal(bst$evaluation_log, bst0$evaluation_log) + expect_equal(attributes(bst)$evaluation_log, attributes(bst0)$evaluation_log) xgb.save(bst, "model.bin") loaded <- xgb.load("model.bin") - expect_false(is.null(loaded$best_iteration)) - expect_equal(loaded$best_iteration, bst$best_ntreelimit) - expect_equal(loaded$best_ntreelimit, bst$best_ntreelimit) + expect_false(is.null(xgb.attr(loaded, "best_iteration"))) + expect_equal(xgb.attr(loaded, "best_iteration"), xgb.attr(bst, "best_ntreelimit")) + expect_equal(xgb.attr(loaded, "best_ntreelimit"), xgb.attr(bst, "best_ntreelimit")) file.remove("model.bin") }) @@ -243,14 +257,14 @@ test_that("early stopping using a specific metric works", { callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE, metric_name = 'test_logloss'))) , "Stopping. 
Best iteration") - expect_false(is.null(bst$best_iteration)) - expect_lt(bst$best_iteration, 19) - expect_equal(bst$best_iteration, bst$best_ntreelimit) + expect_false(is.null(xgb.attr(bst, "best_iteration"))) + expect_lt(xgb.attr(bst, "best_iteration"), 19) + expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit")) - pred <- predict(bst, dtest, ntreelimit = bst$best_ntreelimit) + pred <- predict(bst, dtest, ntreelimit = xgb.attr(bst, "best_ntreelimit")) expect_equal(length(pred), 1611) logloss_pred <- sum(-ltest * log(pred) - (1 - ltest) * log(1 - pred)) / length(ltest) - logloss_log <- bst$evaluation_log[bst$best_iteration, test_logloss] + logloss_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_logloss] expect_equal(logloss_log, logloss_pred, tolerance = 1e-5) }) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index ff8eb1d6d821..c6503124682d 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -35,9 +35,9 @@ num_round <- 2 test_that("custom objective works", { bst <- xgb.train(param, dtrain, num_round, watchlist) expect_equal(class(bst), "xgb.Booster") - expect_false(is.null(bst$evaluation_log)) - expect_false(is.null(bst$evaluation_log$eval_error)) - expect_lt(bst$evaluation_log[num_round, eval_error], 0.03) + expect_false(is.null(attributes(bst)$evaluation_log)) + expect_false(is.null(attributes(bst)$evaluation_log$eval_error)) + expect_lt(attributes(bst)$evaluation_log[num_round, eval_error], 0.03) }) test_that("custom objective in CV works", { @@ -50,7 +50,7 @@ test_that("custom objective in CV works", { test_that("custom objective with early stop works", { bst <- xgb.train(param, dtrain, 10, watchlist) expect_equal(class(bst), "xgb.Booster") - train_log <- bst$evaluation_log$train_error + train_log <- attributes(bst)$evaluation_log$train_error expect_true(all(diff(train_log) <= 0)) }) diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R index 9e0a3551f68e..ae698d98f9db 100644 --- a/R-package/tests/testthat/test_glm.R +++ b/R-package/tests/testthat/test_glm.R @@ -24,28 +24,28 @@ test_that("gblinear works", { bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle') ypred <- predict(bst, dtest) expect_equal(length(getinfo(dtest, 'label')), 1611) - expect_lt(bst$evaluation_log$eval_error[n], ERR_UL) + expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic', callbacks = list(cb.gblinear.history())) - expect_lt(bst$evaluation_log$eval_error[n], ERR_UL) + expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) h <- xgb.gblinear.history(bst) expect_equal(dim(h), c(n, ncol(dtrain) + 1)) expect_is(h, "matrix") param$updater <- 'coord_descent' bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic') - expect_lt(bst$evaluation_log$eval_error[n], ERR_UL) + expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle') - expect_lt(bst$evaluation_log$eval_error[n], ERR_UL) + expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) bst <- xgb.train(param, dtrain, 2, watchlist, verbose = VERB, feature_selector = 'greedy') - expect_lt(bst$evaluation_log$eval_error[2], ERR_UL) + 
expect_lt(attributes(bst)$evaluation_log$eval_error[2], ERR_UL) bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'thrifty', top_k = 50, callbacks = list(cb.gblinear.history(sparse = TRUE))) - expect_lt(bst$evaluation_log$eval_error[n], ERR_UL) + expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) h <- xgb.gblinear.history(bst) expect_equal(dim(h), c(n, ncol(dtrain) + 1)) expect_s4_class(h, "dgCMatrix") @@ -72,10 +72,10 @@ test_that("gblinear early stopping works", { booster <- xgb.train( param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round ) - expect_equal(booster$best_iteration, 5) + expect_equal(xgb.attr(booster, "best_iteration"), 5) predt_es <- predict(booster, dtrain) - n <- booster$best_iteration + es_round + n <- xgb.attr(booster, "best_iteration") + es_round booster <- xgb.train( param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round ) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index de6a099fc823..4019c88fdcce 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -49,6 +49,14 @@ mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0, booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds, objective = "multi:softprob", num_class = nclass, base_score = 0) +# without feature names +bst.Tree.unnamed <- xgb.copy.Booster(bst.Tree) +.Call( + XGBoosterSetStrFeatureInfo_R, + xgb.get.handle(bst.Tree.unnamed), + "feature_name", + NULL +) test_that("xgb.dump works", { .skip_if_vcd_not_available() @@ -204,7 +212,7 @@ test_that("xgb-attribute functionality", { list.ch <- list.val[order(names(list.val))] list.ch <- lapply(list.ch, as.character) # note: iter is 0-index in xgb attributes - list.default <- list(niter = as.character(nrounds - 1)) + list.default <- list() list.ch <- c(list.ch, list.default) # proper input: expect_error(xgb.attr(bst.Tree, NULL)) @@ -212,12 +220,13 @@ test_that("xgb-attribute functionality", { # set & get: expect_null(xgb.attr(bst.Tree, "asdf")) expect_equal(xgb.attributes(bst.Tree), list.default) - xgb.attr(bst.Tree, "my_attr") <- val - expect_equal(xgb.attr(bst.Tree, "my_attr"), val) - xgb.attributes(bst.Tree) <- list.val - expect_equal(xgb.attributes(bst.Tree), list.ch) + bst.Tree.copy <- xgb.copy.Booster(bst.Tree) + xgb.attr(bst.Tree.copy, "my_attr") <- val + expect_equal(xgb.attr(bst.Tree.copy, "my_attr"), val) + xgb.attributes(bst.Tree.copy) <- list.val + expect_equal(xgb.attributes(bst.Tree.copy), list.ch) # serializing: - xgb.save(bst.Tree, 'xgb.model') + xgb.save(bst.Tree.copy, 'xgb.model') bst <- xgb.load('xgb.model') if (file.exists('xgb.model')) file.remove('xgb.model') expect_equal(xgb.attr(bst, "my_attr"), val) @@ -225,11 +234,11 @@ test_that("xgb-attribute functionality", { # deletion: xgb.attr(bst, "my_attr") <- NULL expect_null(xgb.attr(bst, "my_attr")) - expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")]) + expect_equal(xgb.attributes(bst), list.ch[c("a", "b")]) xgb.attributes(bst) <- list(a = NULL, b = NULL) expect_equal(xgb.attributes(bst), list.default) xgb.attributes(bst) <- list(niter = NULL) - expect_null(xgb.attributes(bst)) + expect_equal(xgb.attributes(bst), list()) }) if (grepl('Windows', Sys.info()[['sysname']], fixed = TRUE) || @@ -265,18 +274,12 @@ test_that("xgb.Booster serializing as R object works", { if (file.exists('xgb.model')) file.remove('xgb.model') bst <- readRDS('xgb.model.rds') 
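  # Illustrative sketch of the workflow the altrep-backed booster is meant to support
  # (serialization happens on demand through the altrep hooks such as XGBAltrepSerializer_R,
  # so plain R serializers can be used directly on the model object):
  #   saveRDS(bst.Tree, "xgb.model.rds")  # booster state is captured when R serializes it
  #   bst <- readRDS("xgb.model.rds")     # the C handle is re-created from the saved state
  #   predict(bst, dtrain)                # usable right away; no xgb.Booster.complete() step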
if (file.exists('xgb.model.rds')) file.remove('xgb.model.rds') - nil_ptr <- new("externalptr") - class(nil_ptr) <- "xgb.Booster.handle" - expect_true(identical(bst$handle, nil_ptr)) - bst <- xgb.Booster.complete(bst) - expect_true(!identical(bst$handle, nil_ptr)) - expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance) }) test_that("xgb.model.dt.tree works with and without feature names", { .skip_if_vcd_not_available() names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") - dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree) + dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree.unnamed) expect_equal(names.dt.trees, names(dt.tree)) if (!flag_32bit) expect_equal(dim(dt.tree), c(188, 10)) @@ -286,9 +289,7 @@ test_that("xgb.model.dt.tree works with and without feature names", { expect_equal(dt.tree, dt.tree.0) # when model contains no feature names: - bst.Tree.x <- bst.Tree - bst.Tree.x$feature_names <- NULL - dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x) + dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.unnamed) expect_output(str(dt.tree.x), 'Feature.*\\"3\\"') expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE]) @@ -316,9 +317,7 @@ test_that("xgb.importance works with and without feature names", { expect_equal(importance.Tree, importance.Tree.0, tolerance = float_tolerance) # when model contains no feature names: - bst.Tree.x <- bst.Tree - bst.Tree.x$feature_names <- NULL - importance.Tree.x <- xgb.importance(model = bst.Tree) + importance.Tree.x <- xgb.importance(model = bst.Tree.unnamed) expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE], tolerance = float_tolerance) @@ -334,7 +333,7 @@ test_that("xgb.importance works with and without feature names", { importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees) importance_from_dump <- function() { - model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees) + model_text_dump <- xgb.dump(model = bst.Tree.unnamed, with_stats = TRUE, trees = trees) imp <- xgb.model.dt.tree( feature_names = feature.names, text = model_text_dump, @@ -415,13 +414,13 @@ test_that("xgb.plot.importance de-duplicates features", { test_that("xgb.plot.tree works with and without feature names", { .skip_if_vcd_not_available() - expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree)) + expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree.unnamed)) expect_silent(xgb.plot.tree(model = bst.Tree)) }) test_that("xgb.plot.multi.trees works with and without feature names", { .skip_if_vcd_not_available() - xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features_keep = 3) + xgb.plot.multi.trees(model = bst.Tree.unnamed, feature_names = feature.names, features_keep = 3) xgb.plot.multi.trees(model = bst.Tree, features_keep = 3) }) diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R index ce1725dc9596..40481a3f00e2 100644 --- a/R-package/tests/testthat/test_model_compatibility.R +++ b/R-package/tests/testthat/test_model_compatibility.R @@ -23,10 +23,6 @@ get_num_tree <- function(booster) { } run_booster_check <- function(booster, name) { - # If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config(). 
- if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) { - booster <- xgb.Booster.complete(booster) - } config <- jsonlite::fromJSON(xgb.config(booster)) run_model_param_check(config) if (name == 'cls') { @@ -76,6 +72,10 @@ test_that("Models from previous versions of XGBoost can be loaded", { name <- m[3] is_rds <- endsWith(model_file, '.rds') is_json <- endsWith(model_file, '.json') + # TODO: update this test for new RDS format + if (is_rds) { + return(NULL) + } # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) { booster <- readRDS(model_file) diff --git a/R-package/tests/testthat/test_parameter_exposure.R b/R-package/tests/testthat/test_parameter_exposure.R index ea71ca7b7e39..994a1f78e479 100644 --- a/R-package/tests/testthat/test_parameter_exposure.R +++ b/R-package/tests/testthat/test_parameter_exposure.R @@ -19,12 +19,12 @@ bst <- xgboost(data = dtrain, objective = "binary:logistic") test_that("call is exposed to R", { - expect_false(is.null(bst$call)) - expect_is(bst$call, "call") + expect_false(is.null(attributes(bst)$call)) + expect_is(attributes(bst)$call, "call") }) test_that("params is exposed to R", { - model_params <- bst$params + model_params <- attributes(bst)$params expect_is(model_params, "list") expect_equal(model_params$eta, 1) expect_equal(model_params$max_depth, 2) diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R index cf8b6f007fbb..5daed61d8e35 100644 --- a/R-package/tests/testthat/test_update.R +++ b/R-package/tests/testthat/test_update.R @@ -106,7 +106,7 @@ test_that("updating works for multiclass & multitree", { # run update process for an original model with subsampling p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) - bst0u <- xgb.train(p0u, dtr, nrounds = bst0$niter, watchlist, xgb_model = bst0, verbose = 0) + bst0u <- xgb.train(p0u, dtr, nrounds = xgb.nrounds(bst0), watchlist, xgb_model = bst0, verbose = 0) tr0u <- xgb.model.dt.tree(model = bst0u) # should be the same evaluation but different gains and larger cover From 27bbdbcfa1b8f84242807e4f735f33a0fca4a66e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 13:35:59 +0100 Subject: [PATCH 02/47] export function for getting booster rounds --- R-package/NAMESPACE | 1 + R-package/R/xgb.Booster.R | 10 ++++++++++ R-package/demo/predict_leaf_indices.R | 2 +- R-package/man/xgb.get.Booster.nrounds.Rd | 21 +++++++++++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 R-package/man/xgb.get.Booster.nrounds.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index f82ea9ef427e..a5411f3f30d8 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -39,6 +39,7 @@ export(xgb.create.features) export(xgb.cv) export(xgb.dump) export(xgb.gblinear.history) +export(xgb.get.Booster.nrounds) export(xgb.get.config) export(xgb.ggplot.deepness) export(xgb.ggplot.importance) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 33a0adb594ca..0cbad09372eb 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -682,6 +682,16 @@ setinfo.xgb.Booster <- function(object, name, info, ...) { return(TRUE) } +#' @title Get number of boosting in a fitted booster +#' @param model A fitted `xgb.Booster` model. +#' @return The number of rounds saved in the model, as an integer. 
+#' @details Note that setting the booster parameters will reset the +#' number of rounds to zero. +#' @export +xgb.get.Booster.nrounds <- function(model) { + return(xgb.nrounds(model)) +} + # Extract the number of trees in a model. # internal utility function xgb.nrounds <- function(bst) { diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R index 25be31f178e5..3706eb476fa3 100644 --- a/R-package/demo/predict_leaf_indices.R +++ b/R-package/demo/predict_leaf_indices.R @@ -27,7 +27,7 @@ head(pred_with_leaf) create.new.tree.features <- function(model, original.features) { pred_with_leaf <- predict(model, original.features, predleaf = TRUE) cols <- list() - for (i in 1:xgb.attr(model, "niter")) { + for (i in 1:xgb.get.Booster.nrounds(model)) { # max is not the real max but it s not important for the purpose of adding features leaf.id <- sort(unique(pred_with_leaf[, i])) cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id) diff --git a/R-package/man/xgb.get.Booster.nrounds.Rd b/R-package/man/xgb.get.Booster.nrounds.Rd new file mode 100644 index 000000000000..e76cfd417b9b --- /dev/null +++ b/R-package/man/xgb.get.Booster.nrounds.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.Booster.R +\name{xgb.get.Booster.nrounds} +\alias{xgb.get.Booster.nrounds} +\title{Get number of boosting in a fitted booster} +\usage{ +xgb.get.Booster.nrounds(model) +} +\arguments{ +\item{model}{A fitted \code{xgb.Booster} model.} +} +\value{ +The number of rounds saved in the model, as an integer. +} +\description{ +Get number of boosting in a fitted booster +} +\details{ +Note that setting the booster parameters will reset the +number of rounds to zero. +} From 88dd9470169a445f64fb66c6b7cc528ac5f346f9 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 13:36:42 +0100 Subject: [PATCH 03/47] linter --- R-package/R/callbacks.R | 4 +++- R-package/tests/testthat/test_basic.R | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 4839f71fa799..673ef97ff047 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -441,7 +441,9 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") { if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) || (save_period == 0 && env$iteration == env$end_iteration)) { - suppressWarnings({save_name <- sprintf(save_name, env$iteration)}) + suppressWarnings({ + save_name <- sprintf(save_name, env$iteration) + }) xgb.save(env$bst, save_name) } } diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 5cae4aba9321..5fe45e6a7b8a 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -330,7 +330,7 @@ test_that("training continuation works", { } expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2)) file.remove("xgboost.json") - + set.seed(11) bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) # continue for two more, this time in-place: From 4a3b5e2d8a60449695bce52efcbbc6e67f935acc Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 13:43:06 +0100 Subject: [PATCH 04/47] fix incorrect qualifiers --- R-package/src/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/src/init.c b/R-package/src/init.c index 81cf73f6dac7..a6982f55a709 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -16,9 
+16,9 @@ Check these declarations against the C/Fortran source code. /* .Call calls */ extern void XGBInitializeAltrepClass(DllInfo *info); -static SEXP XGDuplicate_R(SEXP); -static SEXP XGDuplicateAttrib(SEXP, SEXP); -static SEXP XGPointerEqComparison(SEXP, SEXP); +extern SEXP XGDuplicate_R(SEXP); +extern SEXP XGDuplicateAttrib(SEXP, SEXP); +extern SEXP XGPointerEqComparison(SEXP, SEXP); extern SEXP XGBoosterTrainOneIter_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterCreate_R(SEXP); extern SEXP XGBoosterCopyInfoFromDMatrix(SEXP, SEXP); From 147e1cd31f35dff03ee15116bd7c104fb5ad6f67 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 14:12:38 +0100 Subject: [PATCH 05/47] remove all references to caret package --- R-package/demo/00Index | 1 - R-package/demo/README.md | 1 - R-package/demo/caret_wrapper.R | 44 ------------------- R-package/demo/runall.R | 1 - R-package/tests/helper_scripts/install_deps.R | 1 - R-package/vignettes/xgboostPresentation.Rmd | 6 +-- 6 files changed, 3 insertions(+), 51 deletions(-) delete mode 100644 R-package/demo/caret_wrapper.R diff --git a/R-package/demo/00Index b/R-package/demo/00Index index 13ffdc6b64fa..fa09fa900486 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -1,5 +1,4 @@ basic_walkthrough Basic feature walkthrough -caret_wrapper Use xgboost to train in caret library custom_objective Customize loss function, and evaluation metric boost_from_prediction Boosting from existing prediction predict_first_ntree Predicting using first n trees diff --git a/R-package/demo/README.md b/R-package/demo/README.md index 0a07a7426741..99a492230d45 100644 --- a/R-package/demo/README.md +++ b/R-package/demo/README.md @@ -1,7 +1,6 @@ XGBoost R Feature Walkthrough ==== * [Basic walkthrough of wrappers](basic_walkthrough.R) -* [Train a xgboost model from caret library](caret_wrapper.R) * [Customize loss function, and evaluation metric](custom_objective.R) * [Boosting from existing prediction](boost_from_prediction.R) * [Predicting using first n trees](predict_first_ntree.R) diff --git a/R-package/demo/caret_wrapper.R b/R-package/demo/caret_wrapper.R deleted file mode 100644 index 0e63f27ce188..000000000000 --- a/R-package/demo/caret_wrapper.R +++ /dev/null @@ -1,44 +0,0 @@ -# install development version of caret library that contains xgboost models -require(caret) -require(xgboost) -require(data.table) -require(vcd) -require(e1071) - -# Load Arthritis dataset in memory. -data(Arthritis) -# Create a copy of the dataset with data.table package -# (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent -# and its performance are really good). -df <- data.table(Arthritis, keep.rownames = FALSE) - -# Let's add some new categorical features to see if it helps. -# Of course these feature are highly correlated to the Age feature. -# Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, -# even in case of highly correlated features. -# For the first feature we create groups of age by rounding the real age. -# Note that we transform it to factor (categorical data) so the algorithm treat them as independant values. -df[, AgeDiscret := as.factor(round(Age / 10, 0))] - -# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. -# I choose this value based on nothing. 
-# We will see later if simplifying the information based on arbitrary values is a good strategy -# (I am sure you already have an idea of how well it will work!). -df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] - -# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small). -df[, ID := NULL] - -#-------------Basic Training using XGBoost in caret Library----------------- -# Set up control parameters for caret::train -# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters. -fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random") -# train a xgbTree model using caret::train -model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl) - -# Instead of tree for our boosters, you can also fit a linear regression or logistic regression model -# using xgbLinear -# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl) - -# See model results -print(model) diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R index 7a35e247b204..ab1822a5b8ad 100644 --- a/R-package/demo/runall.R +++ b/R-package/demo/runall.R @@ -9,6 +9,5 @@ demo(create_sparse_matrix, package = 'xgboost') demo(predict_leaf_indices, package = 'xgboost') demo(early_stopping, package = 'xgboost') demo(poisson_regression, package = 'xgboost') -demo(caret_wrapper, package = 'xgboost') demo(tweedie_regression, package = 'xgboost') #demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support diff --git a/R-package/tests/helper_scripts/install_deps.R b/R-package/tests/helper_scripts/install_deps.R index cf9ab00348ea..3ae44f6b13f4 100644 --- a/R-package/tests/helper_scripts/install_deps.R +++ b/R-package/tests/helper_scripts/install_deps.R @@ -3,7 +3,6 @@ ## inconsistent is found. pkgs <- c( ## CI - "caret", "pkgbuild", "roxygen2", "XML", diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index 1b015fab8e67..dfbe602a39c3 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -107,7 +107,7 @@ train <- agaricus.train test <- agaricus.test ``` -> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of the purpose of this article, however `caret` package may [help](http://topepo.github.io/caret/data-splitting.html). +> In the real world, it would be up to you to make this division between `train` and `test` data. Each variable is a `list` containing two things, `label` and `data`: @@ -349,7 +349,7 @@ bst <- xgb.train( Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset. -If with your own dataset you have not such results, you should think about how you divided your dataset in training and test. May be there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/data-splitting.html). +If with your own dataset you have not such results, you should think about how you divided your dataset in training and test. May be there is something to fix. For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics. @@ -490,7 +490,7 @@ file.remove("./xgboost.model") > result is `0`? 
We are good! -In some very specific cases, like when you want to pilot **XGBoost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it. +In some very specific cases, you will want to save the model as a *R* binary vector. See below how to do it. ```{r saveLoadRBinVectorModel, message=F, warning=F} # save model to R's raw vector From e012cce381caee83b19c8a292f4102a412de47c3 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 14:33:12 +0100 Subject: [PATCH 06/47] fix example --- R-package/R/xgb.model.dt.tree.R | 2 -- R-package/man/xgb.model.dt.tree.Rd | 2 -- 2 files changed, 4 deletions(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index c42d609b8f9b..e395d1756c2e 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -56,8 +56,6 @@ #' objective = "binary:logistic" #' ) #' -#' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) -#' #' # This bst model already has feature_names stored with it, so those would be used when #' # feature_names is not set: #' (dt <- xgb.model.dt.tree(model = bst)) diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 4d4ae7772791..f68d8d39e75b 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -79,8 +79,6 @@ bst <- xgboost( objective = "binary:logistic" ) -(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) - # This bst model already has feature_names stored with it, so those would be used when # feature_names is not set: (dt <- xgb.model.dt.tree(model = bst)) From f4448129b1515bd0ca9910cca03998305d9fd64e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 14:35:39 +0100 Subject: [PATCH 07/47] misc fixes --- R-package/R/xgb.Booster.R | 4 ++-- R-package/R/xgb.load.raw.R | 2 +- R-package/R/xgb.plot.shap.R | 2 +- R-package/R/xgb.train.R | 4 ++-- R-package/man/coef.xgb.Booster.Rd | 2 +- R-package/man/xgb.attr.Rd | 2 +- R-package/man/xgb.load.raw.Rd | 2 +- R-package/man/xgb.train.Rd | 4 ++-- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 0cbad09372eb..58a7e4d3efff 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -442,7 +442,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA #' Important: since this modifies the booster's C object, semantics for assignment here #' will differ from R's, as any object reference to the same booster will be modified #' too, while assignment of R attributes through `attributes(model)$ <- ` -#' will follow the usual copy-on-write R semantics (see \link{xgb.xopy.Booster} for an +#' will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an #' example of these behaviors). #' #' @return @@ -757,7 +757,7 @@ xgb.best_iteration <- function(bst) { #' as first column. #' \item If there's more than one coefficient per column in the data (e.g. when using #' `objective="multi:softmax"`), will be returned as a matrix with dimensions equal -#' to [num_features, num_cols], with the intercepts as first row. Note that the column +#' to `[num_features, num_cols]`, with the intercepts as first row. Note that the column #' (classes in multi-class classification) dimension will not be named. 
#' } #' diff --git a/R-package/R/xgb.load.raw.R b/R-package/R/xgb.load.raw.R index a19febae55d2..73ac50dc6662 100644 --- a/R-package/R/xgb.load.raw.R +++ b/R-package/R/xgb.load.raw.R @@ -4,7 +4,7 @@ #' #' @param buffer the buffer returned by xgb.save.raw #' @export -xgb.load.raw <- function(buffer, as_booster = FALSE) { +xgb.load.raw <- function(buffer) { cachelist <- list() bst <- .Call(XGBoosterCreate_R, cachelist) .Call(XGBoosterLoadModelFromRaw_R, xgb.get.handle(bst), buffer) diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 8e26d7feab4c..c19e5e63e383 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -307,7 +307,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, if (is.null(features) && !is.null(model)) { model_feature_names <- xgb.feature_names(model) } - if (is.null(model_feature_names) && .Call(model, XGBoosterGetNumFeature_R) != ncol(data)) + if (is.null(model_feature_names) && .Call(XGBoosterGetNumFeature_R, model) != ncol(data)) stop("if model has no feature_names, columns in `data` must match features in model") if (!is.null(subsample)) { diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index df3c79c025df..aeb9f3e0bd4b 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -236,10 +236,10 @@ #' not anyhow used by functions like \link{predict.xgb.Booster}. #' #' If passing `keep_extra_attributes=TRUE`, note that the parameters passed here will be kept -#' in the R-specific attributes, but since functions like \link{xgb.parameters} allow changing +#' in the R-specific attributes, but since functions like \link{xgb.parameters<-} allow changing #' parameters in the C-level object after it has been fitted, be aware that there's no guarantee #' that these R parameters would be synchronized with the internal booster parameters as -#' return by \link{xgb.parameters} or \link{xgb.config}. +#' return by \link{xgb.parameters<-} or \link{xgb.config}. #' #' @seealso #' \code{\link{callbacks}}, diff --git a/R-package/man/coef.xgb.Booster.Rd b/R-package/man/coef.xgb.Booster.Rd index 3924987f0142..7318077bbb0f 100644 --- a/R-package/man/coef.xgb.Booster.Rd +++ b/R-package/man/coef.xgb.Booster.Rd @@ -18,7 +18,7 @@ vector, potentially containing the feature names if available, with the intercep as first column. \item If there's more than one coefficient per column in the data (e.g. when using \code{objective="multi:softmax"}), will be returned as a matrix with dimensions equal -to \link{num_features, num_cols}, with the intercepts as first row. Note that the column +to \verb{[num_features, num_cols]}, with the intercepts as first row. Note that the column (classes in multi-class classification) dimension will not be named. } diff --git a/R-package/man/xgb.attr.Rd b/R-package/man/xgb.attr.Rd index e84dc749d4bc..6bd69a680b0b 100644 --- a/R-package/man/xgb.attr.Rd +++ b/R-package/man/xgb.attr.Rd @@ -57,7 +57,7 @@ but it doesn't delete the other existing attributes. Important: since this modifies the booster's C object, semantics for assignment here will differ from R's, as any object reference to the same booster will be modified too, while assignment of R attributes through \verb{attributes(model)$ <- } -will follow the usual copy-on-write R semantics (see \link{xgb.xopy.Booster} for an +will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an example of these behaviors). 
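For illustration, a small sketch of that difference (hypothetical session; the attribute
names here are made up):

  model2 <- model                     # plain assignment: both names refer to the same C booster
  xgb.attr(model2, "my_attr") <- "1"  # C-level attribute: also visible through 'model'
  attributes(model2)$note <- "x"      # R-level attribute: copy-on-write, 'model' is unchanged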
} \examples{ diff --git a/R-package/man/xgb.load.raw.Rd b/R-package/man/xgb.load.raw.Rd index d1da607375bd..f0248cd9e002 100644 --- a/R-package/man/xgb.load.raw.Rd +++ b/R-package/man/xgb.load.raw.Rd @@ -4,7 +4,7 @@ \alias{xgb.load.raw} \title{Load serialised xgboost model from R's raw vector} \usage{ -xgb.load.raw(buffer, as_booster = FALSE) +xgb.load.raw(buffer) } \arguments{ \item{buffer}{the buffer returned by xgb.save.raw} diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index f768ad5f0261..4ebc27627997 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -305,10 +305,10 @@ only used in the R interface, only kept when using R's serializers like \link{sa not anyhow used by functions like \link{predict.xgb.Booster}. If passing \code{keep_extra_attributes=TRUE}, note that the parameters passed here will be kept -in the R-specific attributes, but since functions like \link{xgb.parameters} allow changing +in the R-specific attributes, but since functions like \link{xgb.parameters<-} allow changing parameters in the C-level object after it has been fitted, be aware that there's no guarantee that these R parameters would be synchronized with the internal booster parameters as -return by \link{xgb.parameters} or \link{xgb.config}. +return by \link{xgb.parameters<-} or \link{xgb.config}. } \examples{ data(agaricus.train, package='xgboost') From 2f30031ebfe09cf39ff43b677fd9cd2ba0fa3f48 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 14:39:54 +0100 Subject: [PATCH 08/47] allow unsetting booster info --- R-package/R/xgb.Booster.R | 4 +--- R-package/tests/testthat/test_helpers.R | 7 +------ 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 58a7e4d3efff..a87229f4161e 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -669,9 +669,7 @@ setinfo.xgb.Booster <- function(object, name, info, ...) 
{ if (!(name %in% allowed_fields)) { stop("setinfo: unknown info name ", name) } - if (!is.character(info)) { - stop("'info' must be a character vector.") - } + info <- as.character(info) handle <- xgb.get.handle(object) .Call( XGBoosterSetStrFeatureInfo_R, diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 4019c88fdcce..ed43e0fe2d55 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -51,12 +51,7 @@ mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0, # without feature names bst.Tree.unnamed <- xgb.copy.Booster(bst.Tree) -.Call( - XGBoosterSetStrFeatureInfo_R, - xgb.get.handle(bst.Tree.unnamed), - "feature_name", - NULL -) +setinfo(bst.Tree.unnamed, "feature_name", NULL) test_that("xgb.dump works", { .skip_if_vcd_not_available() From 6d4ad8b23808ed4bfbc59c5c4c243c536946b43e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 14:49:41 +0100 Subject: [PATCH 09/47] remove unused argument --- R-package/tests/testthat/test_io.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/tests/testthat/test_io.R b/R-package/tests/testthat/test_io.R index 8cf5a9ae97c2..e89cab657c04 100644 --- a/R-package/tests/testthat/test_io.R +++ b/R-package/tests/testthat/test_io.R @@ -17,8 +17,8 @@ test_that("load/save raw works", { ubj_bytes <- xgb.save.raw(booster, raw_format = "ubj") old_bytes <- xgb.save.raw(booster, raw_format = "deprecated") - from_json <- xgb.load.raw(json_bytes, as_booster = TRUE) - from_ubj <- xgb.load.raw(ubj_bytes, as_booster = TRUE) + from_json <- xgb.load.raw(json_bytes) + from_ubj <- xgb.load.raw(ubj_bytes) json2old <- xgb.save.raw(from_json, raw_format = "deprecated") ubj2old <- xgb.save.raw(from_ubj, raw_format = "deprecated") From b0054be9994280fb64833ef3302285117b7a8f3f Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 14:56:42 +0100 Subject: [PATCH 10/47] more fixes --- R-package/R/callbacks.R | 2 +- R-package/R/xgb.plot.shap.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 673ef97ff047..16e781124f1f 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -732,7 +732,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) { if (!is_cv) { num_class <- xgb.num_class(model) - num_feat <- .Call(XGBoosterGetNumFeature_R, xgb.get.handle(model)) + num_feat <- xgb.num_feature(model) } else { # in case of CV, the object is expected to have this info if (model$params$booster != "gblinear") diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index c19e5e63e383..788a095399ed 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -307,7 +307,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, if (is.null(features) && !is.null(model)) { model_feature_names <- xgb.feature_names(model) } - if (is.null(model_feature_names) && .Call(XGBoosterGetNumFeature_R, model) != ncol(data)) + if (is.null(model_feature_names) && xgb.num_feature(model) != ncol(data)) stop("if model has no feature_names, columns in `data` must match features in model") if (!is.null(subsample)) { From 4050b6fb0a95fdb8fa19aa3c3e4ad9f93a328b01 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 26 Dec 2023 15:50:17 +0100 Subject: [PATCH 11/47] missing import --- R-package/NAMESPACE | 1 + R-package/R/xgboost.R | 1 + 2 files changed, 2 insertions(+) diff --git 
a/R-package/NAMESPACE b/R-package/NAMESPACE index a5411f3f30d8..4af9325e1ff6 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -87,6 +87,7 @@ importFrom(graphics,points) importFrom(graphics,title) importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) +importFrom(stats,coef) importFrom(stats,median) importFrom(stats,predict) importFrom(utils,head) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index f61c535e228f..4b4d603da070 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -99,6 +99,7 @@ NULL #' @importFrom jsonlite fromJSON #' @importFrom jsonlite toJSON #' @importFrom utils object.size str tail +#' @importFrom stats coef #' @importFrom stats predict #' @importFrom stats median #' @importFrom utils head From 2e16f73f7999e89b3c55f38d74c0eaa0f6e7d9de Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 27 Dec 2023 19:12:54 +0100 Subject: [PATCH 12/47] swap 'static' with 'namespace' --- R-package/src/xgboost_R.cc | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 98e958bd3c57..71fe32d9524f 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -189,13 +189,15 @@ XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) { return Rf_ScalarLogical(R_ExternalPtrAddr(handle) == nullptr); } -XGB_DLL void _DMatrixFinalizer(SEXP ext) { +namespace { +void _DMatrixFinalizer(SEXP ext) { R_API_BEGIN(); if (R_ExternalPtrAddr(ext) == NULL) return; CHECK_CALL(XGDMatrixFree(R_ExternalPtrAddr(ext))); R_ClearExternalPtr(ext); R_API_END(); } +} /* namespace */ XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str) { R_API_BEGIN(); @@ -560,7 +562,8 @@ XGB_DLL SEXP XGPointerEqComparison(SEXP obj1, SEXP obj2) { } // functions related to booster -static void _BoosterFinalizer(SEXP R_ptr) { +namespace { +void _BoosterFinalizer(SEXP R_ptr) { if (R_ExternalPtrAddr(R_ptr) == NULL) return; CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(R_ptr))); R_ClearExternalPtr(R_ptr); @@ -569,17 +572,17 @@ static void _BoosterFinalizer(SEXP R_ptr) { /* Booster is represented as an altrep list with one element which corresponds to an 'externalptr' holding the C object, forbidding modification by not implementing setters, and adding custom serialization. */ -static R_altrep_class_t XGBAltrepPointerClass; +R_altrep_class_t XGBAltrepPointerClass; -static R_xlen_t XGBAltrepPointerLength_R(SEXP R_altrepped_obj) { +R_xlen_t XGBAltrepPointerLength_R(SEXP R_altrepped_obj) { return 1; } -static SEXP XGBAltrepPointerGetElt_R(SEXP R_altrepped_obj, R_xlen_t idx) { +SEXP XGBAltrepPointerGetElt_R(SEXP R_altrepped_obj, R_xlen_t idx) { return R_altrep_data1(R_altrepped_obj); } -static SEXP XGBMakeEmptyAltrep() { +SEXP XGBMakeEmptyAltrep() { SEXP class_name = Rf_protect(Rf_mkString("xgb.Booster")); SEXP elt_names = Rf_protect(Rf_mkString("ptr")); SEXP R_ptr = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); @@ -592,15 +595,15 @@ static SEXP XGBMakeEmptyAltrep() { /* Note: the idea for separating this function from the one above is to be able to trigger all R allocations first before doing non-R allocations. 
*/
-static void XGBAltrepSetPointer(SEXP R_altrepped_obj, BoosterHandle handle) {
+void XGBAltrepSetPointer(SEXP R_altrepped_obj, BoosterHandle handle) {
   SEXP R_ptr = R_altrep_data1(R_altrepped_obj);
   R_SetExternalPtrAddr(R_ptr, handle);
   R_RegisterCFinalizerEx(R_ptr, _BoosterFinalizer, TRUE);
 }
-const char *ubj_json_format_str = "{\"format\": \"ubj\"}";
+char *ubj_json_format_str = "{\"format\": \"ubj\"}";
-static SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) {
+SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) {
   R_API_BEGIN();
   BoosterHandle handle = R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj));
   char const *serialized_bytes;
@@ -617,7 +620,7 @@ static SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) {
   return R_NilValue; /* <- should not be reached */
 }
-static SEXP XGBAltrepDeserializer_R(SEXP unused, SEXP R_state) {
+SEXP XGBAltrepDeserializer_R(SEXP unused, SEXP R_state) {
   SEXP R_altrepped_obj = Rf_protect(XGBMakeEmptyAltrep());
   R_API_BEGIN();
   BoosterHandle handle = nullptr;
@@ -636,7 +639,7 @@ static SEXP XGBAltrepDeserializer_R(SEXP unused, SEXP R_state) {
 }
 // https://purrple.cat/blog/2018/10/14/altrep-and-cpp/
-static Rboolean XGBAltrepInspector_R(
+Rboolean XGBAltrepInspector_R(
     SEXP x, int pre, int deep, int pvec,
     void (*inspect_subtree)(SEXP, int, int, int)) {
   Rprintf("Altrepped external pointer [address:%p]\n",
@@ -644,7 +647,7 @@ static Rboolean XGBAltrepInspector_R(
   return TRUE;
 }
-static SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) {
+SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) {
   R_API_BEGIN();
   if (!deep) {
     SEXP out = Rf_protect(XGBMakeEmptyAltrep());
@@ -675,6 +678,8 @@ static SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) {
   return R_NilValue; /* <- should not be reached */
 }
+} /* namespace */
+
 XGB_DLL void XGBInitializeAltrepClass(DllInfo *dll) {
   XGBAltrepPointerClass = R_make_altlist_class("XGBAltrepPointerClass", "xgboost", dll);
   R_set_altrep_Length_method(XGBAltrepPointerClass, XGBAltrepPointerLength_R);

From 70affd580a555856188fe296ae307329810f433f Mon Sep 17 00:00:00 2001
From: david-cortes
Date: Wed, 27 Dec 2023 19:19:05 +0100
Subject: [PATCH 13/47] improve wording on compatibility note

---
 R-package/R/utils.R                                    | 8 ++++----
 R-package/man/a-compatibility-note-for-saveRDS-save.Rd | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index e0683b84555a..75cc3a75e2b5 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -360,10 +360,10 @@ NULL
 #' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
 #' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
 #'
-#' Currently, it is not possible to use R serializers like `readRDS` to load an XGBoost. model
-#' saved with an XGBoost. version lower than 2.1.0, and it's not possible to load an XGBoost. model
-#' saved with R serializers like `readRDS` under XGBoost. version 2.1.0 when using an older version
-#' of XGBoost.
+#' Note that XGBoost models in R starting from version `2.1.0` and onwards, and XGBoost models
+#' before version `2.1.0`, have a very different R object structure and are incompatible with
+#' each other. Hence, models that were saved with R serializers like `saveRDS` or `save` before
+#' version `2.1.0` will not work with later `xgboost` versions and vice versa.
#'
 #' Furthermore, note that using the package `qs` for serialization will require version 0.26 or
 #' higher of said package, and will have the same compatibility restrictions as R serializers.
diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd
index 9b1bddeba4e3..705f504af57a 100644
--- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd
+++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd
@@ -14,10 +14,10 @@ accessible in later releases of XGBoost. To ensure that your model can be access
 releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
 }
 \details{
-Currently, it is not possible to use R serializers like \code{readRDS} to load an XGBoost. model
-saved with an XGBoost. version lower than 2.1.0, and it's not possible to load an XGBoost. model
-saved with R serializers like \code{readRDS} under XGBoost. version 2.1.0 when using an older version
-of XGBoost.
+Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and XGBoost models
+before version \verb{2.1.0}, have a very different R object structure and are incompatible with
+each other. Hence, models that were saved with R serializers like \code{saveRDS} or \code{save} before
+version \verb{2.1.0} will not work with later \code{xgboost} versions and vice versa.
 
 Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or
 higher of said package, and will have the same compatibility restrictions as R serializers.

From af6cdbff8126ccc15c4127c2850bff99deab15f1 Mon Sep 17 00:00:00 2001
From: david-cortes
Date: Thu, 28 Dec 2023 22:53:55 +0100
Subject: [PATCH 14/47] fix non-executed tests and potentially incorrect 'niter_init'

---
 R-package/R/xgb.Booster.R               | 19 +++++++++-----
 R-package/R/xgb.cv.R                    |  1 +
 R-package/R/xgb.load.R                  |  1 +
 R-package/R/xgb.train.R                 | 17 ++-----------
 R-package/tests/testthat/test_ranking.R |  8 +++---
 R-package/tests/testthat/test_update.R  | 33 +++++++++++++++++++------
 6 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index a87229f4161e..c215d5df7630 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -1,5 +1,8 @@
-# Construct an internal xgboost Booster.
+# Construct an internal xgboost Booster and get its current number of rounds.
# internal utility function +# Note: the number of rounds in the C booster gets reset to zero when updating +# the parameters through 'xgb.parameters', hence the need to return it from +# this function when using it for training continuation xgb.Booster <- function(params, cachelist, modelfile, training_continuation) { if (typeof(cachelist) != "list" || !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) { @@ -12,24 +15,28 @@ xgb.Booster <- function(params, cachelist, modelfile, training_continuation) { bst <- .Call(XGBoosterCreate_R, cachelist) modelfile <- path.expand(modelfile) .Call(XGBoosterLoadModel_R, xgb.get.handle(bst), enc2utf8(modelfile[1])) + niter <- xgb.nrounds(bst) if (length(params) > 0) { xgb.parameters(bst) <- params } - return(bst) + return(list(bst = bst, niter = niter)) } else if (is.raw(modelfile)) { ## A memory buffer bst <- xgb.unserialize(modelfile) + niter <- xgb.nrounds(bst) xgb.parameters(bst) <- params - return(bst) + return(list(bst = bst, niter = niter)) } else if (inherits(modelfile, "xgb.Booster")) { ## A booster object if (training_continuation == "copy") { bst <- .Call(XGDuplicate_R, modelfile) + niter <- xgb.nrounds(bst) xgb.parameters(bst) <- params - return(bst) + return(list(bst = bst, niter = niter)) } else { + niter <- xgb.nrounds(modelfile) xgb.parameters(modelfile) <- params - return(modelfile) + return(list(bst = modelfile, niter = niter)) } } else { stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object") @@ -40,7 +47,7 @@ xgb.Booster <- function(params, cachelist, modelfile, training_continuation) { if (length(params) > 0) { xgb.parameters(bst) <- params } - return(bst) + return(list(bst = bst, niter = 0L)) } # Check whether xgb.Booster handle is null diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index f1edf184d1d9..1b13aff26595 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -210,6 +210,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing modelfile = NULL, training_continuation = "copy" # <- not used ) + bst <- bst$bst list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]]) }) rm(dall) diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index ba446e50a483..b1f5e8c48260 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -50,6 +50,7 @@ xgb.load <- function(modelfile) { modelfile = modelfile, training_continuation = "copy" ) + bst <- bst$bst # re-use modelfile if it is raw so we do not need to serialize if (typeof(modelfile) == "raw") { warning( diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index aeb9f3e0bd4b..7d91d00d377b 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -404,13 +404,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), past_evaluation_log <- attributes(xgb_model)$evaluation_log } - niter_init <- 0 - if (inherits(xgb_model, "xgb.Booster")) { - # Note: when assigning 'xgb.params', the number of rounds in the object - # gets reset to zero, hence this piece of code. 
- niter_init <- xgb.nrounds(xgb_model) - } - # Construct a booster (either a new one or load from xgb_model) bst <- xgb.Booster( params = params, @@ -418,6 +411,8 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), modelfile = xgb_model, training_continuation = training_continuation ) + niter_init <- bst$niter + bst <- bst$bst # extract parameters that can affect the relationship b/w #trees and #iterations # Note: it might look like these aren't used, but they need to be defined in this @@ -425,14 +420,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) - # When the 'xgb_model' was set, find out how many boosting iterations it has - # TODO: improve this kind of logic by leveraging C-level attributes - if (!is.null(xgb_model) && !inherits(xgb_model, "xgb.Booster")) { - niter_init <- xgb.nrounds(bst) - if (length(niter_init) == 0) { - niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class) - } - } if (is_update && nrounds > niter_init) stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)") diff --git a/R-package/tests/testthat/test_ranking.R b/R-package/tests/testthat/test_ranking.R index d4102dfce343..277c8f288e34 100644 --- a/R-package/tests/testthat/test_ranking.R +++ b/R-package/tests/testthat/test_ranking.R @@ -17,8 +17,8 @@ test_that('Test ranking with unweighted data', { eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads) bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain)) # Check if the metric is monotone increasing - expect_true(all(diff(bst$evaluation_log$train_auc) >= 0)) - expect_true(all(diff(bst$evaluation_log$train_aucpr) >= 0)) + expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0)) + expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0)) }) test_that('Test ranking with weighted data', { @@ -41,8 +41,8 @@ test_that('Test ranking with weighted data', { ) bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain)) # Check if the metric is monotone increasing - expect_true(all(diff(bst$evaluation_log$train_auc) >= 0)) - expect_true(all(diff(bst$evaluation_log$train_aucpr) >= 0)) + expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0)) + expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0)) for (i in 1:10) { pred <- predict(bst, newdata = dtrain, ntreelimit = i) # is_sorted[i]: is i-th group correctly sorted by the ranking predictor? 
diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R index 5daed61d8e35..3d58232c9b31 100644 --- a/R-package/tests/testthat/test_update.R +++ b/R-package/tests/testthat/test_update.R @@ -40,7 +40,12 @@ test_that("updating the model works", { bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0) tr1r <- xgb.model.dt.tree(model = bst1r) # all should be the same when no subsampling - expect_equal(bst1$evaluation_log, bst1r$evaluation_log) + expect_equal(attributes(bst1)$evaluation_log, attributes(bst1r)$evaluation_log) + expect_equal( + jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1, raw_format = "json"))), + jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1r, raw_format = "json"))), + tolerance = 1e-6 + ) if (!win32_flag) { expect_equal(tr1, tr1r, tolerance = 0.00001, check.attributes = FALSE) } @@ -51,7 +56,7 @@ test_that("updating the model works", { bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0) tr2r <- xgb.model.dt.tree(model = bst2r) # should be the same evaluation but different gains and larger cover - expect_equal(bst2$evaluation_log, bst2r$evaluation_log) + expect_equal(attributes(bst2)$evaluation_log, attributes(bst2r)$evaluation_log) if (!win32_flag) { expect_equal(tr2[Feature == 'Leaf']$Quality, tr2r[Feature == 'Leaf']$Quality) } @@ -59,11 +64,25 @@ test_that("updating the model works", { expect_gt(sum(tr2r$Cover) / sum(tr2$Cover), 1.5) # process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data: + set.seed(123) p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE)) bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1) tr1u <- xgb.model.dt.tree(model = bst1u) # all should be the same when no subsampling - expect_equal(bst1$evaluation_log, bst1u$evaluation_log) + expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log) + expect_equal( + jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1, raw_format = "json"))), + jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1u, raw_format = "json"))), + tolerance = 1e-6 + ) + expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE) + + # same thing but with a serialized model + set.seed(123) + bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1)) + tr1u <- xgb.model.dt.tree(model = bst1u) + # all should be the same when no subsampling + expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log) expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE) # process type 'update' for model with subsampling, refreshing only the tree stats from training data: @@ -71,12 +90,12 @@ test_that("updating the model works", { bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2) tr2u <- xgb.model.dt.tree(model = bst2u) # should be the same evaluation but different gains and larger cover - expect_equal(bst2$evaluation_log, bst2u$evaluation_log) + expect_equal(attributes(bst2)$evaluation_log, attributes(bst2u)$evaluation_log) expect_equal(tr2[Feature == 'Leaf']$Quality, tr2u[Feature == 'Leaf']$Quality) expect_gt(sum(abs(tr2[Feature != 'Leaf']$Quality - tr2u[Feature != 'Leaf']$Quality)), 100) expect_gt(sum(tr2u$Cover) / sum(tr2$Cover), 1.5) # the results should be the same as for the model with an extra 'refresh' updater - expect_equal(bst2r$evaluation_log, bst2u$evaluation_log) + expect_equal(attributes(bst2r)$evaluation_log, 
attributes(bst2u)$evaluation_log) if (!win32_flag) { expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE) } @@ -86,7 +105,7 @@ test_that("updating the model works", { bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1) tr1ut <- xgb.model.dt.tree(model = bst1ut) # should be the same evaluations but different gains and smaller cover (test data is smaller) - expect_equal(bst1$evaluation_log, bst1ut$evaluation_log) + expect_equal(attributes(bst1)$evaluation_log, attributes(bst1ut)$evaluation_log) expect_equal(tr1[Feature == 'Leaf']$Quality, tr1ut[Feature == 'Leaf']$Quality) expect_gt(sum(abs(tr1[Feature != 'Leaf']$Quality - tr1ut[Feature != 'Leaf']$Quality)), 100) expect_lt(sum(tr1ut$Cover) / sum(tr1$Cover), 0.5) @@ -110,7 +129,7 @@ test_that("updating works for multiclass & multitree", { tr0u <- xgb.model.dt.tree(model = bst0u) # should be the same evaluation but different gains and larger cover - expect_equal(bst0$evaluation_log, bst0u$evaluation_log) + expect_equal(attributes(bst0)$evaluation_log, attributes(bst0u)$evaluation_log) expect_equal(tr0[Feature == 'Leaf']$Quality, tr0u[Feature == 'Leaf']$Quality) expect_gt(sum(abs(tr0[Feature != 'Leaf']$Quality - tr0u[Feature != 'Leaf']$Quality)), 100) expect_gt(sum(tr0u$Cover) / sum(tr0$Cover), 1.5) From 22d4dd73fc5fff59654cdd740edf8d605c4cf511 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Fri, 29 Dec 2023 09:01:51 +0100 Subject: [PATCH 15/47] linter --- R-package/R/xgb.train.R | 4 ++-- R-package/src/xgboost_R.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 7d91d00d377b..9ea4145efe27 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -417,8 +417,8 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), # extract parameters that can affect the relationship b/w #trees and #iterations # Note: it might look like these aren't used, but they need to be defined in this # environment for the callbacks for work correctly. 
- num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) - num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) + num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint + num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint if (is_update && nrounds > niter_init) stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)") diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 71fe32d9524f..f45dd463bc1a 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -601,7 +601,7 @@ void XGBAltrepSetPointer(SEXP R_altrepped_obj, BoosterHandle handle) { R_RegisterCFinalizerEx(R_ptr, _BoosterFinalizer, TRUE); } -char *ubj_json_format_str = "{\"format\": \"ubj\"}"; +const char *ubj_json_format_str = "{\"format\": \"ubj\"}"; SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) { R_API_BEGIN(); From 74d5d55ae5476769ad421b72983679cdd5764422 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sat, 30 Dec 2023 09:42:18 +0100 Subject: [PATCH 16/47] more doc specificity about nrounds reset --- R-package/R/xgb.Booster.R | 7 ++++--- R-package/man/xgb.get.Booster.nrounds.Rd | 3 ++- R-package/man/xgb.parameters.Rd | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index c215d5df7630..b05c2706b2ae 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -603,8 +603,8 @@ xgb.config <- function(object) { #' #' See \link{xgb.copy.Booster} for an example of this behavior. #' -#' Be aware that setting the parameters of a fitted booster will reset its number of rounds -#' indicator to zero. +#' Be aware that setting parameters of a fitted booster related to training continuation / updates +#' will reset its number of rounds indicator to zero. #' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place}. #' @param value A list (or an object coercible to a list) with the names of parameters to set #' and the elements corresponding to parameter values. @@ -690,7 +690,8 @@ setinfo.xgb.Booster <- function(object, name, info, ...) { #' @title Get number of boosting in a fitted booster #' @param model A fitted `xgb.Booster` model. #' @return The number of rounds saved in the model, as an integer. -#' @details Note that setting the booster parameters will reset the +#' @details Note that setting booster parameters related to training +#' continuation / updates through \link{xgb.params<-} will reset the #' number of rounds to zero. #' @export xgb.get.Booster.nrounds <- function(model) { diff --git a/R-package/man/xgb.get.Booster.nrounds.Rd b/R-package/man/xgb.get.Booster.nrounds.Rd index e76cfd417b9b..11563d784396 100644 --- a/R-package/man/xgb.get.Booster.nrounds.Rd +++ b/R-package/man/xgb.get.Booster.nrounds.Rd @@ -16,6 +16,7 @@ The number of rounds saved in the model, as an integer. Get number of boosting in a fitted booster } \details{ -Note that setting the booster parameters will reset the +Note that setting booster parameters related to training +continuation / updates through \link{xgb.params<-} will reset the number of rounds to zero. } diff --git a/R-package/man/xgb.parameters.Rd b/R-package/man/xgb.parameters.Rd index 682e23ef5be8..8d5044cab5cc 100644 --- a/R-package/man/xgb.parameters.Rd +++ b/R-package/man/xgb.parameters.Rd @@ -26,8 +26,8 @@ attributes which follow copy-on-write semantics. See \link{xgb.copy.Booster} for an example of this behavior. 
-Be aware that setting the parameters of a fitted booster will reset its number of rounds -indicator to zero. +Be aware that setting parameters of a fitted booster related to training continuation / updates +will reset its number of rounds indicator to zero. } \examples{ data(agaricus.train, package = "xgboost") From 1bb74d8f3adcd6dd3e1f50ea867619c8c5c6a17a Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sat, 30 Dec 2023 10:01:59 +0100 Subject: [PATCH 17/47] correct function name --- R-package/R/xgb.Booster.R | 2 +- R-package/man/xgb.get.Booster.nrounds.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index b05c2706b2ae..a5c7ed6eb28a 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -691,7 +691,7 @@ setinfo.xgb.Booster <- function(object, name, info, ...) { #' @param model A fitted `xgb.Booster` model. #' @return The number of rounds saved in the model, as an integer. #' @details Note that setting booster parameters related to training -#' continuation / updates through \link{xgb.params<-} will reset the +#' continuation / updates through \link{xgb.parameters<-} will reset the #' number of rounds to zero. #' @export xgb.get.Booster.nrounds <- function(model) { diff --git a/R-package/man/xgb.get.Booster.nrounds.Rd b/R-package/man/xgb.get.Booster.nrounds.Rd index 11563d784396..c39b5e6d6d29 100644 --- a/R-package/man/xgb.get.Booster.nrounds.Rd +++ b/R-package/man/xgb.get.Booster.nrounds.Rd @@ -17,6 +17,6 @@ Get number of boosting in a fitted booster } \details{ Note that setting booster parameters related to training -continuation / updates through \link{xgb.params<-} will reset the +continuation / updates through \link{xgb.parameters<-} will reset the number of rounds to zero. } From b5ec14e089b7ba82af49123076d56cab727d4102 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 31 Dec 2023 14:46:03 +0100 Subject: [PATCH 18/47] corrections after merge conflicts --- R-package/R/xgb.Booster.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index a5c7ed6eb28a..efc9368ff98f 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -650,7 +650,7 @@ xgb.config <- function(object) { #' @rdname getinfo #' @export -getinfo.xgb.Booster <- function(object, name, ...) { +getinfo.xgb.Booster <- function(object, name) { name <- as.character(head(name, 1L)) allowed_fields <- c("feature_name", "feature_type") if (!(name %in% allowed_fields)) { @@ -670,7 +670,7 @@ getinfo.xgb.Booster <- function(object, name, ...) { #' @rdname getinfo #' @export -setinfo.xgb.Booster <- function(object, name, info, ...) 
{ +setinfo.xgb.Booster <- function(object, name, info) { name <- as.character(head(name, 1L)) allowed_fields <- c("feature_name", "feature_type") if (!(name %in% allowed_fields)) { From ae0de6d99a2951cf1e867b6af8d80a8c11ae85de Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 31 Dec 2023 16:36:19 +0100 Subject: [PATCH 19/47] more corrections after merge conflict --- R-package/R/xgb.DMatrix.R | 1 - R-package/man/getinfo.Rd | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index f15eba4493ac..25170a3ba6f9 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -447,7 +447,6 @@ getinfo.xgb.DMatrix <- function(object, name) { #' setinfo(dtrain, 'label', 1-labels) #' labels2 <- getinfo(dtrain, 'label') #' stopifnot(all.equal(labels2, 1-labels)) -#' @rdname setinfo #' @export setinfo <- function(object, name, info) UseMethod("setinfo") diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index bc7b03c233ea..5ada11f2db42 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -9,9 +9,9 @@ \alias{setinfo.xgb.DMatrix} \title{Get or set information of xgb.DMatrix and xgb.Booster objects} \usage{ -\method{getinfo}{xgb.Booster}(object, name, ...) +\method{getinfo}{xgb.Booster}(object, name) -\method{setinfo}{xgb.Booster}(object, name, info, ...) +\method{setinfo}{xgb.Booster}(object, name, info) getinfo(object, name) From 041dd2f3f36604c5ba0e79b164083d4b7a1b1527 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 18:06:51 +0100 Subject: [PATCH 20/47] updates for new default serialization format --- R-package/R/callbacks.R | 12 ++++++---- R-package/R/xgb.load.R | 6 ++--- R-package/R/xgb.save.R | 28 ++++++++++++++++------- R-package/R/xgb.train.R | 11 ++++----- R-package/man/cb.save.model.Rd | 17 ++++++++++---- R-package/man/xgb.load.Rd | 6 ++--- R-package/man/xgb.save.Rd | 28 ++++++++++++++++------- R-package/tests/testthat/test_callbacks.R | 18 ++------------- 8 files changed, 73 insertions(+), 53 deletions(-) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 33df896cf423..2cd5934e6950 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -412,11 +412,15 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE, #' @param save_period save the model to disk after every #' \code{save_period} iterations; 0 means save the model at the end. #' @param save_name the name or path for the saved model file. +#' +#' Note that the format of the model being saved is determined by the file +#' extension specified here (see \link{xgb.save} for details about how it works). +#' #' It can contain a \code{\link[base]{sprintf}} formatting specifier #' to include the integer iteration number in the file name. -#' E.g., with \code{save_name} = 'xgboost_%04d.model', -#' the file saved at iteration 50 would be named "xgboost_0050.model". -#' +#' E.g., with \code{save_name} = 'xgboost_%04d.ubj', +#' the file saved at iteration 50 would be named "xgboost_0050.ubj". +#' @seealso \link{xgb.save} #' @details #' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end. 
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst},
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration}.
#'
#' @seealso
#' \code{\link{callbacks}}
#'
#' @export
-cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
+cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
   if (save_period < 0)
     stop("'save_period' cannot be negative")
diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R
index bfd0299cc33f..47b39e78801d 100644
--- a/R-package/R/xgb.load.R
+++ b/R-package/R/xgb.load.R
@@ -38,9 +38,9 @@
 #' objective = "binary:logistic"
 #' )
 #'
-#' xgb.save(bst, 'xgb.model')
-#' bst <- xgb.load('xgb.model')
-#' if (file.exists('xgb.model')) file.remove('xgb.model')
+#' xgb.save(bst, 'xgb.ubj')
+#' bst <- xgb.load('xgb.ubj')
+#' if (file.exists('xgb.ubj')) file.remove('xgb.ubj')
 #' @export
 xgb.load <- function(modelfile) {
   if (is.null(modelfile))
diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R
index 4394bfca32ea..46e8229dced2 100644
--- a/R-package/R/xgb.save.R
+++ b/R-package/R/xgb.save.R
@@ -1,12 +1,24 @@
 #' Save xgboost model to binary file
 #'
-#' Save xgboost model to a file in binary format.
+#' Save xgboost model to a file in binary or JSON format.
 #'
-#' @param model model object of \code{xgb.Booster} class.
-#' @param fname name of the file to write.
+#' @param model Model object of \code{xgb.Booster} class.
+#' @param fname Name of the file to write.
+#'
+#' Note that the extension of this file name determines the serialization format to use:\itemize{
+#' \item Extension ".ubj" will use the universal binary JSON format (recommended).
+#' This format uses binary types for e.g. floating point numbers, thereby preventing any loss
+#' of precision when converting to a human-readable JSON text or similar.
+#' \item Extension ".json" will use plain JSON, which is a human-readable format.
+#' \item Extension ".model" will use a \bold{deprecated} binary format. This format will
+#' not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
+#' attribute that boosters might keep, nor feature names or user-specified attributes.
+#' \item If the format is not specified by passing one of the file extensions above, it will
+#' default to UBJ.
+#' }
 #'
 #' @details
-#' This methods allows to save a model in an xgboost-internal binary format which is universal
+#' This method allows saving a model in an xgboost-internal binary or text format which is universal
 #' among the various xgboost interfaces. In R, the saved model file could be read-in later
 #' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
 #' of \code{\link{xgb.train}}.
@@ -14,7 +26,7 @@
 #' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
 #' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
 #' corresponding R-methods would need to be used to load it. Moreover, persisting the model with
-#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
+#' \code{\link[base]{readRDS}} or \code{\link[base]{save}} might cause compatibility problems in
 #' future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
 #' how to persist models in a future-proof way, i.e. to make the model accessible in future
 #' releases of XGBoost.
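As a usage sketch of the extension-driven format selection documented above (assuming `bst` is a
fitted `xgb.Booster` and `x` is the feature matrix it was trained on; file names are illustrative):

# save in the recommended UBJ format and in human-readable JSON
xgb.save(bst, "model.ubj")
xgb.save(bst, "model.json")
# reloading from either file should yield an equivalent booster
bst2 <- xgb.load("model.ubj")
stopifnot(isTRUE(all.equal(predict(bst, x), predict(bst2, x))))
file.remove("model.ubj", "model.json")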
@@ -40,9 +52,9 @@ #' nrounds = 2, #' objective = "binary:logistic" #' ) -#' xgb.save(bst, 'xgb.model') -#' bst <- xgb.load('xgb.model') -#' if (file.exists('xgb.model')) file.remove('xgb.model') +#' xgb.save(bst, 'xgb.ubj') +#' bst <- xgb.load('xgb.ubj') +#' if (file.exists('xgb.ubj')) file.remove('xgb.ubj') #' @export xgb.save <- function(model, fname) { if (typeof(fname) != "character") diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index e938ab1ac5cf..9b5e1a200413 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -413,6 +413,11 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), ) niter_init <- bst$niter bst <- bst$bst + .Call( + XGBoosterCopyInfoFromDMatrix, + xgb.get.handle(bst), + dtrain + ) # extract parameters that can affect the relationship b/w #trees and #iterations # Note: it might look like these aren't used, but they need to be defined in this @@ -468,12 +473,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), } } - .Call( - XGBoosterCopyInfoFromDMatrix, - xgb.get.handle(bst), - dtrain - ) - if (keep_extra_attributes) { extra_attrs <- list( call = match.call(), diff --git a/R-package/man/cb.save.model.Rd b/R-package/man/cb.save.model.Rd index 584fd69b7360..7701ad9900e5 100644 --- a/R-package/man/cb.save.model.Rd +++ b/R-package/man/cb.save.model.Rd @@ -4,17 +4,22 @@ \alias{cb.save.model} \title{Callback closure for saving a model file.} \usage{ -cb.save.model(save_period = 0, save_name = "xgboost.model") +cb.save.model(save_period = 0, save_name = "xgboost.ubj") } \arguments{ \item{save_period}{save the model to disk after every \code{save_period} iterations; 0 means save the model at the end.} \item{save_name}{the name or path for the saved model file. -It can contain a \code{\link[base]{sprintf}} formatting specifier -to include the integer iteration number in the file name. -E.g., with \code{save_name} = 'xgboost_\%04d.model', -the file saved at iteration 50 would be named "xgboost_0050.model".} + +\if{html}{\out{
}}\preformatted{ Note that the format of the model being saved is determined by the file + extension specified here (see \link{xgb.save} for details about how it works). + + It can contain a \code{\link[base]{sprintf}} formatting specifier + to include the integer iteration number in the file name. + E.g., with \code{save_name} = 'xgboost_\%04d.ubj', + the file saved at iteration 50 would be named "xgboost_0050.ubj". +}\if{html}{\out{
}}}
}
\description{
Callback closure for saving a model file.
@@ -29,5 +34,7 @@ Callback function expects the following values to be set in its calling frame:
\code{end_iteration}.
}
\seealso{
+\link{xgb.save}
+
\code{\link{callbacks}}
}
diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd
index 21b4660c7285..1d2e6b17b9bc 100644
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -43,9 +43,9 @@ bst <- xgb.train(
 objective = "binary:logistic"
 )
 
-xgb.save(bst, 'xgb.model')
-bst <- xgb.load('xgb.model')
-if (file.exists('xgb.model')) file.remove('xgb.model')
+xgb.save(bst, 'xgb.ubj')
+bst <- xgb.load('xgb.ubj')
+if (file.exists('xgb.ubj')) file.remove('xgb.ubj')
 }
 \seealso{
 \code{\link{xgb.save}}
diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd
index 767c50e873bd..5c5663dccd64 100644
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -7,15 +7,27 @@
 xgb.save(model, fname)
 }
 \arguments{
-\item{model}{model object of \code{xgb.Booster} class.}
+\item{model}{Model object of \code{xgb.Booster} class.}
 
-\item{fname}{name of the file to write.}
+\item{fname}{Name of the file to write.
+
+Note that the extension of this file name determines the serialization format to use:\itemize{
+\item Extension ".ubj" will use the universal binary JSON format (recommended).
+This format uses binary types for e.g. floating point numbers, thereby preventing any loss
+of precision when converting to a human-readable JSON text or similar.
+\item Extension ".json" will use plain JSON, which is a human-readable format.
+\item Extension ".model" will use a \bold{deprecated} binary format. This format will
+not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
+attribute that boosters might keep, nor feature names or user-specified attributes.
+\item If the format is not specified by passing one of the file extensions above, it will
+default to UBJ.
+}}
}
\description{
-Save xgboost model to a file in binary format.
+Save xgboost model to a file in binary or JSON format.
}
\details{
-This methods allows to save a model in an xgboost-internal binary format which is universal
+This method allows saving a model in an xgboost-internal binary or text format which is universal
 among the various xgboost interfaces. In R, the saved model file could be read-in later
 using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
 of \code{\link{xgb.train}}.
@@ -23,7 +35,7 @@ of \code{\link{xgb.train}}.
 Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
 or \code{\link[base]{save}}). However, it would then only be compatible with R, and
 corresponding R-methods would need to be used to load it. Moreover, persisting the model with
-\code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
+\code{\link[base]{readRDS}} or \code{\link[base]{save}} might cause compatibility problems in
 future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
 how to persist models in a future-proof way, i.e. to make the model accessible in future
 releases of XGBoost.
@@ -46,9 +58,9 @@ bst <- xgb.train( nrounds = 2, objective = "binary:logistic" ) -xgb.save(bst, 'xgb.model') -bst <- xgb.load('xgb.model') -if (file.exists('xgb.model')) file.remove('xgb.model') +xgb.save(bst, 'xgb.ubj') +bst <- xgb.load('xgb.ubj') +if (file.exists('xgb.ubj')) file.remove('xgb.ubj') } \seealso{ \code{\link{xgb.load}} diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index adaa144d4c11..9ebfd3edab4c 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -189,14 +189,7 @@ test_that("cb.save.model works as expected", { xgb.config(b2) <- xgb.config(bst) expect_equal(xgb.config(bst), xgb.config(b2)) - # TODO: remove this workaround once serialization to disk is able to take more attributes - # expect_equal(xgb.save.raw(bst), xgb.save.raw(b2)) - # workaround below: - s1 <- jsonlite::fromJSON(rawToChar(xgb.save.raw(bst, raw_format = "json"))) - s2 <- jsonlite::fromJSON(rawToChar(xgb.save.raw(b2, raw_format = "json"))) - s1$learner$feature_names <- NULL - s2$learner$feature_names <- NULL - expect_equal(s1, s2) + expect_equal(xgb.save.raw(bst), xgb.save.raw(b2)) # save_period = 0 saves the last iteration's model bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0, @@ -204,14 +197,7 @@ test_that("cb.save.model works as expected", { expect_true(file.exists('xgboost.json')) b2 <- xgb.load('xgboost.json') xgb.config(b2) <- xgb.config(bst) - # TODO: remove this workaround once serialization to disk is able to take more attributes - # expect_equal(xgb.save.raw(bst), xgb.save.raw(b2)) - # workaround below: - s1 <- jsonlite::fromJSON(rawToChar(xgb.save.raw(bst, raw_format = "json"))) - s2 <- jsonlite::fromJSON(rawToChar(xgb.save.raw(b2, raw_format = "json"))) - s1$learner$feature_names <- NULL - s2$learner$feature_names <- NULL - expect_equal(s1, s2) + expect_equal(xgb.save.raw(bst), xgb.save.raw(b2)) for (f in files) if (file.exists(f)) file.remove(f) }) From 7f39bb0a56a545f54cdc6697abf62642999730e0 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:02:26 +0100 Subject: [PATCH 21/47] update name for nrounds getter --- R-package/NAMESPACE | 2 +- R-package/R/xgb.Booster.R | 2 +- R-package/R/xgb.DMatrix.R | 5 ----- R-package/demo/predict_leaf_indices.R | 2 +- R-package/man/getinfo.Rd | 5 ----- ...get.Booster.nrounds.Rd => xgb.get.num.boosted.rounds.Rd} | 6 +++--- 6 files changed, 6 insertions(+), 16 deletions(-) rename R-package/man/{xgb.get.Booster.nrounds.Rd => xgb.get.num.boosted.rounds.Rd} (83%) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index f488ea3ac52e..937edbc2dfbd 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -39,11 +39,11 @@ export(xgb.create.features) export(xgb.cv) export(xgb.dump) export(xgb.gblinear.history) -export(xgb.get.Booster.nrounds) export(xgb.get.DMatrix.data) export(xgb.get.DMatrix.num.non.missing) export(xgb.get.DMatrix.qcut) export(xgb.get.config) +export(xgb.get.num.boosted.rounds) export(xgb.ggplot.deepness) export(xgb.ggplot.importance) export(xgb.ggplot.shap.summary) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index f822657aade3..a507693d267e 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -691,7 +691,7 @@ setinfo.xgb.Booster <- function(object, name, info) { #' continuation / updates through \link{xgb.parameters<-} will reset the #' number of rounds to zero. 
#' @export -xgb.get.Booster.nrounds <- function(model) { +xgb.get.num.boosted.rounds <- function(model) { return(xgb.nrounds(model)) } diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index a781e5ec8a0e..57c57c365684 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -367,11 +367,6 @@ dimnames.xgb.DMatrix <- function(x) { #' #' \bold{Important}: when calling `setinfo`, the objects are modified in-place. See #' \link{xgb.copy.Booster} for an idea of this in-place assignment works. -#' -#' Be aware that, when a booster object is serialized to disk through functions like -#' \link{xgb.save}, information that was set on it will be lost, but such information -#' would be kept as part of the serialization when using to-bytes serializers like -#' \link{xgb.save.raw}, and when using R serializers like \link{saveRDS}. #' @examples #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R index 3706eb476fa3..21b6fa71d0b7 100644 --- a/R-package/demo/predict_leaf_indices.R +++ b/R-package/demo/predict_leaf_indices.R @@ -27,7 +27,7 @@ head(pred_with_leaf) create.new.tree.features <- function(model, original.features) { pred_with_leaf <- predict(model, original.features, predleaf = TRUE) cols <- list() - for (i in 1:xgb.get.Booster.nrounds(model)) { + for (i in 1:xgb.get.num.boosted.rounds(model)) { # max is not the real max but it s not important for the purpose of adding features leaf.id <- sort(unique(pred_with_leaf[, i])) cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id) diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 5ada11f2db42..7cc0d6ecb406 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -63,11 +63,6 @@ for a DMatrix that had 'qid' assigned. \bold{Important}: when calling \code{setinfo}, the objects are modified in-place. See \link{xgb.copy.Booster} for an idea of this in-place assignment works. -Be aware that, when a booster object is serialized to disk through functions like -\link{xgb.save}, information that was set on it will be lost, but such information -would be kept as part of the serialization when using to-bytes serializers like -\link{xgb.save.raw}, and when using R serializers like \link{saveRDS}. - See the documentation for \link{xgb.DMatrix} for possible fields that can be set (which correspond to arguments in that function). 
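A short sketch of the booster-level `getinfo` / `setinfo` methods and the renamed rounds getter
(assuming `bst` is a booster fitted on a three-column matrix; the names used are illustrative):

setinfo(bst, "feature_name", c("x1", "x2", "x3"))  # set feature names on the C-level booster
getinfo(bst, "feature_name")                       # read them back
setinfo(bst, "feature_name", NULL)                 # NULL is coerced to character(0), unsetting the field
xgb.get.num.boosted.rounds(bst)                    # boosting rounds currently stored in the model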
diff --git a/R-package/man/xgb.get.Booster.nrounds.Rd b/R-package/man/xgb.get.num.boosted.rounds.Rd similarity index 83% rename from R-package/man/xgb.get.Booster.nrounds.Rd rename to R-package/man/xgb.get.num.boosted.rounds.Rd index c39b5e6d6d29..74c94d95b94d 100644 --- a/R-package/man/xgb.get.Booster.nrounds.Rd +++ b/R-package/man/xgb.get.num.boosted.rounds.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.Booster.R -\name{xgb.get.Booster.nrounds} -\alias{xgb.get.Booster.nrounds} +\name{xgb.get.num.boosted.rounds} +\alias{xgb.get.num.boosted.rounds} \title{Get number of boosting in a fitted booster} \usage{ -xgb.get.Booster.nrounds(model) +xgb.get.num.boosted.rounds(model) } \arguments{ \item{model}{A fitted \code{xgb.Booster} model.} From 02c312c0af18491dff4be1fbefe9445859043825 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:08:56 +0100 Subject: [PATCH 22/47] remove in-place training continuation --- R-package/R/xgb.Booster.R | 16 ++++------- R-package/R/xgb.cv.R | 3 +- R-package/R/xgb.load.R | 3 +- R-package/R/xgb.train.R | 40 ++++----------------------- R-package/man/xgb.train.Rd | 17 +----------- R-package/src/init.c | 2 -- R-package/src/xgboost_R.cc | 5 ---- R-package/src/xgboost_R.h | 7 ----- R-package/tests/testthat/test_basic.R | 9 ------ 9 files changed, 13 insertions(+), 89 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index a507693d267e..ec2cd9cadcd4 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -3,7 +3,7 @@ # Note: the number of rounds in the C booster gets reset to zero when updating # the parameters through 'xgb.parameters', hence the need to return it from # this function when using it for training continuation -xgb.Booster <- function(params, cachelist, modelfile, training_continuation) { +xgb.Booster <- function(params, cachelist, modelfile) { if (typeof(cachelist) != "list" || !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) { stop("cachelist must be a list of xgb.DMatrix objects") @@ -28,16 +28,10 @@ xgb.Booster <- function(params, cachelist, modelfile, training_continuation) { return(list(bst = bst, niter = niter)) } else if (inherits(modelfile, "xgb.Booster")) { ## A booster object - if (training_continuation == "copy") { - bst <- .Call(XGDuplicate_R, modelfile) - niter <- xgb.nrounds(bst) - xgb.parameters(bst) <- params - return(list(bst = bst, niter = niter)) - } else { - niter <- xgb.nrounds(modelfile) - xgb.parameters(modelfile) <- params - return(list(bst = modelfile, niter = niter)) - } + bst <- .Call(XGDuplicate_R, modelfile) + niter <- xgb.nrounds(bst) + xgb.parameters(bst) <- params + return(list(bst = bst, niter = niter)) } else { stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object") } diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 17fe98efa739..a960957ca313 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -207,8 +207,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing bst <- xgb.Booster( params = params, cachelist = list(dtrain, dtest), - modelfile = NULL, - training_continuation = "copy" # <- not used + modelfile = NULL ) bst <- bst$bst list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]]) diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 47b39e78801d..4a77ab441d0a 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ 
-49,8 +49,7 @@ xgb.load <- function(modelfile) { bst <- xgb.Booster( params = list(), cachelist = list(), - modelfile = modelfile, - training_continuation = "copy" + modelfile = modelfile ) bst <- bst$bst # re-use modelfile if it is raw so we do not need to serialize diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 9b5e1a200413..0facbe8ce332 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -148,16 +148,6 @@ #' @param xgb_model a previously built model to continue the training from. #' Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a #' file with a previously saved model. -#' @param training_continuation when passing `xgb_model`, whether to update the previous model by -#' creating a copy of it which will contain the new boosted rounds (meaning: the original object -#' is kept as it was before the call to `xgb.train`, and there will be two booster objects), or -#' by updating the previous object in-place (meaning: the object passed under `xgb_model` will -#' be updated, and nothing will be returned from this function). -#' -#' Note that, if passing "update" here, the object in `xgb_model` -#' \bold{will get updated regardless of whether this function succeeds or not} -#' (for example, the parameters will be set on the existing `xgb_model`, overwriting previous ones, -#' even if the training fails because of some error). #' @param callbacks a list of callback functions to perform various task during boosting. #' See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the #' parameters' values. User can provide either existing or their own callback methods in order @@ -182,9 +172,7 @@ #' evaluation logs from callbacks, among others. #' #' @return -#' An object of class \code{xgb.Booster}, unless passing a previous `xgb_model` and passing -#' `training_continuation="update"`, in which case will update that object and return NULL -#' (invisibly) from this function. +#' An object of class \code{xgb.Booster}. #' #' @details #' These are the training functions for \code{xgboost}. @@ -326,8 +314,8 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, save_name = "xgboost.model", - xgb_model = NULL, training_continuation = c("copy", "update"), - callbacks = list(), keep_extra_attributes = TRUE, ...) { + xgb_model = NULL, callbacks = list(), + keep_extra_attributes = TRUE, ...) { check.deprecation(...) 
@@ -336,16 +324,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), check.custom.obj() check.custom.eval() - if (is.null(xgb_model)) { - training_continuation <- "copy" - } else { - training_continuation <- head(training_continuation, 1L) - training_continuation <- as.character(training_continuation) - if (!(training_continuation %in% c("copy", "update"))) { - stop("'training_continuation' must be one of 'copy' or 'update'.") - } - } - # data & watchlist checks dtrain <- data if (!inherits(dtrain, "xgb.DMatrix")) @@ -408,8 +386,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), bst <- xgb.Booster( params = params, cachelist = append(watchlist, dtrain), - modelfile = xgb_model, - training_continuation = training_continuation + modelfile = xgb_model ) niter_init <- bst$niter bst <- bst$bst @@ -484,14 +461,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), } curr_attrs <- attributes(bst) attributes(bst) <- c(curr_attrs, extra_attrs) - if (training_continuation == "update") { - .Call(XGDuplicateAttrib, bst, xgb_model) - } } - if (training_continuation == "update") { - return(invisible(NULL)) - } else { - return(bst) - } + return(bst) } diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 457194c9ebc8..30cbe0da68b0 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -19,7 +19,6 @@ xgb.train( save_period = NULL, save_name = "xgboost.model", xgb_model = NULL, - training_continuation = c("copy", "update"), callbacks = list(), keep_extra_attributes = TRUE, ... @@ -204,18 +203,6 @@ This parameter is passed to the \code{\link{cb.early.stop}} callback.} Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a file with a previously saved model.} -\item{training_continuation}{when passing \code{xgb_model}, whether to update the previous model by -creating a copy of it which will contain the new boosted rounds (meaning: the original object -is kept as it was before the call to \code{xgb.train}, and there will be two booster objects), or -by updating the previous object in-place (meaning: the object passed under \code{xgb_model} will -be updated, and nothing will be returned from this function). - -\if{html}{\out{
}}\preformatted{ Note that, if passing "update" here, the object in `xgb_model` - \\bold\{will get updated regardless of whether this function succeeds or not\} - (for example, the parameters will be set on the existing `xgb_model`, overwriting previous ones, - even if the training fails because of some error). -}\if{html}{\out{
}}} - \item{callbacks}{a list of callback functions to perform various task during boosting. See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the parameters' values. User can provide either existing or their own callback methods in order @@ -247,9 +234,7 @@ This parameter is only used when input is a dense matrix.} \item{weight}{a vector indicating the weight for each row of the input.} } \value{ -An object of class \code{xgb.Booster}, unless passing a previous \code{xgb_model} and passing -\code{training_continuation="update"}, in which case will update that object and return NULL -(invisibly) from this function. +An object of class \code{xgb.Booster}. } \description{ \code{xgb.train} is an advanced interface for training an xgboost model. diff --git a/R-package/src/init.c b/R-package/src/init.c index e825e54190c2..db6f058bc342 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -17,7 +17,6 @@ Check these declarations against the C/Fortran source code. /* .Call calls */ extern void XGBInitializeAltrepClass(DllInfo *info); extern SEXP XGDuplicate_R(SEXP); -extern SEXP XGDuplicateAttrib(SEXP, SEXP); extern SEXP XGPointerEqComparison(SEXP, SEXP); extern SEXP XGBoosterTrainOneIter_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterCreate_R(SEXP); @@ -66,7 +65,6 @@ extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1}, - {"XGDuplicateAttrib", (DL_FUNC) &XGDuplicateAttrib, 2}, {"XGPointerEqComparison", (DL_FUNC) &XGPointerEqComparison, 2}, {"XGBoosterTrainOneIter_R", (DL_FUNC) &XGBoosterTrainOneIter_R, 5}, {"XGBoosterCreate_R", (DL_FUNC) &XGBoosterCreate_R, 1}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index d7d395668ff9..243e5ac820e1 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -609,11 +609,6 @@ XGB_DLL SEXP XGDuplicate_R(SEXP obj) { return Rf_duplicate(obj); } -XGB_DLL SEXP XGDuplicateAttrib(SEXP src, SEXP dest) { - DUPLICATE_ATTRIB(dest, src); - return R_NilValue; -} - XGB_DLL SEXP XGPointerEqComparison(SEXP obj1, SEXP obj2) { return Rf_ScalarLogical(R_ExternalPtrAddr(obj1) == R_ExternalPtrAddr(obj2)); } diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 7c9c381321ee..956b18ffbf99 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -151,13 +151,6 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle); */ XGB_DLL SEXP XGDuplicate_R(SEXP obj); -/*! - * \brief Call R C-level function 'DUPLICATE_ATTRIB' - * \param src Object from which to take the attributes - * \param dest Object for which to set the attributes - */ -XGB_DLL SEXP XGDuplicateAttrib(SEXP src, SEXP dest); - /*! 
* \brief Equality comparison for two pointers * \param obj1 R 'externalptr' diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 3799f099dbee..d8ef8d9f4107 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -336,15 +336,6 @@ test_that("training continuation works", { } expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2)) file.remove("xgboost.json") - - set.seed(11) - bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) - # continue for two more, this time in-place: - xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - xgb_model = bst1, training_continuation = "update") - if (!windows_flag && !solaris_flag) { - expect_equal(xgb.save.raw(bst), xgb.save.raw(bst1)) - } }) test_that("model serialization works", { From b4d59f7fe6daf911cc2b02ad24d42eb235555b94 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:12:05 +0100 Subject: [PATCH 23/47] change unserialize -> load.raw --- R-package/R/xgb.Booster.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index ec2cd9cadcd4..23afb34bbbcd 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -22,7 +22,7 @@ xgb.Booster <- function(params, cachelist, modelfile) { return(list(bst = bst, niter = niter)) } else if (is.raw(modelfile)) { ## A memory buffer - bst <- xgb.unserialize(modelfile) + bst <- xgb.load.raw(modelfile) niter <- xgb.nrounds(bst) xgb.parameters(bst) <- params return(list(bst = bst, niter = niter)) From 24e256ae111019c30f9adcd0a5522ca8ce4341a1 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:18:45 +0100 Subject: [PATCH 24/47] use R lists instead of JSON text for xgb.config --- R-package/R/xgb.Booster.R | 21 +++++++++++-------- R-package/man/xgb.config.Rd | 5 ++--- .../tests/testthat/test_model_compatibility.R | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 23afb34bbbcd..14f1f8e4406c 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -547,9 +547,8 @@ xgb.attributes <- function(object) { #' #' See \link{xgb.copy.Booster} for an example of this behavior. #' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it. -#' @param value A JSON string. -#' @return `xgb.config` will return the parameters as JSON text, which can be parsed through -#' e.g. `jsonlite::fromJSON`. +#' @param value An R list. +#' @return `xgb.config` will return the parameters as an R list. 
#' @examples #' data(agaricus.train, package = "xgboost") #' @@ -574,14 +573,18 @@ xgb.attributes <- function(object) { #' @export xgb.config <- function(object) { handle <- xgb.get.handle(object) - .Call(XGBoosterSaveJsonConfig_R, handle) + return(jsonlite::fromJSON(.Call(XGBoosterSaveJsonConfig_R, handle))) } #' @rdname xgb.config #' @export `xgb.config<-` <- function(object, value) { handle <- xgb.get.handle(object) - .Call(XGBoosterLoadJsonConfig_R, handle, value) + .Call( + XGBoosterLoadJsonConfig_R, + handle, + jsonlite::toJSON(value, auto_unbox = TRUE, null = "null") + ) return(object) } @@ -696,25 +699,25 @@ xgb.nrounds <- function(bst) { } xgb.ntree <- function(bst) { - config <- jsonlite::fromJSON(xgb.config(bst)) + config <- xgb.config(bst) out <- strtoi(config$learner$gradient_booster$gbtree_model_param$num_trees) return(out) } xgb.nthread <- function(bst) { - config <- jsonlite::fromJSON(xgb.config(bst)) + config <- xgb.config(bst) out <- strtoi(config$learner$generic_param$nthread) return(out) } xgb.booster_type <- function(bst) { - config <- jsonlite::fromJSON(xgb.config(bst)) + config <- xgb.config(bst) out <- config$learner$learner_train_param$booster return(out) } xgb.num_class <- function(bst) { - config <- jsonlite::fromJSON(xgb.config(bst)) + config <- xgb.config(bst) out <- strtoi(config$learner$learner_model_param$num_class) return(out) } diff --git a/R-package/man/xgb.config.Rd b/R-package/man/xgb.config.Rd index 4657df3185a5..1ab810644db9 100644 --- a/R-package/man/xgb.config.Rd +++ b/R-package/man/xgb.config.Rd @@ -12,11 +12,10 @@ xgb.config(object) <- value \arguments{ \item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.} -\item{value}{A JSON string.} +\item{value}{An R list.} } \value{ -\code{xgb.config} will return the parameters as JSON text, which can be parsed through -e.g. \code{jsonlite::fromJSON}. +\code{xgb.config} will return the parameters as an R list. 
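A rough usage sketch of the list-based accessor, assuming a booster fitted as in the examples above; the field path mirrors the internal helpers changed in this patch, and is not a stable public interface:

```r
data(agaricus.train, package = "xgboost")
bst <- xgb.train(
  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1),
  params = list(max_depth = 2, eta = 1, nthread = 1, objective = "binary:logistic"),
  nrounds = 2
)

config <- xgb.config(bst)                    # now a nested R list, no manual jsonlite::fromJSON()
config$learner$learner_train_param$booster   # e.g. "gbtree"
xgb.config(bst) <- config                    # assignment converts the list back to JSON internally
```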
} \description{ Accessors for model parameters as JSON string diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R index 40481a3f00e2..613ba066f459 100644 --- a/R-package/tests/testthat/test_model_compatibility.R +++ b/R-package/tests/testthat/test_model_compatibility.R @@ -23,7 +23,7 @@ get_num_tree <- function(booster) { } run_booster_check <- function(booster, name) { - config <- jsonlite::fromJSON(xgb.config(booster)) + config <- xgb.config(booster) run_model_param_check(config) if (name == 'cls') { testthat::expect_equal(get_num_tree(booster), From c97dc1abadb1f1fc81b232684988cf7bd57542af Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:21:18 +0100 Subject: [PATCH 25/47] remove internal function for nrounds getter --- R-package/R/xgb.Booster.R | 16 +++++----------- R-package/tests/testthat/test_basic.R | 10 +++++----- R-package/tests/testthat/test_callbacks.R | 4 ++-- R-package/tests/testthat/test_update.R | 3 ++- 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 14f1f8e4406c..b5f06f694fdb 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -15,7 +15,7 @@ xgb.Booster <- function(params, cachelist, modelfile) { bst <- .Call(XGBoosterCreate_R, cachelist) modelfile <- path.expand(modelfile) .Call(XGBoosterLoadModel_R, xgb.get.handle(bst), enc2utf8(modelfile[1])) - niter <- xgb.nrounds(bst) + niter <- xgb.get.num.boosted.rounds(bst) if (length(params) > 0) { xgb.parameters(bst) <- params } @@ -23,13 +23,13 @@ xgb.Booster <- function(params, cachelist, modelfile) { } else if (is.raw(modelfile)) { ## A memory buffer bst <- xgb.load.raw(modelfile) - niter <- xgb.nrounds(bst) + niter <- xgb.get.num.boosted.rounds(bst) xgb.parameters(bst) <- params return(list(bst = bst, niter = niter)) } else if (inherits(modelfile, "xgb.Booster")) { ## A booster object bst <- .Call(XGDuplicate_R, modelfile) - niter <- xgb.nrounds(bst) + niter <- xgb.get.num.boosted.rounds(bst) xgb.parameters(bst) <- params return(list(bst = bst, niter = niter)) } else { @@ -689,13 +689,7 @@ setinfo.xgb.Booster <- function(object, name, info) { #' number of rounds to zero. #' @export xgb.get.num.boosted.rounds <- function(model) { - return(xgb.nrounds(model)) -} - -# Extract the number of trees in a model. -# internal utility function -xgb.nrounds <- function(bst) { - return(.Call(XGBoosterBoostedRounds_R, xgb.get.handle(bst))) + return(.Call(XGBoosterBoostedRounds_R, xgb.get.handle(model))) } xgb.ntree <- function(bst) { @@ -956,7 +950,7 @@ print.xgb.Booster <- function(x, ...) 
{ } cat('# of features:', xgb.num_feature(x), '\n') - cat('# of rounds: ', xgb.nrounds(x), '\n') + cat('# of rounds: ', xgb.get.num.boosted.rounds(x), '\n') attr_names <- .Call(XGBoosterGetAttrNames_R, handle) if (NROW(attr_names)) { diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index d8ef8d9f4107..4cc4bdc76b14 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -25,7 +25,7 @@ test_that("train and predict binary classification", { "train-error" ) expect_equal(class(bst), "xgb.Booster") - expect_equal(xgb.nrounds(bst), nrounds) + expect_equal(xgb.get.num.boosted.rounds(bst), nrounds) expect_false(is.null(attributes(bst)$evaluation_log)) expect_equal(nrow(attributes(bst)$evaluation_log), nrounds) expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.03) @@ -161,7 +161,7 @@ test_that("train and predict softprob", { ) expect_false(is.null(attributes(bst)$evaluation_log)) expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025) - expect_equal(xgb.nrounds(bst) * 3, xgb.ntree(bst)) + expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst)) pred <- predict(bst, as.matrix(iris[, -5])) expect_length(pred, nrow(iris) * 3) # row sums add up to total probability of 1: @@ -212,7 +212,7 @@ test_that("train and predict softmax", { ) expect_false(is.null(attributes(bst)$evaluation_log)) expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025) - expect_equal(xgb.nrounds(bst) * 3, xgb.ntree(bst)) + expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst)) pred <- predict(bst, as.matrix(iris[, -5])) expect_length(pred, nrow(iris)) @@ -231,7 +231,7 @@ test_that("train and predict RF", { num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1, watchlist = list(train = xgb.DMatrix(train$data, label = lb)) ) - expect_equal(xgb.nrounds(bst), 1) + expect_equal(xgb.get.num.boosted.rounds(bst), 1) expect_equal(xgb.ntree(bst), 20) pred <- predict(bst, train$data) @@ -259,7 +259,7 @@ test_that("train and predict RF with softprob", { num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5, watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) ) - expect_equal(xgb.nrounds(bst), 15) + expect_equal(xgb.get.num.boosted.rounds(bst), 15) expect_equal(xgb.ntree(bst), 15 * 3 * 4) # predict for all iterations: pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE) diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index 9ebfd3edab4c..5c0243f46493 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -182,10 +182,10 @@ test_that("cb.save.model works as expected", { expect_true(file.exists('xgboost_02.json')) b1 <- xgb.load('xgboost_01.json') xgb.parameters(b1) <- list(nthread = 2) - expect_equal(xgb.nrounds(b1), 1) + expect_equal(xgb.get.num.boosted.rounds(b1), 1) b2 <- xgb.load('xgboost_02.json') xgb.parameters(b2) <- list(nthread = 2) - expect_equal(xgb.nrounds(b2), 2) + expect_equal(xgb.get.num.boosted.rounds(b2), 2) xgb.config(b2) <- xgb.config(bst) expect_equal(xgb.config(bst), xgb.config(b2)) diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R index f3b1f1512af4..3c88178e08d3 100644 --- a/R-package/tests/testthat/test_update.R +++ b/R-package/tests/testthat/test_update.R @@ -125,7 +125,8 @@ test_that("updating works for multiclass & multitree", { # run update process for an original model with subsampling 
p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) - bst0u <- xgb.train(p0u, dtr, nrounds = xgb.nrounds(bst0), watchlist, xgb_model = bst0, verbose = 0) + bst0u <- xgb.train(p0u, dtr, nrounds = xgb.get.num.boosted.rounds(bst0), + watchlist, xgb_model = bst0, verbose = 0) tr0u <- xgb.model.dt.tree(model = bst0u) # should be the same evaluation but different gains and larger cover From 5f8dea578850aac747b73a7e944ecbed522cc663 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:28:12 +0100 Subject: [PATCH 26/47] use _R suffix for all C functions specific to R --- R-package/R/xgb.Booster.R | 2 +- R-package/R/xgb.train.R | 2 +- R-package/src/init.c | 12 ++++++------ R-package/src/xgboost_R.cc | 6 +++--- R-package/src/xgboost_R.h | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index b5f06f694fdb..b505d49b912e 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -907,7 +907,7 @@ xgb.is.same.Booster <- function(obj1, obj2) { } return( .Call( - XGPointerEqComparison, + XGPointerEqComparison_R, xgb.get.handle(obj1), xgb.get.handle(obj2) ) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 0facbe8ce332..11244d0c1ae6 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -391,7 +391,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), niter_init <- bst$niter bst <- bst$bst .Call( - XGBoosterCopyInfoFromDMatrix, + XGBoosterCopyInfoFromDMatrix_R, xgb.get.handle(bst), dtrain ) diff --git a/R-package/src/init.c b/R-package/src/init.c index db6f058bc342..81c28c401c44 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -15,12 +15,12 @@ Check these declarations against the C/Fortran source code. 
*/ /* .Call calls */ -extern void XGBInitializeAltrepClass(DllInfo *info); +extern void XGBInitializeAltrepClass_R(DllInfo *info); extern SEXP XGDuplicate_R(SEXP); -extern SEXP XGPointerEqComparison(SEXP, SEXP); +extern SEXP XGPointerEqComparison_R(SEXP, SEXP); extern SEXP XGBoosterTrainOneIter_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterCreate_R(SEXP); -extern SEXP XGBoosterCopyInfoFromDMatrix(SEXP, SEXP); +extern SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP, SEXP); extern SEXP XGBoosterSetStrFeatureInfo_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterGetStrFeatureInfo_R(SEXP, SEXP); extern SEXP XGBoosterBoostedRounds_R(SEXP); @@ -65,10 +65,10 @@ extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1}, - {"XGPointerEqComparison", (DL_FUNC) &XGPointerEqComparison, 2}, + {"XGPointerEqComparison_R", (DL_FUNC) &XGPointerEqComparison_R, 2}, {"XGBoosterTrainOneIter_R", (DL_FUNC) &XGBoosterTrainOneIter_R, 5}, {"XGBoosterCreate_R", (DL_FUNC) &XGBoosterCreate_R, 1}, - {"XGBoosterCopyInfoFromDMatrix",(DL_FUNC) &XGBoosterCopyInfoFromDMatrix,2}, // NOLINT + {"XGBoosterCopyInfoFromDMatrix_R", (DL_FUNC) &XGBoosterCopyInfoFromDMatrix_R, 2}, {"XGBoosterSetStrFeatureInfo_R",(DL_FUNC) &XGBoosterSetStrFeatureInfo_R,3}, // NOLINT {"XGBoosterGetStrFeatureInfo_R",(DL_FUNC) &XGBoosterGetStrFeatureInfo_R,2}, // NOLINT {"XGBoosterBoostedRounds_R", (DL_FUNC) &XGBoosterBoostedRounds_R, 1}, @@ -119,5 +119,5 @@ __declspec(dllexport) void attribute_visible R_init_xgboost(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); - XGBInitializeAltrepClass(dll); + XGBInitializeAltrepClass_R(dll); } diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 243e5ac820e1..89b366feded4 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -609,7 +609,7 @@ XGB_DLL SEXP XGDuplicate_R(SEXP obj) { return Rf_duplicate(obj); } -XGB_DLL SEXP XGPointerEqComparison(SEXP obj1, SEXP obj2) { +XGB_DLL SEXP XGPointerEqComparison_R(SEXP obj1, SEXP obj2) { return Rf_ScalarLogical(R_ExternalPtrAddr(obj1) == R_ExternalPtrAddr(obj2)); } @@ -799,7 +799,7 @@ SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) { } /* namespace */ -XGB_DLL void XGBInitializeAltrepClass(DllInfo *dll) { +XGB_DLL void XGBInitializeAltrepClass_R(DllInfo *dll) { XGBAltrepPointerClass = R_make_altlist_class("XGBAltrepPointerClass", "xgboost", dll); R_set_altrep_Length_method(XGBAltrepPointerClass, XGBAltrepPointerLength_R); R_set_altlist_Elt_method(XGBAltrepPointerClass, XGBAltrepPointerGetElt_R); @@ -830,7 +830,7 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) { return out; } -XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix(SEXP booster, SEXP dmat) { +XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP booster, SEXP dmat) { R_API_BEGIN(); char const **feature_names; bst_ulong len_feature_names = 0; diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 956b18ffbf99..79d441792323 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -156,13 +156,13 @@ XGB_DLL SEXP XGDuplicate_R(SEXP obj); * \param obj1 R 'externalptr' * \param obj2 R 'externalptr' */ -XGB_DLL SEXP XGPointerEqComparison(SEXP obj1, SEXP obj2); +XGB_DLL SEXP XGPointerEqComparison_R(SEXP obj1, SEXP obj2); /*! 
* \brief Register the Altrep class used for the booster * \param dll DLL info as provided by R_init */ -XGB_DLL void XGBInitializeAltrepClass(DllInfo *dll); +XGB_DLL void XGBInitializeAltrepClass_R(DllInfo *dll); /*! * \brief return the quantile cuts used for the histogram method @@ -200,7 +200,7 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats); * \param booster R 'externalptr' pointing to a booster object * \param dmat R 'externalptr' pointing to a DMatrix object */ -XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix(SEXP booster, SEXP dmat); +XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP booster, SEXP dmat); /*! * \brief handle R 'externalptr' holding the booster object From 8e29769a2e6084015ffa9e88160341ce41262b4f Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:33:26 +0100 Subject: [PATCH 27/47] add test for C and R attributes with saveRDS --- R-package/tests/testthat/test_io.R | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/R-package/tests/testthat/test_io.R b/R-package/tests/testthat/test_io.R index d27ad8856364..9c05bdd1015d 100644 --- a/R-package/tests/testthat/test_io.R +++ b/R-package/tests/testthat/test_io.R @@ -26,3 +26,24 @@ test_that("load/save raw works", { expect_equal(json2old, ubj2old) expect_equal(json2old, old_bytes) }) + +test_that("saveRDS preserves C and R attributes", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = 1) + model <- xgb.train( + data = dm, + params = list(nthread = 1, max_depth = 2), + nrounds = 5 + ) + attributes(model)$my_attr <- "qwerty" + xgb.attr(model, "c_attr") <- "asdf" + + fname <- file.path(tempdir(), "xgb_model.Rds") + saveRDS(model, fname) + model_new <- readRDS(fname) + + expect_equal(attributes(model_new)$my_attr, attributes(model)$my_attr) + expect_equal(xgb.attr(model, "c_attr"), xgb.attr(model_new, "c_attr")) +}) From 9f81e2033513ec713f06121868617255e7bc606c Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:45:42 +0100 Subject: [PATCH 28/47] add variable.names method for booster --- R-package/NAMESPACE | 2 ++ R-package/R/xgb.Booster.R | 15 ++++++++++++++ R-package/R/xgboost.R | 1 + R-package/man/variable.names.xgb.Booster.Rd | 22 +++++++++++++++++++++ R-package/tests/testthat/test_basic.R | 5 +++++ 5 files changed, 45 insertions(+) create mode 100644 R-package/man/variable.names.xgb.Booster.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 937edbc2dfbd..21a4174d74de 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -14,6 +14,7 @@ S3method(print,xgb.cv.synchronous) S3method(setinfo,xgb.Booster) S3method(setinfo,xgb.DMatrix) S3method(slice,xgb.DMatrix) +S3method(variable.names,xgb.Booster) export("xgb.attr<-") export("xgb.attributes<-") export("xgb.config<-") @@ -95,6 +96,7 @@ importFrom(methods,new) importFrom(stats,coef) importFrom(stats,median) importFrom(stats,predict) +importFrom(stats,variable.names) importFrom(utils,head) importFrom(utils,object.size) importFrom(utils,str) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index b505d49b912e..b6073505f4c2 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -692,6 +692,21 @@ xgb.get.num.boosted.rounds <- function(model) { return(.Call(XGBoosterBoostedRounds_R, xgb.get.handle(model))) } +#' @title Get Features Names from Booster +#' @description Returns the feature / variable / column names from a fitted +#' booster object, which are set automatically during the call to \link{xgb.train} +#' from the 
DMatrix names, or which can be set manually through \link{setinfo}. +#' +#' If the object doesn't have feature names, will return `NULL`. +#' +#' It is equivalent to calling `getinfo(object, "feature_name")`. +#' @param object An `xgb.Booster` object. +#' @param ... Not used. +#' @export +variable.names.xgb.Booster <- function(object, ...) { + return(getinfo(object, "feature_name")) +} + xgb.ntree <- function(bst) { config <- xgb.config(bst) out <- strtoi(config$learner$gradient_booster$gbtree_model_param$num_trees) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index b61b63bd0daa..170aa5ffd5be 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -103,6 +103,7 @@ NULL #' @importFrom stats coef #' @importFrom stats predict #' @importFrom stats median +#' @importFrom stats variable.names #' @importFrom utils head #' @importFrom graphics barplot #' @importFrom graphics lines diff --git a/R-package/man/variable.names.xgb.Booster.Rd b/R-package/man/variable.names.xgb.Booster.Rd new file mode 100644 index 000000000000..aec09751d8a0 --- /dev/null +++ b/R-package/man/variable.names.xgb.Booster.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.Booster.R +\name{variable.names.xgb.Booster} +\alias{variable.names.xgb.Booster} +\title{Get Features Names from Booster} +\usage{ +\method{variable.names}{xgb.Booster}(object, ...) +} +\arguments{ +\item{object}{An \code{xgb.Booster} object.} + +\item{...}{Not used.} +} +\description{ +Returns the feature / variable / column names from a fitted +booster object, which are set automatically during the call to \link{xgb.train} +from the DMatrix names, or which can be set manually through \link{setinfo}. + +If the object doesn't have feature names, will return \code{NULL}. + +It is equivalent to calling \code{getinfo(object, "feature_name")}. 
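A minimal sketch of the method, assuming the model was trained on a matrix whose columns are named (so the DMatrix carries feature names):

```r
data(mtcars)
x <- as.matrix(mtcars[, -1])   # column names become the booster's feature names
model <- xgb.train(
  data = xgb.DMatrix(x, label = mtcars$mpg, nthread = 1),
  params = list(nthread = 1, max_depth = 2),
  nrounds = 2
)
variable.names(model)          # same result as getinfo(model, "feature_name")
```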
+} diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 4cc4bdc76b14..31d827f62c3a 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -792,6 +792,11 @@ test_that("DMatrix field are set to booster when training", { expect_null(getinfo(model_feature_types, "feature_name")) expect_equal(getinfo(model_both, "feature_name"), c("a", "b", "c")) + expect_null(variable.names(model_unnamed)) + expect_equal(variable.names(model_feature_names), c("a", "b", "c")) + expect_null(variable.names(model_feature_types)) + expect_equal(variable.names(model_both), c("a", "b", "c")) + expect_null(getinfo(model_unnamed, "feature_type")) expect_null(getinfo(model_feature_names, "feature_type")) expect_equal(getinfo(model_feature_types, "feature_type"), c("q", "c", "q")) From 65197e182310641a366c2679192065113866ea3a Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 20:48:57 +0100 Subject: [PATCH 29/47] add comment about supressed warning --- R-package/R/callbacks.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 2cd5934e6950..b3d6bdb1ae0a 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -445,6 +445,7 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") { if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) || (save_period == 0 && env$iteration == env$end_iteration)) { + # Note: this throws a warning if the name doesn't have anything to format through 'sprintf' suppressWarnings({ save_name <- sprintf(save_name, env$iteration) }) From 7dc9b96a3446793e579cd15f266d5f18be5101ea Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 21:14:24 +0100 Subject: [PATCH 30/47] update comment --- R-package/R/xgb.Booster.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 7bc170868dc2..cee7e9fc5887 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -1,8 +1,11 @@ # Construct an internal xgboost Booster and get its current number of rounds. # internal utility function -# Note: the number of rounds in the C booster gets reset to zero when updating -# the parameters through 'xgb.parameters', hence the need to return it from -# this function when using it for training continuation +# Note: the number of rounds in the C booster gets reset to zero when changing +# key booster parameters like 'process_type=update', but in some cases, when +# replacing previous iterations, it needs to make a check that the new number +# of iterations doesn't exceed the previous ones, hence it keeps track of the +# current number of iterations before resetting the parameters in order to +# perform the check later on. 
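For context, a hedged sketch of the user-facing behaviour this comment refers to, following the usage in `test_update.R` above: with `process_type = "update"` the existing trees are refreshed rather than new ones appended, so the requested number of rounds is checked against the rounds already present in the model being updated.

```r
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
params <- list(objective = "binary:logistic", max_depth = 2, nthread = 1)
bst <- xgb.train(params, dtrain, nrounds = 4)

# Refresh the trees of 'bst' in place instead of adding new boosting rounds.
params_update <- modifyList(params, list(process_type = "update", updater = "refresh"))
bst_refreshed <- xgb.train(
  params_update, dtrain,
  nrounds = xgb.get.num.boosted.rounds(bst),
  xgb_model = bst
)
```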
xgb.Booster <- function(params, cachelist, modelfile) { if (typeof(cachelist) != "list" || !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) { From 11b213e9e6af220212b3f5c98a68ed6db840fe8b Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 21:24:06 +0100 Subject: [PATCH 31/47] update serializers in vignette --- R-package/vignettes/xgboostPresentation.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index e3987c7bd127..fefa0bba5250 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -496,13 +496,13 @@ In some very specific cases, you will want to save the model as a *R* binary vec ```{r saveLoadRBinVectorModel, message=F, warning=F} # save model to R's raw vector -rawVec <- xgb.serialize(bst) +rawVec <- xgb.save.raw(bst) # print class print(class(rawVec)) # load binary model to R -bst3 <- xgb.load(rawVec) +bst3 <- xgb.load.raw(rawVec) xgb.parameters(bst3) <- list(nthread = 2) pred3 <- predict(bst3, test$data) From c72e663654aacae765a420ea287068582b6f9f49 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Mon, 8 Jan 2024 21:35:55 +0100 Subject: [PATCH 32/47] update vignettes --- R-package/vignettes/xgboostPresentation.Rmd | 109 ++++++++++++-------- R-package/vignettes/xgboostfromJSON.Rmd | 3 +- 2 files changed, 65 insertions(+), 47 deletions(-) diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index fefa0bba5250..efafc624d40f 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -155,11 +155,13 @@ We will train decision tree model using the following parameters: bstSparse <- xgboost( data = train$data , label = train$label - , max_depth = 2 - , eta = 1 - , nthread = 2 + , params = list( + max_depth = 2 + , eta = 1 + , nthread = 2 + , objective = "binary:logistic" + ) , nrounds = 2 - , objective = "binary:logistic" ) ``` @@ -175,11 +177,13 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. 
a basic **R** bstDense <- xgboost( data = as.matrix(train$data), label = train$label, - max_depth = 2, - eta = 1, - nthread = 2, - nrounds = 2, - objective = "binary:logistic" + params = list( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ), + nrounds = 2 ) ``` @@ -191,11 +195,13 @@ bstDense <- xgboost( dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2) bstDMatrix <- xgboost( data = dtrain, - max_depth = 2, - eta = 1, - nthread = 2, - nrounds = 2, - objective = "binary:logistic" + params = list( + max_depth = 2, + eta = 1, + nthread = 2, + objective = "binary:logistic" + ), + nrounds = 2 ) ``` @@ -209,11 +215,13 @@ One of the simplest way to see the training progress is to set the `verbose` opt # verbose = 0, no message bst <- xgboost( data = dtrain - , max_depth = 2 - , eta = 1 - , nthread = 2 + , params = list( + max_depth = 2 + , eta = 1 + , nthread = 2 + , objective = "binary:logistic" + ) , nrounds = 2 - , objective = "binary:logistic" , verbose = 0 ) ``` @@ -222,11 +230,13 @@ bst <- xgboost( # verbose = 1, print evaluation metric bst <- xgboost( data = dtrain - , max_depth = 2 - , eta = 1 - , nthread = 2 + , params = list( + max_depth = 2 + , eta = 1 + , nthread = 2 + , objective = "binary:logistic" + ) , nrounds = 2 - , objective = "binary:logistic" , verbose = 1 ) ``` @@ -235,11 +245,13 @@ bst <- xgboost( # verbose = 2, also print information about tree bst <- xgboost( data = dtrain - , max_depth = 2 - , eta = 1 - , nthread = 2 + , params = list( + max_depth = 2 + , eta = 1 + , nthread = 2 + , objective = "binary:logistic" + ) , nrounds = 2 - , objective = "binary:logistic" , verbose = 2 ) ``` @@ -336,12 +348,14 @@ watchlist <- list(train = dtrain, test = dtest) bst <- xgb.train( data = dtrain - , max_depth = 2 - , eta = 1 - , nthread = 2 + , params = list( + max_depth = 2 + , eta = 1 + , nthread = 2 + , objective = "binary:logistic" + ) , nrounds = 2 , watchlist = watchlist - , objective = "binary:logistic" ) ``` @@ -357,13 +371,15 @@ For a better understanding of the learning progression, you may want to have som bst <- xgb.train( data = dtrain , max_depth = 2 - , eta = 1 - , nthread = 2 + , params = list( + eta = 1 + , nthread = 2 + , objective = "binary:logistic" + , eval_metric = "error" + , eval_metric = "logloss" + ) , nrounds = 2 , watchlist = watchlist - , eval_metric = "error" - , eval_metric = "logloss" - , objective = "binary:logistic" ) ``` @@ -377,14 +393,15 @@ Until now, all the learnings we have performed were based on boosting trees. 
**X ```{r linearBoosting, message=F, warning=F} bst <- xgb.train( data = dtrain - , booster = "gblinear" - , max_depth = 2 - , nthread = 2 + , params = list( + booster = "gblinear" + , nthread = 2 + , objective = "binary:logistic" + , eval_metric = "error" + , eval_metric = "logloss" + ) , nrounds = 2 , watchlist = watchlist - , eval_metric = "error" - , eval_metric = "logloss" - , objective = "binary:logistic" ) ``` @@ -406,12 +423,14 @@ xgb.DMatrix.save(dtrain, fname) dtrain2 <- xgb.DMatrix(fname) bst <- xgb.train( data = dtrain2 - , max_depth = 2 - , eta = 1 - , nthread = 2 + , params = list( + max_depth = 2 + , eta = 1 + , nthread = 2 + , objective = "binary:logistic" + ) , nrounds = 2 , watchlist = watchlist - , objective = "binary:logistic" ) ``` diff --git a/R-package/vignettes/xgboostfromJSON.Rmd b/R-package/vignettes/xgboostfromJSON.Rmd index f5bc3ad9b7f0..e5331b0ff38c 100644 --- a/R-package/vignettes/xgboostfromJSON.Rmd +++ b/R-package/vignettes/xgboostfromJSON.Rmd @@ -53,11 +53,10 @@ labels <- c(1, 1, 1, data <- data.frame(dates = dates, labels = labels) bst <- xgb.train( - data = xgb.DMatrix(as.matrix(data$dates), label = labels), + data = xgb.DMatrix(as.matrix(data$dates), label = labels, missing = NA), nthread = 2, nrounds = 1, objective = "binary:logistic", - missing = NA, max_depth = 1 ) ``` From df88ad9146fbf2b158828b9f6e3c700ee16fa3fe Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 9 Jan 2024 19:09:16 +0100 Subject: [PATCH 33/47] remove xgb.serialize and xgb.unserialize --- R-package/NAMESPACE | 2 -- R-package/R/utils.R | 4 --- R-package/R/xgb.load.R | 3 +- R-package/R/xgb.serialize.R | 23 -------------- R-package/R/xgb.unserialize.R | 31 ------------------- .../a-compatibility-note-for-saveRDS-save.Rd | 4 --- R-package/man/xgb.serialize.Rd | 27 ---------------- R-package/man/xgb.unserialize.Rd | 17 ---------- R-package/tests/testthat/test_basic.R | 16 ---------- 9 files changed, 1 insertion(+), 126 deletions(-) delete mode 100644 R-package/R/xgb.serialize.R delete mode 100644 R-package/R/xgb.unserialize.R delete mode 100644 R-package/man/xgb.serialize.Rd delete mode 100644 R-package/man/xgb.unserialize.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 21a4174d74de..a29c9b1e0104 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -61,10 +61,8 @@ export(xgb.plot.shap.summary) export(xgb.plot.tree) export(xgb.save) export(xgb.save.raw) -export(xgb.serialize) export(xgb.set.config) export(xgb.train) -export(xgb.unserialize) export(xgboost) import(methods) importClassesFrom(Matrix,dgCMatrix) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 8543a96b4e61..d288e0562d4b 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -379,10 +379,6 @@ NULL #' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model #' as part of another R object. #' -#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the -#' model but also internal configurations and parameters, and its format is not stable across -#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing. -#' #' For more details and explanation about model persistence and archival, consult the page #' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. 
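As a quick illustration of the persistence functions recommended here (a sketch only; the file name is arbitrary):

```r
data(agaricus.train, package = "xgboost")
bst <- xgb.train(
  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1),
  params = list(max_depth = 2, nthread = 1, objective = "binary:logistic"),
  nrounds = 2
)

# Save to a file in a stable, cross-version format (UBJSON here; JSON also works):
fname <- file.path(tempdir(), "model.ubj")
xgb.save(bst, fname)
bst2 <- xgb.load(fname)

# Or keep the model as an in-memory raw vector inside another R object:
raw_bytes <- xgb.save.raw(bst)
bst3 <- xgb.load.raw(raw_bytes)
```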
#' diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 7150bc3e4faa..7d1eab7e9c34 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -57,8 +57,7 @@ xgb.load <- function(modelfile) { warning( paste( "The support for loading raw booster with `xgb.load` will be ", - "discontinued in upcoming release. Use `xgb.load.raw` or", - " `xgb.unserialize` instead. " + "discontinued in upcoming release. Use `xgb.load.raw` instead. " ) ) } diff --git a/R-package/R/xgb.serialize.R b/R-package/R/xgb.serialize.R deleted file mode 100644 index ce0de4f0e8ef..000000000000 --- a/R-package/R/xgb.serialize.R +++ /dev/null @@ -1,23 +0,0 @@ -#' Serialize the booster instance into R's raw vector. The serialization method differs -#' from \code{\link{xgb.save.raw}} as the latter one saves only the model but not -#' parameters. -#' -#' \bold{IMPORTANT:} this serialization format is not stable across different xgboost versions. -#' -#' @param booster A booster instance -#' -#' @examples -#' data(agaricus.train, package='xgboost') -#' data(agaricus.test, package='xgboost') -#' train <- agaricus.train -#' test <- agaricus.test -#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") -#' raw <- xgb.serialize(bst) -#' bst <- xgb.unserialize(raw) -#' -#' @export -xgb.serialize <- function(booster) { - handle <- xgb.get.handle(booster) - .Call(XGBoosterSerializeToBuffer_R, handle) -} diff --git a/R-package/R/xgb.unserialize.R b/R-package/R/xgb.unserialize.R deleted file mode 100644 index 28e3e9087148..000000000000 --- a/R-package/R/xgb.unserialize.R +++ /dev/null @@ -1,31 +0,0 @@ -#' Load the instance back from \code{\link{xgb.serialize}} -#' -#' @param buffer the buffer containing booster instance saved by \code{\link{xgb.serialize}} -#' @return An \code{xgb.Booster} object. -#' -#' @export -xgb.unserialize <- function(buffer) { - cachelist <- list() - bst <- .Call(XGBoosterCreate_R, cachelist) - tryCatch( - .Call(XGBoosterUnserializeFromBuffer_R, xgb.get.handle(bst), buffer), - error = function(e) { - error_msg <- conditionMessage(e) - m <- regexec("(src[\\\\/]learner.cc:[0-9]+): Check failed: (header == serialisation_header_)", - error_msg, perl = TRUE) - groups <- regmatches(error_msg, m)[[1]] - if (length(groups) == 3) { - warning(paste("The model had been generated by XGBoost version 1.0.0 or earlier and was ", - "loaded from a RDS file. We strongly ADVISE AGAINST using saveRDS() ", - "function, to ensure that your model can be read in current and upcoming ", - "XGBoost releases. Please use xgb.save() instead to preserve models for the ", - "long term. For more details and explanation, see ", - "https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html", - sep = "")) - .Call(XGBoosterLoadModelFromRaw_R, xgb.get.handle(bst), buffer) - } else { - stop(e) - } - }) - return(bst) -} diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index f3dd75309dc0..a54f19be6c8c 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -32,10 +32,6 @@ re-construct the corresponding model. To read the model back, use \code{\link{xg The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model as part of another R object. -Note: Do not use \code{\link{xgb.serialize}} to store models long-term. 
It persists not only the -model but also internal configurations and parameters, and its format is not stable across -multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing. - For more details and explanation about model persistence and archival, consult the page \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. } diff --git a/R-package/man/xgb.serialize.Rd b/R-package/man/xgb.serialize.Rd deleted file mode 100644 index a4ca66bbe8e9..000000000000 --- a/R-package/man/xgb.serialize.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xgb.serialize.R -\name{xgb.serialize} -\alias{xgb.serialize} -\title{Serialize the booster instance into R's raw vector. The serialization method differs -from \code{\link{xgb.save.raw}} as the latter one saves only the model but not -parameters.} -\usage{ -xgb.serialize(booster) -} -\arguments{ -\item{booster}{A booster instance} -} -\description{ -\bold{IMPORTANT:} this serialization format is not stable across different xgboost versions. -} -\examples{ -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') -train <- agaricus.train -test <- agaricus.test -bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, - eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") -raw <- xgb.serialize(bst) -bst <- xgb.unserialize(raw) - -} diff --git a/R-package/man/xgb.unserialize.Rd b/R-package/man/xgb.unserialize.Rd deleted file mode 100644 index d07fa5d1f58b..000000000000 --- a/R-package/man/xgb.unserialize.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xgb.unserialize.R -\name{xgb.unserialize} -\alias{xgb.unserialize} -\title{Load the instance back from \code{\link{xgb.serialize}}} -\usage{ -xgb.unserialize(buffer) -} -\arguments{ -\item{buffer}{the buffer containing booster instance saved by \code{\link{xgb.serialize}}} -} -\value{ -An \code{xgb.Booster} object. 
-} -\description{ -Load the instance back from \code{\link{xgb.serialize}} -} diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 0a3cc89fdde7..8dd934765004 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -339,22 +339,6 @@ test_that("training continuation works", { expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2)) }) -test_that("model serialization works", { - out_path <- file.path(tempdir(), "model_serialization") - dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads) - watchlist <- list(train = dtrain) - param <- list(objective = "binary:logistic", nthread = n_threads) - booster <- xgb.train(param, dtrain, nrounds = 4, watchlist) - raw <- xgb.serialize(booster) - saveRDS(raw, out_path) - raw <- readRDS(out_path) - - loaded <- xgb.unserialize(raw) - raw_from_loaded <- xgb.serialize(loaded) - expect_equal(raw, raw_from_loaded) - file.remove(out_path) -}) - test_that("xgb.cv works", { set.seed(11) expect_output( From 74f7f0cff94e2a4b7ed4cefefd0b2584294a76b4 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 9 Jan 2024 19:11:05 +0100 Subject: [PATCH 34/47] Update R-package/R/xgb.save.R Co-authored-by: Jiaming Yuan --- R-package/R/xgb.save.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index f0487c262b35..e1a61d1965b9 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -10,7 +10,7 @@ #' This format uses binary types for e.g. floating point numbers, thereby preventing any loss #' of precision when converting to a human-readable JSON text or similar. #' \item Extension ".json" will use plain JSON, which is a human-readable format. -#' \item Extension ".model" will use a \bold{deprecated} binary format. This format will +#' \item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will #' not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration" #' attribute that boosters might keep, nor feature names or user-specifiec attributes. #' \item If the format is not specified by passing one of the file extensions above, will From 1ede32e1286d0c8ce511e7f6e0087cc4912b10cb Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 9 Jan 2024 19:11:38 +0100 Subject: [PATCH 35/47] update docs --- R-package/man/xgb.save.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index 4beb23d866ea..0db80a120c84 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -16,7 +16,7 @@ Note that the extension of this file name determined the serialization format to This format uses binary types for e.g. floating point numbers, thereby preventing any loss of precision when converting to a human-readable JSON text or similar. \item Extension ".json" will use plain JSON, which is a human-readable format. -\item Extension ".model" will use a \bold{deprecated} binary format. This format will +\item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration" attribute that boosters might keep, nor feature names or user-specifiec attributes. 
\item If the format is not specified by passing one of the file extensions above, will From 37d6b1e2a3c05b3c31206b42c1c7bc0aba5837ae Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 9 Jan 2024 19:14:50 +0100 Subject: [PATCH 36/47] remove 'keep_extra_attributes' --- R-package/R/xgb.train.R | 38 +++++++++++--------------------------- R-package/man/xgb.train.Rd | 18 +----------------- 2 files changed, 12 insertions(+), 44 deletions(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 11244d0c1ae6..a68cb3ff6526 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -153,8 +153,7 @@ #' parameters' values. User can provide either existing or their own callback methods in order #' to customize the training process. #' -#' Note that some callbacks might try to set an evaluation log - in order to keep such logs, -#' it's necessary to pass `keep_extra_attributes = TRUE`. Be aware that these evaluation logs +#' Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs #' are kept as R attributes, and thus do not get saved when using non-R serializaters like #' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}). #' @param ... other parameters to pass to \code{params}. @@ -164,12 +163,6 @@ #' by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values. #' This parameter is only used when input is a dense matrix. #' @param weight a vector indicating the weight for each row of the input. -#' @param keep_extra_attributes Whether to keep extra R attributes in the booster object which -#' are specific to the R interface and which do not get saved along when calling functions -#' like `xgb.save`, but which get saved with R-specific serializers such as `saveRDS`. -#' -#' These attributes include, for example, the function call that was used to produce the model, -#' evaluation logs from callbacks, among others. #' #' @return #' An object of class \code{xgb.Booster}. @@ -223,12 +216,6 @@ #' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and #' not anyhow used by functions like \link{predict.xgb.Booster}. #' -#' If passing `keep_extra_attributes=TRUE`, note that the parameters passed here will be kept -#' in the R-specific attributes, but since functions like \link{xgb.parameters<-} allow changing -#' parameters in the C-level object after it has been fitted, be aware that there's no guarantee -#' that these R parameters would be synchronized with the internal booster parameters as -#' return by \link{xgb.parameters<-} or \link{xgb.config}. -#' #' @seealso #' \code{\link{callbacks}}, #' \code{\link{predict.xgb.Booster}}, @@ -314,8 +301,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, save_name = "xgboost.model", - xgb_model = NULL, callbacks = list(), - keep_extra_attributes = TRUE, ...) { + xgb_model = NULL, callbacks = list(), ...) { check.deprecation(...) 
@@ -450,18 +436,16 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), } } - if (keep_extra_attributes) { - extra_attrs <- list( - call = match.call(), - params = params, - callbacks = callbacks - ) - if (keep_evaluation_log) { - extra_attrs$evaluation_log <- evaluation_log - } - curr_attrs <- attributes(bst) - attributes(bst) <- c(curr_attrs, extra_attrs) + extra_attrs <- list( + call = match.call(), + params = params, + callbacks = callbacks + ) + if (keep_evaluation_log) { + extra_attrs$evaluation_log <- evaluation_log } + curr_attrs <- attributes(bst) + attributes(bst) <- c(curr_attrs, extra_attrs) return(bst) } diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 30cbe0da68b0..74e19b35cee1 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -20,7 +20,6 @@ xgb.train( save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), - keep_extra_attributes = TRUE, ... ) @@ -208,20 +207,11 @@ See \code{\link{callbacks}}. Some of the callbacks are automatically created dep parameters' values. User can provide either existing or their own callback methods in order to customize the training process. -\if{html}{\out{
}}\preformatted{ Note that some callbacks might try to set an evaluation log - in order to keep such logs, - it's necessary to pass `keep_extra_attributes = TRUE`. Be aware that these evaluation logs +\if{html}{\out{
}}\preformatted{ Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs are kept as R attributes, and thus do not get saved when using non-R serializaters like \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}). }\if{html}{\out{
}}} -\item{keep_extra_attributes}{Whether to keep extra R attributes in the booster object which -are specific to the R interface and which do not get saved along when calling functions -like \code{xgb.save}, but which get saved with R-specific serializers such as \code{saveRDS}. - -\if{html}{\out{
}}\preformatted{ These attributes include, for example, the function call that was used to produce the model, - evaluation logs from callbacks, among others. -}\if{html}{\out{
}}} - \item{...}{other parameters to pass to \code{params}.} \item{label}{vector of response values. Should not be provided when data is @@ -288,12 +278,6 @@ and shared between interfaces through serialization functions like \link{xgb.sav R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and not anyhow used by functions like \link{predict.xgb.Booster}. - -If passing \code{keep_extra_attributes=TRUE}, note that the parameters passed here will be kept -in the R-specific attributes, but since functions like \link{xgb.parameters<-} allow changing -parameters in the C-level object after it has been fitted, be aware that there's no guarantee -that these R parameters would be synchronized with the internal booster parameters as -return by \link{xgb.parameters<-} or \link{xgb.config}. } \examples{ data(agaricus.train, package='xgboost') From 43d938b13184393d16ae7b75f9a8ce7fb2fa7100 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 9 Jan 2024 19:17:49 +0100 Subject: [PATCH 37/47] remove .Rnw file --- R-package/vignettes/xgboost.Rnw | 223 -------------------------------- 1 file changed, 223 deletions(-) delete mode 100644 R-package/vignettes/xgboost.Rnw diff --git a/R-package/vignettes/xgboost.Rnw b/R-package/vignettes/xgboost.Rnw deleted file mode 100644 index 7edf4ace3d4f..000000000000 --- a/R-package/vignettes/xgboost.Rnw +++ /dev/null @@ -1,223 +0,0 @@ -\documentclass{article} -\RequirePackage{url} -\usepackage{hyperref} -\RequirePackage{amsmath} -\RequirePackage{natbib} -\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry} - -\makeatletter -% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting} -%\VignetteKeywords{xgboost, gbm, gradient boosting machines} -%\VignettePackage{xgboost} -% \VignetteEngine{knitr::knitr} -\makeatother - -\begin{document} -%\SweaveOpts{concordance=TRUE} - -<>= -if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE) -@ - -% -<>= -xgboost.version <- packageDescription("xgboost")$Version - -@ -% - - \begin{center} - \vspace*{6\baselineskip} - \rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt} - \rule{\textwidth}{0.4pt}\\[2\baselineskip] - {\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip] - \rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt} - \rule{\textwidth}{1.6pt}\\[2\baselineskip] - {\Large Tianqi Chen, Tong He}\\[\baselineskip] - {\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip] - {\large \today}\par - \vfill - \end{center} - -\thispagestyle{empty} - -\clearpage - -\setcounter{page}{1} - -\section{Introduction} - -This is an introductory document of using the \verb@xgboost@ package in R. - -\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient - and scalable implementation of gradient boosting framework by \citep{friedman2001greedy} \citep{friedman2000additive}. -The package includes efficient linear model solver and tree learning algorithm. -It supports various objective functions, including regression, classification -and ranking. The package is made to be extendible, so that users are also allowed to define their own objectives easily. It has several features: -\begin{enumerate} - \item{Speed: }{\verb@xgboost@ can automatically do parallel computation on - Windows and Linux, with openmp. 
It is generally over 10 times faster than - \verb@gbm@.} - \item{Input Type: }{\verb@xgboost@ takes several types of input data:} - \begin{itemize} - \item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@} - \item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@} - \item{Data File: }{Local data files} - \item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.} - \end{itemize} - \item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster - and linear booster, and is optimized for sparse input.} - \item{Customization: }{\verb@xgboost@ supports customized objective function - and evaluation function} - \item{Performance: }{\verb@xgboost@ has better performance on several different - datasets.} -\end{enumerate} - - -\section{Example with Mushroom data} - -In this section, we will illustrate some common usage of \verb@xgboost@. The -Mushroom data is cited from UCI Machine Learning Repository. \citep{Bache+Lichman:2013} - -<>= -library(xgboost) -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') -train <- agaricus.train -test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, - nrounds = 2, objective = "binary:logistic", nthread = 2) -xgb.save(bst, 'model.save') -bst = xgb.load('model.save') -xgb.parameters(bst) <- list(nthread = 2) -pred <- predict(bst, test$data) -@ - -\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model. -\verb@predict@ does prediction on the model. - -Here we can save the model to a binary local file, and load it when needed. -We can't inspect the trees inside. However we have another function to save the -model in plain text. -<>= -xgb.dump(bst, 'model.dump') -@ - -The output looks like - -\begin{verbatim} -booster[0]: -0:[f28<1.00001] yes=1,no=2,missing=2 - 1:[f108<1.00001] yes=3,no=4,missing=4 - 3:leaf=1.85965 - 4:leaf=-1.94071 - 2:[f55<1.00001] yes=5,no=6,missing=6 - 5:leaf=-1.70044 - 6:leaf=1.71218 -booster[1]: -0:[f59<1.00001] yes=1,no=2,missing=2 - 1:leaf=-6.23624 - 2:[f28<1.00001] yes=3,no=4,missing=4 - 3:leaf=-0.96853 - 4:leaf=0.784718 -\end{verbatim} - -It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@. -It speeds up \verb@xgboost@, and is needed for advanced features such as -training from initial prediction value, weighted training instance. - -We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object: -<>= -dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2) -class(dtrain) -head(getinfo(dtrain,'label')) -@ - -We can also save the matrix to a binary file. Then load it simply with -\verb@xgb.DMatrix@ -<>= -xgb.DMatrix.save(dtrain, 'xgb.DMatrix') -dtrain = xgb.DMatrix('xgb.DMatrix') -@ - -\section{Advanced Examples} - -The function \verb@xgboost@ is a simple function with less parameter, in order -to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the document a bit more carefully. - -\verb@xgb.train@ only accept a \verb@xgb.DMatrix@ object as its input, while it supports advanced features as custom objective and evaluation functions. 
- -<>= -logregobj <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - preds <- 1/(1 + exp(-preds)) - grad <- preds - labels - hess <- preds * (1 - preds) - return(list(grad = grad, hess = hess)) -} - -evalerror <- function(preds, dtrain) { - labels <- getinfo(dtrain, "label") - err <- sqrt(mean((preds-labels)^2)) - return(list(metric = "MSE", value = err)) -} - -dtest <- xgb.DMatrix(test$data, label = test$label, nthread = 2) -watchlist <- list(eval = dtest, train = dtrain) -param <- list(max_depth = 2, eta = 1, nthread = 2) - -bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror, maximize = FALSE) -@ - -The gradient and second order gradient is required for the output of customized -objective function. - -We also have \verb@slice@ for row extraction. It is useful in -cross-validation. - -For a walkthrough demo, please see \verb@R-package/demo/@ for further -details. - -\section{The Higgs Boson competition} - -We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs -Boson Machine Learning Challenge}. - -Here are the instructions to make a submission -\begin{enumerate} - \item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets} - and extract them to \verb@data/@. - \item Run scripts under \verb@xgboost/demo/kaggle-higgs/@: - \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R} - and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}. - The computation will take less than a minute on Intel i7. - \item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page} - and submit your result. -\end{enumerate} - -We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script} -to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@. -The training set contains 350000 records and 30 features. - -\verb@xgboost@ can automatically do parallel computation. On a machine with Intel -i7-4700MQ and 24GB memories, we found that \verb@xgboost@ costs about 35 seconds, which is about 20 times faster -than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was -still about two times faster than \verb@gbm@. - -Meanwhile, the result from \verb@xgboost@ reaches -\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a -single model. This results stands in the -\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the -competition. - -\bibliographystyle{jss} -\nocite{*} % list uncited references -\bibliography{xgboost} - -\end{document} - -<>= -file.remove("xgb.DMatrix") -file.remove("model.dump") -file.remove("model.save") -@ From 692e5a577974819fe1e94d178fcde2180a9e173c Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 19:13:14 +0100 Subject: [PATCH 38/47] add note about booster's R parameters --- R-package/R/xgb.train.R | 7 +++++++ R-package/man/xgb.train.Rd | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index a68cb3ff6526..a313ed32f414 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -216,6 +216,13 @@ #' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and #' not anyhow used by functions like \link{predict.xgb.Booster}. 
#' +#' Be aware that one such R attribute that is automatically added is `params` - this attribute +#' is assigned from the `params` argument to this function, and is only meant to serve as a +#' reference for what went into the booster, but is not used in other methods that take a booster +#' object - so for example, changing the booster's configuration requires calling `xgb.config<-` +#' or 'xgb.parameters<-', while simply modifying `attributes(model)$params$<...>` will have no +#' effect elsewhere. +#' #' @seealso #' \code{\link{callbacks}}, #' \code{\link{predict.xgb.Booster}}, diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 74e19b35cee1..0421b9c4a38a 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -278,6 +278,13 @@ and shared between interfaces through serialization functions like \link{xgb.sav R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and not anyhow used by functions like \link{predict.xgb.Booster}. + +Be aware that one such R attribute that is automatically added is \code{params} - this attribute +is assigned from the \code{params} argument to this function, and is only meant to serve as a +reference for what went into the booster, but is not used in other methods that take a booster +object - so for example, changing the booster's configuration requires calling \verb{xgb.config<-} +or 'xgb.parameters<-', while simply modifying \verb{attributes(model)$params$<...>} will have no +effect elsewhere. } \examples{ data(agaricus.train, package='xgboost') From c161999169c977468fbb36cff852508cef85c3d6 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 19:28:24 +0100 Subject: [PATCH 39/47] user SerializeToBuffer for internal serialization --- R-package/src/xgboost_R.cc | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index b8ad4b229442..63f36ad6a0f2 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -737,15 +737,13 @@ void XGBAltrepSetPointer(SEXP R_altrepped_obj, BoosterHandle handle) { R_RegisterCFinalizerEx(R_ptr, _BoosterFinalizer, TRUE); } -const char *ubj_json_format_str = "{\"format\": \"ubj\"}"; - SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) { R_API_BEGIN(); BoosterHandle handle = R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj)); char const *serialized_bytes; bst_ulong serialized_length; - CHECK_CALL(XGBoosterSaveModelToBuffer( - handle, ubj_json_format_str, &serialized_length, &serialized_bytes)); + CHECK_CALL(XGBoosterSerializeToBuffer( + handle, &serialized_length, &serialized_bytes)); SEXP R_state = Rf_protect(Rf_allocVector(RAWSXP, serialized_length)); if (serialized_length != 0) { std::memcpy(RAW(R_state), serialized_bytes, serialized_length); @@ -761,9 +759,9 @@ SEXP XGBAltrepDeserializer_R(SEXP unused, SEXP R_state) { R_API_BEGIN(); BoosterHandle handle = nullptr; CHECK_CALL(XGBoosterCreate(nullptr, 0, &handle)); - int res_code = XGBoosterLoadModelFromBuffer(handle, - RAW(R_state), - Rf_xlength(R_state)); + int res_code = XGBoosterUnserializeFromBuffer(handle, + RAW(R_state), + Rf_xlength(R_state)); if (res_code != 0) { XGBoosterFree(handle); } @@ -794,14 +792,14 @@ SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) { SEXP out = Rf_protect(XGBMakeEmptyAltrep()); char const *serialized_bytes; bst_ulong serialized_length; - 
CHECK_CALL(XGBoosterSaveModelToBuffer( + CHECK_CALL(XGBoosterSerializeToBuffer( R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj)), - ubj_json_format_str, &serialized_length, &serialized_bytes)); + &serialized_length, &serialized_bytes)); BoosterHandle new_handle = nullptr; CHECK_CALL(XGBoosterCreate(nullptr, 0, &new_handle)); - int res_code = XGBoosterLoadModelFromBuffer(new_handle, - serialized_bytes, - serialized_length); + int res_code = XGBoosterUnserializeFromBuffer(new_handle, + serialized_bytes, + serialized_length); if (res_code != 0) { XGBoosterFree(new_handle); } From feedce500e8376d374562cbef3caffc5edbcd379 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 19:33:48 +0100 Subject: [PATCH 40/47] add test for serialization of config --- R-package/tests/testthat/test_io.R | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/R-package/tests/testthat/test_io.R b/R-package/tests/testthat/test_io.R index 9c05bdd1015d..3d4bf31c2c23 100644 --- a/R-package/tests/testthat/test_io.R +++ b/R-package/tests/testthat/test_io.R @@ -47,3 +47,24 @@ test_that("saveRDS preserves C and R attributes", { expect_equal(attributes(model_new)$my_attr, attributes(model)$my_attr) expect_equal(xgb.attr(model, "c_attr"), xgb.attr(model_new, "c_attr")) }) + +test_that("R serializers keep C config", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = 1) + model <- xgb.train( + data = dm, + params = list( + tree_method = "approx", + nthread = 1, + max_depth = 2 + ), + nrounds = 3 + ) + model_new <- unserialize(serialize(model, NULL)) + expect_equal( + xgb.config(model)$learner$gradient_booster$gbtree_train_param$tree_method, + xgb.config(model_new)$learner$gradient_booster$gbtree_train_param$tree_method + ) +}) From a02abfc2270ee84296541b75747ce74c54761256 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 19:35:20 +0100 Subject: [PATCH 41/47] check more attributes --- R-package/tests/testthat/test_io.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/tests/testthat/test_io.R b/R-package/tests/testthat/test_io.R index 3d4bf31c2c23..36a6d7572f2f 100644 --- a/R-package/tests/testthat/test_io.R +++ b/R-package/tests/testthat/test_io.R @@ -67,4 +67,5 @@ test_that("R serializers keep C config", { xgb.config(model)$learner$gradient_booster$gbtree_train_param$tree_method, xgb.config(model_new)$learner$gradient_booster$gbtree_train_param$tree_method ) + expect_equal(variable.names(model), variable.names(model_new)) }) From 3285ed632a8798909326cc97e230f93747a45cc2 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 20:19:37 +0100 Subject: [PATCH 42/47] rewrite compatibility note for serialization --- R-package/R/utils.R | 45 +++++++++++++---- .../a-compatibility-note-for-saveRDS-save.Rd | 48 ++++++++++++++----- 2 files changed, 71 insertions(+), 22 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index d288e0562d4b..7ed6195cded0 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -349,21 +349,41 @@ xgb.createFolds <- function(y, k) { #' @name xgboost-deprecated NULL -#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of -#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}. +#' @title Model Serialization and Compatibility +#' @description #' -#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or -#' \code{\link[base]{save}}) to persist R objects to the disk. 
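Switching the ALTREP serializer to XGBoosterSerializeToBuffer means that an R-level serialize()/unserialize() round trip now carries the booster's C-level configuration, which the new test checks through tree_method. A rough sketch of how that differs from a raw-model round trip, reusing the mtcars setup from the test; the commented results are what the accompanying documentation leads one to expect:

    library(xgboost)
    data(mtcars)
    dm <- xgb.DMatrix(as.matrix(mtcars[, -1]), label = mtcars$mpg, nthread = 1)
    model <- xgb.train(
      data = dm,
      params = list(tree_method = "approx", nthread = 1, max_depth = 2),
      nrounds = 3
    )
    # Round trip through R's serializer: the configuration survives.
    model_r <- unserialize(serialize(model, NULL))
    xgb.config(model_r)$learner$gradient_booster$gbtree_train_param$tree_method  # "approx"
    # Round trip through the raw model buffer: only the model itself is kept,
    # so the configuration is re-initialized on load and need not match.
    model_raw <- xgb.load.raw(xgb.save.raw(model, raw_format = "ubj"))
    xgb.config(model_raw)$learner$gradient_booster$gbtree_train_param$tree_method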
While it is possible to persist -#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if -#' the model is to be accessed in the future. If you train a model with the current version of -#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be -#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future -#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead. +#' When it comes to serializing XGBoost models, it's possible to use R serializers such as +#' \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides +#' its own serializers with perhaps better compability guarantees and which allow loading +#' said models in other language bindings of XGBoost. +#' +#' Note however that an `xgb.Booster` object might also keep:\itemize{ +#' \item Additional model configuration attributes (accessible through \link{xgb.config}), +#' which might be used during model fitting but are not used in e.g. `predict`, feature importance, +#' or plotting methods. +#' \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs, +#' which are kept as a `data.table` object, accessible through `attributes(model)$evaluation_log` +#' if present. +#' } +#' +#' The first ones (configuration attributes) do not have the same compatibility guarantees as +#' attributes that are set and accessed through \link{xgb.attributes} - that is, such attributes +#' might be lost after loading the booster in a different XGBoost version, regardless of the +#' serializer that was used. Note that these are saved when using \link{xgb.save}, but not when +#' using \link{xgb.save.raw}. +#' +#' The second ones (R attributes) are not part of standard XGBoost model structure, and thus are +#' not saved when using XGBoost's own serializers. These attributes are only used for informational +#' purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R +#' call that produced the model, but are otherwise not used for prediction / importance / plotting / etc. +#' These R attributes are only preserved when using R's own serializers. #' #' Note that XGBoost models in R starting from version `2.1.0` and onwards, and XGBoost models #' before version `2.1.0`; have a very different R object structure and are incompatible with #' each other. Hence, models that were saved with R serializers live `saveRDS` or `save` before -#' version `2.1.0` will not work with latter `xgboost` versions and vice versa. +#' version `2.1.0` will not work with latter `xgboost` versions and vice versa. Be aware that +#' the structure of R model objects could in theory again in the future, so XGBoost's serializers +#' should be preferred for very long-term storage. #' #' Furthermore, note that using the package `qs` for serialization will require version 0.26 or #' higher of said package, and will have the same compatibility restrictions as R serializers. @@ -379,6 +399,11 @@ NULL #' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model #' as part of another R object. #' +#' Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such +#' as evaluation logs, but note that future compatibility of such objects is outside XGBoost's +#' control as it relies on R's serialization format (see e.g. the details section in +#' \link{serialize} and \link{save} from base R). 
+#' #' For more details and explanation about model persistence and archival, consult the page #' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. #' diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index a54f19be6c8c..46caf4cf8a4d 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -2,26 +2,45 @@ % Please edit documentation in R/utils.R \name{a-compatibility-note-for-saveRDS-save} \alias{a-compatibility-note-for-saveRDS-save} -\title{Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of -models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.} +\title{Model Serialization and Compatibility} \description{ -It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or -\code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist -\code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if -the model is to be accessed in the future. If you train a model with the current version of -XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be -accessible in later releases of XGBoost. To ensure that your model can be accessed in future -releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead. +When it comes to serializing XGBoost models, it's possible to use R serializers such as +\link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides +its own serializers with perhaps better compability guarantees and which allow loading +said models in other language bindings of XGBoost. + +Note however that an \code{xgb.Booster} object might also keep:\itemize{ +\item Additional model configuration attributes (accessible through \link{xgb.config}), +which might be used during model fitting but are not used in e.g. \code{predict}, feature importance, +or plotting methods. +\item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs, +which are kept as a \code{data.table} object, accessible through \code{attributes(model)$evaluation_log} +if present. } -\details{ + +The first ones (configuration attributes) do not have the same compatibility guarantees as +attributes that are set and accessed through \link{xgb.attributes} - that is, such attributes +might be lost after loading the booster in a different XGBoost version, regardless of the +serializer that was used. Note that these are saved when using \link{xgb.save}, but not when +using \link{xgb.save.raw}. + +The second ones (R attributes) are not part of standard XGBoost model structure, and thus are +not saved when using XGBoost's own serializers. These attributes are only used for informational +purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R +call that produced the model, but are otherwise not used for prediction / importance / plotting / etc. +These R attributes are only preserved when using R's own serializers. + Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and XGBoost models before version \verb{2.1.0}; have a very different R object structure and are incompatible with each other. 
Hence, models that were saved with R serializers live \code{saveRDS} or \code{save} before -version \verb{2.1.0} will not work with latter \code{xgboost} versions and vice versa. +version \verb{2.1.0} will not work with latter \code{xgboost} versions and vice versa. Be aware that +the structure of R model objects could in theory again in the future, so XGBoost's serializers +should be preferred for very long-term storage. Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or higher of said package, and will have the same compatibility restrictions as R serializers. - +} +\details{ Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into the JSON format by specifying the JSON extension. To read the model back, use \code{\link{xgb.load}}. @@ -32,6 +51,11 @@ re-construct the corresponding model. To read the model back, use \code{\link{xg The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model as part of another R object. +Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such +as evaluation logs, but note that future compatibility of such objects is outside XGBoost's +control as it relies on R's serialization format (see e.g. the details section in +\link{serialize} and \link{save} from base R). + For more details and explanation about model persistence and archival, consult the page \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. } From 808225611f05d243aa05f971c372ffbb0da41db8 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 20:23:01 +0100 Subject: [PATCH 43/47] improve wording --- R-package/R/utils.R | 2 +- R-package/man/a-compatibility-note-for-saveRDS-save.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 7ed6195cded0..e252afb03c45 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -357,7 +357,7 @@ NULL #' its own serializers with perhaps better compability guarantees and which allow loading #' said models in other language bindings of XGBoost. #' -#' Note however that an `xgb.Booster` object might also keep:\itemize{ +#' Note that an `xgb.Booster` object, outside of its core components, might also keep:\itemize{ #' \item Additional model configuration attributes (accessible through \link{xgb.config}), #' which might be used during model fitting but are not used in e.g. `predict`, feature importance, #' or plotting methods. diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index 46caf4cf8a4d..eccf05be4062 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -9,7 +9,7 @@ When it comes to serializing XGBoost models, it's possible to use R serializers its own serializers with perhaps better compability guarantees and which allow loading said models in other language bindings of XGBoost. -Note however that an \code{xgb.Booster} object might also keep:\itemize{ +Note that an \code{xgb.Booster} object, outside of its core components, might also keep:\itemize{ \item Additional model configuration attributes (accessible through \link{xgb.config}), which might be used during model fitting but are not used in e.g. \code{predict}, feature importance, or plotting methods. 
From 6fa79378fe068cb4c5baef63bd8b486c79fa1a06 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 20:31:01 +0100 Subject: [PATCH 44/47] update note about attributes in xgb.save --- R-package/R/utils.R | 5 +++-- R-package/man/a-compatibility-note-for-saveRDS-save.Rd | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index e252afb03c45..9ce98f27511e 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -369,8 +369,9 @@ NULL #' The first ones (configuration attributes) do not have the same compatibility guarantees as #' attributes that are set and accessed through \link{xgb.attributes} - that is, such attributes #' might be lost after loading the booster in a different XGBoost version, regardless of the -#' serializer that was used. Note that these are saved when using \link{xgb.save}, but not when -#' using \link{xgb.save.raw}. +#' serializer that was used. These are saved when using \link{saveRDS}, but will be discarded +#' if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's +#' serializers from its public interface. #' #' The second ones (R attributes) are not part of standard XGBoost model structure, and thus are #' not saved when using XGBoost's own serializers. These attributes are only used for informational diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index eccf05be4062..5caed5f4fb20 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -21,8 +21,9 @@ if present. The first ones (configuration attributes) do not have the same compatibility guarantees as attributes that are set and accessed through \link{xgb.attributes} - that is, such attributes might be lost after loading the booster in a different XGBoost version, regardless of the -serializer that was used. Note that these are saved when using \link{xgb.save}, but not when -using \link{xgb.save.raw}. +serializer that was used. These are saved when using \link{saveRDS}, but will be discarded +if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's +serializers from its public interface. The second ones (R attributes) are not part of standard XGBoost model structure, and thus are not saved when using XGBoost's own serializers. These attributes are only used for informational From e02ed8fcaa5f71b0a5f9d9552233007887a5f0fa Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 20:44:49 +0100 Subject: [PATCH 45/47] Update R-package/R/utils.R Co-authored-by: Jiaming Yuan --- R-package/R/utils.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 9ce98f27511e..a137e5106d76 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -354,13 +354,13 @@ NULL #' #' When it comes to serializing XGBoost models, it's possible to use R serializers such as #' \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides -#' its own serializers with perhaps better compability guarantees and which allow loading +#' its own serializers with better compatibility guarantees, which allow loading #' said models in other language bindings of XGBoost. 
#' #' Note that an `xgb.Booster` object, outside of its core components, might also keep:\itemize{ -#' \item Additional model configuration attributes (accessible through \link{xgb.config}), -#' which might be used during model fitting but are not used in e.g. `predict`, feature importance, -#' or plotting methods. +#' \item Additional model configuration (accessible through \link{xgb.config}), +#' which includes model fitting parameters like `max_depth` and runtime parameters like `nthread`. +#' These are not necessarily useful for prediction/importance/plotting. #' \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs, #' which are kept as a `data.table` object, accessible through `attributes(model)$evaluation_log` #' if present. From ff70221eed78d30ad317a3436ee5dd661dd71d9c Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 20:45:27 +0100 Subject: [PATCH 46/47] Update R-package/R/utils.R Co-authored-by: Jiaming Yuan --- R-package/R/utils.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index a137e5106d76..945d86132a08 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -366,25 +366,25 @@ NULL #' if present. #' } #' -#' The first ones (configuration attributes) do not have the same compatibility guarantees as -#' attributes that are set and accessed through \link{xgb.attributes} - that is, such attributes +#' The first one (configurations) does not have the same compatibility guarantees as +#' the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration #' might be lost after loading the booster in a different XGBoost version, regardless of the #' serializer that was used. These are saved when using \link{saveRDS}, but will be discarded #' if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's -#' serializers from its public interface. +#' serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}. #' -#' The second ones (R attributes) are not part of standard XGBoost model structure, and thus are +#' The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are #' not saved when using XGBoost's own serializers. These attributes are only used for informational #' purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R #' call that produced the model, but are otherwise not used for prediction / importance / plotting / etc. -#' These R attributes are only preserved when using R's own serializers. +#' These R attributes are only preserved when using R's serializers. #' #' Note that XGBoost models in R starting from version `2.1.0` and onwards, and XGBoost models #' before version `2.1.0`; have a very different R object structure and are incompatible with #' each other. Hence, models that were saved with R serializers live `saveRDS` or `save` before #' version `2.1.0` will not work with latter `xgboost` versions and vice versa. Be aware that -#' the structure of R model objects could in theory again in the future, so XGBoost's serializers -#' should be preferred for very long-term storage. +#' the structure of R model objects could in theory change again in the future, so XGBoost's serializers +#' should be preferred for long-term storage. 
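To make the distinction concrete, a rough sketch assuming the mtcars data, arbitrary parameters, and temporary file paths: R attributes such as the evaluation log travel only with R's serializers, while attributes set through xgb.attr()/xgb.attributes() are part of the model and also survive XGBoost's own serializers.

    library(xgboost)
    data(mtcars)
    dm <- xgb.DMatrix(as.matrix(mtcars[, -1]), label = mtcars$mpg, nthread = 1)
    model <- xgb.train(
      data = dm,
      params = list(nthread = 1, max_depth = 2),
      nrounds = 3,
      watchlist = list(train = dm)          # fills the R-side 'evaluation_log'
    )
    xgb.attr(model, "trained_by") <- "me"   # C-level attribute, stored in the model

    # R serializer: both kinds of attributes are kept.
    fname_rds <- file.path(tempdir(), "model.rds")
    saveRDS(model, fname_rds)
    model_rds <- readRDS(fname_rds)
    attributes(model_rds)$evaluation_log    # data.table with the training metric
    xgb.attr(model_rds, "trained_by")       # "me"

    # XGBoost serializer: the C-level attribute is kept, the R attributes are not.
    fname_json <- file.path(tempdir(), "model.json")
    xgb.save(model, fname_json)
    model_json <- xgb.load(fname_json)
    xgb.attr(model_json, "trained_by")      # "me"
    attributes(model_json)$evaluation_log   # NULL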
#' #' Furthermore, note that using the package `qs` for serialization will require version 0.26 or #' higher of said package, and will have the same compatibility restrictions as R serializers. From d133258592e89314ef74a897b4636cf08d30e1e2 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 10 Jan 2024 20:46:13 +0100 Subject: [PATCH 47/47] rebuild docs --- .../a-compatibility-note-for-saveRDS-save.Rd | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index 5caed5f4fb20..860f4f0c1580 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -6,37 +6,37 @@ \description{ When it comes to serializing XGBoost models, it's possible to use R serializers such as \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides -its own serializers with perhaps better compability guarantees and which allow loading +its own serializers with better compatibility guarantees, which allow loading said models in other language bindings of XGBoost. Note that an \code{xgb.Booster} object, outside of its core components, might also keep:\itemize{ -\item Additional model configuration attributes (accessible through \link{xgb.config}), -which might be used during model fitting but are not used in e.g. \code{predict}, feature importance, -or plotting methods. +\item Additional model configuration (accessible through \link{xgb.config}), +which includes model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}. +These are not necessarily useful for prediction/importance/plotting. \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs, which are kept as a \code{data.table} object, accessible through \code{attributes(model)$evaluation_log} if present. } -The first ones (configuration attributes) do not have the same compatibility guarantees as -attributes that are set and accessed through \link{xgb.attributes} - that is, such attributes +The first one (configurations) does not have the same compatibility guarantees as +the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration might be lost after loading the booster in a different XGBoost version, regardless of the serializer that was used. These are saved when using \link{saveRDS}, but will be discarded if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's -serializers from its public interface. +serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}. -The second ones (R attributes) are not part of standard XGBoost model structure, and thus are +The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are not saved when using XGBoost's own serializers. These attributes are only used for informational purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R call that produced the model, but are otherwise not used for prediction / importance / plotting / etc. -These R attributes are only preserved when using R's own serializers. +These R attributes are only preserved when using R's serializers. 
Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and XGBoost models before version \verb{2.1.0}; have a very different R object structure and are incompatible with each other. Hence, models that were saved with R serializers like \code{saveRDS} or \code{save} before version \verb{2.1.0} will not work with later \code{xgboost} versions and vice versa. Be aware that -the structure of R model objects could in theory again in the future, so XGBoost's serializers -should be preferred for very long-term storage. +the structure of R model objects could in theory change again in the future, so XGBoost's serializers +should be preferred for long-term storage. Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or higher of said package, and will have the same compatibility restrictions as R serializers.
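A compact sketch of the persistence options this help page describes; file names, the wrapping list, and parameter values below are made up for illustration:

    library(xgboost)
    data(agaricus.train, package = "xgboost")
    dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
    bst <- xgb.train(
      params = list(objective = "binary:logistic", max_depth = 2, nthread = 1),
      data = dtrain,
      nrounds = 2
    )

    # Stand-alone file; a .json extension opts into the JSON format.
    fname <- file.path(tempdir(), "model.json")
    xgb.save(bst, fname)
    bst_from_file <- xgb.load(fname)

    # Raw bytes, handy for embedding the model inside another R object.
    payload <- list(created_at = Sys.time(), model_bytes = xgb.save.raw(bst, raw_format = "ubj"))
    bst_from_raw <- xgb.load.raw(payload$model_bytes)

    # saveRDS()/readRDS() additionally keep R-specific attributes such as the
    # evaluation log, at the cost of relying on R's own serialization format.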