Use hist as the default tree method. #9320

Merged 10 commits on Jun 27, 2023. Changes shown are from all commits.
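The user-visible effect: with `tree_method` left unset ("auto"), the booster now resolves to the hist updater (grow_quantile_histmaker) instead of heuristically picking "exact" or "approx" from the dataset size and layout. A minimal sketch of the new behavior in the Python package follows; the data and parameter values are illustrative, not taken from this PR.

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(1994)
X = rng.random((256, 8))
y = rng.random(256)
dtrain = xgb.DMatrix(X, label=y)

# With this change, leaving tree_method unset ("auto") trains with the
# hist updater rather than the old size-based exact/approx heuristic.
booster = xgb.train({"objective": "reg:squarederror"}, dtrain, num_boost_round=10)

# The previous single-machine default remains available on request:
booster_exact = xgb.train(
    {"objective": "reg:squarederror", "tree_method": "exact"},
    dtrain,
    num_boost_round=10,
)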
43 changes: 26 additions & 17 deletions R-package/tests/testthat/test_basic.R
@@ -85,9 +85,18 @@ test_that("dart prediction works", {
   rnorm(100)

   set.seed(1994)
-  booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
-                                rate_drop = 0.5, one_drop = TRUE,
-                                eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
+  booster_by_xgboost <- xgboost(
+    data = d,
+    label = y,
+    max_depth = 2,
+    booster = "dart",
+    rate_drop = 0.5,
+    one_drop = TRUE,
+    eta = 1,
+    nthread = 2,
+    nrounds = nrounds,
+    objective = "reg:squarederror"
+  )
   pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
   pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
   expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
@@ -97,19 +106,19 @@ test_that("dart prediction works", {

   set.seed(1994)
   dtrain <- xgb.DMatrix(data = d, info = list(label = y))
-  booster_by_train <- xgb.train(params = list(
-    booster = "dart",
-    max_depth = 2,
-    eta = 1,
-    rate_drop = 0.5,
-    one_drop = TRUE,
-    nthread = 1,
-    tree_method = "exact",
-    objective = "reg:squarederror"
-  ),
-  data = dtrain,
-  nrounds = nrounds
-  )
+  booster_by_train <- xgb.train(
+    params = list(
+      booster = "dart",
+      max_depth = 2,
+      eta = 1,
+      rate_drop = 0.5,
+      one_drop = TRUE,
+      nthread = 1,
+      objective = "reg:squarederror"
+    ),
+    data = dtrain,
+    nrounds = nrounds
+  )
   pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
   pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
   pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
@@ -399,7 +408,7 @@ test_that("colsample_bytree works", {
   xgb.importance(model = bst)
   # If colsample_bytree works properly, a variety of features should be used
   # in the 100 trees
-  expect_gte(nrow(xgb.importance(model = bst)), 30)
+  expect_gte(nrow(xgb.importance(model = bst)), 28)
 })

 test_that("Configuration works", {
5 changes: 4 additions & 1 deletion R-package/tests/testthat/test_update.R
@@ -13,7 +13,10 @@ test_that("updating the model works", {
   watchlist <- list(train = dtrain, test = dtest)

   # no-subsampling
-  p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
+  p1 <- list(
+    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
+    updater = "grow_colmaker,prune"
+  )
   set.seed(11)
   bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
   tr1 <- xgb.model.dt.tree(model = bst1)
96 changes: 17 additions & 79 deletions src/gbm/gbtree.cc
@@ -39,7 +39,6 @@ namespace xgboost::gbm {
 DMLC_REGISTRY_FILE_TAG(gbtree);

 void GBTree::Configure(Args const& cfg) {
-  this->cfg_ = cfg;
   std::string updater_seq = tparam_.updater_seq;
   tparam_.UpdateAllowUnknown(cfg);
   tree_param_.UpdateAllowUnknown(cfg);
@@ -78,10 +77,9 @@ void GBTree::Configure(Args const& cfg) {

   monitor_.Init("GBTree");

-  specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
-                                   [](std::pair<std::string, std::string> const& arg) {
-                                     return arg.first == "updater";
-                                   });
+  specified_updater_ = std::any_of(
+      cfg.cbegin(), cfg.cend(),
+      [](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });

   if (specified_updater_ && !showed_updater_warning_) {
     LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
@@ -93,111 +91,52 @@
     showed_updater_warning_ = true;
   }

+  if (model_.learner_model_param->IsVectorLeaf()) {
+    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
+        << "Only the hist tree method is supported for building multi-target trees with vector "
+           "leaf.";
+  }
+  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
   this->ConfigureUpdaters();

   if (updater_seq != tparam_.updater_seq) {
     updaters_.clear();
     this->InitUpdater(cfg);
   } else {
-    for (auto &up : updaters_) {
+    for (auto& up : updaters_) {
       up->Configure(cfg);
     }
   }

   configured_ = true;
 }

-// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
-// whether external memory is used and how large is dataset. We can remove the dependency
-// on DMatrix once `hist` tree method can handle external memory so that we can make it
-// default.
-void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
-  CHECK(this->configured_);
-  std::string updater_seq = tparam_.updater_seq;
-  CHECK(tparam_.GetInitialised());
-
-  tparam_.UpdateAllowUnknown(cfg);
-
-  this->PerformTreeMethodHeuristic(fmat);
-  this->ConfigureUpdaters();
-
-  // initialize the updaters only when needed.
-  if (updater_seq != tparam_.updater_seq) {
-    LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
-    this->updaters_.clear();
-    this->InitUpdater(cfg);
-  }
-}
-
-void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
-  if (specified_updater_) {
-    // This method is disabled when `updater` parameter is explicitly
-    // set, since only experts are expected to do so.
-    return;
-  }
-  if (model_.learner_model_param->IsVectorLeaf()) {
-    CHECK(tparam_.tree_method == TreeMethod::kHist)
-        << "Only the hist tree method is supported for building multi-target trees with vector "
-           "leaf.";
-  }
-
-  // tparam_ is set before calling this function.
-  if (tparam_.tree_method != TreeMethod::kAuto) {
-    return;
-  }
-
-  if (collective::IsDistributed()) {
-    LOG(INFO) << "Tree method is automatically selected to be 'approx' "
-                 "for distributed training.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else if (!fmat->SingleColBlock()) {
-    LOG(INFO) << "Tree method is automatically set to 'approx' "
-                 "since external-memory data matrix is used.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
-    /* Choose tree_method='approx' automatically for large data matrix */
-    LOG(INFO) << "Tree method is automatically selected to be "
-                 "'approx' for faster speed. To use old behavior "
-                 "(exact greedy algorithm on single machine), "
-                 "set tree_method to 'exact'.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else {
-    tparam_.tree_method = TreeMethod::kExact;
-  }
-  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
-}
-
 void GBTree::ConfigureUpdaters() {
   if (specified_updater_) {
     return;
   }
   // `updater` parameter was manually specified
   /* Choose updaters according to tree_method parameters */
   switch (tparam_.tree_method) {
-    case TreeMethod::kAuto:
-      // Use heuristic to choose between 'exact' and 'approx' This
-      // choice is carried out in PerformTreeMethodHeuristic() before
-      // calling this function.
+    case TreeMethod::kAuto:  // Use hist as default in 2.0
+    case TreeMethod::kHist: {
+      tparam_.updater_seq = "grow_quantile_histmaker";
+      break;
+    }
     case TreeMethod::kApprox:
       tparam_.updater_seq = "grow_histmaker";
       break;
     case TreeMethod::kExact:
       tparam_.updater_seq = "grow_colmaker,prune";
       break;
-    case TreeMethod::kHist: {
-      LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
-                   "grow_quantile_histmaker.";
-      tparam_.updater_seq = "grow_quantile_histmaker";
-      break;
-    }
     case TreeMethod::kGPUHist: {
       common::AssertGPUSupport();
       tparam_.updater_seq = "grow_gpu_hist";
       break;
     }
     default:
-      LOG(FATAL) << "Unknown tree_method ("
-                 << static_cast<int>(tparam_.tree_method) << ") detected";
+      LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
+                 << ") detected";
   }
 }

@@ -253,7 +192,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
                      PredictionCacheEntry* predt, ObjFunction const* obj) {
   TreesOneIter new_trees;
   bst_target_t const n_groups = model_.learner_model_param->OutputLength();
-  ConfigureWithKnownData(this->cfg_, p_fmat);
   monitor_.Start("BoostNewTrees");

   // Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
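To restate the mapping in ConfigureUpdaters() above: "auto" and "hist" now both resolve to grow_quantile_histmaker, "approx" to grow_histmaker, "exact" to grow_colmaker,prune, and "gpu_hist" to grow_gpu_hist. Passing `updater` directly still bypasses this mapping entirely, which is what the "DANGER AHEAD" warning in Configure() guards. A hedged sketch, reusing dtrain from the example above:

# Naming an updater sequence explicitly sets specified_updater_, skips the
# tree_method-to-updater mapping, and logs the "DANGER AHEAD" warning.
booster = xgb.train(
    {"objective": "reg:squarederror", "updater": "grow_colmaker,prune"},
    dtrain,
    num_boost_round=10,
)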
19 changes: 3 additions & 16 deletions src/gbm/gbtree.h
@@ -56,9 +56,7 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
 DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);

-namespace xgboost {
-namespace gbm {
-
+namespace xgboost::gbm {
 /*! \brief training parameters */
 struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
   /*! \brief tree updater sequence */
@@ -192,12 +190,8 @@ class GBTree : public GradientBooster {
       : GradientBooster{ctx}, model_(booster_config, ctx_) {}

   void Configure(const Args& cfg) override;
-  // Revise `tree_method` and `updater` parameters after seeing the training
-  // data matrix, only useful when tree_method is auto.
-  void PerformTreeMethodHeuristic(DMatrix* fmat);
   /*! \brief Map `tree_method` parameter to `updater` parameter */
   void ConfigureUpdaters();
-  void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);

   /**
    * \brief Optionally update the leaf value.
@@ -222,11 +216,7 @@ class GBTree : public GradientBooster {
     return tparam_;
   }

-  void Load(dmlc::Stream* fi) override {
-    model_.Load(fi);
-    this->cfg_.clear();
-  }
-
+  void Load(dmlc::Stream* fi) override { model_.Load(fi); }
   void Save(dmlc::Stream* fo) const override {
     model_.Save(fo);
   }
@@ -416,8 +406,6 @@ class GBTree : public GradientBooster {
   bool showed_updater_warning_ {false};
   bool specified_updater_ {false};
   bool configured_ {false};
-  // configurations for tree
-  Args cfg_;
   // the updaters that can be applied to each of tree
   std::vector<std::unique_ptr<TreeUpdater>> updaters_;
   // Predictors
@@ -431,7 +419,6 @@ class GBTree : public GradientBooster {
   common::Monitor monitor_;
 };

-}  // namespace gbm
-}  // namespace xgboost
+}  // namespace xgboost::gbm

 #endif  // XGBOOST_GBM_GBTREE_H_
1 change: 1 addition & 0 deletions tests/ci_build/lint_python.py
@@ -23,6 +23,7 @@ class LintersPaths:
         "tests/python/test_predict.py",
         "tests/python/test_quantile_dmatrix.py",
         "tests/python/test_tree_regularization.py",
+        "tests/python/test_shap.py",
         "tests/python-gpu/test_gpu_data_iterator.py",
         "tests/test_distributed/test_with_spark/",
         "tests/test_distributed/test_gpu_with_spark/",
2 changes: 2 additions & 0 deletions tests/cpp/test_learner.cc
@@ -379,6 +379,8 @@ TEST(Learner, Seed) {
 TEST(Learner, ConstantSeed) {
   auto m = RandomDataGenerator{10, 10, 0}.GenerateDMatrix(true);
   std::unique_ptr<Learner> learner{Learner::Create({m})};
+  // Use exact as it doesn't initialize column sampler at construction, which alters the rng.
+  learner->SetParam("tree_method", "exact");
   learner->Configure();  // seed the global random

   std::uniform_real_distribution<float> dist;
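Pinning "exact" keeps the ConstantSeed test meaningful: per the comment above, a hist-based learner initializes a column sampler at construction, which draws from the global RNG and shifts every subsequent draw. A rough Python analogue of the determinism being protected, reusing dtrain from the first sketch (illustrative, not part of the PR):

# Two trainings with the same seed should produce identical models.
params = {"objective": "reg:squarederror", "seed": 2023}
b1 = xgb.train(params, dtrain, num_boost_round=5)
b2 = xgb.train(params, dtrain, num_boost_round=5)
assert (b1.predict(dtrain) == b2.predict(dtrain)).all()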
5 changes: 2 additions & 3 deletions tests/python/test_demos.py
@@ -18,9 +18,8 @@
 def test_basic_walkthrough():
     script = os.path.join(PYTHON_DEMO_DIR, 'basic_walkthrough.py')
     cmd = ['python', script]
-    subprocess.check_call(cmd)
-    os.remove('dump.nice.txt')
-    os.remove('dump.raw.txt')
+    with tempfile.TemporaryDirectory() as tmpdir:
+        subprocess.check_call(cmd, cwd=tmpdir)


 @pytest.mark.skipif(**tm.no_matplotlib())
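The demo test now runs inside a throwaway working directory instead of deleting the dump files by hand, so nothing the script writes can leak into the repository. A self-contained sketch of the same pattern (the inline script here is illustrative):

import os
import subprocess
import tempfile

# Run a program that writes into its working directory from inside a
# temporary directory; its output disappears with the directory.
with tempfile.TemporaryDirectory() as tmpdir:
    subprocess.check_call(
        ["python", "-c", "open('dump.txt', 'w').write('demo output')"],
        cwd=tmpdir,
    )
assert not os.path.exists("dump.txt")  # nothing leaked into our own cwd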