Skip to content

Commit

Permalink
Use hist as the default tree method. (#9320)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Jun 27, 2023
1 parent bc267dd commit f479871
Show file tree
Hide file tree
Showing 10 changed files with 139 additions and 179 deletions.
43 changes: 26 additions & 17 deletions R-package/tests/testthat/test_basic.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,18 @@ test_that("dart prediction works", {
rnorm(100)

set.seed(1994)
booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
rate_drop = 0.5, one_drop = TRUE,
eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
booster_by_xgboost <- xgboost(
data = d,
label = y,
max_depth = 2,
booster = "dart",
rate_drop = 0.5,
one_drop = TRUE,
eta = 1,
nthread = 2,
nrounds = nrounds,
objective = "reg:squarederror"
)
pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
Expand All @@ -97,19 +106,19 @@ test_that("dart prediction works", {

set.seed(1994)
dtrain <- xgb.DMatrix(data = d, info = list(label = y))
booster_by_train <- xgb.train(params = list(
booster = "dart",
max_depth = 2,
eta = 1,
rate_drop = 0.5,
one_drop = TRUE,
nthread = 1,
tree_method = "exact",
objective = "reg:squarederror"
),
data = dtrain,
nrounds = nrounds
)
booster_by_train <- xgb.train(
params = list(
booster = "dart",
max_depth = 2,
eta = 1,
rate_drop = 0.5,
one_drop = TRUE,
nthread = 1,
objective = "reg:squarederror"
),
data = dtrain,
nrounds = nrounds
)
pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
Expand Down Expand Up @@ -399,7 +408,7 @@ test_that("colsample_bytree works", {
xgb.importance(model = bst)
# If colsample_bytree works properly, a variety of features should be used
# in the 100 trees
expect_gte(nrow(xgb.importance(model = bst)), 30)
expect_gte(nrow(xgb.importance(model = bst)), 28)
})

test_that("Configuration works", {
Expand Down
5 changes: 4 additions & 1 deletion R-package/tests/testthat/test_update.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ test_that("updating the model works", {
watchlist <- list(train = dtrain, test = dtest)

# no-subsampling
p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
p1 <- list(
objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
updater = "grow_colmaker,prune"
)
set.seed(11)
bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
tr1 <- xgb.model.dt.tree(model = bst1)
Expand Down
96 changes: 17 additions & 79 deletions src/gbm/gbtree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gbtree);

void GBTree::Configure(Args const& cfg) {
this->cfg_ = cfg;
std::string updater_seq = tparam_.updater_seq;
tparam_.UpdateAllowUnknown(cfg);
tree_param_.UpdateAllowUnknown(cfg);
Expand Down Expand Up @@ -78,10 +77,9 @@ void GBTree::Configure(Args const& cfg) {

monitor_.Init("GBTree");

specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
[](std::pair<std::string, std::string> const& arg) {
return arg.first == "updater";
});
specified_updater_ = std::any_of(
cfg.cbegin(), cfg.cend(),
[](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });

if (specified_updater_ && !showed_updater_warning_) {
LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
Expand All @@ -93,111 +91,52 @@ void GBTree::Configure(Args const& cfg) {
showed_updater_warning_ = true;
}

if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
}
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
this->ConfigureUpdaters();

if (updater_seq != tparam_.updater_seq) {
updaters_.clear();
this->InitUpdater(cfg);
} else {
for (auto &up : updaters_) {
for (auto& up : updaters_) {
up->Configure(cfg);
}
}

configured_ = true;
}

// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
// whether external memory is used and how large is dataset. We can remove the dependency
// on DMatrix once `hist` tree method can handle external memory so that we can make it
// default.
void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
CHECK(this->configured_);
std::string updater_seq = tparam_.updater_seq;
CHECK(tparam_.GetInitialised());

tparam_.UpdateAllowUnknown(cfg);

this->PerformTreeMethodHeuristic(fmat);
this->ConfigureUpdaters();

// initialize the updaters only when needed.
if (updater_seq != tparam_.updater_seq) {
LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
this->updaters_.clear();
this->InitUpdater(cfg);
}
}

void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
if (specified_updater_) {
// This method is disabled when `updater` parameter is explicitly
// set, since only experts are expected to do so.
return;
}
if (model_.learner_model_param->IsVectorLeaf()) {
CHECK(tparam_.tree_method == TreeMethod::kHist)
<< "Only the hist tree method is supported for building multi-target trees with vector "
"leaf.";
}

// tparam_ is set before calling this function.
if (tparam_.tree_method != TreeMethod::kAuto) {
return;
}

if (collective::IsDistributed()) {
LOG(INFO) << "Tree method is automatically selected to be 'approx' "
"for distributed training.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (!fmat->SingleColBlock()) {
LOG(INFO) << "Tree method is automatically set to 'approx' "
"since external-memory data matrix is used.";
tparam_.tree_method = TreeMethod::kApprox;
} else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
/* Choose tree_method='approx' automatically for large data matrix */
LOG(INFO) << "Tree method is automatically selected to be "
"'approx' for faster speed. To use old behavior "
"(exact greedy algorithm on single machine), "
"set tree_method to 'exact'.";
tparam_.tree_method = TreeMethod::kApprox;
} else {
tparam_.tree_method = TreeMethod::kExact;
}
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
}

void GBTree::ConfigureUpdaters() {
if (specified_updater_) {
return;
}
// `updater` parameter was manually specified
/* Choose updaters according to tree_method parameters */
switch (tparam_.tree_method) {
case TreeMethod::kAuto:
// Use heuristic to choose between 'exact' and 'approx' This
// choice is carried out in PerformTreeMethodHeuristic() before
// calling this function.
case TreeMethod::kAuto: // Use hist as default in 2.0
case TreeMethod::kHist: {
tparam_.updater_seq = "grow_quantile_histmaker";
break;
}
case TreeMethod::kApprox:
tparam_.updater_seq = "grow_histmaker";
break;
case TreeMethod::kExact:
tparam_.updater_seq = "grow_colmaker,prune";
break;
case TreeMethod::kHist: {
LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
"grow_quantile_histmaker.";
tparam_.updater_seq = "grow_quantile_histmaker";
break;
}
case TreeMethod::kGPUHist: {
common::AssertGPUSupport();
tparam_.updater_seq = "grow_gpu_hist";
break;
}
default:
LOG(FATAL) << "Unknown tree_method ("
<< static_cast<int>(tparam_.tree_method) << ") detected";
LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
<< ") detected";
}
}

Expand Down Expand Up @@ -253,7 +192,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) {
TreesOneIter new_trees;
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
ConfigureWithKnownData(this->cfg_, p_fmat);
monitor_.Start("BoostNewTrees");

// Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
Expand Down
19 changes: 3 additions & 16 deletions src/gbm/gbtree.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,7 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);

namespace xgboost {
namespace gbm {

namespace xgboost::gbm {
/*! \brief training parameters */
struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
/*! \brief tree updater sequence */
Expand Down Expand Up @@ -192,12 +190,8 @@ class GBTree : public GradientBooster {
: GradientBooster{ctx}, model_(booster_config, ctx_) {}

void Configure(const Args& cfg) override;
// Revise `tree_method` and `updater` parameters after seeing the training
// data matrix, only useful when tree_method is auto.
void PerformTreeMethodHeuristic(DMatrix* fmat);
/*! \brief Map `tree_method` parameter to `updater` parameter */
void ConfigureUpdaters();
void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);

/**
* \brief Optionally update the leaf value.
Expand All @@ -222,11 +216,7 @@ class GBTree : public GradientBooster {
return tparam_;
}

void Load(dmlc::Stream* fi) override {
model_.Load(fi);
this->cfg_.clear();
}

void Load(dmlc::Stream* fi) override { model_.Load(fi); }
void Save(dmlc::Stream* fo) const override {
model_.Save(fo);
}
Expand Down Expand Up @@ -416,8 +406,6 @@ class GBTree : public GradientBooster {
bool showed_updater_warning_ {false};
bool specified_updater_ {false};
bool configured_ {false};
// configurations for tree
Args cfg_;
// the updaters that can be applied to each of tree
std::vector<std::unique_ptr<TreeUpdater>> updaters_;
// Predictors
Expand All @@ -431,7 +419,6 @@ class GBTree : public GradientBooster {
common::Monitor monitor_;
};

} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

#endif // XGBOOST_GBM_GBTREE_H_
1 change: 1 addition & 0 deletions tests/ci_build/lint_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class LintersPaths:
"tests/python/test_predict.py",
"tests/python/test_quantile_dmatrix.py",
"tests/python/test_tree_regularization.py",
"tests/python/test_shap.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/test_distributed/test_with_spark/",
"tests/test_distributed/test_gpu_with_spark/",
Expand Down
2 changes: 2 additions & 0 deletions tests/cpp/test_learner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,8 @@ TEST(Learner, Seed) {
TEST(Learner, ConstantSeed) {
auto m = RandomDataGenerator{10, 10, 0}.GenerateDMatrix(true);
std::unique_ptr<Learner> learner{Learner::Create({m})};
// Use exact as it doesn't initialize column sampler at construction, which alters the rng.
learner->SetParam("tree_method", "exact");
learner->Configure(); // seed the global random

std::uniform_real_distribution<float> dist;
Expand Down
5 changes: 2 additions & 3 deletions tests/python/test_demos.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
def test_basic_walkthrough():
script = os.path.join(PYTHON_DEMO_DIR, 'basic_walkthrough.py')
cmd = ['python', script]
subprocess.check_call(cmd)
os.remove('dump.nice.txt')
os.remove('dump.raw.txt')
with tempfile.TemporaryDirectory() as tmpdir:
subprocess.check_call(cmd, cwd=tmpdir)


@pytest.mark.skipif(**tm.no_matplotlib())
Expand Down
Loading

0 comments on commit f479871

Please sign in to comment.