Use hist as the default tree method. #9320

Merged 10 commits on Jun 27, 2023. Changes shown are from all commits.
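The user-visible effect: with `tree_method` left unset ("auto"), the booster now resolves to the hist updater (grow_quantile_histmaker) instead of heuristically picking "exact" or "approx" from the dataset size and layout. A minimal sketch of the new behavior in the Python package follows; the data and parameter values are illustrative, not taken from this PR.

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(1994)
X = rng.random((256, 8))
y = rng.random(256)
dtrain = xgb.DMatrix(X, label=y)

# With this change, leaving tree_method unset ("auto") trains with the
# hist updater rather than the old size-based exact/approx heuristic.
booster = xgb.train({"objective": "reg:squarederror"}, dtrain, num_boost_round=10)

# The previous single-machine default remains available on request:
booster_exact = xgb.train(
    {"objective": "reg:squarederror", "tree_method": "exact"},
    dtrain,
    num_boost_round=10,
)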
43 changes: 26 additions & 17 deletions R-package/tests/testthat/test_basic.R
@@ -85,9 +85,18 @@ test_that("dart prediction works", {
   rnorm(100)

   set.seed(1994)
-  booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
-                                rate_drop = 0.5, one_drop = TRUE,
-                                eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
+  booster_by_xgboost <- xgboost(
+    data = d,
+    label = y,
+    max_depth = 2,
+    booster = "dart",
+    rate_drop = 0.5,
+    one_drop = TRUE,
+    eta = 1,
+    nthread = 2,
+    nrounds = nrounds,
+    objective = "reg:squarederror"
+  )
   pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
   pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
   expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
@@ -97,19 +106,19 @@ test_that("dart prediction works", {

   set.seed(1994)
   dtrain <- xgb.DMatrix(data = d, info = list(label = y))
-  booster_by_train <- xgb.train(params = list(
-    booster = "dart",
-    max_depth = 2,
-    eta = 1,
-    rate_drop = 0.5,
-    one_drop = TRUE,
-    nthread = 1,
-    tree_method = "exact",
-    objective = "reg:squarederror"
-  ),
-  data = dtrain,
-  nrounds = nrounds
-  )
+  booster_by_train <- xgb.train(
+    params = list(
+      booster = "dart",
+      max_depth = 2,
+      eta = 1,
+      rate_drop = 0.5,
+      one_drop = TRUE,
+      nthread = 1,
+      objective = "reg:squarederror"
+    ),
+    data = dtrain,
+    nrounds = nrounds
+  )
   pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
   pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
   pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
@@ -399,7 +408,7 @@ test_that("colsample_bytree works", {
   xgb.importance(model = bst)
   # If colsample_bytree works properly, a variety of features should be used
   # in the 100 trees
-  expect_gte(nrow(xgb.importance(model = bst)), 30)
+  expect_gte(nrow(xgb.importance(model = bst)), 28)
 })

 test_that("Configuration works", {
5 changes: 4 additions & 1 deletion R-package/tests/testthat/test_update.R
@@ -13,7 +13,10 @@ test_that("updating the model works", {
   watchlist <- list(train = dtrain, test = dtest)

   # no-subsampling
-  p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
+  p1 <- list(
+    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
+    updater = "grow_colmaker,prune"
+  )
   set.seed(11)
   bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
   tr1 <- xgb.model.dt.tree(model = bst1)
96 changes: 17 additions & 79 deletions src/gbm/gbtree.cc
@@ -39,7 +39,6 @@ namespace xgboost::gbm {
 DMLC_REGISTRY_FILE_TAG(gbtree);

 void GBTree::Configure(Args const& cfg) {
-  this->cfg_ = cfg;
   std::string updater_seq = tparam_.updater_seq;
   tparam_.UpdateAllowUnknown(cfg);
   tree_param_.UpdateAllowUnknown(cfg);
@@ -78,10 +77,9 @@ void GBTree::Configure(Args const& cfg) {

   monitor_.Init("GBTree");

-  specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
-                                   [](std::pair<std::string, std::string> const& arg) {
-                                     return arg.first == "updater";
-                                   });
+  specified_updater_ = std::any_of(
+      cfg.cbegin(), cfg.cend(),
+      [](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });

   if (specified_updater_ && !showed_updater_warning_) {
     LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
@@ -93,111 +91,52 @@
     showed_updater_warning_ = true;
   }

+  if (model_.learner_model_param->IsVectorLeaf()) {
+    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
+        << "Only the hist tree method is supported for building multi-target trees with vector "
+           "leaf.";
+  }
+  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
   this->ConfigureUpdaters();

   if (updater_seq != tparam_.updater_seq) {
     updaters_.clear();
     this->InitUpdater(cfg);
   } else {
-    for (auto &up : updaters_) {
+    for (auto& up : updaters_) {
       up->Configure(cfg);
     }
   }

   configured_ = true;
 }

-// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
-// whether external memory is used and how large is dataset. We can remove the dependency
-// on DMatrix once `hist` tree method can handle external memory so that we can make it
-// default.
-void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
-  CHECK(this->configured_);
-  std::string updater_seq = tparam_.updater_seq;
-  CHECK(tparam_.GetInitialised());
-
-  tparam_.UpdateAllowUnknown(cfg);
-
-  this->PerformTreeMethodHeuristic(fmat);
-  this->ConfigureUpdaters();
-
-  // initialize the updaters only when needed.
-  if (updater_seq != tparam_.updater_seq) {
-    LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
-    this->updaters_.clear();
-    this->InitUpdater(cfg);
-  }
-}
-
-void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
-  if (specified_updater_) {
-    // This method is disabled when `updater` parameter is explicitly
-    // set, since only experts are expected to do so.
-    return;
-  }
-  if (model_.learner_model_param->IsVectorLeaf()) {
-    CHECK(tparam_.tree_method == TreeMethod::kHist)
-        << "Only the hist tree method is supported for building multi-target trees with vector "
-           "leaf.";
-  }
-
-  // tparam_ is set before calling this function.
-  if (tparam_.tree_method != TreeMethod::kAuto) {
-    return;
-  }
-
-  if (collective::IsDistributed()) {
-    LOG(INFO) << "Tree method is automatically selected to be 'approx' "
-                 "for distributed training.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else if (!fmat->SingleColBlock()) {
-    LOG(INFO) << "Tree method is automatically set to 'approx' "
-                 "since external-memory data matrix is used.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
-    /* Choose tree_method='approx' automatically for large data matrix */
-    LOG(INFO) << "Tree method is automatically selected to be "
-                 "'approx' for faster speed. To use old behavior "
-                 "(exact greedy algorithm on single machine), "
-                 "set tree_method to 'exact'.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else {
-    tparam_.tree_method = TreeMethod::kExact;
-  }
-  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
-}
-
 void GBTree::ConfigureUpdaters() {
   if (specified_updater_) {
     return;
   }
   // `updater` parameter was manually specified
   /* Choose updaters according to tree_method parameters */
   switch (tparam_.tree_method) {
-    case TreeMethod::kAuto:
-      // Use heuristic to choose between 'exact' and 'approx' This
-      // choice is carried out in PerformTreeMethodHeuristic() before
-      // calling this function.
+    case TreeMethod::kAuto:  // Use hist as default in 2.0
+    case TreeMethod::kHist: {
+      tparam_.updater_seq = "grow_quantile_histmaker";
+      break;
+    }
     case TreeMethod::kApprox:
       tparam_.updater_seq = "grow_histmaker";
       break;
     case TreeMethod::kExact:
       tparam_.updater_seq = "grow_colmaker,prune";
       break;
-    case TreeMethod::kHist: {
-      LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
-                   "grow_quantile_histmaker.";
-      tparam_.updater_seq = "grow_quantile_histmaker";
-      break;
-    }
     case TreeMethod::kGPUHist: {
       common::AssertGPUSupport();
       tparam_.updater_seq = "grow_gpu_hist";
       break;
     }
     default:
-      LOG(FATAL) << "Unknown tree_method ("
-                 << static_cast<int>(tparam_.tree_method) << ") detected";
+      LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
+                 << ") detected";
   }
 }

@@ -253,7 +192,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
                      PredictionCacheEntry* predt, ObjFunction const* obj) {
   TreesOneIter new_trees;
   bst_target_t const n_groups = model_.learner_model_param->OutputLength();
-  ConfigureWithKnownData(this->cfg_, p_fmat);
   monitor_.Start("BoostNewTrees");

   // Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
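To restate the mapping in ConfigureUpdaters() above: "auto" and "hist" now both resolve to grow_quantile_histmaker, "approx" to grow_histmaker, "exact" to grow_colmaker,prune, and "gpu_hist" to grow_gpu_hist. Passing `updater` directly still bypasses this mapping entirely, which is what the "DANGER AHEAD" warning in Configure() guards. A hedged sketch, reusing dtrain from the example above:

# Naming an updater sequence explicitly sets specified_updater_, skips the
# tree_method-to-updater mapping, and logs the "DANGER AHEAD" warning.
booster = xgb.train(
    {"objective": "reg:squarederror", "updater": "grow_colmaker,prune"},
    dtrain,
    num_boost_round=10,
)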
19 changes: 3 additions & 16 deletions src/gbm/gbtree.h
@@ -56,9 +56,7 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
 DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);

-namespace xgboost {
-namespace gbm {
-
+namespace xgboost::gbm {
 /*! \brief training parameters */
 struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
   /*! \brief tree updater sequence */
@@ -192,12 +190,8 @@ class GBTree : public GradientBooster {
       : GradientBooster{ctx}, model_(booster_config, ctx_) {}

   void Configure(const Args& cfg) override;
-  // Revise `tree_method` and `updater` parameters after seeing the training
-  // data matrix, only useful when tree_method is auto.
-  void PerformTreeMethodHeuristic(DMatrix* fmat);
   /*! \brief Map `tree_method` parameter to `updater` parameter */
   void ConfigureUpdaters();
-  void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);

   /**
    * \brief Optionally update the leaf value.
@@ -222,11 +216,7 @@ class GBTree : public GradientBooster {
     return tparam_;
   }

-  void Load(dmlc::Stream* fi) override {
-    model_.Load(fi);
-    this->cfg_.clear();
-  }
-
+  void Load(dmlc::Stream* fi) override { model_.Load(fi); }
   void Save(dmlc::Stream* fo) const override {
     model_.Save(fo);
   }
@@ -416,8 +406,6 @@ class GBTree : public GradientBooster {
   bool showed_updater_warning_ {false};
   bool specified_updater_ {false};
   bool configured_ {false};
-  // configurations for tree
-  Args cfg_;
   // the updaters that can be applied to each of tree
   std::vector<std::unique_ptr<TreeUpdater>> updaters_;
   // Predictors
@@ -431,7 +419,6 @@ class GBTree : public GradientBooster {
   common::Monitor monitor_;
 };

-}  // namespace gbm
-}  // namespace xgboost
+}  // namespace xgboost::gbm

 #endif  // XGBOOST_GBM_GBTREE_H_
1 change: 1 addition & 0 deletions tests/ci_build/lint_python.py
@@ -23,6 +23,7 @@ class LintersPaths:
         "tests/python/test_predict.py",
         "tests/python/test_quantile_dmatrix.py",
         "tests/python/test_tree_regularization.py",
+        "tests/python/test_shap.py",
         "tests/python-gpu/test_gpu_data_iterator.py",
         "tests/test_distributed/test_with_spark/",
         "tests/test_distributed/test_gpu_with_spark/",
2 changes: 2 additions & 0 deletions tests/cpp/test_learner.cc
@@ -379,6 +379,8 @@ TEST(Learner, Seed) {
 TEST(Learner, ConstantSeed) {
   auto m = RandomDataGenerator{10, 10, 0}.GenerateDMatrix(true);
   std::unique_ptr<Learner> learner{Learner::Create({m})};
+  // Use exact as it doesn't initialize column sampler at construction, which alters the rng.
+  learner->SetParam("tree_method", "exact");
   learner->Configure();  // seed the global random

   std::uniform_real_distribution<float> dist;
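Pinning "exact" keeps the ConstantSeed test meaningful: per the comment above, a hist-based learner initializes a column sampler at construction, which draws from the global RNG and shifts every subsequent draw. A rough Python analogue of the determinism being protected, reusing dtrain from the first sketch (illustrative, not part of the PR):

# Two trainings with the same seed should produce identical models.
params = {"objective": "reg:squarederror", "seed": 2023}
b1 = xgb.train(params, dtrain, num_boost_round=5)
b2 = xgb.train(params, dtrain, num_boost_round=5)
assert (b1.predict(dtrain) == b2.predict(dtrain)).all()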
5 changes: 2 additions & 3 deletions tests/python/test_demos.py
@@ -18,9 +18,8 @@
 def test_basic_walkthrough():
     script = os.path.join(PYTHON_DEMO_DIR, 'basic_walkthrough.py')
     cmd = ['python', script]
-    subprocess.check_call(cmd)
-    os.remove('dump.nice.txt')
-    os.remove('dump.raw.txt')
+    with tempfile.TemporaryDirectory() as tmpdir:
+        subprocess.check_call(cmd, cwd=tmpdir)


 @pytest.mark.skipif(**tm.no_matplotlib())
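The demo test now runs inside a throwaway working directory instead of deleting the dump files by hand, so nothing the script writes can leak into the repository. A self-contained sketch of the same pattern (the inline script here is illustrative):

import os
import subprocess
import tempfile

# Run a program that writes into its working directory from inside a
# temporary directory; its output disappears with the directory.
with tempfile.TemporaryDirectory() as tmpdir:
    subprocess.check_call(
        ["python", "-c", "open('dump.txt', 'w').write('demo output')"],
        cwd=tmpdir,
    )
assert not os.path.exists("dump.txt")  # nothing leaked into our own cwd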