From 4904d674ed4436ac2bdbf6dfb0752497fc94e251 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Thu, 22 Jun 2023 11:41:29 +0800
Subject: [PATCH 01/10] Use hist as the default tree method.

---
 src/gbm/gbtree.cc         | 64 +++++++--------------------------------
 src/gbm/gbtree.h          | 18 +++--------
 tests/cpp/test_learner.cc |  2 ++
 3 files changed, 17 insertions(+), 67 deletions(-)

diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index f67c053448e3..ee752d5bd5bb 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -39,7 +39,6 @@ namespace xgboost::gbm {
 DMLC_REGISTRY_FILE_TAG(gbtree);
 
 void GBTree::Configure(Args const& cfg) {
-  this->cfg_ = cfg;
   std::string updater_seq = tparam_.updater_seq;
   tparam_.UpdateAllowUnknown(cfg);
   tree_param_.UpdateAllowUnknown(cfg);
@@ -78,10 +77,9 @@ void GBTree::Configure(Args const& cfg) {
 
   monitor_.Init("GBTree");
 
-  specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
-                                   [](std::pair<std::string, std::string> const& arg) {
-                                     return arg.first == "updater";
-                                   });
+  specified_updater_ = std::any_of(
+      cfg.cbegin(), cfg.cend(),
+      [](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });
 
   if (specified_updater_ && !showed_updater_warning_) {
     LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
@@ -93,12 +91,14 @@ void GBTree::Configure(Args const& cfg) {
     showed_updater_warning_ = true;
   }
 
+  this->PerformTreeMethodHeuristic();
   this->ConfigureUpdaters();
+
   if (updater_seq != tparam_.updater_seq) {
     updaters_.clear();
     this->InitUpdater(cfg);
   } else {
-    for (auto &up : updaters_) {
+    for (auto& up : updaters_) {
       up->Configure(cfg);
     }
   }
@@ -106,36 +106,14 @@ void GBTree::Configure(Args const& cfg) {
   configured_ = true;
 }
 
-// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
-// whether external memory is used and how large is dataset. We can remove the dependency
-// on DMatrix once `hist` tree method can handle external memory so that we can make it
-// default.
-void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
-  CHECK(this->configured_);
-  std::string updater_seq = tparam_.updater_seq;
-  CHECK(tparam_.GetInitialised());
-
-  tparam_.UpdateAllowUnknown(cfg);
-
-  this->PerformTreeMethodHeuristic(fmat);
-  this->ConfigureUpdaters();
-
-  // initialize the updaters only when needed.
-  if (updater_seq != tparam_.updater_seq) {
-    LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
-    this->updaters_.clear();
-    this->InitUpdater(cfg);
-  }
-}
-
-void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
+void GBTree::PerformTreeMethodHeuristic() {
   if (specified_updater_) {
     // This method is disabled when `updater` parameter is explicitly
     // set, since only experts are expected to do so.
     return;
   }
   if (model_.learner_model_param->IsVectorLeaf()) {
-    CHECK(tparam_.tree_method == TreeMethod::kHist)
+    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
         << "Only the hist tree method is supported for building multi-target trees with vector "
            "leaf.";
   }
@@ -145,24 +123,7 @@ void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
     return;
   }
 
-  if (collective::IsDistributed()) {
-    LOG(INFO) << "Tree method is automatically selected to be 'approx' "
-                 "for distributed training.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else if (!fmat->SingleColBlock()) {
-    LOG(INFO) << "Tree method is automatically set to 'approx' "
-                 "since external-memory data matrix is used.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
-    /* Choose tree_method='approx' automatically for large data matrix */
-    LOG(INFO) << "Tree method is automatically selected to be "
-                 "'approx' for faster speed. To use old behavior "
-                 "(exact greedy algorithm on single machine), "
-                 "set tree_method to 'exact'.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else {
-    tparam_.tree_method = TreeMethod::kExact;
-  }
+  tparam_.tree_method = TreeMethod::kHist;
 
   LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
 }
@@ -185,8 +146,6 @@ void GBTree::ConfigureUpdaters() {
       tparam_.updater_seq = "grow_colmaker,prune";
       break;
     case TreeMethod::kHist: {
-      LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
-                   "grow_quantile_histmaker.";
       tparam_.updater_seq = "grow_quantile_histmaker";
       break;
     }
@@ -196,8 +155,8 @@ void GBTree::ConfigureUpdaters() {
       break;
     }
     default:
-      LOG(FATAL) << "Unknown tree_method ("
-                 << static_cast<int>(tparam_.tree_method) << ") detected";
+      LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
+                 << ") detected";
   }
 }
@@ -253,7 +212,6 @@
 void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
                      PredictionCacheEntry* predt, ObjFunction const* obj) {
   TreesOneIter new_trees;
   bst_target_t const n_groups = model_.learner_model_param->OutputLength();
-  ConfigureWithKnownData(this->cfg_, p_fmat);
   monitor_.Start("BoostNewTrees");
   // Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 6e7da77ac1d6..9bad1bd1be31 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -56,9 +56,7 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
 DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
 
-namespace xgboost {
-namespace gbm {
-
+namespace xgboost::gbm {
 /*! \brief training parameters */
 struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
   /*! \brief tree updater sequence */
@@ -194,10 +192,9 @@ class GBTree : public GradientBooster {
   void Configure(const Args& cfg) override;
   // Revise `tree_method` and `updater` parameters after seeing the training
   // data matrix, only useful when tree_method is auto.
-  void PerformTreeMethodHeuristic(DMatrix* fmat);
+  void PerformTreeMethodHeuristic();
   /*! \brief Map `tree_method` parameter to `updater` parameter */
   void ConfigureUpdaters();
-  void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);
 
   /**
    * \brief Optionally update the leaf value.
@@ -222,11 +219,7 @@ class GBTree : public GradientBooster {
     return tparam_;
   }
 
-  void Load(dmlc::Stream* fi) override {
-    model_.Load(fi);
-    this->cfg_.clear();
-  }
-
+  void Load(dmlc::Stream* fi) override { model_.Load(fi); }
   void Save(dmlc::Stream* fo) const override {
     model_.Save(fo);
   }
@@ -416,8 +409,6 @@ class GBTree : public GradientBooster {
   bool showed_updater_warning_ {false};
   bool specified_updater_ {false};
   bool configured_ {false};
-  // configurations for tree
-  Args cfg_;
   // the updaters that can be applied to each of tree
   std::vector<std::unique_ptr<TreeUpdater>> updaters_;
   // Predictors
@@ -431,7 +422,6 @@ class GBTree : public GradientBooster {
   common::Monitor monitor_;
 };
 
-}  // namespace gbm
-}  // namespace xgboost
+}  // namespace xgboost::gbm
 
 #endif  // XGBOOST_GBM_GBTREE_H_
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index a3bb30fcd871..5c561d2a4b08 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -379,6 +379,8 @@ TEST(Learner, Seed) {
 TEST(Learner, ConstantSeed) {
   auto m = RandomDataGenerator{10, 10, 0}.GenerateDMatrix(true);
   std::unique_ptr<Learner> learner{Learner::Create({m})};
+  // Use exact as it doesn't initialize column sampler at construction, which alters the rng.
+  learner->SetParam("tree_method", "exact");
   learner->Configure();  // seed the global random
 
   std::uniform_real_distribution<float> dist;

From 2cd8d15afd6b37a503f2a2db1c4d7f079509fe7f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 26 Jun 2023 10:40:13 +0800
Subject: [PATCH 02/10] immutable.

---
 src/gbm/gbtree.cc                 | 40 ++++++++-----------------------
 src/gbm/gbtree.h                  |  3 ---
 tests/python/test_with_sklearn.py |  4 ++--
 3 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index ee752d5bd5bb..2d24184d617d 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -91,7 +91,12 @@ void GBTree::Configure(Args const& cfg) {
     showed_updater_warning_ = true;
   }
 
-  this->PerformTreeMethodHeuristic();
+  if (model_.learner_model_param->IsVectorLeaf()) {
+    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
+        << "Only the hist tree method is supported for building multi-target trees with vector "
+           "leaf.";
+  }
+  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
   this->ConfigureUpdaters();
 
   if (updater_seq != tparam_.updater_seq) {
@@ -106,27 +111,6 @@ void GBTree::Configure(Args const& cfg) {
   configured_ = true;
 }
 
-void GBTree::PerformTreeMethodHeuristic() {
-  if (specified_updater_) {
-    // This method is disabled when `updater` parameter is explicitly
-    // set, since only experts are expected to do so.
-    return;
-  }
-  if (model_.learner_model_param->IsVectorLeaf()) {
-    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
-        << "Only the hist tree method is supported for building multi-target trees with vector "
-           "leaf.";
-  }
-
-  // tparam_ is set before calling this function.
-  if (tparam_.tree_method != TreeMethod::kAuto) {
-    return;
-  }
-
-  tparam_.tree_method = TreeMethod::kHist;
-  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
-}
-
 void GBTree::ConfigureUpdaters() {
   if (specified_updater_) {
     return;
   }
@@ -134,21 +118,17 @@ void GBTree::ConfigureUpdaters() {
   // `updater` parameter was manually specified
   /* Choose updaters according to tree_method parameters */
   switch (tparam_.tree_method) {
-    case TreeMethod::kAuto:
-      // Use heuristic to choose between 'exact' and 'approx' This
-      // choice is carried out in PerformTreeMethodHeuristic() before
-      // calling this function.
+    case TreeMethod::kAuto:  // Use hist as default in 2.0
+    case TreeMethod::kHist: {
+      tparam_.updater_seq = "grow_quantile_histmaker";
       break;
+    }
     case TreeMethod::kApprox:
      tparam_.updater_seq = "grow_histmaker";
       break;
     case TreeMethod::kExact:
       tparam_.updater_seq = "grow_colmaker,prune";
       break;
-    case TreeMethod::kHist: {
-      tparam_.updater_seq = "grow_quantile_histmaker";
-      break;
-    }
     case TreeMethod::kGPUHist: {
       common::AssertGPUSupport();
       tparam_.updater_seq = "grow_gpu_hist";
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 9bad1bd1be31..dc5fc975debe 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -190,9 +190,6 @@ class GBTree : public GradientBooster {
       : GradientBooster{ctx}, model_(booster_config, ctx_) {}
 
   void Configure(const Args& cfg) override;
-  // Revise `tree_method` and `updater` parameters after seeing the training
-  // data matrix, only useful when tree_method is auto.
-  void PerformTreeMethodHeuristic();
   /*! \brief Map `tree_method` parameter to `updater` parameter */
   void ConfigureUpdaters();
 
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index d1915267b966..8e202a4d574c 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -750,7 +750,7 @@ def get_tm(clf: xgb.XGBClassifier) -> str:
         ]["tree_method"]
         return tm
 
-    assert get_tm(clf) == "exact"
+    assert get_tm(clf) == "auto"  # Kept as auto, immutable since 2.0
 
     clf = pickle.loads(pickle.dumps(clf))
 
@@ -758,7 +758,7 @@ def get_tm(clf: xgb.XGBClassifier) -> str:
     assert clf.n_estimators == 2
     assert clf.get_params()["tree_method"] is None
     assert clf.get_params()["n_estimators"] == 2
-    assert get_tm(clf) == "exact"  # preserved for pickle
+    assert get_tm(clf) == "auto"  # preserved for pickle
 
     clf = save_load(clf)

From e6e961ab69bf3a23f00ca3f2f6c10339c13e0a02 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 26 Jun 2023 18:40:45 +0800
Subject: [PATCH 03/10] cleanup the file.

---
 tests/python/test_demos.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py
index c54f35046f8a..58cfb0d4c058 100644
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -18,9 +18,8 @@
 def test_basic_walkthrough():
     script = os.path.join(PYTHON_DEMO_DIR, 'basic_walkthrough.py')
     cmd = ['python', script]
-    subprocess.check_call(cmd)
-    os.remove('dump.nice.txt')
-    os.remove('dump.raw.txt')
+    with tempfile.TemporaryDirectory() as tmpdir:
+        subprocess.check_call(cmd, cwd=tmpdir)
 
 
 @pytest.mark.skipif(**tm.no_matplotlib())

From 627d491485c27dbd4acc71907a88c9f51d24acb0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 26 Jun 2023 18:46:35 +0800
Subject: [PATCH 04/10] Fix test.
---
 tests/python/test_with_sklearn.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 8e202a4d574c..26d18493cc7c 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -475,18 +475,22 @@ def test_rf_regression():
     run_housing_rf_regression("hist")
 
 
-def test_parameter_tuning():
+@pytest.mark.parametrize("tree_method", ["exact", "hist", "approx"])
+def test_parameter_tuning(tree_method: str) -> None:
     from sklearn.datasets import fetch_california_housing
     from sklearn.model_selection import GridSearchCV
 
     X, y = fetch_california_housing(return_X_y=True)
-    xgb_model = xgb.XGBRegressor(learning_rate=0.1)
-    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4],
-                                   'n_estimators': [50, 200]},
-                       cv=2, verbose=1)
-    clf.fit(X, y)
-    assert clf.best_score_ < 0.7
-    assert clf.best_params_ == {'n_estimators': 200, 'max_depth': 4}
+    reg = xgb.XGBRegressor(learning_rate=0.1, tree_method=tree_method)
+    grid_cv = GridSearchCV(
+        reg, {"max_depth": [2, 4], "n_estimators": [50, 200]}, cv=2, verbose=1
+    )
+    grid_cv.fit(X, y)
+    assert grid_cv.best_score_ < 0.7
+    assert grid_cv.best_params_ == {
+        "n_estimators": 200,
+        "max_depth": 4 if tree_method == "exact" else 2,
+    }
 
 
 def test_regression_with_custom_objective():

From 12b15e7258cc0d16e6d3f9b2c93ebf3315101138 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 26 Jun 2023 19:19:31 +0800
Subject: [PATCH 05/10] Remove tm.

---
 R-package/tests/testthat/test_basic.R | 41 ++++++++++++++++-----------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 94cd1ded31e8..5d86e24061d2 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -85,9 +85,18 @@ test_that("dart prediction works", {
     rnorm(100)
 
   set.seed(1994)
-  booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
-                                rate_drop = 0.5, one_drop = TRUE,
-                                eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
+  booster_by_xgboost <- xgboost(
+    data = d,
+    label = y,
+    max_depth = 2,
+    booster = "dart",
+    rate_drop = 0.5,
+    one_drop = TRUE,
+    eta = 1,
+    nthread = 2,
+    nrounds = nrounds,
+    objective = "reg:squarederror"
+  )
   pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
   pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
   expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
@@ -97,19 +106,19 @@ test_that("dart prediction works", {
 
   set.seed(1994)
   dtrain <- xgb.DMatrix(data = d, info = list(label = y))
-  booster_by_train <- xgb.train(params = list(
-    booster = "dart",
-    max_depth = 2,
-    eta = 1,
-    rate_drop = 0.5,
-    one_drop = TRUE,
-    nthread = 1,
-    tree_method = "exact",
-    objective = "reg:squarederror"
-  ),
-  data = dtrain,
-  nrounds = nrounds
-  )
+  booster_by_train <- xgb.train(
+    params = list(
+      booster = "dart",
+      max_depth = 2,
+      eta = 1,
+      rate_drop = 0.5,
+      one_drop = TRUE,
+      nthread = 1,
+      objective = "reg:squarederror"
+    ),
+    data = dtrain,
+    nrounds = nrounds
+  )
   pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
   pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
   pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)

From 700effeca507809f0e5d74d074658961efd59bc2 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 26 Jun 2023 20:42:44 +0800
Subject: [PATCH 06/10] Specify updater.

---
 R-package/tests/testthat/test_update.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R
index 887ffeb06d84..c961bab1a930 100644
--- a/R-package/tests/testthat/test_update.R
+++ b/R-package/tests/testthat/test_update.R
@@ -13,7 +13,10 @@ test_that("updating the model works", {
   watchlist <- list(train = dtrain, test = dtest)
 
   # no-subsampling
-  p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
+  p1 <- list(
+    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
+    updater = "grow_colmaker,prune"
+  )
   set.seed(11)
   bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
   tr1 <- xgb.model.dt.tree(model = bst1)

From 3b52c76d419b6a77fac8c3893f45d5caf859cc4d Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 26 Jun 2023 20:53:22 +0800
Subject: [PATCH 07/10] aft test.

---
 tests/python/test_survival.py | 50 ++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/tests/python/test_survival.py b/tests/python/test_survival.py
index 35de79ce6134..e5ca30fffd07 100644
--- a/tests/python/test_survival.py
+++ b/tests/python/test_survival.py
@@ -1,6 +1,6 @@
 import json
 import os
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple, cast
 
 import numpy as np
 import pytest
@@ -62,8 +62,8 @@ def test_aft_survival_toy_data(
     X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1))
     dmat, y_lower, y_upper = toy_data
 
-    # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
-    # the corresponding predicted label (y_pred)
+    # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper)
+    # includes the corresponding predicted label (y_pred)
     acc_rec = []
 
     class Callback(xgb.callback.TrainingCallback):
        def __init__(self):
            super().__init__()
 
        def after_iteration(
-            self, model: xgb.Booster,
+            self,
+            model: xgb.Booster,
             epoch: int,
-            evals_log: xgb.callback.TrainingCallback.EvalsLog
+            evals_log: xgb.callback.TrainingCallback.EvalsLog,
        ):
             y_pred = model.predict(dmat)
-            acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)/len(X))
+            acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper) / len(X))
             acc_rec.append(acc)
             return False
 
-    evals_result = {}
-    params = {'max_depth': 3, 'objective': 'survival:aft', 'min_child_weight': 0}
-    bst = xgb.train(params, dmat, 15, [(dmat, 'train')], evals_result=evals_result,
-                    callbacks=[Callback()])
-
-    nloglik_rec = evals_result['train']['aft-nloglik']
+    evals_result: xgb.callback.TrainingCallback.EvalsLog = {}
+    params = {
+        "max_depth": 3,
+        "objective": "survival:aft",
+        "min_child_weight": 0,
+        "tree_method": "exact",
+    }
+    bst = xgb.train(
+        params,
+        dmat,
+        15,
+        [(dmat, "train")],
+        evals_result=evals_result,
+        callbacks=[Callback()],
+    )
+
+    nloglik_rec = cast(List[float], evals_result["train"]["aft-nloglik"])
     # AFT metric (negative log likelihood) improve monotonically
     assert all(p >= q for p, q in zip(nloglik_rec, nloglik_rec[:1]))
     # "Accuracy" improve monotonically.
@@ -94,15 +106,17 @@ def after_iteration(
     assert acc_rec[-1] == 1.0
 
     def gather_split_thresholds(tree):
-        if 'split_condition' in tree:
-            return (gather_split_thresholds(tree['children'][0])
-                    | gather_split_thresholds(tree['children'][1])
-                    | {tree['split_condition']})
+        if "split_condition" in tree:
+            return (
+                gather_split_thresholds(tree["children"][0])
+                | gather_split_thresholds(tree["children"][1])
+                | {tree["split_condition"]}
+            )
         return set()
 
     # Only 2.5, 3.5, and 4.5 are used as split thresholds.
-    model_json = [json.loads(e) for e in bst.get_dump(dump_format='json')]
-    for tree in model_json:
+    model_json = [json.loads(e) for e in bst.get_dump(dump_format="json")]
+    for i, tree in enumerate(model_json):
         assert gather_split_thresholds(tree).issubset({2.5, 3.5, 4.5})

From 5e39bbe4d2446af2e8e6ddd6d68cc36a5767c395 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 26 Jun 2023 21:03:12 +0800
Subject: [PATCH 08/10] Windows rng.

---
 R-package/tests/testthat/test_basic.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 5d86e24061d2..a21b03d774d3 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -408,7 +408,7 @@ test_that("colsample_bytree works", {
   xgb.importance(model = bst)
   # If colsample_bytree works properly, a variety of features should be used
   # in the 100 trees
-  expect_gte(nrow(xgb.importance(model = bst)), 30)
+  expect_gte(nrow(xgb.importance(model = bst)), 28)
 })
 
 test_that("Configuration works", {

From cf8f38147129ea45e3220ac9e8e12a51ce0c0406 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 27 Jun 2023 17:05:44 +0800
Subject: [PATCH 09/10] Fix numeric error.

---
 tests/python/test_shap.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py
index 2585da0880e7..490da5533f7f 100644
--- a/tests/python/test_shap.py
+++ b/tests/python/test_shap.py
@@ -121,7 +121,8 @@ def exp_value_rec(tree, z, x, i=0):
         else:
             ind = tree[i]["feature_index"]
             if z[ind] == 1:
-                if x[ind] < tree[i]["threshold"]:
+                # 1e-6 for numeric error from parsing text dump.
+                if x[ind] + 1e-6 < tree[i]["threshold"]:
                     return exp_value_rec(tree, z, x, tree[i]["yes_ind"])
                 else:
                     return exp_value_rec(tree, z, x, tree[i]["no_ind"])

From c7f4b1b3e1f256d2e93b13b81d158b6e124f1275 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 27 Jun 2023 17:14:16 +0800
Subject: [PATCH 10/10] Cleanup random seed, lint.
---
 tests/ci_build/lint_python.py |  1 +
 tests/python/test_shap.py     | 72 ++++++++++++++++++-----------------
 2 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 85ece676e8c5..90a7781c2aa4 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -23,6 +23,7 @@ class LintersPaths:
         "tests/python/test_predict.py",
         "tests/python/test_quantile_dmatrix.py",
         "tests/python/test_tree_regularization.py",
+        "tests/python/test_shap.py",
         "tests/python-gpu/test_gpu_data_iterator.py",
         "tests/test_distributed/test_with_spark/",
         "tests/test_distributed/test_gpu_with_spark/",
diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py
index 490da5533f7f..bbbdcedc0895 100644
--- a/tests/python/test_shap.py
+++ b/tests/python/test_shap.py
@@ -6,35 +6,34 @@
 import scipy.special
 
 import xgboost as xgb
-
-dpath = 'demo/data/'
-rng = np.random.RandomState(1994)
+from xgboost import testing as tm
 
 
 class TestSHAP:
-
-    def test_feature_importances(self):
-        data = np.random.randn(100, 5)
+    def test_feature_importances(self) -> None:
+        rng = np.random.RandomState(1994)
+        data = rng.randn(100, 5)
         target = np.array([0, 1] * 50)
-        features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']
+        features = ["Feature1", "Feature2", "Feature3", "Feature4", "Feature5"]
 
-        dm = xgb.DMatrix(data, label=target,
-                         feature_names=features)
+        dm = xgb.DMatrix(data, label=target, feature_names=features)
-        params = {'objective': 'multi:softprob',
-                  'eval_metric': 'mlogloss',
-                  'eta': 0.3,
-                  'num_class': 3}
+        params = {
+            "objective": "multi:softprob",
+            "eval_metric": "mlogloss",
+            "eta": 0.3,
+            "num_class": 3,
+        }
 
         bst = xgb.train(params, dm, num_boost_round=10)
 
         # number of feature importances should == number of features
         scores1 = bst.get_score()
-        scores2 = bst.get_score(importance_type='weight')
-        scores3 = bst.get_score(importance_type='cover')
-        scores4 = bst.get_score(importance_type='gain')
-        scores5 = bst.get_score(importance_type='total_cover')
-        scores6 = bst.get_score(importance_type='total_gain')
+        scores2 = bst.get_score(importance_type="weight")
+        scores3 = bst.get_score(importance_type="cover")
+        scores4 = bst.get_score(importance_type="gain")
+        scores5 = bst.get_score(importance_type="total_cover")
+        scores6 = bst.get_score(importance_type="total_gain")
         assert len(scores1) == len(features)
         assert len(scores2) == len(features)
         assert len(scores3) == len(features)
@@ -46,12 +45,11 @@ class TestSHAP:
         fscores = bst.get_fscore()
         assert scores1 == fscores
 
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?format=libsvm')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?format=libsvm')
+        dtrain, dtest = tm.load_agaricus(__file__)
 
-        def fn(max_depth, num_rounds):
+        def fn(max_depth: int, num_rounds: int) -> None:
             # train
-            params = {'max_depth': max_depth, 'eta': 1, 'verbosity': 0}
+            params = {"max_depth": max_depth, "eta": 1, "verbosity": 0}
             bst = xgb.train(params, dtrain, num_boost_round=num_rounds)
 
             # predict
@@ -82,7 +80,7 @@ class TestSHAP:
        assert out[0, 1] == 0.375
        assert out[0, 2] == 0.25
 
-        def parse_model(model):
+        def parse_model(model: xgb.Booster) -> list:
            trees = []
            r_exp = r"([0-9]+):\[f([0-9]+)<([0-9\.e-]+)\] yes=([0-9]+),no=([0-9]+).*cover=([0-9e\.]+)"
            r_exp_leaf = r"([0-9]+):leaf=([0-9\.e-]+),cover=([0-9e\.]+)"
@@ -93,7 +91,9 @@ class TestSHAP:
                match = re.search(r_exp, line)
                if match is not None:
                    ind = int(match.group(1))
+                    assert trees[-1] is not None
                    while ind >= len(trees[-1]):
+                        assert isinstance(trees[-1], list)
                        trees[-1].append(None)
                    trees[-1][ind] = {
                        "yes_ind": int(match.group(4)),
                        "no_ind": int(match.group(5)),
                        "value": None,
                        "threshold": float(match.group(3)),
                        "feature_index": int(match.group(2)),
-                        "cover": float(match.group(6))
+                        "cover": float(match.group(6)),
                    }
                else:
-
                    match = re.search(r_exp_leaf, line)
                    ind = int(match.group(1))
                    while ind >= len(trees[-1]):
                        trees[-1].append(None)
                    trees[-1][ind] = {
                        "value": float(match.group(2)),
-                        "cover": float(match.group(3))
+                        "cover": float(match.group(3)),
                    }
            return trees
 
@@ -121,7 +121,7 @@ def exp_value_rec(tree, z, x, i=0):
        else:
            ind = tree[i]["feature_index"]
            if z[ind] == 1:
                # 1e-6 for numeric error from parsing text dump.
-                if x[ind] + 1e-6 < tree[i]["threshold"]:
+                if x[ind] + 1e-6 <= tree[i]["threshold"]:
                    return exp_value_rec(tree, z, x, tree[i]["yes_ind"])
                else:
                    return exp_value_rec(tree, z, x, tree[i]["no_ind"])
@@ -137,10 +137,13 @@ def exp_value_rec(tree, z, x, i=0):
            return val
 
        def exp_value(trees, z, x):
+            "E[f(z)|Z_s = X_s]"
            return np.sum([exp_value_rec(tree, z, x) for tree in trees])
 
        def all_subsets(ss):
-            return itertools.chain(*map(lambda x: itertools.combinations(ss, x), range(0, len(ss) + 1)))
+            return itertools.chain(
+                *map(lambda x: itertools.combinations(ss, x), range(0, len(ss) + 1))
+            )
 
        def shap_value(trees, x, i, cond=None, cond_value=None):
            M = len(x)
@@ -197,7 +199,9 @@ def interaction_value(trees, x, i, j):
                z[i] = 0
                v01 = exp_value(trees, z, x)
                z[j] = 0
-                total += (v11 - v01 - v10 + v00) / (scipy.special.binom(M - 2, len(subset)) * (M - 1))
+                total += (v11 - v01 - v10 + v00) / (
+                    scipy.special.binom(M - 2, len(subset)) * (M - 1)
+                )
                z[list(subset)] = 0
            return total
@@ -221,11 +225,10 @@ class TestSHAP:
        assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4
 
        # test a random function
-        np.random.seed(0)
        M = 2
        N = 4
-        X = np.random.randn(N, M)
-        y = np.random.randn(N)
+        X = rng.randn(N, M)
+        y = rng.randn(N)
        param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0}
        bst = xgb.train(param, xgb.DMatrix(X, label=y), 1)
        brute_force = shap_values(parse_model(bst), X[0, :])
@@ -237,11 +240,10 @@ class TestSHAP:
        assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4
 
        # test another larger more complex random function
-        np.random.seed(0)
        M = 5
        N = 100
-        X = np.random.randn(N, M)
-        y = np.random.randn(N)
+        X = rng.randn(N, M)
+        y = rng.randn(N)
        base_score = 1.0
        param = {"max_depth": 5, "base_score": base_score, "eta": 0.1, "gamma": 2.0}
        bst = xgb.train(param, xgb.DMatrix(X, label=y), 10)