Skip to content

Commit

Permalink
[python-package] Fix inconsistency in predict() output shape for 1-tree models (microsoft#6753)
Browse files Browse the repository at this point in the history
  • Loading branch information
RektPunk authored Dec 22, 2024
1 parent 4ee0bc0 commit 60b0155
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 2 deletions.
2 changes: 1 addition & 1 deletion python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1248,7 +1248,7 @@ def predict(
if pred_leaf:
preds = preds.astype(np.int32)
is_sparse = isinstance(preds, (list, scipy.sparse.spmatrix))
if not is_sparse and preds.size != nrow:
if not is_sparse and (preds.size != nrow or pred_leaf or pred_contrib):
if preds.size % nrow == 0:
preds = preds.reshape(nrow, -1)
else:
Expand Down
90 changes: 89 additions & 1 deletion tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import psutil
import pytest
from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix_csr
from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification
from sklearn.datasets import load_svmlight_file, make_blobs, make_classification, make_multilabel_classification
from sklearn.metrics import average_precision_score, log_loss, mean_absolute_error, mean_squared_error, roc_auc_score
from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_split

Expand Down Expand Up @@ -2314,6 +2314,33 @@ def test_refit():
assert err_pred > new_err_pred


def test_refit_with_one_tree_regression():
    """refit() on a regression model containing a single tree returns a usable Booster."""
    X, y = make_synthetic_regression(n_samples=1_000, n_features=2)
    train_set = lgb.Dataset(X, label=y)
    booster = lgb.train({"objective": "regression", "verbosity": -1}, train_set, num_boost_round=1)
    refitted = booster.refit(X, y)
    assert isinstance(refitted, lgb.Booster)


def test_refit_with_one_tree_binary_classification():
    """refit() on a binary-classification model containing a single tree returns a usable Booster."""
    X, y = load_breast_cancer(return_X_y=True)
    train_set = lgb.Dataset(X, label=y)
    booster = lgb.train({"objective": "binary", "verbosity": -1}, train_set, num_boost_round=1)
    refitted = booster.refit(X, y)
    assert isinstance(refitted, lgb.Booster)


def test_refit_with_one_tree_multiclass_classification():
    """refit() on a multiclass model containing a single tree returns a usable Booster.

    Uses the canonical ``verbosity`` parameter name and the ``label=`` keyword for
    ``lgb.Dataset`` so this test matches the style of its sibling one-tree refit tests.
    """
    X, y = load_iris(return_X_y=True)
    lgb_train = lgb.Dataset(X, label=y)
    params = {"objective": "multiclass", "num_class": 3, "verbosity": -1}
    model = lgb.train(params, lgb_train, num_boost_round=1)
    model_refit = model.refit(X, y)
    assert isinstance(model_refit, lgb.Booster)


def test_refit_dataset_params(rng):
# check refit accepts dataset_params
X, y = load_breast_cancer(return_X_y=True)
Expand Down Expand Up @@ -3872,6 +3899,67 @@ def test_predict_stump(rng, use_init_score):
np.testing.assert_allclose(preds_all, np.full_like(preds_all, fill_value=y_avg))


def test_predict_regression_output_shape():
    """predict() output shapes are consistent for regression models with 1 and 2 trees."""
    num_samples = 1_000
    num_features = 4
    X, y = make_synthetic_regression(n_samples=num_samples, n_features=num_features)
    train_set = lgb.Dataset(X, label=y)
    params = {"objective": "regression", "verbosity": -1}

    # pred_leaf output grows by one column per boosting round; the other
    # prediction modes keep the same shape regardless of model size.
    for num_trees in (1, 2):
        bst = lgb.train(params, train_set, num_boost_round=num_trees)
        assert bst.predict(X).shape == (num_samples,)
        assert bst.predict(X, pred_contrib=True).shape == (num_samples, num_features + 1)
        assert bst.predict(X, pred_leaf=True).shape == (num_samples, num_trees)


def test_predict_binary_classification_output_shape():
    """predict() output shapes are consistent for binary models with 1 and 2 trees."""
    num_samples = 1_000
    num_features = 4
    X, y = make_classification(n_samples=num_samples, n_features=num_features, n_classes=2)
    train_set = lgb.Dataset(X, label=y)
    params = {"objective": "binary", "verbosity": -1}

    # pred_leaf output grows by one column per boosting round; the other
    # prediction modes keep the same shape regardless of model size.
    for num_trees in (1, 2):
        bst = lgb.train(params, train_set, num_boost_round=num_trees)
        assert bst.predict(X).shape == (num_samples,)
        assert bst.predict(X, pred_contrib=True).shape == (num_samples, num_features + 1)
        assert bst.predict(X, pred_leaf=True).shape == (num_samples, num_trees)


def test_predict_multiclass_classification_output_shape():
    """predict() output shapes are consistent for multiclass models with 1 and 2 trees."""
    num_samples = 1_000
    num_features = 10
    num_classes = 3
    X, y = make_classification(n_samples=num_samples, n_features=num_features, n_classes=num_classes, n_informative=6)
    train_set = lgb.Dataset(X, label=y)
    params = {"objective": "multiclass", "verbosity": -1, "num_class": num_classes}

    # Multiclass boosting fits one tree per class per round, so pred_leaf has
    # num_classes * num_trees columns; raw predictions have one column per class.
    for num_trees in (1, 2):
        bst = lgb.train(params, train_set, num_boost_round=num_trees)
        assert bst.predict(X).shape == (num_samples, num_classes)
        assert bst.predict(X, pred_contrib=True).shape == (num_samples, num_classes * (num_features + 1))
        assert bst.predict(X, pred_leaf=True).shape == (num_samples, num_classes * num_trees)


def test_average_precision_metric():
# test against sklearn average precision metric
X, y = load_breast_cancer(return_X_y=True)
Expand Down

0 comments on commit 60b0155

Please sign in to comment.