Skip to content

Commit

Permalink
DEV: Fixes issue CederGroupHub#96. TST: Added a test to guarantee tha…
Browse files Browse the repository at this point in the history
…t the one-std rule always yields larger parameters than the optimal CV rule.
  • Loading branch information
qchempku2017 committed Jul 23, 2023
1 parent f7bedb3 commit e2a21bc
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 4 deletions.
7 changes: 3 additions & 4 deletions src/sparselm/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,10 +215,9 @@ def _select_best_index_onestd(refit, refit_metric, results):
params.append(p)
params_sum = np.sum(params, axis=0)
one_std_dists = np.abs(metrics - m + sig)
candidates = np.arange(len(metrics))[
one_std_dists < (np.min(one_std_dists) + 0.1 * sig)
]
best_index = candidates[np.argmax(params_sum[candidates])]
# Guarantees that the one-std rule always selects params at least as large as the max-score choice.
candidates = np.arange(len(metrics))[params_sum >= params_sum[opt_index]]
best_index = candidates[np.argmin(one_std_dists[candidates])]
return best_index

# Overwrite original fit method to allow multiple optimal methods.
Expand Down
52 changes: 52 additions & 0 deletions tests/test_model_selection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import cvxpy as cp
import numpy as np
import pytest
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold, train_test_split

from sparselm.model import L1L0, L2L0
from sparselm.model_selection import GridSearchCV, LineSearchCV
Expand Down Expand Up @@ -115,6 +118,55 @@ def test_grid_search(random_energy_model, grid_search):
assert -grid_search.best_score_ >= rmse


# Guarantees that the one-std rule never selects smaller params than max score.
def test_onestd():
    """Compare ``one_std_score`` selection against ``max_score`` selection.

    For several independently seeded regression problems, two
    ``GridSearchCV`` objects are fit on the same training data and the same
    CV splitter, differing only in ``opt_selection_method``. A trial counts
    as a success when the one-std search picks an ``alpha`` no smaller than
    the max-score search AND its refit Lasso has no more non-zero
    coefficients than the max-score one.

    Each trial uses its own seed for data generation, the train/test split
    and the K-fold shuffle. With one fixed seed every trial would be
    identical and the failure tolerance below would be meaningless.
    """
    n_trials = 10
    min_success = 8
    # Coefficients with magnitude below this are treated as zero.
    coef_tol = 1e-6

    # Loop-invariant pieces: the hyper-parameter grid and the base estimator
    # (GridSearchCV is handed the same unfitted estimator in every trial).
    params = {"alpha": np.logspace(-1, 1, 10)}
    lasso = Lasso(fit_intercept=True)

    success = 0
    for seed in range(n_trials):
        X, y = make_regression(
            n_samples=200,
            n_features=100,
            n_informative=10,
            noise=40.0,
            bias=-15.0,
            random_state=seed,
        )

        # Only the training portion is used; the held-out part is discarded.
        X_train, _, y_train, _ = train_test_split(
            X, y, test_size=0.25, random_state=seed
        )

        # Same splitter for both searches so they see identical folds.
        cv5 = KFold(n_splits=5, shuffle=True, random_state=seed)

        lasso_cv_std = GridSearchCV(
            lasso, params, opt_selection_method="one_std_score", cv=cv5, n_jobs=-1
        )
        lasso_cv_opt = GridSearchCV(
            lasso, params, opt_selection_method="max_score", cv=cv5, n_jobs=-1
        )

        # fit models on training data
        lasso_cv_std.fit(X_train, y_train)
        lasso_cv_opt.fit(X_train, y_train)

        correct_params = (
            lasso_cv_opt.best_params_["alpha"] <= lasso_cv_std.best_params_["alpha"]
        )
        # Number of non-zero coefficients in each refit model.
        sparsity_opt = np.sum(np.abs(lasso_cv_opt.best_estimator_.coef_) >= coef_tol)
        sparsity_std = np.sum(np.abs(lasso_cv_std.best_estimator_.coef_) >= coef_tol)

        if correct_params and sparsity_opt >= sparsity_std:
            success += 1

    # Allow some failure caused by randomness of the data and CV splits.
    assert success >= min_success, (
        f"one-std rule was at least as regularized as max-score in only "
        f"{success}/{n_trials} trials"
    )


def test_line_search(random_energy_model, line_search):
femat, energies, _ = random_energy_model
n_samples, n_features = femat.shape
Expand Down

0 comments on commit e2a21bc

Please sign in to comment.