DEV: Fixes issue CederGroupHub#96. TST: Added a test to guarantee that the one-std rule always yields larger parameters than the optimal CV rule.
qchempku2017 · Jul 23, 2023 · e2a21bc · e2a21bc
1 parent f7bedb3
commit e2a21bc
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 4 deletions.
diff --git a/src/sparselm/model_selection.py b/src/sparselm/model_selection.py
@@ -215,10 +215,9 @@ def _select_best_index_onestd(refit, refit_metric, results):
  params.append(p)
  params_sum = np.sum(params, axis=0)
  one_std_dists = np.abs(metrics - m + sig)
- candidates = np.arange(len(metrics))[
- one_std_dists < (np.min(one_std_dists) + 0.1 * sig)
- ]
- best_index = candidates[np.argmax(params_sum[candidates])]
+ # Guarantee that the one-std rule never selects smaller params than max score.
+ candidates = np.arange(len(metrics))[params_sum >= params_sum[opt_index]]
+ best_index = candidates[np.argmin(one_std_dists[candidates])]
  return best_index
 
  # Overwrite original fit method to allow multiple optimal methods.

diff --git a/tests/test_model_selection.py b/tests/test_model_selection.py
@@ -1,6 +1,9 @@
 import cvxpy as cp
 import numpy as np
 import pytest
+from sklearn.datasets import make_regression
+from sklearn.linear_model import Lasso
+from sklearn.model_selection import KFold, train_test_split
 
 from sparselm.model import L1L0, L2L0
 from sparselm.model_selection import GridSearchCV, LineSearchCV
@@ -115,6 +118,55 @@ def test_grid_search(random_energy_model, grid_search):
  assert -grid_search.best_score_ >= rmse
 
 
+# The one-std rule must never select a smaller alpha than the max-score rule.
+def test_onestd():
+    """Check the one-std rule picks an alpha no smaller than the max-score rule.
+
+    Each trial uses its own seed for the data, the train/test split and the CV
+    folds, so the ten trials are independent and the tolerance is meaningful.
+    """
+    success = 0
+    for seed in range(10):
+        X, y = make_regression(
+            n_samples=200,
+            n_features=100,
+            n_informative=10,
+            noise=40.0,
+            bias=-15.0,
+            random_state=seed,
+        )
+        X_train, _, y_train, _ = train_test_split(
+            X, y, test_size=0.25, random_state=seed
+        )
+
+        lasso = Lasso(fit_intercept=True)
+        cv5 = KFold(n_splits=5, shuffle=True, random_state=seed)
+        params = {"alpha": np.logspace(-1, 1, 10)}
+
+        # Same grid and folds for both searches; only the selection rule differs.
+        lasso_cv_std = GridSearchCV(
+            lasso, params, opt_selection_method="one_std_score", cv=cv5, n_jobs=-1
+        )
+        lasso_cv_opt = GridSearchCV(
+            lasso, params, opt_selection_method="max_score", cv=cv5, n_jobs=-1
+        )
+        lasso_cv_std.fit(X_train, y_train)
+        lasso_cv_opt.fit(X_train, y_train)
+
+        correct_params = (
+            lasso_cv_opt.best_params_["alpha"] <= lasso_cv_std.best_params_["alpha"]
+        )
+        # Count coefficients that are numerically non-zero.
+        sparsity_opt = np.sum(np.abs(lasso_cv_opt.best_estimator_.coef_) >= 1e-6)
+        sparsity_std = np.sum(np.abs(lasso_cv_std.best_estimator_.coef_) >= 1e-6)
+
+        if correct_params and sparsity_opt >= sparsity_std:
+            success += 1
+
+    # Allow some failure caused by randomness of CV splits.
+    assert success >= 8
+
+
 def test_line_search(random_energy_model, line_search):
  femat, energies, _ = random_energy_model
  n_samples, n_features = femat.shape