diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ff4b100..70e0082 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,7 +47,7 @@ jobs: run: | FMT=xml pixi run test-coverage - name: Upload coverage reports to Codecov - uses: codecov/codecov-action@v5.0.0 + uses: codecov/codecov-action@v5.0.2 with: token: ${{ secrets.CODECOV_TOKEN }} - name: Build SDist diff --git a/doc/index.rst b/doc/index.rst index 29518b8..20db2ae 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,6 +18,7 @@ API Reference :toctree: generated/ FastCan + refine ssc ols diff --git a/fastcan/__init__.py b/fastcan/__init__.py index 7f5b080..8bcbd35 100644 --- a/fastcan/__init__.py +++ b/fastcan/__init__.py @@ -3,10 +3,12 @@ """ from ._fastcan import FastCan +from ._refine import refine from ._utils import ols, ssc __all__ = [ "FastCan", "ssc", "ols", + "refine", ] diff --git a/fastcan/_fastcan.py b/fastcan/_fastcan.py index 1f3d3ee..bad3db6 100644 --- a/fastcan/_fastcan.py +++ b/fastcan/_fastcan.py @@ -2,6 +2,7 @@ Feature selection """ +from copy import deepcopy from numbers import Integral, Real import numpy as np @@ -66,15 +67,15 @@ class FastCan(SelectorMixin, BaseEstimator): The h-correlation/eta-cosine of selected features. The order of the scores is corresponding to the feature selection process. - X_transformed_ : ndarray of shape (n_samples_, n_features), dtype=float, order='F' + X_transformed_ : ndarray of shape (`n_samples_`, n_features), dtype=float, order='F' Transformed feature matrix. - When h-correlation method is used, n_samples_ = n_samples. - When eta-cosine method is used, n_samples_ = n_features+n_outputs. + When h-correlation method is used, `n_samples_` = n_samples. + When eta-cosine method is used, `n_samples_` = n_features+n_outputs. - y_transformed_ : ndarray of shape (n_samples_, n_outputs), dtype=float, order='F' + y_transformed_ : ndarray of shape (`n_samples_`, n_outputs), dtype=float, order='F' Transformed target matrix. 
- When h-correlation method is used, n_samples_ = n_samples. - When eta-cosine method is used, n_samples_ = n_features+n_outputs. + When h-correlation method is used, `n_samples_` = n_samples. + When eta-cosine method is used, `n_samples_` = n_features+n_outputs. References ---------- @@ -181,8 +182,26 @@ def fit(self, X, y): raise ValueError( "`eta` cannot be True, when n_samples < n_features+n_outputs." ) - indices_include = self._check_indices_params(self.indices_include, n_features) - indices_exclude = self._check_indices_params(self.indices_exclude, n_features) + self.indices_include_ = self._check_indices_params( + self.indices_include, n_features + ) + self.indices_exclude_ = self._check_indices_params( + self.indices_exclude, n_features + ) + if np.intersect1d(self.indices_include_, self.indices_exclude_).size != 0: + raise ValueError( + "`indices_include` and `indices_exclude` should not have intersection." + ) + + n_candidates = ( + n_features - self.indices_exclude_.size - self.n_features_to_select + ) + if n_candidates < 0: + raise ValueError( + "n_features - n_features_to_select - n_exclusions should >= 0." 
+ ) + if self.n_features_to_select - self.indices_include_.size < 0: + raise ValueError("n_features_to_select - n_inclusions should >= 0.") if self.eta: xy_hstack = np.hstack((X, y)) @@ -198,16 +217,16 @@ def fit(self, X, y): self.X_transformed_ = X - X.mean(0) self.y_transformed_ = orth(y - y.mean(0)) - # initiated with -1 - indices = np.full(self.n_features_to_select, -1, dtype=np.intc, order="F") - indices[: indices_include.size] = indices_include - scores = np.zeros(self.n_features_to_select, dtype=float, order="F") - mask = np.zeros(n_features, dtype=np.ubyte, order="F") - mask[indices_exclude] = True + indices, scores, mask = _prepare_search( + n_features, + self.n_features_to_select, + self.indices_include_, + self.indices_exclude_, + ) n_threads = _openmp_effective_n_threads() _forward_search( - X=self.X_transformed_, + X=deepcopy(self.X_transformed_), V=self.y_transformed_, t=self.n_features_to_select, tol=self.tol, @@ -259,3 +278,15 @@ def _check_indices_params(self, indices_params, n_features): def _get_support_mask(self): check_is_fitted(self) return self.support_ + + +def _prepare_search(n_features, n_features_to_select, indices_include, indices_exclude): + """ """ + # initiated with -1 + indices = np.full(n_features_to_select, -1, dtype=np.intc, order="F") + indices[: indices_include.size] = indices_include + scores = np.zeros(n_features_to_select, dtype=float, order="F") + mask = np.zeros(n_features, dtype=np.ubyte, order="F") + mask[indices_exclude] = True + + return indices, scores, mask diff --git a/fastcan/_refine.py b/fastcan/_refine.py new file mode 100644 index 0000000..bbab1c3 --- /dev/null +++ b/fastcan/_refine.py @@ -0,0 +1,168 @@ +""" +Refine fastcan selection results +""" + +from copy import deepcopy +from numbers import Integral + +import numpy as np +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, StrOptions, validate_params +from sklearn.utils.validation 
import check_is_fitted + +from ._cancorr_fast import _forward_search # type: ignore +from ._fastcan import FastCan, _prepare_search + + +@validate_params( + { + "selector": [FastCan], + "drop": [ + Interval(Integral, 1, None, closed="left"), + StrOptions({"all"}), + "array-like", + ], + "max_iter": [ + None, + Interval(Integral, 1, None, closed="left"), + ], + "verbose": ["verbose"], + }, + prefer_skip_nested_validation=True, +) +def refine(selector, drop=1, max_iter=None, verbose=1): + """Two-Stage Refining. + + In the refining process, the selected features will be dropped, and + the vacancy positions will be refilled from the candidate features. + + The process in which a vacancy position is refilled after searching all + candidate features is called an `iteration`. + + The process in which a vacancy position is refilled by a different feature + from the dropped one, which increases the SSC of the selected features, + is called a `valid iteration`. + + Parameters + ---------- + selector : FastCan + FastCan selector. + + drop : int or array-like of shape (n_drops,) or "all", default=1 + The number of the selected features dropped for the subsequent + reselection. + + max_iter : int, default=None + The maximum number of valid iterations in the refining process. + + verbose : int, default=1 + The verbosity level. + + Returns + ------- + indices : ndarray of shape (n_features_to_select,), dtype=int + The indices of the selected features. + + scores : ndarray of shape (n_features_to_select,), dtype=float + The h-correlation/eta-cosine of selected features. + + References + ---------- + * Zhang L., Li K., Bai E. W. and Irwin G. W. (2015). + Two-stage orthogonal least squares methods for neural network construction. + IEEE Transactions on Neural Networks and Learning Systems, 26(8), 1608-1621. 
+ + Examples + -------- + >>> from fastcan import FastCan, refine + >>> X = [[1, 1, 0], [0.01, 0, 0], [-1, 0, 1], [0, 0, 0]] + >>> y = [1, 0, -1, 0] + >>> selector = FastCan(2, verbose=0).fit(X, y) + >>> print(f"Indices: {selector.indices_}", f", SSC: {selector.scores_.sum():.5f}") + Indices: [0 1] , SSC: 0.99998 + >>> indices, scores = refine(selector, drop=1, verbose=0) + >>> print(f"Indices: {indices}", f", SSC: {scores.sum():.5f}") + Indices: [1 2] , SSC: 1.00000 + """ + check_is_fitted(selector) + X_transformed_ = deepcopy(selector.X_transformed_) + n_features = selector.n_features_in_ + n_features_to_select = selector.n_features_to_select + indices_include = selector.indices_include_ + indices_exclude = selector.indices_exclude_ + + n_inclusions = indices_include.size + n_selections = n_features_to_select - n_inclusions + + if drop == "all": + drop = np.arange(1, n_selections) + else: + drop = np.atleast_1d(drop).astype(int) + + if (drop.max() >= n_selections) or (drop.min() < 1): + raise ValueError( + "`drop` should be between `1<=drop best_ssc) and (set(indices) != set(best_indices)): + i = 0 + n_valid_iters += 1 + best_scores = scores + best_indices = indices + best_ssc = scores.sum() + else: + i += 1 + + indices_temp = indices + n_iters += 1 + if verbose == 1: + print( + f"No. of iterations: {n_iters}, " + f"No. 
of valid iterations {n_valid_iters}, " + f"SSC: {best_scores.sum():.5f}", + end="\r", + ) + + if n_iters >= max_iter: + if verbose == 1: + print() + return best_indices, best_scores + + if verbose == 1: + print() + return best_indices, best_scores diff --git a/tests/test_fastcan.py b/tests/test_fastcan.py index 610fb73..a8c4475 100644 --- a/tests/test_fastcan.py +++ b/tests/test_fastcan.py @@ -199,6 +199,20 @@ def test_raise_errors(): indices_include=[[0]] ) + selector_include_exclude_intersect = FastCan( + n_features_to_select=n_features, + indices_include=[0, 1], + indices_exclude=[1, 2], + ) + selector_n_candidates = FastCan( + n_features_to_select=n_features, + indices_exclude=[1, 2], + ) + selector_too_many_inclusions = FastCan( + n_features_to_select=2, + indices_include=[1, 2, 3], + ) + with pytest.raises(ValueError, match=r"n_features_to_select .*"): selector_n_select.fit(X, y) @@ -214,6 +228,15 @@ def test_raise_errors(): with pytest.raises(ValueError, match=r"`eta` cannot be True, .*"): selector_eta_for_small_size_samples.fit(X, y) + with pytest.raises(ValueError, match=r"`indices_include` and `indices_exclu.*"): + selector_include_exclude_intersect.fit(X, y) + + with pytest.raises(ValueError, match=r"n_features - n_features_to_select - n_e.*"): + selector_n_candidates.fit(X, y) + + with pytest.raises(ValueError, match=r"n_features_to_select - n_inclusions sho.*"): + selector_too_many_inclusions.fit(X, y) + @pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning") def test_cython_errors(): diff --git a/tests/test_refine.py b/tests/test_refine.py new file mode 100644 index 0000000..a7f059b --- /dev/null +++ b/tests/test_refine.py @@ -0,0 +1,70 @@ +"""Test refine""" +import pytest +from sklearn.datasets import make_classification + +from fastcan import FastCan, refine + + +def test_select_refine_random_cls(): + # Test whether refine work correctly with random samples. 
+ n_samples = 200 + n_features = 20 + n_informative = 10 + n_classes = 8 + n_repeated = 5 + n_to_select = 10 + + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_repeated=n_repeated, + n_classes=n_classes, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + selector = FastCan(n_to_select).fit(X, y) + _, scores_1 = refine(selector, drop=1) + _, scores_23 = refine(selector, drop=[2, 3], verbose=0) + _, scores_all = refine(selector, drop="all", max_iter=20, verbose=1) + + selector = FastCan(n_to_select, indices_include=[1, 5]).fit(X, y) + indices_inc, _ = refine(selector, drop=1) + + assert selector.scores_.sum() <= scores_1.sum() + assert selector.scores_.sum() <= scores_23.sum() + assert selector.scores_.sum() <= scores_all.sum() + assert (indices_inc[0]==1) and (indices_inc[1]==5) + + +def test_refine_error(): + # Test refine raise error. + n_samples = 200 + n_features = 20 + n_informative = 10 + n_classes = 8 + n_repeated = 5 + n_to_select = 10 + + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_repeated=n_repeated, + n_classes=n_classes, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + selector = FastCan(n_to_select, indices_include=[0]) + selector.fit(X, y) + + with pytest.raises(ValueError, match=r"`drop` should be between .*"): + refine(selector, drop=n_to_select, verbose=0)