DOC add redundancy
MatthewSZhang committed Sep 24, 2024
1 parent bdfdab7 commit 11c7c79
Showing 5 changed files with 250 additions and 79 deletions.
2 changes: 1 addition & 1 deletion doc/multioutput.rst
@@ -6,4 +6,4 @@
Multioutput feature selection
==============================

We can use :class:`FastCan` for multioutput feature selection.
We can use :class:`FastCan` to handle multioutput feature selection.
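
A minimal sketch is given below; the dataset and the call shown here are
illustrative assumptions rather than part of the original documentation::

    from sklearn.datasets import load_linnerud
    from fastcan import FastCan

    # Linnerud is a small multioutput regression dataset:
    # X has 3 exercise features and y has 3 physiological targets.
    X, y = load_linnerud(return_X_y=True)

    # Select 2 of the 3 features against all 3 targets at once.
    selector = FastCan(n_features_to_select=2, verbose=0).fit(X, y)
    selected = X[:, selector.indices_]  # keep only the selected columns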
28 changes: 27 additions & 1 deletion doc/redundancy.rst
@@ -6,4 +6,30 @@
Feature redundancy
==================

:class:`FastCan` can effectively skip the redundant features.
:class:`FastCan` can effectively skip the linearly redundant features.
Here, a feature :math:`x_r\in \mathbb{R}^{N\times 1}` being linearly
redundant to a set of features :math:`X\in \mathbb{R}^{N\times n}` means that
:math:`x_r` can be obtained from an affine transformation of :math:`X`, given by

.. math::
    x_r = Xa + b

where :math:`a\in \mathbb{R}^{n\times 1}` and :math:`b\in \mathbb{R}^{N\times 1}`, with
:math:`b` being a constant vector whose entries are all equal (an intercept term).
In other words, the feature can be acquired by a linear transformation of :math:`X`,
i.e. :math:`Xa`, followed by a translation, i.e. :math:`+b`.
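
As a minimal numerical sketch (the array sizes and coefficient values below are
illustrative assumptions), a linearly redundant feature can be reproduced from
:math:`X` by an ordinary least-squares fit with an intercept::

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.random((100, 3))                # N = 100 samples, n = 3 features
    a = np.array([0.5, -1.0, 2.0])          # illustrative coefficients
    b = 0.3                                 # illustrative constant offset
    x_r = X @ a + b                         # x_r is linearly redundant to X

    # Fitting x_r on [X, 1] recovers a and b with an essentially zero residual,
    # confirming that x_r carries no information beyond X.
    design = np.column_stack([X, np.ones(len(X))])
    coef, residual, *_ = np.linalg.lstsq(design, x_r, rcond=None)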

This capability of :class:`FastCan` comes from the
`Modified Gram-Schmidt process <https://en.wikipedia.org/wiki/Gram%E2%80%93Schmidt_process>`_,
which produces large rounding errors when linearly redundant features appear.
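
The following is a minimal usage sketch (the toy data, coefficients, and target
below are illustrative assumptions; see the example linked below for a
systematic comparison)::

    import numpy as np
    from fastcan import FastCan

    rng = np.random.default_rng(0)
    X = rng.random((100, 4))
    X[:, 3] = X[:, 0] + 2 * X[:, 1] + 0.5       # column 3 is linearly redundant
    y = X[:, :3] @ np.array([1.0, 0.5, 2.0])    # target built from columns 0, 1, 2

    selector = FastCan(n_features_to_select=3, verbose=0).fit(X, y)
    # The three selected indices should include column 2 and only two of the
    # mutually dependent columns {0, 1, 3}; the remaining one is skipped.
    print(selector.indices_)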

.. rubric:: References

* `"Canonical-correlation-based fast feature selection for structural
health monitoring" <https://doi.org/10.1016/j.ymssp.2024.111895>`_
Zhang, S., Wang, T., Worden, K., Sun, L., & Cross, E. J.
Mechanical Systems and Signal Processing, 223:111895 (2025).

.. rubric:: Examples

* See :ref:`sphx_glr_auto_examples_plot_redundancy.py` for an example of
feature selection on datasets with redundant features.
61 changes: 9 additions & 52 deletions doc/unsupervised.rst
@@ -10,72 +10,29 @@ We can use :class:`FastCan` to do unsupervised feature selection.
The unsupervised application of :class:`FastCan` tries to select features which
maximize the sum of the squared canonical correlation (SSC) with the principal
components (PCs) acquired from PCA (principal component analysis) of the feature
matrix :math:`X`.
matrix :math:`X`. See the example below.

>>> from sklearn.decomposition import PCA
>>> from sklearn import datasets
>>> from fastcan import FastCan
>>> iris = datasets.load_iris()
>>> X = iris["data"]
>>> y = iris["target"]
>>> f_names = iris["feature_names"]
>>> t_names = iris["target_names"]
>>> pca = PCA(n_components=2)
>>> X_pcs = pca.fit_transform(X)
>>> selector = FastCan(n_features_to_select=2, verbose=0)
>>> selector.fit(X, X_pcs[:, :2])
>>> selector = FastCan(n_features_to_select=2, verbose=0).fit(X, X_pcs[:, :2])
>>> selector.indices_
array([2, 1], dtype=int32)

.. note::
    There is no guarantee that this unsupervised :class:`FastCan` will select
    the optimal subset of features, i.e. the one with the highest SSC with the PCs,
    because :class:`FastCan` selects features in a greedy manner, which may lead to
    suboptimal results. See the following plots.
    suboptimal results.

.. plot::
    :context: close-figs
    :align: center

    from itertools import combinations

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.cross_decomposition import CCA

    def ssc(X, y):
        """Sum of the squared canonical correlation coefficients.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples, n_outputs)
            Target matrix.

        Returns
        -------
        ssc : float
            Sum of the squared canonical correlation coefficients.
        """
        n_components = min(X.shape[1], y.shape[1])
        cca = CCA(n_components=n_components)
        X_c, y_c = cca.fit_transform(X, y)
        corrcoef = np.diagonal(
            np.corrcoef(X_c, y_c, rowvar=False),
            offset=n_components
        )
        return sum(corrcoef**2)

    comb = list(combinations([0, 1, 2, 3], 2))
    fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(8, 6), layout="constrained")
    for i in range(2):
        for j in range(3):
            f1_idx = comb[i*3+j][0]
            f2_idx = comb[i*3+j][1]
            score = ssc(X[:, [f1_idx, f2_idx]], X_pcs)
            scatter = axs[i, j].scatter(X[:, f1_idx], X[:, f2_idx], c=y)
            axs[i, j].set(xlabel=f_names[f1_idx], ylabel=f_names[f2_idx])
            axs[i, j].set_title(f"SSC: {score:.3f}")
    for spine in axs[1, 0].spines.values():
        spine.set_edgecolor('red')
    _ = axs[1, 2].legend(scatter.legend_elements()[0], t_names, loc="lower right")
.. rubric:: References

* `"Automatic Selection of Optimal Structures for Population-Based
Structural Health Monitoring" <https://doi.org/10.1007/978-3-031-34946-1_10>`_
Wang, T., Worden, K., Wagg, D.J., Cross, E.J., Maguire, A.E., Lin, W.
In: Conference Proceedings of the Society for Experimental Mechanics Series.
Springer, Cham. (2023).
211 changes: 211 additions & 0 deletions examples/plot_redundancy.py
@@ -0,0 +1,211 @@
"""
===================================================
Feature selection performance on redundant features
===================================================

In this example, we compare the performance of feature selectors on datasets
that contain linearly redundant features.

Here, four types of features should be distinguished:

* Useless features: features that do not contribute to the target
* Dependent informative features: features that contribute to the target and
  from which the redundant features are constructed
* Redundant features: features constructed by a linear transformation of the
  dependent informative features
* Independent informative features: features that contribute to the target but
  do not contribute to the redundant features

.. note::
    If we did not distinguish dependent and independent informative features,
    and used the informative features to form both the target and the redundant
    features, the task would be much easier.
"""

# Authors: Sikai Zhang
# SPDX-License-Identifier: MIT

# %%
# Define dataset generator
# ------------------------

import numpy as np


def make_redundant(
    n_samples,
    n_features,
    dep_info_ids,
    indep_info_ids,
    redundant_ids,
    random_seed,
):
    """Make a dataset with linearly redundant features.

    Parameters
    ----------
    n_samples : int
        The number of samples.
    n_features : int
        The number of features.
    dep_info_ids : list[int]
        The indices of dependent informative features.
    indep_info_ids : list[int]
        The indices of independent informative features.
    redundant_ids : list[int]
        The indices of redundant features.
    random_seed : int
        Random seed.

    Returns
    -------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target vector.
    """
    rng = np.random.default_rng(random_seed)
    info_ids = dep_info_ids + indep_info_ids
    n_dep_info = len(dep_info_ids)
    n_info = len(info_ids)
    n_redundant = len(redundant_ids)
    informative_coef = rng.random(n_info)
    redundant_coef = rng.random((n_dep_info, n_redundant))

    X = rng.random((n_samples, n_features))
    y = np.dot(X[:, info_ids], informative_coef)

    X[:, redundant_ids] = X[:, dep_info_ids] @ redundant_coef
    return X, y

# %%
# Define score function
# ---------------------
# This function computes the number of informative features that a selector misses.
#
# * For independent informative features, selectors should select all of them.
# * For dependent informative features, selectors only need to select any
#   ``n_dep_info``-combination of the set ``dep_info_ids`` + ``redundant_ids``. That
#   means, if the indices of the dependent informative features are :math:`[0, 1]` and
#   the indices of the redundant features are :math:`[5]`, the correctly selected
#   indices can be :math:`[0, 1]`, :math:`[0, 5]`, or :math:`[1, 5]`.

def get_n_missed(
    dep_info_ids,
    indep_info_ids,
    redundant_ids,
    selected_ids,
):
    """Get the number of informative features that are missed by the selector."""
    n_redundant = len(redundant_ids)
    n_missed_indep = len(np.setdiff1d(indep_info_ids, selected_ids))
    n_missed_dep = len(
        np.setdiff1d(dep_info_ids + redundant_ids, selected_ids)
    ) - n_redundant
    if n_missed_dep < 0:
        n_missed_dep = 0
    return n_missed_indep + n_missed_dep

# %%
# Prepare selectors
# -----------------
# We compare :class:`fastcan.FastCan` with eight selectors of :mod:`sklearn`, which
# include the Select From Model (SFM) algorithm, the Recursive Feature Elimination
# (RFE) algorithm, the Sequential Feature Selection (SFS) algorithm, and the
# Select K Best (SKB) algorithm.
# The list of the selectors is given below:
#
# * fastcan: :class:`fastcan.FastCan` selector
# * skb_reg: the SKB algorithm ranking features with ANOVA (analysis of variance)
#   F-statistic and p-values
# * skb_mir: the SKB algorithm ranking features with mutual information for regression
# * sfm_lsvr: the SFM algorithm with a linear support vector regressor
# * sfm_rfr: the SFM algorithm with a random forest regressor
# * rfe_lsvr: the RFE algorithm with a linear support vector regressor
# * rfe_rfr: the RFE algorithm with a random forest regressor
# * sfs_lsvr: the forward SFS algorithm with a linear support vector regressor
# * sfs_rfr: the forward SFS algorithm with a random forest regressor


from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_regression,
    mutual_info_regression,
)
from sklearn.svm import LinearSVR

from fastcan import FastCan

lsvr = LinearSVR(C=1, dual="auto", max_iter=100000, random_state=0)
rfr = RandomForestRegressor(n_estimators=10, random_state=0)


N_SAMPLES = 1000
N_FEATURES = 10
DEP_INFO_IDS = [2, 4, 7, 9]
INDEP_INFO_IDS = [0, 1, 6]
REDUNDANT_IDS = [5, 8]
N_SELECTED = len(DEP_INFO_IDS+INDEP_INFO_IDS)
N_REPEATED = 10

selector_dict = {
    "fastcan": FastCan(N_SELECTED, verbose=0),
    "skb_reg": SelectKBest(f_regression, k=N_SELECTED),
    "skb_mir": SelectKBest(mutual_info_regression, k=N_SELECTED),
    "sfm_lsvr": SelectFromModel(lsvr, max_features=N_SELECTED, threshold=-np.inf),
    "sfm_rfr": SelectFromModel(rfr, max_features=N_SELECTED, threshold=-np.inf),
    "rfe_lsvr": RFE(lsvr, n_features_to_select=N_SELECTED, step=1),
    "rfe_rfr": RFE(rfr, n_features_to_select=N_SELECTED, step=1),
    "sfs_lsvr": SequentialFeatureSelector(lsvr, n_features_to_select=N_SELECTED, cv=2),
    "sfs_rfr": SequentialFeatureSelector(rfr, n_features_to_select=N_SELECTED, cv=2),
}

# %%
# Run test
# --------

N_SELECTORS = len(selector_dict)
n_missed = np.zeros((N_REPEATED, N_SELECTORS), dtype=int)

for i in range(N_REPEATED):
    X, y = make_redundant(
        n_samples=N_SAMPLES,
        n_features=N_FEATURES,
        dep_info_ids=DEP_INFO_IDS,
        indep_info_ids=INDEP_INFO_IDS,
        redundant_ids=REDUNDANT_IDS,
        random_seed=i,
    )
    for j, selector in enumerate(selector_dict.values()):
        result_ids = selector.fit(X, y).get_support(indices=True)
        n_missed[i, j] = get_n_missed(
            dep_info_ids=DEP_INFO_IDS,
            indep_info_ids=INDEP_INFO_IDS,
            redundant_ids=REDUNDANT_IDS,
            selected_ids=result_ids,
        )

# %%
# Plot results
# ------------
# :class:`fastcan.FastCan` correctly selects all informative features with zero missed
# features.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 5))
rects = ax.bar(selector_dict.keys(), n_missed.sum(0), width=0.5)
ax.bar_label(rects, n_missed.sum(0), padding=3)
plt.xlabel("Selector")
plt.ylabel("No. of missed features")
plt.title("Performance of selectors on datasets with linearly redundant features")
plt.show()
27 changes: 2 additions & 25 deletions examples/plot_speed.py
@@ -37,35 +37,12 @@
# canonical correlation coefficients may be more than one, the feature ranking
# criterion used here is the sum of the squared canonical correlation coefficients.

from sklearn.cross_decomposition import CCA

def ssc(X, y):
    """Sum of the squared canonical correlation coefficients.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples, n_outputs)
        Target matrix.

    Returns
    -------
    ssc : float
        Sum of the squared canonical correlation coefficients.
    """
    n_components = min(X.shape[1], y.shape[1])
    cca = CCA(n_components=n_components)
    X_c, y_c = cca.fit_transform(X, y)
    corrcoef = np.diagonal(
        np.corrcoef(X_c, y_c, rowvar=False),
        offset=n_components
    )
    return sum(corrcoef**2)
from fastcan import ssc


def baseline(X, y, t):
    """Baseline method using CCA from sklearn.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
Expand Down
