From 1318faa112bfe51dd290c71b49c175a3d9343213 Mon Sep 17 00:00:00 2001
From: SIKAI ZHANG <34108862+MatthewSZhang@users.noreply.github.com>
Date: Tue, 24 Sep 2024 09:12:25 +0800
Subject: [PATCH] DOC unsupervised init

---
 doc/conf.py            |  1 +
 doc/unsupervised.rst   | 89 ++++++++++++++++++++++++++++++++++++++++++-
 examples/plot_speed.py | 35 +++++++++++++++++++++++++++----------
 3 files changed, 115 insertions(+), 10 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index 1205bd0..966449c 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -42,6 +42,7 @@
     "sphinx.ext.intersphinx",
     "sphinx_gallery.gen_gallery",
     "sphinx_design",
+    "matplotlib.sphinxext.plot_directive",
 ]
 
 # List of patterns, relative to source directory, that match files and
diff --git a/doc/unsupervised.rst b/doc/unsupervised.rst
index 0a37ac6..7b469ec 100644
--- a/doc/unsupervised.rst
+++ b/doc/unsupervised.rst
@@ -6,4 +6,91 @@
 Unsupervised feature selection
 ==============================
 
-We can use :class:`FastCan` to do unsupervised feature selection.
\ No newline at end of file
+We can use :class:`FastCan` to do unsupervised feature selection.
+The unsupervised application of :class:`FastCan` selects the features that
+maximize the sum of the squared canonical correlation coefficients (SSC) with
+the principal components (PCs) obtained from principal component analysis
+(PCA) of the feature matrix :math:`X`.
+
+    >>> from sklearn.decomposition import PCA
+    >>> from sklearn import datasets
+    >>> from fastcan import FastCan
+    >>> iris = datasets.load_iris()
+    >>> X = iris["data"]
+    >>> y = iris["target"]
+    >>> f_names = iris["feature_names"]
+    >>> t_names = iris["target_names"]
+    >>> pca = PCA(n_components=2)
+    >>> X_pcs = pca.fit_transform(X)
+    >>> selector = FastCan(n_features_to_select=2, verbose=0)
+    >>> _ = selector.fit(X, X_pcs)
+    >>> selector.indices_
+    array([2, 1], dtype=int32)
+
+.. note::
+    There is no guarantee that unsupervised :class:`FastCan` will select the
+    optimal subset of features, i.e. the subset with the highest SSC with the
+    PCs, because :class:`FastCan` selects features in a greedy manner, which
+    may lead to suboptimal results. See the following plots.
+
+.. plot::
+    :context: close-figs
+    :align: center
+
+    from itertools import combinations
+
+    import matplotlib.pyplot as plt
+    import numpy as np
+    from sklearn import datasets
+    from sklearn.cross_decomposition import CCA
+    from sklearn.decomposition import PCA
+
+    # The plot directive does not share the doctest namespace above,
+    # so the data are rebuilt here.
+    iris = datasets.load_iris()
+    X = iris["data"]
+    y = iris["target"]
+    f_names = iris["feature_names"]
+    t_names = iris["target_names"]
+    X_pcs = PCA(n_components=2).fit_transform(X)
+
+    def ssc(X, y):
+        """Sum of the squared canonical correlation coefficients.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Feature matrix.
+
+        y : array-like of shape (n_samples, n_outputs)
+            Target matrix.
+
+        Returns
+        -------
+        ssc : float
+            Sum of the squared canonical correlation coefficients.
+        """
+        n_components = min(X.shape[1], y.shape[1])
+        cca = CCA(n_components=n_components)
+        X_c, y_c = cca.fit_transform(X, y)
+        corrcoef = np.diagonal(
+            np.corrcoef(X_c, y_c, rowvar=False),
+            offset=n_components
+        )
+        return sum(corrcoef**2)
+
+    comb = list(combinations([0, 1, 2, 3], 2))
+    fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(8, 6), layout="constrained")
+    for i in range(2):
+        for j in range(3):
+            f1_idx = comb[i*3+j][0]
+            f2_idx = comb[i*3+j][1]
+            score = ssc(X[:, [f1_idx, f2_idx]], X_pcs)
+            scatter = axs[i, j].scatter(X[:, f1_idx], X[:, f2_idx], c=y)
+            axs[i, j].set(xlabel=f_names[f1_idx], ylabel=f_names[f2_idx])
+            axs[i, j].set_title(f"SSC: {score:.3f}")
+    # Highlight the panel showing the greedily selected feature pair.
+    for spine in axs[1, 0].spines.values():
+        spine.set_edgecolor("red")
+    _ = axs[1, 2].legend(scatter.legend_elements()[0], t_names, loc="lower right")
+
diff --git a/examples/plot_speed.py b/examples/plot_speed.py
index 191e7da..e168c1c 100644
--- a/examples/plot_speed.py
+++ b/examples/plot_speed.py
@@ -39,6 +39,31 @@
 from sklearn.cross_decomposition import CCA
 
 
+def ssc(X, y):
+    """Sum of the squared canonical correlation coefficients.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Feature matrix.
+
+    y : array-like of shape (n_samples, n_outputs)
+        Target matrix.
+
+    Returns
+    -------
+    ssc : float
+        Sum of the squared canonical correlation coefficients.
+    """
+    n_components = min(X.shape[1], y.shape[1])
+    cca = CCA(n_components=n_components)
+    X_c, y_c = cca.fit_transform(X, y)
+    corrcoef = np.diagonal(
+        np.corrcoef(X_c, y_c, rowvar=False),
+        offset=n_components
+    )
+    return sum(corrcoef**2)
+
 
 def baseline(X, y, t):
     """Baseline method using CCA from sklearn.
@@ -64,24 +89,16 @@ def baseline(X, y, t):
         the scores is corresponding to the feature selection process.
     """
     n_samples, n_features = X.shape
-    n_targets = y.shape[1]
     mask = np.zeros(n_features, dtype=bool)
     r2 = np.zeros(n_features, dtype=float)
     indices = np.zeros(t, dtype=int)
     scores = np.zeros(t, dtype=float)
     X_selected = np.zeros((n_samples, 0), dtype=float)
     for i in range(t):
-        n_components = min(i+1, n_targets)
-        cca = CCA(n_components=n_components)
         for j in range(n_features):
             if not mask[j]:
                 X_candidate = np.column_stack((X_selected, X[:, j]))
-                X_c, y_c = cca.fit_transform(X_candidate, y)
-                corrcoef = np.diagonal(
-                    np.corrcoef(X_c, y_c, rowvar=False),
-                    offset=n_components
-                )
-                r2[j] = sum(corrcoef**2)
+                r2[j] = ssc(X_candidate, y)
         d = np.argmax(r2)
         indices[i] = d
         scores[i] = r2[d]
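
As a sanity check on the note about greedy suboptimality, the following minimal sketch (not part of the patch) compares the greedy FastCan selection against an exhaustive search over all two-feature subsets of the iris data, reusing the same SSC computation the patch introduces. It assumes only the FastCan API shown in the doctest above (n_features_to_select, verbose, indices_); everything else is standard scikit-learn and NumPy, and exact scores may vary slightly across library versions.

# Sketch (not part of the patch): greedy vs. exhaustive feature selection.
from itertools import combinations

import numpy as np
from sklearn import datasets
from sklearn.cross_decomposition import CCA
from sklearn.decomposition import PCA

from fastcan import FastCan


def ssc(X, y):
    """Sum of the squared canonical correlation coefficients."""
    n_components = min(X.shape[1], y.shape[1])
    X_c, y_c = CCA(n_components=n_components).fit_transform(X, y)
    corrcoef = np.diagonal(np.corrcoef(X_c, y_c, rowvar=False), offset=n_components)
    return sum(corrcoef**2)


X = datasets.load_iris()["data"]
X_pcs = PCA(n_components=2).fit_transform(X)

# Greedy selection, as in the patched documentation.
greedy = FastCan(n_features_to_select=2, verbose=0).fit(X, X_pcs).indices_

# Exhaustive search: iris has only C(4, 2) = 6 candidate pairs.
best = max(combinations(range(X.shape[1]), 2), key=lambda c: ssc(X[:, list(c)], X_pcs))
print("greedy:", sorted(greedy), "exhaustive:", sorted(best))

Because iris has only six candidate pairs, the exhaustive optimum is cheap to compute here, which makes the greedy-versus-optimal comparison in the plots concrete; for larger feature sets the exhaustive search grows combinatorially, which is exactly why the greedy strategy is used.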