DOC unsupervised init
MatthewSZhang committed Sep 24, 2024
1 parent e0961f8 commit 1318faa
Showing 3 changed files with 99 additions and 10 deletions.
1 change: 1 addition & 0 deletions doc/conf.py
@@ -42,6 +42,7 @@
"sphinx.ext.intersphinx",
"sphinx_gallery.gen_gallery",
"sphinx_design",
"matplotlib.sphinxext.plot_directive",
]

# List of patterns, relative to source directory, that match files and
74 changes: 73 additions & 1 deletion doc/unsupervised.rst
@@ -6,4 +6,76 @@
Unsupervised feature selection
==============================

We can use :class:`FastCan` to do unsupervised feature selection.
The unsupervised application of :class:`FastCan` tries to select the features
that maximize the sum of the squared canonical correlation coefficients (SSC)
with the principal components (PCs) obtained from a principal component
analysis (PCA) of the feature matrix :math:`X`.
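
Concretely, if :math:`r_1, \ldots, r_d` are the canonical correlation
coefficients between a candidate feature subset and the first :math:`d` PCs,
the criterion being maximized is

.. math::

    \mathrm{SSC} = \sum_{i=1}^{d} r_i^2,

which is exactly what the ``ssc`` helper in the plot code below computes.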

>>> from sklearn.decomposition import PCA
>>> from sklearn import datasets
>>> from fastcan import FastCan
>>> iris = datasets.load_iris()
>>> X = iris["data"]
>>> y = iris["target"]
>>> f_names = iris["feature_names"]
>>> t_names = iris["target_names"]
>>> pca = PCA(n_components=2)
>>> X_pcs = pca.fit_transform(X)
>>> selector = FastCan(n_features_to_select=2, verbose=0)
>>> _ = selector.fit(X, X_pcs[:, :2])
>>> selector.indices_
array([2, 1], dtype=int32)
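
To quantify the selection, the SSC between the chosen features and the PCs can
be computed directly with scikit-learn's
:class:`~sklearn.cross_decomposition.CCA`. This is a minimal sketch of the same
computation that the ``ssc`` helper below wraps:

>>> import numpy as np
>>> from sklearn.cross_decomposition import CCA
>>> cca = CCA(n_components=2)
>>> X_c, y_c = cca.fit_transform(X[:, selector.indices_], X_pcs)
>>> r = np.diagonal(np.corrcoef(X_c, y_c, rowvar=False), offset=2)
>>> ssc_selected = float((r**2).sum())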

.. note::
    There is no guarantee that this unsupervised :class:`FastCan` will select
    the optimal subset of features, i.e. the one with the highest SSC with the
    PCs, because :class:`FastCan` selects features greedily, which may lead to
    suboptimal results. See the following plots.

.. plot::
    :context: close-figs
    :align: center

    from itertools import combinations

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn import datasets
    from sklearn.cross_decomposition import CCA
    from sklearn.decomposition import PCA

    # Re-create the data from the example above, so the plot code is
    # self-contained.
    iris = datasets.load_iris()
    X = iris["data"]
    y = iris["target"]
    f_names = iris["feature_names"]
    t_names = iris["target_names"]
    X_pcs = PCA(n_components=2).fit_transform(X)

    def ssc(X, y):
        """Sum of the squared canonical correlation coefficients.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples, n_outputs)
            Target matrix.

        Returns
        -------
        ssc : float
            Sum of the squared canonical correlation coefficients.
        """
        n_components = min(X.shape[1], y.shape[1])
        cca = CCA(n_components=n_components)
        X_c, y_c = cca.fit_transform(X, y)
        # The canonical correlations sit on the off-diagonal block of the
        # correlation matrix of the concatenated canonical variates.
        corrcoef = np.diagonal(
            np.corrcoef(X_c, y_c, rowvar=False),
            offset=n_components
        )
        return sum(corrcoef**2)

    # All two-feature subsets of the four iris features.
    comb = list(combinations([0, 1, 2, 3], 2))
    fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(8, 6), layout="constrained")
    for i in range(2):
        for j in range(3):
            f1_idx = comb[i*3+j][0]
            f2_idx = comb[i*3+j][1]
            score = ssc(X[:, [f1_idx, f2_idx]], X_pcs)
            scatter = axs[i, j].scatter(X[:, f1_idx], X[:, f2_idx], c=y)
            axs[i, j].set(xlabel=f_names[f1_idx], ylabel=f_names[f2_idx])
            axs[i, j].set_title(f"SSC: {score:.3f}")
    # Outline in red the subset found by the greedy selection above.
    for spine in axs[1, 0].spines.values():
        spine.set_edgecolor('red')
    _ = axs[1, 2].legend(scatter.legend_elements()[0], t_names, loc="lower right")
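
As the SSC values in the panel titles show, the greedy pick (outlined in red)
need not be the subset with the highest SSC. A brute-force check over all
two-feature subsets makes this concrete; the sketch below reuses ``ssc``,
``comb``, ``X``, and ``X_pcs`` from the plot code above::

    best_pair = max(comb, key=lambda idx: ssc(X[:, list(idx)], X_pcs))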

34 changes: 25 additions & 9 deletions examples/plot_speed.py
@@ -39,6 +39,30 @@

from sklearn.cross_decomposition import CCA

def ssc(X, y):
    """Sum of the squared canonical correlation coefficients.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples, n_outputs)
        Target matrix.

    Returns
    -------
    ssc : float
        Sum of the squared canonical correlation coefficients.
    """
    n_components = min(X.shape[1], y.shape[1])
    cca = CCA(n_components=n_components)
    X_c, y_c = cca.fit_transform(X, y)
    # The canonical correlations sit on the off-diagonal block of the
    # correlation matrix of the concatenated canonical variates.
    corrcoef = np.diagonal(
        np.corrcoef(X_c, y_c, rowvar=False),
        offset=n_components
    )
    return sum(corrcoef**2)
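
A quick sanity check of this helper (a hypothetical snippet, not part of the
commit): with a single, almost noise-free linear target, the SSC reduces to one
squared canonical correlation and should land close to 1.

    import numpy as np

    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(100, 3))
    # The target is a linear mix of the first two columns plus small noise.
    y_demo = X_demo @ np.array([[1.0], [0.5], [0.0]]) + rng.normal(scale=0.1, size=(100, 1))
    print(ssc(X_demo, y_demo))  # expected to be close to 1.0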


def baseline(X, y, t):
    """Baseline method using CCA from sklearn.
@@ -64,24 +88,16 @@ def baseline(X, y, t):
     the scores correspond to the feature selection process.
     """
     n_samples, n_features = X.shape
-    n_targets = y.shape[1]
     mask = np.zeros(n_features, dtype=bool)
     r2 = np.zeros(n_features, dtype=float)
     indices = np.zeros(t, dtype=int)
     scores = np.zeros(t, dtype=float)
     X_selected = np.zeros((n_samples, 0), dtype=float)
     for i in range(t):
-        n_components = min(i+1, n_targets)
-        cca = CCA(n_components=n_components)
         for j in range(n_features):
             if not mask[j]:
                 X_candidate = np.column_stack((X_selected, X[:, j]))
-                X_c, y_c = cca.fit_transform(X_candidate, y)
-                corrcoef = np.diagonal(
-                    np.corrcoef(X_c, y_c, rowvar=False),
-                    offset=n_components
-                )
-                r2[j] = sum(corrcoef**2)
+                r2[j] = ssc(X_candidate, y)
         d = np.argmax(r2)
         indices[i] = d
         scores[i] = r2[d]
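
``baseline`` is presumably benchmarked against FastCan in the rest of this
example script, which lies outside the diff. A hypothetical timing comparison
(all names here are illustrative) could look like:

    import time

    import numpy as np
    from fastcan import FastCan

    rng = np.random.default_rng(0)
    X_bench = rng.normal(size=(1000, 50))
    y_bench = rng.normal(size=(1000, 3))

    start = time.perf_counter()
    baseline(X_bench, y_bench, 5)
    print(f"baseline: {time.perf_counter() - start:.3f}s")

    start = time.perf_counter()
    FastCan(n_features_to_select=5, verbose=0).fit(X_bench, y_bench)
    print(f"fastcan:  {time.perf_counter() - start:.3f}s")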
