-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
bdfdab7
commit 11c7c79
Showing
5 changed files
with
250 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
""" | ||
=================================================== | ||
Feature selection performance on redundant features | ||
=================================================== | ||
In this examples, we will compare the performance of feature selectors on the | ||
datasets, which contain redundant features. | ||
Here four types of features should be distinguished: | ||
* Unuseful features: the features do not contribute to the target | ||
* Dependent informative features: the features contribute to the target and form | ||
the redundant features | ||
* Redundant features: the features are constructed by linear transformation of | ||
dependent informative features | ||
* Independent informative features: the features contribute to the target but | ||
does not contribute to the redundant features. | ||
.. note:: | ||
If we do not distinguish dependent and independent informative features and use | ||
informative features to form both the target and the redundant features. The task | ||
will be much easier. | ||
""" | ||
|
||
# Authors: Sikai Zhang | ||
# SPDX-License-Identifier: MIT | ||
|
||
# %% | ||
# Define dataset generator | ||
# ------------------------ | ||
|
||
import numpy as np | ||
|
||
|
||
def make_redundant( | ||
n_samples, | ||
n_features, | ||
dep_info_ids, | ||
indep_info_ids, | ||
redundant_ids, | ||
random_seed, | ||
): | ||
"""Make a dataset with linearly redundant features. | ||
Parameters | ||
---------- | ||
n_samples : int | ||
The number of samples. | ||
n_features : int | ||
The number of features. | ||
dep_info_ids : list[int] | ||
The indices of dependent informative features. | ||
indep_info_ids : list[int] | ||
The indices of independent informative features. | ||
redundant_ids : list[int] | ||
The indices of redundant features. | ||
random_seed : int | ||
Random seed. | ||
Returns | ||
------- | ||
X : array-like of shape (n_samples, n_features) | ||
Feature matrix. | ||
y : array-like of shape (n_samples,) | ||
Target vector. | ||
""" | ||
rng = np.random.default_rng(random_seed) | ||
info_ids = dep_info_ids + indep_info_ids | ||
n_dep_info = len(dep_info_ids) | ||
n_info = len(info_ids) | ||
n_redundant = len(redundant_ids) | ||
informative_coef = rng.random(n_info) | ||
redundant_coef = rng.random((n_dep_info, n_redundant)) | ||
|
||
X = rng.random((n_samples, n_features)) | ||
y = np.dot(X[:, info_ids], informative_coef) | ||
|
||
X[:, redundant_ids] = X[:, dep_info_ids]@redundant_coef | ||
return X, y | ||
|
||
# %% | ||
# Define score function | ||
# --------------------- | ||
# This function is used to compute the number of correct features missed by selectors. | ||
# | ||
# * For independent informative features, selectors should select all of them | ||
# * For dependent informative features, selectors only need to select any | ||
# ``n_dep_info``-combination of the set ``dep_info_ids`` + ``redundant_ids``. That | ||
# means if the indices of dependent informative features are :math:`[0, 1]` and the | ||
# indices of the redundant features are :math:`[5]`, then the correctly selected | ||
# indices can be any of :math:`[0, 1]`, :math:`[0, 5]`, and :math:`[1, 5]`. | ||
|
||
def get_n_missed( | ||
dep_info_ids, | ||
indep_info_ids, | ||
redundant_ids, | ||
selected_ids | ||
): | ||
"""Get the number of features miss selected.""" | ||
n_redundant = len(redundant_ids) | ||
n_missed_indep = len(np.setdiff1d(indep_info_ids, selected_ids)) | ||
n_missed_dep = len( | ||
np.setdiff1d(dep_info_ids+redundant_ids, selected_ids) | ||
)-n_redundant | ||
if n_missed_dep < 0: | ||
n_missed_dep = 0 | ||
return n_missed_indep+n_missed_dep | ||
|
||
# %% | ||
# Prepare selectors | ||
# ----------------- | ||
# We compare :class:`fastcan.FastCan` with eight selectors of :mod:`sklearn`, which | ||
# include the Select From a Model (SFM) algorithm, the Recursive Feature Elimination | ||
# (RFE) algorithm, the Sequential Feature Selection (SFS) algorithm, and Select K Best | ||
# (SKB) algorithm. | ||
# The list of the selectors are given below: | ||
# | ||
# * fastcan: :class:`fastcan.FastCan` selector | ||
# * skb_reg: is the SKB algorithm ranking features with ANOVA (analysis of variance) | ||
# F-statistic and p-values | ||
# * skb_mir: is the SKB algorithm ranking features mutual information for regression | ||
# * sfm_lsvr: the SFM algorithm with a linear support vector regressor | ||
# * sfm_rfr: the SFM algorithm with a random forest regressor | ||
# * rfe_lsvr: is the RFE algorithm with a linear support vector regressor | ||
# * rfe_rfr: is the RFE algorithm with a random forest regressor | ||
# * sfs_lsvr: is the forward SFS algorithm with a linear support vector regressor | ||
# * sfs_rfr: is the forward SFS algorithm with a random forest regressor | ||
|
||
|
||
from sklearn.ensemble import RandomForestRegressor | ||
from sklearn.feature_selection import ( | ||
RFE, | ||
SelectFromModel, | ||
SelectKBest, | ||
SequentialFeatureSelector, | ||
f_regression, | ||
mutual_info_regression, | ||
) | ||
from sklearn.svm import LinearSVR | ||
|
||
from fastcan import FastCan | ||
|
||
lsvr = LinearSVR(C = 1, dual="auto", max_iter=100000, random_state=0) | ||
rfr = RandomForestRegressor(n_estimators = 10, random_state=0) | ||
|
||
|
||
N_SAMPLES = 1000 | ||
N_FEATURES = 10 | ||
DEP_INFO_IDS = [2, 4, 7, 9] | ||
INDEP_INFO_IDS = [0, 1, 6] | ||
REDUNDANT_IDS = [5, 8] | ||
N_SELECTED = len(DEP_INFO_IDS+INDEP_INFO_IDS) | ||
N_REPEATED = 10 | ||
|
||
selector_dict = { | ||
"fastcan": FastCan(N_SELECTED, verbose=0), | ||
"skb_reg": SelectKBest(f_regression, k=N_SELECTED), | ||
"skb_mir": SelectKBest(mutual_info_regression, k=N_SELECTED), | ||
"sfm_lsvr": SelectFromModel(lsvr, max_features=N_SELECTED, threshold=-np.inf), | ||
"sfm_rfr": SelectFromModel(rfr, max_features=N_SELECTED, threshold=-np.inf), | ||
"rfe_lsvr": RFE(lsvr, n_features_to_select=N_SELECTED, step=1), | ||
"rfe_rfr": RFE(rfr, n_features_to_select=N_SELECTED, step=1), | ||
"sfs_lsvr": SequentialFeatureSelector(lsvr, n_features_to_select=N_SELECTED, cv=2), | ||
"sfs_rfr": SequentialFeatureSelector(rfr, n_features_to_select=N_SELECTED, cv=2), | ||
} | ||
|
||
# %% | ||
# Run test | ||
# -------- | ||
|
||
N_SELECTORS = len(selector_dict) | ||
n_missed = np.zeros((N_REPEATED, N_SELECTORS), dtype=int) | ||
|
||
for i in range(N_REPEATED): | ||
X, y = make_redundant( | ||
n_samples=N_SAMPLES, | ||
n_features=N_FEATURES, | ||
dep_info_ids=DEP_INFO_IDS, | ||
indep_info_ids=INDEP_INFO_IDS, | ||
redundant_ids=REDUNDANT_IDS, | ||
random_seed=i, | ||
) | ||
for j, selector in enumerate(selector_dict.values()): | ||
result_ids = selector.fit(X, y).get_support(indices=True) | ||
n_missed[i, j] = get_n_missed( | ||
dep_info_ids=DEP_INFO_IDS, | ||
indep_info_ids=INDEP_INFO_IDS, | ||
redundant_ids=REDUNDANT_IDS, | ||
selected_ids=result_ids, | ||
) | ||
|
||
# %% | ||
# Plot results | ||
# ------------ | ||
# :class:`fastcan.FastCan` correctly selects all informative features with zero missed | ||
# features. | ||
|
||
import matplotlib.pyplot as plt | ||
|
||
fig, ax = plt.subplots(figsize = (8, 5)) | ||
rects = ax.bar(selector_dict.keys(), n_missed.sum(0), width = 0.5) | ||
ax.bar_label(rects, n_missed.sum(0), padding=3) | ||
plt.xlabel("Selector") | ||
plt.ylabel("No. of missed features") | ||
plt.title("Performance of selectors on datasets with linear redundant features") | ||
plt.show() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters