-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
84 lines (65 loc) · 3.15 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import random
import numpy as np
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from data_preparation import scale_standardize_data
# Cost/regularization thresholds swept when deciding whether to reject an
# explanation -- exact semantics depend on the caller; TODO confirm.
reject_thresholds = [0.001, 0.005, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6, 1, 2, 3, 4, 5, 7, 10, 100, 1000]
# Hyper-parameter grid for k-nearest-neighbors model selection.
knn_parameters = {'n_neighbors': [3, 5, 7, 10, 15]}
# Hyper-parameter grid for random forests; fixed random_state keeps runs reproducible.
# NOTE(review): 'auto' for max_features was deprecated/removed in newer scikit-learn
# (>= 1.1 deprecation, removed in 1.3) -- confirm the pinned sklearn version accepts it.
random_forest_parameters = {
    'n_estimators': [20, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy'],
    'random_state': [444]
}
# Number of cross-validation folds.
n_folds = 5
# Magnitude below which a feature difference is treated as zero
# (recovery evaluation vs. sparsity evaluation).
non_zero_threshold = 1e-5
non_zero_threshold_sparsity = 1e-5
def load_data(data_desc, data_folder="data/", scaling=True):
    """Load one of the benchmark data sets.

    Parameters
    ----------
    data_desc : str
        One of "breastcancer", "wine", "flip", "t21".
    data_folder : str
        Folder containing the .npz files for the "flip"/"t21" data sets.
    scaling : bool
        If True, features are standardized (sklearn data sets are passed
        through `scale_standardize_data`; npz data sets load the pre-scaled
        file instead of the "*_notscaled" variant).

    Returns
    -------
    tuple of np.ndarray
        Feature matrix X and label vector y.

    Raises
    ------
    ValueError
        If `data_desc` does not name a known data set.
    """
    if data_desc == "breastcancer":
        X, y = load_breast_cancer(return_X_y=True)
    elif data_desc == "wine":
        X, y = load_wine(return_X_y=True)
    elif data_desc in ("flip", "t21"):
        # Load only the file that is actually needed -- the original loaded the
        # scaled file unconditionally and then discarded it when scaling=False.
        fname = f"{data_desc}.npz" if scaling else f"{data_desc}_notscaled.npz"
        data = np.load(os.path.join(data_folder, fname))
        return data['X'], data['y']
    else:
        raise ValueError(f"Unknown data set {data_desc}")
    if scaling:  # truthiness check instead of non-idiomatic `is True`
        X = scale_standardize_data(X)
    return X, y
def evaluate_sparsity(xcf, x_orig):
    """Sparsity of a counterfactual: fraction of features that differ from the original.

    Delegates to `evaluate_sparsity_ex` on the raw difference vector.
    Smaller values are better.
    """
    delta = xcf - x_orig
    return evaluate_sparsity_ex(delta)
def evaluate_sparsity_ex(x, threshold=None):
    """Fraction of entries of `x` whose magnitude exceeds the non-zero threshold.

    Smaller values indicate a sparser (better) explanation.

    Parameters
    ----------
    x : np.ndarray
        1-d array of (signed) feature differences.
    threshold : float, optional
        Magnitude above which an entry counts as non-zero. Defaults to the
        module-level `non_zero_threshold_sparsity`.

    Returns
    -------
    float
        Number of non-zero entries divided by len(x).
    """
    if threshold is None:
        threshold = non_zero_threshold_sparsity
    # Vectorized count. The original passed a generator expression to np.sum,
    # which is deprecated in NumPy and fails on recent versions.
    return np.sum(np.abs(x) > threshold) / len(x)
def select_random_feature_subset(n_features, size=0.3):
    """Draw a random subset of feature indices.

    Returns ``int(n_features * size)`` distinct indices from
    ``0 .. n_features - 1``, sampled uniformly without replacement.
    """
    subset_size = int(n_features * size)
    candidate_indices = range(n_features)
    return random.sample(candidate_indices, subset_size)
def apply_perturbation(X, features_idx, noise_size=1.):
    """Add zero-mean Gaussian noise to selected columns of `X` -- IN PLACE.

    Parameters
    ----------
    X : np.ndarray
        Data matrix; its selected columns are modified in place.
    features_idx : sequence of int
        Column indices to perturb.
    noise_size : float
        Standard deviation of the Gaussian noise.

    Returns
    -------
    np.ndarray
        The same array object `X`, after perturbation.
    """
    n_samples = X.shape[0]
    noise = np.random.normal(scale=noise_size, size=(n_samples, len(features_idx)))
    X[:, features_idx] += noise
    return X
def evaluate_perturbed_features_recovery(xcf, x_orig, perturbed_features_idx):
    """Recall of perturbed-feature recovery, based on |xcf - x_orig|."""
    abs_diff = np.abs(xcf - x_orig)
    return evaluate_perturbed_features_recovery_ex(abs_diff, perturbed_features_idx)
def evaluate_perturbed_features_recovery_ex(x, perturbed_features_idx, threshold=None):
    """Recall-style score for recovering the perturbed features from `x`.

    A feature is "predicted perturbed" when its entry in `x` exceeds the
    non-zero threshold (callers pass |xcf - x_orig|).

    Parameters
    ----------
    x : np.ndarray
        1-d array of (non-negative) feature-difference magnitudes.
    perturbed_features_idx : sequence of int
        Ground-truth indices of the features that were actually perturbed.
    threshold : float, optional
        Decision threshold; defaults to the module-level `non_zero_threshold`.

    Returns
    -------
    float
        tp / (tp + fn + eps), where tp is the fraction of predicted features
        that were truly perturbed and fn the fraction of non-predicted
        features that were perturbed; 0 when no feature is predicted.
    """
    if threshold is None:
        threshold = non_zero_threshold
    eps = 1e-7

    # Flat integer indices of features predicted as perturbed. The original
    # used np.argwhere, which yields 1-element arrays and relied on fragile
    # array-in-list membership semantics.
    predicted = np.flatnonzero(x > threshold)
    if len(predicted) == 0:
        # Guard BEFORE computing rates: the original divided by len(indices)
        # first, producing 0/0 NaNs and RuntimeWarnings, then returned 0 anyway.
        return 0

    not_predicted = [i for i in range(x.shape[0]) if i not in predicted]

    tp = sum(int(i in perturbed_features_idx) for i in predicted) / len(predicted)
    fn = sum(int(i in perturbed_features_idx) for i in not_predicted) / (len(not_predicted) + eps)
    # NOTE(review): fp and tn were computed in the original but never used
    # (dead code with redundant filtering passes); they have been dropped.
    return tp / (tp + fn + eps)  # recall