# E3_clf_real_selection.py
"""
E3, E4 - SelectKBest feature selection + classification + ANOVA F-test --- real-world streams
"""
import numpy as np
from sklearn import clone
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from tqdm import tqdm
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
# Seed NumPy's global RNG; the per-stream row shuffle below relies on it.
np.random.seed(1233)

# Seed shared by every base classifier that accepts ``random_state``.
_CLF_SEED = 11313

# Prototype classifiers; each one is cloned before being fitted on a fold.
base_clfs = [
    GaussianNB(),
    KNeighborsClassifier(),
    SVC(random_state=_CLF_SEED),
    DecisionTreeClassifier(random_state=_CLF_SEED),
    MLPClassifier(random_state=_CLF_SEED),
]
# Real-world data streams covered by the experiment. Only the number of
# entries is used below: results are loaded by stream index, not by path.
real_streams_full = [
    'data/real_streams/covtypeNorm-1-2vsAll-pruned.arff',
    'data/real_streams/electricity.npy',
    'data/real_streams/poker-lsn-1-2vsAll-pruned.arff',
    'data/real_streams/INSECTS-abrupt_imbalanced_norm.arff',
    'data/real_streams/INSECTS-gradual_imbalanced_norm.arff',
    'data/real_streams/INSECTS-incremental_imbalanced_norm.arff',
]
def sqspace(start, end, num):
    """Return integers spaced quadratically between ``start`` and ``end``.

    ``num`` evenly spaced points on [0, 1] are squared, scaled by
    ``end - start``, shifted by ``start`` and truncated to ``int``. The first
    point (the one that maps to ``start``) is then dropped.

    Args:
        start: Value the (dropped) first point maps to.
        end: Value the last point maps to before truncation.
        num: Number of points generated before the first one is dropped.

    Returns:
        1-D integer NumPy array with ``max(num - 1, 0)`` entries. Because of
        the truncation, neighbouring entries may repeat.
    """
    unit = np.linspace(0, 1, num)
    scaled = np.power(unit, 2) * (end - start) + start
    # astype(int) truncates towards zero; [1:] discards the point at ``start``.
    return scaled.astype(int)[1:]
# Feature-subset sizes (the ``k`` given to SelectKBest): the quadratic grid
# from sqspace(1, 118, 31) with its first entry dropped once more.
n_features = sqspace(1, 118, 31)[1:]

# Repeated stratified k-fold layout; n_splits * n_repeats scores are stored
# per (subset size, classifier) pair.
n_splits = 2
n_repeats = 5
# For every stream: run the ANOVA F-test on all features, then score each base
# classifier on the k best features for every k in n_features.
for f_id in range(len(real_streams_full)):
    # Per the original author's note the stored array is laid out as
    # (features + label, chunks); it is transposed so each row is one chunk.
    res_temp = np.load('results/combined_real_%i.npy' % f_id)
    print(res_temp.shape)  # features, chunks
    res_temp = res_temp.swapaxes(0, 1)

    # Shuffle the rows (reproducible through np.random.seed at the top).
    p = np.random.permutation(res_temp.shape[0])
    res_temp = res_temp[p]

    # The last column is the label; every preceding column is a feature.
    X = res_temp[:, :-1]
    y = res_temp[:, -1]
    # Non-finite feature values are replaced with 1.
    X[np.isnan(X)] = 1
    X[np.isinf(X)] = 1

    # ANOVA F statistic (column 0) and p-value (column 1) for every feature.
    # The table is sized from X itself so it always matches f_classif's
    # output instead of assuming max(n_features) columns.
    stat, val = f_classif(X, y)
    anova_res = np.zeros((X.shape[1], 2))
    anova_res[:, 0] = stat
    anova_res[:, 1] = val

    # Balanced accuracy indexed by (subset size, fold, classifier).
    clf_res = np.zeros((len(n_features), n_splits * n_repeats, len(base_clfs)))

    # The folds depend only on y and the fixed random_state, so they are the
    # same for every k and can be generated once per stream.
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=3242)
    folds = list(rskf.split(X, y))

    # The context manager closes the bar before the next stream starts.
    with tqdm(total=len(n_features) * n_splits * n_repeats * len(base_clfs)) as pbar:
        for n_id, n_f in enumerate(n_features):
            # Feature selection.
            # NOTE(review): the selector is fitted on all rows, including the
            # ones later used as test folds -- confirm this is intended.
            skb = SelectKBest(k=n_f)
            X_new = skb.fit_transform(X, y)

            for fold, (train, test) in enumerate(folds):
                for base_id, base_c in enumerate(base_clfs):
                    # Fit a fresh copy so no state leaks between folds.
                    clf = clone(base_c)
                    pred = clf.fit(X_new[train], y[train]).predict(X_new[test])
                    acc = balanced_accuracy_score(y[test], pred)
                    clf_res[n_id, fold, base_id] = acc
                    pbar.update(1)

    # Persist the results of this stream.
    np.save('results/clf_sel_real_%i.npy' % f_id, clf_res)
    np.save('results/anova_sel_real_%i.npy' % f_id, anova_res)