-
Notifications
You must be signed in to change notification settings - Fork 89
/
Copy pathfeature_imp.py
73 lines (65 loc) · 3.44 KB
/
feature_imp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from cv import PurgedKFold, cvScore
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
def featImpMDI(fit, featNames):
    """Return in-sample mean-decrease-impurity importance per feature.

    Parameters
    ----------
    fit : fitted ensemble exposing ``estimators_``; every member must expose
        ``feature_importances_``.
    featNames : column labels, one per feature, in the estimators' order.

    Returns
    -------
    pandas.DataFrame indexed by feature name with columns ``'mean'`` and
    ``'std'`` (standard deviation scaled by ``n_estimators ** -0.5``), both
    divided by the sum of the means so that the ``'mean'`` column adds to 1.
    """
    # One row per estimator, one column per feature.
    per_tree = pd.DataFrame.from_dict(
        dict(enumerate(est.feature_importances_ for est in fit.estimators_)),
        orient='index',
    )
    per_tree.columns = featNames
    # Exact zeros are excluded from the averages; the original note says this
    # relies on the ensemble being trained with max_features = 1.
    per_tree = per_tree.replace(0, np.nan)

    n_trees = per_tree.shape[0]
    avg = per_tree.mean()
    std_err = per_tree.std() * n_trees ** -0.5
    imp = pd.concat({'mean': avg, 'std': std_err}, axis=1)
    return imp / avg.sum()
def featImpMDA(clf, X, y, cv, sample_weight, t1, pctEmbargo, scoring='neg_log_loss'):
    """Return out-of-sample permutation (mean-decrease-accuracy) importance.

    For every fold produced by ``PurgedKFold`` the classifier is fitted on the
    training rows and scored on the test rows; each feature column of the test
    rows is then permuted in turn and the model is re-scored. The importance
    of a feature is the score lost through the permutation, relative to the
    permuted score's distance from the best attainable value.

    Parameters
    ----------
    clf : classifier with ``fit(X, y, sample_weight)``, ``predict``,
        ``predict_proba`` and ``classes_``.
    X : pandas.DataFrame of features.
    y : pandas.Series of labels, positionally aligned with ``X``.
    cv : number of splits handed to ``PurgedKFold``.
    sample_weight : pandas.Series of weights, positionally aligned with ``X``.
    t1, pctEmbargo : forwarded unchanged to ``PurgedKFold``.
    scoring : ``'neg_log_loss'`` (default) or ``'accuracy'``.

    Returns
    -------
    (imp, score) where ``imp`` is a DataFrame indexed by feature with columns
    ``'mean'`` and ``'std'`` (standard deviation scaled by ``n_folds ** -0.5``)
    and ``score`` is the mean un-permuted fold score.

    Raises
    ------
    ValueError
        If ``scoring`` is not one of the two supported methods.
    """
    if scoring not in ['neg_log_loss', 'accuracy']:
        raise ValueError('wrong scoring method')
    from sklearn.metrics import log_loss, accuracy_score

    def _score(model, features, target, weights):
        # Score a fitted model on one test fold with the requested metric.
        if scoring == 'neg_log_loss':
            prob = model.predict_proba(features)
            return -log_loss(target, prob, sample_weight=weights, labels=clf.classes_)
        pred = model.predict(features)
        return accuracy_score(target, pred, sample_weight=weights)

    cvGen = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=pctEmbargo)
    # Explicit float dtype: dtype-less empty containers default to object,
    # which makes the mean/std aggregation below unreliable.
    scr0 = pd.Series(dtype=float)
    scr1 = pd.DataFrame(columns=X.columns, dtype=float)
    for i, (train, test) in enumerate(cvGen.split(X=X)):
        X0, y0, w0 = X.iloc[train, :], y.iloc[train], sample_weight.iloc[train]
        X1, y1, w1 = X.iloc[test, :], y.iloc[test], sample_weight.iloc[test]
        fit = clf.fit(X=X0, y=y0, sample_weight=w0.values)
        scr0.loc[i] = _score(fit, X1, y1, w1.values)
        for j in X.columns:
            X1_ = X1.copy(deep=True)
            # Permute a single column by assigning a shuffled copy back.
            # Shuffling ``X1_[j].values`` in place only works when that array
            # is a writable view of the frame, which is not guaranteed.
            X1_[j] = np.random.permutation(X1_[j].values)
            scr1.loc[i, j] = _score(fit, X1_, y1, w1.values)

    # Score lost by permuting each feature: baseline minus permuted score.
    imp = (-scr1).add(scr0, axis=0)
    if scoring == 'neg_log_loss':
        # Best attainable negative log-loss is 0.
        imp = imp / -scr1
    else:
        # Best attainable accuracy is 1.
        imp = imp / (1.0 - scr1)
    imp = pd.concat({'mean': imp.mean(), 'std': imp.std() * imp.shape[0] ** -0.5}, axis=1)
    return imp, scr0.mean()
def auxFeatImpSFI(featNames, clf, trnsX, cont, scoring, cvGen):
    """Return single-feature importance: CV score of ``clf`` on each feature alone.

    Parameters
    ----------
    featNames : iterable of column labels of ``trnsX`` to evaluate.
    clf : classifier forwarded to ``cvScore``.
    trnsX : pandas.DataFrame of features.
    cont : DataFrame providing the ``'bin'`` (labels) and ``'w'`` (sample
        weights) columns.
    scoring, cvGen : forwarded unchanged to ``cvScore``.

    Returns
    -------
    pandas.DataFrame indexed by feature name with float columns ``'mean'``
    (mean of the scores returned by ``cvScore``) and ``'std'`` (their standard
    deviation scaled by ``n_scores ** -0.5``).
    """
    # Explicit float dtype so the result is numeric rather than object-typed,
    # which is what a dtype-less empty frame filled cell by cell produces.
    imp = pd.DataFrame(columns=['mean', 'std'], dtype=float)
    for featName in featNames:
        # Double brackets keep the single feature as a one-column DataFrame.
        df0 = cvScore(clf, X=trnsX[[featName]], y=cont['bin'], sample_weight=cont['w'],
                      scoring=scoring, cvGen=cvGen)
        imp.loc[featName, 'mean'] = df0.mean()
        imp.loc[featName, 'std'] = df0.std() * df0.shape[0] ** -0.5
    return imp
def getTestData(n_features=40, n_informative=10, n_redundant=10, n_samples=10000):
    """Generate a random, time-indexed dataset for a classification problem.

    Parameters
    ----------
    n_features : total number of feature columns.
    n_informative : number of informative features (labelled ``I_*``).
    n_redundant : number of redundant features (labelled ``R_*``).
    n_samples : number of observations.

    Returns
    -------
    (trnsX, cont) where ``trnsX`` is the feature DataFrame and ``cont`` is a
    DataFrame with columns ``'bin'`` (class label), ``'w'`` (equal weights
    ``1 / n_samples``) and ``'t1'`` (each row's own timestamp). Both share a
    one-minute ``DatetimeIndex`` ending at the time of the call.
    """
    trnsX, cont = make_classification(n_samples=n_samples, n_features=n_features,
                                      n_informative=n_informative, n_redundant=n_redundant,
                                      random_state=0, shuffle=False)
    # pd.date_range replaces the DatetimeIndex(periods=..., end=...) constructor
    # form, and pd.Timestamp.today() replaces pd.datetime; both older spellings
    # are gone from current pandas.
    idx = pd.date_range(end=pd.Timestamp.today(), periods=n_samples,
                        freq=pd.tseries.offsets.Minute())
    trnsX = pd.DataFrame(trnsX, index=idx)
    cont = pd.Series(cont, index=idx).to_frame('bin')
    # Columns are labelled informative, then redundant, then noise; this relies
    # on shuffle=False keeping the generator's column order.
    names = ['I_%s' % i for i in range(n_informative)] + ['R_%s' % i for i in range(n_redundant)]
    names += ['N_%s' % i for i in range(n_features - len(names))]
    trnsX.columns = names
    cont['w'] = 1.0 / cont.shape[0]
    cont['t1'] = pd.Series(cont.index, index=cont.index)
    return trnsX, cont