-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutilities.py
151 lines (129 loc) · 4.37 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from __future__ import print_function, division
from random import randint, uniform, choice, sample
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
import math
import numpy as np
# Offsets counted from the END of the metrics list that get_score builds
# ([tp, fp, tn, fn, pre, rec, spec, fpr, npv, acc, f1]); they are used there
# as negative indices, e.g. all_metrics[-ACC].
PRE = 7
REC = 6
SPEC = 5
FPR = 4
NPV = 3
ACC = 2
F1 = 1
def _randint(a=0,b=0):
if a < b:
return randint(a,b)
else:
return randint(b,a)
def _randchoice(a=[]):
return choice(a)
def _randuniform(a=0.0,b=0.0):
return uniform(a,b)
def _randsample(a=[],b=1):
return sample(a,b)
def unpack(l):
    """Flatten *l* by one level and return the result as a new list.

    Only elements whose type is exactly ``list`` are expanded; every other
    element (tuples, strings, list subclasses, ...) is kept as a single item.
    """
    flat = []
    for item in l:
        if type(item) is list:
            flat.extend(item)
        else:
            flat.append(item)
    return flat
def get_performance(tn, fp, fn, tp):
    """Derive classification metrics from the four confusion-matrix counts.

    Returns ``[pre, rec, spec, fpr, npv, acc, f1]``, each rounded to three
    decimals. Any ratio whose denominator is zero is reported as 0; ``fpr``
    is always ``1 - spec``, so it is 1 when ``spec`` falls back to 0.
    """
    def ratio(numerator, denominator):
        # A zero denominator yields 0 instead of raising.
        if denominator != 0:
            return 1.0 * numerator / denominator
        return 0

    specificity = ratio(tn, tn + fp)
    metrics = [
        ratio(tp, tp + fp),                    # precision
        ratio(tp, tp + fn),                    # recall
        specificity,
        1 - specificity,                       # false-positive rate
        ratio(tn, tn + fn),                    # negative predictive value
        ratio(tp + tn, tp + tn + fp + fn),     # accuracy
        ratio(2.0 * tp, 2.0 * tp + fp + fn),   # F1
    ]
    return [round(value, 3) for value in metrics]
def get_score(criteria, prediction, test_labels, data):
    """Score a binary prediction under the criterion named by *criteria*.

    The confusion matrix is built for labels ``[0, 1]`` and turned into
    metrics by ``get_performance``. Recognised criteria are "Accuracy"
    (negated accuracy), "d2h", "Pf_Auc", "popt", "popt20" and "Gini"; any
    other value falls through to the information-gain branch. *data* is
    only read by the "popt" and "popt20" branches. The score is returned
    rounded to three decimals.
    """
    tn, fp, fn, tp = confusion_matrix(
        test_labels, prediction, labels=[0, 1]).ravel()
    pre, rec, spec, fpr, npv, acc, f1 = get_performance(tn, fp, fn, tp)
    all_metrics = [tp, fp, tn, fn, pre, rec, spec, fpr, npv, acc, f1]

    if criteria == "Accuracy":
        # Negated accuracy.
        score = -all_metrics[-ACC]
    elif criteria == "d2h":
        # Distance from the point (fpr=0, rec=1), scaled by sqrt(2).
        false_alarm = all_metrics[-FPR]
        miss_rate = 1 - all_metrics[-REC]
        score = math.sqrt(false_alarm ** 2 + miss_rate ** 2) / math.sqrt(2)
    elif criteria == "Pf_Auc":
        score = auc_measure(prediction, test_labels)
    elif criteria == "popt":
        score = get_auc(data)
    elif criteria == "popt20":
        score = get_popt20(data)
    elif criteria == "Gini":
        pos_rate = all_metrics[-PRE]        # precision
        neg_rate = 1 - all_metrics[-NPV]    # 1 - negative predictive value
        score = 1 - neg_rate ** 2 - pos_rate ** 2
    else:
        # Information gain (also the fallback for unrecognised criteria).
        actual_pos = all_metrics[0] + all_metrics[3]    # tp + fn
        actual_neg = all_metrics[1] + all_metrics[2]    # fp + tn
        total = actual_pos + actual_neg
        prior = 1.0 * actual_pos / total if total > 0 else 0
        pos_rate = all_metrics[-PRE]
        neg_rate = 1 - all_metrics[-NPV]

        def term(x):
            # -x * log2(x), with the x == 0 case defined as 0.
            return -x * np.log2(x) if x != 0 else 0

        before, neg_term, pos_term = term(prior), term(neg_rate), term(pos_rate)
        after = prior * pos_term + (1 - prior) * neg_term
        score = -(before - after)   # negated gain
    return round(score, 3)
def auc_measure(prediction, test_labels):
    """Return the area under the ROC curve of *prediction* vs *test_labels*.

    The curve is computed by ``roc_curve`` with label 1 as the positive
    class and integrated with ``auc``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(
        test_labels, prediction, pos_label=1)
    return auc(false_pos_rate, true_pos_rate)
def subtotal(x):
    """Return the running (cumulative) sums of the items of *x* as a list."""
    running = 0
    totals = []
    for value in x:
        running = running + value
        totals.append(running)
    return totals
def get_recall(true):
    """Return the cumulative recall after each item of *true*.

    Element ``k`` of the result is the fraction of all items equal to 1
    that occur within the first ``k + 1`` items. When *true* contains no
    item equal to 1, every entry is 0.0.

    *true* may be any iterable (list, array, iterator, generator); it is
    materialised once so it can be both counted and walked in positional
    order.
    """
    labels = list(true)
    total_true = float(sum(1 for label in labels if label == 1))
    hit = 0.0
    recall = []
    for label in labels:
        if label == 1:
            hit += 1
        recall.append(hit / total_true if total_true else 0.0)
    return recall
def get_auc(data):
    """Return the area under the recall-vs-cumulative-LOC-share curve of *data*.

    *data* is indexed by the columns ``'loc'`` and ``'bug'`` and is used in
    its current row order. The x axis is the running sum of each row's share
    of total ``loc``; the y axis is the cumulative recall of ``bug == 1``.
    The result is rounded to three decimals.

    Returns 0 when *data* has exactly one row, or when ``auc`` cannot be
    computed for the curve.

    NOTE(review): the previous docstring read "The smaller the better";
    confirm that against the callers, since this returns a plain AUC.
    """
    if len(data) == 1:
        return 0
    x_sum = float(sum(data['loc']))
    x = data['loc'].apply(lambda t: t / x_sum)
    xx = subtotal(x)
    yy = get_recall(data['bug'].values)
    try:
        return round(auc(xx, yy), 3)
    except Exception:
        # Best-effort fallback, but no longer swallows KeyboardInterrupt or
        # SystemExit the way the bare ``except:`` did.
        return 0
def _effort_prefix(loc_share, labels, cutoff=0.2):
    """Return (effort, recall) points whose cumulative effort is <= *cutoff*."""
    effort = subtotal(loc_share)
    recall = get_recall(labels)
    kept = [e for e in effort if e <= cutoff]
    # Assumes the cumulative effort is non-decreasing, so the kept points
    # form a prefix and the recall list can be cut to the same length.
    return kept, recall[:len(kept)]


def get_popt20(data):
    """Return the normalised Popt score restricted to the first 20% of LOC.

    *data* is indexed by the columns ``'bug'``, ``'loc'`` and
    ``'prediction'``. It is sorted IN PLACE, first by (bug desc, loc asc)
    and finally by (prediction desc, loc asc), so the caller's frame is left
    in the latter order.

    Three truncated AUCs (cumulative LOC share <= 0.2) are computed:
    ``s_opt`` for the bug-sorted order, ``s_wst`` for that order reversed,
    and ``s_m`` for the prediction-sorted order. The result is
    ``(s_m - s_wst) / (s_opt - s_wst)`` rounded to three decimals.

    Fallbacks: ``s_wst`` is taken as 0 if its AUC cannot be computed; the
    function returns 0 if ``s_m`` cannot be computed or if
    ``s_opt == s_wst`` (the normalisation would divide by zero). A failure
    while computing ``s_opt`` is not caught and propagates to the caller.
    """
    data.sort_values(by=["bug", "loc"], ascending=[False, True], inplace=True)
    x_sum = float(sum(data['loc']))
    share = data['loc'].apply(lambda t: t / x_sum)

    # AUC of the optimal ordering.
    effort, recall = _effort_prefix(share, data['bug'].values)
    s_opt = round(auc(effort, recall), 3)

    # AUC of the worst ordering (the optimal ordering reversed).
    effort, recall = _effort_prefix(share[::-1], data['bug'][::-1].values)
    try:
        s_wst = round(auc(effort, recall), 3)
    except Exception:
        s_wst = 0

    # AUC of the ordering induced by the model's predictions.
    data.sort_values(by=["prediction", "loc"], ascending=[False, True],
                     inplace=True)
    share = data['loc'].apply(lambda t: t / x_sum)
    effort, recall = _effort_prefix(share, data['bug'].values)
    try:
        s_m = round(auc(effort, recall), 3)
    except Exception:
        return 0

    spread = s_opt - s_wst
    if spread == 0:
        # Optimal and worst curves coincide: the normalised score is
        # undefined, so use the same 0 fallback as the other failure paths.
        return 0
    return round((s_m - s_wst) / spread, 3)