evaluate.py
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
'''Evaluates PIE sense disambiguation systems' output'''
from collections import Counter
def evaluate(data, use_test_data, top_n):
    '''
    Evaluates PSD predictions and prints micro- and macro-averaged accuracy,
    plus precision, recall, and F1 on the idiomatic sense. Per-type scores are
    reported for the top_n most frequent PIE types only.
    '''
    # Split data and filter out PIEs without binary labels
    evaluation_data = [PIE for PIE in data if (use_test_data and PIE.split == 'test') or (not use_test_data and PIE.split == 'dev')]
    evaluation_data = [PIE for PIE in evaluation_data if PIE.binary_label]
    # Get types for per-type scoring
    PIE_types = set([PIE.pie_type for PIE in evaluation_data])
    PIE_type_counts = Counter(PIE.pie_type for PIE in evaluation_data)
    # Assign true/false positives/negatives
    for PIE in evaluation_data:
        if PIE.binary_label == PIE.predicted_label:
            if PIE.binary_label == 'i':
                PIE.classification = 'tp'
            if PIE.binary_label == 'l':
                PIE.classification = 'tn'
        else:
            if PIE.binary_label == 'i':
                PIE.classification = 'fn'
            if PIE.binary_label == 'l':
                PIE.classification = 'fp'
    # Count true/false positives/negatives
    total_true = float(sum([PIE.classification in ['tp', 'tn'] for PIE in evaluation_data]))
    total_tp = float(sum([PIE.classification == 'tp' for PIE in evaluation_data]))
    total_fp = float(sum([PIE.classification == 'fp' for PIE in evaluation_data]))
    total_fn = float(sum([PIE.classification == 'fn' for PIE in evaluation_data]))
    # Get overall micro-accuracy and micro-F1
    micro_accuracy = total_true/float(len(evaluation_data))*100.
    micro_precision = total_tp/(total_tp+total_fp)*100.
    micro_recall = total_tp/(total_tp+total_fn)*100.
    micro_f1 = 2. * (micro_precision * micro_recall) / (micro_precision + micro_recall)
    # Get per-type scores
    # NB: precision and recall are defined as 100% when the denominator is 0
    scores_per_type = {PIE_type: {'accuracy': 0., 'precision': 0., 'recall': 0., 'f1': 0.} for PIE_type in PIE_types}
    for PIE_type in PIE_types:
        total_true = float(sum([PIE.classification in ['tp', 'tn'] for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        scores_per_type[PIE_type]['accuracy'] = total_true/float(PIE_type_counts[PIE_type])*100.
        total_tp = float(sum([PIE.classification == 'tp' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        total_fp = float(sum([PIE.classification == 'fp' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        total_fn = float(sum([PIE.classification == 'fn' for PIE in evaluation_data if PIE.pie_type == PIE_type]))
        try:
            scores_per_type[PIE_type]['precision'] = total_tp/(total_tp+total_fp)*100.
        except ZeroDivisionError:
            scores_per_type[PIE_type]['precision'] = 100.
        try:
            scores_per_type[PIE_type]['recall'] = total_tp/(total_tp+total_fn)*100.
        except ZeroDivisionError:
            scores_per_type[PIE_type]['recall'] = 100.
        try:
            scores_per_type[PIE_type]['f1'] = 2. * (scores_per_type[PIE_type]['precision'] * scores_per_type[PIE_type]['recall']) / (scores_per_type[PIE_type]['precision'] + scores_per_type[PIE_type]['recall'])
        except ZeroDivisionError:
            scores_per_type[PIE_type]['f1'] = 0.
    # Get macro-averages
    # NB: the macro-averaged F1 is not the harmonic mean of the macro-averaged P and R
    macro_accuracy = sum(scores_per_type[PIE_type]['accuracy'] for PIE_type in scores_per_type)/len(PIE_types)
    macro_precision = sum(scores_per_type[PIE_type]['precision'] for PIE_type in scores_per_type)/len(PIE_types)
    macro_recall = sum(scores_per_type[PIE_type]['recall'] for PIE_type in scores_per_type)/len(PIE_types)
    macro_f1 = sum(scores_per_type[PIE_type]['f1'] for PIE_type in scores_per_type)/len(PIE_types)
    # Print output to screen
    print '### EVALUATION SCORES ###'
    print 'Macro-Accuracy: {0:.2f}\nMicro-Accuracy: {1:.2f}'.format(macro_accuracy, micro_accuracy)
    print '-'*25
    print 'Macro-Precision: {0:.2f}\nMacro-Recall: {1:.2f}\nMacro-F1: {2:.2f}'.format(macro_precision, macro_recall, macro_f1)
    print 'Micro-Precision: {0:.2f}\nMicro-Recall: {1:.2f}\nMicro-F1: {2:.2f}'.format(micro_precision, micro_recall, micro_f1)
    # Print a tab-separated line for copying into a results file
    print '\n{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}'.format(macro_accuracy, micro_accuracy, macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1)
    print '\n### SCORES PER TYPE ###'
    print '{0}\tFreq.\tPrec.\tRecall\tF1\tAccuracy'.format('PIE Type' + 15*' ')
    print '-'*65
    top_n_types = PIE_type_counts.most_common(top_n)
    for top_type in top_n_types:
        scores = scores_per_type[top_type[0]]
        type_with_padding = top_type[0][:23] + (23-len(top_type[0][:23]))*' '
        print '{0}\t{1:d}\t{2:.2f}\t{3:.2f}\t{4:.2f}\t{5:.2f}'.format(type_with_padding, top_type[1], scores['precision'], scores['recall'], scores['f1'], scores['accuracy'])
    return data
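

# --- Usage sketch (not part of the original evaluation code) -----------------
# A minimal, hypothetical example of how evaluate() might be called. It only
# assumes that each item in `data` exposes the attributes the function reads:
# split ('dev' or 'test'), binary_label ('i' = idiomatic, 'l' = literal),
# predicted_label, and pie_type. The PIECandidate class and the example values
# below are illustrative stand-ins, not taken from the original corpus code.

class PIECandidate(object):
    '''Stand-in container with the attributes evaluate() expects.'''
    def __init__(self, pie_type, split, binary_label, predicted_label):
        self.pie_type = pie_type
        self.split = split
        self.binary_label = binary_label
        self.predicted_label = predicted_label
        self.classification = None  # assigned by evaluate()

if __name__ == '__main__':
    example_data = [
        PIECandidate('spill the beans', 'dev', 'i', 'i'),
        PIECandidate('spill the beans', 'dev', 'l', 'i'),
        PIECandidate('kick the bucket', 'dev', 'i', 'i'),
        PIECandidate('kick the bucket', 'dev', 'l', 'l'),
    ]
    # use_test_data=False scores the 'dev' split; top_n limits the per-type
    # table to the n most frequent PIE types.
    evaluate(example_data, use_test_data=False, top_n=2)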