# evaluation.py
# Computes BLEU scores for predicted headlines against the true headlines
# and writes a summary of the best-scoring predictions.
import re
from itertools import islice

import nltk
import pandas as pd
# Punctuation characters that are split off into tokens of their own.
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")


def tokenizer(sentence):
    """Break a sentence into a flat list of word and punctuation tokens.

    The sentence is split on whitespace first; every resulting fragment is
    then split around the punctuation matched by ``_WORD_SPLIT``. Because the
    pattern is a capturing group, the punctuation marks themselves are kept
    as separate tokens. Empty strings produced by the split are discarded.
    """
    return [
        token
        for fragment in sentence.split()
        for token in _WORD_SPLIT.split(fragment)
        if token
    ]
def getBLEUscore(true_headline, predicted_headline, weights=(1, 0, 0, 0)):
    """Score every predicted headline against its true headline with BLEU.

    Args:
        true_headline: Sequence of reference headline strings.
        predicted_headline: Sequence of predicted headline strings; entry
            ``i`` is scored against ``true_headline[i]``.
        weights: Weights for unigrams, bigrams, trigrams and so on, passed
            through to NLTK's ``sentence_bleu``. The default scores
            unigrams only.

    Returns:
        A ``(scores, average)`` tuple: the list of per-headline BLEU scores
        (in input order) and their mean. Returns ``([], 0.0)`` when there
        are no predicted headlines.

    Raises:
        IndexError: If ``true_headline`` has fewer entries than
            ``predicted_headline``.
    """
    sentence_bleu = nltk.translate.bleu_score.sentence_bleu

    token_true_headline = [tokenizer(sentence) for sentence in true_headline]
    token_predicted_headline = [tokenizer(sentence) for sentence in predicted_headline]

    BLEUscore = []
    for count, hypothesis in enumerate(token_predicted_headline, start=1):
        reference = token_true_headline[count - 1]
        if hypothesis:
            score = sentence_bleu([reference], hypothesis, weights=weights)
        else:
            # An empty prediction has no n-grams in common with the
            # reference, so score it 0 directly instead of handing an empty
            # hypothesis to the BLEU computation (noted elsewhere in this
            # file as a cause of errors).
            score = 0.0
        BLEUscore.append(score)

        # Progress report for long runs.
        if count % 100 == 0:
            print("Calculating BLEU for sentence %d" % count)

    if not BLEUscore:
        # Nothing to average; avoid dividing by zero.
        return BLEUscore, 0.0

    avgBLEUscore = sum(BLEUscore) / len(BLEUscore)
    return BLEUscore, avgBLEUscore
def _read_lines(path, limit):
    """Return at most ``limit`` whitespace-stripped lines from the start of ``path``."""
    with open(path) as handle:
        return [line.strip() for line in islice(handle, limit)]


def main():
    """Evaluate predicted headlines with BLEU and save the best ones.

    Reads up to 400 lines from each of the article, true-headline and
    predicted-headline files, scores every prediction with
    ``getBLEUscore``, prints the average score, and writes the 100
    highest-scoring rows to ``BLEU.txt``. In that file each field is on its
    own line and records are separated by a dashed rule.
    """
    desired_width = 600
    pd.set_option('display.width', desired_width)

    # specify sentence/true headline/predicted headline path.
    sentence_path = './dataset/test_enc.txt'
    true_headline_path = "./dataset/test_dec.txt"
    predicted_headline_path = "./output/predicted_test_headline.txt"

    # specify number of lines to read.
    number_of_lines_read = 400

    print("reading actual headlines...")
    true_headline = _read_lines(true_headline_path, number_of_lines_read)

    print("reading predicted headlines...")
    predicted_headline = _read_lines(predicted_headline_path, number_of_lines_read)

    print("reading sentences...")
    sentence = _read_lines(sentence_path, number_of_lines_read)

    # A file may hold fewer lines than requested; keep only the rows that
    # are present in all three inputs so they stay aligned.
    line_count = min(len(true_headline), len(predicted_headline), len(sentence))
    if line_count < number_of_lines_read:
        print("only %d lines available in every input file" % line_count)
    true_headline = true_headline[:line_count]
    predicted_headline = predicted_headline[:line_count]
    sentence = sentence[:line_count]

    BLEUscore, avgBLEUscore = getBLEUscore(true_headline, predicted_headline)
    print("average BLEU score: %f" % avgBLEUscore)

    summary = list(zip(BLEUscore, predicted_headline, true_headline, sentence))
    df = pd.DataFrame(data=summary, columns=['BLEU score', 'Predicted headline', 'True headline', 'article'])
    df_sortBLEU = df.sort_values('BLEU score', ascending=False)

    # Store the top 100 predicted headline in terms of BLEU score
    output_file = 'BLEU.txt'
    top_rows = df_sortBLEU.head(100)
    record_separator = '\n-------------------------------------------------\n'
    try:
        top_rows.to_csv(output_file, sep='\n', index=False,
                        lineterminator=record_separator)
    except TypeError:
        # Older pandas releases only accept the keyword under its previous
        # name, `line_terminator`.
        top_rows.to_csv(output_file, sep='\n', index=False,
                        line_terminator=record_separator)
    print("Finished creating results summary in %s!" % output_file)
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()