-
Notifications
You must be signed in to change notification settings - Fork 0
/
spacy_rouge_en.py
88 lines (73 loc) · 2.95 KB
/
spacy_rouge_en.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from collections import OrderedDict
import nlp
import numpy as np
import spacy
import networkx as nx
from evaluate_rouge import make_sentences_into_summary, total_rouge, make_random_comparison
import matplotlib.pyplot as plt
from spacy.lang.en.stop_words import STOP_WORDS
from datasets import load_dataset
import tqdm
def make_spacy_gen_indexes_into_list(spacy):
    """Materialize an iterable of sentences into a list.

    The caller in this file passes ``doc.sents``, which cannot be indexed
    or measured with ``len()`` until it has been collected into a list.

    Args:
        spacy: Any iterable (in this file, the sentence spans of a parsed
            document). NOTE: the parameter name shadows the imported
            ``spacy`` module inside this function; it is kept unchanged so
            existing keyword callers keep working.

    Returns:
        A new list holding the items of the iterable in iteration order.
    """
    return list(spacy)
# Load the test split of CNN/DailyMail (config "3.0.0") and pull out the two
# columns this script reads: the article text and its reference highlights.
test_dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
articles, highlights = test_dataset["article"], test_dataset["highlights"]

# Metric object passed to total_rouge for the graph-ranked summaries.
rouge = nlp.load_metric('rouge')

# Parallel accumulators filled by the per-article loop: entry n of each list
# belongs to the n-th processed article.
list_of_highlights, list_of_summaries, list_of_random_summaries = [], [], []
from operator import itemgetter

# Load the spaCy pipeline once: it does not depend on the article, and
# reloading the large model on every iteration repeats identical work.
nlp_en = spacy.load('en_core_web_lg')

# An edge is kept only when its similarity exceeds this fraction of the
# strongest pairwise similarity found in the same article.
threshold = 0.6

# Flip to True to render each article's sentence graph while debugging.
# Drawing is skipped by default: without a plt.show() the figures were never
# displayed, yet every article kept adding artists to the same figure.
show_graph = False

for iteration, (article, highlight) in tqdm.tqdm(enumerate(zip(articles, highlights))):
    doc = nlp_en(article)
    list_of_sentences = make_spacy_gen_indexes_into_list(doc.sents)

    # Score every unordered sentence pair exactly once (i < k). Each
    # similarity is computed a single time and reused for both the running
    # maximum and the stored (i, k, similarity) record.
    similarity_and_tuples = []
    largest_sim = 0
    for i, sentence in enumerate(list_of_sentences):
        for k in range(i + 1, len(list_of_sentences)):
            sim = sentence.similarity(list_of_sentences[k])
            if sim > largest_sim:
                largest_sim = sim
            similarity_and_tuples.append((i, k, sim))

    # Build the sentence graph: nodes are sentence indexes, edges connect
    # pairs whose similarity clears the relative threshold.
    G = nx.Graph()
    for src_node, dest_node, weight in similarity_and_tuples:
        if threshold * largest_sim < weight:
            G.add_edge(src_node, dest_node, weight=weight)

    if show_graph:
        plt.clf()  # start from an empty figure for each article
        pos = nx.spring_layout(G)  # compute graph layout
        nx.draw(G, pos, node_size=700)  # draw nodes and edges
        nx.draw_networkx_labels(G, pos)  # draw node labels/names
        labels = nx.get_edge_attributes(G, 'weight')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)  # draw edge weights
        plt.show()

    # Rank sentences by PageRank score, highest first. The ranking is a list
    # of (sentence_index, score) tuples; it is empty when the graph has no
    # edges (e.g. a single-sentence article).
    # NOTE(review): confirm make_sentences_into_summary accepts an empty ranking.
    pr = nx.pagerank(G, max_iter=1000, alpha=0.9)
    list_of_result_tuples = sorted(pr.items(), key=itemgetter(1), reverse=True)

    # Put together sentences by order of importance, and record the random
    # comparison summary and the reference highlight for the same article.
    list_of_summaries.append(make_sentences_into_summary(list_of_result_tuples, list_of_sentences))
    list_of_random_summaries.append(make_random_comparison(list_of_sentences))
    list_of_highlights.append(highlight)

    # Stop after the article at index 200 has been processed (201 articles).
    if iteration == 200:
        break
# Report ROUGE for the PageRank-ordered summaries against the reference
# highlights, using the metric object loaded during setup.
print("actual rouge:")
total_rouge(list_of_summaries, list_of_highlights, rouge)
# Report ROUGE for the summaries produced by make_random_comparison, scored
# against the same highlights with a separately loaded metric object.
# NOTE(review): presumably a fresh metric is loaded so the two evaluations do
# not share accumulated state — confirm against total_rouge.
print("random rouge:")
rouge_rand = nlp.load_metric('rouge')
total_rouge(list_of_random_summaries, list_of_highlights, rouge_rand)