import os
from operator import itemgetter

import fasttext
import fasttext.util
import matplotlib.pyplot as plt
import networkx as nx
import nlp
import numpy as np
import spacy
import tqdm
from sklearn.metrics.pairwise import cosine_similarity

from evaluate_rouge import make_sentences_into_summary, total_rouge, make_random_comparison
# Download the pretrained Norwegian fastText vectors if not already present
# (download_model fetches the full 300-dimensional cc.no.300.bin).
fasttext.util.download_model('no', if_exists='ignore')  # Norwegian
vector = fasttext.load_model("cc.no.100.bin")
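# "cc.no.100.bin" is not produced by download_model itself; the assumption here
# is that a dimension-reduced copy was prepared once beforehand, e.g.:
#   ft = fasttext.load_model('cc.no.300.bin')
#   fasttext.util.reduce_model(ft, 100)
#   ft.save_model('cc.no.100.bin')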
nlp_nb = spacy.load('nb_core_news_lg')  # Norwegian Bokmål pipeline, used for sentence splitting
rouge = nlp.load_metric('rouge')        # ROUGE metric from the HuggingFace `nlp` library
path = "FILE_PATH"  # directory of input files, each "<reference summary>===<article body>"

text_dict = {}                 # document id -> article body
predone_summary_dict = {}      # document id -> reference summary (highlight)
list_of_highlights = []        # reference summaries, aligned with the lists below
list_of_summaries = []         # graph-based extractive summaries produced below
list_of_random_summaries = []  # random-sentence baseline summaries


def make_spacy_gen_indexes_into_list(sents):
    """Materialise a spaCy sentence generator into a list."""
    return list(sents)


def get_mean_sentence(vectors, text):
    """Return the mean fastText vector of the in-vocabulary words, or None."""
    accum = []
    for word in text.lower().split():
        if vectors.get_word_id(word) != -1:  # skip out-of-vocabulary tokens
            accum.append(vectors.get_word_vector(word))
    if accum:
        return np.array(accum).mean(axis=0)
    return None  # no known words in this sentence
# Each input file holds "<reference summary>===<article body>".
for k, fname in enumerate(os.listdir(path)):
    filename = os.path.join(path, fname)
    with open(filename, "r") as myfile:
        text = myfile.read()
    new_text = text.split("===")
    predone_summary_dict[k] = new_text[0]
    text_dict[k] = new_text[1]
for key, v in tqdm.tqdm(text_dict.items()):
    doc = nlp_nb(v)
    # Encode every sentence, keeping the sentence objects aligned with their
    # vectors so graph node indices always map back to the right sentence.
    sentences = make_spacy_gen_indexes_into_list(doc.sents)
    list_of_sents = []
    kept_sentences = []
    for sent in tqdm.tqdm(sentences):
        enc = get_mean_sentence(vector, sent.text)
        if enc is None:  # sentence had no encodable words
            continue
        list_of_sents.append(enc)
        kept_sentences.append(sent)
    enc_sents = np.vstack(list_of_sents)
    # Pairwise cosine similarity between sentence vectors; zero the diagonal so
    # a sentence is never its own most similar neighbour.
    dist = cosine_similarity(enc_sents)
    np.fill_diagonal(dist, 0)
    # Collect each unordered sentence pair once, tracking the largest similarity.
    similarity_and_tuples = []
    largest_sim = 0.0
    n = len(dist)
    for i in range(n):
        for j in range(i + 1, n):
            if dist[i][j] > largest_sim:
                largest_sim = dist[i][j]
            similarity_and_tuples.append((i, j, dist[i][j]))
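    # Equivalent vectorised sketch over the upper triangle of `dist`:
    #   iu = np.triu_indices(n, k=1)
    #   largest_sim = float(dist[iu].max())
    #   similarity_and_tuples = list(zip(iu[0], iu[1], dist[iu]))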
    # Build the similarity graph: nodes are sentence indices, edges connect
    # pairs whose similarity exceeds a fraction of the largest observed one.
    G = nx.Graph()
    threshold = 0.6
    for src_node, dest_node, weight in similarity_and_tuples:
        if weight > threshold * largest_sim:
            G.add_edge(src_node, dest_node, weight=weight)
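    # Alternative sketch (assumes networkx >= 2.0): build the graph straight from
    # the thresholded matrix. Note this also creates isolated nodes for sentences
    # with no qualifying edge, which PageRank would then include in the ranking:
    #   mask = dist > threshold * largest_sim
    #   G = nx.from_numpy_array(np.where(mask, dist, 0.0))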
    # Optionally visualise the similarity graph.
    pos = nx.spring_layout(G)  # compute graph layout
    nx.draw(G, pos, node_size=700)  # draw nodes and edges
    nx.draw_networkx_labels(G, pos)  # draw node labels/names
    labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)  # draw edge weights
    # plt.show()  # show image
    # Rank sentences with PageRank over the similarity graph (TextRank-style);
    # nx.pagerank uses the 'weight' edge attribute by default.
    pr = nx.pagerank(G, max_iter=1000, alpha=0.9)
    # Sentence indices sorted by descending PageRank score.
    list_of_result_tuples = sorted(pr.items(), key=itemgetter(1), reverse=True)
    # Graph node indices refer to positions in kept_sentences, so use that list
    # (not a fresh doc.sents) to keep sentence lookups aligned with the graph.
    list_of_sentences = kept_sentences
    # Put together sentences by order of importance.
    list_of_summaries.append(make_sentences_into_summary(list_of_result_tuples, list_of_sentences))
    list_of_random_summaries.append(make_random_comparison(list_of_sentences))
    list_of_highlights.append(predone_summary_dict[key])
print("actual rouge:")
total_rouge(list_of_summaries, list_of_highlights, rouge)
print("random rouge")
rouge_rand = nlp.load_metric('rouge')
total_rouge(list_of_random_summaries, list_of_highlights, rouge_rand)
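# For reference, the underlying HuggingFace metric is typically driven like this
# (an assumption about what evaluate_rouge.total_rouge does internally):
#   rouge.add_batch(predictions=list_of_summaries, references=list_of_highlights)
#   scores = rouge.compute()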
print("DONE")