import numpy as np
import preprocess as pp
import os
from random import randint
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv


def validate_model(embedding, emb_model_dir, emb_model_fn):
    print("Start validation. Loading model.\n")

    # load config
    config = embedding.config

    # load model
    embedding.load_model(emb_model_dir, emb_model_fn)

    # directories and filenames
    val_dir = config.config['val_data_dir']
    doesntfit_fn = config.config['doesntfit_file']
    doesntfit_src = os.path.join(val_dir, doesntfit_fn)
    synonyms_fn = config.config['synonyms_file']
    syn_file_src = os.path.join(val_dir, synonyms_fn)

    # test with doesn't-fit questions
    test_doesntfit(embedding, doesntfit_src)

    # test with synonyms
    # TODO get better syn file (slow, contains many non-significant instances)
    # test_synonyms(embedding, syn_file_src)

    # test with human similarity TODO remove hardcoding
    human_sim_file_src = 'data/validation_data/human_similarity.csv'
    test_human_similarity(embedding, human_sim_file_src)


#### Doesn't Fit Validation ####

def doesntfit(embedding, word_list):
    """
    - compares each word vector to the mean of all word vectors of word_list
      using the vector dot product
    - the vector with the lowest dot product with the mean vector is regarded
      as the one that doesn't fit
    """
    used_words = [word for word in word_list if embedding.may_construct_word_vec(word)]
    n_used_words = len(used_words)
    n_words = len(word_list)

    if n_used_words != n_words:
        ignored_words = set(word_list) - set(used_words)
        print("Vectors for words %s are not present in the model; ignoring these words." % ignored_words)
    if not used_words:
        print("Cannot select a word from an empty list.")
        return None

    vectors = np.vstack([embedding.word_vec(word) for word in used_words])
    mean = np.mean(vectors, axis=0)
    dists = np.dot(vectors, mean)
    return sorted(zip(dists, used_words))[0][1]
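

# The toy example below is not part of the validation flow; it is a minimal
# sketch of the same "lowest dot product with the mean" selection used by
# doesntfit() above, on hand-made 2D vectors instead of real embedding
# vectors, so the selection rule can be checked in isolation. The helper name
# and the vectors are made up for illustration.
def _doesntfit_toy_example():
    toy_vectors = {
        'Auto':     np.array([1.0, 0.1]),
        'Motorrad': np.array([0.9, 0.2]),
        'Fahrrad':  np.array([0.8, 0.0]),
        'Ampel':    np.array([0.1, 1.0]),
    }
    words = list(toy_vectors)
    vectors = np.vstack([toy_vectors[w] for w in words])
    mean = np.mean(vectors, axis=0)
    dists = np.dot(vectors, mean)
    # 'Ampel' has the lowest dot product with the mean vector and is returned
    return sorted(zip(dists, words))[0][1]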


def test_doesntfit(embedding, file_src):
    """
    - tests all doesn't-fit questions (lines) of the file
    - a doesn't-fit question has the format "word_1 word_2 ... word_N word_NotFitting"
      where word_1 to word_N are members of a category but word_NotFitting isn't,
      e.g. "Auto Motorrad Fahrrad Ampel"
    """
    # load config
    config = embedding.config
    print("Validating 'doesntfit' with file", file_src)

    with open(file_src) as f:
        num_lines = sum(1 for line in f)
    num_questions = 0
    num_right = 0
    tokenizer = pp.get_tokenizer(config)

    # get questions
    with open(file_src) as f:
        questions = f.read().splitlines()
    tk_questions = [tokenizer.tokenize(q) for q in questions]
    # TODO: check whether the tokenizer has split one word into multiple words and handle it.
    # So far no word in the doesnt_fit test file should be split.

    # vocab used to speed up checking whether a word is in the vocabulary
    # (also checked by embedding.may_construct_word_vec(word))
    vocab = embedding.get_vocab()

    # test each question
    for question in tk_questions:
        # check if all words exist in the vocabulary
        if all((word in vocab) or embedding.may_construct_word_vec(word) for word in question):
            num_questions += 1
            if doesntfit(embedding, question) == question[-1]:
                num_right += 1

    # calculate result
    correct_matches = np.round(num_right / float(num_questions) * 100, 1) if num_questions > 0 else 0.0
    coverage = np.round(num_questions / float(num_lines) * 100, 1) if num_lines > 0 else 0.0

    # log result
    print("\n*** Doesn't fit ***")
    print("Doesn't fit correct: {0}% ({1}/{2})".format(correct_matches, num_right, num_questions))
    print("Doesn't fit coverage: {0}% ({1}/{2})\n".format(coverage, num_questions, num_lines))
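

# Usage sketch for test_doesntfit, assuming an already loaded `embedding`
# object exposing the interface used above (config, get_vocab,
# may_construct_word_vec, word_vec). The path below is a hypothetical example;
# in validate_model() the real path is built from the config keys
# 'val_data_dir' and 'doesntfit_file'. Each line of the file is one question,
# with the non-fitting word in the last position, e.g. "Auto Motorrad Fahrrad Ampel".
#
#   test_doesntfit(embedding, 'data/validation_data/doesntfit_questions.txt')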


#### Synonyms Validation ####

def test_synonyms(embedding, file_src):
    """
    - tests all synonym questions (lines) of the file
    - a synonym question has the format "word_1 word_2"
      where word_1 and word_2 are synonyms, e.g. "Blutgerinnsel Thrombus"
    - for each pair the cosine similarity of the two word vectors is computed;
      the average over all pairs is compared to the average cosine similarity
      of randomly drawn word pairs from the vocabulary
    """
    print("Validating 'synonyms' with file", file_src)
    config = embedding.config

    with open(file_src) as f:
        num_lines = sum(1 for line in f)
    num_questions = 0
    cos_sim_sum_synonyms = 0
    tokenizer = pp.get_tokenizer(config)

    # get questions which still have length 2 after tokenization
    # TODO: improve for compound words (aaa-bbb) which are split by the tokenizer
    tk_questions = []
    with open(file_src, 'r') as f:
        questions = f.read().splitlines()
    for q in questions:
        # synonyms = q.split(';')  # tokenizer.tokenize(q)
        # synonyms = [" ".join(tokenizer.tokenize(synonym)) for synonym in synonyms]
        synonyms = tokenizer.tokenize(q)
        if len(synonyms) == 2:
            tk_questions.append(synonyms)

    vocab = embedding.get_vocab()

    # test each question
    for tk_quest in tk_questions:
        # check if all words exist in the vocabulary
        if all((word in vocab) or embedding.may_construct_word_vec(word) for word in tk_quest):
            num_questions += 1
            w1 = tk_quest[0]
            w2 = tk_quest[1]
            cos_sim_sum_synonyms += embedding.similarity(w1, w2)

    # compute the avg cosine similarity of random word pairs as a baseline
    # for the avg cosine similarity of the synonyms
    vocab_size = len(vocab)
    n_vals = 1000
    similarity_sum_rand_vec = 0
    vals1 = [randint(0, vocab_size - 1) for i in range(n_vals)]
    vals2 = [randint(0, vocab_size - 1) for i in range(n_vals)]
    for v1, v2 in zip(vals1, vals2):
        similarity_sum_rand_vec += embedding.similarity(vocab[v1], vocab[v2])
    avg_cosine_similarity_rand_vec = similarity_sum_rand_vec / float(n_vals)

    # calculate result
    avg_cosine_similarity_synonyms = (cos_sim_sum_synonyms / num_questions) if num_questions > 0 else 0.0
    coverage = np.round(num_questions / float(num_lines) * 100, 1) if num_lines > 0 else 0.0

    # log result
    print("\n*** Cosine-Similarity ***")
    print("Synonyms avg-cos-similarity (SACS):", avg_cosine_similarity_synonyms,
          "\nRandom avg-cos-similarity (RACS):", avg_cosine_similarity_rand_vec,
          "\nRatio SACS/RACS:", avg_cosine_similarity_synonyms / float(avg_cosine_similarity_rand_vec))
    print("\n*** Word Coverage ***")
    print("Synonyms: {0} pairs in input. {1} pairs after tokenization. "
          "{2} pairs could be constructed from the model vocabulary."
          .format(num_lines, len(tk_questions), num_questions))
    print("Synonyms coverage: {0}% ({1}/{2})\n".format(coverage, num_questions, num_lines))


def get_human_rating_deviation(embedding, word1, word2, human_similarity):
    # compute the deviation of the cosine similarity from the human similarity rating
    cosine_similarity = embedding.similarity(word1, word2)
    return np.abs(cosine_similarity - human_similarity)


def test_human_similarity(embedding, file_src):
    """
    Compare the cosine similarity of two word vectors against a similarity
    value based on human ratings.
    Each line in the file contains two words and the similarity value,
    separated by ':'.
    The datasets were obtained by asking human subjects to assign a similarity
    or relatedness judgment to a number of German word pairs.
    https://www.ukp.tu-darmstadt.de/data/semantic-relatedness/german-relatedness-datasets/
    """
    config = embedding.config
    tokenizer = pp.get_tokenizer(config)
    vocab = embedding.get_vocab()
    vocab_size = len(vocab)

    # accumulate error and count test instances
    summed_error = 0.0
    n_test_instances = 0
    n_skipped_instances = 0
    summed_random_error = 0.0

    # load file to lines
    with open(file_src, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=':')
        next(filereader)  # skip the header line

        # process line by line
        for line in filereader:
            n_test_instances += 1
            # split the line into an instance
            word1 = tokenizer.tokenize(line[0])[0]
            word2 = tokenizer.tokenize(line[1])[0]
            human_similarity = np.float32(line[2])

            # check if both words are in the vocabulary
            if word1 in vocab and word2 in vocab:
                # add the current deviation to the error
                deviation = get_human_rating_deviation(embedding, word1, word2,
                                                       human_similarity)
                summed_error += deviation

                # get a random error for comparison
                rand_word1 = vocab[randint(0, vocab_size - 1)]
                rand_word2 = vocab[randint(0, vocab_size - 1)]
                random_dev = get_human_rating_deviation(embedding, rand_word1,
                                                        rand_word2,
                                                        human_similarity)
                summed_random_error += random_dev
            else:
                n_skipped_instances += 1

    # print results
    print("\n*** Human-Similarity ***")
    print("Number of instances: {0}, skipped: {1}"
          .format(n_test_instances, n_skipped_instances))

    # check whether we found any valid test instance
    n_processed_instances = n_test_instances - n_skipped_instances
    if n_processed_instances == 0:
        print("Error: No instance could be computed with this model.")
    else:
        mean_error = summed_error / n_processed_instances
        random_error = summed_random_error / n_processed_instances
        print("random error: {0}, mean error: {1}"
              .format(random_error, mean_error))
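

# Usage sketch for test_human_similarity, assuming the ':'-separated layout
# described in the docstring (the first line is skipped as a header, then one
# "word1:word2:score" row per pair). The example rows below are made up; the
# file path matches the hard-coded default used in validate_model().
#
#   word1:word2:similarity
#   Arzt:Doktor:0.9
#   Auto:Banane:0.1
#
#   test_human_similarity(embedding, 'data/validation_data/human_similarity.csv')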


#### Visualization ####

def visualize_words(embedding, word_list, n_nearest_neighbours):
    # collect the words (and their neighbours) that should be visualized
    words_to_visualize = []
    # word_indexes_to_visualize = []

    # get all words and neighbours that you want to visualize
    for word in word_list:
        if not embedding.may_construct_word_vec(word):
            continue
        words_to_visualize.append(word)
        # word_indexes_to_visualize.append(model.ix(word))

        # get neighbours of the word
        neighbours = [n for (n, m) in embedding.most_similar_n(word, n_nearest_neighbours)]
        words_to_visualize.extend(neighbours)
        # word_indexes_to_visualize.extend(indexes)

    # get the vectors to visualize
    if not words_to_visualize:
        print("No word found to show.")
        return
    emb_vectors = np.vstack([embedding.word_vec(word) for word in words_to_visualize])

    # project down to 2D
    pca = PCA(n_components=2)
    emb_vec_2D = pca.fit_transform(emb_vectors)

    # each visualized input word contributes itself plus its n nearest neighbours
    n_groups = len(words_to_visualize) // (n_nearest_neighbours + 1)
    for i in range(n_groups):
        # group a word and its neighbours together (results in a different colour per group in the plot)
        lower = i * (n_nearest_neighbours + 1)
        upper = (i + 1) * (n_nearest_neighbours + 1)
        # plot 2D
        plt.scatter(emb_vec_2D[lower:upper, 0], emb_vec_2D[lower:upper, 1])

    for label, x, y in zip(words_to_visualize, emb_vec_2D[:, 0], emb_vec_2D[:, 1]):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # find nice axes for the plot
    lower_x = min(emb_vec_2D[:, 0])
    upper_x = max(emb_vec_2D[:, 0])
    lower_y = min(emb_vec_2D[:, 1])
    upper_y = max(emb_vec_2D[:, 1])

    # 10% padding on all sides
    pad_x = 0.1 * abs(upper_x - lower_x)
    pad_y = 0.1 * abs(upper_y - lower_y)
    plt.xlim([lower_x - pad_x, upper_x + pad_x])
    plt.ylim([lower_y - pad_y, upper_y + pad_y])

    plt.show()
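

# Minimal driver sketch. How the embedding object is constructed lives outside
# this module, so the import, the class name `Embedding`, and the argument
# values below are placeholders rather than the project's actual API; only
# validate_model() and visualize_words() are taken from this file.
#
#   if __name__ == "__main__":
#       from embedding import Embedding                        # hypothetical
#       embedding = Embedding('configuration.json')             # hypothetical
#       validate_model(embedding, 'models', 'word2vec.model')   # example paths
#       visualize_words(embedding, ['Herz', 'Lunge', 'Leber'],
#                       n_nearest_neighbours=5)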