-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_emb.py
72 lines (52 loc) · 2.23 KB
/
get_emb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np
import sys
def read_emb(emb_path):
    """Load a text embedding file into a word -> vector dictionary.

    The first line is a header whose second space-separated token is the
    vector dimensionality. Every following line holds a word, a single
    space, and the space-separated vector components.

    Words are lower-cased before being stored; when several lines map to
    the same lower-cased word, only the first one is kept. Reading stops at
    the first line whose stripped length is at most one character (a blank
    line, for instance) or at end of file. Bytes that are not valid UTF-8
    are replaced rather than raising.

    Args:
        emb_path: Path of the embedding text file.

    Returns:
        A ``(word_vec_dict, dim)`` tuple: the dictionary of lower-cased
        words to 1-D float arrays, and the dimensionality read from the
        header.

    Raises:
        IndexError, ValueError: If the header or a data line does not have
            the expected layout.
    """
    word_vec_dict = {}
    # The context manager guarantees the handle is released even when the
    # header or a row fails to parse.
    with open(emb_path, mode='r', encoding='utf-8', errors="replace") as file:
        dim = int(file.readline().split(' ')[1])
        for raw_line in file:
            line = raw_line.strip()
            if len(line) <= 1:
                # Same stop condition as a sentinel-driven readline loop:
                # the first (near-)empty line ends the data section.
                break
            word, vec = line.split(' ', 1)
            key = word.lower()
            if key not in word_vec_dict:
                word_vec_dict[key] = np.fromstring(vec, sep=' ', dtype=float)
    return word_vec_dict, dim
def read_m_analogy_set(file_path):
    """Read a file of whitespace-separated word pairs into a flat list.

    Each line must contain exactly two whitespace-separated tokens. Both
    are lower-cased and appended in order, so the result has the form
    ``[left_1, right_1, left_2, right_2, ...]``. Reading stops at the first
    line whose stripped length is at most one character or at end of file.

    The file is decoded as UTF-8 so that the words compare equal to the
    keys produced by ``read_emb``, which also decodes as UTF-8.

    Args:
        file_path: Path of the word-pair file.

    Returns:
        The flat list of lower-cased words.

    Raises:
        ValueError: If a line does not split into exactly two tokens.
    """
    words = []
    with open(file_path, 'r', encoding='utf-8') as file_in:
        for raw_line in file_in:
            line = raw_line.strip()
            if len(line) <= 1:
                break
            left, right = line.split()
            # extend() grows the list in place; re-concatenating the whole
            # list for every pair would be quadratic in the file length.
            words.extend((left.lower(), right.lower()))
    return words
def filter_x_embs(X_word_vec_dict, Y_word_vec_dict, X_analogy_words, Y_analogy_words, dim):
    """Build aligned embedding matrices for the word pairs found in both vocabularies.

    The two word lists are walked in lockstep. A pair contributes one row
    to each output matrix only when the X word is a key of
    ``X_word_vec_dict`` and the Y word is a key of ``Y_word_vec_dict``;
    other pairs are skipped, so row ``k`` of both matrices always comes
    from the same pair.

    Args:
        X_word_vec_dict: Mapping from word to vector for the X side.
        Y_word_vec_dict: Mapping from word to vector for the Y side.
        X_analogy_words: Sequence of X-side words.
        Y_analogy_words: Sequence of Y-side words, same length as
            ``X_analogy_words``.
        dim: Number of columns of both output matrices.

    Returns:
        A ``(X_mat, Y_mat)`` tuple of float arrays with ``dim`` columns and
        one row per retained pair (zero rows when nothing matches).

    Raises:
        AssertionError: If the two word lists differ in length.
        ValueError: If a retained vector does not have ``dim`` components.
    """
    assert len(X_analogy_words) == len(Y_analogy_words)

    X_rows = []
    Y_rows = []
    for x_word, y_word in zip(X_analogy_words, Y_analogy_words):
        if x_word in X_word_vec_dict and y_word in Y_word_vec_dict:
            X_rows.append(X_word_vec_dict[x_word])
            Y_rows.append(Y_word_vec_dict[y_word])

    # Stack once at the end instead of once per pair (each vstack copies the
    # whole matrix). The empty (0, dim) seed keeps the result two-dimensional
    # with `dim` columns even when no pair was retained.
    X_mat = np.vstack([np.empty((0, dim), dtype=float), *X_rows])
    Y_mat = np.vstack([np.empty((0, dim), dtype=float), *Y_rows])
    return X_mat, Y_mat
def preprocess_emb(emb_mat):
    """Mean-centre an embedding matrix and rescale it by one global factor.

    The column-wise mean (taken over axis 0) is subtracted from every row.
    The centred matrix is then divided by a single scalar: the square root
    of the sum of all squared centred entries, divided by the number of
    rows.

    NOTE(review): earlier comments on this function described the scale as
    a "row-wise L2 norm" that brings rows to unit norm. The computation
    uses one scalar for the whole matrix, so individual rows are not
    normalised to length one -- confirm which behaviour is intended.

    Args:
        emb_mat: 2-D numeric array with one embedding per row.

    Returns:
        The centred and rescaled array, same shape as ``emb_mat``.
    """
    # Remove the per-column mean.
    centred = emb_mat - np.mean(emb_mat, axis=0)
    # One scalar for the whole matrix: sqrt(total squared mass) / row count.
    total_sq = np.sum(np.square(centred))
    scale = np.sqrt(total_sq) / len(emb_mat)
    return centred / scale
def main(input_list):
    """Run the full pipeline: load, align, preprocess and save two embedding sets.

    Args:
        input_list: Six paths, in this order: embedding file A, embedding
            file B, word-pair file A, word-pair file B, output ``.npy``
            path for matrix A, output ``.npy`` path for matrix B.

    The word-pair files are read first, then the embedding files. Rows are
    kept only for positions where both words have a vector, and each
    resulting matrix is passed through ``preprocess_emb`` before being
    written with ``np.save``. The dimensionality read from embedding file
    A is used for both matrices; the one read from file B is not used.
    """
    (
        emb_path_a,
        emb_path_b,
        pairs_path_a,
        pairs_path_b,
        out_path_a,
        out_path_b,
    ) = input_list

    pair_words_a = read_m_analogy_set(pairs_path_a)
    pair_words_b = read_m_analogy_set(pairs_path_b)

    vectors_a, dim_a = read_emb(emb_path_a)
    vectors_b, _dim_b = read_emb(emb_path_b)

    mat_a, mat_b = filter_x_embs(vectors_a, vectors_b, pair_words_a, pair_words_b, dim_a)

    # Preprocess and write A first, then B.
    for out_path, matrix in ((out_path_a, mat_a), (out_path_b, mat_b)):
        np.save(out_path, preprocess_emb(matrix))
# Run the pipeline only when executed as a script, so that importing this
# module to reuse its helper functions does not trigger file I/O or consume
# the importing program's command-line arguments.
if __name__ == "__main__":
    main(sys.argv[1:])