"""
embedding method: word2vec by Google
wrappper: https://github.com/danielfrg/word2vec (pip install wor2vec)
"""
import json
import multiprocessing
import os
from subprocess import PIPE, Popen

import numpy as np
import word2vec as w2v

from embedding_base import EmbeddingAbstractBase


class EmbeddingWord2vec(EmbeddingAbstractBase):

    def __init__(self, config):
        self._model = None
        self.config = config

    def word_vec(self, word):
        if not self._model:
            raise ReferenceError("Model not defined. Train or load a model.")
        return np.asarray(self._model.get_vector(word), dtype=np.float64)

    def get_vocab(self):
        if not self._model:
            raise ReferenceError("Model not defined. Train or load a model.")
        return self._model.vocab.tolist()

    def most_similar_n(self, word, topn=10):
        if not self._model:
            raise ReferenceError("Model not defined. Train or load a model.")
        indexes, metrics = self._model.cosine(word, topn)
        return self._model.generate_response(indexes, metrics).tolist()

    def analogy(self, positives, negatives, topn):
        if not self._model:
            raise ReferenceError("Model not defined. Train or load a model.")
        indexes, metrics = self._model.analogy(positives, negatives, topn)
        return self._model.generate_response(indexes, metrics).tolist()

    def similarity(self, word1, word2):
        if not self._model:
            raise ReferenceError("Model not defined. Train or load a model.")
        # cosine similarity: dot product of the L2-normalized word vectors
        v1 = self._model[word1] / np.linalg.norm(self._model[word1], 2)
        v2 = self._model[word2] / np.linalg.norm(self._model[word2], 2)
        return np.dot(v1, v2)

    def vec_dim(self):
        if not self._model:
            raise ReferenceError("Model not defined. Train or load a model.")
        return self._model.vectors.shape[1]

    def may_construct_word_vec(self, word):
        return word in self.get_vocab()

    def train_model(self, train_data_src, emb_model_dir, emb_model_fn):
        """
        Train a word2vec model with the following parameters:
        ***********************************************
        train <file_path>
            Use text data from <file_path> to train the model
        output <file_path>
            Use <file_path> to save the resulting word vectors / word clusters
        size <int>
            Set size of word vectors; default is 100
        window <int>
            Set max skip length between words; default is 5
        sample <float>
            Set threshold for occurrence of words. Those that appear with
            higher frequency in the training data will be randomly
            down-sampled; default is 0 (off), useful value is 1e-5
        hs <int>
            Use hierarchical softmax; default is 1 (0 = not used)
        negative <int>
            Number of negative examples; default is 0, common values are 5 - 10
            (0 = not used)
        threads <int>
            Use <int> threads; default is 1
        iter_ <int>
            Number of iterations (epochs) over the corpus; default is 5
        min_count <int>
            This will discard words that appear less than <int> times; default
            is 5
        alpha <float>
            Set the starting learning rate; default is 0.025
        debug <int>
            Set the debug mode; default is 2 (more info during training)
        binary <int>
            Save the resulting vectors in binary mode; default is 0 (off)
        cbow <int>
            Use the continuous bag of words model; default is 1
            (use 0 for the skip-gram model)
        save_vocab <file>
            The vocabulary will be saved to <file>
        read_vocab <file>
            The vocabulary will be read from <file>, not constructed from the
            training data
        verbose
            Print output from training
        """
        algorithm = self.config.config['embedding_algorithm']  # skipgram or cbow
        print("Embedding Method: word2vec, Algorithm:", algorithm)

        ### embedding parameters
        # training algorithm: the word2vec -cbow flag is 1 for CBOW
        # and 0 for skip-gram
        if algorithm == "skipgram":
            cbow_flag = 0
        elif algorithm == "cbow":
            cbow_flag = 1
        else:
            raise AttributeError("train algorithm must be 'skipgram' or 'cbow'")
        # embedding vector dimension
        emb_dim = self.config.config['embedding_vector_dim']
        # minimum number of times a token has to appear to be included in the model
        min_count = self.config.config['min_token_appearance']
        # number of cores
        n_cores = multiprocessing.cpu_count()
        # path of the embedding model file
        emb_model_src = os.path.join(emb_model_dir, emb_model_fn)
        # down-sampling threshold for high-occurrence words
        sample_freq = 1e-5
        # TODO: probably add negative sampling
print("Start training the model.")
command = ["word2vec", "-train", train_data_src, "-output", emb_model_src,
"-binary", "1", "-cbow", str(alg), "-size", str(emb_dim), "-sample", str(sample_freq),
"-min-count", str(min_count), "-threads", str(n_cores)]
        # open a pipe to the training subprocess
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
        result_list = []
        # echo the subprocess output while it is running
        while proc.poll() is None:
            # read one character at a time until the end of the current line
            while proc.poll() is None:
                i = proc.stdout.read(1).decode('ascii')
                result_list.append(i)
                if i == "\n" or i == "\r":
                    break
            if result_list:
                print("".join(result_list), end="")
                result_list = []
        # save the configuration used for this model next to it
        filename, ext = os.path.splitext(emb_model_fn)
        config_fn = filename + '_configuration.json'
        config_src = os.path.join(emb_model_dir, config_fn)
        with open(config_src, 'w') as f:
            json.dump(self.config.config, f, indent=4)
        # load the newly generated model
        self.load_model(emb_model_dir, emb_model_fn)
        print("Training finished.\nModel saved at:", emb_model_src)

    def load_model(self, emb_model_dir, emb_model_fn):
        # load a previously trained model from disk
        emb_model_src = os.path.join(emb_model_dir, emb_model_fn)
        self._model = w2v.load(emb_model_src)
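

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# The _DummyConfig stub and the file paths below are assumptions; in the
# project, `config` is expected to expose a `.config` dict with the keys
# read in train_model above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _DummyConfig:
        """Hypothetical stand-in for the project's configuration object."""
        def __init__(self):
            self.config = {
                'embedding_algorithm': 'skipgram',  # or 'cbow'
                'embedding_vector_dim': 100,
                'min_token_appearance': 5,
            }

    embedding = EmbeddingWord2vec(_DummyConfig())
    # train on a plain-text corpus and save the binary model (assumed paths)
    embedding.train_model('corpus.txt', 'models', 'word2vec_model.bin')
    print("vector dimension:", embedding.vec_dim())
    print("neighbours of 'king':", embedding.most_similar_n('king', topn=5))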