Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add 2D visualization using TSNE #61

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions code/models/LoadModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""
auther Mohamad M. Jafari

This file contains a CLASS for loading and useing embedding models!

Essential materials for dealing with [word2vec, gensim] models is implemented!

There are some extra facilities to perform bunch of other operation on models,
like reordering, changeing format and etc.

"""



import numpy as np
import pickle
import gensim
from numpy import dot
from numpy.linalg import norm
from random import choice
from string import ascii_lowercase

class W2V():
    """In-memory word-embedding model.

    Holds a vocabulary list plus a matrix of row vectors (row i is the
    vector of vocabulary[i]) and provides lookup, deletion, normalization,
    nearest-neighbour search, and several load/save formats.
    """

    def __init__(self, vocabulary, vectors):
        self.vocabulary = vocabulary        # list of vocab words
        self.vectors = np.asarray(vectors)  # numpy array, each row contains one word vector
                                            # (corresponding to vocab list)

    def __getitem__(self, index):
        """Return the vector stored at row `index`."""
        return self.vectors[index, :]

    def __contains__(self, word):
        """True if `word` is in the vocabulary."""
        return word in self.vocabulary

    def __delitem__(self, word):
        """Remove `word` and its vector from the model."""
        index = self.vocabulary.index(word)
        del self.vocabulary[index]
        self.vectors = np.delete(self.vectors, index, 0)

    def __len__(self):
        return len(self.vocabulary)

    def __iter__(self):
        """Yield (word, vector) pairs in vocabulary order."""
        # BUG FIX: original referenced the bare name `vocabulary` (NameError)
        # and paid an O(n) .index() per word; enumerate fixes both.
        for index, word in enumerate(self.vocabulary):
            yield word, self.vectors[index]

    def get_vector(self, word):
        """Return the vector of `word`; re-raise ValueError if unknown."""
        try:
            index = self.vocabulary.index(word)
        except ValueError:  # narrow except instead of a bare one
            print("word not found!")
            raise
        return self.vectors[index]

    @property
    def words(self):
        """The vocabulary list."""
        return self.vocabulary

    @property
    def shape(self):
        """(vocab_size, embedding_dim) of the vector matrix."""
        return self.vectors.shape

    def normalize_words(self, ord=2, inplace=False):
        """Scale every word vector to unit l-`ord` norm.

        Returns self when `inplace` is True, otherwise a new W2V that
        shares the vocabulary list.
        """
        if ord == 2:
            ord = None  # numpy uses this flag to indicate l2.
        vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1)
        if inplace:
            self.vectors = vectors.T
            return self
        return W2V(vectors=vectors.T, vocabulary=self.vocabulary)

    def nearest_neighbors(self, word, k=1):
        """Return the k largest cosine similarities between `word` (a word
        string or a row index) and the other words, best first.

        NOTE(review): this returns similarity scores, not the neighbouring
        words themselves.
        """
        if isinstance(word, str):
            assert word in self, "invalid word!"
            v = self.vocabulary.index(word)
            # BUG FIX: dropped stray debug print(v).
        else:
            v = word
        dist = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
        vectors = self.vectors
        distances = [dist(vectors[v, :], vectors[x, :]) for x in range(0, len(vectors))]
        # Slot 0 of the sorted list is the word compared with itself (1.0).
        return sorted(distances, reverse=True)[1:1 + k]

    @staticmethod
    def from_text(fname, encoding=False):
        """Load a GloVe/word2vec-style text file (one `word v1 v2 ...` per
        line). `encoding=True` opens the file as UTF-8.
        """
        words = []
        vectors = []
        # Single code path replaces two duplicated open/parse branches.
        with open(fname, 'r', encoding="utf-8" if encoding else None) as fin:
            for line in fin:
                line = line.split(" ")
                word, vector = line[0], [float(x) for x in line[1:]]
                words.append(word)
                vectors.append(vector)
        return W2V(vocabulary=words, vectors=vectors)

    @staticmethod
    def from_bin(fname):
        """Load a binary word2vec file via gensim (pre-4.0 API)."""
        model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
        # BUG FIX: original passed np.array(model.vocab) -- the vocab dict --
        # as the vectors; the embedding matrix lives in model.vectors.
        return W2V(vocabulary=list(model.vocab), vectors=model.vectors)

    @staticmethod
    def to_word2vec(w, fname, binary=False):
        """Write model `w` to `fname` in word2vec text or binary format."""
        with open(fname, 'wb') as fout:
            header = "%s %s\n" % w.vectors.shape
            fout.write(header.encode("utf-8"))
            # BUG FIX: w.vocabulary is a plain list; the original
            # w.vocabulary.words raised AttributeError.
            for word, vector in zip(w.vocabulary, w.vectors):
                if binary:
                    # tobytes() replaces the deprecated tostring().
                    line = word.encode("utf-8") + b" " + vector.astype("float32").tobytes()
                    fout.write(line)
                else:
                    line = "%s %s\n" % (word, ' '.join("%.15f" % val for val in vector))
                    fout.write(line.encode("utf-8"))

    @staticmethod
    def from_W2V(fname):
        """Load a model previously written by save() (a pickled dict)."""
        # NOTE(review): pickle.load must only be fed trusted files.
        with open(fname, 'rb') as fin:
            wtov = pickle.load(fin)
        vec, voc = wtov["vectors"], wtov["vocabulary"]
        return W2V(vocabulary=voc, vectors=vec)

    def save(self, fname):
        """Pickle the model as {"vectors": ..., "vocabulary": ...}."""
        model = {"vectors": self.vectors, "vocabulary": self.vocabulary}
        with open(fname, 'wb') as fout:
            pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

# Sample embedding files used by the commented-out loading examples below.
glove50dt = "glove.6B.50d.txt"
wikifab = "wiki.fa.bin"
googb = "GoogleNews-vectors-negative300.bin"

# Example usage:
#   w2v = W2V.from_text(glove50dt, encoding=True)
#   w2v.save("test")
#   w2v = W2V.from_W2V("test")
#   w2v = W2V.from_bin(googb)

if __name__ == "__main__":
    # Smoke-test the W2V class on a random model.
    vocab_size = 100
    embedding_dim = 300
    # Build a toy vocabulary of random 10-letter words plus random vectors.
    vocabulary = [''.join(choice(ascii_lowercase) for _ in range(10))
                  for _ in range(vocab_size)]
    vectors = np.random.random((vocab_size, embedding_dim))
    # Exercise construction, pickling round-trip, deletion, normalization
    # and nearest-neighbour search.
    my_w2v = W2V(vocabulary, vectors)
    my_w2v.save("model")
    my_w2v = W2V.from_W2V("model")
    del my_w2v[vocabulary[10]]
    print(my_w2v.shape)
    tmp = my_w2v.vectors  # kept around for manual before/after comparison
    my_w2v.normalize_words(ord=2, inplace=True)
    print(len(my_w2v.nearest_neighbors(vocabulary[3], k=10)))

207 changes: 207 additions & 0 deletions scripts/model/LoadModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
"""
auther Mohamad M. Jafari

This file contains a CLASS for loading and useing embedding models!

Essential materials for dealing with [word2vec, gensim] models is implemented!

There are some extra facilities to perform bunch of other operation on models,
like reordering, changeing format and etc.

"""



import numpy as np
import pickle
import gensim
from numpy import dot
from numpy.linalg import norm
from random import choice
from string import ascii_lowercase
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

class W2V():
    """In-memory word-embedding model.

    Holds a vocabulary list plus a matrix of row vectors (row i is the
    vector of vocabulary[i]) and provides lookup, deletion, normalization,
    nearest-neighbour search, 2-D t-SNE visualization, and several
    load/save formats.
    """

    def __init__(self, vocabulary, vectors):
        self.vocabulary = vocabulary        # list of vocab words
        self.vectors = np.asarray(vectors)  # numpy array, each row contains one word vector
                                            # (corresponding to vocab list)

    def __getitem__(self, index):
        """Return the vector stored at row `index`."""
        return self.vectors[index, :]

    def __contains__(self, word):
        """True if `word` is in the vocabulary."""
        return word in self.vocabulary

    def __delitem__(self, word):
        """Remove `word` and its vector from the model."""
        index = self.vocabulary.index(word)
        del self.vocabulary[index]
        self.vectors = np.delete(self.vectors, index, 0)

    def __len__(self):
        return len(self.vocabulary)

    def __iter__(self):
        """Yield (word, vector) pairs in vocabulary order."""
        # BUG FIX: original referenced the bare name `vocabulary` (NameError)
        # and paid an O(n) .index() per word; enumerate fixes both.
        for index, word in enumerate(self.vocabulary):
            yield word, self.vectors[index]

    def get_vector(self, word):
        """Return the vector of `word`; re-raise ValueError if unknown."""
        try:
            index = self.vocabulary.index(word)
        except ValueError:  # narrow except instead of a bare one
            print("word not found!")
            raise
        return self.vectors[index]

    @property
    def words(self):
        """The vocabulary list."""
        return self.vocabulary

    @property
    def shape(self):
        """(vocab_size, embedding_dim) of the vector matrix."""
        return self.vectors.shape

    def normalize_words(self, ord=2, inplace=False):
        """Scale every word vector to unit l-`ord` norm.

        Returns self when `inplace` is True, otherwise a new W2V that
        shares the vocabulary list.
        """
        if ord == 2:
            ord = None  # numpy uses this flag to indicate l2.
        vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1)
        if inplace:
            self.vectors = vectors.T
            return self
        return W2V(vectors=vectors.T, vocabulary=self.vocabulary)

    def nearest_neighbors(self, word, k=1):
        """Return the k largest cosine similarities between `word` (a word
        string or a row index) and the other words, best first.

        NOTE(review): this returns similarity scores, not the neighbouring
        words themselves.
        """
        if isinstance(word, str):
            assert word in self, "invalid word!"
            v = self.vocabulary.index(word)
            # BUG FIX: dropped stray debug print(v).
        else:
            v = word
        dist = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
        vectors = self.vectors
        distances = [dist(vectors[v, :], vectors[x, :]) for x in range(0, len(vectors))]
        # Slot 0 of the sorted list is the word compared with itself (1.0).
        return sorted(distances, reverse=True)[1:1 + k]

    @staticmethod
    def from_text(fname, encoding=False):
        """Load a GloVe/word2vec-style text file (one `word v1 v2 ...` per
        line). `encoding=True` opens the file as UTF-8.
        """
        words = []
        vectors = []
        # Single code path replaces two duplicated open/parse branches.
        with open(fname, 'r', encoding="utf-8" if encoding else None) as fin:
            for line in fin:
                line = line.split(" ")
                word, vector = line[0], [float(x) for x in line[1:]]
                words.append(word)
                vectors.append(vector)
        return W2V(vocabulary=words, vectors=vectors)

    @staticmethod
    def from_bin(fname):
        """Load a binary word2vec file via gensim (pre-4.0 API)."""
        model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
        # BUG FIX: original passed np.array(model.vocab) -- the vocab dict --
        # as the vectors; the embedding matrix lives in model.vectors.
        return W2V(vocabulary=list(model.vocab), vectors=model.vectors)

    @staticmethod
    def to_word2vec(w, fname, binary=False):
        """Write model `w` to `fname` in word2vec text or binary format."""
        with open(fname, 'wb') as fout:
            header = "%s %s\n" % w.vectors.shape
            fout.write(header.encode("utf-8"))
            # BUG FIX: w.vocabulary is a plain list; the original
            # w.vocabulary.words raised AttributeError.
            for word, vector in zip(w.vocabulary, w.vectors):
                if binary:
                    # tobytes() replaces the deprecated tostring().
                    line = word.encode("utf-8") + b" " + vector.astype("float32").tobytes()
                    fout.write(line)
                else:
                    line = "%s %s\n" % (word, ' '.join("%.15f" % val for val in vector))
                    fout.write(line.encode("utf-8"))

    @staticmethod
    def from_W2V(fname):
        """Load a model previously written by save() (a pickled dict)."""
        # NOTE(review): pickle.load must only be fed trusted files.
        with open(fname, 'rb') as fin:
            wtov = pickle.load(fin)
        vec, voc = wtov["vectors"], wtov["vocabulary"]
        return W2V(vocabulary=voc, vectors=vec)

    @staticmethod
    def tsne_plot(words, vectors):
        """Project `vectors` to 2-D with t-SNE and scatter-plot them,
        annotating each point with the matching entry of `words`.

        Blocks on plt.show() until the window is closed.
        """
        assert len(words) == len(vectors), "words and vectors must have the same length"
        tsne_model = TSNE(perplexity=40, n_components=2, init='pca',
                          n_iter=2500, random_state=23)
        new_values = tsne_model.fit_transform(vectors)

        x = [value[0] for value in new_values]
        y = [value[1] for value in new_values]

        plt.figure(figsize=(16, 16))
        for i in range(len(x)):
            plt.scatter(x[i], y[i])
            plt.annotate(words[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        plt.show()

    def save(self, fname):
        """Pickle the model as {"vectors": ..., "vocabulary": ...}."""
        model = {"vectors": self.vectors, "vocabulary": self.vocabulary}
        with open(fname, 'wb') as fout:
            pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

# Sample embedding files used by the commented-out loading examples below.
glove50dt = "glove.6B.50d.txt"
wikifab = "wiki.fa.bin"
googb = "GoogleNews-vectors-negative300.bin"

# Example usage:
#   w2v = W2V.from_text(glove50dt, encoding=True)
#   w2v.save("test")
#   w2v = W2V.from_W2V("test")
#   w2v = W2V.from_bin(googb)

if __name__ == "__main__":
    # Smoke-test the W2V class on a random model.
    vocab_size = 100
    embedding_dim = 300
    # Build a toy vocabulary of random 10-letter words plus random vectors.
    vocabulary = [''.join(choice(ascii_lowercase) for _ in range(10))
                  for _ in range(vocab_size)]
    vectors = np.random.random((vocab_size, embedding_dim))
    # Exercise construction, pickling round-trip, deletion, normalization,
    # nearest-neighbour search and the t-SNE visualization.
    my_w2v = W2V(vocabulary, vectors)
    my_w2v.save("model")
    my_w2v = W2V.from_W2V("model")
    del my_w2v[vocabulary[10]]
    print(my_w2v.shape)
    tmp = my_w2v.vectors  # kept around for manual before/after comparison
    my_w2v.normalize_words(ord=2, inplace=True)
    print(len(my_w2v.nearest_neighbors(vocabulary[3], k=10)))
    W2V.tsne_plot(my_w2v.words, my_w2v.vectors)