Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add 2D visualization using TSNE #61

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions code/models/LoadModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""
auther Mohamad M. Jafari

This file contains a CLASS for loading and useing embedding models!

Essential materials for dealing with [word2vec, gensim] models is implemented!

There are some extra facilities to perform bunch of other operation on models,
like reordering, changeing format and etc.

"""



import numpy as np
import pickle
import gensim
from numpy import dot
from numpy.linalg import norm
from random import choice
from string import ascii_lowercase

class W2V():
    """In-memory word-embedding model.

    Holds a vocabulary list plus a matrix of row vectors (row i is the
    vector of vocabulary[i]) and provides lookup, deletion, normalization,
    nearest-neighbour search, and several load/save formats.
    """

    def __init__(self, vocabulary, vectors):
        self.vocabulary = vocabulary        # list of vocab words
        self.vectors = np.asarray(vectors)  # numpy array, each row contains one word vector
                                            # (corresponding to vocab list)

    def __getitem__(self, index):
        """Return the vector stored at row `index`."""
        return self.vectors[index, :]

    def __contains__(self, word):
        """True if `word` is in the vocabulary."""
        return word in self.vocabulary

    def __delitem__(self, word):
        """Remove `word` and its vector from the model."""
        index = self.vocabulary.index(word)
        del self.vocabulary[index]
        self.vectors = np.delete(self.vectors, index, 0)

    def __len__(self):
        return len(self.vocabulary)

    def __iter__(self):
        """Yield (word, vector) pairs in vocabulary order."""
        # BUG FIX: original referenced the bare name `vocabulary` (NameError)
        # and paid an O(n) .index() per word; enumerate fixes both.
        for index, word in enumerate(self.vocabulary):
            yield word, self.vectors[index]

    def get_vector(self, word):
        """Return the vector of `word`; re-raise ValueError if unknown."""
        try:
            index = self.vocabulary.index(word)
        except ValueError:  # narrow except instead of a bare one
            print("word not found!")
            raise
        return self.vectors[index]

    @property
    def words(self):
        """The vocabulary list."""
        return self.vocabulary

    @property
    def shape(self):
        """(vocab_size, embedding_dim) of the vector matrix."""
        return self.vectors.shape

    def normalize_words(self, ord=2, inplace=False):
        """Scale every word vector to unit l-`ord` norm.

        Returns self when `inplace` is True, otherwise a new W2V that
        shares the vocabulary list.
        """
        if ord == 2:
            ord = None  # numpy uses this flag to indicate l2.
        vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1)
        if inplace:
            self.vectors = vectors.T
            return self
        return W2V(vectors=vectors.T, vocabulary=self.vocabulary)

    def nearest_neighbors(self, word, k=1):
        """Return the k largest cosine similarities between `word` (a word
        string or a row index) and the other words, best first.

        NOTE(review): this returns similarity scores, not the neighbouring
        words themselves.
        """
        if isinstance(word, str):
            assert word in self, "invalid word!"
            v = self.vocabulary.index(word)
            # BUG FIX: dropped stray debug print(v).
        else:
            v = word
        dist = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
        vectors = self.vectors
        distances = [dist(vectors[v, :], vectors[x, :]) for x in range(0, len(vectors))]
        # Slot 0 of the sorted list is the word compared with itself (1.0).
        return sorted(distances, reverse=True)[1:1 + k]

    @staticmethod
    def from_text(fname, encoding=False):
        """Load a GloVe/word2vec-style text file (one `word v1 v2 ...` per
        line). `encoding=True` opens the file as UTF-8.
        """
        words = []
        vectors = []
        # Single code path replaces two duplicated open/parse branches.
        with open(fname, 'r', encoding="utf-8" if encoding else None) as fin:
            for line in fin:
                line = line.split(" ")
                word, vector = line[0], [float(x) for x in line[1:]]
                words.append(word)
                vectors.append(vector)
        return W2V(vocabulary=words, vectors=vectors)

    @staticmethod
    def from_bin(fname):
        """Load a binary word2vec file via gensim (pre-4.0 API)."""
        model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
        # BUG FIX: original passed np.array(model.vocab) -- the vocab dict --
        # as the vectors; the embedding matrix lives in model.vectors.
        return W2V(vocabulary=list(model.vocab), vectors=model.vectors)

    @staticmethod
    def to_word2vec(w, fname, binary=False):
        """Write model `w` to `fname` in word2vec text or binary format."""
        with open(fname, 'wb') as fout:
            header = "%s %s\n" % w.vectors.shape
            fout.write(header.encode("utf-8"))
            # BUG FIX: w.vocabulary is a plain list; the original
            # w.vocabulary.words raised AttributeError.
            for word, vector in zip(w.vocabulary, w.vectors):
                if binary:
                    # tobytes() replaces the deprecated tostring().
                    line = word.encode("utf-8") + b" " + vector.astype("float32").tobytes()
                    fout.write(line)
                else:
                    line = "%s %s\n" % (word, ' '.join("%.15f" % val for val in vector))
                    fout.write(line.encode("utf-8"))

    @staticmethod
    def from_W2V(fname):
        """Load a model previously written by save() (a pickled dict)."""
        # NOTE(review): pickle.load must only be fed trusted files.
        with open(fname, 'rb') as fin:
            wtov = pickle.load(fin)
        vec, voc = wtov["vectors"], wtov["vocabulary"]
        return W2V(vocabulary=voc, vectors=vec)

    def save(self, fname):
        """Pickle the model as {"vectors": ..., "vocabulary": ...}."""
        model = {"vectors": self.vectors, "vocabulary": self.vocabulary}
        with open(fname, 'wb') as fout:
            pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

# Sample embedding files used by the commented-out loading examples below.
glove50dt = "glove.6B.50d.txt"
wikifab = "wiki.fa.bin"
googb = "GoogleNews-vectors-negative300.bin"

# Example usage:
#   w2v = W2V.from_text(glove50dt, encoding=True)
#   w2v.save("test")
#   w2v = W2V.from_W2V("test")
#   w2v = W2V.from_bin(googb)

if __name__ == "__main__":
    # Smoke-test the W2V class on a random model.
    vocab_size = 100
    embedding_dim = 300
    # Build a toy vocabulary of random 10-letter words plus random vectors.
    vocabulary = [''.join(choice(ascii_lowercase) for _ in range(10))
                  for _ in range(vocab_size)]
    vectors = np.random.random((vocab_size, embedding_dim))
    # Exercise construction, pickling round-trip, deletion, normalization
    # and nearest-neighbour search.
    my_w2v = W2V(vocabulary, vectors)
    my_w2v.save("model")
    my_w2v = W2V.from_W2V("model")
    del my_w2v[vocabulary[10]]
    print(my_w2v.shape)
    tmp = my_w2v.vectors  # kept around for manual before/after comparison
    my_w2v.normalize_words(ord=2, inplace=True)
    print(len(my_w2v.nearest_neighbors(vocabulary[3], k=10)))

207 changes: 207 additions & 0 deletions scripts/model/LoadModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
"""
auther Mohamad M. Jafari

This file contains a CLASS for loading and useing embedding models!

Essential materials for dealing with [word2vec, gensim] models is implemented!

There are some extra facilities to perform bunch of other operation on models,
like reordering, changeing format and etc.

"""



import numpy as np
import pickle
import gensim
from numpy import dot
from numpy.linalg import norm
from random import choice
from string import ascii_lowercase
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

class W2V():
    """In-memory word-embedding model.

    Holds a vocabulary list plus a matrix of row vectors (row i is the
    vector of vocabulary[i]) and provides lookup, deletion, normalization,
    nearest-neighbour search, 2-D t-SNE visualization, and several
    load/save formats.
    """

    def __init__(self, vocabulary, vectors):
        self.vocabulary = vocabulary        # list of vocab words
        self.vectors = np.asarray(vectors)  # numpy array, each row contains one word vector
                                            # (corresponding to vocab list)

    def __getitem__(self, index):
        """Return the vector stored at row `index`."""
        return self.vectors[index, :]

    def __contains__(self, word):
        """True if `word` is in the vocabulary."""
        return word in self.vocabulary

    def __delitem__(self, word):
        """Remove `word` and its vector from the model."""
        index = self.vocabulary.index(word)
        del self.vocabulary[index]
        self.vectors = np.delete(self.vectors, index, 0)

    def __len__(self):
        return len(self.vocabulary)

    def __iter__(self):
        """Yield (word, vector) pairs in vocabulary order."""
        # BUG FIX: original referenced the bare name `vocabulary` (NameError)
        # and paid an O(n) .index() per word; enumerate fixes both.
        for index, word in enumerate(self.vocabulary):
            yield word, self.vectors[index]

    def get_vector(self, word):
        """Return the vector of `word`; re-raise ValueError if unknown."""
        try:
            index = self.vocabulary.index(word)
        except ValueError:  # narrow except instead of a bare one
            print("word not found!")
            raise
        return self.vectors[index]

    @property
    def words(self):
        """The vocabulary list."""
        return self.vocabulary

    @property
    def shape(self):
        """(vocab_size, embedding_dim) of the vector matrix."""
        return self.vectors.shape

    def normalize_words(self, ord=2, inplace=False):
        """Scale every word vector to unit l-`ord` norm.

        Returns self when `inplace` is True, otherwise a new W2V that
        shares the vocabulary list.
        """
        if ord == 2:
            ord = None  # numpy uses this flag to indicate l2.
        vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1)
        if inplace:
            self.vectors = vectors.T
            return self
        return W2V(vectors=vectors.T, vocabulary=self.vocabulary)

    def nearest_neighbors(self, word, k=1):
        """Return the k largest cosine similarities between `word` (a word
        string or a row index) and the other words, best first.

        NOTE(review): this returns similarity scores, not the neighbouring
        words themselves.
        """
        if isinstance(word, str):
            assert word in self, "invalid word!"
            v = self.vocabulary.index(word)
            # BUG FIX: dropped stray debug print(v).
        else:
            v = word
        dist = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
        vectors = self.vectors
        distances = [dist(vectors[v, :], vectors[x, :]) for x in range(0, len(vectors))]
        # Slot 0 of the sorted list is the word compared with itself (1.0).
        return sorted(distances, reverse=True)[1:1 + k]

    @staticmethod
    def from_text(fname, encoding=False):
        """Load a GloVe/word2vec-style text file (one `word v1 v2 ...` per
        line). `encoding=True` opens the file as UTF-8.
        """
        words = []
        vectors = []
        # Single code path replaces two duplicated open/parse branches.
        with open(fname, 'r', encoding="utf-8" if encoding else None) as fin:
            for line in fin:
                line = line.split(" ")
                word, vector = line[0], [float(x) for x in line[1:]]
                words.append(word)
                vectors.append(vector)
        return W2V(vocabulary=words, vectors=vectors)

    @staticmethod
    def from_bin(fname):
        """Load a binary word2vec file via gensim (pre-4.0 API)."""
        model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True)
        # BUG FIX: original passed np.array(model.vocab) -- the vocab dict --
        # as the vectors; the embedding matrix lives in model.vectors.
        return W2V(vocabulary=list(model.vocab), vectors=model.vectors)

    @staticmethod
    def to_word2vec(w, fname, binary=False):
        """Write model `w` to `fname` in word2vec text or binary format."""
        with open(fname, 'wb') as fout:
            header = "%s %s\n" % w.vectors.shape
            fout.write(header.encode("utf-8"))
            # BUG FIX: w.vocabulary is a plain list; the original
            # w.vocabulary.words raised AttributeError.
            for word, vector in zip(w.vocabulary, w.vectors):
                if binary:
                    # tobytes() replaces the deprecated tostring().
                    line = word.encode("utf-8") + b" " + vector.astype("float32").tobytes()
                    fout.write(line)
                else:
                    line = "%s %s\n" % (word, ' '.join("%.15f" % val for val in vector))
                    fout.write(line.encode("utf-8"))

    @staticmethod
    def from_W2V(fname):
        """Load a model previously written by save() (a pickled dict)."""
        # NOTE(review): pickle.load must only be fed trusted files.
        with open(fname, 'rb') as fin:
            wtov = pickle.load(fin)
        vec, voc = wtov["vectors"], wtov["vocabulary"]
        return W2V(vocabulary=voc, vectors=vec)

    @staticmethod
    def tsne_plot(words, vectors):
        """Project `vectors` to 2-D with t-SNE and scatter-plot them,
        annotating each point with the matching entry of `words`.

        Blocks on plt.show() until the window is closed.
        """
        assert len(words) == len(vectors), "words and vectors must have the same length"
        tsne_model = TSNE(perplexity=40, n_components=2, init='pca',
                          n_iter=2500, random_state=23)
        new_values = tsne_model.fit_transform(vectors)

        x = [value[0] for value in new_values]
        y = [value[1] for value in new_values]

        plt.figure(figsize=(16, 16))
        for i in range(len(x)):
            plt.scatter(x[i], y[i])
            plt.annotate(words[i],
                         xy=(x[i], y[i]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        plt.show()

    def save(self, fname):
        """Pickle the model as {"vectors": ..., "vocabulary": ...}."""
        model = {"vectors": self.vectors, "vocabulary": self.vocabulary}
        with open(fname, 'wb') as fout:
            pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

# Sample embedding files used by the commented-out loading examples below.
glove50dt = "glove.6B.50d.txt"
wikifab = "wiki.fa.bin"
googb = "GoogleNews-vectors-negative300.bin"

# Example usage:
#   w2v = W2V.from_text(glove50dt, encoding=True)
#   w2v.save("test")
#   w2v = W2V.from_W2V("test")
#   w2v = W2V.from_bin(googb)

if __name__ == "__main__":
    # Smoke-test the W2V class on a random model.
    vocab_size = 100
    embedding_dim = 300
    # Build a toy vocabulary of random 10-letter words plus random vectors.
    vocabulary = [''.join(choice(ascii_lowercase) for _ in range(10))
                  for _ in range(vocab_size)]
    vectors = np.random.random((vocab_size, embedding_dim))
    # Exercise construction, pickling round-trip, deletion, normalization,
    # nearest-neighbour search and the t-SNE visualization.
    my_w2v = W2V(vocabulary, vectors)
    my_w2v.save("model")
    my_w2v = W2V.from_W2V("model")
    del my_w2v[vocabulary[10]]
    print(my_w2v.shape)
    tmp = my_w2v.vectors  # kept around for manual before/after comparison
    my_w2v.normalize_words(ord=2, inplace=True)
    print(len(my_w2v.nearest_neighbors(vocabulary[3], k=10)))
    W2V.tsne_plot(my_w2v.words, my_w2v.vectors)