add generate embedding res func

haidfs · Mar 20, 2020 · f40fdcf · f40fdcf
1 parent f99dde8
commit f40fdcf
Show file tree

Hide file tree

Showing 8 changed files with 89 additions and 72 deletions.
diff --git a/TestDatasetTF.py b/TestDatasetTF.py
@@ -1,11 +1,12 @@
+# -*- coding: UTF-8 -*-
 import os
 import pandas as pd
 
 
 class KnowledgeGraph:
     def __init__(self, data_dir):
-        # 考虑到tf的各项api使用，Python不能将Tensor类型直接转换成字符串类型，但是可以将TF类型转换成numpy类型
-        # 所以这里的训练三元组，测试三元组等等，都是id三元组，而不是字符串三元组
+        # 考虑到tf的各项api使用，Python不能将Tensor类型直接转换成字符串类型，但是可以将TF类型转换成numpy类型
+        # 所以这里的训练三元组，测试三元组等等，都是id三元组，而不是字符串三元组
         self.data_dir = data_dir
         self.entity_dict = {}
         self.entities = []

diff --git a/TestMainTF.py b/TestMainTF.py
@@ -1,3 +1,4 @@
+# -*- coding: UTF-8 -*-
 import logging
 
 import tensorflow as tf

diff --git a/TestModelTF.py b/TestModelTF.py
@@ -1,3 +1,4 @@
+# -*- coding: UTF-8 -*-
 import timeit
 import numpy as np
 import tensorflow as tf
@@ -45,9 +46,9 @@ def evaluate(self, eval_triple):
             relation = tf.nn.embedding_lookup(
                 self.relation_embedding, eval_triple[2])
         with tf.name_scope('link'):
-            # 并不太明确这里的用途，h,r,t应该都是[1,dim]维度的向量， self.entity_embedding应该是[n,dim]维度的向量，做加减法得到的是什么类型？
-            # 如果是list类型，对于不同维度是不能直接加减的。但是对于np.array或者tf的embedding，是可以直接相减的，等同于 self.entity_embedding
-            # 的每一行都在和h,r,t做运算
+            # ����̫��ȷ�������;��h,r,tӦ�ö���[1,dim]ά�ȵ������� self.entity_embeddingӦ����[n,dim]ά�ȵ����������Ӽ����õ�����ʲô���ͣ�
+            # �����list���ͣ����ڲ�ͬά���ǲ���ֱ�ӼӼ��ġ����Ƕ���np.array����tf��embedding���ǿ���ֱ������ģ���ͬ�� self.entity_embedding
+            # ��ÿһ�ж��ں�h,r,t������
             distance_head_prediction = self.entity_embedding + relation - tail
             distance_tail_prediction = head + relation - self.entity_embedding
         with tf.name_scope('rank'):

diff --git a/TestTransEMpQueue.py b/TestTransEMpQueue.py
@@ -1,3 +1,4 @@
+# -*- coding: UTF-8 -*-
 from numpy import *
 import operator
 import logging
@@ -10,17 +11,17 @@
 
 
 class Test:
-    '''基本的评价过程
-假设整个知识库中一共有n个实体，那么评价过程如下：
-对于每一个测试的三元组a中的头实体或者尾实体，依次替换为整个知识库中的所有其它实体，也就是会产生n个三元组。
-分别对上述n个三元组计算其能量值(dist值)，在transE中，就是计算h+r-t的值。这样可以得到n个能量值，分别对应上述n个三元组。
-对上述n个能量值进行升序排序。
-记录原本的三元组a的能量值排序后的序号。
-对所有处在测试集中的测试三元组重复上述过程。
-每个正确三元组的能量值排序后的序号求平均，得到的值我们称为Mean Rank。
-计算正确三元组的能量排序后的序号小于10的比例，得到的值我们称为Hits@10。
-上述就是评价的过程，共有两个指标：Mean Rank和Hits@10。其中Mean Rank越小越好，Hits@10越大越好。该代码未计算Hits@10，且Python对于这种大量计算速度很慢。
-建议读者后续使用清华大学库的Fast_TransX代码，使用C++编写，性能高，能够快速得出训练和测试结果。
+    '''���������۹���
+��������֪ʶ����һ����n��ʵ�壬��ô���۹������£�
+����ÿһ�����Ե���Ԫ��a�е�ͷʵ�����βʵ�壬�����滻Ϊ����֪ʶ���е���������ʵ�壬Ҳ���ǻ����n����Ԫ�顣
+�ֱ������n����Ԫ�����������ֵ(distֵ)����transE�У����Ǽ���h+r-t��ֵ���������Եõ�n������ֵ���ֱ��Ӧ����n����Ԫ�顣
+������n������ֵ������������
+��¼ԭ������Ԫ��a������ֵ��������š�
+�����д��ڲ��Լ��еĲ�����Ԫ���ظ��������̡�
+ÿ����ȷ��Ԫ�������ֵ�����������ƽ�����õ���ֵ���ǳ�ΪMean Rank��
+������ȷ��Ԫ����������������С��10�ı������õ���ֵ���ǳ�ΪHits@10��
+�����������۵Ĺ��̣���������ָ�꣺Mean Rank��Hits@10������Mean RankԽСԽ�ã�Hits@10Խ��Խ�á��ô���δ����Hits@10����Python�������ִ��������ٶȺ�����
+������ߺ���ʹ���廪��ѧ���Fast_TransX���룬ʹ��C++��д�����ܸߣ��ܹ����ٵó�ѵ���Ͳ��Խ����
 '''
 
     def __init__(self, entity_dyct, relation_dyct, train_triple_list,
@@ -57,15 +58,15 @@ def get_rank_part(self, triplet):
                     continue
                 rank_dyct[ent] = distance(self.entity_dyct[ent], self.entity_dyct[triplet[1]],
                                           self.relation_dyct[triplet[2]])
-            else:  # 根据标签替换头实体或者替换尾实体计算距离
+            else:  # ���ݱ�ǩ�滻ͷʵ������滻βʵ��������
                 corrupted_triplet = (triplet[0], ent, triplet[2])
                 if self.is_fit and (
                         corrupted_triplet in self.train_triple_list):
                     continue
                 rank_dyct[ent] = distance(self.entity_dyct[triplet[0]], self.entity_dyct[ent],
                                           self.relation_dyct[triplet[2]])
         sorted_rank = sorted(rank_dyct.items(),
-                             key=operator.itemgetter(1))  # 按照元素的第一个域进行升序排序
+                             key=operator.itemgetter(1))  # ����Ԫ�صĵ�һ���������������
         if self.label == 'head':
             num_tri = 0
         else:
@@ -167,7 +168,7 @@ def get_dict_from_vector_file(file_path):
     dyct = {}
     for line in file.readlines():
         name_vector = line.strip().split("\t")
-        # 这里的vector使用[1:-1]是因为vector是'[0.11,0.22,..]'这样的str类型，[1:-1]是为了去掉列表的中括号
+        # �����vectorʹ��[1:-1]����Ϊvector��'[0.11,0.22,..]'������str���ͣ�[1:-1]��Ϊ��ȥ���б��������
         vector = [float(s) for s in name_vector[1][1:-1].split(", ")]
         name = name_vector[0]
         dyct[name] = vector

diff --git a/TrainMain.py b/TrainMain.py
@@ -1,3 +1,4 @@
+# -*- coding: UTF-8 -*-
 import timeit
 from TrainTransESimple import prepare_fb15k_train_data
 from TrainTransESimple import TransE
@@ -72,9 +73,9 @@ def main():
             normal_form=args.normal_form)
         logging.info("TransE is initializing...")
         start = timeit.default_timer()
-        for i in range(args.max_epoch):  # epoch的次数
+        for i in range(args.max_epoch):  # epoch�Ĵ���
             lock = Lock()
-            proces = [Process(target=func1, args=(transE, lock)) for j in range(10)]  # 10个多进程，谨慎运行，电脑会很卡
+            proces = [Process(target=func1, args=(transE, lock)) for j in range(10)]  # 10������̣��������У����Ի�ܿ�
             for p in proces:
                 p.start()
             for p in proces:
@@ -86,7 +87,7 @@ def main():
             start = end
             transE.clear_loss()
     logging.info("********** End TransE training ***********\n")
-    # 训练的批次并不一定是100的整数倍，将最后更新的向量写到文件
+    # ѵ�������β���һ����100�����������������µ�����д���ļ�
     transE.write_vector("data/entityVector.txt", "entity")
     transE.write_vector("data/relationVector.txt", "relationship")
 

diff --git a/TrainTransEMpManager.py b/TrainTransEMpManager.py
@@ -1,3 +1,4 @@
+# -*- coding: UTF-8 -*-
 from multiprocessing import Process, Lock
 from multiprocessing.managers import BaseManager
 import logging
@@ -13,22 +14,24 @@
 class TransE(TransESimple):
 
     def get_loss(self):
-        # 参考清华的Fast-TransX的C++代码，确实速度很快，Python接近10个小时的训练C++大概在十几分钟即可完成。粗略的看了一下代码，
-        # 它对原本的论文中的Sbatch做了修改，直接进行了（总数量为训练三元组数，一个epoch分为多个batch完成，每个batch的每一个三元组都随机采样），随机梯度下降。多线程并发，n个batch对应n个线程
-        # Python由于历史遗留问题，使用了GIL，全局解释锁，使得Python的多线程近似鸡肋，无法跑满多核cpu，所以考虑使用多进程优化
-        # 为了使用多进程，使用了manager将transE封装为Proxy对象。由于Proxy对象无法获取封装的TransE类的属性，所以需要写get函数将loss传出。
-        # 另外值得注意的是，Python的多进程性能不一定优于for循环。基本开销就包括了进程的创建和销毁、上下文切换（进程间需要RPC远程通信以做到类变量共享）。
-        # 至少在trainTransE和trainTransE_MultiProcess对比来看，trainTransE的for循环一批10个耗时在8s-9s，trainTransE_MultiProcess的一个epoch即一批，耗时在12-13s。
-        # 进一步优化方法：进程池，实现进程复用？框架：tf？？
+        # 参考清华的Fast-TransX的C++代码，确实速度很快，Python接近10个小时的训练C++大概在十几分钟即可完成。粗略的看了一下代码，
+        # 它对原本的论文中的Sbatch做了修改，直接进行了（总数量为训练三元组数，一个epoch分为多个batch完成，每个batch的每一个三元组都随机采样），随机梯度下降。多线程并发，n个batch对应n个线程
+        # Python由于历史遗留问题，使用了GIL，全局解释锁，使得Python的多线程近似鸡肋，无法跑满多核cpu，所以考虑使用多进程优化
+        # 为了使用多进程，使用了manager将transE封装为Proxy对象。由于Proxy对象无法获取封装的TransE类的属性，所以需要写get函数将loss传出。
+        # 另外值得注意的是，Python的多进程性能不一定优于for循环。基本开销就包括了进程的创建和销毁、上下文切换（进程间需要RPC远程通信以做到类变量共享）。
+        # 至少在trainTransE和trainTransE_MultiProcess对比来看，trainTransE的for循环一批10个耗时在8s-9s，trainTransE_MultiProcess的一个epoch即一批，耗时在12-13s。
+        # 进一步优化方法：进程池，实现进程复用？框架：tf？？
         return self.loss
 
     def clear_loss(self):
-        # 该函数也是为了Proxy对象外部将损失置0
+        # 该函数也是为了Proxy对象外部将损失置0
         self.loss = 0
 
     def transE(self):
+        # 这个地方和父类的transE的区别在于，这里由于是多进程之间直接共享class TransE的实例，所以现在并不知道对应的
+        # 训练epoch，于是这个地方删掉了原本的写文件函数
         Sbatch = self.sample(self.batch_size // 10)
-        Tbatch = []  # 元组对（原三元组，打碎的三元组）的列表 ：{((h,r,t),(h',r,t'))}
+        Tbatch = []  # 元组对（原三元组，打碎的三元组）的列表 ：{((h,r,t),(h',r,t'))}
         for sbatch in Sbatch:
             pos_neg_triplets = (sbatch, self.get_corrupted_triplets(sbatch))
             if pos_neg_triplets not in Tbatch:
@@ -66,10 +69,10 @@ def main():
         margin=1,
         dim=50)
     logging.info("TransE is initializing...")
-    for i in range(20000):  # epoch的次数
+    for i in range(2000):  # epoch的次数
         lock = Lock()
         proces = [Process(target=func1, args=(transE, lock))
-                  for j in range(10)]  # 10个多进程，谨慎运行，电脑会很卡
+                  for j in range(10)]  # 10个多进程，谨慎运行，电脑会很卡
         for p in proces:
             p.start()
         for p in proces:
@@ -78,6 +81,9 @@ def main():
             logging.info(
                 "After %d training epoch(s), loss on batch data is %g" %
                 (i * 10, transE.get_loss()))
+        if i % 100 == 0:
+            transE.write_vector("data/entityVectorMpManager.txt", "entity")
+            transE.write_vector("data/relationVectorMpManager.txt", "rels")
         transE.clear_loss()
     # transE.transE(100000)
     logging.info("********** End TransE training ***********\n")

diff --git a/TrainTransEMpQueue.py b/TrainTransEMpQueue.py
@@ -1,3 +1,4 @@
+# -*- coding: UTF-8 -*-
 import numpy as np
 from multiprocessing import Process, Queue
 import logging
@@ -72,24 +73,24 @@ def update_part(self, pos_triplet, neg_triplet):
         entity_vector_copy = self.entity_vector_dict
         rels_vector_copy = self.rels_vector_dict
 
-        # 这里的h,t,r代表头实体向量、尾实体向量、关系向量，h2和t2代表论文中的h'和t'，即负例三元组中的头尾实体向量
-        # Tbatch是元组对（原三元组，打碎的三元组）的列表
-        # ：[((h,r,t),(h',r,t'))...]，这里由于data文件的原因是(h,t,r)
+        # 这里的h,t,r代表头实体向量、尾实体向量、关系向量，h2和t2代表论文中的h'和t'，即负例三元组中的头尾实体向量
+        # Tbatch是元组对（原三元组，打碎的三元组）的列表
+        # ：[((h,r,t),(h',r,t'))...]，这里由于data文件的原因是(h,t,r)
         h = entity_vector_copy[pos_triplet[0]]
         t = entity_vector_copy[pos_triplet[1]]
         r = rels_vector_copy[pos_triplet[2]]
-        # 损坏三元组中的头实体向量与尾实体向量
+        # 损坏三元组中的头实体向量与尾实体向量
         h2 = entity_vector_copy[neg_triplet[0]]
         t2 = entity_vector_copy[neg_triplet[1]]
-        # 在这里原本定义了beforebatch，但是个人认为没有必要，这里已经进入到batch里面了，走的就是单个处理
+        # 在这里原本定义了beforebatch，但是个人认为没有必要，这里已经进入到batch里面了，走的就是单个处理
         if self.normal_form == "L1":
             dist_triplets = dist_L1(h, t, r)
             dist_corrupted_triplets = dist_L1(h2, t2, r)
         else:
             dist_triplets = dist_L2(h, t, r)
             dist_corrupted_triplets = dist_L2(h2, t2, r)
         eg = self.margin + dist_triplets - dist_corrupted_triplets
-        if eg > 0:  # 大于0取原值，小于0则置0.即合页损失函数margin-based ranking criterion
+        if eg > 0:  # 大于0取原值，小于0则置0.即合页损失函数margin-based ranking criterion
             self.loss += eg
             temp_positive = 2 * self.learning_rate * (t - h - r)
             temp_negative = 2 * self.learning_rate * (t2 - h2 - r)
@@ -99,14 +100,14 @@ def update_part(self, pos_triplet, neg_triplet):
                 temp_positive = np.array(temp_positive_L1) * self.learning_rate
                 temp_negative = np.array(temp_negative_L1) * self.learning_rate
 
-            # 对损失函数的5个参数进行梯度下降， 随机体现在sample函数上
+            # 对损失函数的5个参数进行梯度下降， 随机体现在sample函数上
             h += temp_positive
             t -= temp_positive
             r = r + temp_positive - temp_negative
             h2 -= temp_negative
             t2 += temp_negative
 
-            # 归一化刚才更新的向量，减少计算时间
+            # 归一化刚才更新的向量，减少计算时间
             entity_vector_copy[pos_triplet[0]] = norm(h)
             entity_vector_copy[pos_triplet[1]] = norm(t)
             rels_vector_copy[pos_triplet[2]] = norm(r)
@@ -131,6 +132,9 @@ def main():
     for epoch in range(2000):
         print("Mp Queue TransE, After %d training epoch(s):\n" % epoch)
         transE.launch_training()
+        if epoch % 100 == 0:
+            transE.write_vector("data/entityVectorMpQueue.txt", "entity")
+            transE.write_vector("data/relationVectorMpQueue.txt", "rels")
     logging.info("********** End TransE training ***********\n")