Commit: 14 changed files with 1,204 additions and 755 deletions.
## README
This project implements classic knowledge-graph algorithms from papers I have read:

1. TransE is a classic knowledge-representation algorithm for knowledge graphs. The project provides training code (a multiprocess version) and test code. Code for further algorithms will be added as I read more papers.
2. The data files are too large to upload. Please download data.zip from https://github.com/thunlp/KB2E and extract it into the project's data directory.
3. TransE paper: https://www.utc.fr/~bordesan/dokuwiki/_media/en/transe_nips13.pdf
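For context (this is the model from the paper linked above, not anything specific to this repo): TransE embeds entities and relations in the same vector space and asks that a valid triple satisfy h + r ≈ t, so candidates are ranked by the dissimilarity

$$
f(h, r, t) = \lVert \mathbf{h} + \mathbf{r} - \mathbf{t} \rVert_{1\ \text{or}\ 2}
$$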
### Training

#### Simple version

./train_fb15k.sh 0

Training implemented in plain Python only.

#### Manager version

./train_fb15k.sh 1

Passes the TransE instance between processes via a multiprocessing manager.

#### Queue version

./train_fb15k.sh 2

Feeds the TransE training data through a queue, which reduces inter-process overhead and speeds up training (see the sketch below).
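The train_fb15k.sh script and the training code itself are not shown in this commit, so the following is only a minimal, self-contained sketch of the producer/consumer pattern the Queue version describes; `make_batch` and `train_step` are hypothetical stand-ins, not names from this repository.

```python
import multiprocessing as mp


def make_batch(i):
    # Hypothetical stand-in for real triple sampling; returns a dummy batch.
    return [(i, i + 1, 0)]


def train_step(batch):
    # Hypothetical stand-in for the real SGD update on a batch of triples.
    print('trained on', batch)


def consumer(queue):
    while True:
        batch = queue.get()
        if batch is None:          # poison pill: no more work
            break
        train_step(batch)


if __name__ == '__main__':
    q = mp.Queue()
    worker = mp.Process(target=consumer, args=(q,))
    worker.start()
    for i in range(3):             # producer: queue up batches
        q.put(make_batch(i))
    q.put(None)
    worker.join()
```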
Once training is complete, run the tests.

### Testing

#### TestTransEMpQueue

python TestTransEMpQueue.py

Multiprocess queue-based test acceleration. The gain is modest: about 0.5 s per test triple, so the full test run takes nearly 5 h.

#### TestMainTF

python TestMainTF.py

TensorFlow combined with multiprocessing. The speedup is significant: the full test run takes only about 8 min.
### Final test results

FB15k, epochs: 2000

| | MeanRank (raw) | MeanRank (filter) | Hits@10 (raw) | Hits@10 (filter) |
| --- | --- | --- | --- | --- |
| head | 320.743 | 192.152 | 29.7 | 41.2 |
| tail | 236.984 | 153.431 | 36.1 | 46.2 |
| average | 278.863 | 172.792 | 32.9 | 43.7 |
| paper | 243 | 125 | 34.9 | 47.1 |
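For reference, with $S$ the set of test triples and rank(·) the position of the correct entity in the sorted candidate list, the two reported metrics are

$$
\text{MeanRank} = \frac{1}{|S|} \sum_{(h,r,t) \in S} \operatorname{rank}(h,r,t),
\qquad
\text{Hits@10} = \frac{1}{|S|} \sum_{(h,r,t) \in S} \mathbb{1}\big[\operatorname{rank}(h,r,t) \le 10\big]
$$

The raw columns rank against all candidate entities; the filter columns first discard corrupted triples that are themselves valid (i.e. present in the train/valid/test sets), as in the original paper.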
## TestDatasetTF.py
```python
import os
import pandas as pd


class KnowledgeGraph:
    def __init__(self, data_dir):
        # Because of how the TF APIs are used later: Python cannot convert a
        # Tensor directly to a string, but TF values convert cleanly to numpy
        # types. So the training/validation/test triples kept here are id
        # triples, not string triples.
        self.data_dir = data_dir
        self.entity_dict = {}
        self.entities = []
        self.relation_dict = {}
        self.n_entity = 0
        self.n_relation = 0
        self.training_triples = []  # list of triples in the form of (h, t, r)
        self.validation_triples = []
        self.test_triples = []
        self.n_training_triple = 0
        self.n_validation_triple = 0
        self.n_test_triple = 0
        '''load dicts and triples'''
        self.load_dicts()
        self.load_triples()
        '''construct pools after loading'''
        self.training_triple_pool = set(self.training_triples)
        self.golden_triple_pool = set(self.training_triples) | set(
            self.validation_triples) | set(self.test_triples)

    def load_dicts(self):
        entity_dict_file = 'entity2id.txt'
        relation_dict_file = 'relation2id.txt'
        print('-----Loading entity dict-----')
        entity_df = pd.read_table(
            os.path.join(self.data_dir, entity_dict_file), header=None)
        self.entity_dict = dict(zip(entity_df[0], entity_df[1]))
        self.n_entity = len(self.entity_dict)
        self.entities = list(self.entity_dict.values())
        print('#entity: {}'.format(self.n_entity))
        print('-----Loading relation dict-----')
        relation_df = pd.read_table(
            os.path.join(self.data_dir, relation_dict_file), header=None)
        self.relation_dict = dict(zip(relation_df[0], relation_df[1]))
        self.n_relation = len(self.relation_dict)
        print('#relation: {}'.format(self.n_relation))

    def load_triples(self):
        training_file = 'train.txt'
        validation_file = 'valid.txt'
        test_file = 'test.txt'
        print('-----Loading training triples-----')
        training_df = pd.read_table(
            os.path.join(self.data_dir, training_file), header=None)
        self.training_triples = list(zip(
            [self.entity_dict[h] for h in training_df[0]],
            [self.entity_dict[t] for t in training_df[1]],
            [self.relation_dict[r] for r in training_df[2]]))
        self.n_training_triple = len(self.training_triples)
        print('#training triple: {}'.format(self.n_training_triple))
        print('-----Loading validation triples-----')
        validation_df = pd.read_table(
            os.path.join(self.data_dir, validation_file), header=None)
        self.validation_triples = list(zip(
            [self.entity_dict[h] for h in validation_df[0]],
            [self.entity_dict[t] for t in validation_df[1]],
            [self.relation_dict[r] for r in validation_df[2]]))
        self.n_validation_triple = len(self.validation_triples)
        print('#validation triple: {}'.format(self.n_validation_triple))
        print('-----Loading test triples-----')
        test_df = pd.read_table(
            os.path.join(self.data_dir, test_file), header=None)
        self.test_triples = list(zip(
            [self.entity_dict[h] for h in test_df[0]],
            [self.entity_dict[t] for t in test_df[1]],
            [self.relation_dict[r] for r in test_df[2]]))
        self.n_test_triple = len(self.test_triples)
        print('#test triple: {}'.format(self.n_test_triple))
```
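A minimal usage sketch, assuming the FB15k data has been unpacked into ./data/FB15k/ as described in the README:

```python
from TestDatasetTF import KnowledgeGraph

kg = KnowledgeGraph(data_dir='./data/FB15k/')
print(kg.n_entity, kg.n_relation, kg.n_test_triple)
# golden_triple_pool holds every known-true triple (train | valid | test);
# it is what the filtered metrics check membership against.
print(len(kg.golden_triple_pool))
```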
## TestMainTF.py
```python
import logging

import tensorflow as tf
import argparse
from TestDatasetTF import KnowledgeGraph
from TestModelTF import TransE
from TestTransEMpQueue import get_dict_from_vector_file


def main():
    parser = argparse.ArgumentParser(description='TransE')
    parser.add_argument('--data_dir', type=str, default=r'./data/FB15k/')
    parser.add_argument('--score_func', type=str, default='L1')
    parser.add_argument('--n_rank_calculator', type=int, default=24)
    args = parser.parse_args()
    print(args)
    kg = KnowledgeGraph(data_dir=args.data_dir)

    entity_vector_file = "data/entityVector.txt"
    entity_vector_dict = get_dict_from_vector_file(entity_vector_file)
    relation_vector_file = "data/relationVector.txt"
    relation_vector_dict = get_dict_from_vector_file(relation_vector_file)
    # Configure logging so the INFO message below is actually emitted
    # (the default level, WARNING, would swallow it).
    logging.basicConfig(level=logging.INFO)
    logging.info("********** Start Test **********")

    kge_model = TransE(
        kg=kg,
        score_func=args.score_func,
        n_rank_calculator=args.n_rank_calculator,
        entity_vector_dict=entity_vector_dict,
        rels_vector_dict=relation_vector_dict)

    gpu_config = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(gpu_options=gpu_config)
    with tf.Session(config=sess_config) as sess:
        print('-----Initializing tf graph-----')
        tf.global_variables_initializer().run()
        print('-----Initialization accomplished-----')
        kge_model.check_norm()
        kge_model.launch_evaluation(session=sess)


if __name__ == '__main__':
    main()
```
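`get_dict_from_vector_file` lives in TestTransEMpQueue.py, which is not part of this commit. All TestMainTF needs from it is a mapping from entity/relation name to its embedding vector. Below is a purely hypothetical sketch of such a loader, assuming one `name<TAB>[v1, v2, ...]`-style line per embedding; the repo's actual file format and implementation may differ.

```python
import numpy as np


def get_dict_from_vector_file_sketch(file_path):
    # Hypothetical loader, NOT the repo's implementation: parses lines of
    # the form "<name>\t[v1, v2, ...]" into {name: np.ndarray}.
    vector_dict = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            name, vec = line.strip().split('\t')
            vector_dict[name] = np.array(
                [float(x) for x in vec.strip('[]').split(',')])
    return vector_dict
```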
## TestModelTF.py
```python
import timeit
import numpy as np
import tensorflow as tf
import multiprocessing as mp
from TestDatasetTF import KnowledgeGraph


class TransE:
    def __init__(self, kg: KnowledgeGraph,
                 score_func,
                 n_rank_calculator, entity_vector_dict, rels_vector_dict):
        self.kg = kg
        self.score_func = score_func
        self.n_rank_calculator = n_rank_calculator

        self.entity_vector_dict = entity_vector_dict
        self.rels_vector_dict = rels_vector_dict
        self.entity_embedding = None
        self.relation_embedding = None

        '''ops for evaluation'''
        self.eval_triple = tf.placeholder(dtype=tf.int32, shape=[3])
        self.idx_head_prediction = None
        self.idx_tail_prediction = None
        self.build_entity_embedding()
        self.build_eval_graph()

    def build_entity_embedding(self):
        self.entity_embedding = np.array(
            list(self.entity_vector_dict.values()))
        self.relation_embedding = np.array(
            list(self.rels_vector_dict.values()))

    def build_eval_graph(self):
        with tf.name_scope('evaluation'):
            self.idx_head_prediction, self.idx_tail_prediction = self.evaluate(
                self.eval_triple)

    def evaluate(self, eval_triple):
        with tf.name_scope('lookup'):
            head = tf.nn.embedding_lookup(
                self.entity_embedding, eval_triple[0])
            tail = tf.nn.embedding_lookup(
                self.entity_embedding, eval_triple[1])
            relation = tf.nn.embedding_lookup(
                self.relation_embedding, eval_triple[2])
        with tf.name_scope('link'):
            # head/relation/tail are [dim] vectors while self.entity_embedding
            # is [n, dim]. Plain Python lists of different lengths could not be
            # added or subtracted, but numpy arrays and tf tensors broadcast:
            # each row of self.entity_embedding is combined element-wise with
            # the [dim] vectors, yielding an [n, dim] distance matrix.
            distance_head_prediction = self.entity_embedding + relation - tail
            distance_tail_prediction = head + relation - self.entity_embedding
        with tf.name_scope('rank'):
            if self.score_func == 'L1':  # L1 score
                _, idx_head_prediction = tf.nn.top_k(tf.reduce_sum(
                    tf.abs(distance_head_prediction), axis=1), k=self.kg.n_entity)
                _, idx_tail_prediction = tf.nn.top_k(tf.reduce_sum(
                    tf.abs(distance_tail_prediction), axis=1), k=self.kg.n_entity)
            else:  # L2 score
                _, idx_head_prediction = tf.nn.top_k(tf.reduce_sum(
                    tf.square(distance_head_prediction), axis=1), k=self.kg.n_entity)
                _, idx_tail_prediction = tf.nn.top_k(tf.reduce_sum(
                    tf.square(distance_tail_prediction), axis=1), k=self.kg.n_entity)
        return idx_head_prediction, idx_tail_prediction

    def launch_evaluation(self, session):
        eval_result_queue = mp.JoinableQueue()
        rank_result_queue = mp.Queue()
        print('-----Start evaluation-----')
        start = timeit.default_timer()
        for _ in range(self.n_rank_calculator):
            mp.Process(
                target=self.calculate_rank,
                kwargs={
                    'in_queue': eval_result_queue,
                    'out_queue': rank_result_queue}).start()
        n_used_eval_triple = 0
        for eval_triple in self.kg.test_triples:
            idx_head_prediction, idx_tail_prediction = session.run(
                fetches=[self.idx_head_prediction, self.idx_tail_prediction],
                feed_dict={self.eval_triple: eval_triple})
            eval_result_queue.put(
                (eval_triple, idx_head_prediction, idx_tail_prediction))
            n_used_eval_triple += 1
            print(
                '[{:.3f}s] #evaluation triple: {}/{}'.format(
                    timeit.default_timer() - start,
                    n_used_eval_triple,
                    self.kg.n_test_triple),
                end='\r')
        print()
        for _ in range(self.n_rank_calculator):
            eval_result_queue.put(None)
        print('-----Joining all rank calculators-----')
        eval_result_queue.join()
        print('-----All rank calculation accomplished-----')
        print('-----Obtaining evaluation results-----')
        '''Raw'''
        head_meanrank_raw = 0
        head_hits10_raw = 0
        tail_meanrank_raw = 0
        tail_hits10_raw = 0
        '''Filter'''
        head_meanrank_filter = 0
        head_hits10_filter = 0
        tail_meanrank_filter = 0
        tail_hits10_filter = 0
        for _ in range(n_used_eval_triple):
            head_rank_raw, tail_rank_raw, head_rank_filter, tail_rank_filter = rank_result_queue.get()
            head_meanrank_raw += head_rank_raw
            if head_rank_raw < 10:
                head_hits10_raw += 1
            tail_meanrank_raw += tail_rank_raw
            if tail_rank_raw < 10:
                tail_hits10_raw += 1
            head_meanrank_filter += head_rank_filter
            if head_rank_filter < 10:
                head_hits10_filter += 1
            tail_meanrank_filter += tail_rank_filter
            if tail_rank_filter < 10:
                tail_hits10_filter += 1
        print('-----Raw-----')
        head_meanrank_raw /= n_used_eval_triple
        head_hits10_raw /= n_used_eval_triple
        tail_meanrank_raw /= n_used_eval_triple
        tail_hits10_raw /= n_used_eval_triple
        print('-----Head prediction-----')
        print('MeanRank: {:.3f}, Hits@10: {:.3f}'.format(
            head_meanrank_raw, head_hits10_raw))
        print('-----Tail prediction-----')
        print('MeanRank: {:.3f}, Hits@10: {:.3f}'.format(
            tail_meanrank_raw, tail_hits10_raw))
        print('-----Average-----')
        print('MeanRank: {:.3f}, Hits@10: {:.3f}'.format(
            (head_meanrank_raw + tail_meanrank_raw) / 2,
            (head_hits10_raw + tail_hits10_raw) / 2))
        print('-----Filter-----')
        head_meanrank_filter /= n_used_eval_triple
        head_hits10_filter /= n_used_eval_triple
        tail_meanrank_filter /= n_used_eval_triple
        tail_hits10_filter /= n_used_eval_triple
        print('-----Head prediction-----')
        print('MeanRank: {:.3f}, Hits@10: {:.3f}'.format(
            head_meanrank_filter, head_hits10_filter))
        print('-----Tail prediction-----')
        print('MeanRank: {:.3f}, Hits@10: {:.3f}'.format(
            tail_meanrank_filter, tail_hits10_filter))
        print('-----Average-----')
        print('MeanRank: {:.3f}, Hits@10: {:.3f}'.format(
            (head_meanrank_filter + tail_meanrank_filter) / 2,
            (head_hits10_filter + tail_hits10_filter) / 2))
        print('cost time: {:.3f}s'.format(timeit.default_timer() - start))
        print('-----Finish evaluation-----')

    def calculate_rank(self, in_queue, out_queue):
        while True:
            idx_predictions = in_queue.get()
            if idx_predictions is None:
                in_queue.task_done()
                return
            else:
                eval_triple, idx_head_prediction, idx_tail_prediction = idx_predictions
                head, tail, relation = eval_triple
                head_rank_raw = 0
                tail_rank_raw = 0
                head_rank_filter = 0
                tail_rank_filter = 0
                # tf.nn.top_k sorts distances in descending order, so the
                # reversed index list runs from the best (smallest-distance)
                # candidate to the worst; the rank is the number of candidates
                # seen before the correct entity.
                for candidate in idx_head_prediction[::-1]:
                    if candidate == head:
                        break
                    else:
                        head_rank_raw += 1
                        if (candidate, tail, relation) in self.kg.golden_triple_pool:
                            continue
                        else:
                            head_rank_filter += 1
                for candidate in idx_tail_prediction[::-1]:
                    if candidate == tail:
                        break
                    else:
                        tail_rank_raw += 1
                        if (head, candidate, relation) in self.kg.golden_triple_pool:
                            continue
                        else:
                            tail_rank_filter += 1
                out_queue.put(
                    (head_rank_raw,
                     tail_rank_raw,
                     head_rank_filter,
                     tail_rank_filter))
                in_queue.task_done()

    def check_norm(self):
        print('-----Check norm-----')
        entity_embedding = self.entity_embedding
        relation_embedding = self.relation_embedding
        entity_norm = np.linalg.norm(entity_embedding, ord=2, axis=1)
        relation_norm = np.linalg.norm(relation_embedding, ord=2, axis=1)
        # print('entity norm: {} relation norm: {}'.format(entity_norm, relation_norm))
```
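To make the broadcasting note in `evaluate()` concrete, here is a tiny self-contained numpy example (illustrative values only):

```python
import numpy as np

# A toy [n, dim] entity matrix and [dim] relation/tail vectors.
entity_embedding = np.array([[0.0, 0.0],
                             [1.0, 1.0],
                             [2.0, 2.0]])
relation = np.array([1.0, 0.0])
tail = np.array([2.0, 1.0])

# Broadcasting applies relation and tail to every row, giving the
# per-entity distance vectors used for head prediction.
distance_head_prediction = entity_embedding + relation - tail
print(distance_head_prediction)
# [[-1. -1.]
#  [ 0.  0.]
#  [ 1.  1.]]
```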