# final_process.py
import os
from helper import *
import json
import pickle
from collections import defaultdict as ddict
import pdb
import re
import unicodedata
import nltk
from tqdm import tqdm

rel2id = json.loads(open("./data/rel2id.json", encoding='utf-8').read())
id2rel = dict([(v, k) for k, v in rel2id.items()])
data = {"train": [], "test": []}
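
# Assumed input format (inferred from the code below, not documented here):
# each line of train_bags.json / test_bags.json is one JSON bag such as
#   {"head": "...", "tail": "...", "relation": [...],
#    "sentence": [{"sent": "token1 token2 ..."}, ...]}
# where "sent" is an already word-segmented sentence joined by single spaces.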

def read_file(file_path):
    temp = []
    with open(file_path, encoding='utf-8') as f:
        for k, line in enumerate(f):
            bag = json.loads(line.strip())
            pos1_list = []
            pos2_list = []
            head_pos_list = []
            tail_pos_list = []
            wrds_list = []
            mask_list = []
            # print('complete substitution')
            for sent in bag["sentence"]:  # tokenize each sentence of the bag
                # sent["nlp"]=ddict({"tokens":list})
                sent["nlp"] = {}
                sent["nlp"]["sentences"] = []
                tokenlist = []
                tokens = sent["sent"].split()
                for index, word in enumerate(tokens):
                    token = {}
                    token["index"] = index
                    token["originalText"] = word
                    token["characterOffsetBegin"] = len(" ".join(sent["sent"].split()[0:index])) + (
                        1 if index != 0 else 0)
                    token["characterOffsetEnd"] = len(" ".join(sent["sent"].split()[0:index])) + len(word) + (
                        1 if index != 0 else 0)
                    tokenlist.append(token)
                sent["nlp"]["sentences"].append({"tokens": tokenlist})
            # print('complete adding nlp')
            # print(bag)
            count = 0
            # debug
            # if k>=90000:
            #     print(len(bag['sentence']))
            #     print(bag)
            for sent in bag["sentence"]:
                # debug
                # if k>=90000:
                #     print(sent)
                # Collect the character offsets of the head and tail entities in the sentence
                # into head_start_off and tail_start_off. One entity is located first, then the
                # other; the second entity's offsets must fall before the first character or
                # after the last character of the first entity's spans.
                # Entities consist of one or two words, so the cases len(head) > len(tail)
                # and len(head) <= len(tail) are handled separately.
                if len(bag["head"]) > len(bag["tail"]):  # head entity is longer than the tail entity
                    head_idx = [i for i, e in enumerate(sent["sent"].split()) if
                                e == bag["head"]]  # token indices where a whole token equals the head (there may be several)
                    head_start_off = [len(" ".join(sent["sent"].split()[0:idx])) + (1 if idx != 0 else 0) for idx in
                                      head_idx]  # character offsets of those tokens, counting the spaces between words
                    if head_start_off == []:  # no whole-token match: the head is a multi-word entity joined by underscores,
                        # so replace underscores with spaces on both sides and fall back to regex matching
                        head_start_off = [
                            m.start() for m in re.finditer(
                                bag["head"].replace("_", " "),
                                sent["sent"].replace("_", " ")
                            )
                        ]
                    reserve_span = [(start_off, start_off + len(bag["head"]))
                                    for start_off in head_start_off]  # head spans: (start offset, end offset)
                    tail_idx = [i for i, e in enumerate(sent["sent"].split()) if e == bag["tail"]]
                    tail_start_off = [len(" ".join(sent["sent"].split()[0:idx])) + (1 if idx != 0 else 0) for idx in
                                      tail_idx]
                    if tail_start_off == []:
                        tail_start_off = [
                            m.start() for m in re.finditer(
                                bag["tail"].replace("_", " "),
                                sent["sent"].replace("_", " ")
                            )
                        ]
                    tail_start_off = [
                        off for off in tail_start_off if all([
                            off < span[0] or off > span[1]
                            for span in reserve_span
                        ])
                    ]  # keep only tail offsets that lie before the start or after the end of every head span
                else:  # the head is the same length as, or shorter than, the tail
                    tail_idx = [
                        i for i, e in enumerate(sent["sent"].split()) if e == bag["tail"]
                    ]
                    tail_start_off = [
                        len(" ".join(sent["sent"].split()[0:idx])) + (1 if idx != 0 else 0) for idx in tail_idx
                    ]
                    if tail_start_off == []:  # no whole-token match: replace underscores with spaces and use
                        # regex matching; m.start() is the character offset where the match begins
                        tail_start_off = [
                            m.start() for m in re.finditer(
                                bag["tail"].replace("_", " "),
                                sent["sent"].replace("_", " ")
                            )
                        ]
                    reserve_span = [(start_off, start_off + len(bag["tail"]))
                                    for start_off in tail_start_off]  # tail spans
                    head_idx = [
                        i for i, e in enumerate(sent["sent"].split()) if e == bag["head"]
                    ]
                    head_start_off = [
                        len(" ".join(sent["sent"].split()[0:idx])) + (1 if idx != 0 else 0) for idx in head_idx
                    ]
                    if head_start_off == []:
                        head_start_off = [
                            m.start() for m in re.finditer(
                                bag["head"].replace("_", " "),
                                sent["sent"].replace("_", " ")
                            )
                        ]
                    head_start_off = [
                        off for off in head_start_off if all([
                            off < span[0] or off > span[1]
                            for span in reserve_span
                        ])
                    ]
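                # Example of the fallback above (hypothetical data): if bag["head"] is
                # "new_york" and sent["sent"] is "i live in new york city", no single
                # token equals "new_york", so underscores are replaced by spaces and
                # re.finditer("new york", "i live in new york city") yields offset 10.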
                # entity span tuples: [(start offset, end offset, "head"/"tail"), ...]
                head_off = [(head_off, head_off + len(bag["head"]), "head")
                            for head_off in head_start_off]
                tail_off = [(tail_off, tail_off + len(bag["tail"]), "tail")
                            for tail_off in tail_start_off]
                if head_off == [] or tail_off == []:
                    continue
                spans = [head_off[0]] + [tail_off[0]]
                off_begin, off_end, _ = zip(*spans)
                tid_map, tid2wrd = ddict(dict), ddict(list)
                tok_idx = 1
                head_pos, tail_pos = None, None
                for s_n, sentence in enumerate(sent["nlp"]["sentences"]):
                    i, tokens = 0, sentence["tokens"]
                    while i < len(tokens):
                        # print('sent order {}'.format(i))
                        if tokens[i]['characterOffsetBegin'] in off_begin:
                            _, end_offset, identity = spans[off_begin.index(tokens[i]['characterOffsetBegin'])]
                            if identity == 'head':
                                head_pos = tok_idx - 1  # Indexing starts from 0
                                tok_list = [tok['originalText'] for tok in tokens]
                            else:
                                tail_pos = tok_idx - 1
                                tok_list = [tok['originalText'] for tok in tokens]
                            while i < len(tokens) and tokens[i]['characterOffsetEnd'] <= end_offset:
                                tid_map[s_n][tokens[i]['index']] = tok_idx
                                tid2wrd[tok_idx].append(tokens[i]['originalText'])
                                i += 1
                            tok_idx += 1
                        else:
                            tid_map[s_n][tokens[i]['index']] = tok_idx
                            tid2wrd[tok_idx].append(tokens[i]['originalText'])
                            i += 1
                            tok_idx += 1
                if head_pos is None or tail_pos is None:
                    print('Skipped entry!!')
                    print('{} | {} | {}'.format(bag['head'], bag['tail'], sent['sent']))
                    continue
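                # Example of the merging above (hypothetical data): if the head entity
                # "new_york" matched the two tokens "new" and "york", both tokens share
                # one tok_idx and are later joined as "new_york" in wrds; head_pos is
                # the 0-based position of that merged token.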
                wrds = ['_'.join(e).lower() for e in tid2wrd.values()]
                pos1 = [i - head_pos for i in range(tok_idx - 1)]  # tok_idx = (number of positions + 1)
                pos2 = [i - tail_pos for i in range(tok_idx - 1)]
                # build the mask list used for piecewise pooling
                pos_min = min(head_pos, tail_pos)
                pos_max = max(head_pos, tail_pos)
                mask = []
                for i in range(tok_idx - 1):
                    if i <= pos_min:
                        mask.append(1)
                    elif i <= pos_max:
                        mask.append(2)
                    else:
                        mask.append(3)
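                # Example: with head_pos = 2, tail_pos = 5 and 8 positions, pos1 runs
                # from -2 to 5, pos2 from -5 to 2, and mask = [1, 1, 1, 2, 2, 2, 3, 3],
                # i.e. the three segments used by piecewise (PCNN-style) pooling.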
                wrds_list.append(wrds)
                pos1_list.append(pos1)
                pos2_list.append(pos2)
                head_pos_list.append(head_pos)
                tail_pos_list.append(tail_pos)
                mask_list.append(mask)
                count += 1
            temp.append({
                'head': bag['head'],
                'tail': bag['tail'],
                'rels': bag['relation'],
                # 'phrase_list': phrase_list,
                'head_pos_list': head_pos_list,
                'tail_pos_list': tail_pos_list,
                'wrds_list': wrds_list,
                'pos1_list': pos1_list,
                'pos2_list': pos2_list,
                'mask_list': mask_list
                # 'sub_type': ent2type[bag['sub_id']],
                # 'obj_type': ent2type[bag['obj_id']],
                # 'dep_links_list': dep_links_list,
            })
            if k % 10000 == 0:
                print('Completed {}'.format(k))
            # if not args.FULL and k > args.sample_size: break
    return temp


data['train'] = read_file("./data/train_bags.json")
data['test'] = read_file("./data/test_bags.json")
print('Bags processed: Train: {}, Test: {}'.format(len(data['train']), len(data['test'])))

"""*************** Remove outlier data: truncate long sentences, drop empty bags ***************"""
del_cnt = 0
MAX_WORDS = 100
for dtype in ['train', 'test']:
    for i in range(len(data[dtype]) - 1, -1, -1):
        bag = data[dtype][i]
        for j in range(len(bag['wrds_list']) - 1, -1, -1):
            data[dtype][i]['wrds_list'][j] = data[dtype][i]['wrds_list'][j][:MAX_WORDS]
            data[dtype][i]['pos1_list'][j] = data[dtype][i]['pos1_list'][j][:MAX_WORDS]
            data[dtype][i]['pos2_list'][j] = data[dtype][i]['pos2_list'][j][:MAX_WORDS]
            data[dtype][i]['mask_list'][j] = data[dtype][i]['mask_list'][j][:MAX_WORDS]
        if len(data[dtype][i]['wrds_list']) == 0:
            del data[dtype][i]
            del_cnt += 1
            continue
print('Bags deleted {}'.format(del_cnt))
"""***********************************建立词库**********************************************"""
MAX_VOCAB=150000
#词频字典
voc_freq=ddict(int)
for bag in data['train']:
for sentence in bag['wrds_list']:
for wrd in sentence:
voc_freq[wrd]+=1
freq=list(voc_freq.items())
freq.sort(key=lambda x:x[1],reverse=True)
freq=freq[:MAX_VOCAB]
vocab,_=map(list,zip(*freq))
vocab.append('UNK')
"""*******************************建立word 和 id之间的映射表*********************************"""
#词到id的字典
def getIdMap(vals,begin_idx=0):
ele2id={}
for id,ele in enumerate(vals):
ele2id[ele]=id+begin_idx
return ele2id
voc2id=getIdMap(vocab,1)
id2voc=dict([(v,k) for k,v in voc2id.items()])
print('Chosen Vocabulary:\t{}'.format(len(vocab)))
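# Ids start at 1 (getIdMap(vocab, 1)); id 0 is left unused, presumably so it can
# serve as the padding index downstream.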
"""******************************将数据转化为张量形式************************************"""
MAX_POS=60#并不是最终的max_pos,而是计算max_pos的margin
#词转id
def getId(wrd,wrd2id,def_val='NONE'):
if wrd in wrd2id:
return wrd2id[wrd]
else:
return wrd2id[def_val]
def posMap(pos):
if pos< -MAX_POS:
return 0
elif pos > MAX_POS:
return (MAX_POS+1)*2
else:
return pos+(MAX_POS+1)
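# With MAX_POS = 60: posMap(-70) = 0, posMap(0) = 61, posMap(70) = 122, so every
# relative position falls in [0, 122] and max_pos below is 123 = (MAX_POS + 1) * 2 + 1.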


def procData(data, split='train'):
    result = []
    for bag in data:
        res = {}
        res['X'] = [[getId(wrd, voc2id, 'UNK') for wrd in wrds] for wrds in bag['wrds_list']]
        res['Pos1'] = [[posMap(pos) for pos in pos1] for pos1 in bag['pos1_list']]
        res['Pos2'] = [[posMap(pos) for pos in pos2] for pos2 in bag['pos2_list']]
        res['Mask'] = bag['mask_list']
        res['Y'] = bag['rels']  # k-hot relation label(s) for the bag
        res['HeadPos'] = bag['head_pos_list']
        res['TailPos'] = bag['tail_pos_list']
        result.append(res)
    return result


final_data = {
    "train": procData(data['train'], 'train'),
    "test": procData(data['test'], 'test'),
    "voc2id": voc2id,
    "id2voc": id2voc,
    "max_pos": (MAX_POS + 1) * 2 + 1,
    "rel2id": rel2id
}
print('writing final_data')
pickle.dump(final_data, open("{}_processed.pkl".format("riedel"), 'wb'))
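
# A minimal loading sketch for the output file (not part of this script; the key
# names match those written above):
#
#     import pickle
#     with open('riedel_processed.pkl', 'rb') as f:
#         final_data = pickle.load(f)
#     print(len(final_data['train']), final_data['max_pos'], len(final_data['voc2id']))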