# triple_extractor.py
import re

import pandas as pd
import spacy
import neuralcoref

from create_kg import draw_kg

# load the large English model and add neuralcoref's coreference resolver
# to the pipeline (neuralcoref requires spaCy 2.x)
nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)
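
# Quick sanity check of the coreference step (illustrative only; the exact
# output depends on the neuralcoref and model versions installed):
#
#   doc = nlp('Angela visits the restaurant. She likes it.')
#   doc._.coref_resolved
#   # -> 'Angela visits the restaurant. Angela likes the restaurant.'
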
def get_entity_pairs(text, coref=True):
    """Extract (subject, relation, object) triples from free text."""
    # preprocess text
    text = re.sub(r'\n+', '.', text)  # replace multiple newlines with a period
    text = re.sub(r'\[\d+\]', ' ', text)  # remove reference numbers
    text = nlp(text)
    if coref:
        text = nlp(text._.coref_resolved)  # resolve coreference clusters

    def refine_ent(ent, sent):
        unwanted_tokens = (
            'PRON',   # pronoun
            'PART',   # particle
            'DET',    # determiner
            'SCONJ',  # subordinating conjunction
            'PUNCT',  # punctuation
            'SYM',    # symbol
            'X',      # other
        )
        ent_type = ent.ent_type_  # get entity type
        if ent_type == '':
            # not a named entity: keep only the content words of the noun chunk
            ent_type = 'NOUN_CHUNK'
            ent = ' '.join(str(t.text) for t in nlp(str(ent))
                           if t.pos_ not in unwanted_tokens and not t.is_stop)
        elif ent_type in ('NOMINAL', 'CARDINAL', 'ORDINAL') and str(ent).find(' ') == -1:
            # single-token numeric entity: expand rightwards until a verb or
            # punctuation so that e.g. '60' becomes '60 people'
            refined = ''
            for i in range(len(sent) - ent.i):
                if ent.nbor(i).pos_ not in ('VERB', 'PUNCT'):
                    refined += ' ' + str(ent.nbor(i))
                else:
                    ent = refined.strip()
                    break
        return ent, ent_type

    sentences = [sent.text.strip() for sent in text.sents]  # split text into sentences
    ent_pairs = []
    for sent in sentences:
        sent = nlp(sent)
        spans = list(sent.ents) + list(sent.noun_chunks)  # collect nodes
        spans = spacy.util.filter_spans(spans)
        with sent.retokenize() as retokenizer:
            # merge each entity/noun chunk into a single token so it can act
            # as one node in the dependency tree
            for span in spans:
                retokenizer.merge(span, attrs={'tag': span.root.tag,
                                               'dep': span.root.dep})
        deps = [token.dep_ for token in sent]

        # limit our example to simple sentences with one subject and one object
        if (deps.count('obj') + deps.count('dobj')) != 1 \
                or (deps.count('subj') + deps.count('nsubj')) != 1:
            continue

        for token in sent:
            if token.dep_ not in ('obj', 'dobj'):  # identify object nodes
                continue
            subject = [w for w in token.head.lefts
                       if w.dep_ in ('subj', 'nsubj')]  # identify subject nodes
            if subject:
                subject = subject[0]
                # identify the relationship by the root dependency
                relation = [w for w in token.ancestors if w.dep_ == 'ROOT']
                if relation:
                    relation = relation[0]
                    # append a trailing adposition or particle to the relation
                    # (guard against the root being the last token)
                    if relation.i + 1 < len(sent) \
                            and relation.nbor(1).pos_ in ('ADP', 'PART'):
                        relation = ' '.join((str(relation), str(relation.nbor(1))))
                else:
                    relation = 'unknown'

                subject, subject_type = refine_ent(subject, sent)
                token, object_type = refine_ent(token, sent)
                ent_pairs.append([str(subject), str(relation), str(token),
                                  str(subject_type), str(object_type)])

    # drop any triple with an empty field
    ent_pairs = [sublist for sublist in ent_pairs
                 if not any(str(ent) == '' for ent in sublist)]
    pairs = pd.DataFrame(ent_pairs, columns=['subject', 'relation', 'object',
                                             'subject_type', 'object_type'])
    print('Entity pairs extracted:', str(len(ent_pairs)))
    return pairs
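
# How the extraction above reads a parsed sentence (illustrative; the labels
# come from spaCy's English dependency scheme):
#
#   Angela        visits        the silver spoon restaurant
#   nsubj         ROOT          dobj
#
# 'Angela' (an nsubj left-child of the verb) becomes the subject, the ROOT
# verb 'visits' the relation, and the merged 'dobj' noun chunk the object,
# yielding the triple (Angela, visits, the silver spoon restaurant).
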
doc = "Angela usually visit silver spoon restaurant with Angela friends. " \
"silver spoon restaurant food quality is amazing and Angela like silver spoon restaurant chicken biryani a lot. " \
"silver spoon restaurant chicken biryani so delicious. " \
"silver spoon restaurant food as well as services are all good."
# to extract triples from the sample text directly:
# pairs = get_entity_pairs(doc)
# print(pairs)
# triple_pairs = [['Angela', 'visit', 'silver spoon restaurant'],
#                 ['Angela', 'likes', 'silver spoon restaurant chicken biryani'],
#                 ['silver spoon restaurant food', 'good', 'services'],
#                 ['silver spoon restaurant food', 'are', 'good']]
# instead, load precomputed triples; each CSV row is expected to be
# 'subject,relation,object'
triple_pairs = []
with open('./data/output/kg/input_data.txt-out.csv') as file:
    lines = file.readlines()
for line in lines:
    # strip the trailing newline so the last column stays clean
    triple_pair = line.strip().split(',')
    triple_pairs.append(triple_pair)
pairs = pd.DataFrame(triple_pairs, columns=['subject', 'relation', 'object'])
draw_kg(pairs)  # draw_kg is imported from the local create_kg module above
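
# For reference, a minimal stand-in for draw_kg in case the local create_kg
# module is unavailable. This is only a sketch of what draw_kg is assumed to
# do (render the triples as a labelled directed graph); it is not the actual
# module code.
import matplotlib.pyplot as plt
import networkx as nx

def draw_kg_sketch(pairs):
    # build a directed multigraph with one edge per (subject, object) triple
    graph = nx.from_pandas_edgelist(pairs, 'subject', 'object',
                                    edge_attr='relation',
                                    create_using=nx.MultiDiGraph())
    pos = nx.spring_layout(graph)
    nx.draw(graph, pos, with_labels=True, node_color='lightblue')
    # annotate each edge with its relation label
    nx.draw_networkx_edge_labels(
        graph, pos,
        edge_labels={(u, v): d['relation']
                     for u, v, k, d in graph.edges(keys=True, data=True)})
    plt.show()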