-
Notifications
You must be signed in to change notification settings - Fork 0
/
property_linking.py
240 lines (175 loc) · 9.36 KB
/
property_linking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import io, sys, json
from json.decoder import JSONDecodeError
from wtu.table import Table
total_amount_columns = 0
total_same_uri = 0
total_other_uri_NOT_IN_list = 0
total_other_uri_IN_list = 0
total_no_gold_uri = 0
total_no_LL_annos = 0
total_gold_uri_not_valid = 0
table_no = 0
def naiveMaximumCountMissingLLAnnos(column) -> (str, dict):
'''
:param column:
:return:
'''
# find all LiteralLinking-annotations in our column for each cell, also find if no LL exists
pl_annotations = []
for cell in column:
cellAnnotations = cell.find_annotations(anno_source='preprocessing', anno_task='LiteralLinking')
if cellAnnotations:
pl_annotations.extend(cellAnnotations)
else:
pl_annotations.append('noLLAnno') # no LL
# collect all property-uris from all the LiteralLinking-annotations
property_uris = []
for annotation in pl_annotations:
if(annotation == 'noLLAnno'):
property_uris.append('noLLAnno')
else:
uri = annotation['property_uri']
property_uris.append(uri)
# find out which uri does occur most in all the property_uris
counter = {}
for uri in property_uris:
if (uri in counter):
counter[uri] = (counter.get(uri)) + 1
else:
counter[uri] = 1
sorted_uris = (sorted(counter.items(), key=lambda x: x[1]))[::-1] # uri with highest value first
print('list of our uris: ' + str(sorted_uris))
most_freq_uri = max(counter, key=counter.get) # the uri with the highest value
print('most_freq_uri: ' + most_freq_uri)
return (most_freq_uri, counter)
# returns:
# str: the most frequent URI of all cells in the column (empty str if no URIs in any cells in the column),
# dict: key: all URIs we made in this column, value: amount of their occurrence
def naiveMaximum(column) -> (str, dict):
# find all LiteralLinking-annotations in our column for each cell
pl_annotations = []
for cell in column:
pl_annotations.extend(cell.find_annotations(anno_source='preprocessing', anno_task='LiteralLinking'))
# collect all property-uris from all the LiteralLinking-annotations
property_uris = []
for annotation in pl_annotations:
uri = annotation['property_uri']
property_uris.append(uri)
# find out which uri does occur most in all the property_uris
counter = {}
for uri in property_uris:
if (uri in counter):
counter[uri] = (counter.get(uri)) + 1
else:
counter[uri] = 1
sorted_uris = (sorted(counter.items(), key=lambda x: x[1]))[::-1] # uri with highest value first
print('list of our uris: ' + str(sorted_uris))
try:
most_freq_uri = max(counter, key=counter.get) # the uri with the highest value
print('most_freq_uri: ' + most_freq_uri)
except:
most_freq_uri = '' # it means: counter was empty <= property_uris was empty <= we did not have any LL-annos in any of the cells
print("column has gold-uri, but we don't have any LL-annotations for any cell in this column.")
return (most_freq_uri, counter)
# read from stdin, ignore encoding errors
with io.open(sys.stdin.fileno(), 'r', encoding='utf-8', errors='ignore') as stdin:
# read the URIs and save in list. only if the gold URI is a URI from the list, then compare to our found annotation/URI.
with open('properties_to_consider.txt') as f:
uris_from_file = f.readlines()
uris_from_file = [uri.strip('\n') for uri in uris_from_file]
# iterate over input. Each line represents one table
for json_line in stdin:
try:
# parse json
table_data = json.loads(json_line)
# create Table object to work with
table = Table(table_data)
table_amount_columns = 0
column_same_uri = 0
column_other_uri_NOT_IN_list = 0
column_other_uri_IN_list = 0
column_has_no_gold_uri = 0
column_has_no_LL_anno = 0
column_gold_uri_not_valid = 0
print('-------------------------------------------------------------------------\n')
print('TABLE BEGIN - Table '+ str(table_no) + ' total rows: '+str(table.num_rows)+ ' total cols: '+str(table.num_cols) +'\n')
for column in table.columns():
print('\n COLUMN BEGIN - Col '+ str(column.col_idx))
table_amount_columns += 1
total_amount_columns += 1
# get the gold-property-annotation of the current column
gold_uri=''
for annotation in column.annotations:
if (annotation['task']=='PropertyLinking' and annotation['source']=='gold-v2'):
gold_uri = annotation['property_uri']
print('gold_uri: ' + gold_uri)
# if no gold_uri was found, move to the next column
if(gold_uri == ''):
column_has_no_gold_uri += 1
total_no_gold_uri+=1
print('No gold-uri to compare with.')
continue
# if the gold_uri is not in the list of valid uris, move to the next column
if(gold_uri not in uris_from_file):
column_gold_uri_not_valid += 1
total_gold_uri_not_valid +=1
print('Gold-uri is not valid.')
continue
myTuple = naiveMaximumCountMissingLLAnnos(column) # or: naiveMaximum(column)
most_freq_uri = myTuple[0]
ourURIs = myTuple[1]
# for future: insert an annotation with that most frequent uri to the current column
# if(most_freq_uri != ''): column.annotations.append({'source': 'evaluation', 'task': 'PropertyLinking', 'property_uri': most_freq_uri})
# compare the gold-annotation-uri to our most_freq_uri
if(most_freq_uri == gold_uri):
total_same_uri+=1
column_same_uri+=1
elif(most_freq_uri == ''):
#column has gold-uri, but we don't have any LL-annotations for any cell in this column
total_no_LL_annos+=1
column_has_no_LL_anno+=1
elif(most_freq_uri == 'noLLAnno'):
# column has gold-uri, and our most frequent uri was that we don't have a uri
total_no_LL_annos += 1
column_has_no_LL_anno += 1
else:
if (gold_uri in ourURIs.keys()):
print('We also have annotated some of our cells in this column with that gold-uri. But its not the most frequent one.')
print('It occurred only '+ str(ourURIs[gold_uri]) + '/'+ str(sum(ourURIs.values())) + ' times in our annotations.' )
total_other_uri_IN_list+=1
column_other_uri_IN_list+=1
else:
print('The gold URI is not even within any of our cell-annotations in this column.')
total_other_uri_NOT_IN_list+=1
column_other_uri_NOT_IN_list+=1
print('\nTABLE END - Table '+ str(table_no) + '\n')
table_no+=1
# ignore json decoding errors
except JSONDecodeError: pass
print(
'-----',
'column_same_uri = ' + str(column_same_uri) + '/' + str(table_amount_columns),
'column_other_uri_NOT_IN_list = ' + str(column_other_uri_NOT_IN_list) + '/' + str(table_amount_columns),
'column_other_uri_IN_list = ' + str(column_other_uri_IN_list) + '/' + str(table_amount_columns),
'column_has_no_gold_uri = ' + str(column_has_no_gold_uri) + '/' + str(table_amount_columns),
'column_has_no_LL_anno = ' + str(column_has_no_LL_anno) + '/' + str(table_amount_columns),
'column_gold_uri_not_valid = ' + str(column_gold_uri_not_valid) + '/' + str(table_amount_columns),
'-----',
sep='\n'
)
print(
'',
'-------------------------------------------------------------------------',
'-------------------------------------------------------------------------',
'total_no_gold_uri = ' + str(total_no_gold_uri) + '/' + str(total_amount_columns),
'total_gold_uri_not_valid = ' + str(total_gold_uri_not_valid) + '/' + str(total_amount_columns),
'rest = '+ str( total_amount_columns - total_no_gold_uri - total_gold_uri_not_valid )+ '/' + str(total_amount_columns),
'-----------------------------------------------',
'total_same_uri = ' + str(total_same_uri) + '/' + str(total_amount_columns - total_no_gold_uri - total_gold_uri_not_valid),
'total_other_uri_NOT_IN_list = ' + str(total_other_uri_NOT_IN_list) + '/' + str(total_amount_columns - total_no_gold_uri - total_gold_uri_not_valid),
'total_other_uri_IN_list = ' + str(total_other_uri_IN_list) + '/' + str(total_amount_columns - total_no_gold_uri - total_gold_uri_not_valid),
'total_no_LL_annos = ' + str(total_no_LL_annos) + '/' + str(total_amount_columns - total_no_gold_uri - total_gold_uri_not_valid),
'-------------------------------------------------------------------------',
'-------------------------------------------------------------------------',
sep='\n'
)