-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_histogramms.py
284 lines (205 loc) · 9.22 KB
/
make_histogramms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import io, sys, json
from json.decoder import JSONDecodeError
from wtu.table import Table
# bokeh
from bokeh.core.properties import value
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, save
from bokeh.transform import dodge
import collections as cl
# Searches for literal links
def getLiteral(cell, col, i):
    """Classify the literal (property) link of a single cell.

    Args:
        cell: Object exposing ``find_annotations(anno_task=...)``.
        col: Sequence holding, per column, the list of 'PropertyLinking'
            annotations of that column (as built in ``main``).
        i: Index into ``col`` of the column the cell belongs to.

    Returns:
        ``"ng"`` if column ``i`` has no property annotation ("no gold"),
        the column's property URI if the cell has a 'LiteralLinking'
        annotation with that URI and the URI is listed in the module-level
        ``props``, and ``""`` otherwise.
    """
    # NOTE(review): the nesting was reconstructed from a listing that had
    # lost its indentation; the "ng" branch is paired with `if col[i]`
    # (mirroring getEntity) -- confirm against the original file.
    feat = 'property_uri'
    if not col[i]:
        return "ng"

    # the first property annotation of the column acts as the gold URI
    gold_uri = col[i][0][feat]
    preprocessing = cell.find_annotations(anno_task='LiteralLinking')
    if not preprocessing:
        return ""

    preprocessing_uris = {annotation[feat] for annotation in preprocessing}
    # the gold URI must be among the cell's annotations and in `props`
    if gold_uri in preprocessing_uris and gold_uri in props:
        return gold_uri
    return ""
# Searches for entity links
def getEntity(cell):
    """Classify the entity link of a single cell.

    Args:
        cell: Object exposing
            ``find_annotations(anno_source=..., anno_task=...)``.

    Returns:
        ``"ng"`` if the cell has no 'gold-v2' 'EntityLinking' annotation
        ("no gold"), the gold resource URI if one of the cell's
        'preprocessing' annotations carries the same URI, and ``""``
        otherwise.
    """
    # NOTE(review): the nesting was reconstructed from a listing that had
    # lost its indentation; the "ng" branch is paired with `if gold` --
    # confirm against the original file.
    task = 'EntityLinking'
    feat = 'resource_uri'

    gold = cell.find_annotations(anno_source='gold-v2', anno_task=task)
    if not gold:
        return "ng"

    # the first gold annotation provides the reference URI
    gold_uri = gold[0][feat]
    preprocessing = cell.find_annotations(anno_source='preprocessing', anno_task=task)
    # only the URIs are needed for the membership test; the frequencies the
    # original collected alongside them were never read
    preprocessing_uris = {annotation[feat] for annotation in preprocessing}
    return gold_uri if gold_uri in preprocessing_uris else ""
# Updates the histogram after a row/column has been analysed
def makeHist(tab, ana, key_list):
    """Add one analysis result to a histogram table.

    Args:
        tab: Mapping from counter key to a ``{value: occurrences}`` dict;
            updated in place.
        ana: Analysis result holding a value per counter key and a flag per
            flag key.
        key_list: Mapping from counter key to the flag key that enables it.

    Returns:
        The same ``tab`` object that was passed in.
    """
    for counter_key, flag_key in key_list.items():
        observed = ana[counter_key]
        bucket = tab[counter_key]
        # a result is only counted when its flag is set
        if not ana[flag_key]:
            continue
        bucket[observed] = bucket[observed] + 1 if observed in bucket else 1
    return tab
# Checks whether a label is already in the list of labels and adds it if not
def createLbl(lbl, ana, key_list):
    """Return ``lbl`` extended by every not-yet-known value of ``ana``.

    Args:
        lbl: List of labels collected so far; it is not modified.
        ana: Analysis result providing one value per key of ``key_list``.
        key_list: Mapping whose keys select the values of ``ana`` to add.

    Returns:
        ``lbl`` itself when nothing was added, otherwise a new list with the
        missing values appended in key order.
    """
    labels = lbl
    for key in key_list:
        candidate = ana[key]
        if candidate in labels:
            continue
        # concatenation builds a new list, leaving the caller's list untouched
        labels = labels + [candidate]
    return labels
# Analyses a row/column and returns the results
def analyze(line):
    """Count missing annotations in one row or column.

    Args:
        line: Iterable of cells; each cell is indexable with the entity
            result at position 0 and the literal result at position 1
            (``''`` = missing, ``'ng'`` = no gold, anything else = found).

    Returns:
        A dict with the counters ``'m'`` (both results missing), ``'m_el'``
        (entity missing) and ``'m_ll'`` (literal missing), plus the flags
        ``'ng'``, ``'ng_el'`` and ``'ng_ll'`` which become ``True`` as soon
        as one cell has a result other than ``'ng'`` for both / the entity /
        the literal part.
    """
    res = {'m': 0, 'm_el': 0, 'm_ll': 0, 'ng': False, 'ng_el': False, 'ng_ll': False}
    for cell in line:
        entity, literal = cell[0], cell[1]

        # Compare by value: identity checks against string literals only
        # work by accident of interning and warn on Python 3.8+.
        if entity == '' and literal == '':
            res['m'] += 1
        if entity == '':
            res['m_el'] += 1
        if literal == '':
            res['m_ll'] += 1

        if entity != 'ng' and literal != 'ng':
            res['ng'] = True
        if entity != 'ng':
            res['ng_el'] = True
        if literal != 'ng':
            res['ng_ll'] = True
    return res
# Analyses the table and adds the result to the overall results
def analyzeTable(table):
    """Analyse one table, update the global histograms and plot its own.

    Every column and every row of ``table`` is passed through ``analyze``.
    Column results are added to the global ``hist_col`` / ``lbl_row`` and row
    results to the global ``hist_row`` / ``lbl_col``; per-table copies of the
    same statistics are plotted into ``hists/rows_table<N>.html`` and
    ``hists/col_table<N>.html`` where ``<N>`` is the global ``num_tabel``.

    Args:
        table: List of rows, each row a list of cells as accepted by
            ``analyze``. Every row is indexed up to the length of the first
            row. An empty table is ignored.
    """
    global hist_row, hist_col, lbl_col, lbl_row

    # nothing to analyse (and table[0] below would raise)
    if not table:
        return

    tabel_local_row = {'m': {0: 0}, 'm_el': {0: 0}, 'm_ll': {0: 0}}
    tabel_local_col = {'m': {0: 0}, 'm_el': {0: 0}, 'm_ll': {0: 0}}
    lbl_row_local = []
    lbl_col_local = []

    # column-wise statistics
    for i in range(len(table[0])):
        ana = analyze([row[i] for row in table])
        tabel_local_col = makeHist(tabel_local_col, ana, key_list)
        hist_col = makeHist(hist_col, ana, key_list)
        lbl_row = createLbl(lbl_row, ana, key_list)
        lbl_row_local = createLbl(lbl_row_local, ana, key_list)

    # row-wise statistics
    for row in table:
        ana = analyze(row)
        tabel_local_row = makeHist(tabel_local_row, ana, key_list)
        hist_row = makeHist(hist_row, ana, key_list)
        lbl_col = createLbl(lbl_col, ana, key_list)
        lbl_col_local = createLbl(lbl_col_local, ana, key_list)

    # Graph for every table
    number = str(num_tabel)
    plt_bar(
        tabel_local_row,
        lbl_col_local,
        'Overall number of rows with this number of empty cells',
        'Number of empty cells in a row for tabel' + number,
        'hists/rows_table' + number + '.html',
        600,
        '#Missing Annotations in row',
        '#How often this amount is missing',
    )
    plt_bar(
        tabel_local_col,
        lbl_row_local,
        'Overall number of col with this number of empty cells',
        'Number of empty cells in a col for tabel' + number,
        'hists/col_table' + number + '.html',
        600,
        # the column plot used to carry the row label (copy-paste slip)
        '#Missing Annotations in column',
        '#How often this amount is missing',
    )
# Makes a single plot in a new file for a given data set
def plt_bar(hist_tabel, lbl_list, ylabel, xlabel, filename, wigth, xl, yl):
    """Write a grouped bar chart of a histogram table to ``filename``.

    Args:
        hist_tabel: Mapping with the keys ``'m'``, ``'m_el'`` and ``'m_ll'``,
            each holding a ``{label: count}`` dict.
        lbl_list: Labels to show on the x axis; they are sorted and labels
            without a count are drawn with height 0.
        ylabel: Accepted for interface compatibility; not used.
        xlabel: Accepted for interface compatibility; not used.
        filename: Output file handed to bokeh's ``output_file``.
        wigth: Plot width passed to ``figure`` as ``plot_width``.
        xl: Label of the x axis.
        yl: Label of the y axis.
    """
    output_file(filename)

    ordered_labels = sorted(lbl_list)
    # (data column / legend title, histogram key, dodge offset, bar colour)
    groups = (
        ('Entity + Literal', 'm', -0.25, "#c9d9d3"),
        ('Entity', 'm_el', 0.0, "#718dbf"),
        ('Literal', 'm_ll', 0.25, "#e84d60"),
    )

    d = {'num': [str(s) for s in ordered_labels]}
    for title, key, _offset, _colour in groups:
        counts = hist_tabel[key]
        d[title] = [counts.get(label, 0) for label in ordered_labels]

    # leave a quarter of the tallest bar as head room above the bars
    tallest = max(d['Entity + Literal'] + d['Entity'] + d['Literal'])
    p = figure(
        x_range=d['num'],
        y_range=(0, tallest + (tallest / 4)),
        plot_height=350,
        plot_width=wigth,
        toolbar_location=None,
        tools="",
        x_axis_label=xl,
        y_axis_label=yl,
    )

    for title, _key, offset, colour in groups:
        p.vbar(
            x=dodge('num', offset, range=p.x_range),
            top=title,
            width=0.1,
            source=d,
            color=colour,
            legend=value(title),
        )

    p.xgrid.grid_line_color = None
    p.legend.orientation = "horizontal"
    p.legend.location = "top_left"
    save(p)
# Makes a single plot in a new file for a given data set
def plt_bar2(hist_tabel, used, ylabel, xlabel, filename, wigth, xl, yl):
    """Write a gold-vs-preprocessing bar chart to ``filename``.

    Args:
        hist_tabel: Mapping with the keys ``'gold'`` and ``'pre'``, each a
            dict from full property URI to count.
        used: Property URIs to show; the DBpedia ontology prefix is removed
            for the axis labels.
        ylabel: Accepted for interface compatibility; not used.
        xlabel: Accepted for interface compatibility; not used.
        filename: Output file handed to bokeh's ``output_file``.
        wigth: Plot width passed to ``figure`` as ``plot_width``.
        xl: Label of the x axis.
        yl: Label of the y axis.
    """
    prefix = 'http://dbpedia.org/ontology/'
    output_file(filename)

    gold = hist_tabel['gold']
    pre = hist_tabel['pre']

    up = [u.replace(prefix, '') for u in used]
    # counts are looked up under prefix + short name; unknown keys count as 0
    goldl = [gold.get(prefix + name, 0) for name in up]
    prel = [pre.get(prefix + name, 0) for name in up]

    # one shared data source for both glyphs (it used to be built but the raw
    # dict was handed to the glyphs instead)
    source = ColumnDataSource(data={'up': up, 'Gold': goldl, 'Preprocessing': prel})

    p = figure(
        x_range=up,
        plot_height=800,
        plot_width=wigth,
        toolbar_location=None,
        tools="",
        x_axis_label=xl,
        y_axis_label=yl,
    )
    p.vbar(x=dodge('up', -0.25, range=p.x_range), top='Gold', width=0.2,
           source=source, color="#c9d9d3", legend=value("Gold"))
    p.vbar(x=dodge('up', 0.0, range=p.x_range), top='Preprocessing', width=0.2,
           source=source, color="#718dbf", legend=value("Preprocessing"))

    p.x_range.range_padding = 0.1
    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = 1
    p.legend.orientation = "horizontal"
    p.legend.location = "top_left"
    save(p)
# Property URIs accepted by getLiteral, one per line of the file.
# The file is opened in a `with` block so the handle is closed again.
with io.open('properties_to_consider.txt', 'r', encoding='utf-8', errors='ignore') as props_file:
    props = [line.strip('\n') for line in props_file]

# Maps every counter key of an analysis result to the flag that enables it
key_list = {'m': 'ng', 'm_el': 'ng_el', 'm_ll': 'ng_ll'}

# row_missing, row_missing_el, row_missing_ll
hist_row = {'m': {0: 0}, 'm_el': {0: 0}, 'm_ll': {0: 0}}
# col_missing, col_missing_el, col_missing_ll
hist_col = {'m': {0: 0}, 'm_el': {0: 0}, 'm_ll': {0: 0}}

# Running table number, used in output file names and in printTable
num_tabel = 0
num_found = 0
found_tables = []

# Labels collected from the row-wise analysis (x axis of the row histogram)
lbl_col = []
# Labels collected from the column-wise analysis (x axis of the column histogram)
lbl_row = []
def printTable(t):
    """Print the current table number, every element of ``t`` and a blank line.

    Args:
        t: Iterable whose elements are printed one per line.
    """
    # collect all output lines first and emit them with a single print call
    lines = ["Table Num: {}".format(num_tabel)]
    lines.extend(str(e) for e in t)
    lines.append('')
    print('\n'.join(lines))
def main():
    """Read one JSON table per stdin line and write the histograms.

    Each line is decoded with ``json.loads`` and wrapped in ``Table``; lines
    that are not valid JSON are skipped. For every table the cells of all
    columns that carry a 'PropertyLinking' annotation are classified with
    ``getEntity`` / ``getLiteral`` and the resulting grid is handed to
    ``analyzeTable``. After the input is exhausted the overall row and column
    histograms are written to ``hists/01_ov_rows.html`` and
    ``hists/01_ov_cols.html``.
    """
    global num_tabel
    with io.open(sys.stdin.fileno(), 'r', encoding='utf-8', errors='ignore') as stdin:
        for json_line in stdin:
            try:
                table_data = json.loads(json_line)
            except JSONDecodeError:
                # ignore json decoding errors
                continue

            table = Table(table_data)
            col = [c.find_annotations(anno_task='PropertyLinking') for c in table.columns()]

            t = []
            for row in table.rows():
                # NOTE(review): the original guarded this loop body with
                # `row.find_annotations(anno_task='EntityLinking') is not []`,
                # which is always True (a fresh list is never identical to
                # another one). The effective behaviour -- every row is
                # processed -- is kept; if rows without entity annotations
                # were meant to be skipped, add a truthiness test here.
                r = [
                    [getEntity(cell), getLiteral(cell, col, i)]
                    for i, cell in enumerate(row)
                    if col[i]
                ]
                if r:
                    t.append(r)

            if t:
                analyzeTable(t)
            # NOTE(review): the listing had lost its indentation; the counter
            # is assumed to advance for every decoded table -- confirm.
            num_tabel += 1

    # Without any analysed table there are no labels and plt_bar would fail
    # on max() of an empty list.
    if not (lbl_col and lbl_row):
        print('No annotated tables found; no overall histograms written.', file=sys.stderr)
        return

    # Create histogram for rows
    plt_bar(hist_row, lbl_col, 'Overall number of rows ', 'Number of empty cells in a row', 'hists/01_ov_rows.html', 4000, '#Missing Annotations in row', '#How often this amount is missing')
    # Create histogram for columns
    plt_bar(hist_col, lbl_row, 'Overall number of cols', 'Number of empty cells in a col', 'hists/01_ov_cols.html', 4000, '#Missing Annotations in column', '#How often this amount is missing')


if __name__ == "__main__":
    main()