""" Create and save in output folder weighted KG """
import os, argparse
import operator, math
import time
import pandas as pd
import owlready2.namespace
import joblib
from typing import List
import hate_datasets as dc
import functions.kg.utils as kg_utils
import functions.kg.indexing as kg_index
import functions.kg.weighting as kg_weight
from functions.helper import save_dict
# Global variables: cant use - or _ as separators
THRS = [0.5] # int (standard thresholding), str (different aggregation functions?)
MATCH_METHODS = kg_index.MATCH_METHODS
INFER_METHODS = kg_utils.INFER_METHODS
WEIGHT_BY_SCORE = kg_weight.WEIGHT_BY_SCORE
WEIGHT_BY_MODEL = kg_weight.WEIGHT_BY_MODEL
WEIGHT_FS = kg_weight.WEIGHT_FS
DNAMES = dc.DNAMES
PROJ_DIR = os.getcwd()
OUTPUT_FOLDER = f'{PROJ_DIR}/models/adaptation'
CHECKPOINTS_FOLDER = f'{PROJ_DIR}/models/adaptation/checkpoints'
DATA_FOLDER = f'{PROJ_DIR}/data'


def collect_owl_from_path(kg_path: str,
                          output_folder: str = OUTPUT_FOLDER) -> owlready2.namespace.Ontology:
    """Load the saved version of the ontology from the local repository, or import it from the OWL path and save a copy."""
    fname = kg_path.split('/')[-1]
    o_path = f'{output_folder}/{fname}'
    if os.path.exists(o_path):
        kg = kg_utils.load_owl(o_path)
        print('Found OWL file in output folder. Importing.')
    else:
        kg = kg_utils.load_owl(kg_path)
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        kg.save(file=o_path)
        print(f'OWL collected from {kg_path} and saved in output folder.')
    return kg
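
# Hypothetical usage: the first call imports the ontology from the given path and caches a
# copy under OUTPUT_FOLDER; subsequent calls load the cached copy instead.
#   kg = collect_owl_from_path('/path/to/ontology.owl')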


def disproportionate_stratified_sampling(n: int, df: pd.DataFrame, col_to_sample: List) -> pd.DataFrame:
    """
    Draw a sample of size n from df using disproportionate stratified sampling.
    Returns balanced samples of the populations indicated by the binary columns in col_to_sample.
    More info on disproportionate sampling: https://www.geeksforgeeks.org/stratified-sampling-in-pandas/
    """
    # 1. Create sampling dict: needed for balanced sampling with no repetitions
    #    (dict of index lists that are unique to each group, in ascending order of group size)
    # a. get group sample sizes in ascending order
    sample_sizes = {gi: df.loc[df[gi] == 1].shape[0] for gi in col_to_sample}
    sample_sizes = dict(sorted(sample_sizes.items(), key=operator.itemgetter(1)))
    # b. get sample id lists for each group that are unique, following the ascending size order
    sampling_dict, ids_inc = {}, []
    for g in sample_sizes.keys():
        sampling_dict[g] = df.loc[(df[g] == 1) & (~df.index.isin(ids_inc))].index.to_list()
        ids_inc += sampling_dict[g]
    # 2. Create the balanced sampled df (subset of df of size n)
    # a. get subgroup sample sizes: how many samples can be drawn from each subgroup given the final sample size n
    sampling_sizes = {}
    n_remaining, g_remaining = n, list(sample_sizes.keys())
    for g in sample_sizes.keys():
        sample_size = len(sampling_dict[g])
        subg_n = math.ceil(n_remaining / len(g_remaining))
        sampling_sizes[g] = sample_size if subg_n > sample_size else subg_n
        # update the remaining sample size and the groups left to sample from
        n_remaining = n_remaining - sampling_sizes[g]
        g_remaining.remove(g)
    # b. return the balanced sample of df by drawing random samples of the corresponding size from each subgroup
    sampled_ids = []
    for g in sample_sizes.keys():
        g_sample = pd.Series(sampling_dict[g]).sample(n=sampling_sizes[g], random_state=1)
        sampled_ids += g_sample.to_list()
    sampled_df = df.loc[df.index.isin(sampled_ids)].copy()
    return sampled_df
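
# A toy illustration (hypothetical data): draw a balanced sample of 4 rows across two binary
# population columns. Groups are processed from smallest to largest and each row is assigned
# to only one group, so no row is sampled twice:
#   toy = pd.DataFrame({'a': [1, 1, 0, 0, 1], 'b': [0, 1, 1, 1, 0]})
#   sample = disproportionate_stratified_sampling(4, toy, ['a', 'b'])
#   # -> ~2 rows counted towards 'a' and ~2 towards 'b'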


def adaptation_subset(d: pd.DataFrame,
                      g_labels: dict,
                      dname: str,
                      thr: float,
                      identities: List = None,
                      data_folder: str = DATA_FOLDER):
    """
    Build the KG-adaptation subset: add a binary column to df_train that samples texts related
    and not related to 'identities'. Identities can be at group (e.g., gender) or subgroup
    (e.g., male) level. n is determined as the maximum possible sample size with Npos = Nneg.
    """
    if identities is None:
        identities = ['gender', 'sexual_orientation']
    y_col = ''.join([i for identity in identities for i in identity.split('_')])
    print(f"Sampling distribution with thr={thr} for {y_col}")
    # Create binary columns for groups over thr (g_labels keys) and
    # for "nones" (i.e., texts with no identity group labels over thr)
    if identities[0] in g_labels.keys():
        # ... groups
        g_labels = list(g_labels.keys())
    else:
        # ... subgroups
        g_labels = [subg_i for subg_l in g_labels.values() for subg_i in subg_l]
    g_labels_bin = []
    for g in list(g_labels):
        d.loc[:, f'{g}_{thr}'] = d[f'{g}'].apply(lambda perc: 1 if perc >= thr else 0)
        g_labels_bin.append(f'{g}_{thr}')
    d.loc[:, f'none_{thr}'] = d.apply(lambda row: 1 if sum(row[g_labels_bin]) == 0 else 0, axis=1)
    g_labels_bin.append(f'none_{thr}')
    print(f' {d.loc[d[f"none_{thr}"] == 1].shape[0]}/{d.shape[0]} samples with no identity annotations at or above {thr}')
    if len(identities) == 2:
        id_1, id_2 = identities
        # Take n as the minimum number of samples in either positive group
        d_1, d_2 = d.loc[d[f'{id_1}_{thr}'] == 1], d.loc[d[f'{id_2}_{thr}'] == 1]
        n_pos_min = min(d_1.shape[0], d_2.shape[0])
        print(f' min {id_1} or {id_2} sample: {n_pos_min}')
        # Get a balanced positive sample: all samples from the smaller positive group and n from the other
        col_pos = [f'{g}_{thr}' for g in identities]
        disjoint_1 = d_1.loc[~d_1.index.isin(d_2.index)].shape[0]
        disjoint_2 = d_2.loc[~d_2.index.isin(d_1.index)].shape[0]
        if (disjoint_1 >= n_pos_min) or (disjoint_2 >= n_pos_min):
            # ... there are enough disjoint examples from the majority class to draw a balanced sample
            pos_df = disproportionate_stratified_sampling(2 * n_pos_min, d, col_pos)
        else:
            # ... need to take an n-sized sample from both groups and remove duplicates
            pos_samples = [d_1.sample(n=n_pos_min, random_state=1), d_2.sample(n=n_pos_min, random_state=1)]
            pos_df = pd.concat(pos_samples, join='inner')
            pos_df = pos_df[~pos_df.duplicated()]
        neg_df = d.loc[(d[f'{id_1}_{thr}'] == 0) & (d[f'{id_2}_{thr}'] == 0)].copy()
    elif len(identities) == 1:
        col_pos = [f'{identities[0]}_{thr}']
        pos_df = d.loc[d[col_pos[0]] == 1].copy()
        neg_df = d.loc[d[col_pos[0]] == 0].copy()
    else:
        raise Exception(f'Adaptation is only supported for 1 or 2 identities, provided: {identities}')
    n_pos = pos_df.shape[0]
    pos_df[y_col] = n_pos * [1]
    print(f' {n_pos} unique positive samples')
    if len(identities) == 2:
        print(f' 2*n ({n_pos_min}) = {2 * n_pos_min} - {2 * n_pos_min - n_pos} duplicates')
    for g in col_pos:
        print(f' -- {g}: {pos_df.loc[pos_df[g] == 1].shape[0]}')
    # Get a balanced negative sample of size n_pos: stratification with disproportionate sampling
    col_to_stratify = [x for x in g_labels_bin if x not in col_pos]
    neg_df = disproportionate_stratified_sampling(n_pos, neg_df, col_to_stratify)
    neg_df[y_col] = n_pos * [0]
    print(f' {neg_df.shape[0]} unique negative samples:')
    for g in col_to_stratify:
        print(f' -- {g}: {neg_df.loc[neg_df[g] == 1].shape[0]}')
    # Take df_eval as the rows of d not in df_train
    df_train = pd.concat([pos_df, neg_df])
    # ... ensure that rows are unique across all columns except the label column
    df_train = df_train[~df_train.duplicated(subset=df_train.columns.to_list()[:-1])]
    print(f' {df_train.shape[0]} unique train samples: '
          f'2*n ({n_pos}) = {2 * n_pos} - {2 * n_pos - df_train.shape[0]} duplicates:')
    print(f' -- {y_col}: \n{df_train[y_col].value_counts()}')
    df_eval = d.loc[~d.index.isin(df_train.index.to_list())]
    # Save the pre-training corpus as CSV in the data folder (if not None)
    if data_folder:
        export_name = '{}_{}_{}'.format(dname, thr, y_col)
        o_path = f'{data_folder}/{export_name}.csv'
        df_train.to_csv(o_path, index=False)
        print(f' Pre-training corpus exported to {data_folder}: {export_name}')
    return df_train, df_eval
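
# Hypothetical call, assuming d carries annotator-agreement columns named after the groups:
#   df_train, df_eval = adaptation_subset(d, g_labels, 'my_dataset', 0.5,
#                                         identities=['gender', 'sexual_orientation'])
# df_train is the balanced pre-training corpus (also written to CSV); df_eval holds the rest.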


def __get_inferred(ent_assert: list,
                   kg: owlready2.namespace.Ontology,
                   infer_method: str) -> List:
    """Return a list of all entities inferred from the list of asserted entities."""
    # Use the KG structure to infer new entity information
    if infer_method == 'hierarchical':
        # ... [c1.iri, c2.iri, c11.iri]
        from functions.kg.utils import get_hierarchical_info
        ent_infer = [c_infer for c_assert in ent_assert
                     for c_infer in get_hierarchical_info(c_assert, kg)]
    elif infer_method == 'none':
        ent_infer = []
    else:
        raise Exception(f'{infer_method}: invalid method for using the KG structure to infer information '
                        'about terminology. Select a method from: ' + ', '.join(INFER_METHODS))
    return ent_infer
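
# Illustration (hypothetical IRIs): with infer_method='hierarchical', each asserted entity is
# expanded with its hierarchical context from the KG, e.g.
#   __get_inferred(['http://example.org/kg#lesbian'], kg, 'hierarchical')
#   # -> IRIs of related classes up the hierarchy (exact output depends on get_hierarchical_info)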


def __entity_matching(df: pd.DataFrame,
                      inv_index: kg_index.EntityMatching,
                      text_col: str,
                      id_col: str,
                      kg: owlready2.namespace.Ontology,
                      checkpoint_root: str,
                      match_method: str,
                      infer_method: str):
    print(f'Identifying entities asserted and inferred in train subset: infer_method = {infer_method}')
    matching_df = pd.DataFrame()
    check_asserted = f'{checkpoint_root}.pkl'
    if not os.path.exists(check_asserted):
        print(' matching entities')
        # Create KG dicts for entity matching ({entity: [label, synonym, etc.]})
        kg_dict = kg_utils.get_kg_dict(kg)
        # Return the list of entities asserted in each text
        matching_df['ent_assert'] = kg_utils.get_entity_matches(df, inv_index, text_col, id_col, kg_dict, match_method)
        matching_df['ent_assert'].to_pickle(check_asserted)
        print(' checkpoint to: {}'.format(check_asserted))
    else:
        print(' found checkpoint of matched entities. Importing from: {}'.format(check_asserted))
        matching_df['ent_assert'] = pd.read_pickle(check_asserted)
    check_inferred = f'{checkpoint_root}-{infer_method}.pkl'
    if not os.path.exists(check_inferred):
        print(' inferring information from asserted entities')
        matching_df['ent_infer'] = matching_df['ent_assert'].apply(
            lambda ent_assert: __get_inferred(ent_assert, kg, infer_method)
        )
        matching_df['ent_infer'].to_pickle(check_inferred)
        print(' checkpoint to: {}'.format(check_inferred))
    else:
        print(' found checkpoint of inferred entities. Importing from: {}'.format(check_inferred))
        matching_df['ent_infer'] = pd.read_pickle(check_inferred)
    # Return the concatenated list of asserted and inferred entities per text
    return matching_df.apply(lambda row: row['ent_assert'] + row['ent_infer'], axis=1)
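
# Note on checkpoints: asserted matches depend on the corpus and match method only, so they are
# cached once at '<checkpoint_root>.pkl'; inferred entities are additionally keyed by infer
# method ('<checkpoint_root>-<infer_method>.pkl') and reuse the asserted checkpoint when present.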


def __compute_weights(d_train: pd.DataFrame,
                      X_col: str,
                      y_col: str,
                      weighting_f: str,
                      o_path: str):
    print("Computing and saving entity weights to get weighted KG ({IRI: weight}): "
          f"weighting f={weighting_f}")
    if weighting_f in WEIGHT_BY_SCORE:
        # Export weights from entity ratios in the positive and negative class
        ent_match_pos = d_train.loc[d_train[y_col] == 1, X_col]
        ent_match_neg = d_train.loc[d_train[y_col] == 0, X_col]
        # Compute the weights
        if weighting_f == 'docf':
            # Get document frequencies in the positive and negative space (unique occurrences by number of docs)
            freq_pos = kg_weight.get_DocF(ent_match_pos)
            freq_neg = kg_weight.get_DocF(ent_match_neg)
            # Compute weights as the ratio of the frequencies in the two classes
            weights = kg_weight.get_ratio(freq_pos, freq_neg)
        else:
            raise Exception(f'{weighting_f} weight function not in the list of scoring methods: {WEIGHT_BY_SCORE}')
        # Save the results
        save_dict(weights, o_path)
    elif weighting_f in WEIGHT_BY_MODEL:
        # Export weights from the feature coefficients of an LR model trained on the entities
        pipeline = kg_weight.get_ML_coefficients(d_train, X_col, y_col, weighting_f)
        # The pipeline holds the vectorizer and the model; persist both with joblib
        joblib.dump(pipeline, f'{o_path}.joblib')
    else:
        raise Exception(f'{weighting_f}: invalid method for weighting entities based on their distribution. '
                        'Select a method from: ' + ', '.join(WEIGHT_FS))
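
# Illustration of 'docf' weighting on toy counts (hypothetical numbers): if an entity IRI
# occurs in 6 of 10 positive documents and 2 of 10 negative ones, its document frequencies
# are 0.6 and 0.2, and get_ratio contrasts the two to score how class-specific the entity is
# (the exact formula lives in functions.kg.weighting).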


def kg_adaptation(dname: str,
                  kg_path: str,
                  identities: List[str],
                  **opt_config):
    # Parse with default config parameters
    config = {'thr': THRS[0],
              'match_method': MATCH_METHODS[0],
              'infer_method': INFER_METHODS,
              'weight_f': WEIGHT_FS}
    for k, v in opt_config.items():
        if k in config.keys():
            config[k] = v
    # Import the processed df from the data folder
    d, text_col, id_col, g_labels = dc.import_dataset(dname)
    y_col = ''.join([i for identity in identities for i in identity.split('_')])
    # Import the KG and save it to the result folder
    kg = collect_owl_from_path(kg_path)
    kg_name = kg_path.rsplit('.', 1)[-2].split('/')[-1]
    # Draw the pre-training corpus from the identities
    d_train, _ = adaptation_subset(d, g_labels, dname, config["thr"], identities)
    # Entity matching: identify entities asserted and inferred in the text using the KG
    check_root = f'{CHECKPOINTS_FOLDER}/{kg_name}_{dname}_{y_col}_{config["thr"]}-{config["match_method"]}'
    if not os.path.exists(CHECKPOINTS_FOLDER):
        os.mkdir(CHECKPOINTS_FOLDER)
    # Method 3: how to create the index to do the entity matching
    t0 = time.time()
    # ... create a custom index col (not required for creating entity weights)
    d_train[id_col] = range(0, d_train.shape[0])
    d_train[id_col] = d_train[id_col].astype(str)
    # ... create the inverted index
    inv_index = kg_index.indexing_df(d_train, text_col, id_col, config["match_method"])
    print("Executed in %s seconds." % str(time.time() - t0))
    # Method 4: how to use the KG structure to identify terminology (infer methods)
    for infer_method in config["infer_method"]:
        t0 = time.time()
        # ... do the entity matching and infer information from the KG
        d_train['entity_matches'] = __entity_matching(d_train,
                                                      inv_index,
                                                      text_col,
                                                      id_col,
                                                      kg,
                                                      check_root,
                                                      config["match_method"],
                                                      infer_method)
        print("Executed in %s seconds." % str(time.time() - t0))
        for weight_f in config["weight_f"]:
            # Method 5: how to weight entities based on the training corpus context
            # Create or extend the weights of the KG for the corpus context
            t0 = time.time()
            # ... get the path to output the weights
            method_name = '-'.join([str(config["thr"]), config["match_method"], infer_method, weight_f])
            o_path = f'{OUTPUT_FOLDER}/{kg_name}_{dname}_{y_col}_{method_name}'
            __compute_weights(d_train,
                              'entity_matches',
                              y_col,
                              weight_f,
                              o_path)
            print("Executed in %s seconds." % str(time.time() - t0))
            # ... to explore: weights = pd.DataFrame.from_dict(weights, orient='index')
            print(' Success exporting entity weights to: {}'.format(o_path))
    return
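
# Hypothetical programmatic use (dataset name and path are placeholders):
#   kg_adaptation('my_dataset', './kg/my_ontology.owl', ['gender', 'sexual_orientation'],
#                 thr=0.5, match_method='stem', infer_method=['hierarchical'], weight_f=['docf'])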


def main():
    desc = "Create and save weighted KG and its evaluation subset in result folder"
    parser = argparse.ArgumentParser(description=desc)
    # Required arguments
    parser.add_argument("--d_name",
                        default=None,
                        type=str,
                        required=True,
                        help=f"Pre-training corpus for the KG adaptation: {DNAMES}",
                        )
    parser.add_argument("--knowledge_graph_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to knowledge graph OWL file.",
                        )
    parser.add_argument("--identities_pretraining",
                        default=None,
                        type=str,
                        required=True,
                        help="Column names in the pre-training corpus for the identities (groups or subgroups) "
                             "based on which to assign weights to the KG "
                             "(up to 2 identity groups or subgroups, separated by ',').",
                        )
    # Optional arguments
    parser.add_argument("--thr",
                        default=0.5,
                        type=str,
                        required=False,
                        help="Configuration argument: \n"
                             "-- threshold to binarize labels (default: 0.5, the proportion of annotator agreement).",
                        )
    parser.add_argument("--match_method",
                        default='stem',
                        type=str,
                        required=False,
                        help="Configuration argument: \n"
                             "-- matching method used to link KG entities to texts (default: stemming).",
                        )
    parser.add_argument("--infer_method",
                        default='hierarchical',
                        type=str,
                        required=False,
                        help="Configuration argument: \n"
                             "-- method for inferring entity information from the KG (default: use hierarchy).",
                        )
    parser.add_argument("--weight_f",
                        default='docf',
                        type=str,
                        required=False,
                        help="Configuration argument: \n"
                             "-- method for weighting entities based on their distribution "
                             "(default: use document frequencies).",
                        )
    args = parser.parse_args()
    # Parse kwargs from the optional arguments (specified or default values)
    other_args = {'thr': float(args.thr),
                  'match_method': args.match_method,
                  'infer_method': [args.infer_method],
                  'weight_f': [args.weight_f]}
    print('Computing entity weights: {}\n Optional args: {}'.format(args, other_args))
    kg_adaptation(args.d_name,
                  args.knowledge_graph_path,
                  args.identities_pretraining.split(','),
                  **other_args)
    return


if __name__ == "__main__":
    main()
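
# Example invocation (dataset name and path are placeholders):
#   python kg_adaptation.py --d_name <one of DNAMES> \
#       --knowledge_graph_path ./kg/ontology.owl \
#       --identities_pretraining gender,sexual_orientation \
#       --thr 0.5 --match_method stem --infer_method hierarchical --weight_f docf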