Skip to content

Commit

Permalink
add missing preprocessing file
Browse files Browse the repository at this point in the history
  • Loading branch information
rgemulla committed Nov 17, 2020
1 parent c9efabc commit e7bd9b7
Showing 1 changed file with 49 additions and 0 deletions.
49 changes: 49 additions & 0 deletions data/preprocess/preprocess_default.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python
"""Preprocess a KGE dataset into a the format expected by libkge.
Call as `preprocess_base.py --folder <name>`. The original dataset should be stored in
subfolder `name` and have files "train.txt", "valid.txt", and "test.txt". Each file
contains one field_map triple per line, separated by tabs.
During preprocessing, each distinct entity name and each distinct relation name
is assigned an index (dense). The index-to-object mapping is stored in files
"entity_ids.del" and "relation_ids.del", resp. The triples (as indexes) are stored in
files "train.del", "valid.del", and "test.del". Additionally, the splits
"train_sample.del" (a random subset of train) and "valid_without_unseen.del" and
"test_without_unseen.del" are stored. The "test/valid_without_unseen.del" files are
subsets of "valid.del" and "test.del" resp. where all triples containing entities
or relations not existing in "train.del" have been filtered out.
Metadata information is stored in a file "dataset.yaml".
"""

import util

if __name__ == "__main__":
args = util.default_parser().parse_args()
field_map = {
"S": args.subject_field,
"P": args.predicate_field,
"O": args.object_field,
}

print(f"Preprocessing {args.folder}...")

# register raw splits
train_raw = util.RawSplit(
file="train.txt",
field_map=field_map,
collect_entities=True,
collect_relations=True,
)
valid_raw = util.RawSplit(file="valid.txt", field_map=field_map,)
test_raw = util.RawSplit(file="test.txt", field_map=field_map,)

# create raw dataset with default splits
raw_dataset = util.create_raw_dataset(train_raw, valid_raw, test_raw, args)

# do the work
util.process_splits(raw_dataset)
util.update_string_files(raw_dataset, args)
util.write_dataset_yaml(raw_dataset.config, args.folder)

0 comments on commit e7bd9b7

Please sign in to comment.