From 8856cf74d8485758186635fa498fff1148bd34d1 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 24 Oct 2023 19:40:18 +0200 Subject: [PATCH 1/5] add tacred dataset builder --- dataset_builders/pie/tacred/tacred.py | 207 ++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 dataset_builders/pie/tacred/tacred.py diff --git a/dataset_builders/pie/tacred/tacred.py b/dataset_builders/pie/tacred/tacred.py new file mode 100644 index 00000000..5dcf6a83 --- /dev/null +++ b/dataset_builders/pie/tacred/tacred.py @@ -0,0 +1,207 @@ +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Tuple + +import datasets +import pytorch_ie.data.builder +from pytorch_ie import token_based_document_to_text_based +from pytorch_ie.annotations import BinaryRelation, LabeledSpan, _post_init_single_label +from pytorch_ie.core import Annotation, AnnotationList, Document, annotation_field +from pytorch_ie.documents import ( + TextDocumentWithLabeledSpansAndBinaryRelations, + TokenBasedDocument, +) + + +@dataclass(eq=True, frozen=True) +class TokenRelation(Annotation): + head_idx: int + tail_idx: int + label: str + score: float = 1.0 + + def __post_init__(self) -> None: + _post_init_single_label(self) + + +@dataclass(eq=True, frozen=True) +class TokenAttribute(Annotation): + idx: int + label: str + + +@dataclass +class TacredDocument(Document): + tokens: Tuple[str, ...] + id: Optional[str] = None + metadata: Dict[str, Any] = field(default_factory=dict) + stanford_ner: AnnotationList[TokenAttribute] = annotation_field(target="tokens") + stanford_pos: AnnotationList[TokenAttribute] = annotation_field(target="tokens") + entities: AnnotationList[LabeledSpan] = annotation_field(target="tokens") + relations: AnnotationList[BinaryRelation] = annotation_field(target="entities") + dependency_relations: AnnotationList[TokenRelation] = annotation_field(target="tokens") + + +@dataclass +class SimpleTacredDocument(TokenBasedDocument): + labeled_spans: AnnotationList[LabeledSpan] = annotation_field(target="tokens") + binary_relations: AnnotationList[BinaryRelation] = annotation_field(target="labeled_spans") + + +def example_to_document( + example: Dict[str, Any], + relation_int2str: Callable[[int], str], + ner_int2str: Callable[[int], str], +) -> TacredDocument: + document = TacredDocument( + tokens=tuple(example["token"]), id=example["id"], metadata=dict(doc_id=example["docid"]) + ) + + for idx, (ner, pos) in enumerate(zip(example["stanford_ner"], example["stanford_pos"])): + document.stanford_ner.append(TokenAttribute(idx=idx, label=ner)) + document.stanford_pos.append(TokenAttribute(idx=idx, label=pos)) + + for tail_idx, (deprel_label, head_idx) in enumerate( + zip(example["stanford_deprel"], example["stanford_head"]) + ): + if head_idx >= 0: + document.dependency_relations.append( + TokenRelation( + head_idx=head_idx, + tail_idx=tail_idx, + label=deprel_label, + ) + ) + + head = LabeledSpan( + start=example["subj_start"], + end=example["subj_end"], + label=ner_int2str(example["subj_type"]), + ) + tail = LabeledSpan( + start=example["obj_start"], + end=example["obj_end"], + label=ner_int2str(example["obj_type"]), + ) + document.entities.append(head) + document.entities.append(tail) + + relation_str = relation_int2str(example["relation"]) + relation = BinaryRelation(head=head, tail=tail, label=relation_str) + document.relations.append(relation) + + return document + + +def _entity_to_dict( + entity: LabeledSpan, key_prefix: str = "", label_mapping: Optional[Dict[str, Any]] = None +) -> Dict[str, Any]: + return { + f"{key_prefix}start": entity.start, + f"{key_prefix}end": entity.end, + f"{key_prefix}type": label_mapping[entity.label] + if label_mapping is not None + else entity.label, + } + + +def document_to_example( + document: TacredDocument, + ner_names: Optional[List[str]] = None, + relation_names: Optional[List[str]] = None, +) -> Dict[str, Any]: + ner2idx = {name: idx for idx, name in enumerate(ner_names)} if ner_names is not None else None + rel2idx = ( + {name: idx for idx, name in enumerate(relation_names)} + if relation_names is not None + else None + ) + + token = list(document.tokens) + stanford_ner_dict = {ner.idx: ner.label for ner in document.stanford_ner} + stanford_pos_dict = {pos.idx: pos.label for pos in document.stanford_pos} + stanford_ner = [stanford_ner_dict[idx] for idx in range(len(token))] + stanford_pos = [stanford_pos_dict[idx] for idx in range(len(token))] + + stanford_deprel = ["ROOT"] * len(document.tokens) + stanford_head = [-1] * len(document.tokens) + for dep_rel in document.dependency_relations: + stanford_deprel[dep_rel.tail_idx] = dep_rel.label + stanford_head[dep_rel.tail_idx] = dep_rel.head_idx + + rel = document.relations[0] + obj: LabeledSpan = rel.tail + subj: LabeledSpan = rel.head + return { + "id": document.id, + "docid": document.metadata["doc_id"], + "relation": rel.label if rel2idx is None else rel2idx[rel.label], + "token": token, + "stanford_ner": stanford_ner, + "stanford_pos": stanford_pos, + "stanford_deprel": stanford_deprel, + "stanford_head": stanford_head, + **_entity_to_dict(obj, key_prefix="obj_", label_mapping=ner2idx), + **_entity_to_dict(subj, key_prefix="subj_", label_mapping=ner2idx), + } + + +def convert_to_text_document_with_labeled_spans_and_binary_relations( + document: TacredDocument, +) -> TextDocumentWithLabeledSpansAndBinaryRelations: + doc_simplified = document.as_type( + SimpleTacredDocument, + field_mapping={"entities": "labeled_spans", "relations": "binary_relations"}, + ) + result = token_based_document_to_text_based( + doc_simplified, + result_document_type=TextDocumentWithLabeledSpansAndBinaryRelations, + join_tokens_with=" ", + ) + return result + + +class TacredConfig(datasets.BuilderConfig): + """BuilderConfig for Tacred.""" + + def __init__(self, **kwargs): + """BuilderConfig for Tacred. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super().__init__(**kwargs) + + +class Tacred(pytorch_ie.data.builder.GeneratorBasedBuilder): + DOCUMENT_TYPE = TacredDocument + + DOCUMENT_CONVERTERS = { + TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations, + } + + BASE_DATASET_PATH = "DFKI-SLT/tacred" + + BUILDER_CONFIGS = [ + TacredConfig( + name="original", version=datasets.Version("1.1.0"), description="The original TACRED." + ), + TacredConfig( + name="revisited", + version=datasets.Version("1.1.0"), + description="The revised TACRED (corrected labels in dev and test split).", + ), + TacredConfig( + name="re-tacred", + version=datasets.Version("1.1.0"), + description="Relabeled TACRED (corrected labels for all splits and pruned)", + ), + ] + + def _generate_document_kwargs(self, dataset): + return { + "ner_int2str": dataset.features["subj_type"].int2str, + "relation_int2str": dataset.features["relation"].int2str, + } + + def _generate_document(self, example, **kwargs): + return example_to_document(example, **kwargs) From be87ee4f6e925b1ea4bc17cb683346870d54e545 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 24 Oct 2023 19:40:32 +0200 Subject: [PATCH 2/5] add tests for tacred dataset builder --- tests/dataset_builders/common.py | 69 ++ tests/dataset_builders/pie/test_tacred.py | 180 +++++ .../tacred/original/test.ner_names.json | 26 + .../tacred/original/test.relation_names.json | 44 ++ .../tacred/original/test.samples-3.json | 685 ++++++++++++++++++ .../tacred/original/train.ner_names.json | 26 + .../tacred/original/train.relation_names.json | 44 ++ .../tacred/original/train.samples-3.json | 545 ++++++++++++++ .../tacred/original/validation.ner_names.json | 26 + .../original/validation.relation_names.json | 44 ++ .../tacred/original/validation.samples-3.json | 505 +++++++++++++ .../tacred/re-tacred/test.ner_names.json | 26 + .../tacred/re-tacred/test.relation_names.json | 42 ++ .../tacred/re-tacred/test.samples-3.json | 685 ++++++++++++++++++ .../tacred/re-tacred/train.ner_names.json | 26 + .../re-tacred/train.relation_names.json | 42 ++ .../tacred/re-tacred/train.samples-3.json | 545 ++++++++++++++ .../re-tacred/validation.ner_names.json | 26 + .../re-tacred/validation.relation_names.json | 42 ++ .../re-tacred/validation.samples-3.json | 505 +++++++++++++ .../tacred/revisited/test.ner_names.json | 26 + .../tacred/revisited/test.relation_names.json | 44 ++ .../tacred/revisited/test.samples-3.json | 685 ++++++++++++++++++ .../tacred/revisited/train.ner_names.json | 26 + .../revisited/train.relation_names.json | 44 ++ .../tacred/revisited/train.samples-3.json | 545 ++++++++++++++ .../revisited/validation.ner_names.json | 26 + .../revisited/validation.relation_names.json | 44 ++ .../revisited/validation.samples-3.json | 505 +++++++++++++ 29 files changed, 6078 insertions(+) create mode 100644 tests/dataset_builders/common.py create mode 100644 tests/dataset_builders/pie/test_tacred.py create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.samples-3.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.samples-3.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.samples-3.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.samples-3.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.samples-3.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.samples-3.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.samples-3.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.samples-3.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.ner_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.relation_names.json create mode 100644 tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.samples-3.json diff --git a/tests/dataset_builders/common.py b/tests/dataset_builders/common.py new file mode 100644 index 00000000..57291fd2 --- /dev/null +++ b/tests/dataset_builders/common.py @@ -0,0 +1,69 @@ +import json +import logging +import os +import re +from pathlib import Path +from typing import List, Optional + +from tests import FIXTURES_ROOT + +DATASET_BUILDER_BASE_PATH = Path("dataset_builders") +HF_BASE_PATH = DATASET_BUILDER_BASE_PATH / "hf" +PIE_BASE_PATH = DATASET_BUILDER_BASE_PATH / "pie" +HF_DS_FIXTURE_DATA_PATH = FIXTURES_ROOT / "dataset_builders" / "hf" + +logger = logging.getLogger(__name__) + + +def _deep_compare( + obj, + obj_expected, + path: Optional[str] = None, + excluded_paths: Optional[List[str]] = None, + enforce_equal_dict_keys: bool = True, +): + if path is not None and excluded_paths is not None: + for excluded_path in excluded_paths: + if re.match(excluded_path, path): + return + + if type(obj) != type(obj_expected): + raise AssertionError(f"{path}: {obj} != {obj_expected}") + if isinstance(obj, (list, tuple)): + if len(obj) != len(obj_expected): + raise AssertionError(f"{path}: {obj} != {obj_expected}") + for i in range(len(obj)): + _deep_compare( + obj[i], + obj_expected[i], + path=f"{path}.{i}" if path is not None else str(i), + excluded_paths=excluded_paths, + enforce_equal_dict_keys=enforce_equal_dict_keys, + ) + elif isinstance(obj, dict): + if enforce_equal_dict_keys and obj.keys() != obj_expected.keys(): + raise AssertionError(f"{path}: {obj} != {obj_expected}") + for k in set(obj) | set(obj_expected): + _deep_compare( + obj.get(k, None), + obj_expected.get(k, None), + path=f"{path}.{k}" if path is not None else str(k), + excluded_paths=excluded_paths, + enforce_equal_dict_keys=enforce_equal_dict_keys, + ) + else: + if obj != obj_expected: + raise AssertionError(f"{path}: {obj} != {obj_expected}") + + +def _dump_json(obj, fn): + logger.warning(f"dump fixture data: {fn}") + os.makedirs(os.path.dirname(fn), exist_ok=True) + with open(fn, "w") as f: + json.dump(obj, f, indent=2, sort_keys=True) + + +def _load_json(fn: str): + with open(fn) as f: + ex = json.load(f) + return ex diff --git a/tests/dataset_builders/pie/test_tacred.py b/tests/dataset_builders/pie/test_tacred.py new file mode 100644 index 00000000..a46184c8 --- /dev/null +++ b/tests/dataset_builders/pie/test_tacred.py @@ -0,0 +1,180 @@ +import logging +import os + +import pytest +from datasets import load_dataset +from pytorch_ie.core import Document +from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations + +from dataset_builders.pie.tacred.tacred import ( + Tacred, + convert_to_text_document_with_labeled_spans_and_binary_relations, + document_to_example, + example_to_document, +) +from tests import FIXTURES_ROOT +from tests.dataset_builders.common import ( + PIE_BASE_PATH, + _deep_compare, + _dump_json, + _load_json, +) + +logger = logging.getLogger(__name__) + +HF_DATASET_PATH = "DFKI-SLT/tacred" +PIE_DATASET_PATH = f"{PIE_BASE_PATH}/tacred" +SPLITS = ["train", "validation", "test"] +EXAMPLE_IDX = 0 +NUM_SAMPLES = 3 + +DUMP_FIXTURE_DATA = False + +TACRED_DATA_DIR = os.getenv("TACRED_DATA_DIR", "") or None # ~/datasets/tacred/data/json + + +@pytest.fixture(params=[config.name for config in Tacred.BUILDER_CONFIGS], scope="module") +def dataset_variant(request): + return request.param + + +@pytest.fixture(params=SPLITS, scope="module") +def split(request): + return request.param + + +@pytest.fixture(scope="module") +def hf_example_path(dataset_variant): + return f"{FIXTURES_ROOT}/dataset_builders/hf/{HF_DATASET_PATH}/{dataset_variant}" + + +@pytest.fixture(scope="module") +def hf_samples_fn(hf_example_path): + return f"{hf_example_path}/{{split}}.samples-{NUM_SAMPLES}.json" + + +@pytest.fixture(scope="module") +def hf_metadata_fn(hf_example_path): + return f"{hf_example_path}/{{split}}.{{idx_or_feature}}.json" + + +@pytest.fixture(scope="module") +def hf_dataset(dataset_variant): + if TACRED_DATA_DIR is None: + raise ValueError("TACRED_DATA_DIR is required to load the Huggingface TacRED dataset") + else: + return load_dataset(HF_DATASET_PATH, name=dataset_variant, data_dir=TACRED_DATA_DIR) + + +@pytest.fixture(scope="module") +def hf_dataset_samples(hf_samples_fn): + data_files = {split: hf_samples_fn.format(split=split) for split in SPLITS} + return load_dataset("json", data_files=data_files) + + +def test_hf_dataset_samples(hf_dataset_samples): + assert set(hf_dataset_samples) == {"train", "validation", "test"} + for ds in hf_dataset_samples.values(): + assert len(ds) == NUM_SAMPLES + + +@pytest.mark.skipif(condition=not DUMP_FIXTURE_DATA, reason="don't dump fixture data") +def test_dump_hf(hf_dataset, hf_samples_fn, hf_metadata_fn): + for split, ds in hf_dataset.items(): + # save the dataset split + samples = [ds[i] for i in range(NUM_SAMPLES)] + _dump_json(samples, hf_samples_fn.format(split=split)) + # save the metadata + _dump_json( + obj=ds.features["subj_type"].names, + fn=hf_metadata_fn.format(split=split, idx_or_feature="ner_names"), + ) + _dump_json( + obj=ds.features["relation"].names, + fn=hf_metadata_fn.format(split=split, idx_or_feature="relation_names"), + ) + + +@pytest.fixture(params=range(NUM_SAMPLES), scope="module") +def hf_example(hf_dataset_samples, split, request): + return hf_dataset_samples[split][request.param] + + +@pytest.fixture(scope="module") +def ner_names(hf_metadata_fn, split): + return _load_json(hf_metadata_fn.format(split=split, idx_or_feature="ner_names")) + + +@pytest.fixture(scope="module") +def relation_names(hf_metadata_fn, split): + return _load_json(hf_metadata_fn.format(split=split, idx_or_feature="relation_names")) + + +@pytest.fixture(scope="module") +def document(hf_example, ner_names, relation_names): + return example_to_document( + hf_example, + ner_int2str=lambda idx: ner_names[idx], + relation_int2str=lambda idx: relation_names[idx], + ) + + +def test_document(document): + assert document is not None + assert isinstance(document, Document) + + +def test_example_to_document_and_back(hf_example, ner_names, relation_names): + doc = example_to_document( + hf_example, + ner_int2str=lambda idx: ner_names[idx], + relation_int2str=lambda idx: relation_names[idx], + ) + example_back = document_to_example(doc, ner_names=ner_names, relation_names=relation_names) + + _deep_compare(obj=example_back, obj_expected=hf_example) + + +@pytest.mark.skipif( + condition=TACRED_DATA_DIR is None, + reason="environment variable TACRED_DATA_DIR is not set", +) +@pytest.mark.slow +def test_example_to_document_and_back_all(hf_dataset): + for hf_ds in hf_dataset.values(): + ner_names = hf_ds.features["subj_type"].names + relation_names = hf_ds.features["relation"].names + for hf_ex in hf_ds: + doc = example_to_document( + hf_ex, + ner_int2str=lambda idx: ner_names[idx], + relation_int2str=lambda idx: relation_names[idx], + ) + example_back = document_to_example( + doc, ner_names=ner_names, relation_names=relation_names + ) + + _deep_compare(obj=example_back, obj_expected=hf_ex) + + +@pytest.mark.skipif( + condition=TACRED_DATA_DIR is None, + reason="environment variable TACRED_DATA_DIR is not set", +) +@pytest.mark.slow +def test_pie_document_all(dataset_variant): + pie_dataset = load_dataset( + PIE_DATASET_PATH, + name=dataset_variant, + data_dir=TACRED_DATA_DIR, + ) + for split, ds in pie_dataset.items(): + for doc in ds: + assert doc is not None + assert isinstance(doc, Document) + + +def test_convert_to_text_document_with_labeled_spans_and_binary_relations(document): + converted_doc = convert_to_text_document_with_labeled_spans_and_binary_relations(document) + assert converted_doc is not None + assert isinstance(converted_doc, TextDocumentWithLabeledSpansAndBinaryRelations) diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.relation_names.json new file mode 100644 index 00000000..a83c5a53 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.relation_names.json @@ -0,0 +1,44 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_headquarters", + "org:country_of_headquarters", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:parents", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_headquarters", + "org:subsidiaries", + "org:top_members/employees", + "org:website", + "per:age", + "per:alternate_names", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.samples-3.json new file mode 100644 index 00000000..e713e08f --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/test.samples-3.json @@ -0,0 +1,685 @@ +[ + { + "docid": "eng-NG-31-101172-8859554", + "id": "098f665fb966708cfcd2", + "obj_end": 46, + "obj_start": 43, + "obj_type": 1, + "relation": 0, + "stanford_deprel": [ + "nsubj", + "aux", + "ROOT", + "case", + "det", + "compound", + "nmod", + "case", + "det", + "amod", + "compound", + "compound", + "compound", + "nmod", + "punct", + "case", + "compound", + "compound", + "nmod", + "case", + "det", + "nummod", + "amod", + "nmod", + "case", + "compound", + "compound", + "nmod", + "punct", + "case", + "det", + "nmod", + "case", + "compound", + "nmod", + "case", + "nmod", + "cc", + "case", + "det", + "conj", + "case", + "det", + "amod", + "compound", + "nmod", + "case", + "det", + "compound", + "nmod", + "acl", + "dobj", + "case", + "nmod", + "dep", + "punct" + ], + "stanford_head": [ + 2, + 2, + -1, + 6, + 6, + 6, + 2, + 13, + 13, + 13, + 13, + 13, + 13, + 2, + 2, + 18, + 18, + 18, + 2, + 23, + 23, + 23, + 23, + 18, + 27, + 27, + 27, + 23, + 2, + 31, + 31, + 2, + 34, + 34, + 31, + 36, + 34, + 31, + 40, + 40, + 31, + 45, + 45, + 45, + 45, + 40, + 49, + 49, + 49, + 45, + 40, + 50, + 53, + 51, + 53, + 2 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "DATE", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "PRP", + "VBZ", + "VBN", + "IN", + "DT", + "NN", + "NN", + "TO", + "DT", + "JJ", + "NNP", + "NNP", + "NNP", + "NNP", + ",", + "IN", + "NNP", + "NNP", + "NNP", + "IN", + "DT", + "CD", + "JJ", + "NN", + "IN", + "NNP", + "NNP", + "NNP", + ",", + "IN", + "DT", + "NN", + "IN", + "NNP", + "NNPS", + "IN", + "NNP", + "CC", + "IN", + "DT", + "NN", + "IN", + "DT", + "JJ", + "NN", + "NN", + "IN", + "DT", + "NNP", + "NNP", + "VB", + "NN", + "IN", + "DT", + "DT", + "." + ], + "subj_end": 37, + "subj_start": 33, + "subj_type": 1, + "token": [ + "He", + "has", + "served", + "as", + "a", + "policy", + "aide", + "to", + "the", + "late", + "U.S.", + "Senator", + "Alan", + "Cranston", + ",", + "as", + "National", + "Issues", + "Director", + "for", + "the", + "2004", + "presidential", + "campaign", + "of", + "Congressman", + "Dennis", + "Kucinich", + ",", + "as", + "a", + "co-founder", + "of", + "Progressive", + "Democrats", + "of", + "America", + "and", + "as", + "a", + "member", + "of", + "the", + "international", + "policy", + "department", + "at", + "the", + "RAND", + "Corporation", + "think", + "tank", + "before", + "all", + "that", + "." + ] + }, + { + "docid": "APW_ENG_20090616.0636", + "id": "098f665fb90bef0c4ca4", + "obj_end": 11, + "obj_start": 10, + "obj_type": 8, + "relation": 0, + "stanford_deprel": [ + "compound", + "nsubj", + "aux", + "aux", + "ROOT", + "mark", + "xcomp", + "nmod:poss", + "dobj", + "case", + "nmod", + "punct", + "det", + "compound", + "root", + "punct", + "cc", + "conj", + "det", + "amod", + "xcomp", + "case", + "det", + "compound", + "nmod", + "punct", + "xcomp", + "case", + "det", + "nmod", + "case", + "nmod", + "case", + "nmod", + "nummod", + "punct" + ], + "stanford_head": [ + 1, + 4, + 4, + 4, + -1, + 6, + 4, + 8, + 6, + 10, + 6, + 4, + 14, + 14, + 11, + 14, + 14, + 14, + 20, + 20, + 17, + 24, + 24, + 24, + 20, + 17, + 17, + 29, + 29, + 26, + 31, + 29, + 33, + 29, + 33, + 14 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "O", + "MISC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "DATE", + "DATE", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "VBZ", + "VBN", + "VBG", + "TO", + "VB", + "PRP$", + "NN", + "IN", + "NNP", + ",", + "DT", + "NNP", + "NN", + ",", + "CC", + "VB", + "DT", + "JJ", + "NN", + "IN", + "DT", + "NNP", + "NNP", + ",", + "VBN", + "IN", + "DT", + "NN", + "IN", + "NNP", + "IN", + "NNP", + "CD", + "." + ], + "subj_end": 8, + "subj_start": 7, + "subj_type": 2, + "token": [ + "Messina", + "Denaro", + "has", + "been", + "trying", + "to", + "impose", + "his", + "power", + "in", + "Palermo", + ",", + "the", + "Sicilian", + "capital", + ",", + "and", + "become", + "the", + "new", + "head", + "of", + "the", + "Sicilian", + "Mafia", + ",", + "weakened", + "by", + "the", + "arrest", + "of", + "Provenzano", + "in", + "April", + "2006", + "." + ] + }, + { + "docid": "XIN_ENG_20100801.0069", + "id": "098f665fb9ef7dbc81e7", + "obj_end": 16, + "obj_start": 15, + "obj_type": 9, + "relation": 0, + "stanford_deprel": [ + "compound", + "nsubjpass", + "punct", + "det", + "amod", + "appos", + "case", + "det", + "nmod", + "punct", + "cc", + "compound", + "compound", + "conj", + "case", + "nmod", + "cc", + "compound", + "compound", + "conj", + "case", + "det", + "nmod", + "auxpass", + "ROOT", + "case", + "nmod:poss", + "compound", + "compound", + "nmod", + "advmod", + "punct" + ], + "stanford_head": [ + 1, + 24, + 1, + 5, + 5, + 1, + 8, + 8, + 5, + 1, + 1, + 13, + 13, + 1, + 15, + 13, + 13, + 19, + 19, + 13, + 22, + 22, + 19, + 24, + -1, + 29, + 29, + 29, + 29, + 24, + 29, + 24 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "MISC", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "LOCATION", + "O", + "PERSON", + "PERSON", + "PERSON", + "O", + "O", + "LOCATION", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + ",", + "DT", + "JJ", + "NN", + "IN", + "DT", + "NNP", + ",", + "CC", + "NNS", + "NNP", + "NNP", + "IN", + "NNP", + "CC", + "NNP", + "NNP", + "NNP", + "IN", + "DT", + "NNPS", + "VBD", + "VBN", + "IN", + "PRP$", + "NNP", + "NNP", + "NNS", + "RB", + "." + ], + "subj_end": 2, + "subj_start": 0, + "subj_type": 2, + "token": [ + "Eugenio", + "Vagni", + ",", + "the", + "Italian", + "worker", + "of", + "the", + "ICRC", + ",", + "and", + "colleagues", + "Andreas", + "Notter", + "of", + "Switzerland", + "and", + "Mary", + "Jean", + "Lacaba", + "of", + "the", + "Philippines", + "were", + "released", + "by", + "their", + "Abu", + "Sayyaf", + "captors", + "separately", + "." + ] + } +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.relation_names.json new file mode 100644 index 00000000..a83c5a53 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.relation_names.json @@ -0,0 +1,44 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_headquarters", + "org:country_of_headquarters", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:parents", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_headquarters", + "org:subsidiaries", + "org:top_members/employees", + "org:website", + "per:age", + "per:alternate_names", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.samples-3.json new file mode 100644 index 00000000..64340a00 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/train.samples-3.json @@ -0,0 +1,545 @@ +[ + { + "docid": "AFP_ENG_20070218.0019.LDC2009T13", + "id": "61b3a5c8c9a882dcfcd2", + "obj_end": 2, + "obj_start": 0, + "obj_type": 2, + "relation": 6, + "stanford_deprel": [ + "compound", + "nsubj", + "ROOT", + "case", + "nmod", + "amod", + "nmod:tmod", + "mark", + "xcomp", + "det", + "compound", + "compound", + "dobj", + "punct", + "appos", + "punct", + "punct", + "xcomp", + "det", + "dobj", + "case", + "nummod", + "nmod", + "case", + "nmod", + "punct", + "xcomp", + "amod", + "compound", + "compound", + "compound", + "dobj", + "mark", + "xcomp", + "dobj", + "cc", + "conj", + "det", + "compound", + "dobj", + "punct" + ], + "stanford_head": [ + 1, + 2, + -1, + 4, + 2, + 6, + 2, + 8, + 2, + 12, + 12, + 12, + 8, + 14, + 12, + 14, + 2, + 2, + 19, + 17, + 22, + 22, + 17, + 24, + 22, + 2, + 2, + 31, + 31, + 31, + 31, + 26, + 33, + 26, + 33, + 33, + 33, + 39, + 39, + 36, + 2 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "DATE", + "DATE", + "DATE", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "NUMBER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "VBD", + "IN", + "NNP", + "JJ", + "NN", + "TO", + "VB", + "DT", + "DT", + "NNP", + "NNP", + "-LRB-", + "NNP", + "-RRB-", + ",", + "VBG", + "DT", + "NN", + "IN", + "CD", + "NNS", + "IN", + "NN", + ",", + "VBG", + "JJ", + "NN", + "NNP", + "NNP", + "NNP", + "TO", + "VB", + "NN", + "CC", + "VB", + "DT", + "NN", + "NN", + "." + ], + "subj_end": 13, + "subj_start": 10, + "subj_type": 1, + "token": [ + "Tom", + "Thabane", + "resigned", + "in", + "October", + "last", + "year", + "to", + "form", + "the", + "All", + "Basotho", + "Convention", + "(", + "ABC", + ")", + ",", + "crossing", + "the", + "floor", + "with", + "17", + "members", + "of", + "parliament", + ",", + "causing", + "constitutional", + "monarch", + "King", + "Letsie", + "III", + "to", + "dissolve", + "parliament", + "and", + "call", + "the", + "snap", + "election", + "." + ] + }, + { + "docid": "NYT_ENG_20071026.0056.LDC2009T13", + "id": "61b3a65fb9b7111c4ca4", + "obj_end": 21, + "obj_start": 19, + "obj_type": 2, + "relation": 0, + "stanford_deprel": [ + "case", + "nmod", + "punct", + "det", + "nmod:tmod", + "case", + "det", + "nmod", + "punct", + "nsubj", + "ROOT", + "det", + "amod", + "punct", + "compound", + "dobj", + "punct", + "case", + "det", + "compound", + "nmod", + "cc", + "compound", + "compound", + "compound", + "conj", + "punct" + ], + "stanford_head": [ + 1, + 10, + 10, + 4, + 10, + 7, + 7, + 4, + 10, + 10, + -1, + 15, + 15, + 15, + 15, + 10, + 15, + 20, + 20, + 20, + 15, + 20, + 25, + 25, + 25, + 20, + 10 + ], + "stanford_ner": [ + "O", + "DATE", + "O", + "DURATION", + "DURATION", + "O", + "O", + "O", + "O", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "O" + ], + "stanford_pos": [ + "IN", + "CD", + ",", + "DT", + "NN", + "IN", + "DT", + "NN", + ",", + "NNP", + "VBD", + "DT", + "JJ", + "``", + "NN", + "NN", + "''", + "IN", + "DT", + "NNP", + "NNP", + "CC", + "NNP", + "NNP", + "NNP", + "NNP", + "." + ], + "subj_end": 10, + "subj_start": 9, + "subj_type": 2, + "token": [ + "In", + "1983", + ",", + "a", + "year", + "after", + "the", + "rally", + ",", + "Forsberg", + "received", + "the", + "so-called", + "``", + "genius", + "award", + "''", + "from", + "the", + "John", + "D.", + "and", + "Catherine", + "T.", + "MacArthur", + "Foundation", + "." + ] + }, + { + "docid": "eng-NG-31-126955-9171242", + "id": "61b3a65fb9aeb61c81e7", + "obj_end": 9, + "obj_start": 7, + "obj_type": 1, + "relation": 0, + "stanford_deprel": [ + "nsubj", + "cop", + "case", + "det", + "ROOT", + "case", + "compound", + "compound", + "nmod:poss", + "case", + "nmod", + "mark", + "nsubjpass", + "auxpass", + "dep", + "case", + "det", + "nmod", + "case", + "nmod", + "cc", + "conj", + "case", + "det", + "nmod", + "case", + "nmod", + "punct" + ], + "stanford_head": [ + 4, + 4, + 4, + 4, + -1, + 10, + 8, + 8, + 10, + 8, + 4, + 14, + 14, + 14, + 4, + 17, + 17, + 14, + 19, + 17, + 14, + 14, + 24, + 24, + 21, + 26, + 24, + 4 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "MISC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "DT", + "VBD", + "IN", + "DT", + "NN", + "IN", + "NN", + "NNP", + "NNP", + "POS", + "NNS", + "IN", + "PRP", + "VBD", + "VBN", + "IN", + "DT", + "NN", + "IN", + "NN", + "CC", + "VBG", + "IN", + "DT", + "NN", + "IN", + "NN", + "." + ], + "subj_end": 27, + "subj_start": 26, + "subj_type": 1, + "token": [ + "This", + "was", + "among", + "a", + "batch", + "of", + "paperback", + "Oxford", + "World", + "'s", + "Classics", + "that", + "I", + "was", + "given", + "as", + "a", + "reward", + "for", + "reading", + "and", + "commenting", + "on", + "a", + "manuscript", + "for", + "OUP", + "." + ] + } +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.relation_names.json new file mode 100644 index 00000000..a83c5a53 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.relation_names.json @@ -0,0 +1,44 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_headquarters", + "org:country_of_headquarters", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:parents", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_headquarters", + "org:subsidiaries", + "org:top_members/employees", + "org:website", + "per:age", + "per:alternate_names", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.samples-3.json new file mode 100644 index 00000000..5c3f131c --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/original/validation.samples-3.json @@ -0,0 +1,505 @@ +[ + { + "docid": "APW_ENG_20101103.0539", + "id": "e7798fb926b9403cfcd2", + "obj_end": 13, + "obj_start": 12, + "obj_type": 17, + "relation": 41, + "stanford_deprel": [ + "case", + "det", + "amod", + "nmod", + "punct", + "compound", + "compound", + "compound", + "compound", + "nsubj", + "aux", + "ROOT", + "xcomp", + "punct", + "xcomp", + "compound", + "dobj", + "nsubj", + "aux", + "acl:relcl", + "mark", + "xcomp", + "det", + "compound", + "dobj", + "punct" + ], + "stanford_head": [ + 3, + 3, + 3, + 11, + 11, + 9, + 9, + 9, + 9, + 11, + 11, + -1, + 11, + 11, + 11, + 16, + 14, + 19, + 19, + 16, + 21, + 19, + 24, + 24, + 21, + 11 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "IN", + "DT", + "JJ", + "NN", + ",", + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "MD", + "VB", + "NN", + ",", + "VBG", + "NNP", + "NNP", + "WP", + "VBZ", + "VBG", + "TO", + "VB", + "DT", + "NN", + "NN", + "." + ], + "subj_end": 10, + "subj_start": 8, + "subj_type": 2, + "token": [ + "At", + "the", + "same", + "time", + ",", + "Chief", + "Financial", + "Officer", + "Douglas", + "Flint", + "will", + "become", + "chairman", + ",", + "succeeding", + "Stephen", + "Green", + "who", + "is", + "leaving", + "to", + "take", + "a", + "government", + "job", + "." + ] + }, + { + "docid": "APW_ENG_20080229.1401.LDC2009T13", + "id": "e779865fb96bbbcc4ca4", + "obj_end": 6, + "obj_start": 4, + "obj_type": 2, + "relation": 0, + "stanford_deprel": [ + "compound", + "compound", + "compound", + "compound", + "compound", + "nsubj", + "case", + "nmod", + "ROOT", + "det", + "dobj", + "case", + "nmod", + "mark", + "det", + "amod", + "compound", + "compound", + "nsubj", + "advcl", + "det", + "dobj", + "mark", + "advcl", + "amod", + "compound", + "dobj", + "acl", + "case", + "det", + "amod", + "amod", + "nmod", + "punct" + ], + "stanford_head": [ + 5, + 5, + 5, + 5, + 5, + 8, + 7, + 5, + -1, + 10, + 8, + 12, + 10, + 19, + 18, + 18, + 18, + 18, + 19, + 8, + 21, + 19, + 23, + 19, + 26, + 26, + 23, + 26, + 32, + 32, + 32, + 32, + 27, + 8 + ], + "stanford_ner": [ + "LOCATION", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "MISC", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "IN", + "NNP", + "VBD", + "DT", + "NN", + "IN", + "NNP", + "IN", + "DT", + "JJ", + "NNP", + "NNP", + "NNP", + "VBD", + "DT", + "NN", + "IN", + "VBG", + "JJ", + "NN", + "NN", + "VBN", + "IN", + "DT", + "JJ", + "JJ", + "NN", + "." + ], + "subj_end": 19, + "subj_start": 17, + "subj_type": 2, + "token": [ + "U.S.", + "District", + "Court", + "Judge", + "Jeffrey", + "White", + "in", + "mid-February", + "issued", + "an", + "injunction", + "against", + "Wikileaks", + "after", + "the", + "Zurich-based", + "Bank", + "Julius", + "Baer", + "accused", + "the", + "site", + "of", + "posting", + "sensitive", + "account", + "information", + "stolen", + "by", + "a", + "disgruntled", + "former", + "employee", + "." + ] + }, + { + "docid": "APW_ENG_20090707.0488", + "id": "e7798ae9c0adbcdc81e7", + "obj_end": 1, + "obj_start": 0, + "obj_type": 8, + "relation": 24, + "stanford_deprel": [ + "compound", + "nummod", + "nummod", + "compound", + "compound", + "nsubj", + "advmod", + "ROOT", + "mark", + "nsubjpass", + "punct", + "acl", + "dobj", + "punct", + "auxpass", + "ccomp", + "xcomp", + "case", + "nmod:poss", + "nmod", + "case", + "det", + "nmod", + "case", + "nmod:poss", + "compound", + "nmod", + "punct" + ], + "stanford_head": [ + 5, + 5, + 5, + 5, + 5, + 7, + 7, + -1, + 15, + 15, + 9, + 9, + 11, + 9, + 15, + 7, + 15, + 19, + 19, + 16, + 22, + 22, + 15, + 26, + 26, + 26, + 22, + 7 + ], + "stanford_ner": [ + "LOCATION", + "TIME", + "TIME", + "TIME", + "MISC", + "O", + "O", + "O", + "O", + "PERSON", + "O", + "O", + "NUMBER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "CD", + "CD", + "NNP", + "NNP", + "NNS", + "RBR", + "VBD", + "IN", + "NNP", + ",", + "VBD", + "CD", + ",", + "VBD", + "VBN", + "JJ", + "IN", + "PRP$", + "NN", + "IN", + "DT", + "NN", + "IN", + "PRP$", + "NNP", + "NN", + "." + ], + "subj_end": 10, + "subj_start": 9, + "subj_type": 2, + "token": [ + "PARIS", + "2009-07-07", + "11:07:32", + "UTC", + "French", + "media", + "earlier", + "reported", + "that", + "Montcourt", + ",", + "ranked", + "119", + ",", + "was", + "found", + "dead", + "by", + "his", + "girlfriend", + "in", + "the", + "stairwell", + "of", + "his", + "Paris", + "apartment", + "." + ] + } +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.relation_names.json new file mode 100644 index 00000000..9767c388 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.relation_names.json @@ -0,0 +1,42 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_branch", + "org:country_of_branch", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_branch", + "org:top_members/employees", + "org:website", + "per:age", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:identity", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.samples-3.json new file mode 100644 index 00000000..e713e08f --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/test.samples-3.json @@ -0,0 +1,685 @@ +[ + { + "docid": "eng-NG-31-101172-8859554", + "id": "098f665fb966708cfcd2", + "obj_end": 46, + "obj_start": 43, + "obj_type": 1, + "relation": 0, + "stanford_deprel": [ + "nsubj", + "aux", + "ROOT", + "case", + "det", + "compound", + "nmod", + "case", + "det", + "amod", + "compound", + "compound", + "compound", + "nmod", + "punct", + "case", + "compound", + "compound", + "nmod", + "case", + "det", + "nummod", + "amod", + "nmod", + "case", + "compound", + "compound", + "nmod", + "punct", + "case", + "det", + "nmod", + "case", + "compound", + "nmod", + "case", + "nmod", + "cc", + "case", + "det", + "conj", + "case", + "det", + "amod", + "compound", + "nmod", + "case", + "det", + "compound", + "nmod", + "acl", + "dobj", + "case", + "nmod", + "dep", + "punct" + ], + "stanford_head": [ + 2, + 2, + -1, + 6, + 6, + 6, + 2, + 13, + 13, + 13, + 13, + 13, + 13, + 2, + 2, + 18, + 18, + 18, + 2, + 23, + 23, + 23, + 23, + 18, + 27, + 27, + 27, + 23, + 2, + 31, + 31, + 2, + 34, + 34, + 31, + 36, + 34, + 31, + 40, + 40, + 31, + 45, + 45, + 45, + 45, + 40, + 49, + 49, + 49, + 45, + 40, + 50, + 53, + 51, + 53, + 2 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "DATE", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "PRP", + "VBZ", + "VBN", + "IN", + "DT", + "NN", + "NN", + "TO", + "DT", + "JJ", + "NNP", + "NNP", + "NNP", + "NNP", + ",", + "IN", + "NNP", + "NNP", + "NNP", + "IN", + "DT", + "CD", + "JJ", + "NN", + "IN", + "NNP", + "NNP", + "NNP", + ",", + "IN", + "DT", + "NN", + "IN", + "NNP", + "NNPS", + "IN", + "NNP", + "CC", + "IN", + "DT", + "NN", + "IN", + "DT", + "JJ", + "NN", + "NN", + "IN", + "DT", + "NNP", + "NNP", + "VB", + "NN", + "IN", + "DT", + "DT", + "." + ], + "subj_end": 37, + "subj_start": 33, + "subj_type": 1, + "token": [ + "He", + "has", + "served", + "as", + "a", + "policy", + "aide", + "to", + "the", + "late", + "U.S.", + "Senator", + "Alan", + "Cranston", + ",", + "as", + "National", + "Issues", + "Director", + "for", + "the", + "2004", + "presidential", + "campaign", + "of", + "Congressman", + "Dennis", + "Kucinich", + ",", + "as", + "a", + "co-founder", + "of", + "Progressive", + "Democrats", + "of", + "America", + "and", + "as", + "a", + "member", + "of", + "the", + "international", + "policy", + "department", + "at", + "the", + "RAND", + "Corporation", + "think", + "tank", + "before", + "all", + "that", + "." + ] + }, + { + "docid": "APW_ENG_20090616.0636", + "id": "098f665fb90bef0c4ca4", + "obj_end": 11, + "obj_start": 10, + "obj_type": 8, + "relation": 0, + "stanford_deprel": [ + "compound", + "nsubj", + "aux", + "aux", + "ROOT", + "mark", + "xcomp", + "nmod:poss", + "dobj", + "case", + "nmod", + "punct", + "det", + "compound", + "root", + "punct", + "cc", + "conj", + "det", + "amod", + "xcomp", + "case", + "det", + "compound", + "nmod", + "punct", + "xcomp", + "case", + "det", + "nmod", + "case", + "nmod", + "case", + "nmod", + "nummod", + "punct" + ], + "stanford_head": [ + 1, + 4, + 4, + 4, + -1, + 6, + 4, + 8, + 6, + 10, + 6, + 4, + 14, + 14, + 11, + 14, + 14, + 14, + 20, + 20, + 17, + 24, + 24, + 24, + 20, + 17, + 17, + 29, + 29, + 26, + 31, + 29, + 33, + 29, + 33, + 14 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "O", + "MISC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "DATE", + "DATE", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "VBZ", + "VBN", + "VBG", + "TO", + "VB", + "PRP$", + "NN", + "IN", + "NNP", + ",", + "DT", + "NNP", + "NN", + ",", + "CC", + "VB", + "DT", + "JJ", + "NN", + "IN", + "DT", + "NNP", + "NNP", + ",", + "VBN", + "IN", + "DT", + "NN", + "IN", + "NNP", + "IN", + "NNP", + "CD", + "." + ], + "subj_end": 8, + "subj_start": 7, + "subj_type": 2, + "token": [ + "Messina", + "Denaro", + "has", + "been", + "trying", + "to", + "impose", + "his", + "power", + "in", + "Palermo", + ",", + "the", + "Sicilian", + "capital", + ",", + "and", + "become", + "the", + "new", + "head", + "of", + "the", + "Sicilian", + "Mafia", + ",", + "weakened", + "by", + "the", + "arrest", + "of", + "Provenzano", + "in", + "April", + "2006", + "." + ] + }, + { + "docid": "XIN_ENG_20100801.0069", + "id": "098f665fb9ef7dbc81e7", + "obj_end": 16, + "obj_start": 15, + "obj_type": 9, + "relation": 0, + "stanford_deprel": [ + "compound", + "nsubjpass", + "punct", + "det", + "amod", + "appos", + "case", + "det", + "nmod", + "punct", + "cc", + "compound", + "compound", + "conj", + "case", + "nmod", + "cc", + "compound", + "compound", + "conj", + "case", + "det", + "nmod", + "auxpass", + "ROOT", + "case", + "nmod:poss", + "compound", + "compound", + "nmod", + "advmod", + "punct" + ], + "stanford_head": [ + 1, + 24, + 1, + 5, + 5, + 1, + 8, + 8, + 5, + 1, + 1, + 13, + 13, + 1, + 15, + 13, + 13, + 19, + 19, + 13, + 22, + 22, + 19, + 24, + -1, + 29, + 29, + 29, + 29, + 24, + 29, + 24 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "MISC", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "LOCATION", + "O", + "PERSON", + "PERSON", + "PERSON", + "O", + "O", + "LOCATION", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + ",", + "DT", + "JJ", + "NN", + "IN", + "DT", + "NNP", + ",", + "CC", + "NNS", + "NNP", + "NNP", + "IN", + "NNP", + "CC", + "NNP", + "NNP", + "NNP", + "IN", + "DT", + "NNPS", + "VBD", + "VBN", + "IN", + "PRP$", + "NNP", + "NNP", + "NNS", + "RB", + "." + ], + "subj_end": 2, + "subj_start": 0, + "subj_type": 2, + "token": [ + "Eugenio", + "Vagni", + ",", + "the", + "Italian", + "worker", + "of", + "the", + "ICRC", + ",", + "and", + "colleagues", + "Andreas", + "Notter", + "of", + "Switzerland", + "and", + "Mary", + "Jean", + "Lacaba", + "of", + "the", + "Philippines", + "were", + "released", + "by", + "their", + "Abu", + "Sayyaf", + "captors", + "separately", + "." + ] + } +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.relation_names.json new file mode 100644 index 00000000..9767c388 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.relation_names.json @@ -0,0 +1,42 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_branch", + "org:country_of_branch", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_branch", + "org:top_members/employees", + "org:website", + "per:age", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:identity", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.samples-3.json new file mode 100644 index 00000000..64340a00 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/train.samples-3.json @@ -0,0 +1,545 @@ +[ + { + "docid": "AFP_ENG_20070218.0019.LDC2009T13", + "id": "61b3a5c8c9a882dcfcd2", + "obj_end": 2, + "obj_start": 0, + "obj_type": 2, + "relation": 6, + "stanford_deprel": [ + "compound", + "nsubj", + "ROOT", + "case", + "nmod", + "amod", + "nmod:tmod", + "mark", + "xcomp", + "det", + "compound", + "compound", + "dobj", + "punct", + "appos", + "punct", + "punct", + "xcomp", + "det", + "dobj", + "case", + "nummod", + "nmod", + "case", + "nmod", + "punct", + "xcomp", + "amod", + "compound", + "compound", + "compound", + "dobj", + "mark", + "xcomp", + "dobj", + "cc", + "conj", + "det", + "compound", + "dobj", + "punct" + ], + "stanford_head": [ + 1, + 2, + -1, + 4, + 2, + 6, + 2, + 8, + 2, + 12, + 12, + 12, + 8, + 14, + 12, + 14, + 2, + 2, + 19, + 17, + 22, + 22, + 17, + 24, + 22, + 2, + 2, + 31, + 31, + 31, + 31, + 26, + 33, + 26, + 33, + 33, + 33, + 39, + 39, + 36, + 2 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "DATE", + "DATE", + "DATE", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "NUMBER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "VBD", + "IN", + "NNP", + "JJ", + "NN", + "TO", + "VB", + "DT", + "DT", + "NNP", + "NNP", + "-LRB-", + "NNP", + "-RRB-", + ",", + "VBG", + "DT", + "NN", + "IN", + "CD", + "NNS", + "IN", + "NN", + ",", + "VBG", + "JJ", + "NN", + "NNP", + "NNP", + "NNP", + "TO", + "VB", + "NN", + "CC", + "VB", + "DT", + "NN", + "NN", + "." + ], + "subj_end": 13, + "subj_start": 10, + "subj_type": 1, + "token": [ + "Tom", + "Thabane", + "resigned", + "in", + "October", + "last", + "year", + "to", + "form", + "the", + "All", + "Basotho", + "Convention", + "(", + "ABC", + ")", + ",", + "crossing", + "the", + "floor", + "with", + "17", + "members", + "of", + "parliament", + ",", + "causing", + "constitutional", + "monarch", + "King", + "Letsie", + "III", + "to", + "dissolve", + "parliament", + "and", + "call", + "the", + "snap", + "election", + "." + ] + }, + { + "docid": "NYT_ENG_20071026.0056.LDC2009T13", + "id": "61b3a65fb9b7111c4ca4", + "obj_end": 21, + "obj_start": 19, + "obj_type": 2, + "relation": 0, + "stanford_deprel": [ + "case", + "nmod", + "punct", + "det", + "nmod:tmod", + "case", + "det", + "nmod", + "punct", + "nsubj", + "ROOT", + "det", + "amod", + "punct", + "compound", + "dobj", + "punct", + "case", + "det", + "compound", + "nmod", + "cc", + "compound", + "compound", + "compound", + "conj", + "punct" + ], + "stanford_head": [ + 1, + 10, + 10, + 4, + 10, + 7, + 7, + 4, + 10, + 10, + -1, + 15, + 15, + 15, + 15, + 10, + 15, + 20, + 20, + 20, + 15, + 20, + 25, + 25, + 25, + 20, + 10 + ], + "stanford_ner": [ + "O", + "DATE", + "O", + "DURATION", + "DURATION", + "O", + "O", + "O", + "O", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "O" + ], + "stanford_pos": [ + "IN", + "CD", + ",", + "DT", + "NN", + "IN", + "DT", + "NN", + ",", + "NNP", + "VBD", + "DT", + "JJ", + "``", + "NN", + "NN", + "''", + "IN", + "DT", + "NNP", + "NNP", + "CC", + "NNP", + "NNP", + "NNP", + "NNP", + "." + ], + "subj_end": 10, + "subj_start": 9, + "subj_type": 2, + "token": [ + "In", + "1983", + ",", + "a", + "year", + "after", + "the", + "rally", + ",", + "Forsberg", + "received", + "the", + "so-called", + "``", + "genius", + "award", + "''", + "from", + "the", + "John", + "D.", + "and", + "Catherine", + "T.", + "MacArthur", + "Foundation", + "." + ] + }, + { + "docid": "eng-NG-31-126955-9171242", + "id": "61b3a65fb9aeb61c81e7", + "obj_end": 9, + "obj_start": 7, + "obj_type": 1, + "relation": 0, + "stanford_deprel": [ + "nsubj", + "cop", + "case", + "det", + "ROOT", + "case", + "compound", + "compound", + "nmod:poss", + "case", + "nmod", + "mark", + "nsubjpass", + "auxpass", + "dep", + "case", + "det", + "nmod", + "case", + "nmod", + "cc", + "conj", + "case", + "det", + "nmod", + "case", + "nmod", + "punct" + ], + "stanford_head": [ + 4, + 4, + 4, + 4, + -1, + 10, + 8, + 8, + 10, + 8, + 4, + 14, + 14, + 14, + 4, + 17, + 17, + 14, + 19, + 17, + 14, + 14, + 24, + 24, + 21, + 26, + 24, + 4 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "MISC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "DT", + "VBD", + "IN", + "DT", + "NN", + "IN", + "NN", + "NNP", + "NNP", + "POS", + "NNS", + "IN", + "PRP", + "VBD", + "VBN", + "IN", + "DT", + "NN", + "IN", + "NN", + "CC", + "VBG", + "IN", + "DT", + "NN", + "IN", + "NN", + "." + ], + "subj_end": 27, + "subj_start": 26, + "subj_type": 1, + "token": [ + "This", + "was", + "among", + "a", + "batch", + "of", + "paperback", + "Oxford", + "World", + "'s", + "Classics", + "that", + "I", + "was", + "given", + "as", + "a", + "reward", + "for", + "reading", + "and", + "commenting", + "on", + "a", + "manuscript", + "for", + "OUP", + "." + ] + } +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.relation_names.json new file mode 100644 index 00000000..9767c388 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.relation_names.json @@ -0,0 +1,42 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_branch", + "org:country_of_branch", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_branch", + "org:top_members/employees", + "org:website", + "per:age", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:identity", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.samples-3.json new file mode 100644 index 00000000..3fd86eb3 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/re-tacred/validation.samples-3.json @@ -0,0 +1,505 @@ +[ + { + "docid": "APW_ENG_20101103.0539", + "id": "e7798fb926b9403cfcd2", + "obj_end": 13, + "obj_start": 12, + "obj_type": 17, + "relation": 39, + "stanford_deprel": [ + "case", + "det", + "amod", + "nmod", + "punct", + "compound", + "compound", + "compound", + "compound", + "nsubj", + "aux", + "ROOT", + "xcomp", + "punct", + "xcomp", + "compound", + "dobj", + "nsubj", + "aux", + "acl:relcl", + "mark", + "xcomp", + "det", + "compound", + "dobj", + "punct" + ], + "stanford_head": [ + 3, + 3, + 3, + 11, + 11, + 9, + 9, + 9, + 9, + 11, + 11, + -1, + 11, + 11, + 11, + 16, + 14, + 19, + 19, + 16, + 21, + 19, + 24, + 24, + 21, + 11 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "IN", + "DT", + "JJ", + "NN", + ",", + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "MD", + "VB", + "NN", + ",", + "VBG", + "NNP", + "NNP", + "WP", + "VBZ", + "VBG", + "TO", + "VB", + "DT", + "NN", + "NN", + "." + ], + "subj_end": 10, + "subj_start": 8, + "subj_type": 2, + "token": [ + "At", + "the", + "same", + "time", + ",", + "Chief", + "Financial", + "Officer", + "Douglas", + "Flint", + "will", + "become", + "chairman", + ",", + "succeeding", + "Stephen", + "Green", + "who", + "is", + "leaving", + "to", + "take", + "a", + "government", + "job", + "." + ] + }, + { + "docid": "APW_ENG_20080229.1401.LDC2009T13", + "id": "e779865fb96bbbcc4ca4", + "obj_end": 6, + "obj_start": 4, + "obj_type": 2, + "relation": 0, + "stanford_deprel": [ + "compound", + "compound", + "compound", + "compound", + "compound", + "nsubj", + "case", + "nmod", + "ROOT", + "det", + "dobj", + "case", + "nmod", + "mark", + "det", + "amod", + "compound", + "compound", + "nsubj", + "advcl", + "det", + "dobj", + "mark", + "advcl", + "amod", + "compound", + "dobj", + "acl", + "case", + "det", + "amod", + "amod", + "nmod", + "punct" + ], + "stanford_head": [ + 5, + 5, + 5, + 5, + 5, + 8, + 7, + 5, + -1, + 10, + 8, + 12, + 10, + 19, + 18, + 18, + 18, + 18, + 19, + 8, + 21, + 19, + 23, + 19, + 26, + 26, + 23, + 26, + 32, + 32, + 32, + 32, + 27, + 8 + ], + "stanford_ner": [ + "LOCATION", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "MISC", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "IN", + "NNP", + "VBD", + "DT", + "NN", + "IN", + "NNP", + "IN", + "DT", + "JJ", + "NNP", + "NNP", + "NNP", + "VBD", + "DT", + "NN", + "IN", + "VBG", + "JJ", + "NN", + "NN", + "VBN", + "IN", + "DT", + "JJ", + "JJ", + "NN", + "." + ], + "subj_end": 19, + "subj_start": 17, + "subj_type": 2, + "token": [ + "U.S.", + "District", + "Court", + "Judge", + "Jeffrey", + "White", + "in", + "mid-February", + "issued", + "an", + "injunction", + "against", + "Wikileaks", + "after", + "the", + "Zurich-based", + "Bank", + "Julius", + "Baer", + "accused", + "the", + "site", + "of", + "posting", + "sensitive", + "account", + "information", + "stolen", + "by", + "a", + "disgruntled", + "former", + "employee", + "." + ] + }, + { + "docid": "APW_ENG_20090707.0488", + "id": "e7798ae9c0adbcdc81e7", + "obj_end": 1, + "obj_start": 0, + "obj_type": 8, + "relation": 21, + "stanford_deprel": [ + "compound", + "nummod", + "nummod", + "compound", + "compound", + "nsubj", + "advmod", + "ROOT", + "mark", + "nsubjpass", + "punct", + "acl", + "dobj", + "punct", + "auxpass", + "ccomp", + "xcomp", + "case", + "nmod:poss", + "nmod", + "case", + "det", + "nmod", + "case", + "nmod:poss", + "compound", + "nmod", + "punct" + ], + "stanford_head": [ + 5, + 5, + 5, + 5, + 5, + 7, + 7, + -1, + 15, + 15, + 9, + 9, + 11, + 9, + 15, + 7, + 15, + 19, + 19, + 16, + 22, + 22, + 15, + 26, + 26, + 26, + 22, + 7 + ], + "stanford_ner": [ + "LOCATION", + "TIME", + "TIME", + "TIME", + "MISC", + "O", + "O", + "O", + "O", + "PERSON", + "O", + "O", + "NUMBER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "CD", + "CD", + "NNP", + "NNP", + "NNS", + "RBR", + "VBD", + "IN", + "NNP", + ",", + "VBD", + "CD", + ",", + "VBD", + "VBN", + "JJ", + "IN", + "PRP$", + "NN", + "IN", + "DT", + "NN", + "IN", + "PRP$", + "NNP", + "NN", + "." + ], + "subj_end": 10, + "subj_start": 9, + "subj_type": 2, + "token": [ + "PARIS", + "2009-07-07", + "11:07:32", + "UTC", + "French", + "media", + "earlier", + "reported", + "that", + "Montcourt", + ",", + "ranked", + "119", + ",", + "was", + "found", + "dead", + "by", + "his", + "girlfriend", + "in", + "the", + "stairwell", + "of", + "his", + "Paris", + "apartment", + "." + ] + } +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.relation_names.json new file mode 100644 index 00000000..a83c5a53 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.relation_names.json @@ -0,0 +1,44 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_headquarters", + "org:country_of_headquarters", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:parents", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_headquarters", + "org:subsidiaries", + "org:top_members/employees", + "org:website", + "per:age", + "per:alternate_names", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.samples-3.json new file mode 100644 index 00000000..e713e08f --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/test.samples-3.json @@ -0,0 +1,685 @@ +[ + { + "docid": "eng-NG-31-101172-8859554", + "id": "098f665fb966708cfcd2", + "obj_end": 46, + "obj_start": 43, + "obj_type": 1, + "relation": 0, + "stanford_deprel": [ + "nsubj", + "aux", + "ROOT", + "case", + "det", + "compound", + "nmod", + "case", + "det", + "amod", + "compound", + "compound", + "compound", + "nmod", + "punct", + "case", + "compound", + "compound", + "nmod", + "case", + "det", + "nummod", + "amod", + "nmod", + "case", + "compound", + "compound", + "nmod", + "punct", + "case", + "det", + "nmod", + "case", + "compound", + "nmod", + "case", + "nmod", + "cc", + "case", + "det", + "conj", + "case", + "det", + "amod", + "compound", + "nmod", + "case", + "det", + "compound", + "nmod", + "acl", + "dobj", + "case", + "nmod", + "dep", + "punct" + ], + "stanford_head": [ + 2, + 2, + -1, + 6, + 6, + 6, + 2, + 13, + 13, + 13, + 13, + 13, + 13, + 2, + 2, + 18, + 18, + 18, + 2, + 23, + 23, + 23, + 23, + 18, + 27, + 27, + 27, + 23, + 2, + 31, + 31, + 2, + 34, + 34, + 31, + 36, + 34, + 31, + 40, + 40, + 31, + 45, + 45, + 45, + 45, + 40, + 49, + 49, + 49, + 45, + 40, + 50, + 53, + 51, + 53, + 2 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "DATE", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "PRP", + "VBZ", + "VBN", + "IN", + "DT", + "NN", + "NN", + "TO", + "DT", + "JJ", + "NNP", + "NNP", + "NNP", + "NNP", + ",", + "IN", + "NNP", + "NNP", + "NNP", + "IN", + "DT", + "CD", + "JJ", + "NN", + "IN", + "NNP", + "NNP", + "NNP", + ",", + "IN", + "DT", + "NN", + "IN", + "NNP", + "NNPS", + "IN", + "NNP", + "CC", + "IN", + "DT", + "NN", + "IN", + "DT", + "JJ", + "NN", + "NN", + "IN", + "DT", + "NNP", + "NNP", + "VB", + "NN", + "IN", + "DT", + "DT", + "." + ], + "subj_end": 37, + "subj_start": 33, + "subj_type": 1, + "token": [ + "He", + "has", + "served", + "as", + "a", + "policy", + "aide", + "to", + "the", + "late", + "U.S.", + "Senator", + "Alan", + "Cranston", + ",", + "as", + "National", + "Issues", + "Director", + "for", + "the", + "2004", + "presidential", + "campaign", + "of", + "Congressman", + "Dennis", + "Kucinich", + ",", + "as", + "a", + "co-founder", + "of", + "Progressive", + "Democrats", + "of", + "America", + "and", + "as", + "a", + "member", + "of", + "the", + "international", + "policy", + "department", + "at", + "the", + "RAND", + "Corporation", + "think", + "tank", + "before", + "all", + "that", + "." + ] + }, + { + "docid": "APW_ENG_20090616.0636", + "id": "098f665fb90bef0c4ca4", + "obj_end": 11, + "obj_start": 10, + "obj_type": 8, + "relation": 0, + "stanford_deprel": [ + "compound", + "nsubj", + "aux", + "aux", + "ROOT", + "mark", + "xcomp", + "nmod:poss", + "dobj", + "case", + "nmod", + "punct", + "det", + "compound", + "root", + "punct", + "cc", + "conj", + "det", + "amod", + "xcomp", + "case", + "det", + "compound", + "nmod", + "punct", + "xcomp", + "case", + "det", + "nmod", + "case", + "nmod", + "case", + "nmod", + "nummod", + "punct" + ], + "stanford_head": [ + 1, + 4, + 4, + 4, + -1, + 6, + 4, + 8, + 6, + 10, + 6, + 4, + 14, + 14, + 11, + 14, + 14, + 14, + 20, + 20, + 17, + 24, + 24, + 24, + 20, + 17, + 17, + 29, + 29, + 26, + 31, + 29, + 33, + 29, + 33, + 14 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "O", + "MISC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "DATE", + "DATE", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "VBZ", + "VBN", + "VBG", + "TO", + "VB", + "PRP$", + "NN", + "IN", + "NNP", + ",", + "DT", + "NNP", + "NN", + ",", + "CC", + "VB", + "DT", + "JJ", + "NN", + "IN", + "DT", + "NNP", + "NNP", + ",", + "VBN", + "IN", + "DT", + "NN", + "IN", + "NNP", + "IN", + "NNP", + "CD", + "." + ], + "subj_end": 8, + "subj_start": 7, + "subj_type": 2, + "token": [ + "Messina", + "Denaro", + "has", + "been", + "trying", + "to", + "impose", + "his", + "power", + "in", + "Palermo", + ",", + "the", + "Sicilian", + "capital", + ",", + "and", + "become", + "the", + "new", + "head", + "of", + "the", + "Sicilian", + "Mafia", + ",", + "weakened", + "by", + "the", + "arrest", + "of", + "Provenzano", + "in", + "April", + "2006", + "." + ] + }, + { + "docid": "XIN_ENG_20100801.0069", + "id": "098f665fb9ef7dbc81e7", + "obj_end": 16, + "obj_start": 15, + "obj_type": 9, + "relation": 0, + "stanford_deprel": [ + "compound", + "nsubjpass", + "punct", + "det", + "amod", + "appos", + "case", + "det", + "nmod", + "punct", + "cc", + "compound", + "compound", + "conj", + "case", + "nmod", + "cc", + "compound", + "compound", + "conj", + "case", + "det", + "nmod", + "auxpass", + "ROOT", + "case", + "nmod:poss", + "compound", + "compound", + "nmod", + "advmod", + "punct" + ], + "stanford_head": [ + 1, + 24, + 1, + 5, + 5, + 1, + 8, + 8, + 5, + 1, + 1, + 13, + 13, + 1, + 15, + 13, + 13, + 19, + 19, + 13, + 22, + 22, + 19, + 24, + -1, + 29, + 29, + 29, + 29, + 24, + 29, + 24 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "MISC", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "LOCATION", + "O", + "PERSON", + "PERSON", + "PERSON", + "O", + "O", + "LOCATION", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + ",", + "DT", + "JJ", + "NN", + "IN", + "DT", + "NNP", + ",", + "CC", + "NNS", + "NNP", + "NNP", + "IN", + "NNP", + "CC", + "NNP", + "NNP", + "NNP", + "IN", + "DT", + "NNPS", + "VBD", + "VBN", + "IN", + "PRP$", + "NNP", + "NNP", + "NNS", + "RB", + "." + ], + "subj_end": 2, + "subj_start": 0, + "subj_type": 2, + "token": [ + "Eugenio", + "Vagni", + ",", + "the", + "Italian", + "worker", + "of", + "the", + "ICRC", + ",", + "and", + "colleagues", + "Andreas", + "Notter", + "of", + "Switzerland", + "and", + "Mary", + "Jean", + "Lacaba", + "of", + "the", + "Philippines", + "were", + "released", + "by", + "their", + "Abu", + "Sayyaf", + "captors", + "separately", + "." + ] + } +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.relation_names.json new file mode 100644 index 00000000..a83c5a53 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.relation_names.json @@ -0,0 +1,44 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_headquarters", + "org:country_of_headquarters", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:parents", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_headquarters", + "org:subsidiaries", + "org:top_members/employees", + "org:website", + "per:age", + "per:alternate_names", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.samples-3.json new file mode 100644 index 00000000..64340a00 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/train.samples-3.json @@ -0,0 +1,545 @@ +[ + { + "docid": "AFP_ENG_20070218.0019.LDC2009T13", + "id": "61b3a5c8c9a882dcfcd2", + "obj_end": 2, + "obj_start": 0, + "obj_type": 2, + "relation": 6, + "stanford_deprel": [ + "compound", + "nsubj", + "ROOT", + "case", + "nmod", + "amod", + "nmod:tmod", + "mark", + "xcomp", + "det", + "compound", + "compound", + "dobj", + "punct", + "appos", + "punct", + "punct", + "xcomp", + "det", + "dobj", + "case", + "nummod", + "nmod", + "case", + "nmod", + "punct", + "xcomp", + "amod", + "compound", + "compound", + "compound", + "dobj", + "mark", + "xcomp", + "dobj", + "cc", + "conj", + "det", + "compound", + "dobj", + "punct" + ], + "stanford_head": [ + 1, + 2, + -1, + 4, + 2, + 6, + 2, + 8, + 2, + 12, + 12, + 12, + 8, + 14, + 12, + 14, + 2, + 2, + 19, + 17, + 22, + 22, + 17, + 24, + 22, + 2, + 2, + 31, + 31, + 31, + 31, + 26, + 33, + 26, + 33, + 33, + 33, + 39, + 39, + 36, + 2 + ], + "stanford_ner": [ + "PERSON", + "PERSON", + "O", + "O", + "DATE", + "DATE", + "DATE", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "O", + "O", + "O", + "O", + "NUMBER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "VBD", + "IN", + "NNP", + "JJ", + "NN", + "TO", + "VB", + "DT", + "DT", + "NNP", + "NNP", + "-LRB-", + "NNP", + "-RRB-", + ",", + "VBG", + "DT", + "NN", + "IN", + "CD", + "NNS", + "IN", + "NN", + ",", + "VBG", + "JJ", + "NN", + "NNP", + "NNP", + "NNP", + "TO", + "VB", + "NN", + "CC", + "VB", + "DT", + "NN", + "NN", + "." + ], + "subj_end": 13, + "subj_start": 10, + "subj_type": 1, + "token": [ + "Tom", + "Thabane", + "resigned", + "in", + "October", + "last", + "year", + "to", + "form", + "the", + "All", + "Basotho", + "Convention", + "(", + "ABC", + ")", + ",", + "crossing", + "the", + "floor", + "with", + "17", + "members", + "of", + "parliament", + ",", + "causing", + "constitutional", + "monarch", + "King", + "Letsie", + "III", + "to", + "dissolve", + "parliament", + "and", + "call", + "the", + "snap", + "election", + "." + ] + }, + { + "docid": "NYT_ENG_20071026.0056.LDC2009T13", + "id": "61b3a65fb9b7111c4ca4", + "obj_end": 21, + "obj_start": 19, + "obj_type": 2, + "relation": 0, + "stanford_deprel": [ + "case", + "nmod", + "punct", + "det", + "nmod:tmod", + "case", + "det", + "nmod", + "punct", + "nsubj", + "ROOT", + "det", + "amod", + "punct", + "compound", + "dobj", + "punct", + "case", + "det", + "compound", + "nmod", + "cc", + "compound", + "compound", + "compound", + "conj", + "punct" + ], + "stanford_head": [ + 1, + 10, + 10, + 4, + 10, + 7, + 7, + 4, + 10, + 10, + -1, + 15, + 15, + 15, + 15, + 10, + 15, + 20, + 20, + 20, + 15, + 20, + 25, + 25, + 25, + 20, + 10 + ], + "stanford_ner": [ + "O", + "DATE", + "O", + "DURATION", + "DURATION", + "O", + "O", + "O", + "O", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "ORGANIZATION", + "O" + ], + "stanford_pos": [ + "IN", + "CD", + ",", + "DT", + "NN", + "IN", + "DT", + "NN", + ",", + "NNP", + "VBD", + "DT", + "JJ", + "``", + "NN", + "NN", + "''", + "IN", + "DT", + "NNP", + "NNP", + "CC", + "NNP", + "NNP", + "NNP", + "NNP", + "." + ], + "subj_end": 10, + "subj_start": 9, + "subj_type": 2, + "token": [ + "In", + "1983", + ",", + "a", + "year", + "after", + "the", + "rally", + ",", + "Forsberg", + "received", + "the", + "so-called", + "``", + "genius", + "award", + "''", + "from", + "the", + "John", + "D.", + "and", + "Catherine", + "T.", + "MacArthur", + "Foundation", + "." + ] + }, + { + "docid": "eng-NG-31-126955-9171242", + "id": "61b3a65fb9aeb61c81e7", + "obj_end": 9, + "obj_start": 7, + "obj_type": 1, + "relation": 0, + "stanford_deprel": [ + "nsubj", + "cop", + "case", + "det", + "ROOT", + "case", + "compound", + "compound", + "nmod:poss", + "case", + "nmod", + "mark", + "nsubjpass", + "auxpass", + "dep", + "case", + "det", + "nmod", + "case", + "nmod", + "cc", + "conj", + "case", + "det", + "nmod", + "case", + "nmod", + "punct" + ], + "stanford_head": [ + 4, + 4, + 4, + 4, + -1, + 10, + 8, + 8, + 10, + 8, + 4, + 14, + 14, + 14, + 4, + 17, + 17, + 14, + 19, + 17, + 14, + 14, + 24, + 24, + 21, + 26, + 24, + 4 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "ORGANIZATION", + "O", + "MISC", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "DT", + "VBD", + "IN", + "DT", + "NN", + "IN", + "NN", + "NNP", + "NNP", + "POS", + "NNS", + "IN", + "PRP", + "VBD", + "VBN", + "IN", + "DT", + "NN", + "IN", + "NN", + "CC", + "VBG", + "IN", + "DT", + "NN", + "IN", + "NN", + "." + ], + "subj_end": 27, + "subj_start": 26, + "subj_type": 1, + "token": [ + "This", + "was", + "among", + "a", + "batch", + "of", + "paperback", + "Oxford", + "World", + "'s", + "Classics", + "that", + "I", + "was", + "given", + "as", + "a", + "reward", + "for", + "reading", + "and", + "commenting", + "on", + "a", + "manuscript", + "for", + "OUP", + "." + ] + } +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.ner_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.ner_names.json new file mode 100644 index 00000000..7ac54785 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.ner_names.json @@ -0,0 +1,26 @@ +[ + "LOCATION", + "ORGANIZATION", + "PERSON", + "DATE", + "MONEY", + "PERCENT", + "TIME", + "CAUSE_OF_DEATH", + "CITY", + "COUNTRY", + "CRIMINAL_CHARGE", + "EMAIL", + "HANDLE", + "IDEOLOGY", + "NATIONALITY", + "RELIGION", + "STATE_OR_PROVINCE", + "TITLE", + "URL", + "NUMBER", + "ORDINAL", + "MISC", + "DURATION", + "O" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.relation_names.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.relation_names.json new file mode 100644 index 00000000..a83c5a53 --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.relation_names.json @@ -0,0 +1,44 @@ +[ + "no_relation", + "org:alternate_names", + "org:city_of_headquarters", + "org:country_of_headquarters", + "org:dissolved", + "org:founded", + "org:founded_by", + "org:member_of", + "org:members", + "org:number_of_employees/members", + "org:parents", + "org:political/religious_affiliation", + "org:shareholders", + "org:stateorprovince_of_headquarters", + "org:subsidiaries", + "org:top_members/employees", + "org:website", + "per:age", + "per:alternate_names", + "per:cause_of_death", + "per:charges", + "per:children", + "per:cities_of_residence", + "per:city_of_birth", + "per:city_of_death", + "per:countries_of_residence", + "per:country_of_birth", + "per:country_of_death", + "per:date_of_birth", + "per:date_of_death", + "per:employee_of", + "per:origin", + "per:other_family", + "per:parents", + "per:religion", + "per:schools_attended", + "per:siblings", + "per:spouse", + "per:stateorprovince_of_birth", + "per:stateorprovince_of_death", + "per:stateorprovinces_of_residence", + "per:title" +] diff --git a/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.samples-3.json b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.samples-3.json new file mode 100644 index 00000000..5c3f131c --- /dev/null +++ b/tests/fixtures/dataset_builders/hf/DFKI-SLT/tacred/revisited/validation.samples-3.json @@ -0,0 +1,505 @@ +[ + { + "docid": "APW_ENG_20101103.0539", + "id": "e7798fb926b9403cfcd2", + "obj_end": 13, + "obj_start": 12, + "obj_type": 17, + "relation": 41, + "stanford_deprel": [ + "case", + "det", + "amod", + "nmod", + "punct", + "compound", + "compound", + "compound", + "compound", + "nsubj", + "aux", + "ROOT", + "xcomp", + "punct", + "xcomp", + "compound", + "dobj", + "nsubj", + "aux", + "acl:relcl", + "mark", + "xcomp", + "det", + "compound", + "dobj", + "punct" + ], + "stanford_head": [ + 3, + 3, + 3, + 11, + 11, + 9, + 9, + 9, + 9, + 11, + 11, + -1, + 11, + 11, + 11, + 16, + 14, + 19, + 19, + 16, + 21, + 19, + 24, + 24, + 21, + 11 + ], + "stanford_ner": [ + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "IN", + "DT", + "JJ", + "NN", + ",", + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "MD", + "VB", + "NN", + ",", + "VBG", + "NNP", + "NNP", + "WP", + "VBZ", + "VBG", + "TO", + "VB", + "DT", + "NN", + "NN", + "." + ], + "subj_end": 10, + "subj_start": 8, + "subj_type": 2, + "token": [ + "At", + "the", + "same", + "time", + ",", + "Chief", + "Financial", + "Officer", + "Douglas", + "Flint", + "will", + "become", + "chairman", + ",", + "succeeding", + "Stephen", + "Green", + "who", + "is", + "leaving", + "to", + "take", + "a", + "government", + "job", + "." + ] + }, + { + "docid": "APW_ENG_20080229.1401.LDC2009T13", + "id": "e779865fb96bbbcc4ca4", + "obj_end": 6, + "obj_start": 4, + "obj_type": 2, + "relation": 0, + "stanford_deprel": [ + "compound", + "compound", + "compound", + "compound", + "compound", + "nsubj", + "case", + "nmod", + "ROOT", + "det", + "dobj", + "case", + "nmod", + "mark", + "det", + "amod", + "compound", + "compound", + "nsubj", + "advcl", + "det", + "dobj", + "mark", + "advcl", + "amod", + "compound", + "dobj", + "acl", + "case", + "det", + "amod", + "amod", + "nmod", + "punct" + ], + "stanford_head": [ + 5, + 5, + 5, + 5, + 5, + 8, + 7, + 5, + -1, + 10, + 8, + 12, + 10, + 19, + 18, + 18, + 18, + 18, + 19, + 8, + 21, + 19, + 23, + 19, + 26, + 26, + 23, + 26, + 32, + 32, + 32, + 32, + 27, + 8 + ], + "stanford_ner": [ + "LOCATION", + "O", + "O", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "ORGANIZATION", + "O", + "O", + "MISC", + "O", + "PERSON", + "PERSON", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "NNP", + "IN", + "NNP", + "VBD", + "DT", + "NN", + "IN", + "NNP", + "IN", + "DT", + "JJ", + "NNP", + "NNP", + "NNP", + "VBD", + "DT", + "NN", + "IN", + "VBG", + "JJ", + "NN", + "NN", + "VBN", + "IN", + "DT", + "JJ", + "JJ", + "NN", + "." + ], + "subj_end": 19, + "subj_start": 17, + "subj_type": 2, + "token": [ + "U.S.", + "District", + "Court", + "Judge", + "Jeffrey", + "White", + "in", + "mid-February", + "issued", + "an", + "injunction", + "against", + "Wikileaks", + "after", + "the", + "Zurich-based", + "Bank", + "Julius", + "Baer", + "accused", + "the", + "site", + "of", + "posting", + "sensitive", + "account", + "information", + "stolen", + "by", + "a", + "disgruntled", + "former", + "employee", + "." + ] + }, + { + "docid": "APW_ENG_20090707.0488", + "id": "e7798ae9c0adbcdc81e7", + "obj_end": 1, + "obj_start": 0, + "obj_type": 8, + "relation": 24, + "stanford_deprel": [ + "compound", + "nummod", + "nummod", + "compound", + "compound", + "nsubj", + "advmod", + "ROOT", + "mark", + "nsubjpass", + "punct", + "acl", + "dobj", + "punct", + "auxpass", + "ccomp", + "xcomp", + "case", + "nmod:poss", + "nmod", + "case", + "det", + "nmod", + "case", + "nmod:poss", + "compound", + "nmod", + "punct" + ], + "stanford_head": [ + 5, + 5, + 5, + 5, + 5, + 7, + 7, + -1, + 15, + 15, + 9, + 9, + 11, + 9, + 15, + 7, + 15, + 19, + 19, + 16, + 22, + 22, + 15, + 26, + 26, + 26, + 22, + 7 + ], + "stanford_ner": [ + "LOCATION", + "TIME", + "TIME", + "TIME", + "MISC", + "O", + "O", + "O", + "O", + "PERSON", + "O", + "O", + "NUMBER", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "O", + "LOCATION", + "O", + "O" + ], + "stanford_pos": [ + "NNP", + "CD", + "CD", + "NNP", + "NNP", + "NNS", + "RBR", + "VBD", + "IN", + "NNP", + ",", + "VBD", + "CD", + ",", + "VBD", + "VBN", + "JJ", + "IN", + "PRP$", + "NN", + "IN", + "DT", + "NN", + "IN", + "PRP$", + "NNP", + "NN", + "." + ], + "subj_end": 10, + "subj_start": 9, + "subj_type": 2, + "token": [ + "PARIS", + "2009-07-07", + "11:07:32", + "UTC", + "French", + "media", + "earlier", + "reported", + "that", + "Montcourt", + ",", + "ranked", + "119", + ",", + "was", + "found", + "dead", + "by", + "his", + "girlfriend", + "in", + "the", + "stairwell", + "of", + "his", + "Paris", + "apartment", + "." + ] + } +] From 4261d42fb662ccd986daa54e32643da5727de805 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 24 Oct 2023 20:54:59 +0200 Subject: [PATCH 3/5] check the document type --- tests/dataset_builders/pie/test_tacred.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_tacred.py b/tests/dataset_builders/pie/test_tacred.py index a46184c8..4c0accb5 100644 --- a/tests/dataset_builders/pie/test_tacred.py +++ b/tests/dataset_builders/pie/test_tacred.py @@ -121,7 +121,7 @@ def document(hf_example, ner_names, relation_names): def test_document(document): assert document is not None - assert isinstance(document, Document) + assert isinstance(document, Tacred.DOCUMENT_TYPE) def test_example_to_document_and_back(hf_example, ner_names, relation_names): @@ -150,6 +150,7 @@ def test_example_to_document_and_back_all(hf_dataset): ner_int2str=lambda idx: ner_names[idx], relation_int2str=lambda idx: relation_names[idx], ) + assert isinstance(doc, Tacred.DOCUMENT_TYPE) example_back = document_to_example( doc, ner_names=ner_names, relation_names=relation_names ) @@ -171,6 +172,10 @@ def test_pie_document_all(dataset_variant): for split, ds in pie_dataset.items(): for doc in ds: assert doc is not None + # Note: we don't check the actual type of the document here, because the real type + # comes from the dataset builder script which Huggingface load_dataset() copies + # to a temporary directory and then imports. This means that the type of the document + # is not the same as the type of the document in the original dataset builder script. assert isinstance(doc, Document) From 6b59d5c4c5d353f491842fe6a74f8f32cd49a579 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 24 Oct 2023 20:55:09 +0200 Subject: [PATCH 4/5] use Tacred.BASE_DATASET_PATH --- tests/dataset_builders/pie/test_tacred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataset_builders/pie/test_tacred.py b/tests/dataset_builders/pie/test_tacred.py index 4c0accb5..8a9c7217 100644 --- a/tests/dataset_builders/pie/test_tacred.py +++ b/tests/dataset_builders/pie/test_tacred.py @@ -22,8 +22,8 @@ logger = logging.getLogger(__name__) -HF_DATASET_PATH = "DFKI-SLT/tacred" PIE_DATASET_PATH = f"{PIE_BASE_PATH}/tacred" +HF_DATASET_PATH = Tacred.BASE_DATASET_PATH SPLITS = ["train", "validation", "test"] EXAMPLE_IDX = 0 NUM_SAMPLES = 3 From 8986e3ba2743dc3f1ed17073d0727a5757e2e491 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Tue, 24 Oct 2023 20:56:51 +0200 Subject: [PATCH 5/5] improve splits constant / fixture --- tests/dataset_builders/pie/test_tacred.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/dataset_builders/pie/test_tacred.py b/tests/dataset_builders/pie/test_tacred.py index 8a9c7217..70463120 100644 --- a/tests/dataset_builders/pie/test_tacred.py +++ b/tests/dataset_builders/pie/test_tacred.py @@ -24,7 +24,7 @@ PIE_DATASET_PATH = f"{PIE_BASE_PATH}/tacred" HF_DATASET_PATH = Tacred.BASE_DATASET_PATH -SPLITS = ["train", "validation", "test"] +SPLIT_NAMES = {"train", "validation", "test"} EXAMPLE_IDX = 0 NUM_SAMPLES = 3 @@ -38,8 +38,8 @@ def dataset_variant(request): return request.param -@pytest.fixture(params=SPLITS, scope="module") -def split(request): +@pytest.fixture(params=SPLIT_NAMES, scope="module") +def split_name(request): return request.param @@ -68,12 +68,12 @@ def hf_dataset(dataset_variant): @pytest.fixture(scope="module") def hf_dataset_samples(hf_samples_fn): - data_files = {split: hf_samples_fn.format(split=split) for split in SPLITS} + data_files = {split: hf_samples_fn.format(split=split) for split in SPLIT_NAMES} return load_dataset("json", data_files=data_files) def test_hf_dataset_samples(hf_dataset_samples): - assert set(hf_dataset_samples) == {"train", "validation", "test"} + assert set(hf_dataset_samples) == SPLIT_NAMES for ds in hf_dataset_samples.values(): assert len(ds) == NUM_SAMPLES @@ -96,18 +96,18 @@ def test_dump_hf(hf_dataset, hf_samples_fn, hf_metadata_fn): @pytest.fixture(params=range(NUM_SAMPLES), scope="module") -def hf_example(hf_dataset_samples, split, request): - return hf_dataset_samples[split][request.param] +def hf_example(hf_dataset_samples, split_name, request): + return hf_dataset_samples[split_name][request.param] @pytest.fixture(scope="module") -def ner_names(hf_metadata_fn, split): - return _load_json(hf_metadata_fn.format(split=split, idx_or_feature="ner_names")) +def ner_names(hf_metadata_fn, split_name): + return _load_json(hf_metadata_fn.format(split=split_name, idx_or_feature="ner_names")) @pytest.fixture(scope="module") -def relation_names(hf_metadata_fn, split): - return _load_json(hf_metadata_fn.format(split=split, idx_or_feature="relation_names")) +def relation_names(hf_metadata_fn, split_name): + return _load_json(hf_metadata_fn.format(split=split_name, idx_or_feature="relation_names")) @pytest.fixture(scope="module")