generated from ArneBinder/pytorch-ie-hydra-template-1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconll2003.py
50 lines (36 loc) · 1.59 KB
/
conll2003.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from dataclasses import dataclass
import datasets
from pie_datasets import GeneratorBasedBuilder
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextBasedDocument, TextDocumentWithLabeledSpans
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans
@dataclass
class CoNLL2003Document(TextBasedDocument):
    """Text-based document that carries one layer of labeled entity spans.

    Adds a single annotation layer, ``entities``, on top of whatever
    ``TextBasedDocument`` already provides.
    """

    # Labeled spans of this document; the layer targets the ``text`` field.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
class Conll2003(GeneratorBasedBuilder):
    """Dataset builder producing ``CoNLL2003Document`` objects from ``conll2003`` examples.

    Each example is expected to provide the keys ``"id"``, ``"tokens"`` and
    ``"ner_tags"``; these are the only ones read by ``_generate_document``.
    """

    DOCUMENT_TYPE = CoNLL2003Document

    # Identifier of the base dataset and the revision string used with it
    # (presumably a pinned commit hash of the base dataset -- confirm upstream).
    BASE_DATASET_PATH = "conll2003"
    BASE_DATASET_REVISION = "01ad4ad271976c5258b9ed9b910469a806ff3288"

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="conll2003",
            version=datasets.Version("1.0.0"),
            description="CoNLL2003 dataset",
        ),
    ]

    DOCUMENT_CONVERTERS = {
        # For this target type the only change is the layer name:
        # "entities" becomes "labeled_spans".
        TextDocumentWithLabeledSpans: {"entities": "labeled_spans"},
    }

    def _generate_document_kwargs(self, dataset):
        """Return the keyword arguments needed to build documents from ``dataset``.

        The result holds one entry, ``"int_to_str"``: the ``int2str`` callable
        of the feature behind the ``"ner_tags"`` column. The key matches the
        ``int_to_str`` parameter of ``_generate_document`` (presumably the base
        builder forwards it there -- confirm against ``GeneratorBasedBuilder``).
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Convert one base-dataset example into a ``CoNLL2003Document``.

        Args:
            example: Mapping with the keys ``"id"``, ``"tokens"`` and ``"ner_tags"``.
            int_to_str: Callable turning a single tag id into its string label.

        Returns:
            A ``CoNLL2003Document`` whose ``entities`` layer holds the spans
            returned by ``tokens_and_tags_to_text_and_labeled_spans``, appended
            in ascending order of their ``start`` attribute.
        """
        # Read the fields in a fixed order: id, tokens, then tags.
        example_id = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        # The helper yields both the document text and the labeled spans.
        text, ner_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=tag_names
        )

        result = CoNLL2003Document(text=text, id=example_id)

        # Stable sort by start offset, so spans with equal starts keep their order.
        ordered_spans = list(ner_spans)
        ordered_spans.sort(key=lambda labeled_span: labeled_span.start)
        for entity in ordered_spans:
            result.entities.append(entity)

        return result