-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_wikiann_datasets.py
415 lines (397 loc) · 11.8 KB
/
load_wikiann_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The WikiANN dataset for multilingual named entity recognition"""
import os
import datasets
# BibTeX entry for Pan et al. (2017); passed to `datasets.DatasetInfo(citation=...)`
# in `Wikiann._info`.
_CITATION = """@inproceedings{pan-etal-2017-cross,
title = "Cross-lingual Name Tagging and Linking for 282 Languages",
author = "Pan, Xiaoman and
Zhang, Boliang and
May, Jonathan and
Nothman, Joel and
Knight, Kevin and
Ji, Heng",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1178",
doi = "10.18653/v1/P17-1178",
pages = "1946--1958",
abstract = "The ambitious goal of this work is to develop a cross-lingual name tagging and linking framework for 282 languages that exist in Wikipedia. Given a document in any of these languages, our framework is able to identify name mentions, assign a coarse-grained or fine-grained type to each mention, and link it to an English Knowledge Base (KB) if it is linkable. We achieve this goal by performing a series of new KB mining methods: generating {``}silver-standard{''} annotations by transferring annotations from English to other languages through cross-lingual links and KB properties, refining annotations through self-training and topic selection, deriving language-specific morphology features from anchor links, and mining word translation pairs from cross-lingual links. Both name tagging and linking results for 282 languages are promising on Wikipedia data and on-Wikipedia data.",
}"""
# Human-readable summary of the dataset; passed to
# `datasets.DatasetInfo(description=...)` in `Wikiann._info`.
_DESCRIPTION = """WikiANN (sometimes called PAN-X) is a multilingual named entity recognition dataset consisting of Wikipedia articles annotated with LOC (location), PER (person), and ORG (organisation) tags in the IOB2 format. This version corresponds to the balanced train, dev, and test splits of Rahimi et al. (2019), which supports 176 of the 282 languages from the original WikiANN corpus."""
# Zip archive with the WikiANN data; downloaded and extracted in
# `Wikiann._split_generators`, which then looks for `<lang>.tar.gz` inside it.
_DATA_URL = (
"https://s3.amazonaws.com/datasets.huggingface.co/wikiann/1.1.0/panx_dataset.zip"
)
# Reported as the dataset homepage in `Wikiann._info`.
_HOMEPAGE = "https://github.com/afshinrahimi/mmner"
# Version string shared by the builder (`Wikiann.VERSION`) and every `WikiannConfig`.
_VERSION = "1.1.0"
# Language codes for which a builder config is created (176 entries). Each code
# is used both as the config name and as the stem of the per-language
# `<code>.tar.gz` archive opened in `Wikiann._split_generators`.
_LANGS = [
"ace",
"af",
"als",
"am",
"an",
"ang",
"ar",
"arc",
"arz",
"as",
"ast",
"ay",
"az",
"ba",
"bar",
"bat-smg",
"be",
"be-x-old",
"bg",
"bh",
"bn",
"bo",
"br",
"bs",
"ca",
"cbk-zam",
"cdo",
"ce",
"ceb",
"ckb",
"co",
"crh",
"cs",
"csb",
"cv",
"cy",
"da",
"de",
"diq",
"dv",
"el",
"eml",
"en",
"eo",
"es",
"et",
"eu",
"ext",
"fa",
"fi",
"fiu-vro",
"fo",
"fr",
"frr",
"fur",
"fy",
"ga",
"gan",
"gd",
"gl",
"gn",
"gu",
"hak",
"he",
"hi",
"hr",
"hsb",
"hu",
"hy",
"ia",
"id",
"ig",
"ilo",
"io",
"is",
"it",
"ja",
"jbo",
"jv",
"ka",
"kk",
"km",
"kn",
"ko",
"ksh",
"ku",
"ky",
"la",
"lb",
"li",
"lij",
"lmo",
"ln",
"lt",
"lv",
"map-bms",
"mg",
"mhr",
"mi",
"min",
"mk",
"ml",
"mn",
"mr",
"ms",
"mt",
"mwl",
"my",
"mzn",
"nap",
"nds",
"ne",
"nl",
"nn",
"no",
"nov",
"oc",
"or",
"os",
"pa",
"pdc",
"pl",
"pms",
"pnb",
"ps",
"pt",
"qu",
"rm",
"ro",
"ru",
"rw",
"sa",
"sah",
"scn",
"sco",
"sd",
"sh",
"si",
"simple",
"sk",
"sl",
"so",
"sq",
"sr",
"su",
"sv",
"sw",
"szl",
"ta",
"te",
"tg",
"th",
"tk",
"tl",
"tr",
"tt",
"ug",
"uk",
"ur",
"uz",
"vec",
"vep",
"vi",
"vls",
"vo",
"wa",
"war",
"wuu",
"xmf",
"yi",
"yo",
"zea",
"zh",
"zh-classical",
"zh-min-nan",
"zh-yue",
]
class WikiannConfig(datasets.BuilderConfig):
    """Builder config for one WikiANN language, pinned to the module's dataset version."""

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``datasets.BuilderConfig`` with ``version`` fixed.

        Args:
            **kwargs: Keyword arguments handed unchanged to the parent
                constructor (the builder supplies ``name`` and ``description``).
        """
        pinned_version = datasets.Version(_VERSION, "")
        super().__init__(version=pinned_version, **kwargs)
class Wikiann(datasets.GeneratorBasedBuilder):
    """WikiANN is a multilingual named entity recognition dataset consisting of Wikipedia articles annotated with LOC, PER, and ORG tags"""

    VERSION = datasets.Version(_VERSION)
    # One config per entry of _LANGS, named after the language code itself
    # (e.g. "en", "ace", "zh-min-nan"); many of these are not two-letter
    # ISO 639-1 codes.
    BUILDER_CONFIGS = [
        WikiannConfig(name=lang, description=f"WikiANN NER examples in language {lang}")
        for lang in _LANGS
    ]

    def _tags_to_spans(self, tags):
        """Convert a BIO tag sequence into labelled token spans.

        Args:
            tags: Sequence of tag strings such as ``"B-PER"``, ``"I-PER"`` or
                ``"O"``. The first character is the BIO marker and everything
                from the third character on is the entity label.

        Returns:
            A list of ``(label, (start, end))`` tuples sorted by ``start``.
            ``start`` and ``end`` are inclusive indices into ``tags``.

        Raises:
            AssertionError: If a tag does not start with ``B``, ``I`` or ``O``.
        """
        spans = set()
        span_start = 0
        span_end = 0
        active_conll_tag = None
        for index, string_tag in enumerate(tags):
            # Actual BIO tag.
            bio_tag = string_tag[0]
            if bio_tag not in ("B", "I", "O"):
                # Raised explicitly (same type and message as the former
                # `assert`) so the check is not stripped under `python -O`.
                raise AssertionError("Invalid Tag")
            conll_tag = string_tag[2:]

            if bio_tag == "O":
                # The span has ended.
                if active_conll_tag:
                    spans.add((active_conll_tag, (span_start, span_end)))
                active_conll_tag = None
                continue

            if bio_tag == "I" and conll_tag == active_conll_tag:
                # We're inside a span.
                span_end += 1
                continue

            # Either a "B" tag, or an "I" tag that does not continue the active
            # span (ill-formed span or IOB1 scheme). In both cases the previous
            # span is closed and a new one starts here; keeping the ill-formed
            # span matters because otherwise a model could get a perfect F1
            # score whilst still producing false-positive ill-formed spans.
            if active_conll_tag:
                spans.add((active_conll_tag, (span_start, span_end)))
            active_conll_tag = conll_tag
            span_start = index
            span_end = index

        # Last token might have been a part of a valid span.
        if active_conll_tag:
            spans.add((active_conll_tag, (span_start, span_end)))
        # Each span has a distinct start index, so this ordering is total.
        return sorted(spans, key=lambda span: span[1][0])

    def _get_spans(self, tokens, tags):
        """Render the entity spans of one sentence as ``"LABEL: text"`` strings.

        Args:
            tokens: Tokens of the sentence.
            tags: BIO tags aligned one-to-one with ``tokens``.

        Returns:
            One ``"<label>: <space-joined tokens>"`` string per span, in order
            of appearance, or ``["None"]`` when the sentence has no spans.
        """
        text_spans = [
            label + ": " + " ".join(tokens[start : end + 1])
            for label, (start, end) in self._tags_to_spans(tags)
        ]
        if not text_spans:
            text_spans = ["None"]
        return text_spans

    def _info(self):
        """Return the ``DatasetInfo`` describing the example schema and metadata."""
        features = datasets.Features(
            {
                "tokens": datasets.Sequence(datasets.Value("string")),
                "ner_tags": datasets.Sequence(
                    datasets.features.ClassLabel(
                        names=[
                            "O",
                            "B-PER",
                            "I-PER",
                            "B-ORG",
                            "I-ORG",
                            "B-LOC",
                            "I-LOC",
                        ]
                    )
                ),
                "langs": datasets.Sequence(datasets.Value("string")),
                "spans": datasets.Sequence(datasets.Value("string")),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the data and describe the validation, test and train splits.

        Args:
            dl_manager: Download manager used to fetch and extract ``_DATA_URL``
                and to iterate over the per-language archive.

        Returns:
            A list of three ``datasets.SplitGenerator`` objects (validation,
            test, train), each configured with the archive member name to read
            and its own iterator over the language archive.
        """
        wikiann_dl_dir = dl_manager.download_and_extract(_DATA_URL)
        lang_archive = os.path.join(wikiann_dl_dir, self.config.name + ".tar.gz")
        split_members = (
            (datasets.Split.VALIDATION, "dev"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.TRAIN, "train"),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={
                    "filepath": member,
                    # A fresh iterator per split: each one is consumed
                    # independently by `_generate_examples`.
                    "files": dl_manager.iter_archive(lang_archive),
                },
            )
            for split, member in split_members
        ]

    def _make_example(self, tokens, ner_tags, langs):
        """Assemble the example dict for one sentence, including its text spans."""
        return {
            "tokens": tokens,
            "ner_tags": ner_tags,
            "langs": langs,
            "spans": self._get_spans(tokens, ner_tags),
        }

    def _generate_examples(self, filepath, files):
        """Read the line-by-line NER format and yield one example per sentence.

        Input format (token and tag are tab separated, sentences are separated
        by blank lines):
            en:rick     B-PER
            en:and      O
            en:morty    B-PER
            en:are      O
            en:cool     O
            en:.        O

        Output format:
            {
                'tokens': ["rick", "and", "morty", "are", "cool", "."],
                'ner_tags': ["B-PER", "O" , "B-PER", "O", "O", "O"],
                'langs': ["en", "en", "en", "en", "en", "en"]
                'spans': ["PER: rick", "PER: morty"]
            }

        Args:
            filepath: Name of the archive member to read (``"dev"``, ``"test"``
                or ``"train"`` as set up in ``_split_generators``).
            files: Iterable of ``(path, file_object)`` pairs; iterating the
                file object must yield UTF-8 encoded ``bytes`` lines.

        Yields:
            ``(guid, example)`` pairs with ``guid`` counting up from 1 and
            ``example`` in the format listed above. Only the first member whose
            path equals ``filepath`` is read.
        """
        guid_index = 1
        for path, f in files:
            if path != filepath:
                continue

            tokens, ner_tags, langs = [], [], []
            for raw_line in f:
                # Drop the line terminator up front so that neither tokens nor
                # tags carry a stray "\n" / "\r", and "\r\n" counts as blank.
                line = raw_line.decode("utf-8").rstrip("\r\n")
                if not line:
                    # Blank line: sentence boundary.
                    if tokens:
                        yield guid_index, self._make_example(tokens, ner_tags, langs)
                        guid_index += 1
                        # Rebind instead of clearing: the yielded example
                        # still references the previous lists.
                        tokens, ner_tags, langs = [], [], []
                    continue

                # wikiann data is tab separated
                splits = line.split("\t")
                # strip out en: prefix (only the first ":" separates the
                # language code; later colons belong to the token)
                lang, _, token = splits[0].partition(":")
                langs.append(lang)
                tokens.append(token)
                if len(splits) > 1:
                    ner_tags.append(splits[-1])
                else:
                    # examples have no label in test set
                    ner_tags.append("O")

            # Flush the final sentence when the file does not end with a
            # blank line; previously it was silently dropped.
            if tokens:
                yield guid_index, self._make_example(tokens, ner_tags, langs)
            break