Skip to content

Commit

Permalink
Merge pull request #868 from xxyzz/nl
Browse files Browse the repository at this point in the history
[nl] extract more linkage data, hyphenation section, italic tag in gloss list
  • Loading branch information
xxyzz authored Oct 14, 2024
2 parents 07d18d5 + 3056257 commit 6b5ad0b
Show file tree
Hide file tree
Showing 9 changed files with 140 additions and 8 deletions.
18 changes: 15 additions & 3 deletions src/wiktextract/extractor/nl/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def extract_linkage_section(
sense_index = 0
elif node.template_name.startswith("nld-"):
extract_nld_template(wxr, word_entry, node, linkage_type)
elif node.template_name == "expr":
elif node.template_name in ["expr", "fras"]:
extract_expr_template(wxr, word_entry, node, linkage_type)
elif isinstance(node, WikiNode):
if node.kind == NodeKind.LINK:
Expand Down Expand Up @@ -78,6 +78,11 @@ def extract_linkage_list_item(
m = re.search(r"\[(\d+)\]", node)
if m is not None:
sense_index = int(m.group(1))
elif node.strip().startswith("="):
sense = node.strip().removeprefix("=").strip()
linkage_list = getattr(word_entry, linkage_type)
if len(linkage_list) > 0:
linkage_list[-1].sense = sense
elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
word = clean_node(wxr, None, node)
if word != "":
Expand Down Expand Up @@ -124,12 +129,19 @@ def extract_expr_template(
linkage_type: str,
) -> None:
# https://nl.wiktionary.org/wiki/Sjabloon:expr
# https://nl.wiktionary.org/wiki/Sjabloon:fras
sense_index_str = t_node.template_parameters.get("n", "")
sense_index = 0
if re.fullmatch(r"\d+", sense_index_str) is not None:
sense_index = int(sense_index_str)
sense = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
word = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
sense_arg = 2 if t_node.template_name == "expr" else 3
word_arg = 1 if t_node.template_name == "expr" else 2
sense = clean_node(wxr, None, t_node.template_parameters.get(sense_arg, ""))
word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
m = re.match(r"\[?(\d+)\]?", word)
if m is not None: # should use "n" arg
sense_index = int(m.group(1))
word = word[m.end() :].strip()
if word != "":
getattr(word_entry, linkage_type).append(
Linkage(word=word, sense=sense, sense_index=sense_index)
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/nl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,4 @@ class WordEntry(DutchBaseModel):
rhymes: list[Linkage] = []
synonyms: list[Linkage] = []
translations: list[Translation] = []
hyphenation: str = ""
6 changes: 5 additions & 1 deletion src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import extract_sound_section
from .sound import extract_hyphenation_section, extract_sound_section
from .translation import extract_translation_section


Expand Down Expand Up @@ -52,6 +52,10 @@ def parse_section(
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "Woordafbreking":
extract_hyphenation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand Down
26 changes: 22 additions & 4 deletions src/wiktextract/extractor/nl/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)
from .models import AltForm, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags

FORM_OF_TEMPLATES = frozenset(["noun-pl", "noun-form"])

Expand Down Expand Up @@ -57,28 +58,45 @@ def extract_pos_section(
extract_form_of_template(wxr, page_data[-1], node)


# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# Template names whose cleaned expansion is appended to `sense.raw_tags`
# directly, without the "(...)" wrapper check applied to other templates.
GLOSS_TAG_TEMPLATES = frozenset(["auxl", "erga", "inerg"])


def extract_gloss_list_item(
wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
# Build one `Sense` from a gloss list item: walk the item's children,
# routing each child either to `sense.raw_tags` or to the gloss text, pass
# items of "*" sub-lists to `extract_example_list_item`, translate known raw
# tags and append the sense to `word_entry.senses`.
# NOTE(review): indentation is lost in this view, so the exact nesting of
# the statements below (e.g. whether the final append is conditional on a
# non-empty gloss) cannot be read from here — confirm against the repository.
sense = Sense()
gloss_nodes = []
for child in list_item.children:
if isinstance(child, TemplateNode):
# NOTE(review): this is a rendered diff without +/- markers. The next three
# statements, and the first `gloss_nodes.append(expanded_text)` after the
# `else:`, look like the pre-change side of the hunk (the page header
# reports 4 deletions for this file) — TODO confirm before reusing this text.
expanded_text = clean_node(wxr, sense, child)
if expanded_text.startswith("(") and expanded_text.endswith(")"):
sense.raw_tags.append(expanded_text.strip("() "))
# Templates listed in GLOSS_TAG_TEMPLATES contribute their cleaned text as a
# raw tag.
if child.template_name in GLOSS_TAG_TEMPLATES:
sense.raw_tags.append(clean_node(wxr, sense, child))
else:
gloss_nodes.append(expanded_text)
# Any other template: text fully wrapped in parentheses is a raw tag (with
# the parentheses and spaces stripped); otherwise it is part of the gloss.
expanded_text = clean_node(wxr, sense, child)
if expanded_text.startswith("(") and expanded_text.endswith(
")"
):
sense.raw_tags.append(expanded_text.strip("() "))
else:
gloss_nodes.append(expanded_text)
elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
# Only sub-lists whose marker ends with "*" are processed here; their
# items are handed to extract_example_list_item.
if child.sarg.endswith("*"):
for next_list_item in child.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, next_list_item)
elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
# Italic text gets the same parenthesis test as template expansions.
italic_text = clean_node(wxr, sense, child)
if italic_text.startswith("(") and italic_text.endswith(")"):
sense.raw_tags.append(italic_text.strip("() "))
else:
gloss_nodes.append(italic_text)
else:
gloss_nodes.append(child)

# The collected nodes/strings are cleaned together into a single gloss string.
gloss_text = clean_node(wxr, sense, gloss_nodes)
if len(gloss_text) > 0:
sense.glosses.append(gloss_text)
# Move raw tags that have a known translation into `sense.tags`.
translate_raw_tags(sense)
word_entry.senses.append(sense)


Expand Down
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/nl/sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,11 @@ def extract_pron_reg_template(
)
for link_node in expanded_node.find_child_recursively(NodeKind.LINK):
sound.raw_tags.append(clean_node(wxr, None, link_node))


def extract_hyphenation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Store the text of the section's first list item as the hyphenation.

    Only the first ``LIST_ITEM`` found (searching recursively below
    ``level_node``) is used; when the section has no list item,
    ``word_entry.hyphenation`` is left untouched.
    """
    list_items = level_node.find_child_recursively(NodeKind.LIST_ITEM)
    first_item = next(iter(list_items), None)
    if first_item is None:
        return
    word_entry.hyphenation = clean_node(wxr, None, first_item.children)
19 changes: 19 additions & 0 deletions src/wiktextract/extractor/nl/tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from .models import Sense, WordEntry

# Dutch verb labels produced by gloss templates, mapped to English tags.
VERB_TAGS = {
    "ergatief": "ergative",  # Sjabloon:erga
    "inergatief": "unergative",  # Sjabloon:inerg
    "hulpwerkwoord": "auxiliary",  # Sjabloon:auxl
}

# Combined raw-tag -> tag lookup table consulted by translate_raw_tags().
TAGS = {**VERB_TAGS}


def translate_raw_tags(data: "Sense | WordEntry") -> None:
    """Move every raw tag that has a known translation into ``data.tags``.

    Each entry of ``data.raw_tags`` found in ``TAGS`` is replaced by its
    translated tag, appended to ``data.tags`` in the original order.
    Unknown raw tags are kept, in order, in ``data.raw_tags``.

    Args:
        data: Object exposing mutable ``raw_tags`` and ``tags`` lists. The
            gloss extractor passes a ``Sense``, so the annotation is not
            restricted to ``WordEntry``.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        tag = TAGS.get(raw_tag)
        if tag is None:
            untranslated.append(raw_tag)
        else:
            data.tags.append(tag)
    # Rebind rather than mutate in place, matching the previous behaviour.
    data.raw_tags = untranslated
23 changes: 23 additions & 0 deletions tests/test_nl_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,26 @@ def test_noun_form(self):
"tags": ["form-of", "plural"],
},
)

def test_italic_tag(self):
    """Italic "(...)" text becomes a raw tag; {{erga}} becomes a tag."""
    self.wxr.wtp.add_page(
        "Sjabloon:erga",
        10,
        """<span>[[WikiWoordenboek:Werkwoord#Ergativiteit|ergatief]]</span>[[Categorie:Ergatief werkwoord in het Nederlands]] """,
    )
    wikitext = """==Nederlands==
====Werkwoord====
#''(Noord-Nederlands)'' {{erga|nld}} stappen, gaan, wandelen"""
    page_data = parse_page(self.wxr, "lopen", wikitext)
    expected_sense = {
        "categories": ["Ergatief werkwoord in het Nederlands"],
        "tags": ["ergative"],
        "raw_tags": ["Noord-Nederlands"],
        "glosses": ["stappen, gaan, wandelen"],
    }
    self.assertEqual(page_data[0]["senses"][0], expected_sense)
35 changes: 35 additions & 0 deletions tests/test_nl_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,38 @@ def test_expr_template(self):
}
],
)

data = parse_page(
self.wxr,
"lopen",
"""==Nederlands==
====Werkwoord====
# stappen
=====Uitdrukkingen en gezegden=====
{{expr|[1] Tegen de lamp '''lopen'''|betrapt/gesnapt worden}}""",
)
self.assertEqual(
data[0]["proverbs"],
[
{
"sense": "betrapt/gesnapt worden",
"sense_index": 1,
"word": "Tegen de lamp lopen",
}
],
)

def test_sense_text_after_link(self):
    """Text after "=" following a link is stored as that linkage's sense."""
    wikitext = """==Nederlands==
====Werkwoord====
# stappen
=====Verwante begrippen=====
*[[benen]] = met grote passen lopen"""
    page_data = parse_page(self.wxr, "lopen", wikitext)
    expected_related = [
        {"sense": "met grote passen lopen", "word": "benen"},
    ]
    self.assertEqual(page_data[0]["related"], expected_related)
12 changes: 12 additions & 0 deletions tests/test_nl_sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,15 @@ def test_common_lists(self):
data[0]["sounds"][2],
{"ipa": "/ˈɦɔnt/", "raw_tags": ["Vlaanderen", "Brabant"]},
)

def test_hyphenation(self):
    """The first list item of a "Woordafbreking" section is the hyphenation."""
    wikitext = """==Nederlands==
=====Woordafbreking=====
*lo·pen
====Werkwoord====
# stappen"""
    page_data = parse_page(self.wxr, "lopen", wikitext)
    first_entry = page_data[0]
    self.assertEqual(first_entry["hyphenation"], "lo·pen")

0 comments on commit 6b5ad0b

Please sign in to comment.