Skip to content

Commit

Permalink
Merge pull request #882 from xxyzz/nl
Browse files Browse the repository at this point in the history
[nl] extract "-nlnoun-" inflection table template
  • Loading branch information
xxyzz authored Oct 22, 2024
2 parents 25865e5 + ec4b233 commit f8acb8a
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 4 deletions.
48 changes: 48 additions & 0 deletions src/wiktextract/extractor/nl/inflection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from wikitextprocessor import NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags


def extract_inflection_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Dispatch an inflection template node to its dedicated extractor.

    Only the "-nlnoun-" template is handled; any other template name is
    ignored and ``word_entry`` is left untouched.
    """
    if t_node.template_name != "-nlnoun-":
        return
    extract_nlnoun_template(wxr, word_entry, t_node)


def extract_nlnoun_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Read forms out of the expanded "-nlnoun-" inflection table.

    The template is expanded and re-parsed, then every table row is
    walked: header cells with non-empty text are collected as column
    labels, the first data cell of a row is its row label, and each
    remaining non-empty data cell that differs from the page title is
    appended to ``word_entry.forms`` with the row/column labels as raw
    tags. Finally, each top-level link of the expansion is passed to
    ``clean_node`` together with ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun-
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)
    # Column labels are shared by all rows (and tables) that follow them.
    headers = []
    for table in root.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
                label = clean_node(wxr, None, header_cell)
                if len(label) > 0:
                    headers.append(label)
            row_label = ""
            for index, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
                cell_text = clean_node(wxr, None, cell)
                if index == 0:
                    # The first data cell names the row, it is not a form.
                    row_label = cell_text
                    continue
                if cell_text in ("", wxr.wtp.title):
                    # Skip empty cells and the headword itself.
                    continue
                new_form = Form(form=cell_text)
                if row_label not in ("", "naamwoord"):
                    new_form.raw_tags.append(row_label)
                # Data cell N lines up with collected header N - 1.
                header_index = index - 1
                if header_index < len(headers):
                    new_form.raw_tags.append(headers[header_index])
                translate_raw_tags(new_form)
                word_entry.forms.append(new_form)

    # Top-level links of the expansion (the accompanying test uses
    # "Categorie:" links here) are cleaned against the entry itself.
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/nl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ class Etymology(DutchBaseModel):
# One word form attached to an entry (for example a cell taken from an
# inflection table).
class Form(DutchBaseModel):
# Text of the form itself.
form: str = ""
note: str = ""
# NOTE(review): presumably filled by translate_raw_tags() from raw_tags;
# confirm against tags.py.
tags: list[str] = []
# Untranslated labels as found in the source, e.g. table row and column
# headers appended by the inflection extractor.
raw_tags: list[str] = []


class Descendant(DutchBaseModel):
Expand Down
15 changes: 12 additions & 3 deletions src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ...wxr_context import WiktextractContext
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section
from .inflection import extract_inflection_template
from .linkage import extract_fixed_preposition_section, extract_linkage_section
from .models import Etymology, Sense, WordEntry
from .pos import extract_pos_section
Expand All @@ -32,6 +33,7 @@ def parse_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
forms_data: WordEntry,
level_node: WikiNode,
) -> list[Etymology]:
# title templates
Expand All @@ -40,7 +42,9 @@ def parse_section(
wxr.wtp.start_subsection(title_text)
etymology_data = []
if title_text in POS_DATA:
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
extract_pos_section(
wxr, page_data, base_data, forms_data, level_node, title_text
)
elif title_text == "Uitspraak":
extract_sound_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
Expand Down Expand Up @@ -93,10 +97,12 @@ def parse_section(
wxr.wtp.debug(f"unknown title: {title_text}", sortid="nl/page/60")

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
parse_section(wxr, page_data, base_data, forms_data, next_level)
extract_section_categories(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
for t_node in level_node.find_child(NodeKind.TEMPLATE):
extract_inflection_template(wxr, forms_data, t_node)
return etymology_data


Expand All @@ -107,6 +113,8 @@ def parse_page(
# https://nl.wiktionary.org/wiki/WikiWoordenboek:Stramien
# language templates
# https://nl.wiktionary.org/wiki/Categorie:Hoofdtaalsjablonen
if page_title.endswith("/vervoeging"):
return [] # skip conjugation pages
wxr.wtp.start_page(page_title)
tree = wxr.wtp.parse(page_text, pre_expand=True)
page_data: list[WordEntry] = []
Expand All @@ -127,11 +135,12 @@ def parse_page(
lang=lang_name,
pos="unknown",
)
forms_data = base_data.model_copy(deep=True)
extract_section_categories(wxr, base_data, level2_node)
etymology_data = []
for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
new_e_data = parse_section(
wxr, page_data, base_data, next_level_node
wxr, page_data, base_data, forms_data, next_level_node
)
if len(new_e_data) > 0:
etymology_data = new_e_data
Expand Down
9 changes: 9 additions & 0 deletions src/wiktextract/extractor/nl/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def extract_pos_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
forms_data: WordEntry,
level_node: LevelNode,
pos_title: str,
) -> None:
Expand All @@ -26,6 +27,14 @@ def extract_pos_section(
pos_data = POS_DATA[pos_title]
page_data[-1].pos = pos_data["pos"]
page_data[-1].tags.extend(pos_data.get("tags", []))
if forms_data.pos == "unknown":
forms_data.pos = page_data[-1].pos
if forms_data.pos == page_data[-1].pos:
page_data[-1].forms.extend(forms_data.forms)
page_data[-1].categories.extend(forms_data.categories)
else:
forms_data.forms.clear()
forms_data.categories.clear()
extract_pos_section_nodes(wxr, page_data, level_node)


Expand Down
9 changes: 8 additions & 1 deletion src/wiktextract/extractor/nl/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,15 @@
"verouderd": "obsolete", # Sjabloon:verouderd
}

# Dutch labels found in inflection tables, mapped to English tag names.
TABLE_TAGS = {
# Sjabloon:-nlnoun-
"enkelvoud": "singular",
"meervoud": "plural",
"verkleinwoord": "diminutive",
}


# NOTE(review): this listing is a diff view; the first TAGS assignment below
# appears to be the removed (pre-change) line and the second its replacement,
# which additionally merges TABLE_TAGS. Confirm against the repository.
TAGS = {**VERB_TAGS, **GLOSS_TAGS}
TAGS = {**VERB_TAGS, **GLOSS_TAGS, **TABLE_TAGS}

# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
TOPICS = {"anatomie": "anatomy"}
Expand Down
110 changes: 110 additions & 0 deletions tests/test_nl_inflection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.nl.page import parse_page
from wiktextract.wxr_context import WiktextractContext


# Tests that run parse_page() on small Dutch pages using the "-nlnoun-"
# inflection table template and check the extracted forms and categories.
class TestNlInflection(TestCase):
# Show full diffs when an assertEqual on large structures fails.
maxDiff = None

# Build a fresh extraction context configured for the Dutch edition.
def setUp(self) -> None:
self.wxr = WiktextractContext(
Wtp(lang_code="nl"),
WiktionaryConfig(
dump_file_lang_code="nl",
capture_language_codes=None,
),
)

# Close the database connection held by the Wtp instance.
def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

# The template appears before a noun section and a verb section: its forms
# and categories are expected on the noun entry only.
def test_nlnoun_different_pos(self):
# Register a stub expansion of the template: a table plus category links.
self.wxr.wtp.add_page(
"Sjabloon:-nlnoun-",
10,
"""{| class="infobox"
|-
!
! [[enkelvoud]]
! [[meervoud]]
|-
| class="infoboxrijhoofding" | [[zelfstandig naamwoord|naamwoord]]
| loop
| [[lopen]]
|-
| class="infoboxrijhoofding" | [[verkleinwoord]]
| [[loopje]]
| [[loopjes]]
|}[[Categorie:Zelfstandig naamwoord in het Nederlands]][[Categorie:Telbaar]]""",
)
data = parse_page(
self.wxr,
"loop",
"""==Nederlands==
=====Woordafbreking=====
*loop
{{-nlnoun-|{{pn}}|[[lopen]]|[[{{pn}}je]]|[[{{pn}}jes]]}}
====Zelfstandig naamwoord====
{{-l-|m}}
#voorste deel van een [[wapen]]
====Werkwoord====
{{1ps|lopen}}""",
)
self.assertEqual(len(data), 2)
self.assertEqual(
data[0]["categories"],
["Zelfstandig naamwoord in het Nederlands", "Telbaar"],
)
# The cell equal to the page title ("loop") is not expected as a form.
self.assertEqual(
data[0]["forms"],
[
{"form": "lopen", "tags": ["plural"]},
{"form": "loopje", "tags": ["diminutive", "singular"]},
{"form": "loopjes", "tags": ["diminutive", "plural"]},
],
)
# The second (verb) entry must carry neither categories nor forms.
self.assertTrue("categories" not in data[1])
self.assertTrue("forms" not in data[1])

# The template is followed by two noun sections: both entries are expected
# to receive the same forms and categories.
def test_nlnoun_same_pos(self):
# Register a stub expansion of the template: a table plus category links.
self.wxr.wtp.add_page(
"Sjabloon:-nlnoun-",
10,
"""{| class="infobox"
|-
!
! [[enkelvoud]]
! [[meervoud]]
|-
| class="infoboxrijhoofding" | [[zelfstandig naamwoord|naamwoord]]
| hond
| [[honden]]
|-
| class="infoboxrijhoofding" | [[verkleinwoord]]
| [[hondje]]
| [[hondjes]]
|}[[Categorie:Zelfstandig naamwoord in het Nederlands]][[Categorie:Telbaar]]
""",
)
data = parse_page(
self.wxr,
"hond",
"""==Nederlands==
=====Woordherkomst en -opbouw=====
*[A] uiteindelijk
{{-nlnoun-|hond|[[honden]]|[[hondje]]|[[hondjes]]}}
====Zelfstandig naamwoord====
[A] {{-l-|m}}
# zoogdier
====Zelfstandig naamwoord====
[B] {{-l-|o}}
# landmaat""",
)
self.assertEqual(len(data), 2)
self.assertEqual(data[0]["categories"], data[1]["categories"])
# Three forms are expected; the cell equal to the title ("hond") is excluded.
self.assertEqual(len(data[0]["forms"]), 3)
self.assertEqual(data[0]["forms"], data[1]["forms"])

0 comments on commit f8acb8a

Please sign in to comment.