From a638542cbc3d6c44a78c3f3418219bc16e7aed8c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 22 Oct 2024 17:30:12 +0800 Subject: [PATCH 1/2] [nl] extract "-nlnoun-" inflection table template --- src/wiktextract/extractor/nl/inflection.py | 48 +++++++++ src/wiktextract/extractor/nl/models.py | 2 + src/wiktextract/extractor/nl/page.py | 13 ++- src/wiktextract/extractor/nl/pos.py | 9 ++ src/wiktextract/extractor/nl/tags.py | 9 +- tests/test_nl_inflection.py | 110 +++++++++++++++++++++ 6 files changed, 187 insertions(+), 4 deletions(-) create mode 100644 src/wiktextract/extractor/nl/inflection.py create mode 100644 tests/test_nl_inflection.py diff --git a/src/wiktextract/extractor/nl/inflection.py b/src/wiktextract/extractor/nl/inflection.py new file mode 100644 index 00000000..61a8bc3a --- /dev/null +++ b/src/wiktextract/extractor/nl/inflection.py @@ -0,0 +1,48 @@ +from wikitextprocessor import NodeKind, TemplateNode + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Form, WordEntry +from .tags import translate_raw_tags + + +def extract_inflection_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + if t_node.template_name == "-nlnoun-": + extract_nlnoun_template(wxr, word_entry, t_node) + + +def extract_nlnoun_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + # https://nl.wiktionary.org/wiki/Sjabloon:-nlnoun- + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + column_headers = [] + for table_node in expanded_node.find_child(NodeKind.TABLE): + for row_node in table_node.find_child(NodeKind.TABLE_ROW): + for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL): + header_text = clean_node(wxr, None, header_node) + if header_text != "": + column_headers.append(header_text) + row_header = "" + for col_index, data_node in enumerate( + row_node.find_child(NodeKind.TABLE_CELL) + ): + if col_index == 0: + row_header = clean_node(wxr, None, data_node) + else: + form_str = clean_node(wxr, None, data_node) + if form_str not in ["", wxr.wtp.title]: + form = Form(form=form_str) + if row_header not in ["", "naamwoord"]: + form.raw_tags.append(row_header) + if col_index - 1 < len(column_headers): + form.raw_tags.append(column_headers[col_index - 1]) + translate_raw_tags(form) + word_entry.forms.append(form) + + for link_node in expanded_node.find_child(NodeKind.LINK): + clean_node(wxr, word_entry, link_node) diff --git a/src/wiktextract/extractor/nl/models.py b/src/wiktextract/extractor/nl/models.py index 12e09253..d33b460a 100644 --- a/src/wiktextract/extractor/nl/models.py +++ b/src/wiktextract/extractor/nl/models.py @@ -79,6 +79,8 @@ class Etymology(DutchBaseModel): class Form(DutchBaseModel): form: str = "" note: str = "" + tags: list[str] = [] + raw_tags: list[str] = [] class Descendant(DutchBaseModel): diff --git a/src/wiktextract/extractor/nl/page.py b/src/wiktextract/extractor/nl/page.py index 2cce0335..1b34bc19 100644 --- a/src/wiktextract/extractor/nl/page.py +++ b/src/wiktextract/extractor/nl/page.py @@ -12,6 +12,7 @@ from ...wxr_context import WiktextractContext from .descendant import extract_descendant_section from .etymology import extract_etymology_section +from .inflection import extract_inflection_template from .linkage import extract_fixed_preposition_section, extract_linkage_section from .models import Etymology, Sense, WordEntry from .pos import extract_pos_section @@ -32,6 +33,7 @@ def parse_section( wxr: WiktextractContext, page_data: list[WordEntry], base_data: WordEntry, + forms_data: WordEntry, level_node: WikiNode, ) -> list[Etymology]: # title templates @@ -40,7 +42,9 @@ def parse_section( wxr.wtp.start_subsection(title_text) etymology_data = [] if title_text in POS_DATA: - extract_pos_section(wxr, page_data, base_data, level_node, title_text) + extract_pos_section( + wxr, page_data, base_data, forms_data, level_node, title_text + ) elif title_text == "Uitspraak": extract_sound_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node @@ -93,10 +97,12 @@ def parse_section( wxr.wtp.debug(f"unknown title: {title_text}", sortid="nl/page/60") for next_level in level_node.find_child(LEVEL_KIND_FLAGS): - parse_section(wxr, page_data, base_data, next_level) + parse_section(wxr, page_data, base_data, forms_data, next_level) extract_section_categories( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node ) + for t_node in level_node.find_child(NodeKind.TEMPLATE): + extract_inflection_template(wxr, forms_data, t_node) return etymology_data @@ -127,11 +133,12 @@ def parse_page( lang=lang_name, pos="unknown", ) + forms_data = base_data.model_copy(deep=True) extract_section_categories(wxr, base_data, level2_node) etymology_data = [] for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS): new_e_data = parse_section( - wxr, page_data, base_data, next_level_node + wxr, page_data, base_data, forms_data, next_level_node ) if len(new_e_data) > 0: etymology_data = new_e_data diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index a576aabd..1c0f1a7d 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -18,6 +18,7 @@ def extract_pos_section( wxr: WiktextractContext, page_data: list[WordEntry], base_data: WordEntry, + forms_data: WordEntry, level_node: LevelNode, pos_title: str, ) -> None: @@ -26,6 +27,14 @@ def extract_pos_section( pos_data = POS_DATA[pos_title] page_data[-1].pos = pos_data["pos"] page_data[-1].tags.extend(pos_data.get("tags", [])) + if forms_data.pos == "unknown": + forms_data.pos = page_data[-1].pos + if forms_data.pos == page_data[-1].pos: + page_data[-1].forms.extend(forms_data.forms) + page_data[-1].categories.extend(forms_data.categories) + else: + forms_data.forms.clear() + forms_data.categories.clear() extract_pos_section_nodes(wxr, page_data, level_node) diff --git a/src/wiktextract/extractor/nl/tags.py b/src/wiktextract/extractor/nl/tags.py index 05eba862..48afbbec 100644 --- a/src/wiktextract/extractor/nl/tags.py +++ b/src/wiktextract/extractor/nl/tags.py @@ -13,8 +13,15 @@ "verouderd": "obsolete", # Sjabloon:verouderd } +TABLE_TAGS = { + # Sjabloon:-nlnoun- + "enkelvoud": "singular", + "meervoud": "plural", + "verkleinwoord": "diminutive", +} + -TAGS = {**VERB_TAGS, **GLOSS_TAGS} +TAGS = {**VERB_TAGS, **GLOSS_TAGS, **TABLE_TAGS} # https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels TOPICS = {"anatomie": "anatomy"} diff --git a/tests/test_nl_inflection.py b/tests/test_nl_inflection.py new file mode 100644 index 00000000..d1f46c23 --- /dev/null +++ b/tests/test_nl_inflection.py @@ -0,0 +1,110 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.nl.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestNlInflection(TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="nl"), + WiktionaryConfig( + dump_file_lang_code="nl", + capture_language_codes=None, + ), + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + + def test_nlnoun_different_pos(self): + self.wxr.wtp.add_page( + "Sjabloon:-nlnoun-", + 10, + """{| class="infobox" +|- +! +! [[enkelvoud]] +! [[meervoud]] +|- +| class="infoboxrijhoofding" | [[zelfstandig naamwoord|naamwoord]] +| loop +| [[lopen]] +|- +| class="infoboxrijhoofding" | [[verkleinwoord]] +| [[loopje]] +| [[loopjes]] +|}[[Categorie:Zelfstandig naamwoord in het Nederlands]][[Categorie:Telbaar]]""", + ) + data = parse_page( + self.wxr, + "loop", + """==Nederlands== +=====Woordafbreking===== +*loop +{{-nlnoun-|{{pn}}|[[lopen]]|[[{{pn}}je]]|[[{{pn}}jes]]}} +====Zelfstandig naamwoord==== +{{-l-|m}} +#voorste deel van een [[wapen]] +====Werkwoord==== +{{1ps|lopen}}""", + ) + self.assertEqual(len(data), 2) + self.assertEqual( + data[0]["categories"], + ["Zelfstandig naamwoord in het Nederlands", "Telbaar"], + ) + self.assertEqual( + data[0]["forms"], + [ + {"form": "lopen", "tags": ["plural"]}, + {"form": "loopje", "tags": ["diminutive", "singular"]}, + {"form": "loopjes", "tags": ["diminutive", "plural"]}, + ], + ) + self.assertTrue("categories" not in data[1]) + self.assertTrue("forms" not in data[1]) + + def test_nlnoun_same_pos(self): + self.wxr.wtp.add_page( + "Sjabloon:-nlnoun-", + 10, + """{| class="infobox" +|- +! +! [[enkelvoud]] +! [[meervoud]] +|- +| class="infoboxrijhoofding" | [[zelfstandig naamwoord|naamwoord]] +| hond +| [[honden]] +|- +| class="infoboxrijhoofding" | [[verkleinwoord]] +| [[hondje]] +| [[hondjes]] +|}[[Categorie:Zelfstandig naamwoord in het Nederlands]][[Categorie:Telbaar]] +""", + ) + data = parse_page( + self.wxr, + "hond", + """==Nederlands== +=====Woordherkomst en -opbouw===== +*[A] uiteindelijk +{{-nlnoun-|hond|[[honden]]|[[hondje]]|[[hondjes]]}} +====Zelfstandig naamwoord==== +[A] {{-l-|m}} +# zoogdier +====Zelfstandig naamwoord==== +[B] {{-l-|o}} +# landmaat""", + ) + self.assertEqual(len(data), 2) + self.assertEqual(data[0]["categories"], data[1]["categories"]) + self.assertEqual(len(data[0]["forms"]), 3) + self.assertEqual(data[0]["forms"], data[1]["forms"]) From ec4b2331abf03e6b3a803875797c309fd83b9882 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 22 Oct 2024 17:48:41 +0800 Subject: [PATCH 2/2] [nl] don't extract conjugation table pages --- src/wiktextract/extractor/nl/page.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/wiktextract/extractor/nl/page.py b/src/wiktextract/extractor/nl/page.py index 1b34bc19..56e1f531 100644 --- a/src/wiktextract/extractor/nl/page.py +++ b/src/wiktextract/extractor/nl/page.py @@ -113,6 +113,8 @@ def parse_page( # https://nl.wiktionary.org/wiki/WikiWoordenboek:Stramien # language templates # https://nl.wiktionary.org/wiki/Categorie:Hoofdtaalsjablonen + if page_title.endswith("/vervoeging"): + return [] # skip conjugation pages wxr.wtp.start_page(page_title) tree = wxr.wtp.parse(page_text, pre_expand=True) page_data: list[WordEntry] = []