Skip to content

Commit

Permalink
[nl] extract linkage sections
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Oct 10, 2024
1 parent 6a9ecc2 commit 46c46bf
Show file tree
Hide file tree
Showing 7 changed files with 182 additions and 5 deletions.
69 changes: 69 additions & 0 deletions src/wiktextract/extractor/nl/linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect the linked words of one linkage section into ``word_entry``.

    Walks the direct children of ``level_node`` in order:

    * an ``intens`` template switches the running raw tags to
      ``["intensivering"]`` and, when its parameter 2 consists solely of
      digits, updates the running sense index;
    * a link node is cleaned to text and, if non-empty, appended as a
      ``Linkage`` carrying the running sense, sense index and raw tags;
    * a list node has each of its list items handed to
      ``extract_linkage_list_item`` (raw tags are not forwarded).

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``linkage_type`` list attribute is extended.
        level_node: Section node whose children are scanned.
        linkage_type: Name of the list attribute on ``word_entry`` to
            append to (e.g. ``"hyponyms"``).
    """
    current_index = 0
    gloss = ""
    current_raw_tags = []

    for child in level_node.children:
        # Templates must be tested first: the generic WikiNode branch below
        # is only meant for non-template nodes.
        if isinstance(child, TemplateNode):
            if child.template_name == "intens":
                # https://nl.wiktionary.org/wiki/Sjabloon:intens
                current_raw_tags = ["intensivering"]
                index_arg = child.template_parameters.get(2, "").strip()
                if re.fullmatch(r"\d+", index_arg):
                    current_index = int(index_arg)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            linked_word = clean_node(wxr, None, child)
            if linked_word == "":
                continue
            linkage = Linkage(
                word=linked_word,
                sense=gloss,
                sense_index=current_index,
                raw_tags=current_raw_tags,
            )
            getattr(word_entry, linkage_type).append(linkage)
        elif child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr,
                    word_entry,
                    item,
                    linkage_type,
                    gloss,
                    current_index,
                )


def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
) -> None:
    """Extract the linked words of a single list item into ``word_entry``.

    The item's children are scanned in order. A plain-text child containing
    a bracketed number such as ``[1]`` replaces the sense index used for the
    links that follow it; every link child is cleaned to text and, if
    non-empty, appended as a ``Linkage``.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``linkage_type`` list attribute is extended.
        list_item: The list-item node to scan (its ``children`` are read).
        linkage_type: Name of the list attribute on ``word_entry`` to
            append to.
        sense: Sense text stored on every ``Linkage`` created here.
        sense_index: Sense number to use until a ``[N]`` marker inside the
            item overrides it.
    """
    for node in list_item.children:
        if isinstance(node, str):
            # "[N]" in the item's text selects the sense for later links.
            m = re.search(r"\[(\d+)\]", node)
            if m is not None:
                sense_index = int(m.group(1))
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, sense=sense, sense_index=sense_index)
                )
26 changes: 25 additions & 1 deletion src/wiktextract/extractor/nl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@ class Sound(DutchBaseModel):
raw_tags: list[str] = []


# One linked word collected from a linkage section; WordEntry stores these
# in its per-relation lists (synonyms, hyponyms, derived, ...).
class Linkage(DutchBaseModel):
    # Text of the linked word.
    word: str
    tags: list[str] = []
    # Labels attached by the extractor, e.g. "intensivering" for words that
    # follow an "intens" template.
    raw_tags: list[str] = []
    # NOTE(review): presumably a romanization of `word`; nothing in the
    # visible extractor code sets it - confirm.
    roman: str = ""
    sense: str = Field(default="", description="Definition of the word")
    # Defaults to 0 when no sense number was supplied; `ge=0` rejects
    # negative values.
    sense_index: int = Field(
        default=0, ge=0, description="Number of the definition, start from 1"
    )


class WordEntry(DutchBaseModel):
model_config = ConfigDict(title="Dutch Wiktionary")
word: str = Field(description="Word string", min_length=1)
Expand All @@ -48,5 +59,18 @@ class WordEntry(DutchBaseModel):
categories: list[str] = []
tags: list[str] = []
raw_tags: list[str] = []
sense_index: str = ""
etymology_index: str = Field(default="", exclude=True)
sounds: list[Sound] = []
anagrams: list[Linkage] = []
antonyms: list[Linkage] = []
derived: list[Linkage] = []
proverbs: list[Linkage] = []
holonyms: list[Linkage] = []
homophones: list[Linkage] = []
hypernyms: list[Linkage] = []
hyponyms: list[Linkage] = []
metonyms: list[Linkage] = []
paronyms: list[Linkage] = []
related: list[Linkage] = []
rhymes: list[Linkage] = []
synonyms: list[Linkage] = []
10 changes: 9 additions & 1 deletion src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import extract_sound_section


Expand All @@ -39,6 +40,13 @@ def parse_section(
extract_sound_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
LINKAGE_SECTIONS[title_text],
)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/nl/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ def extract_pos_header_line_nodes(
wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
for node in nodes:
if isinstance(node, str) and word_entry.sense_index == "":
if isinstance(node, str) and word_entry.etymology_index == "":
m = re.search(r"\[(.+)\]", node.strip())
if m is not None:
word_entry.sense_index = m.group(1).strip()
word_entry.etymology_index = m.group(1).strip()
elif isinstance(node, TemplateNode) and node.template_name == "-l-":
extract_l_template(wxr, word_entry, node)

Expand Down
20 changes: 20 additions & 0 deletions src/wiktextract/extractor/nl/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,23 @@
"Symbool": {"pos": "symbol"},
"Werkwoord": {"pos": "verb"},
}


# Maps a linkage section title to the name of the WordEntry list attribute
# that receives the words found in that section. Several titles
# intentionally share one target attribute.
LINKAGE_SECTIONS = {
    "Anagrammen": "anagrams",
    "Antoniemen": "antonyms",
    "Afgeleide begrippen": "derived",
    "Uitdrukkingen en gezegden": "proverbs",
    "Holoniemen": "holonyms",
    "Gelijkklinkende woorden": "homophones",
    "Hyperoniemen": "hypernyms",
    "Hyponiemen": "hyponyms",
    "Hyponiemen (in taxonomische zin)": "hyponyms",
    # NOTE(review): "Meroniemen" looks like it means meronyms, yet it is
    # stored under "metonyms". The value must match a WordEntry attribute
    # name, so renaming it requires renaming that field too - confirm.
    "Meroniemen": "metonyms",
    "Paroniemen": "paronyms",
    "Spreekwoorden": "proverbs",
    "Verwante begrippen": "related",
    "Rijmwoorden": "rhymes",
    "Synoniemen": "synonyms",
    "Typische woordcombinaties": "derived",
}
1 change: 0 additions & 1 deletion tests/test_nl_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ def test_simple_case(self):
"raw_tags": ["roofdieren"],
}
],
"sense_index": "A",
"tags": ["masculine"],
"word": "hond",
}
Expand Down
57 changes: 57 additions & 0 deletions tests/test_nl_linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.nl.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestNlLinkage(TestCase):
    """Linkage-section extraction tests for the Dutch (nl) extractor."""

    maxDiff = None

    def setUp(self) -> None:
        """Build a fresh extraction context configured for Dutch."""
        wtp = Wtp(lang_code="nl")
        config = WiktionaryConfig(
            dump_file_lang_code="nl",
            capture_language_codes=None,
        )
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        """Close the Wtp database connection after each test."""
        self.wxr.wtp.close_db_conn()

    def test_intens_template(self):
        # Links following an "intens" template take its sense number and the
        # "intensivering" raw tag; the list item takes its index from "[1]".
        wikitext = """==Nederlands==
====Zelfstandig naamwoord====
# zoogdier
=====Hyponiemen=====
{{intens|nld|1}} [[superhond]]
{{intens|nld|2}} [[kankerhond]], [[tyfushond]]
*[1] [[reu]]"""
        expected = [
            {
                "word": "superhond",
                "sense_index": 1,
                "raw_tags": ["intensivering"],
            },
            {
                "word": "kankerhond",
                "sense_index": 2,
                "raw_tags": ["intensivering"],
            },
            {
                "word": "tyfushond",
                "sense_index": 2,
                "raw_tags": ["intensivering"],
            },
            {"word": "reu", "sense_index": 1},
        ]
        data = parse_page(self.wxr, "hond", wikitext)
        self.assertEqual(data[0]["hyponyms"], expected)

0 comments on commit 46c46bf

Please sign in to comment.