Skip to content

Commit

Permalink
Merge pull request #807 from xxyzz/de
Browse files Browse the repository at this point in the history
[de] fix `IndexError` exception and add some title templates
  • Loading branch information
xxyzz authored Sep 9, 2024
2 parents 2f7812a + fe381ec commit 1a46f2b
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 1 deletion.
95 changes: 95 additions & 0 deletions src/wiktextract/data/overrides/de.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
{
"Vorlage:Abgeleitete Symbole": {
"body": "====Abgeleitete Symbole====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Abkürzungen": {
"body": "==== Abkürzungen ====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Ähnlichkeiten": {
"body": "====Ähnliche Wörter====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Alternative Schreibweisen": {
"body": "==== Alternative Schreibweisen ====",
"namespace_id": 10,
Expand Down Expand Up @@ -44,6 +54,16 @@
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Erbwörter": {
"body": "====Erbwörter====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Geflügelte Worte": {
"body": "====Geflügelte Worte====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Gegenwörter": {
"body": "==== Gegenwörter ====",
"namespace_id": 10,
Expand All @@ -54,31 +74,81 @@
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Hanja": {
"body": "====Hanja====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Herkunft": {
"body": "==== Herkunft ====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Heteronyme": {
"body": "====Heteronyme====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Holonyme": {
"body": "==== Holonyme ====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:in arabischer Schrift": {
"body": "====In arabischer Schrift====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:in hebräischer Schrift": {
"body": "====In hebräischer Schrift====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:in kanadischer Silbenschrift": {
"body": "====In kanadischer Silbenschrift====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:in kyrillischer Schrift": {
"body": "====In kyrillischer Schrift====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:in lateinischer Schrift": {
"body": "====In lateinischer Schrift====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Koseformen": {
"body": "==== Koseformen ====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Kurzformen": {
"body": "====Kurzformen====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Lesungen": {
"body": "==== Lesungen ====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Männliche Namensvarianten": {
"body": "====Männliche Namensvarianten====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Männliche Wortformen": {
"body": "==== Männliche Wortformen ====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Meronyme": {
"body": "====Meronyme====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Namensvarianten": {
"body": "==== Namensvarianten ====",
"namespace_id": 10,
Expand Down Expand Up @@ -129,11 +199,21 @@
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Sinnverwandte Redewendungen": {
"body": "====Sinnverwandte Redewendungen====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Sinnverwandte Wörter": {
"body": "==== Sinnverwandte Wörter ====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Sinnverwandte Zeichen": {
"body": "====Sinnverwandte Zeichen====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Sprichwörter": {
"body": "==== Sprichwörter ====",
"namespace_id": 10,
Expand All @@ -159,6 +239,16 @@
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Verballhornung": {
"body": "====Verballhornungen====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Vergrößerungsformen": {
"body": "====Vergrößerungsformen====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Verkleinerungsformen": {
"body": "==== Verkleinerungsformen ====",
"namespace_id": 10,
Expand All @@ -169,6 +259,11 @@
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Weibliche Namensvarianten": {
"body": "====Weibliche Namensvarianten====",
"namespace_id": 10,
"need_pre_expand": true
},
"Vorlage:Weibliche Wortformen": {
"body": "==== Weibliche Wortformen ====",
"namespace_id": 10,
Expand Down
13 changes: 12 additions & 1 deletion src/wiktextract/extractor/de/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,17 @@ def process_linkage_list_item(
raw_tag = clean_node(wxr, None, child)
if raw_tag.endswith(":"):
raw_tags.append(raw_tag.strip(": "))
else:
for link_node in child.find_child(NodeKind.LINK):
link_text = clean_node(wxr, None, link_node)
if link_text != "":
linkage = Linkage(
word=link_text,
sense_index=sense_idx,
raw_tags=raw_tags,
)
translate_raw_tags(linkage)
linkage_list.append(linkage)
elif isinstance(child, TemplateNode) and child.template_name.endswith(
"."
):
Expand All @@ -67,7 +78,7 @@ def process_linkage_list_item(
translate_raw_tags(linkage)
linkage_list.append(linkage)

if len(note_nodes) > 0:
if len(note_nodes) > 0 and len(linkage_list) > 0:
linkage_list[-1].note = clean_node(wxr, None, note_nodes).strip(
"–—―‒- "
)
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,4 @@ class WordEntry(BaseModelWrap):
redirects: list[str] = []
etymology_text: str = ""
forms: list[Form] = []
meronyms: list[Linkage] = []
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def parse_section(
# Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage
# Level 3 headings are used to start POS sections like
# === {{Wortart|Verb|Deutsch}} ===
# title templates:
# https://de.wiktionary.org/wiki/Kategorie:Wiktionary:Textbausteine
if level_node.kind == NodeKind.LEVEL3:
process_pos_section(wxr, page_data, base_data, level_node)
# Level 4 headings were introduced by overriding the default templates.
Expand Down
11 changes: 11 additions & 0 deletions src/wiktextract/extractor/de/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,19 +91,30 @@
"Holonyme": "holonyms",
"Oberbegriffe": "hypernyms",
"Redewendungen": "expressions",
"Sinnverwandte Redewendungen": "synonyms",
"Sinnverwandte Wörter": "coordinate_terms",
"Sinnverwandte Zeichen": "synonyms",
"Sprichwörter": "proverbs",
"Synonyme": "synonyms",
"Unterbegriffe": "hyponyms",
"Wortbildungen": "derived",
"Abgeleitete Symbole": "derived",
"Geflügelte Worte": "proverbs",
"Meronyme": "meronyms",
}

# Lookup table from the German heading text of a form-listing section to a
# list of tag strings for that section. Keys are the bare heading texts, i.e.
# what appears between the "====" markers of a level-4 heading (for example
# "Kurzformen" for "====Kurzformen====").
# NOTE(review): presumably the tag list is attached to every form extracted
# from the matching section -- confirm against the code that reads this table.
FORM_TITLES = {
"Nebenformen": ["variant"],
"Namensvarianten": ["variant"],
"Weibliche Wortformen": ["feminine"],
"Weibliche Namensvarianten": ["feminine"],
"Männliche Wortformen": ["masculine"],
"Verkleinerungsformen": ["diminutive"],
"Vergrößerungsformen": ["augmentative"],
"Kurzformen": ["abbreviation"],
"Koseformen": ["affective"],
"Hanja": ["hanja"],
"Männliche Namensvarianten": ["masculine"],
"Nicht mehr gültige Schreibweisen": ["obsolete"],
"Symbole": ["symbol"],
}
21 changes: 21 additions & 0 deletions tests/test_de_linkages.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,24 @@ def test_tag_template(self):
},
],
)

def test_link_in_italic_node(self):
    """Check that a link nested in italic markup yields a linkage entry.

    The list item wraps its only link in ''...'' and ends with a bare
    dash. The expected dump holds just the linked word and the sense
    index taken from the "[1]" marker; all other fields stay at their
    defaults and are therefore left out by ``exclude_defaults``.
    """
    self.wxr.wtp.start_page("Abendland")
    # The second line of the wikitext must stay at column 0: it is part
    # of the string literal, not of the Python source layout.
    wikitext = """====Redewendungen====
:[1] ''[[Morgenland und Abendland]]'' -"""
    parsed = self.wxr.wtp.parse(wikitext)
    entry = WordEntry(
        word="Abendland",
        lang_code="de",
        lang="Deutsch",
        senses=[Sense(sense_index="1")],
    )
    extract_linkages(self.wxr, entry, parsed.children[0], "expressions")
    dumped = [
        item.model_dump(exclude_defaults=True) for item in entry.expressions
    ]
    expected = [
        {"word": "Morgenland und Abendland", "sense_index": "1"},
    ]
    self.assertEqual(dumped, expected)

0 comments on commit 1a46f2b

Please sign in to comment.