Skip to content

Commit

Permalink
Merge pull request #863 from xxyzz/es
Browse files Browse the repository at this point in the history
[es] extract "silabación" to "hyphenation" field
  • Loading branch information
xxyzz authored Oct 11, 2024
2 parents fbdb5c2 + c6fd5db commit 0ceb747
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/data/es/config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"allowed_html_tags": {
"phonos": {"parents": ["phrasing"], "content": ["flow"]}
"phonos": {"parents": ["*"], "content": ["*"]}
}
}
1 change: 1 addition & 0 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,4 @@ class WordEntry(BaseModelWrap):
tags: list[str] = []
extra_sounds: dict[str, str] = {}
forms: list[Form] = []
hyphenation: str = ""
6 changes: 4 additions & 2 deletions src/wiktextract/extractor/es/pronunciation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
Expand All @@ -7,7 +7,6 @@

# translate table row header to sound model field
PRON_GRAF_HEADER_MAP = {
"silabación": "syllabic",
"rimas": "rhymes",
"rima": "rhymes",
}
Expand Down Expand Up @@ -36,6 +35,8 @@ def process_pron_graf_template(
value_text = clean_node(wxr, None, value_node)
if header_text.endswith(" (AFI)"): # IPA
process_pron_graf_ipa_cell(wxr, word_entry, value_node, header_text)
elif header_text == "silabación":
word_entry.hyphenation = value_text
elif header_text in PRON_GRAF_HEADER_MAP:
sound = Sound()
setattr(sound, PRON_GRAF_HEADER_MAP[header_text], value_text)
Expand All @@ -59,6 +60,7 @@ def process_pron_graf_template(

if len(extra_sounds) > 0:
word_entry.extra_sounds = extra_sounds
clean_node(wxr, word_entry, expanded_node)


def process_pron_graf_ipa_cell(
Expand Down
33 changes: 26 additions & 7 deletions tests/test_es_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,9 @@ class TestESPronunciation(unittest.TestCase):
maxDiff = None

def setUp(self) -> None:
conf = WiktionaryConfig(dump_file_lang_code="es")
self.wxr = WiktextractContext(
Wtp(
lang_code="es",
extension_tags={
"phonos": {"parents": ["phrasing"], "content": ["flow"]}
},
),
WiktionaryConfig(dump_file_lang_code="es"),
Wtp(lang_code="es", extension_tags=conf.allowed_html_tags), conf
)

def tearDown(self) -> None:
Expand Down Expand Up @@ -181,3 +176,27 @@ def test_pron_graf_homophone(self):
},
],
)

def test_pron_graf_hyphenation(self):
# Checks that the "silabación" row of the pron-graf table is stored in
# `WordEntry.hyphenation` and does not produce a sound entry, while the
# "rima" row still produces a `rhymes` sound and its category link is
# collected into `WordEntry.categories`.
self.wxr.wtp.start_page("perro")
# Register a stub body for the template so that parsing `{{pron-graf}}`
# below expands to this fixed table.
# NOTE(review): 10 is presumably the Template namespace id — confirm.
self.wxr.wtp.add_page(
"Plantilla:pron-graf",
10,
"""{|class="pron-graf toccolours"|<span>perro</span>
|-
|'''silabación'''
|pe-rro
|-
|'''rima'''
|[[:Categoría:ES:Rimas:e.ro|e.ro]][[Categoría:ES:Rimas:e.ro]]
|}""",
)
root = self.wxr.wtp.parse("{{pron-graf}}")
word_entry = WordEntry(word="perro", lang_code="es", lang="Español")
# The first child of the parsed root is the template node under test.
process_pron_graf_template(self.wxr, word_entry, root.children[0])
# Only the rhyme is expected under "sounds"; the hyphenation text must not
# appear there.
self.assertEqual(
word_entry.model_dump(exclude_defaults=True)["sounds"],
[{"rhymes": "e.ro"}],
)
# The syllabified form is expected on the entry itself.
self.assertEqual(word_entry.hyphenation, "pe-rro")
# The category link inside the "rima" cell is expected on the entry.
self.assertEqual(word_entry.categories, ["ES:Rimas:e.ro"])

0 comments on commit 0ceb747

Please sign in to comment.