diff --git a/src/wiktextract/data/es/config.json b/src/wiktextract/data/es/config.json index 073bd030..c38399f2 100644 --- a/src/wiktextract/data/es/config.json +++ b/src/wiktextract/data/es/config.json @@ -1,5 +1,5 @@ { "allowed_html_tags": { - "phonos": {"parents": ["phrasing"], "content": ["flow"]} + "phonos": {"parents": ["*"], "content": ["*"]} } } diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index dd70348d..82771ff8 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -164,3 +164,4 @@ class WordEntry(BaseModelWrap): tags: list[str] = [] extra_sounds: dict[str, str] = {} forms: list[Form] = [] + hyphenation: str = "" diff --git a/src/wiktextract/extractor/es/pronunciation.py b/src/wiktextract/extractor/es/pronunciation.py index 9329fa8a..9160a0f8 100644 --- a/src/wiktextract/extractor/es/pronunciation.py +++ b/src/wiktextract/extractor/es/pronunciation.py @@ -1,4 +1,4 @@ -from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode +from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext @@ -7,7 +7,6 @@ # translate table row header to sound model field PRON_GRAF_HEADER_MAP = { - "silabación": "syllabic", "rimas": "rhymes", "rima": "rhymes", } @@ -36,6 +35,8 @@ def process_pron_graf_template( value_text = clean_node(wxr, None, value_node) if header_text.endswith(" (AFI)"): # IPA process_pron_graf_ipa_cell(wxr, word_entry, value_node, header_text) + elif header_text == "silabación": + word_entry.hyphenation = value_text elif header_text in PRON_GRAF_HEADER_MAP: sound = Sound() setattr(sound, PRON_GRAF_HEADER_MAP[header_text], value_text) @@ -59,6 +60,7 @@ def process_pron_graf_template( if len(extra_sounds) > 0: word_entry.extra_sounds = extra_sounds + clean_node(wxr, word_entry, expanded_node) def process_pron_graf_ipa_cell( diff --git a/tests/test_es_pronunciation.py b/tests/test_es_pronunciation.py index 339ad54b..46d26377 100644 --- a/tests/test_es_pronunciation.py +++ b/tests/test_es_pronunciation.py @@ -12,14 +12,9 @@ class TestESPronunciation(unittest.TestCase): maxDiff = None def setUp(self) -> None: + conf = WiktionaryConfig(dump_file_lang_code="es") self.wxr = WiktextractContext( - Wtp( - lang_code="es", - extension_tags={ - "phonos": {"parents": ["phrasing"], "content": ["flow"]} - }, - ), - WiktionaryConfig(dump_file_lang_code="es"), + Wtp(lang_code="es", extension_tags=conf.allowed_html_tags), conf ) def tearDown(self) -> None: @@ -181,3 +176,27 @@ def test_pron_graf_homophone(self): }, ], ) + + def test_pron_graf_hyphenation(self): + self.wxr.wtp.start_page("perro") + self.wxr.wtp.add_page( + "Plantilla:pron-graf", + 10, + """{|class="pron-graf toccolours"|perro +|- +|'''silabación''' +|pe-rro +|- +|'''rima''' +|[[:Categoría:ES:Rimas:e.ro|e.ro]][[Categoría:ES:Rimas:e.ro]] +|}""", + ) + root = self.wxr.wtp.parse("{{pron-graf}}") + word_entry = WordEntry(word="perro", lang_code="es", lang="Español") + process_pron_graf_template(self.wxr, word_entry, root.children[0]) + self.assertEqual( + word_entry.model_dump(exclude_defaults=True)["sounds"], + [{"rhymes": "e.ro"}], + ) + self.assertEqual(word_entry.hyphenation, "pe-rro") + self.assertEqual(word_entry.categories, ["ES:Rimas:e.ro"])