Merge pull request #863 from xxyzz/es

[es] extract "silabación" to "hyphenation" field
tatuylonen · Oct 11, 2024 · 0ceb747
2 parents fbdb5c2 + c6fd5db
commit 0ceb747
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 10 deletions.
diff --git a/src/wiktextract/data/es/config.json b/src/wiktextract/data/es/config.json
@@ -1,5 +1,5 @@
 {
   "allowed_html_tags": {
-    "phonos": {"parents": ["phrasing"], "content": ["flow"]}
+    "phonos": {"parents": ["*"], "content": ["*"]}
   }
 }
diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
@@ -164,3 +164,4 @@ class WordEntry(BaseModelWrap):
     tags: list[str] = []
     extra_sounds: dict[str, str] = {}
     forms: list[Form] = []
+    hyphenation: str = ""
diff --git a/src/wiktextract/extractor/es/pronunciation.py b/src/wiktextract/extractor/es/pronunciation.py
@@ -1,4 +1,4 @@
-from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
+from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
@@ -7,7 +7,6 @@
 
 # translate table row header to sound model field
 PRON_GRAF_HEADER_MAP = {
-    "silabación": "syllabic",
     "rimas": "rhymes",
     "rima": "rhymes",
 }
@@ -36,6 +35,8 @@ def process_pron_graf_template(
         value_text = clean_node(wxr, None, value_node)
         if header_text.endswith(" (AFI)"):  # IPA
             process_pron_graf_ipa_cell(wxr, word_entry, value_node, header_text)
+        elif header_text == "silabación":
+            word_entry.hyphenation = value_text
         elif header_text in PRON_GRAF_HEADER_MAP:
             sound = Sound()
             setattr(sound, PRON_GRAF_HEADER_MAP[header_text], value_text)
@@ -59,6 +60,7 @@ def process_pron_graf_template(
 
     if len(extra_sounds) > 0:
         word_entry.extra_sounds = extra_sounds
+    clean_node(wxr, word_entry, expanded_node)
 
 
 def process_pron_graf_ipa_cell(

diff --git a/tests/test_es_pronunciation.py b/tests/test_es_pronunciation.py
@@ -12,14 +12,9 @@ class TestESPronunciation(unittest.TestCase):
     maxDiff = None
 
     def setUp(self) -> None:
+        conf = WiktionaryConfig(dump_file_lang_code="es")
         self.wxr = WiktextractContext(
-            Wtp(
-                lang_code="es",
-                extension_tags={
-                    "phonos": {"parents": ["phrasing"], "content": ["flow"]}
-                },
-            ),
-            WiktionaryConfig(dump_file_lang_code="es"),
+            Wtp(lang_code="es", extension_tags=conf.allowed_html_tags), conf
         )
 
     def tearDown(self) -> None:
@@ -181,3 +176,27 @@ def test_pron_graf_homophone(self):
                 },
             ],
         )
+
+    def test_pron_graf_hyphenation(self):
+        self.wxr.wtp.start_page("perro")
+        self.wxr.wtp.add_page(
+            "Plantilla:pron-graf",
+            10,
+            """{|class="pron-graf toccolours"|<span>perro</span>
+|-
+|'''silabación'''
+|pe-rro
+|-
+|'''rima'''
+|[[:Categoría:ES:Rimas:e.ro|e.ro]][[Categoría:ES:Rimas:e.ro]]
+|}""",
+        )
+        root = self.wxr.wtp.parse("{{pron-graf}}")
+        word_entry = WordEntry(word="perro", lang_code="es", lang="Español")
+        process_pron_graf_template(self.wxr, word_entry, root.children[0])
+        self.assertEqual(
+            word_entry.model_dump(exclude_defaults=True)["sounds"],
+            [{"rhymes": "e.ro"}],
+        )
+        self.assertEqual(word_entry.hyphenation, "pe-rro")
+        self.assertEqual(word_entry.categories, ["ES:Rimas:e.ro"])