[ko] extract more translation, pronunciation, example data #861

Merged · 4 commits · Oct 10, 2024
66 changes: 62 additions & 4 deletions src/wiktextract/extractor/ko/example.py
@@ -1,4 +1,4 @@
from wikitextprocessor import TemplateNode, WikiNode
from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
@@ -7,20 +7,32 @@


def extract_example_list_item(
wxr: WiktextractContext, sense: Sense, list_item: WikiNode, lang_code: str
wxr: WiktextractContext,
sense: Sense,
list_item: WikiNode,
lang_code: str,
parent_example: Example | None = None,
) -> None:
example = Example()
example = Example() if parent_example is None else parent_example
after_lang_template = False
for node in list_item.children:
if isinstance(node, TemplateNode) and node.template_name == "lang":
after_lang_template = True
extract_example_lang_template(wxr, example, node, lang_code)
elif isinstance(node, TemplateNode) and node.template_name.startswith(
"따옴"
("따옴", "지봉유설")
):
example.ref = clean_node(wxr, None, node).strip("() ")
elif isinstance(node, TemplateNode) and node.template_name in [
"예문",
"ux",
]:
extract_ux_template(wxr, sense, example, node)
break
elif after_lang_template:
example.translation += clean_node(wxr, None, node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
break
else:
example.text += clean_node(wxr, None, node)

@@ -38,6 +50,12 @@ def extract_example_list_item(
else:
sense.examples.append(example)

for nested_list in list_item.find_child(NodeKind.LIST):
for nested_list_item in nested_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, nested_list_item, lang_code, example
)


def extract_example_lang_template(
wxr: WiktextractContext,
@@ -66,3 +84,43 @@ def extract_example_lang_template(
roman_start_index = example.text.index("(")
example.roman = example.text[roman_start_index:].strip("() ")
example.text = example.text[:roman_start_index].strip()


def extract_ux_template(
wxr: WiktextractContext,
sense: Sense,
example: Example,
t_node: TemplateNode,
) -> None:
# https://ko.wiktionary.org/wiki/틀:ux
lang_code = t_node.template_parameters.get(1, "")
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
if lang_code == "ja":
for span_tag in expanded_node.find_html_recursively("span"):
span_class = span_tag.attrs.get("class", "")
if span_class == "Jpan":
example.ruby, no_ruby = extract_ruby(wxr, span_tag)
example.text = clean_node(wxr, None, no_ruby)
elif span_class == "tr":
example.roman = clean_node(wxr, None, span_tag)
example.translation = clean_node(
wxr, None, t_node.template_parameters.get(4, "")
)
example.literal_meaning = clean_node(
wxr, None, t_node.template_parameters.get("lit", "")
)
else:
example.text = clean_node(
wxr, None, t_node.template_parameters.get(2, "")
)
example.translation = clean_node(
wxr, None, t_node.template_parameters.get(3, "")
)
example.note = clean_node(
wxr, None, t_node.template_parameters.get("footer", "")
)

for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, sense, link_node)
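
For quick reference (not part of the diff), a minimal sketch of how the new `extract_ux_template` above distributes `틀:예문`/`틀:ux` arguments across `Example` fields; the parameters are the ones read in the code, and the dict name is only illustrative:

```python
# Illustrative summary of the branches in extract_ux_template (hypothetical name).
# For lang_code == "ja", text/ruby/roman come from the expanded HTML spans
# ("Jpan" and "tr"); other languages read plain template arguments.
UX_ARG_TO_EXAMPLE_FIELD = {
    "ja": {4: "translation", "lit": "literal_meaning"},
    "default": {2: "text", 3: "translation", "footer": "note"},
}
```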
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/ko/linkage.py
@@ -33,11 +33,15 @@ def extract_linkage_section(
if linkage_type == "proverbs":
extract_proverb_section(wxr, word_entry, level_node)
else:
from .translation import extract_translation_template

for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
extract_linkage_list_item(wxr, word_entry, list_item, linkage_type)

for t_node in level_node.find_child(NodeKind.TEMPLATE):
extract_linkage_template(wxr, word_entry, t_node)
if t_node.template_name == "외국어":
extract_translation_template(wxr, word_entry, t_node)


def extract_linkage_list_item(
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/ko/models.py
@@ -19,6 +19,8 @@ class Example(KoreanBaseModel):
default=[], description="Japanese Kanji and furigana"
)
tags: list[str] = []
literal_meaning: str = ""
note: str = ""


class Sense(KoreanBaseModel):
@@ -44,6 +46,7 @@ class Sound(KoreanBaseModel):
raw_tags: list[str] = []
hangul: str = ""
roman: str = ""
other: str = ""


class Linkage(KoreanBaseModel):
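
A small illustration (not part of the diff, assuming the package imports as `wiktextract`) of how the extractors in this PR are expected to fill the new model fields; the sample values are taken from the tests below:

```python
from wiktextract.extractor.ko.models import Example, Sound

# literal_meaning and note are new on Example; other is new on Sound.
example = Example(
    text="그녀는 없는 가정에서 자랐다.",
    note="매우 가난하게 살았다.",  # from the ux template's footer= argument
)
sound = Sound(
    other="とーざい",  # kana rendering from the ja-pron "Jpan" span
    roman="[tóꜜòzàì]",
    raw_tags=["도쿄"],
)
```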
10 changes: 9 additions & 1 deletion src/wiktextract/extractor/ko/page.py
@@ -10,7 +10,11 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .sound import (
SOUND_TEMPLATES,
extract_sound_section,
extract_sound_template,
)
from .translation import extract_translation_section


@@ -42,6 +46,10 @@ def parse_section(
)
elif title_text == "번역" and len(page_data) > 0:
extract_translation_section(wxr, page_data[-1], level_node)
elif title_text == "발음":
extract_sound_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/ko/pos.py
@@ -65,7 +65,7 @@ def extract_gloss_list_item(
sense = Sense()
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.startswith(":"):
if ":" in node.sarg:
for e_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, e_list_item, word_entry.lang_code
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/section_titles.py
@@ -11,6 +11,7 @@
"의미": {"pos": "unknown"},
"타동사": {"pos": "verb", "tags": ["transitive"]},
"종별사": {"pos": "counter"},
"감탄사": {"pos": "intj"},
}

LINKAGE_SECTIONS = {
40 changes: 38 additions & 2 deletions src/wiktextract/extractor/ko/sound.py
@@ -1,11 +1,18 @@
from wikitextprocessor import NodeKind, TemplateNode
from wikitextprocessor import LevelNode, NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry

SOUND_TEMPLATES = frozenset(["발음 듣기", "IPA", "ko-IPA"])
SOUND_TEMPLATES = frozenset(["발음 듣기", "IPA", "ko-IPA", "ja-pron"])


def extract_sound_section(
wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
extract_sound_template(wxr, word_entry, t_node)


def extract_sound_template(
@@ -17,6 +24,8 @@ def extract_sound_template(
extract_ipa_template(wxr, word_entry, node)
elif node.template_name == "ko-IPA":
extract_ko_ipa_template(wxr, word_entry, node)
elif node.template_name == "ja-pron":
extract_ja_pron_template(wxr, word_entry, node)


def extract_listen_pronunciation_template(
@@ -93,3 +102,30 @@ def extract_ko_ipa_template(

for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, word_entry, link_node)


def extract_ja_pron_template(
wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
# https://ko.wiktionary.org/wiki/틀:ja-pron
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
)
for ul_tag in expanded_node.find_html("ul"):
for li_tag in ul_tag.find_html("li"):
sound = Sound()
for span_tag in li_tag.find_html("span"):
span_class = span_tag.attrs.get("class", "")
if span_class == "usage-label-accent":
sound.raw_tags.append(
clean_node(wxr, None, span_tag).strip("()")
)
elif span_class == "Jpan":
sound.other = clean_node(wxr, None, span_tag)
elif span_class == "Latn":
sound.roman = clean_node(wxr, None, span_tag)
elif span_class == "IPA":
sound.ipa = clean_node(wxr, None, span_tag)
if sound.ipa != "" or sound.roman != "":
word_entry.sounds.append(sound)
clean_node(wxr, word_entry, expanded_node)
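
Again for reference only (not part of the diff), the span classes that `extract_ja_pron_template` reads from the expanded `틀:ja-pron` HTML and the `Sound` fields they populate; the dict name is illustrative:

```python
# Illustrative summary of the span-class handling in extract_ja_pron_template.
JA_PRON_SPAN_CLASS_TO_FIELD = {
    "usage-label-accent": "raw_tags",  # e.g. "도쿄", surrounding parentheses stripped
    "Jpan": "other",                   # kana rendering, e.g. "とーざい"
    "Latn": "roman",                   # e.g. "[tóꜜòzàì]"
    "IPA": "ipa",                      # e.g. "[to̞ːza̠i]"
}
# A Sound entry is appended only when its ipa or roman field is non-empty.
```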
81 changes: 81 additions & 0 deletions tests/test_ko_example.py
@@ -95,3 +95,84 @@ def test_ref_quote_template(self):
"ref": "3세기, 진수,《삼국지》, 〈권30 위서 오환선비동이전 (魏書 烏丸鮮卑東夷傳)〉",
},
)

def test_ja_ux_template(self):
self.wxr.wtp.add_page(
"틀:예문",
10,
"""<div style="font-size: 120%"><span lang="ja" class="Jpan">'''<ruby>東西<rp>(</rp><rt>とうざい</rt><rp>)</rp></ruby>'''に<ruby>走<rp>(</rp><rt>はし</rt><rp>)</rp></ruby>る<ruby>道<rp>(</rp><rt>どう</rt><rp>)</rp></ruby><ruby>路<rp>(</rp><rt>ろ</rt><rp>)</rp></ruby></span></div><dl><dd><i><span class="tr">'''tōzai''' ni hashiru dōro</span></i></dd><dd>'''동서'''로 달리는 도로</dd><dd>(literally, “lit”)</dd></dl>[[분류:일본어 용례가 포함된 낱말|東西]]""",
)
data = parse_page(
self.wxr,
"東西",
"""== 일본어 ==
=== 명사 ===
# [[동서]] ([[동쪽]]과 [[서쪽]])
#: {{예문|ja|'''東西'''に走る道%路|'''とうざい''' に はしる どう%ろ|'''동서'''로 달리는 도로}}""",
)
self.assertEqual(
data[0]["senses"][0]["examples"][0],
{
"text": "東西に走る道路",
"ruby": [
("東西", "とうざい"),
("走", "はし"),
("道", "どう"),
("路", "ろ"),
],
"roman": "tōzai ni hashiru dōro",
"translation": "동서로 달리는 도로",
},
)
self.assertEqual(
data[0]["senses"][0]["categories"], ["일본어 용례가 포함된 낱말"]
)

def test_ko_ux_template(self):
self.wxr.wtp.add_page(
"틀:예문",
10,
"""<div class="h-usage-example"><span class="None" lang="ko"><span style="font-size: 120%25">그녀는 '''없는''' 가정에서 자랐다.</span></span><dl><dd><span class="e-footer">매우 가난하게 살았다.</span></dd></dl></div>[[Category:한국어 용례가 포함된 낱말|없다]][[Category:한국어 용례가 포함된 낱말|없다|없다]]""",
)
data = parse_page(
self.wxr,
"없다",
"""== 한국어 ==
=== 형용사 ===
# 궁핍하다.
#:{{예문|ko|그녀는 '''없는''' 가정에서 자랐다.|footer= 매우 가난하게 살았다.}}""",
)
self.assertEqual(
data[0]["senses"][0]["examples"][0],
{
"text": "그녀는 없는 가정에서 자랐다.",
"note": "매우 가난하게 살았다.",
},
)
self.assertEqual(
data[0]["senses"][0]["categories"], ["한국어 용례가 포함된 낱말"]
)

def test_jibong_yuseol_template(self):
self.wxr.wtp.add_page(
"틀:지봉유설",
10,
"""'''1614년''', [[:w:이수광|이수광]], 《[[:s:지봉유설|지봉유설]]》, 〈[[:s:지봉유설/2권|2권 外國 條]]〉""",
)
data = parse_page(
self.wxr,
"없다",
"""== 중국어 ==
====명사====
# [[동서]].
#: {{지봉유설|2|2권 外國 條}}
#:: {{lang|zh|'''東西'''六十日程}} 동서로 60일이 걸리는 거리이다.""",
)
self.assertEqual(
data[0]["senses"][0]["examples"][0],
{
"text": "東西六十日程",
"translation": "동서로 60일이 걸리는 거리이다.",
"ref": "1614년, 이수광, 《지봉유설》, 〈2권 外國 條〉",
},
)
34 changes: 34 additions & 0 deletions tests/test_ko_sound.py
@@ -75,3 +75,37 @@ def test_ko_ipa_template(self):
self.assertEqual(
data[0]["categories"], ["한국어 IPA 발음이 포함된 낱말"]
)

def test_ja_pron(self):
self.wxr.wtp.add_page(
"틀:ja-pron",
10,
"""<ul><li><span class="usage-label-accent"><span class="ib-brac">(</span><span class="ib-content">[[w:도쿄 방언|도쿄]]</span><span class="ib-brac">)</span></span> <span lang="ja" class="Jpan"><span>と<span></span></span>ーざい</span></span> <span class="Latn"><samp>[tóꜜòzàì]</samp></span> ([[頭高型|두고형]] – [1])</li><li>[[w:국제 음성 기호|IPA]]<sup>([[부록:일본어 발음|표기]])</sup>:&#32;<span class="IPA">[to̞ːza̠i]</span>[[Category:일본어 IPA 발음이 포함된 낱말|とうざい]][[Category:일본어 중복되지 않는 수동 정렬 키를 포함하는 낱말|東西]]</li></ul>""",
)
data = parse_page(
self.wxr,
"東西",
"""== 일본어 ==
=== 발음 ===
* {{ja-pron|とうざい|acc=1|acc_ref=DJR,NHK}}
=== 명사 ===
# [[동서]] ([[동쪽]]과 [[서쪽]])""",
)
self.assertEqual(
data[0]["sounds"],
[
{
"roman": "[tóꜜòzàì]",
"other": "とーざい",
"raw_tags": ["도쿄"],
},
{"ipa": "[to̞ːza̠i]"},
],
)
self.assertEqual(
data[0]["categories"],
[
"일본어 IPA 발음이 포함된 낱말",
"일본어 중복되지 않는 수동 정렬 키를 포함하는 낱말",
],
)