Merge pull request #877 from xxyzz/ko
[ko] extract more gloss and linkage data
xxyzz authored Oct 18, 2024
2 parents 3e4b018 + 836c41c commit 0a6aa5f
Showing 6 changed files with 111 additions and 15 deletions.
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/ko/example.py
@@ -22,7 +22,9 @@ def extract_example_list_item(
elif isinstance(node, TemplateNode) and node.template_name.startswith(
("따옴", "지봉유설")
):
example.ref = clean_node(wxr, None, node).strip("() ")
example.ref = (
clean_node(wxr, None, node).strip("() ").removeprefix("따옴◄")
)
elif isinstance(node, TemplateNode) and node.template_name in [
"예문",
"ux",
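The example.py change above keeps the existing ref clean-up and additionally drops the text label that the 따옴 citation template prepends to its expanded text. A minimal sketch of the string handling, using a hypothetical clean_node() result as input:

raw = "(따옴◄홍길동, 《어떤 책》, 2001)"  # hypothetical clean_node() output for a 따옴 template
ref = raw.strip("() ").removeprefix("따옴◄")
print(ref)  # 홍길동, 《어떤 책》, 2001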
36 changes: 25 additions & 11 deletions src/wiktextract/extractor/ko/linkage.py
@@ -1,3 +1,5 @@
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
@@ -57,23 +59,35 @@ def extract_linkage_list_item(
list_item: WikiNode,
linkage_type: str,
) -> None:
raw_tag = ""
is_roman = False
for child in list_item.children:
if isinstance(child, str) and ":" in child:
l_type_str = child[: child.index(":")].strip()
if l_type_str in LINKAGE_SECTIONS:
linkage_type = LINKAGE_SECTIONS[l_type_str]
if isinstance(child, str):
if ":" in child:
l_type_str = child[: child.index(":")].strip()
if l_type_str in LINKAGE_SECTIONS:
linkage_type = LINKAGE_SECTIONS[l_type_str]
else:
m = re.search(r"\(([^()]+)\)", child)
if m is not None:
raw_tag = m.group(1).strip()
is_roman = re.search(r"[a-z]", raw_tag) is not None

for link_node in list_item.find_child(NodeKind.LINK):
word = clean_node(wxr, None, link_node)
if word != "":
getattr(word_entry, linkage_type).append(
Linkage(
word=word,
sense=word_entry.senses[-1].glosses[-1]
if len(word_entry.senses) > 0
else "",
)
linkage = Linkage(
word=word,
sense=word_entry.senses[-1].glosses[-1]
if len(word_entry.senses) > 0
else "",
)
if len(raw_tag) > 0:
if is_roman:
linkage.roman = raw_tag
else:
linkage.raw_tags.append(raw_tag)
getattr(word_entry, linkage_type).append(linkage)


def extract_proverb_section(
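The linkage.py change above also reads a parenthesized annotation out of the plain-text part of a linkage list item and uses an ASCII-letter check to decide whether it is a romanization (stored in Linkage.roman) or an ordinary raw tag (appended to Linkage.raw_tags). A minimal sketch of that decision on a hypothetical string child, modeled on the 土產/土产 test at the end of this commit:

import re

child = "(tǔchǎn)"  # hypothetical string node following the [[土产]] link
m = re.search(r"\(([^()]+)\)", child)
if m is not None:
    raw_tag = m.group(1).strip()                         # "tǔchǎn"
    is_roman = re.search(r"[a-z]", raw_tag) is not None  # True, so it is stored as roman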
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/ko/models.py
@@ -23,6 +23,10 @@ class Example(KoreanBaseModel):
note: str = ""


class AltForm(KoreanBaseModel):
word: str


class Sense(KoreanBaseModel):
glosses: list[str] = []
tags: list[str] = []
@@ -31,6 +35,7 @@ class Sense(KoreanBaseModel):
categories: list[str] = []
examples: list[Example] = []
note: str = ""
form_of: list[AltForm] = []


class Sound(KoreanBaseModel):
@@ -52,6 +57,9 @@ class Sound(KoreanBaseModel):
class Linkage(KoreanBaseModel):
word: str
sense: str = ""
roman: str = ""
raw_tags: list[str] = []
tags: list[str] = []


class Translation(KoreanBaseModel):
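The model additions mirror the extractor changes: Sense gains a form_of list of AltForm entries, and Linkage gains roman, raw_tags and tags. A sketch of the resulting data shapes; the field values come from the tests below, the construction itself is only illustrative:

Sense(glosses=["전화 (“전화기로 말을 주고받는 일”)의 한자 형태."], tags=["form-of"], form_of=[AltForm(word="전화")])
Linkage(word="土產", sense="흙, 땅", roman="tǔchǎn")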
20 changes: 18 additions & 2 deletions src/wiktextract/extractor/ko/pos.py
@@ -10,7 +10,7 @@
extract_linkage_list_item,
extract_linkage_template,
)
from .models import Sense, WordEntry
from .models import AltForm, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .translation import extract_translation_template
@@ -78,6 +78,11 @@ def extract_gloss_list_item(
for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_unorderd_list_item(wxr, word_entry, nested_list_item)
continue
elif isinstance(node, TemplateNode) and node.template_name.endswith(
" of"
):
extract_form_of_template(wxr, sense, node)
gloss_nodes.append(node)
else:
gloss_nodes.append(node)

@@ -100,7 +105,7 @@ def extract_unorderd_list_item(
# `* '''1.''' gloss text`, terrible obsolete layout
is_first_bold = False
bold_text = clean_node(wxr, None, node)
if re.fullmatch(r"\d+\.?", bold_text):
if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
new_list_item.children = list_item.children[index + 1 :]
extract_gloss_list_item(wxr, word_entry, new_list_item)
@@ -136,3 +141,14 @@ def extract_unorderd_list_item(
extract_example_list_item(
wxr, word_entry.senses[-1], list_item, word_entry.lang_code
)


def extract_form_of_template(
wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
if "form-of" not in sense.tags:
sense.tags.append("form-of")
word_arg = 1 if t_node.template_name == "ko-hanja form of" else 2
word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
if len(word) > 0:
sense.form_of.append(AltForm(word=word))
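Besides the new handling of " of" templates, the bold gloss-index pattern in extract_unorderd_list_item is widened so that sub-numbered markers such as "1-1." in the obsolete "* '''1-1.''' gloss" layout are recognized. A quick sketch of what the new regex accepts (sample strings are made up):

import re

for bold_text in ("1.", "1-1.", "12", "1-"):
    print(bold_text, bool(re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text)))
# 1.    True
# 1-1.  True
# 12    True
# 1-    False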
34 changes: 34 additions & 0 deletions tests/test_ko_gloss.py
@@ -31,6 +31,16 @@ def test_ignore_gloss_index_bold_node(self):
* '''1.''' [[우리]].""",
)
self.assertEqual(data[0]["senses"], [{"glosses": ["우리."]}])
data = parse_page(
self.wxr,
"보다",
"""== 한국어 ==
=== 동사 ===
* '''1-1.''' 눈으로 무엇을 알아차리다.""",
)
self.assertEqual(
data[0]["senses"], [{"glosses": ["눈으로 무엇을 알아차리다."]}]
)

def test_no_pos_section(self):
data = parse_page(
@@ -81,3 +91,27 @@ def test_note_list(self):
data[0]["senses"][0]["note"],
"특정 업계에서는 'ea'란 표현을 쓰기도 한다.",
)

def test_form_of_template(self):
self.wxr.wtp.add_page(
"틀:ko-hanja form of",
10,
"""<span class="form-of-definition"><i class="None mention" lang="ko">[[전화#한국어|전화]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-gloss-double-quote">“</span><span class="mention-gloss">전화기로 말을 주고받는 일</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span>의 [[한자#한국어|한자]] 형태.</span>""",
)

data = parse_page(
self.wxr,
"電話",
"""== 한국어 ==
=== 명사 ===
# {{ko-hanja form of|전화|전화기로 말을 주고받는 일}}""",
)
self.assertEqual(
data[0]["senses"][0],
{
# "categories": ["한국어 비표준 문자가 포함된 낱말 (링크)"],
"form_of": [{"word": "전화"}],
"tags": ["form-of"],
"glosses": ["전화 (“전화기로 말을 주고받는 일”)의 한자 형태."],
},
)
24 changes: 23 additions & 1 deletion tests/test_ko_linkage.py
@@ -84,5 +84,27 @@ def test_colon_linkage_list(self):
)
self.assertEqual(
data[0]["senses"][0]["examples"],
[{"text": "한글은 창제 당시 총 28개의 자모가 있었지만 지금은 24개만 사용한다."}]
[
{
"text": "한글은 창제 당시 총 28개의 자모가 있었지만 지금은 24개만 사용한다."
}
],
)

def test_zh_pinyin(self):
data = parse_page(
self.wxr,
"土",
"""== 중국어 ==
=== 명사 ===
* '''1.''' 흙, 땅
=== 합성어 ===
:*[[土產]]/[[土产 ]](tǔchǎn)""",
)
self.assertEqual(
data[0]["derived"],
[
{"word": "土產", "roman": "tǔchǎn", "sense": "흙, 땅"},
{"word": "土产", "roman": "tǔchǎn", "sense": "흙, 땅"},
],
)
