Merge pull request #877 from xxyzz/ko
[ko] extract more gloss and linkage data
xxyzz authored Oct 18, 2024
2 parents 3e4b018 + 836c41c commit 0a6aa5f
Showing 6 changed files with 111 additions and 15 deletions.
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/ko/example.py
@@ -22,7 +22,9 @@ def extract_example_list_item(
elif isinstance(node, TemplateNode) and node.template_name.startswith(
("따옴", "지봉유설")
):
example.ref = clean_node(wxr, None, node).strip("() ")
example.ref = (
clean_node(wxr, None, node).strip("() ").removeprefix("따옴◄")
)
elif isinstance(node, TemplateNode) and node.template_name in [
"예문",
"ux",
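The example.py change above keeps the existing ref clean-up and additionally drops the text label that the 따옴 citation template prepends to its expanded text. A minimal sketch of the string handling, using a hypothetical clean_node() result as input:

raw = "(따옴◄홍길동, 《어떤 책》, 2001)"  # hypothetical clean_node() output for a 따옴 template
ref = raw.strip("() ").removeprefix("따옴◄")
print(ref)  # 홍길동, 《어떤 책》, 2001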
36 changes: 25 additions & 11 deletions src/wiktextract/extractor/ko/linkage.py
@@ -1,3 +1,5 @@
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
@@ -57,23 +59,35 @@ def extract_linkage_list_item(
list_item: WikiNode,
linkage_type: str,
) -> None:
raw_tag = ""
is_roman = False
for child in list_item.children:
if isinstance(child, str) and ":" in child:
l_type_str = child[: child.index(":")].strip()
if l_type_str in LINKAGE_SECTIONS:
linkage_type = LINKAGE_SECTIONS[l_type_str]
if isinstance(child, str):
if ":" in child:
l_type_str = child[: child.index(":")].strip()
if l_type_str in LINKAGE_SECTIONS:
linkage_type = LINKAGE_SECTIONS[l_type_str]
else:
m = re.search(r"\(([^()]+)\)", child)
if m is not None:
raw_tag = m.group(1).strip()
is_roman = re.search(r"[a-z]", raw_tag) is not None

for link_node in list_item.find_child(NodeKind.LINK):
word = clean_node(wxr, None, link_node)
if word != "":
getattr(word_entry, linkage_type).append(
Linkage(
word=word,
sense=word_entry.senses[-1].glosses[-1]
if len(word_entry.senses) > 0
else "",
)
linkage = Linkage(
word=word,
sense=word_entry.senses[-1].glosses[-1]
if len(word_entry.senses) > 0
else "",
)
if len(raw_tag) > 0:
if is_roman:
linkage.roman = raw_tag
else:
linkage.raw_tags.append(raw_tag)
getattr(word_entry, linkage_type).append(linkage)


def extract_proverb_section(
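The linkage.py change above also reads a parenthesized annotation out of the plain-text part of a linkage list item and uses an ASCII-letter check to decide whether it is a romanization (stored in Linkage.roman) or an ordinary raw tag (appended to Linkage.raw_tags). A minimal sketch of that decision on a hypothetical string child, modeled on the 土產/土产 test at the end of this commit:

import re

child = "(tǔchǎn)"  # hypothetical string node following the [[土产]] link
m = re.search(r"\(([^()]+)\)", child)
if m is not None:
    raw_tag = m.group(1).strip()                         # "tǔchǎn"
    is_roman = re.search(r"[a-z]", raw_tag) is not None  # True, so it is stored as roman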
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/ko/models.py
@@ -23,6 +23,10 @@ class Example(KoreanBaseModel):
note: str = ""


class AltForm(KoreanBaseModel):
word: str


class Sense(KoreanBaseModel):
glosses: list[str] = []
tags: list[str] = []
@@ -31,6 +35,7 @@ class Sense(KoreanBaseModel):
categories: list[str] = []
examples: list[Example] = []
note: str = ""
form_of: list[AltForm] = []


class Sound(KoreanBaseModel):
@@ -52,6 +57,9 @@ class Sound(KoreanBaseModel):
class Linkage(KoreanBaseModel):
word: str
sense: str = ""
roman: str = ""
raw_tags: list[str] = []
tags: list[str] = []


class Translation(KoreanBaseModel):
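The model additions mirror the extractor changes: Sense gains a form_of list of AltForm entries, and Linkage gains roman, raw_tags and tags. A sketch of the resulting data shapes; the field values come from the tests below, the construction itself is only illustrative:

Sense(glosses=["전화 (“전화기로 말을 주고받는 일”)의 한자 형태."], tags=["form-of"], form_of=[AltForm(word="전화")])
Linkage(word="土產", sense="흙, 땅", roman="tǔchǎn")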
20 changes: 18 additions & 2 deletions src/wiktextract/extractor/ko/pos.py
@@ -10,7 +10,7 @@
extract_linkage_list_item,
extract_linkage_template,
)
from .models import Sense, WordEntry
from .models import AltForm, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .translation import extract_translation_template
@@ -78,6 +78,11 @@ def extract_gloss_list_item(
for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_unorderd_list_item(wxr, word_entry, nested_list_item)
continue
elif isinstance(node, TemplateNode) and node.template_name.endswith(
" of"
):
extract_form_of_template(wxr, sense, node)
gloss_nodes.append(node)
else:
gloss_nodes.append(node)

@@ -100,7 +105,7 @@ def extract_unorderd_list_item(
# `* '''1.''' gloss text`, terrible obsolete layout
is_first_bold = False
bold_text = clean_node(wxr, None, node)
if re.fullmatch(r"\d+\.?", bold_text):
if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
new_list_item.children = list_item.children[index + 1 :]
extract_gloss_list_item(wxr, word_entry, new_list_item)
@@ -136,3 +141,14 @@ def extract_unorderd_list_item(
extract_example_list_item(
wxr, word_entry.senses[-1], list_item, word_entry.lang_code
)


def extract_form_of_template(
wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
if "form-of" not in sense.tags:
sense.tags.append("form-of")
word_arg = 1 if t_node.template_name == "ko-hanja form of" else 2
word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
if len(word) > 0:
sense.form_of.append(AltForm(word=word))
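Besides the new handling of " of" templates, the bold gloss-index pattern in extract_unorderd_list_item is widened so that sub-numbered markers such as "1-1." in the obsolete "* '''1-1.''' gloss" layout are recognized. A quick sketch of what the new regex accepts (sample strings are made up):

import re

for bold_text in ("1.", "1-1.", "12", "1-"):
    print(bold_text, bool(re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text)))
# 1.    True
# 1-1.  True
# 12    True
# 1-    False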
34 changes: 34 additions & 0 deletions tests/test_ko_gloss.py
@@ -31,6 +31,16 @@ def test_ignore_gloss_index_bold_node(self):
* '''1.''' [[우리]].""",
)
self.assertEqual(data[0]["senses"], [{"glosses": ["우리."]}])
data = parse_page(
self.wxr,
"보다",
"""== 한국어 ==
=== 동사 ===
* '''1-1.''' 눈으로 무엇을 알아차리다.""",
)
self.assertEqual(
data[0]["senses"], [{"glosses": ["눈으로 무엇을 알아차리다."]}]
)

def test_no_pos_section(self):
data = parse_page(
@@ -81,3 +91,27 @@ def test_note_list(self):
data[0]["senses"][0]["note"],
"특정 업계에서는 'ea'란 표현을 쓰기도 한다.",
)

def test_form_of_template(self):
self.wxr.wtp.add_page(
"틀:ko-hanja form of",
10,
"""<span class="form-of-definition"><i class="None mention" lang="ko">[[전화#한국어|전화]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-gloss-double-quote">“</span><span class="mention-gloss">전화기로 말을 주고받는 일</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span>의 [[한자#한국어|한자]] 형태.</span>""",
)

data = parse_page(
self.wxr,
"電話",
"""== 한국어 ==
=== 명사 ===
# {{ko-hanja form of|전화|전화기로 말을 주고받는 일}}""",
)
self.assertEqual(
data[0]["senses"][0],
{
# "categories": ["한국어 비표준 문자가 포함된 낱말 (링크)"],
"form_of": [{"word": "전화"}],
"tags": ["form-of"],
"glosses": ["전화 (“전화기로 말을 주고받는 일”)의 한자 형태."],
},
)
24 changes: 23 additions & 1 deletion tests/test_ko_linkage.py
@@ -84,5 +84,27 @@ def test_colon_linkage_list(self):
)
self.assertEqual(
data[0]["senses"][0]["examples"],
[{"text": "한글은 창제 당시 총 28개의 자모가 있었지만 지금은 24개만 사용한다."}]
[
{
"text": "한글은 창제 당시 총 28개의 자모가 있었지만 지금은 24개만 사용한다."
}
],
)

def test_zh_pinyin(self):
data = parse_page(
self.wxr,
"土",
"""== 중국어 ==
=== 명사 ===
* '''1.''' 흙, 땅
=== 합성어 ===
:*[[土產]]/[[土产 ]](tǔchǎn)""",
)
self.assertEqual(
data[0]["derived"],
[
{"word": "土產", "roman": "tǔchǎn", "sense": "흙, 땅"},
{"word": "土产", "roman": "tǔchǎn", "sense": "흙, 땅"},
],
)
