Skip to content

Commit

Permalink
Merge pull request #878 from xxyzz/ko
Browse files Browse the repository at this point in the history
[ko] extract gloss label template
  • Loading branch information
xxyzz authored Oct 18, 2024
2 parents 0a6aa5f + 450b6b7 commit eaa6b66
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -1521,7 +1521,7 @@ def repl_1_syntaxhighlight(m: re.Match) -> str:
# XXX "Category" -> config variable for portability
category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) # type: ignore[typeddict-item]
# Fail if we received empty dict from .get()
category_ns_names = {category_ns_data["name"]} | set(
category_ns_names = {"Category", category_ns_data["name"]} | set(
category_ns_data["aliases"]
)
category_names_pattern = rf"(?:{'|'.join(category_ns_names)})"
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def extract_example_list_item(
elif isinstance(node, TemplateNode) and node.template_name in [
"예문",
"ux",
"uxi",
]:
extract_ux_template(wxr, sense, example, node)
break
Expand Down
11 changes: 11 additions & 0 deletions src/wiktextract/extractor/ko/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .models import AltForm, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
from .translation import extract_translation_template


Expand Down Expand Up @@ -83,12 +84,22 @@ def extract_gloss_list_item(
):
extract_form_of_template(wxr, sense, node)
gloss_nodes.append(node)
elif isinstance(node, TemplateNode) and node.template_name == "라벨":
sense.raw_tags.extend(
[
raw_tag.strip()
for raw_tag in clean_node(wxr, sense, node)
.strip("()")
.split(",")
]
)
else:
gloss_nodes.append(node)

gloss_text = clean_node(wxr, sense, gloss_nodes)
if len(gloss_text) > 0:
sense.glosses.append(gloss_text)
translate_raw_tags(sense)
word_entry.senses.append(sense)


Expand Down
26 changes: 26 additions & 0 deletions src/wiktextract/extractor/ko/tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from .models import WordEntry

# Lookup tables consulted by translate_raw_tags() to turn Korean label text
# into English tags and topics.
# Presumably the label names are taken from these ko.wiktionary modules:
# https://ko.wiktionary.org/wiki/모듈:labels/data/topical
# https://ko.wiktionary.org/wiki/모듈:labels/data

# Korean gloss label -> English tag.
GLOSS_TAGS = {
"고어": "archaic",
"자동사": "intransitive",
}

# Every label -> tag mapping that translate_raw_tags() checks first; at the
# moment this is only a shallow copy of GLOSS_TAGS.
TAGS = {**GLOSS_TAGS}

# Korean label -> English topic; only consulted for labels not found in TAGS.
TOPICS = {
"금융": "finance",
}


def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised labels from ``data.raw_tags`` into tags/topics.

    Each entry of ``data.raw_tags`` is looked up in ``TAGS`` first and then
    in ``TOPICS``.  A hit appends the mapped English value to ``data.tags``
    or ``data.topics`` respectively; labels found in neither table are kept,
    in their original order, as the new ``data.raw_tags``.

    The object is modified in place and nothing is returned.

    NOTE(review): the parameter is annotated as ``WordEntry``, but the gloss
    extractor in pos.py passes a ``Sense`` object -- confirm whether the
    annotation should be widened.
    """
    unrecognised = []
    for label in data.raw_tags:
        if label in TAGS:
            data.tags.append(TAGS[label])
            continue
        if label in TOPICS:
            data.topics.append(TOPICS[label])
            continue
        # Unknown label: keep it so that no information is lost.
        unrecognised.append(label)
    data.raw_tags = unrecognised
27 changes: 24 additions & 3 deletions tests/test_ko_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,8 @@ def test_form_of_template(self):
self.wxr.wtp.add_page(
"틀:ko-hanja form of",
10,
"""<span class="form-of-definition"><i class="None mention" lang="ko">[[전화#한국어|전화]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-gloss-double-quote">“</span><span class="mention-gloss">전화기로 말을 주고받는 일</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span>의 [[한자#한국어|한자]] 형태.</span>""",
"""<span class="form-of-definition"><i class="None mention" lang="ko">[[전화#한국어|전화]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-gloss-double-quote">“</span><span class="mention-gloss">전화기로 말을 주고받는 일</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span>[[Category:한국어 비표준 문자가 포함된 낱말 (링크)|電話]]의 [[한자#한국어|한자]] 형태.</span>""",
)

data = parse_page(
self.wxr,
"電話",
Expand All @@ -109,9 +108,31 @@ def test_form_of_template(self):
self.assertEqual(
data[0]["senses"][0],
{
# "categories": ["한국어 비표준 문자가 포함된 낱말 (링크)"],
"categories": ["한국어 비표준 문자가 포함된 낱말 (링크)"],
"form_of": [{"word": "전화"}],
"tags": ["form-of"],
"glosses": ["전화 (“전화기로 말을 주고받는 일”)의 한자 형태."],
},
)

def test_label_template(self):
    # The "라벨" (label) template must not leak into the gloss text: its
    # comma-separated labels become tags and its category links become
    # sense categories.
    label_html = """<span class="usage-label-sense"><span class="ib-brac">(</span><span class="ib-content">[[부록:용어사전#잘 쓰이지 않는 표현과 그 정도|고어]][[Category:한국어 고어|열다]]<span class="ib-comma">,</span>&#32;[[부록:용어사전#자동사|자동사]][[Category:한국어 자동사|열다]]</span><span class="ib-brac">)</span></span>"""
    # Register the expanded template body (namespace id 10) before parsing.
    self.wxr.wtp.add_page("틀:라벨", 10, label_html)

    page_text = """== 한국어 ==
=== 명사 ===
# {{라벨|ko|고어|자동사}} [[열매가 맺히다]]"""
    data = parse_page(self.wxr, "열다", page_text)

    expected_sense = {
        "categories": ["한국어 고어", "한국어 자동사"],
        "tags": ["archaic", "intransitive"],
        "glosses": ["열매가 맺히다"],
    }
    self.assertEqual(data[0]["senses"][0], expected_sense)

0 comments on commit eaa6b66

Please sign in to comment.