[ko] extract more translation, pronunciation, example data #861

Merged · 4 commits · Oct 10, 2024
66 changes: 62 additions & 4 deletions src/wiktextract/extractor/ko/example.py
@@ -1,4 +1,4 @@
from wikitextprocessor import TemplateNode, WikiNode
from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
@@ -7,20 +7,32 @@


def extract_example_list_item(
wxr: WiktextractContext, sense: Sense, list_item: WikiNode, lang_code: str
wxr: WiktextractContext,
sense: Sense,
list_item: WikiNode,
lang_code: str,
parent_example: Example | None = None,
) -> None:
example = Example()
example = Example() if parent_example is None else parent_example
after_lang_template = False
for node in list_item.children:
if isinstance(node, TemplateNode) and node.template_name == "lang":
after_lang_template = True
extract_example_lang_template(wxr, example, node, lang_code)
elif isinstance(node, TemplateNode) and node.template_name.startswith(
"따옴"
("따옴", "지봉유설")
):
example.ref = clean_node(wxr, None, node).strip("() ")
elif isinstance(node, TemplateNode) and node.template_name in [
"예문",
"ux",
]:
extract_ux_template(wxr, sense, example, node)
break
elif after_lang_template:
example.translation += clean_node(wxr, None, node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
break
else:
example.text += clean_node(wxr, None, node)

@@ -38,6 +50,12 @@ def extract_example_list_item(
else:
sense.examples.append(example)

for nested_list in list_item.find_child(NodeKind.LIST):
for nested_list_item in nested_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, nested_list_item, lang_code, example
)


def extract_example_lang_template(
wxr: WiktextractContext,
@@ -66,3 +84,43 @@ def extract_example_lang_template(
roman_start_index = example.text.index("(")
example.roman = example.text[roman_start_index:].strip("() ")
example.text = example.text[:roman_start_index].strip()


def extract_ux_template(
wxr: WiktextractContext,
sense: Sense,
example: Example,
t_node: TemplateNode,
) -> None:
# https://ko.wiktionary.org/wiki/틀:ux
lang_code = t_node.template_parameters.get(1, "")
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
if lang_code == "ja":
for span_tag in expanded_node.find_html_recursively("span"):
span_class = span_tag.attrs.get("class", "")
if span_class == "Jpan":
example.ruby, no_ruby = extract_ruby(wxr, span_tag)
example.text = clean_node(wxr, None, no_ruby)
elif span_class == "tr":
example.roman = clean_node(wxr, None, span_tag)
example.translation = clean_node(
wxr, None, t_node.template_parameters.get(4, "")
)
example.literal_meaning = clean_node(
wxr, None, t_node.template_parameters.get("lit", "")
)
else:
example.text = clean_node(
wxr, None, t_node.template_parameters.get(2, "")
)
example.translation = clean_node(
wxr, None, t_node.template_parameters.get(3, "")
)
example.note = clean_node(
wxr, None, t_node.template_parameters.get("footer", "")
)

for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, sense, link_node)
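
For quick reference (not part of the diff), a minimal sketch of how the new `extract_ux_template` above distributes `틀:예문`/`틀:ux` arguments across `Example` fields; the parameters are the ones read in the code, and the dict name is only illustrative:

```python
# Illustrative summary of the branches in extract_ux_template (hypothetical name).
# For lang_code == "ja", text/ruby/roman come from the expanded HTML spans
# ("Jpan" and "tr"); other languages read plain template arguments.
UX_ARG_TO_EXAMPLE_FIELD = {
    "ja": {4: "translation", "lit": "literal_meaning"},
    "default": {2: "text", 3: "translation", "footer": "note"},
}
```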
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/ko/linkage.py
@@ -33,11 +33,15 @@ def extract_linkage_section(
if linkage_type == "proverbs":
extract_proverb_section(wxr, word_entry, level_node)
else:
from .translation import extract_translation_template

for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
extract_linkage_list_item(wxr, word_entry, list_item, linkage_type)

for t_node in level_node.find_child(NodeKind.TEMPLATE):
extract_linkage_template(wxr, word_entry, t_node)
if t_node.template_name == "외국어":
extract_translation_template(wxr, word_entry, t_node)


def extract_linkage_list_item(
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/ko/models.py
@@ -19,6 +19,8 @@ class Example(KoreanBaseModel):
default=[], description="Japanese Kanji and furigana"
)
tags: list[str] = []
literal_meaning: str = ""
note: str = ""


class Sense(KoreanBaseModel):
@@ -44,6 +46,7 @@ class Sound(KoreanBaseModel):
raw_tags: list[str] = []
hangul: str = ""
roman: str = ""
other: str = ""


class Linkage(KoreanBaseModel):
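
A small illustration (not part of the diff, assuming the package imports as `wiktextract`) of how the extractors in this PR are expected to fill the new model fields; the sample values are taken from the tests below:

```python
from wiktextract.extractor.ko.models import Example, Sound

# literal_meaning and note are new on Example; other is new on Sound.
example = Example(
    text="그녀는 없는 가정에서 자랐다.",
    note="매우 가난하게 살았다.",  # from the ux template's footer= argument
)
sound = Sound(
    other="とーざい",  # kana rendering from the ja-pron "Jpan" span
    roman="[tóꜜòzàì]",
    raw_tags=["도쿄"],
)
```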
10 changes: 9 additions & 1 deletion src/wiktextract/extractor/ko/page.py
@@ -10,7 +10,11 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .sound import (
SOUND_TEMPLATES,
extract_sound_section,
extract_sound_template,
)
from .translation import extract_translation_section


@@ -42,6 +46,10 @@ def parse_section(
)
elif title_text == "번역" and len(page_data) > 0:
extract_translation_section(wxr, page_data[-1], level_node)
elif title_text == "발음":
extract_sound_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/ko/pos.py
@@ -65,7 +65,7 @@ def extract_gloss_list_item(
sense = Sense()
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.startswith(":"):
if ":" in node.sarg:
for e_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, e_list_item, word_entry.lang_code
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/section_titles.py
@@ -11,6 +11,7 @@
"의미": {"pos": "unknown"},
"타동사": {"pos": "verb", "tags": ["transitive"]},
"종별사": {"pos": "counter"},
"감탄사": {"pos": "intj"},
}

LINKAGE_SECTIONS = {
40 changes: 38 additions & 2 deletions src/wiktextract/extractor/ko/sound.py
@@ -1,11 +1,18 @@
from wikitextprocessor import NodeKind, TemplateNode
from wikitextprocessor import LevelNode, NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry

SOUND_TEMPLATES = frozenset(["발음 듣기", "IPA", "ko-IPA"])
SOUND_TEMPLATES = frozenset(["발음 듣기", "IPA", "ko-IPA", "ja-pron"])


def extract_sound_section(
wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
extract_sound_template(wxr, word_entry, t_node)


def extract_sound_template(
@@ -17,6 +24,8 @@ def extract_sound_template(
extract_ipa_template(wxr, word_entry, node)
elif node.template_name == "ko-IPA":
extract_ko_ipa_template(wxr, word_entry, node)
elif node.template_name == "ja-pron":
extract_ja_pron_template(wxr, word_entry, node)


def extract_listen_pronunciation_template(
@@ -93,3 +102,30 @@ def extract_ko_ipa_template(

for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, word_entry, link_node)


def extract_ja_pron_template(
wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
# https://ko.wiktionary.org/wiki/틀:ja-pron
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
)
for ul_tag in expanded_node.find_html("ul"):
for li_tag in ul_tag.find_html("li"):
sound = Sound()
for span_tag in li_tag.find_html("span"):
span_class = span_tag.attrs.get("class", "")
if span_class == "usage-label-accent":
sound.raw_tags.append(
clean_node(wxr, None, span_tag).strip("()")
)
elif span_class == "Jpan":
sound.other = clean_node(wxr, None, span_tag)
elif span_class == "Latn":
sound.roman = clean_node(wxr, None, span_tag)
elif span_class == "IPA":
sound.ipa = clean_node(wxr, None, span_tag)
if sound.ipa != "" or sound.roman != "":
word_entry.sounds.append(sound)
clean_node(wxr, word_entry, expanded_node)
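
Again for reference only (not part of the diff), the span classes that `extract_ja_pron_template` reads from the expanded `틀:ja-pron` HTML and the `Sound` fields they populate; the dict name is illustrative:

```python
# Illustrative summary of the span-class handling in extract_ja_pron_template.
JA_PRON_SPAN_CLASS_TO_FIELD = {
    "usage-label-accent": "raw_tags",  # e.g. "도쿄", surrounding parentheses stripped
    "Jpan": "other",                   # kana rendering, e.g. "とーざい"
    "Latn": "roman",                   # e.g. "[tóꜜòzàì]"
    "IPA": "ipa",                      # e.g. "[to̞ːza̠i]"
}
# A Sound entry is appended only when its ipa or roman field is non-empty.
```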
81 changes: 81 additions & 0 deletions tests/test_ko_example.py
@@ -95,3 +95,84 @@ def test_ref_quote_template(self):
"ref": "3세기, 진수,《삼국지》, 〈권30 위서 오환선비동이전 (魏書 烏丸鮮卑東夷傳)〉",
},
)

def test_ja_ux_template(self):
self.wxr.wtp.add_page(
"틀:예문",
10,
"""<div style="font-size: 120%"><span lang="ja" class="Jpan">'''<ruby>東西<rp>(</rp><rt>とうざい</rt><rp>)</rp></ruby>'''に<ruby>走<rp>(</rp><rt>はし</rt><rp>)</rp></ruby>る<ruby>道<rp>(</rp><rt>どう</rt><rp>)</rp></ruby><ruby>路<rp>(</rp><rt>ろ</rt><rp>)</rp></ruby></span></div><dl><dd><i><span class="tr">'''tōzai''' ni hashiru dōro</span></i></dd><dd>'''동서'''로 달리는 도로</dd><dd>(literally, “lit”)</dd></dl>[[분류:일본어 용례가 포함된 낱말|東西]]""",
)
data = parse_page(
self.wxr,
"東西",
"""== 일본어 ==
=== 명사 ===
# [[동서]] ([[동쪽]]과 [[서쪽]])
#: {{예문|ja|'''東西'''に走る道%路|'''とうざい''' に はしる どう%ろ|'''동서'''로 달리는 도로}}""",
)
self.assertEqual(
data[0]["senses"][0]["examples"][0],
{
"text": "東西に走る道路",
"ruby": [
("東西", "とうざい"),
("走", "はし"),
("道", "どう"),
("路", "ろ"),
],
"roman": "tōzai ni hashiru dōro",
"translation": "동서로 달리는 도로",
},
)
self.assertEqual(
data[0]["senses"][0]["categories"], ["일본어 용례가 포함된 낱말"]
)

def test_ko_ux_template(self):
self.wxr.wtp.add_page(
"틀:예문",
10,
"""<div class="h-usage-example"><span class="None" lang="ko"><span style="font-size: 120%25">그녀는 '''없는''' 가정에서 자랐다.</span></span><dl><dd><span class="e-footer">매우 가난하게 살았다.</span></dd></dl></div>[[Category:한국어 용례가 포함된 낱말|없다]][[Category:한국어 용례가 포함된 낱말|없다|없다]]""",
)
data = parse_page(
self.wxr,
"없다",
"""== 한국어 ==
=== 형용사 ===
# 궁핍하다.
#:{{예문|ko|그녀는 '''없는''' 가정에서 자랐다.|footer= 매우 가난하게 살았다.}}""",
)
self.assertEqual(
data[0]["senses"][0]["examples"][0],
{
"text": "그녀는 없는 가정에서 자랐다.",
"note": "매우 가난하게 살았다.",
},
)
self.assertEqual(
data[0]["senses"][0]["categories"], ["한국어 용례가 포함된 낱말"]
)

def test_jibong_yuseol_template(self):
self.wxr.wtp.add_page(
"틀:지봉유설",
10,
"""'''1614년''', [[:w:이수광|이수광]], 《[[:s:지봉유설|지봉유설]]》, 〈[[:s:지봉유설/2권|2권 外國 條]]〉""",
)
data = parse_page(
self.wxr,
"없다",
"""== 중국어 ==
====명사====
# [[동서]].
#: {{지봉유설|2|2권 外國 條}}
#:: {{lang|zh|'''東西'''六十日程}} 동서로 60일이 걸리는 거리이다.""",
)
self.assertEqual(
data[0]["senses"][0]["examples"][0],
{
"text": "東西六十日程",
"translation": "동서로 60일이 걸리는 거리이다.",
"ref": "1614년, 이수광, 《지봉유설》, 〈2권 外國 條〉",
},
)
34 changes: 34 additions & 0 deletions tests/test_ko_sound.py
@@ -75,3 +75,37 @@ def test_ko_ipa_template(self):
self.assertEqual(
data[0]["categories"], ["한국어 IPA 발음이 포함된 낱말"]
)

def test_ja_pron(self):
self.wxr.wtp.add_page(
"틀:ja-pron",
10,
"""<ul><li><span class="usage-label-accent"><span class="ib-brac">(</span><span class="ib-content">[[w:도쿄 방언|도쿄]]</span><span class="ib-brac">)</span></span> <span lang="ja" class="Jpan"><span>と<span></span></span>ーざい</span></span> <span class="Latn"><samp>[tóꜜòzàì]</samp></span> ([[頭高型|두고형]] – [1])</li><li>[[w:국제 음성 기호|IPA]]<sup>([[부록:일본어 발음|표기]])</sup>:&#32;<span class="IPA">[to̞ːza̠i]</span>[[Category:일본어 IPA 발음이 포함된 낱말|とうざい]][[Category:일본어 중복되지 않는 수동 정렬 키를 포함하는 낱말|東西]]</li></ul>""",
)
data = parse_page(
self.wxr,
"東西",
"""== 일본어 ==
=== 발음 ===
* {{ja-pron|とうざい|acc=1|acc_ref=DJR,NHK}}
=== 명사 ===
# [[동서]] ([[동쪽]]과 [[서쪽]])""",
)
self.assertEqual(
data[0]["sounds"],
[
{
"roman": "[tóꜜòzàì]",
"other": "とーざい",
"raw_tags": ["도쿄"],
},
{"ipa": "[to̞ːza̠i]"},
],
)
self.assertEqual(
data[0]["categories"],
[
"일본어 IPA 발음이 포함된 낱말",
"일본어 중복되지 않는 수동 정렬 키를 포함하는 낱말",
],
)