Skip to content

Commit

Permalink
Merge pull request #608 from xxyzz/zh
Browse files Browse the repository at this point in the history
Use `str` type `Example.text` field for zh edition's pydantic model
  • Loading branch information
xxyzz authored Apr 28, 2024
2 parents b874583 + 786716f commit 210104c
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 53 deletions.
12 changes: 5 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -320,20 +320,18 @@ cd wiktextract
python -m venv .venv
source .venv/bin/activate
python -m pip install -U pip
python -m pip install --use-pep517 .
python -m pip install -e .
```

If you are installing wiktextract from source, you also need to install
wikitextprocessor from source separately; otherwise, a newer wiktextract
version will be installed alongside an older pypi version of wikitextprocessor,
which will not work out.

Use the `pip install` command's `--force-reinstall` and `-e` options to
reinstall the wikitextprocessor package from source in editable
mode if you want to update both packages' code with `git pull`.

### Running tests

This package includes tests written using the `unittest` framework.
The test dependencies can be installed with command
`python -m pip install --use-pep517 -e ".[dev]"`.
`python -m pip install -e .[dev]`.

To run the tests, use the following command in the top-level directory:

Expand Down
119 changes: 77 additions & 42 deletions src/wiktextract/extractor/zh/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def extract_examples(
elif template_name in {"ja-x", "ja-usex"}:
extract_template_ja_usex(wxr, child, example_data)
elif template_name in {"zh-x", "zh-usex"}:
extract_template_zh_usex(wxr, child, example_data)
extract_template_zh_x(wxr, child, sense_data)
elif template_name in {"ux", "eg", "usex"}:
extract_template_ux(wxr, child, example_data)
elif template_name == "uxi":
Expand All @@ -62,9 +62,9 @@ def extract_examples(
else "",
)
else:
example_data.texts = [clean_node(wxr, None, child)]
example_data.text = clean_node(wxr, None, child)

if len(example_data.texts) > 0:
if len(example_data.text) > 0:
sense_data.examples.append(example_data)
else:
extract_examples(wxr, sense_data, node.children, page_data)
Expand All @@ -79,9 +79,9 @@ def extract_example_list(
and child_node.kind == NodeKind.LIST
):
example_data.ref = clean_node(wxr, None, node.children[:index])
example_data.texts = [
clean_node(wxr, None, child_node.children[0].children)
]
example_data.text = clean_node(
wxr, None, child_node.children[0].children
)


def extract_quote_templates(
Expand All @@ -95,15 +95,15 @@ def extract_quote_templates(
if line_num == 0:
key = "ref"
elif line_num == 1:
key = "texts"
key = "text"
elif line_num == 2 and "transliteration" in node.template_parameters:
key = "roman"
else:
key = "translation"

if expanded_line != "(請為本引文添加中文翻譯)":
if key == "texts":
example_data.texts.append(expanded_line)
if key == "text":
example_data.text = expanded_line
else:
setattr(example_data, key, expanded_line)

Expand All @@ -118,44 +118,79 @@ def extract_template_ja_usex(
expanded_text = clean_node(wxr, None, node_without_ruby)
for line_num, expanded_line in enumerate(expanded_text.splitlines()):
if line_num == 0:
key = "texts"
key = "text"
elif line_num == 1:
key = "roman"
else:
key = "translation"
if key == "texts":
example_data.texts.append(expanded_line)
if key == "text":
example_data.text = expanded_line
else:
setattr(example_data, key, expanded_line)
if len(ruby_data) > 0:
example_data.ruby = ruby_data


def extract_template_zh_usex(
wxr: WiktextractContext, node: WikiNode, example_data: Example
def extract_template_zh_x(
    wxr: WiktextractContext, template_node: TemplateNode, sense: Sense
) -> None:
    """Extract example sentences from a `zh-x` (or `zh-usex`) template.

    The template is expanded to wikitext and parsed. Two layouts are
    handled:

    * With a ``<dl>`` tag (quotation layout): each ``<dd>`` child holds
      either the reference (prefixed with "來自:"), the pinyin
      romanization (a span with ``class="Latn"``), or the translation.
      The example text itself comes from ``<span>`` tags with class
      "Hant"/"Hans"; the span that follows each of them carries a
      bracketed label (e.g. "[文言文,繁體]") which is split into
      ``raw_tags``. One `Example` is appended to ``sense.examples`` per
      script form.
    * Without ``<dl>`` (inline usex layout): the romanization is taken
      from the span with ``lang="Latn"``, and each span with
      ``lang="zh-Hant"``/``"zh-Hans"`` yields an `Example` tagged
      "Traditional Chinese" or "Simplified Chinese" respectively.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    for dl_tag in expanded_node.find_html("dl"):
        has_dl_tag = True
        ref = ""
        pinyin = ""
        translation = ""
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("來自:"):
                # "來自:" ("from:") prefix marks the citation source.
                ref = dd_text.removeprefix("來自:")
            else:
                # A <dd> containing a Latin-script span is the pinyin
                # line; any other <dd> is the translation.
                is_pinyin = False
                for span_tag in dd_tag.find_html(
                    "span", attr_name="class", attr_value="Latn"
                ):
                    pinyin = dd_text
                    is_pinyin = True
                if not is_pinyin:
                    translation = dd_text

        example_text = ""
        for span_tag in dl_tag.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_tag.attrs.get("class", "") in ["Hant", "Hans"]:
                # Traditional ("Hant") or simplified ("Hans") form of
                # the example sentence.
                example_text = span_text
            elif len(example_text) > 0:
                # The span right after the sentence holds bracketed
                # labels like "[文言文,繁體]"; split them into raw tags.
                raw_tag = span_text
                example_data = Example(
                    text=example_text,
                    roman=pinyin,
                    ref=ref,
                    translation=translation,
                    raw_tags=raw_tag.strip("[]").split(","),
                )
                sense.examples.append(example_data)

    if not has_dl_tag:
        # Inline layout without <dl>: pinyin span uses lang="Latn".
        pinyin = ""
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            pinyin = clean_node(wxr, None, span_tag)
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    example_data = Example(text=example_text, roman=pinyin)
                    example_data.tags.append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    sense.examples.append(example_data)


def extract_template_ux(
Expand All @@ -169,16 +204,16 @@ def extract_template_ux(
lines = expanded_text.splitlines()
for line_num, expanded_line in enumerate(lines):
if line_num == 0:
key = "texts"
key = "text"
elif line_num == 1:
if line_num == len(lines) - 1:
key = "translation"
else:
key = "roman"
else:
key = "translation"
if key == "texts":
example_data.texts.append(expanded_line)
if key == "text":
example_data.text = expanded_line
else:
setattr(example_data, key, expanded_line)

Expand All @@ -196,15 +231,15 @@ def extract_template_uxi_text(
parts = expanded_text.split(" ― ")
for index, part in enumerate(parts):
if index == 0:
key = "texts"
key = "text"
elif index == 1:
if index == len(parts) - 1:
key = "translation"
else:
key = "roman"
else:
key = "translation"
if key == "texts":
example_data.texts.append(part)
if key == "text":
example_data.text = part
else:
setattr(example_data, key, part)
6 changes: 4 additions & 2 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ class ChineseBaseModel(BaseModel):


class Example(ChineseBaseModel):
texts: list[str] = Field(
[],
text: str = Field(
"",
        description="Example usage sentences, some might have both "
"Simplified and Traditional Chinese forms",
)
Expand All @@ -29,6 +29,8 @@ class Example(ChineseBaseModel):
ruby: list[tuple[str, ...]] = Field(
[], description="Japanese Kanji and furigana"
)
tags: list[str] = []
raw_tags: list[str] = []


class AltForm(ChineseBaseModel):
Expand Down
60 changes: 58 additions & 2 deletions tests/test_zh_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_example_list(self) -> None:
sense_data.examples[0].model_dump(exclude_defaults=True),
{
"ref": "ref text",
"texts": ["example text"],
"text": "example text",
},
)

Expand All @@ -54,7 +54,63 @@ def test_quote_example(self, mock_clean_node) -> None:
sense_data.examples[0].model_dump(exclude_defaults=True),
{
"ref": "ref text",
"texts": ["quote text"],
"text": "quote text",
"translation": "translation text",
},
)

def test_zh_x(self):
    # A {{zh-x}} template that expands to a <dl class="zhusex"> block
    # (quotation layout) should produce one Example per script form
    # (traditional and simplified), with the reference taken from the
    # "來自:" <dd>, the pinyin from the Latn <dd>, the translation from
    # the remaining <dd>, and the bracketed labels ("文言文,繁體" /
    # "文言文,簡體") split into raw_tags.
    self.wxr.wtp.start_page("大家")
    self.wxr.wtp.add_page(
        "Template:zh-x",
        10,
        """<dl class="zhusex"><span lang="zh-Hant" class="Hant">-{<!-- -->[[王#漢語|王]][[曰#漢語|曰]]:「[[封#漢語|封]],[[以#漢語|以]][[厥#漢語|厥]][[庶民#漢語|庶民]][[暨#漢語|暨]][[厥#漢語|厥]][[臣#漢語|臣]][[達#漢語|達]]<b>大家</b>,[[以#漢語|以]][[厥#漢語|厥]][[臣#漢語|臣]][[達#漢語|達]][[王#漢語|王]][[惟#漢語|惟]][[邦君#漢語|邦君]]。」<!-- -->}-</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:文言文|文言文]],[[繁體中文|繁體]]&#93;</span><br><span lang="zh-Hans" class="Hans">-{<!-- -->[[王#漢語|王]][[曰#漢語|曰]]:“[[封#漢語|封]],[[以#漢語|以]][[厥#漢語|厥]][[庶民#漢語|庶民]][[暨#漢語|暨]][[厥#漢語|厥]][[臣#漢語|臣]][[达#漢語|达]]<b>大家</b>,[[以#漢語|以]][[厥#漢語|厥]][[臣#漢語|臣]][[达#漢語|达]][[王#漢語|王]][[惟#漢語|惟]][[邦君#漢語|邦君]]。”<!-- -->}-</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:文言文|文言文]],[[簡體中文|簡體]]&#93;</span><dd><small>來自:《[[s:尚書/梓材|尚書·梓材]]》</small></dd><dd><span lang="Latn" style="color:#404D52"><i>Wáng yuē: “Fēng, yǐ jué shùmín jì jué chén dá <b>dàjiā</b>, yǐ jué chén dá wáng wéi bāngjūn.”</i></span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:漢語拼音|漢語拼音]]&#93;</span></dd><dd>王說:「封啊,從殷的老百姓和他們的官員到'''卿大夫''',從他們的官員到諸侯和國君。」</dd></dl>[[Category:有引文的文言文詞]]""",
    )
    sense_data = Sense()
    root = self.wxr.wtp.parse(
        "#* {{zh-x|王 曰:「封,以 厥 庶民 暨 厥 臣 達 大家,以 厥 臣 達 王 惟 邦君。」|王說:「封啊,從殷的老百姓和他們的官員到'''卿大夫''',從他們的官員到諸侯和國君。」|CL|ref=《[[s:尚書/梓材|尚書·梓材]]》}}"
    )
    extract_examples(self.wxr, sense_data, root.children[0], [])
    self.assertEqual(
        [e.model_dump(exclude_defaults=True) for e in sense_data.examples],
        [
            {
                "ref": "《尚書·梓材》",
                "raw_tags": ["文言文", "繁體"],
                "text": "王曰:「封,以厥庶民暨厥臣達大家,以厥臣達王惟邦君。」",
                "translation": "王說:「封啊,從殷的老百姓和他們的官員到卿大夫,從他們的官員到諸侯和國君。」",
            },
            {
                "ref": "《尚書·梓材》",
                "raw_tags": ["文言文", "簡體"],
                "text": "王曰:“封,以厥庶民暨厥臣达大家,以厥臣达王惟邦君。”",
                "translation": "王說:「封啊,從殷的老百姓和他們的官員到卿大夫,從他們的官員到諸侯和國君。」",
            },
        ],
    )

def test_zh_x_no_ref(self):
    # A {{zh-x}} template without a <dl> wrapper (inline usex layout)
    # should fall back to the lang-attribute spans: the lang="Latn"
    # span supplies the shared romanization, and each zh-Hant/zh-Hans
    # span becomes its own Example tagged "Traditional Chinese" or
    # "Simplified Chinese".
    self.wxr.wtp.start_page("中文")
    self.wxr.wtp.add_page(
        "Template:zh-x",
        10,
        """<span lang="zh-Hant" class="Hant">-{<!-- --><b>中文</b>[[授課#漢語|授課]]<!-- -->}-</span> / <span lang="zh-Hans" class="Hans">-{<!-- --><b>中文</b>[[授课#漢語|授课]]<!-- -->}-</span>&nbsp; ―&nbsp; <span lang="Latn" style="color:#404D52"><i><b>zhōngwén</b> shòukè</i></span>&nbsp; ―&nbsp; [[Category:有使用例的官話詞]]""",
    )
    sense_data = Sense()
    root = self.wxr.wtp.parse("#* {{zh-x|中文 授課}}")
    extract_examples(self.wxr, sense_data, root.children[0], [])
    self.assertEqual(
        [e.model_dump(exclude_defaults=True) for e in sense_data.examples],
        [
            {
                "text": "中文授課",
                "tags": ["Traditional Chinese"],
                "roman": "zhōngwén shòukè",
            },
            {
                "text": "中文授课",
                "tags": ["Simplified Chinese"],
                "roman": "zhōngwén shòukè",
            },
        ],
    )

0 comments on commit 210104c

Please sign in to comment.