Skip to content

Commit

Permalink
Merge pull request #608 from xxyzz/zh
Browse files Browse the repository at this point in the history
Use `str` type `Example.text` field for zh edition's pydantic model
  • Loading branch information
xxyzz authored Apr 28, 2024
2 parents b874583 + 786716f commit 210104c
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 53 deletions.
12 changes: 5 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -320,20 +320,18 @@ cd wiktextract
python -m venv .venv
source .venv/bin/activate
python -m pip install -U pip
python -m pip install --use-pep517 .
python -m pip install -e .
```

If you are installing wiktextract from source, you also need to install
wikitextprocessor from source separately; otherwise, a newer wiktextract
version will be installed alongside an older pypi version of wikitextprocessor,
which will not work out.

Use the `pip install` command's `--force-reinstall` and `-e` options to
reinstall the wikitextprocessor package from source in editable
mode if you want to update both packages' code with `git pull`.

### Running tests

This package includes tests written using the `unittest` framework.
The test dependencies can be installed with command
`python -m pip install --use-pep517 -e ".[dev]"`.
`python -m pip install -e .[dev]`.

To run the tests, use the following command in the top-level directory:

Expand Down
119 changes: 77 additions & 42 deletions src/wiktextract/extractor/zh/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def extract_examples(
elif template_name in {"ja-x", "ja-usex"}:
extract_template_ja_usex(wxr, child, example_data)
elif template_name in {"zh-x", "zh-usex"}:
extract_template_zh_usex(wxr, child, example_data)
extract_template_zh_x(wxr, child, sense_data)
elif template_name in {"ux", "eg", "usex"}:
extract_template_ux(wxr, child, example_data)
elif template_name == "uxi":
Expand All @@ -62,9 +62,9 @@ def extract_examples(
else "",
)
else:
example_data.texts = [clean_node(wxr, None, child)]
example_data.text = clean_node(wxr, None, child)

if len(example_data.texts) > 0:
if len(example_data.text) > 0:
sense_data.examples.append(example_data)
else:
extract_examples(wxr, sense_data, node.children, page_data)
Expand All @@ -79,9 +79,9 @@ def extract_example_list(
and child_node.kind == NodeKind.LIST
):
example_data.ref = clean_node(wxr, None, node.children[:index])
example_data.texts = [
clean_node(wxr, None, child_node.children[0].children)
]
example_data.text = clean_node(
wxr, None, child_node.children[0].children
)


def extract_quote_templates(
Expand All @@ -95,15 +95,15 @@ def extract_quote_templates(
if line_num == 0:
key = "ref"
elif line_num == 1:
key = "texts"
key = "text"
elif line_num == 2 and "transliteration" in node.template_parameters:
key = "roman"
else:
key = "translation"

if expanded_line != "(請為本引文添加中文翻譯)":
if key == "texts":
example_data.texts.append(expanded_line)
if key == "text":
example_data.text = expanded_line
else:
setattr(example_data, key, expanded_line)

Expand All @@ -118,44 +118,79 @@ def extract_template_ja_usex(
expanded_text = clean_node(wxr, None, node_without_ruby)
for line_num, expanded_line in enumerate(expanded_text.splitlines()):
if line_num == 0:
key = "texts"
key = "text"
elif line_num == 1:
key = "roman"
else:
key = "translation"
if key == "texts":
example_data.texts.append(expanded_line)
if key == "text":
example_data.text = expanded_line
else:
setattr(example_data, key, expanded_line)
if len(ruby_data) > 0:
example_data.ruby = ruby_data


def extract_template_zh_usex(
wxr: WiktextractContext, node: WikiNode, example_data: Example
def extract_template_zh_x(
    wxr: WiktextractContext, template_node: TemplateNode, sense: Sense
) -> None:
    """Extract example sentences from a `zh-x` (or `zh-usex`) template.

    The template is expanded to wikitext and parsed. Two layouts are
    handled:

    * With a ``<dl>`` tag (quotation layout): each ``<dd>`` child holds
      either the reference (prefixed with "來自:"), the pinyin
      romanization (a span with ``class="Latn"``), or the translation.
      The example text itself comes from ``<span>`` tags with class
      "Hant"/"Hans"; the span that follows each of them carries a
      bracketed label (e.g. "[文言文,繁體]") which is split into
      ``raw_tags``. One `Example` is appended to ``sense.examples`` per
      script form.
    * Without ``<dl>`` (inline usex layout): the romanization is taken
      from the span with ``lang="Latn"``, and each span with
      ``lang="zh-Hant"``/``"zh-Hans"`` yields an `Example` tagged
      "Traditional Chinese" or "Simplified Chinese" respectively.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    for dl_tag in expanded_node.find_html("dl"):
        has_dl_tag = True
        ref = ""
        pinyin = ""
        translation = ""
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("來自:"):
                # "來自:" ("from:") prefix marks the citation source.
                ref = dd_text.removeprefix("來自:")
            else:
                # A <dd> containing a Latin-script span is the pinyin
                # line; any other <dd> is the translation.
                is_pinyin = False
                for span_tag in dd_tag.find_html(
                    "span", attr_name="class", attr_value="Latn"
                ):
                    pinyin = dd_text
                    is_pinyin = True
                if not is_pinyin:
                    translation = dd_text

        example_text = ""
        for span_tag in dl_tag.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_tag.attrs.get("class", "") in ["Hant", "Hans"]:
                # Traditional ("Hant") or simplified ("Hans") form of
                # the example sentence.
                example_text = span_text
            elif len(example_text) > 0:
                # The span right after the sentence holds bracketed
                # labels like "[文言文,繁體]"; split them into raw tags.
                raw_tag = span_text
                example_data = Example(
                    text=example_text,
                    roman=pinyin,
                    ref=ref,
                    translation=translation,
                    raw_tags=raw_tag.strip("[]").split(","),
                )
                sense.examples.append(example_data)

    if not has_dl_tag:
        # Inline layout without <dl>: pinyin span uses lang="Latn".
        pinyin = ""
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            pinyin = clean_node(wxr, None, span_tag)
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    example_data = Example(text=example_text, roman=pinyin)
                    example_data.tags.append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    sense.examples.append(example_data)


def extract_template_ux(
Expand All @@ -169,16 +204,16 @@ def extract_template_ux(
lines = expanded_text.splitlines()
for line_num, expanded_line in enumerate(lines):
if line_num == 0:
key = "texts"
key = "text"
elif line_num == 1:
if line_num == len(lines) - 1:
key = "translation"
else:
key = "roman"
else:
key = "translation"
if key == "texts":
example_data.texts.append(expanded_line)
if key == "text":
example_data.text = expanded_line
else:
setattr(example_data, key, expanded_line)

Expand All @@ -196,15 +231,15 @@ def extract_template_uxi_text(
parts = expanded_text.split(" ― ")
for index, part in enumerate(parts):
if index == 0:
key = "texts"
key = "text"
elif index == 1:
if index == len(parts) - 1:
key = "translation"
else:
key = "roman"
else:
key = "translation"
if key == "texts":
example_data.texts.append(part)
if key == "text":
example_data.text = part
else:
setattr(example_data, key, part)
6 changes: 4 additions & 2 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ class ChineseBaseModel(BaseModel):


class Example(ChineseBaseModel):
texts: list[str] = Field(
[],
text: str = Field(
"",
        description="Example usage sentences, some might have both "
"Simplified and Traditional Chinese forms",
)
Expand All @@ -29,6 +29,8 @@ class Example(ChineseBaseModel):
ruby: list[tuple[str, ...]] = Field(
[], description="Japanese Kanji and furigana"
)
tags: list[str] = []
raw_tags: list[str] = []


class AltForm(ChineseBaseModel):
Expand Down
60 changes: 58 additions & 2 deletions tests/test_zh_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_example_list(self) -> None:
sense_data.examples[0].model_dump(exclude_defaults=True),
{
"ref": "ref text",
"texts": ["example text"],
"text": "example text",
},
)

Expand All @@ -54,7 +54,63 @@ def test_quote_example(self, mock_clean_node) -> None:
sense_data.examples[0].model_dump(exclude_defaults=True),
{
"ref": "ref text",
"texts": ["quote text"],
"text": "quote text",
"translation": "translation text",
},
)

def test_zh_x(self):
    # A {{zh-x}} template that expands to a <dl class="zhusex"> block
    # (quotation layout) should produce one Example per script form
    # (traditional and simplified), with the reference taken from the
    # "來自:" <dd>, the pinyin from the Latn <dd>, the translation from
    # the remaining <dd>, and the bracketed labels ("文言文,繁體" /
    # "文言文,簡體") split into raw_tags.
    self.wxr.wtp.start_page("大家")
    self.wxr.wtp.add_page(
        "Template:zh-x",
        10,
        """<dl class="zhusex"><span lang="zh-Hant" class="Hant">-{<!-- -->[[王#漢語|王]][[曰#漢語|曰]]:「[[封#漢語|封]],[[以#漢語|以]][[厥#漢語|厥]][[庶民#漢語|庶民]][[暨#漢語|暨]][[厥#漢語|厥]][[臣#漢語|臣]][[達#漢語|達]]<b>大家</b>,[[以#漢語|以]][[厥#漢語|厥]][[臣#漢語|臣]][[達#漢語|達]][[王#漢語|王]][[惟#漢語|惟]][[邦君#漢語|邦君]]。」<!-- -->}-</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:文言文|文言文]],[[繁體中文|繁體]]&#93;</span><br><span lang="zh-Hans" class="Hans">-{<!-- -->[[王#漢語|王]][[曰#漢語|曰]]:“[[封#漢語|封]],[[以#漢語|以]][[厥#漢語|厥]][[庶民#漢語|庶民]][[暨#漢語|暨]][[厥#漢語|厥]][[臣#漢語|臣]][[达#漢語|达]]<b>大家</b>,[[以#漢語|以]][[厥#漢語|厥]][[臣#漢語|臣]][[达#漢語|达]][[王#漢語|王]][[惟#漢語|惟]][[邦君#漢語|邦君]]。”<!-- -->}-</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:文言文|文言文]],[[簡體中文|簡體]]&#93;</span><dd><small>來自:《[[s:尚書/梓材|尚書·梓材]]》</small></dd><dd><span lang="Latn" style="color:#404D52"><i>Wáng yuē: “Fēng, yǐ jué shùmín jì jué chén dá <b>dàjiā</b>, yǐ jué chén dá wáng wéi bāngjūn.”</i></span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:漢語拼音|漢語拼音]]&#93;</span></dd><dd>王說:「封啊,從殷的老百姓和他們的官員到'''卿大夫''',從他們的官員到諸侯和國君。」</dd></dl>[[Category:有引文的文言文詞]]""",
    )
    sense_data = Sense()
    root = self.wxr.wtp.parse(
        "#* {{zh-x|王 曰:「封,以 厥 庶民 暨 厥 臣 達 大家,以 厥 臣 達 王 惟 邦君。」|王說:「封啊,從殷的老百姓和他們的官員到'''卿大夫''',從他們的官員到諸侯和國君。」|CL|ref=《[[s:尚書/梓材|尚書·梓材]]》}}"
    )
    extract_examples(self.wxr, sense_data, root.children[0], [])
    self.assertEqual(
        [e.model_dump(exclude_defaults=True) for e in sense_data.examples],
        [
            {
                "ref": "《尚書·梓材》",
                "raw_tags": ["文言文", "繁體"],
                "text": "王曰:「封,以厥庶民暨厥臣達大家,以厥臣達王惟邦君。」",
                "translation": "王說:「封啊,從殷的老百姓和他們的官員到卿大夫,從他們的官員到諸侯和國君。」",
            },
            {
                "ref": "《尚書·梓材》",
                "raw_tags": ["文言文", "簡體"],
                "text": "王曰:“封,以厥庶民暨厥臣达大家,以厥臣达王惟邦君。”",
                "translation": "王說:「封啊,從殷的老百姓和他們的官員到卿大夫,從他們的官員到諸侯和國君。」",
            },
        ],
    )

def test_zh_x_no_ref(self):
    # A {{zh-x}} template without a <dl> wrapper (inline usex layout)
    # should fall back to the lang-attribute spans: the lang="Latn"
    # span supplies the shared romanization, and each zh-Hant/zh-Hans
    # span becomes its own Example tagged "Traditional Chinese" or
    # "Simplified Chinese".
    self.wxr.wtp.start_page("中文")
    self.wxr.wtp.add_page(
        "Template:zh-x",
        10,
        """<span lang="zh-Hant" class="Hant">-{<!-- --><b>中文</b>[[授課#漢語|授課]]<!-- -->}-</span> / <span lang="zh-Hans" class="Hans">-{<!-- --><b>中文</b>[[授课#漢語|授课]]<!-- -->}-</span>&nbsp; ―&nbsp; <span lang="Latn" style="color:#404D52"><i><b>zhōngwén</b> shòukè</i></span>&nbsp; ―&nbsp; [[Category:有使用例的官話詞]]""",
    )
    sense_data = Sense()
    root = self.wxr.wtp.parse("#* {{zh-x|中文 授課}}")
    extract_examples(self.wxr, sense_data, root.children[0], [])
    self.assertEqual(
        [e.model_dump(exclude_defaults=True) for e in sense_data.examples],
        [
            {
                "text": "中文授課",
                "tags": ["Traditional Chinese"],
                "roman": "zhōngwén shòukè",
            },
            {
                "text": "中文授课",
                "tags": ["Simplified Chinese"],
                "roman": "zhōngwén shòukè",
            },
        ],
    )

0 comments on commit 210104c

Please sign in to comment.