Merge branch 'master' into jq/spec-update

jquast · Jan 6, 2024 · 1ba02e2 · 1ba02e2
2 parents 3970392 + 8ed7f2c
commit 1ba02e2
Show file tree

Hide file tree

Showing 7 changed files with 259 additions and 103 deletions.
diff --git a/bin/update-tables.py b/bin/update-tables.py
@@ -54,6 +54,19 @@
 MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '6'))
 BACKOFF_FACTOR = float(os.environ.get('BACKOFF_FACTOR', '0.1'))
 
+# Hangul Jamo is a decomposed form of Hangul Syllables,
+# see https://www.unicode.org/faq/korean.html#3
+#     https://github.com/ridiculousfish/widecharwidth/pull/17
+#     https://github.com/jquast/ucs-detect/issues/9
+#     https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
+# "Conjoining Jamo are divided into three classes: L, V, T (Leading
+#  consonant, Vowel, Trailing consonant). A Hangul Syllable consists of
+#  <LV> or <LVT> sequences."
+HANGUL_JAMO_ZEROWIDTH = (
+    *range(0x1160, 0x1200),  # Hangul Jungseong Filler .. Hangul Jongseong Ssangnieun
+    *range(0xD7B0, 0xD800),  # Hangul Jungseong O-Yeo  .. Undefined Character of Hangul Jamo Extended-B
+)
+
 
 def _bisearch(ucs, table):
     """A copy of wcwwidth._bisearch, to prevent having issues when depending on code that imports
@@ -112,11 +125,11 @@ class TableEntry:
     properties: tuple[str, ...]
     comment: str
 
-    def filter_by_category(self, category_codes: str, wide: int) -> bool:
+    def filter_by_category_width(self, wide: int) -> bool:
         """
-        Return whether entry matches given category code and displayed width.
+        Return whether entry matches displayed width.
 
-        Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
+        Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
         """
         if self.code_range is None:
             return False
@@ -146,13 +159,12 @@ def filter_by_category(self, category_codes: str, wide: int) -> bool:
         return wide == 1
 
     @staticmethod
-    def parse_category_values(category_codes: str,
-                              table_iter: Iterator[TableEntry],
-                              wide: int) -> set[tuple[int, int]]:
+    def parse_width_category_values(table_iter: Iterator[TableEntry],
+                                    wide: int) -> set[tuple[int, int]]:
         """Parse value ranges of unicode data files, by given category and width."""
         return {n
                 for entry in table_iter
-                if entry.filter_by_category(category_codes, wide)
+                if entry.filter_by_category_width(wide)
                 for n in list(range(entry.code_range[0], entry.code_range[1]))}
 
 
@@ -326,18 +338,19 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
     for version in fetch_unicode_versions():
         # parse typical 'wide' characters by categories 'W' and 'F',
         table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
-                                        category_codes=('W', 'F'),
                                         wide=2)
 
         # subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
         # but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
-        table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                                     category_codes=('Mn', 'Mc'),
-                                                     wide=0).values)
+        table[version].values = table[version].values.difference(parse_category(
+            fname=UnicodeDataFile.DerivedGeneralCategory(version),
+            wide=0).values)
+
+        # Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
+        table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)
 
         # finally, join with atypical 'wide' characters defined by category 'Sk',
         table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                                    category_codes=('Sk',),
                                                     wide=2).values)
     return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
 
@@ -352,11 +365,13 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
     for version in fetch_unicode_versions():
         # Determine values of zero-width character lookup table by the following category codes
         table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                        category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
                                         wide=0)
 
-        # And, include NULL
+        # Include NULL
         table[version].values.add(0)
+
+        # Add Hangul Jamo Vowels and Hangul Trailing Consonants
+        table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
     return UnicodeTableRenderCtx('ZERO_WIDTH', table)
 
 
@@ -501,9 +516,9 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
 
 
 @functools.cache
-def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
+def parse_category(fname: str, wide: int) -> TableDef:
     """Parse value ranges of unicode data files, by given categories into string tables."""
-    print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
+    print(f'parsing {fname}, wide={wide}: ', end='', flush=True)
 
     with open(fname, encoding='utf-8') as f:
         table_iter = parse_unicode_table(f)
@@ -512,7 +527,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
         version = next(table_iter).comment.strip()
         # and "date string" from second line
         date = next(table_iter).comment.split(':', 1)[1].strip()
-        values = TableEntry.parse_category_values(category_codes, table_iter, wide)
+        values = TableEntry.parse_width_category_values(table_iter, wide)
     print('ok')
     return TableDef(version, date, values)
 

diff --git a/bin/verify-table-integrity.py b/bin/verify-table-integrity.py
@@ -63,9 +63,30 @@
 import logging
 
 
+def bisearch_pair(ucs, table):
+    """
+    A copy of wcwidth._bisearch() but also returns the range of matched values.
+    """
+    lbound = 0
+    ubound = len(table) - 1
+
+    if ucs < table[0][0] or ucs > table[ubound][1]:
+        return (0, None, None)
+    while ubound >= lbound:
+        mid = (lbound + ubound) // 2
+        if ucs > table[mid][1]:
+            lbound = mid + 1
+        elif ucs < table[mid][0]:
+            ubound = mid - 1
+        else:
+            return (1, table[mid][0], table[mid][1])
+
+    return (0, None, None)
+
+
 def main(log: logging.Logger):
-    # local
-    from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
+    from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions
+
     reversed_uni_versions = list(reversed(list_versions()))
     tables = {'ZERO_WIDTH': ZERO_WIDTH,
               'WIDE_EASTASIAN': WIDE_EASTASIAN}
@@ -81,14 +102,21 @@ def main(log: logging.Logger):
             other_table = tables[other_table_name][version]
             for start_range, stop_range in curr_table:
                 for unichar_n in range(start_range, stop_range):
-                    if not _bisearch(unichar_n, next_table):
-                        log.info(f'value {hex(unichar_n)} in table_name={table_name}'
-                                 f' version={version} is not defined in next_version={next_version}'
-                                 f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
-                    if _bisearch(unichar_n, other_table):
-                        log.error(f'value {hex(unichar_n)} in table_name={table_name}'
-                                  f' version={version} is duplicated in other_table_name={other_table_name}'
-                                  f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
+                    result, _, _ = bisearch_pair(unichar_n, next_table)
+                    if not result:
+                        log.info(
+                            f'value 0x{unichar_n:05x} in table_name={table_name}'
+                            f' version={version} is not defined in next_version={next_version}'
+                            f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
+                        )
+                    result, lbound, ubound = bisearch_pair(unichar_n, other_table)
+                    if result:
+                        log.error(
+                            f'value 0x{unichar_n:05x} in table_name={table_name}'
+                            f' version={version} is duplicated in other_table_name={other_table_name}'
+                            f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
+                            f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
+                        )
                         errors += 1
     if errors:
         log.error(f'{errors} errors, exit 1')

diff --git a/docs/intro.rst b/docs/intro.rst
@@ -216,8 +216,11 @@ Other Languages
 =======
 History
 =======
+Unreleased
+  * **Bugfix** zero-width support for Hangul Jamo (Korean)
+
 0.2.12 *2023-11-21*
-  * re-release to remove .pyi file misplaced in wheel files `Issue #101`.
+  * re-release to remove .pyi file misplaced in wheel files `Issue #101`_.
 
 0.2.11 *2023-11-20*
   * Include test files in the source distribution (`PR #98`_, `PR #100`_).

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -222,17 +222,48 @@ def test_balinese_script():
     assert length_phrase == expect_length_phrase
 
 
+def test_kr_jamo():
+    """
+    Test basic combining of HANGUL CHOSEONG and JUNGSEONG
+
+    Example from Raymond Chen's blog post,
+    https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
+    """
+    # This is an example where both characters are "wide" when displayed alone.
+    #
+    # But JUNGSEONG (vowel) is designed for combination with a CHOSEONG (consonant).
+    #
+    # This wcwidth library understands their width only when in combination,
+    # and not by independent display, like other zero-width characters that may
+    # only combine with an appropriate preceding character.
+    phrase = (
+        u"\u1100"  # ᄀ HANGUL CHOSEONG KIYEOK (consonant)
+        u"\u1161"  # ᅡ HANGUL JUNGSEONG A (vowel)
+    )
+    expect_length_each = (2, 0)
+    expect_length_phrase = 2
+
+    # exercise,
+    length_each = tuple(map(wcwidth.wcwidth, phrase))
+    length_phrase = wcwidth.wcswidth(phrase)
+
+    # verify.
+    assert length_each == expect_length_each
+    assert length_phrase == expect_length_phrase
+
+
 def test_kr_jamo_filler():
     u"""
     Jamo filler is 0 width.
 
-    According to https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf this character and others
-    like it, ``\uffa0``, ``\u1160``, ``\u115f``, ``\u1160``, are not commonly viewed with a terminal,
-    seems it doesn't matter whether it is implemented or not, they are not typically used !
+    Example from https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf
     """
-    phrase = u"\u1100\u1160"
-    expect_length_each = (2, 1)
-    expect_length_phrase = 3
+    phrase = (
+        u"\u1100"  # HANGUL CHOSEONG KIYEOK (consonant)
+        u"\u1160"  # HANGUL JUNGSEONG FILLER (vowel)
+    )
+    expect_length_each = (2, 0)
+    expect_length_phrase = 2
 
     # exercise,
     length_each = tuple(map(wcwidth.wcwidth, phrase))
@@ -355,3 +386,17 @@ def test_kannada_script_2():
     # verify.
     assert length_each == expect_length_each
     assert length_phrase == expect_length_phrase
+
+
+def test_zero_wide_conflict():
+    # Test characters considered both "wide" and "zero" width
+    # -  (0x03000, 0x0303e,),  # Ideographic Space       ..Ideographic Variation In
+    # +  (0x03000, 0x03029,),  # Ideographic Space       ..Hangzhou Numeral Nine
+    assert wcwidth.wcwidth(unichr(0x03029), unicode_version='4.1.0') == 2
+    assert wcwidth.wcwidth(unichr(0x0302a), unicode_version='4.1.0') == 0
+
+    # - (0x03099, 0x030ff,),  # Combining Katakana-hirag..Katakana Digraph Koto
+    # + (0x0309b, 0x030ff,),  # Katakana-hiragana Voiced..Katakana Digraph Koto
+    assert wcwidth.wcwidth(unichr(0x03099), unicode_version='4.1.0') == 0
+    assert wcwidth.wcwidth(unichr(0x0309a), unicode_version='4.1.0') == 0
+    assert wcwidth.wcwidth(unichr(0x0309b), unicode_version='4.1.0') == 2
diff --git a/tests/test_table_integrity.py b/tests/test_table_integrity.py
@@ -0,0 +1,15 @@
+"""
+Executes verify-table-integrity.py as a unit test.
+"""
+import os
+import sys
+import subprocess
+
+import pytest
+
+@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
+def test_verify_table_integrity():
+    subprocess.check_output([sys.executable, os.path.join(os.path.dirname(__file__),
+                                                          os.path.pardir,
+                                                          'bin',
+                                                          'verify-table-integrity.py')])