Non-bugfix about zero & wide definition conflicts (again!) (#110)

In this update to update-tables.py, 04d6d90, I wrote:

> `verify-table-integrity.py` exercises a "bug" of duplicated tables that has no effect, because wcswidth() first checks for zero-width, and that is preferred in cases of conflict. This PR also resolves that error of duplication.

In that change I used the method [set.discard()](https://docs.python.org/3/library/stdtypes.html#frozenset.discard) in error: the discard method takes a single item as an argument, while I was providing a whole set, and so it had no effect. Instead, I now use [set.difference()](https://docs.python.org/3/library/stdtypes.html#frozenset.difference) to re-assign the value.

Also,

- the `category_codes` argument has been removed in update-tables.py, as it is not used.
- `verify-table-integrity.py` has been improved to show both range values in conflict.
- `verify-table-integrity.py` is now included as a unit test for a single version of python (3.12).
- a new unit test about conflicting wide & zero values has been added. This demonstrates that the update to table_wide.py has no effect, as these tests succeed both before and after the change to table_wide.py.
jquast · Jan 6, 2024 · 3af992a · 3af992a
1 parent 0ba0278
commit 3af992a
Show file tree

Hide file tree

Showing 6 changed files with 160 additions and 83 deletions.
diff --git a/bin/update-tables.py b/bin/update-tables.py
@@ -112,11 +112,11 @@ class TableEntry:
     properties: tuple[str, ...]
     comment: str
 
-    def filter_by_category(self, category_codes: str, wide: int) -> bool:
+    def filter_by_category_width(self, wide: int) -> bool:
         """
-        Return whether entry matches given category code and displayed width.
+        Return whether entry matches displayed width.
 
-        Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
+        Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
         """
         if self.code_range is None:
             return False
@@ -146,13 +146,12 @@ def filter_by_category(self, category_codes: str, wide: int) -> bool:
         return wide == 1
 
     @staticmethod
-    def parse_category_values(category_codes: str,
-                              table_iter: Iterator[TableEntry],
-                              wide: int) -> set[tuple[int, int]]:
+    def parse_width_category_values(table_iter: Iterator[TableEntry],
+                                    wide: int) -> set[tuple[int, int]]:
         """Parse value ranges of unicode data files, by given category and width."""
         return {n
                 for entry in table_iter
-                if entry.filter_by_category(category_codes, wide)
+                if entry.filter_by_category_width(wide)
                 for n in list(range(entry.code_range[0], entry.code_range[1]))}
 
 
@@ -326,18 +325,16 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
     for version in fetch_unicode_versions():
         # parse typical 'wide' characters by categories 'W' and 'F',
         table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
-                                        category_codes=('W', 'F'),
                                         wide=2)
 
         # subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
         # but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
-        table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                                     category_codes=('Mn', 'Mc'),
-                                                     wide=0).values)
+        table[version].values = table[version].values.difference(parse_category(
+            fname=UnicodeDataFile.DerivedGeneralCategory(version),
+            wide=0).values)
 
         # finally, join with atypical 'wide' characters defined by category 'Sk',
         table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                                    category_codes=('Sk',),
                                                     wide=2).values)
     return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
 
@@ -352,7 +349,6 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
     for version in fetch_unicode_versions():
         # Determine values of zero-width character lookup table by the following category codes
         table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
-                                        category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
                                         wide=0)
 
         # And, include NULL
@@ -501,9 +497,9 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
 
 
 @functools.cache
-def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
+def parse_category(fname: str, wide: int) -> TableDef:
     """Parse value ranges of unicode data files, by given categories into string tables."""
-    print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
+    print(f'parsing {fname}, wide={wide}: ', end='', flush=True)
 
     with open(fname, encoding='utf-8') as f:
         table_iter = parse_unicode_table(f)
@@ -512,7 +508,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
         version = next(table_iter).comment.strip()
         # and "date string" from second line
         date = next(table_iter).comment.split(':', 1)[1].strip()
-        values = TableEntry.parse_category_values(category_codes, table_iter, wide)
+        values = TableEntry.parse_width_category_values(table_iter, wide)
     print('ok')
     return TableDef(version, date, values)
 

diff --git a/bin/verify-table-integrity.py b/bin/verify-table-integrity.py
@@ -63,9 +63,30 @@
 import logging
 
 
+def bisearch_pair(ucs, table):
+    """
+    A copy of wcwidth._bisearch() but also returns the range of matched values.
+    """
+    lbound = 0
+    ubound = len(table) - 1
+
+    if ucs < table[0][0] or ucs > table[ubound][1]:
+        return (0, None, None)
+    while ubound >= lbound:
+        mid = (lbound + ubound) // 2
+        if ucs > table[mid][1]:
+            lbound = mid + 1
+        elif ucs < table[mid][0]:
+            ubound = mid - 1
+        else:
+            return (1, table[mid][0], table[mid][1])
+
+    return (0, None, None)
+
+
 def main(log: logging.Logger):
-    # local
-    from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
+    from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions
+
     reversed_uni_versions = list(reversed(list_versions()))
     tables = {'ZERO_WIDTH': ZERO_WIDTH,
               'WIDE_EASTASIAN': WIDE_EASTASIAN}
@@ -81,14 +102,21 @@ def main(log: logging.Logger):
             other_table = tables[other_table_name][version]
             for start_range, stop_range in curr_table:
                 for unichar_n in range(start_range, stop_range):
-                    if not _bisearch(unichar_n, next_table):
-                        log.info(f'value {hex(unichar_n)} in table_name={table_name}'
-                                 f' version={version} is not defined in next_version={next_version}'
-                                 f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
-                    if _bisearch(unichar_n, other_table):
-                        log.error(f'value {hex(unichar_n)} in table_name={table_name}'
-                                  f' version={version} is duplicated in other_table_name={other_table_name}'
-                                  f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
+                    result, _, _ = bisearch_pair(unichar_n, next_table)
+                    if not result:
+                        log.info(
+                            f'value 0x{unichar_n:05x} in table_name={table_name}'
+                            f' version={version} is not defined in next_version={next_version}'
+                            f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
+                        )
+                    result, lbound, ubound = bisearch_pair(unichar_n, other_table)
+                    if result:
+                        log.error(
+                            f'value 0x{unichar_n:05x} in table_name={table_name}'
+                            f' version={version} is duplicated in other_table_name={other_table_name}'
+                            f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
+                            f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
+                        )
                         errors += 1
     if errors:
         log.error(f'{errors} errors, exit 1')

diff --git a/docs/intro.rst b/docs/intro.rst
@@ -217,7 +217,7 @@ Other Languages
 History
 =======
 0.2.12 *2023-11-21*
-  * re-release to remove .pyi file misplaced in wheel files `Issue #101`.
+  * re-release to remove .pyi file misplaced in wheel files `Issue #101`_.
 
 0.2.11 *2023-11-20*
   * Include tests files in the source distribution (`PR #98`_, `PR #100`_).

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -355,3 +355,17 @@ def test_kannada_script_2():
     # verify.
     assert length_each == expect_length_each
     assert length_phrase == expect_length_phrase
+
+
+def test_zero_wide_conflict():
+    # Test characters considered both "wide" and "zero" width
+    # -  (0x03000, 0x0303e,),  # Ideographic Space       ..Ideographic Variation In
+    # +  (0x03000, 0x03029,),  # Ideographic Space       ..Hangzhou Numeral Nine
+    assert wcwidth.wcwidth(unichr(0x03029), unicode_version='4.1.0') == 2
+    assert wcwidth.wcwidth(unichr(0x0302a), unicode_version='4.1.0') == 0
+
+    # - (0x03099, 0x030ff,),  # Combining Katakana-hirag..Katakana Digraph Koto
+    # + (0x0309b, 0x030ff,),  # Katakana-hiragana Voiced..Katakana Digraph Koto
+    assert wcwidth.wcwidth(unichr(0x03099), unicode_version='4.1.0') == 0
+    assert wcwidth.wcwidth(unichr(0x0309a), unicode_version='4.1.0') == 0
+    assert wcwidth.wcwidth(unichr(0x0309b), unicode_version='4.1.0') == 2
diff --git a/tests/test_table_integrity.py b/tests/test_table_integrity.py
@@ -0,0 +1,15 @@
+"""
+Executes verify-table-integrity.py as a unit test.
+"""
+import os
+import sys
+import subprocess
+
+import pytest
+
+@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
+def test_verify_table_integrity():
+    subprocess.check_output([sys.executable, os.path.join(os.path.dirname(__file__),
+                                                          os.path.pardir,
+                                                          'bin',
+                                                          'verify-table-integrity.py')])