Skip to content

Commit

Permalink
Non-bugfix about zero & wide definition conflicts (again!) (#110)
Browse files Browse the repository at this point in the history
In this update to update-tables.py, 04d6d90 I wrote,

> `verify-table-integrity.py` exercises a "bug" of duplicated tables that has no effect, because wcswidth() first checks for zero-width, and that is preferred in cases of conflict. This PR also resolves that error of duplication.

In that change I used the [set.discard()](https://docs.python.org/3/library/stdtypes.html#frozenset.discard) method in error: discard takes a single item as its argument, while I was passing a whole set, so the call had no effect. Instead, I now use [set.difference()](https://docs.python.org/3/library/stdtypes.html#frozenset.difference) and re-assign the result.

Also,
- the `category_codes` argument has been removed in update-tables.py, it is not used.
- `verify-table-integrity.py` has been improved to show both range values in conflict
- `verify-table-integrity.py` is now included as a unit test for a single version of python (3.12)
- new unit test about conflicting wide & zero values. This demonstrates that the update to table_wide.py has no effect, as these tests succeed both before and after the change to table_wide.py.
  • Loading branch information
jquast authored Jan 6, 2024
1 parent 0ba0278 commit 3af992a
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 83 deletions.
28 changes: 12 additions & 16 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,11 @@ class TableEntry:
properties: tuple[str, ...]
comment: str

def filter_by_category(self, category_codes: str, wide: int) -> bool:
def filter_by_category_width(self, wide: int) -> bool:
"""
Return whether entry matches given category code and displayed width.
Return whether entry matches displayed width.
Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
"""
if self.code_range is None:
return False
Expand Down Expand Up @@ -146,13 +146,12 @@ def filter_by_category(self, category_codes: str, wide: int) -> bool:
return wide == 1

@staticmethod
def parse_category_values(category_codes: str,
table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
def parse_width_category_values(table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
"""Parse value ranges of unicode data files, by given category and width."""
return {n
for entry in table_iter
if entry.filter_by_category(category_codes, wide)
if entry.filter_by_category_width(wide)
for n in list(range(entry.code_range[0], entry.code_range[1]))}


Expand Down Expand Up @@ -326,18 +325,16 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# parse typical 'wide' characters by categories 'W' and 'F',
table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
category_codes=('W', 'F'),
wide=2)

# subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
# but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Mn', 'Mc'),
wide=0).values)
table[version].values = table[version].values.difference(parse_category(
fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0).values)

# finally, join with atypical 'wide' characters defined by category 'Sk',
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Sk',),
wide=2).values)
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)

Expand All @@ -352,7 +349,6 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# Determine values of zero-width character lookup table by the following category codes
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
wide=0)

# And, include NULL
Expand Down Expand Up @@ -501,9 +497,9 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:


@functools.cache
def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
def parse_category(fname: str, wide: int) -> TableDef:
"""Parse value ranges of unicode data files, by given categories into string tables."""
print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
print(f'parsing {fname}, wide={wide}: ', end='', flush=True)

with open(fname, encoding='utf-8') as f:
table_iter = parse_unicode_table(f)
Expand All @@ -512,7 +508,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
version = next(table_iter).comment.strip()
# and "date string" from second line
date = next(table_iter).comment.split(':', 1)[1].strip()
values = TableEntry.parse_category_values(category_codes, table_iter, wide)
values = TableEntry.parse_width_category_values(table_iter, wide)
print('ok')
return TableDef(version, date, values)

Expand Down
48 changes: 38 additions & 10 deletions bin/verify-table-integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,30 @@
import logging


def bisearch_pair(ucs, table):
    """
    A copy of wcwidth._bisearch() but also returns the range of matched values.

    :param ucs: ordinal value to search for.
    :param table: sequence of ``(start, end)`` pairs; both bounds are treated
        as inclusive.  The binary search assumes the pairs are sorted in
        ascending order and do not overlap.
    :returns: ``(1, start, end)`` of the pair containing ``ucs``, otherwise
        ``(0, None, None)``.
    """
    # An empty table contains nothing; return early rather than raising
    # IndexError on the table[0] boundary check below.
    if not table:
        return (0, None, None)

    lbound = 0
    ubound = len(table) - 1

    # Fast rejection of values outside of the table's overall span.
    if ucs < table[0][0] or ucs > table[ubound][1]:
        return (0, None, None)
    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            lbound = mid + 1
        elif ucs < table[mid][0]:
            ubound = mid - 1
        else:
            # start <= ucs <= end: report the matching range to the caller.
            return (1, table[mid][0], table[mid][1])

    # ucs falls in a gap between two adjacent ranges.
    return (0, None, None)


def main(log: logging.Logger):
# local
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions

reversed_uni_versions = list(reversed(list_versions()))
tables = {'ZERO_WIDTH': ZERO_WIDTH,
'WIDE_EASTASIAN': WIDE_EASTASIAN}
Expand All @@ -81,14 +102,21 @@ def main(log: logging.Logger):
other_table = tables[other_table_name][version]
for start_range, stop_range in curr_table:
for unichar_n in range(start_range, stop_range):
if not _bisearch(unichar_n, next_table):
log.info(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
if _bisearch(unichar_n, other_table):
log.error(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
result, _, _ = bisearch_pair(unichar_n, next_table)
if not result:
log.info(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
)
result, lbound, ubound = bisearch_pair(unichar_n, other_table)
if result:
log.error(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
)
errors += 1
if errors:
log.error(f'{errors} errors, exit 1')
Expand Down
2 changes: 1 addition & 1 deletion docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ Other Languages
History
=======
0.2.12 *2023-11-21*
* re-release to remove .pyi file misplaced in wheel files `Issue #101`.
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.

0.2.11 *2023-11-20*
* Include tests files in the source distribution (`PR #98`_, `PR #100`_).
Expand Down
14 changes: 14 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,17 @@ def test_kannada_script_2():
# verify.
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase


def test_zero_wide_conflict():
    """Check widths at the edges of ranges considered both "wide" and "zero" width."""
    # Each pair is (codepoint, expected width), taken from the boundaries of
    # these table ranges:
    #
    # - (0x03000, 0x0303e,),  # Ideographic Space       ..Ideographic Variation In
    # + (0x03000, 0x03029,),  # Ideographic Space       ..Hangzhou Numeral Nine
    #
    # - (0x03099, 0x030ff,),  # Combining Katakana-hirag..Katakana Digraph Koto
    # + (0x0309b, 0x030ff,),  # Katakana-hiragana Voiced..Katakana Digraph Koto
    expected_widths = (
        (0x03029, 2),
        (0x0302a, 0),
        (0x03099, 0),
        (0x0309a, 0),
        (0x0309b, 2),
    )
    for codepoint, expected in expected_widths:
        assert wcwidth.wcwidth(unichr(codepoint), unicode_version='4.1.0') == expected
15 changes: 15 additions & 0 deletions tests/test_table_integrity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""
Executes verify-table-integrity.py as a unit test.
"""
import os
import sys
import subprocess

import pytest

@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
def test_verify_table_integrity():
    """Run bin/verify-table-integrity.py with the current interpreter; a non-zero exit fails the test."""
    tests_dir = os.path.dirname(__file__)
    # The script is located in the 'bin' folder, a sibling of this tests folder.
    script_path = os.path.join(tests_dir, os.path.pardir, 'bin', 'verify-table-integrity.py')
    # check_output() raises CalledProcessError when the script exits non-zero.
    subprocess.check_output([sys.executable, script_path])
Loading

0 comments on commit 3af992a

Please sign in to comment.