Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Non-bugfix about zero & wide definition conflicts (again!) #110

Merged
merged 5 commits into from
Jan 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,11 @@ class TableEntry:
properties: tuple[str, ...]
comment: str

def filter_by_category(self, category_codes: str, wide: int) -> bool:
def filter_by_category_width(self, wide: int) -> bool:
"""
Return whether entry matches given category code and displayed width.
Return whether entry matches displayed width.

Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
"""
if self.code_range is None:
return False
Expand Down Expand Up @@ -146,13 +146,12 @@ def filter_by_category(self, category_codes: str, wide: int) -> bool:
return wide == 1

@staticmethod
def parse_category_values(category_codes: str,
table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
def parse_width_category_values(table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
"""Parse value ranges of unicode data files, by given category and width."""
return {n
for entry in table_iter
if entry.filter_by_category(category_codes, wide)
if entry.filter_by_category_width(wide)
for n in list(range(entry.code_range[0], entry.code_range[1]))}


Expand Down Expand Up @@ -326,18 +325,16 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# parse typical 'wide' characters by categories 'W' and 'F',
table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
category_codes=('W', 'F'),
wide=2)

# subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
# but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Mn', 'Mc'),
wide=0).values)
table[version].values = table[version].values.difference(parse_category(
fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0).values)

# finally, join with atypical 'wide' characters defined by category 'Sk',
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Sk',),
wide=2).values)
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)

Expand All @@ -352,7 +349,6 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# Determine values of zero-width character lookup table by the following category codes
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
wide=0)

# And, include NULL
Expand Down Expand Up @@ -501,9 +497,9 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:


@functools.cache
def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
def parse_category(fname: str, wide: int) -> TableDef:
"""Parse value ranges of unicode data files, by given categories into string tables."""
print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
print(f'parsing {fname}, wide={wide}: ', end='', flush=True)

with open(fname, encoding='utf-8') as f:
table_iter = parse_unicode_table(f)
Expand All @@ -512,7 +508,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
version = next(table_iter).comment.strip()
# and "date string" from second line
date = next(table_iter).comment.split(':', 1)[1].strip()
values = TableEntry.parse_category_values(category_codes, table_iter, wide)
values = TableEntry.parse_width_category_values(table_iter, wide)
print('ok')
return TableDef(version, date, values)

Expand Down
48 changes: 38 additions & 10 deletions bin/verify-table-integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,30 @@
import logging


def bisearch_pair(ucs, table):
    """
    A copy of wcwidth._bisearch() but also returns the range of matched values.

    :param ucs: integer ordinal value to look up.
    :param table: sequence of ``(start, end)`` pairs, compared as inclusive
        bounds; assumed to be sorted ascending, as binary search requires.
    :returns: ``(1, start, end)`` of the matching pair when ``ucs`` falls
        within one, otherwise ``(0, None, None)``.
    """
    # An empty table cannot match anything; guard before indexing table[0],
    # which would otherwise raise IndexError.
    if not table:
        return (0, None, None)

    lbound = 0
    ubound = len(table) - 1

    # Fast rejection of values outside the overall span of the table.
    if ucs < table[0][0] or ucs > table[ubound][1]:
        return (0, None, None)
    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            lbound = mid + 1
        elif ucs < table[mid][0]:
            ubound = mid - 1
        else:
            return (1, table[mid][0], table[mid][1])

    # Value lies in a gap between two adjacent ranges.
    return (0, None, None)


def main(log: logging.Logger):
# local
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions

reversed_uni_versions = list(reversed(list_versions()))
tables = {'ZERO_WIDTH': ZERO_WIDTH,
'WIDE_EASTASIAN': WIDE_EASTASIAN}
Expand All @@ -81,14 +102,21 @@ def main(log: logging.Logger):
other_table = tables[other_table_name][version]
for start_range, stop_range in curr_table:
for unichar_n in range(start_range, stop_range):
if not _bisearch(unichar_n, next_table):
log.info(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
if _bisearch(unichar_n, other_table):
log.error(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
result, _, _ = bisearch_pair(unichar_n, next_table)
if not result:
log.info(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
)
result, lbound, ubound = bisearch_pair(unichar_n, other_table)
if result:
log.error(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
)
errors += 1
if errors:
log.error(f'{errors} errors, exit 1')
Expand Down
2 changes: 1 addition & 1 deletion docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ Other Languages
History
=======
0.2.12 *2023-11-21*
* re-release to remove .pyi file misplaced in wheel files `Issue #101`.
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.

0.2.11 *2023-11-20*
* Include tests files in the source distribution (`PR #98`_, `PR #100`_).
Expand Down
14 changes: 14 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,17 @@ def test_kannada_script_2():
# verify.
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase


def test_zero_wide_conflict():
    """Check widths of characters that were considered both "wide" and "zero" width."""
    # Each pair is (codepoint, expected width) under unicode_version='4.1.0'.
    expected_widths = (
        # - (0x03000, 0x0303e,),  # Ideographic Space       ..Ideographic Variation In
        # + (0x03000, 0x03029,),  # Ideographic Space       ..Hangzhou Numeral Nine
        (0x03029, 2),
        (0x0302a, 0),
        # - (0x03099, 0x030ff,),  # Combining Katakana-hirag..Katakana Digraph Koto
        # + (0x0309b, 0x030ff,),  # Katakana-hiragana Voiced..Katakana Digraph Koto
        (0x03099, 0),
        (0x0309a, 0),
        (0x0309b, 2),
    )
    for codepoint, expected in expected_widths:
        assert wcwidth.wcwidth(unichr(codepoint), unicode_version='4.1.0') == expected
15 changes: 15 additions & 0 deletions tests/test_table_integrity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""
Executes verify-table-integrity.py as a unit test.
"""
import os
import sys
import subprocess

import pytest

@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
def test_verify_table_integrity():
    """Run bin/verify-table-integrity.py with the current interpreter.

    subprocess.check_output raises CalledProcessError when the script exits
    non-zero, which fails this test.
    """
    tests_dir = os.path.dirname(__file__)
    script_path = os.path.join(tests_dir, os.path.pardir, 'bin', 'verify-table-integrity.py')
    subprocess.check_output([sys.executable, script_path])
Loading