Skip to content

Commit

Permalink
Merge branch 'master' into jq/spec-update
Browse files Browse the repository at this point in the history
  • Loading branch information
jquast authored Jan 6, 2024
2 parents 3970392 + 8ed7f2c commit 1ba02e2
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 103 deletions.
49 changes: 32 additions & 17 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@
MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '6'))
BACKOFF_FACTOR = float(os.environ.get('BACKOFF_FACTOR', '0.1'))

# Hangul Jamo is a decomposed form of Hangul Syllables. Background reading:
#
#   https://www.unicode.org/faq/korean.html#3
#   https://github.com/ridiculousfish/widecharwidth/pull/17
#   https://github.com/jquast/ucs-detect/issues/9
#   https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
#
#   "Conjoining Jamo are divided into three classes: L, V, T (Leading
#    consonant, Vowel, Trailing consonant). A Hangul Syllable consists of
#    <LV> or <LVT> sequences."
#
# The two blocks below are flattened into a single tuple of code points.
HANGUL_JAMO_ZEROWIDTH = (
    # Hangul Jungseong Filler .. Hangul Jongseong Ssangnieun
    tuple(range(0x1160, 0x1200))
    # Hangul Jungseong O-Yeo .. Undefined Character of Hangul Jamo Extended-B
    + tuple(range(0xD7B0, 0xD800))
)


def _bisearch(ucs, table):
"""A copy of wcwidth._bisearch, to prevent having issues when depending on code that imports
Expand Down Expand Up @@ -112,11 +125,11 @@ class TableEntry:
properties: tuple[str, ...]
comment: str

def filter_by_category(self, category_codes: str, wide: int) -> bool:
def filter_by_category_width(self, wide: int) -> bool:
"""
Return whether entry matches given category code and displayed width.
Return whether entry matches displayed width.
Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
"""
if self.code_range is None:
return False
Expand Down Expand Up @@ -146,13 +159,12 @@ def filter_by_category(self, category_codes: str, wide: int) -> bool:
return wide == 1

@staticmethod
def parse_category_values(category_codes: str,
table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
def parse_width_category_values(table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
"""Parse value ranges of unicode data files, by given category and width."""
return {n
for entry in table_iter
if entry.filter_by_category(category_codes, wide)
if entry.filter_by_category_width(wide)
for n in list(range(entry.code_range[0], entry.code_range[1]))}


Expand Down Expand Up @@ -326,18 +338,19 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# parse typical 'wide' characters by categories 'W' and 'F',
table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
category_codes=('W', 'F'),
wide=2)

# subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
# but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Mn', 'Mc'),
wide=0).values)
table[version].values = table[version].values.difference(parse_category(
fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0).values)

# Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)

# finally, join with atypical 'wide' characters defined by category 'Sk',
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Sk',),
wide=2).values)
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)

Expand All @@ -352,11 +365,13 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# Determine values of zero-width character lookup table by the following category codes
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
wide=0)

# And, include NULL
# Include NULL
table[version].values.add(0)

# Add Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand Down Expand Up @@ -501,9 +516,9 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:


@functools.cache
def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
def parse_category(fname: str, wide: int) -> TableDef:
"""Parse value ranges of unicode data files, by given categories into string tables."""
print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
print(f'parsing {fname}, wide={wide}: ', end='', flush=True)

with open(fname, encoding='utf-8') as f:
table_iter = parse_unicode_table(f)
Expand All @@ -512,7 +527,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
version = next(table_iter).comment.strip()
# and "date string" from second line
date = next(table_iter).comment.split(':', 1)[1].strip()
values = TableEntry.parse_category_values(category_codes, table_iter, wide)
values = TableEntry.parse_width_category_values(table_iter, wide)
print('ok')
return TableDef(version, date, values)

Expand Down
48 changes: 38 additions & 10 deletions bin/verify-table-integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,30 @@
import logging


def bisearch_pair(ucs, table):
    """
    Binary search ``table`` for the range that contains ``ucs``.

    A copy of wcwidth._bisearch() but also returns the range of matched values.

    :param ucs: ordinal value to look up.
    :param table: sequence of ``(start, end)`` pairs; both ends are compared
        inclusively. The search assumes the pairs are sorted ascending.
    :returns: ``(1, start, end)`` of the matching pair when ``ucs`` falls
        inside one of them, otherwise ``(0, None, None)``.
    """
    # An empty table cannot contain anything; without this guard the bounds
    # check below would raise IndexError on ``table[0]``.
    if not table:
        return (0, None, None)

    lbound = 0
    ubound = len(table) - 1

    # Fast path: value lies outside the span covered by the whole table.
    if ucs < table[0][0] or ucs > table[ubound][1]:
        return (0, None, None)
    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            lbound = mid + 1
        elif ucs < table[mid][0]:
            ubound = mid - 1
        else:
            return (1, table[mid][0], table[mid][1])

    # Value falls in a gap between two neighbouring ranges.
    return (0, None, None)


def main(log: logging.Logger):
# local
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions

reversed_uni_versions = list(reversed(list_versions()))
tables = {'ZERO_WIDTH': ZERO_WIDTH,
'WIDE_EASTASIAN': WIDE_EASTASIAN}
Expand All @@ -81,14 +102,21 @@ def main(log: logging.Logger):
other_table = tables[other_table_name][version]
for start_range, stop_range in curr_table:
for unichar_n in range(start_range, stop_range):
if not _bisearch(unichar_n, next_table):
log.info(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
if _bisearch(unichar_n, other_table):
log.error(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
result, _, _ = bisearch_pair(unichar_n, next_table)
if not result:
log.info(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
)
result, lbound, ubound = bisearch_pair(unichar_n, other_table)
if result:
log.error(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
)
errors += 1
if errors:
log.error(f'{errors} errors, exit 1')
Expand Down
5 changes: 4 additions & 1 deletion docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,11 @@ Other Languages
=======
History
=======
Unreleased
* **Bugfix** zero-width support for Hangul Jamo (Korean)

0.2.12 *2023-11-21*
* re-release to remove .pyi file misplaced in wheel files `Issue #101`.
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.

0.2.11 *2023-11-20*
* Include tests files in the source distribution (`PR #98`_, `PR #100`_).
Expand Down
57 changes: 51 additions & 6 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,17 +222,48 @@ def test_balinese_script():
assert length_phrase == expect_length_phrase


def test_kr_jamo():
    """
    Test basic combining of HANGUL CHOSEONG and JUNGSEONG.

    Example taken from Raymond Chen's blog post,
    https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
    """
    # Both characters are "wide" when displayed alone, but a JUNGSEONG (vowel)
    # is designed to be combined with a CHOSEONG (consonant).
    #
    # This wcwidth library measures the vowel only as part of a combination,
    # not as an independently displayed character -- the same treatment given
    # to other zero-width characters that may only combine with an appropriate
    # preceding character.
    choseong_kiyeok = u"\u1100"  # ᄀ HANGUL CHOSEONG KIYEOK (consonant)
    jungseong_a = u"\u1161"  # ᅡ HANGUL JUNGSEONG A (vowel)
    phrase = choseong_kiyeok + jungseong_a

    # exercise,
    width_per_char = tuple(wcwidth.wcwidth(char) for char in phrase)
    width_total = wcwidth.wcswidth(phrase)

    # verify.
    assert width_per_char == (2, 0)
    assert width_total == 2


def test_kr_jamo_filler():
u"""
Jamo filler is 0 width.
According to https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf this character and others
like it, ``\uffa0``, ``\u1160``, ``\u115f``, ``\u1160``, are not commonly viewed with a terminal,
seems it doesn't matter whether it is implemented or not, they are not typically used !
Example from https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf
"""
phrase = u"\u1100\u1160"
expect_length_each = (2, 1)
expect_length_phrase = 3
phrase = (
u"\u1100" # HANGUL CHOSEONG KIYEOK (consonant)
u"\u1160" # HANGUL JUNGSEONG FILLER (vowel)
)
expect_length_each = (2, 0)
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand Down Expand Up @@ -355,3 +386,17 @@ def test_kannada_script_2():
# verify.
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase


def test_zero_wide_conflict():
    """Test characters once considered both "wide" and "zero" width, at unicode version 4.1.0."""
    expected_widths = (
        # - (0x03000, 0x0303e,),  # Ideographic Space       ..Ideographic Variation In
        # + (0x03000, 0x03029,),  # Ideographic Space       ..Hangzhou Numeral Nine
        (0x03029, 2),
        (0x0302a, 0),

        # - (0x03099, 0x030ff,),  # Combining Katakana-hirag..Katakana Digraph Koto
        # + (0x0309b, 0x030ff,),  # Katakana-hiragana Voiced..Katakana Digraph Koto
        (0x03099, 0),
        (0x0309a, 0),
        (0x0309b, 2),
    )
    for codepoint, expected in expected_widths:
        assert wcwidth.wcwidth(unichr(codepoint), unicode_version='4.1.0') == expected
15 changes: 15 additions & 0 deletions tests/test_table_integrity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""
Executes verify-table-integrity.py as a unit test.
"""
import os
import sys
import subprocess

import pytest

@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
def test_verify_table_integrity():
    """Run bin/verify-table-integrity.py with the current interpreter; a non-zero exit status raises."""
    tests_folder = os.path.dirname(__file__)
    script_path = os.path.join(tests_folder, os.path.pardir, 'bin', 'verify-table-integrity.py')
    subprocess.check_output([sys.executable, script_path])
Loading

0 comments on commit 1ba02e2

Please sign in to comment.