Merge branch 'master' into typed
trim21 authored Oct 28, 2024
2 parents 3190837 + 57cfbda commit 4eaa725
Showing 18 changed files with 354 additions and 144 deletions.
43 changes: 34 additions & 9 deletions .github/workflows/ci.yml
@@ -6,6 +6,8 @@ on:
tags: ["*"]
pull_request:
branches: [master]
schedule:
- cron: "30 16 1 * *"
workflow_dispatch:

env:
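(Note: the schedule trigger added above runs the workflow at 16:30 UTC on the first day of each month; cron fields are minute, hour, day-of-month, month, day-of-week.)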
@@ -33,6 +35,7 @@ jobs:
- "3.10"
- "3.11"
- "3.12"
- "3.13"
- "pypy-2.7"
- "pypy-3.7"
- "pypy-3.8"
@@ -48,13 +51,26 @@
- os: ubuntu-20.04
container: python:2.7-buster
python-version: "2.7"
exclude:
- os: macos-latest
python-version: "3.7"
- os: macos-latest
python-version: "pypy-3.7"

runs-on: ${{ matrix.os }}
container: ${{ matrix.container }}

steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4

- name: Ignore certificate verification on python 3.5
shell: bash
run: |
# INSECURE!! But it should be OK for CI tests.
echo 'PIP_TRUSTED_HOST=pypi.python.org pypi.org files.pythonhosted.org' >>$GITHUB_ENV
if: 'matrix.python-version == 3.5'
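(Note: PIP_TRUSTED_HOST is pip's environment-variable spelling of the repeatable --trusted-host option, so later pip install steps skip TLS verification for those three hosts; presumably a workaround for Python 3.5's aging TLS stack failing certificate verification against PyPI.)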

- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
@@ -117,10 +133,11 @@ jobs:
fi
- name: Upload coverage data
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: coverage-data
name: coverage-data.${{ matrix.os }}-${{ matrix.python-version }}
path: .coverage.*
include-hidden-files: true
if-no-files-found: ignore

coverage:
@@ -129,14 +146,22 @@
needs: tests

steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version-file: .python-version-default
cache: pip

- name: Merge coverage data artifacts
uses: actions/upload-artifact/merge@v4
with:
name: coverage-data
pattern: coverage-data.*
include-hidden-files: true
delete-merged: true
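(Note: actions/upload-artifact@v4 artifacts are immutable and must be uniquely named, so each matrix job above uploads coverage-data.<os>-<python> and this step merges them back into a single coverage-data artifact before download.)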

- name: Download coverage data
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: coverage-data

@@ -152,7 +177,7 @@ jobs:
python -Im coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{secrets.CODECOV_TOKEN}}

@@ -162,7 +187,7 @@
python -Im coverage report --fail-under=100
- name: Upload HTML report if check failed.
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: html-report
path: htmlcov
2 changes: 1 addition & 1 deletion .github/workflows/codeql.yml
@@ -6,7 +6,7 @@ on:
pull_request:
branches: [ "master" ]
schedule:
- cron: "36 4 * * 2"
- cron: "36 4 1 * *"

jobs:
analyze:
2 changes: 1 addition & 1 deletion .python-version-default
@@ -1 +1 @@
3.11
3.12
57 changes: 36 additions & 21 deletions bin/update-tables.py
@@ -21,7 +21,7 @@
from pathlib import Path
from dataclasses import field, fields, dataclass

from typing import Any, Mapping, Iterable, Iterator, Sequence, Container, Collection
from typing import Any, Mapping, Iterable, Iterator, Sequence, Collection

try:
from typing import Self
@@ -54,6 +54,19 @@
MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '6'))
BACKOFF_FACTOR = float(os.environ.get('BACKOFF_FACTOR', '0.1'))

# Hangul Jamo is a decomposed form of Hangul Syllables, see
# https://www.unicode.org/faq/korean.html#3
# https://github.com/ridiculousfish/widecharwidth/pull/17
# https://github.com/jquast/ucs-detect/issues/9
# https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
# "Conjoining Jamo are divided into three classes: L, V, T (Leading
# consonant, Vowel, Trailing consonant). A Hangul Syllable consists of
# <LV> or <LVT> sequences."
HANGUL_JAMO_ZEROWIDTH = (
*range(0x1160, 0x1200), # Hangul Jungseong Filler .. Hangul Jongseong Ssangnieun
*range(0xD7B0, 0xD800), # Hangul Jungseong O-Yeo .. Undefined Character of Hangul Jamo Extended-B
)
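
(A minimal sketch of the intended effect, assuming a wcwidth release that ships this table: a decomposed <L, V, T> jamo sequence must occupy the same 2 cells as its precomposed syllable, so the vowel and trailing consonant must each measure zero.)

    # Sketch only; assumes a wcwidth build with HANGUL_JAMO_ZEROWIDTH applied.
    from wcwidth import wcswidth, wcwidth

    decomposed = '\u1112\u1161\u11ab'   # Choseong Hieuh + Jungseong A + Jongseong Nieun
    precomposed = '\ud55c'              # U+D55C HANGUL SYLLABLE HAN, one codepoint

    assert wcswidth(decomposed) == wcswidth(precomposed) == 2
    assert wcwidth('\u1161') == 0       # Jungseong (vowel) is zero-width
    assert wcwidth('\u11ab') == 0       # Jongseong (trailing consonant) is zero-width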


def _bisearch(ucs, table):
"""A copy of wcwwidth._bisearch, to prevent having issues when depending on code that imports
@@ -77,7 +90,7 @@ def _bisearch(ucs, table):

@dataclass(order=True, frozen=True)
class UnicodeVersion:
"""A class for camparable unicode version."""
"""A class for comparable unicode version."""
major: int
minor: int
micro: int | None
@@ -112,11 +125,11 @@ class TableEntry:
properties: tuple[str, ...]
comment: str

def filter_by_category(self, category_codes: str, wide: int) -> bool:
def filter_by_category_width(self, wide: int) -> bool:
"""
Return whether entry matches given category code and displayed width.
Return whether entry matches displayed width.
Categories are described here, https://www.unicode.org/reports/tr44/#GC_Values_Table
Parses both DerivedGeneralCategory.txt and EastAsianWidth.txt
"""
if self.code_range is None:
return False
@@ -146,13 +159,12 @@ def filter_by_category(self, category_codes: str, wide: int) -> bool:
return wide == 1

@staticmethod
def parse_category_values(category_codes: str,
table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
def parse_width_category_values(table_iter: Iterator[TableEntry],
wide: int) -> set[tuple[int, int]]:
"""Parse value ranges of unicode data files, by given category and width."""
return {n
for entry in table_iter
if entry.filter_by_category(category_codes, wide)
if entry.filter_by_category_width(wide)
for n in list(range(entry.code_range[0], entry.code_range[1]))}


@@ -326,18 +338,19 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# parse typical 'wide' characters by categories 'W' and 'F',
table[version] = parse_category(fname=UnicodeDataFile.EastAsianWidth(version),
category_codes=('W', 'F'),
wide=2)

# subtract(!) wide characters that were defined above as 'W' category in EastAsianWidth,
# but also zero-width category 'Mn' or 'Mc' in DerivedGeneralCategory!
table[version].values.discard(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Mn', 'Mc'),
wide=0).values)
table[version].values = table[version].values.difference(parse_category(
fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0).values)

# Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)

# finally, join with atypical 'wide' characters defined by category 'Sk',
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Sk',),
wide=2).values)
return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
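
(The per-version steps above are plain set algebra; a minimal sketch with hypothetical input-set names, mirroring the order of operations:)

    def build_wide_values(east_asian_wf, zero_width_marks, sk_wide):
        # Hypothetical helper; the input names are illustrative only.
        wide = set(east_asian_wf)            # 'W' and 'F' from EastAsianWidth.txt
        wide -= set(zero_width_marks)        # minus zero-width combining marks
        wide -= set(HANGUL_JAMO_ZEROWIDTH)   # minus jamo vowels, trailing consonants
        wide |= set(sk_wide)                 # plus atypical wide 'Sk' symbols
        return wide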

@@ -352,11 +365,13 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
for version in fetch_unicode_versions():
# Determine values of zero-width character lookup table by the following category codes
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
category_codes=('Me', 'Mn', 'Mc', 'Cf', 'Zl', 'Zp', 'Sk'),
wide=0)

# And, include NULL
# Include NULL
table[version].values.add(0)

# Add Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
return UnicodeTableRenderCtx('ZERO_WIDTH', table)


@@ -482,7 +497,7 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:


def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
"""Parse emoji-variation-sequences.txt for codepoints that preceed 0xFE0F."""
"""Parse emoji-variation-sequences.txt for codepoints that precede 0xFE0F."""
hex_str_vs16 = 'FE0F'
for line in fp:
data, _, comment = line.partition('#')
@@ -496,14 +511,14 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
continue
code_points = code_points_str.split()
if len(code_points) == 2 and code_points[1] == hex_str_vs16:
# yeild a single "code range" entry for a single value that preceeds FE0F
# yield a single "code range" entry for a single value that precedes FE0F
yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)
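
(For illustration, a line of the shape found in emoji-variation-sequences.txt yields one single-codepoint range; a sketch assuming this script's module-level definitions:)

    import io
    sample = '2764 FE0F    ; emoji style;  # (1.1) HEAVY BLACK HEART\n'
    entry = next(parse_vs16_table(io.StringIO(sample)))
    assert entry.code_range == (0x2764, 0x2764)   # U+2764 may precede U+FE0F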


@functools.cache
def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
def parse_category(fname: str, wide: int) -> TableDef:
"""Parse value ranges of unicode data files, by given categories into string tables."""
print(f'parsing {fname} category_codes={",".join(category_codes)}: ', end='', flush=True)
print(f'parsing {fname}, wide={wide}: ', end='', flush=True)

with open(fname, encoding='utf-8') as f:
table_iter = parse_unicode_table(f)
@@ -512,7 +527,7 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> TableDef:
version = next(table_iter).comment.strip()
# and "date string" from second line
date = next(table_iter).comment.split(':', 1)[1].strip()
values = TableEntry.parse_category_values(category_codes, table_iter, wide)
values = TableEntry.parse_width_category_values(table_iter, wide)
print('ok')
return TableDef(version, date, values)

48 changes: 38 additions & 10 deletions bin/verify-table-integrity.py
@@ -63,9 +63,30 @@
import logging


def bisearch_pair(ucs, table):
"""
A copy of wcwidth._bisearch() but also returns the range of matched values.
"""
lbound = 0
ubound = len(table) - 1

if ucs < table[0][0] or ucs > table[ubound][1]:
return (0, None, None)
while ubound >= lbound:
mid = (lbound + ubound) // 2
if ucs > table[mid][1]:
lbound = mid + 1
elif ucs < table[mid][0]:
ubound = mid - 1
else:
return (1, table[mid][0], table[mid][1])

return (0, None, None)
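
(A quick sanity check of the helper against an illustrative two-range table:)

    table = [(0x0300, 0x036F), (0x1160, 0x11FF)]   # hypothetical inclusive ranges
    assert bisearch_pair(0x1161, table) == (1, 0x1160, 0x11FF)  # hit, with matched bounds
    assert bisearch_pair(0x0041, table) == (0, None, None)      # miss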


def main(log: logging.Logger):
# local
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, _bisearch, list_versions
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions

reversed_uni_versions = list(reversed(list_versions()))
tables = {'ZERO_WIDTH': ZERO_WIDTH,
'WIDE_EASTASIAN': WIDE_EASTASIAN}
@@ -81,14 +102,21 @@ def main(log: logging.Logger):
other_table = tables[other_table_name][version]
for start_range, stop_range in curr_table:
for unichar_n in range(start_range, stop_range):
if not _bisearch(unichar_n, next_table):
log.info(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
if _bisearch(unichar_n, other_table):
log.error(f'value {hex(unichar_n)} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}')
result, _, _ = bisearch_pair(unichar_n, next_table)
if not result:
log.info(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is not defined in next_version={next_version}'
f' from inclusive range {hex(start_range)}-{hex(stop_range)}'
)
result, lbound, ubound = bisearch_pair(unichar_n, other_table)
if result:
log.error(
f'value 0x{unichar_n:05x} in table_name={table_name}'
f' version={version} is duplicated in other_table_name={other_table_name}'
f' from inclusive range 0x{start_range:05x}-0x{stop_range:05x} of'
f' {table_name} against 0x{lbound:05x}-0x{ubound:05x} in {other_table_name}'
)
errors += 1
if errors:
log.error(f'{errors} errors, exit 1')
2 changes: 1 addition & 1 deletion bin/wcwidth-browser.py
@@ -116,7 +116,7 @@ def __init__(self, width, unicode_version):
"""
self.characters = []
letters_o = ('o' * width)
for (begin, end) in ZERO_WIDTH[unicode_version]:
for (begin, end) in ZERO_WIDTH[_wcmatch_version(unicode_version)]:
for val in [_val for _val in
range(begin, end + 1)
if _val <= LIMIT_UCS]:
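
(Note: _wcmatch_version resolves the requested version string to the nearest Unicode table version actually compiled into wcwidth, presumably avoiding a lookup failure for version strings without an exact table entry.)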
Empty file.
14 changes: 11 additions & 3 deletions docs/intro.rst
@@ -32,7 +32,7 @@ Example
>>> text = u'コンニチハ'

Python **incorrectly** uses the *string length* of 5 codepoints rather than the
*printible length* of 10 cells, so that when using the `rjust` function, the
*printable length* of 10 cells, so that when using the `rjust` function, the
output length is wrong::

>>> print(len('コンニチハ'))
@@ -216,8 +216,15 @@ Other Languages
=======
History
=======

0.2.13 *2024-01-06*
* **Bugfix** zero-width support for Hangul Jamo (Korean)

0.2.12 *2023-11-21*
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.

0.2.11 *2023-11-20*
* Include tests files in the source distibution (`PR #98`_, `PR #100`_).
* Include tests files in the source distribution (`PR #98`_, `PR #100`_).

0.2.10 *2023-11-13*
* **Bugfix** accounting of some kinds of emoji sequences using U+FE0F
@@ -231,7 +238,7 @@ History
character measurements.

0.2.8 *2023-09-30*
* Include requirements files in the source distibution (`PR #82`_).
* Include requirements files in the source distribution (`PR #82`_).

0.2.7 *2023-09-28*
* **Updated** tables to include Unicode Specification 15.1.0.
@@ -330,6 +337,7 @@ https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c::
.. _`PR #97`: https://github.com/jquast/wcwidth/pull/97
.. _`PR #98`: https://github.com/jquast/wcwidth/pull/98
.. _`PR #100`: https://github.com/jquast/wcwidth/pull/100
.. _`Issue #101`: https://github.com/jquast/wcwidth/issues/101
.. _`jquast/blessed`: https://github.com/jquast/blessed
.. _`selectel/pyte`: https://github.com/selectel/pyte
.. _`thomasballinger/curtsies`: https://github.com/thomasballinger/curtsies
