Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Variation Selector 15 (VS-15, U+FE0E) support. #120

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 58 additions & 10 deletions bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,19 +417,22 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
"""
table: dict[UnicodeVersion, TableDef] = {}
unicode_latest = fetch_unicode_versions()[-1]
hex_str_vs = 'FE0F'

wide_tables = fetch_table_wide_data().table
unicode_version = UnicodeVersion.parse('9.0.0')

# parse table formatted by the latest emoji release (developed with
# 15.1.0) and parse a single file for all individual releases
table[unicode_version] = parse_vs16_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
ubound_unicode_version=unicode_version)
table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
ubound_unicode_version=unicode_version,
hex_str_vs=hex_str_vs)

# parse and join the final emoji release 12.0 of the earlier "type"
table[unicode_version].values.update(
parse_vs16_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
ubound_unicode_version=unicode_version).values)
parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
ubound_unicode_version=unicode_version,
hex_str_vs=hex_str_vs).values)

# perform culling on any values that are already understood as 'wide'
# without the variation-16 selector
Expand All @@ -442,16 +445,61 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
return UnicodeTableRenderCtx('VS16_NARROW_TO_WIDE', table)


def parse_vs16_data(fname: str, ubound_unicode_version: UnicodeVersion):
def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
with open(fname, encoding='utf-8') as fin:
table_iter = parse_vs16_table(fin)
table_iter = parse_vs_table(fin, hex_str_vs)
# pull "date string"
date = next(table_iter).comment.split(':', 1)[1].strip()
# pull values only matching this unicode version and lower
values = {entry.code_range[0] for entry in table_iter}
return TableDef(ubound_unicode_version, date, values)


def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
"""
Fetch and create a "wide to narrow variation-15" lookup table.

Characters in this table are wide, but when combined with a variation selector-15 (\uFE0E), they
become narrow, for the given versions of unicode.

UNICODE_VERSION=9.0.0 or greater is required to enable detection of the effect of *any*
'variation selector-15' wide emoji becoming narrow.

Some terminals (iTerm2) display U+231A, U+FE0E using a narrow glyph that still consumes a wide
cell, while most others display it as a wide cell only.

It is fair to call these ambiguous, see related 'ucs-detect' project.
"""
table: dict[UnicodeVersion, TableDef] = {}
unicode_latest = fetch_unicode_versions()[-1]
hex_str_vs = 'FE0E'

wide_tables = fetch_table_wide_data().table
unicode_version = UnicodeVersion.parse('9.0.0')

# parse table formatted by the latest emoji release (developed with
# 15.1.0) and parse a single file for all individual releases
table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
ubound_unicode_version=unicode_version,
hex_str_vs=hex_str_vs)

# parse and join the final emoji release 12.0 of the earlier "type"
table[unicode_version].values.update(
parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
ubound_unicode_version=unicode_version,
hex_str_vs=hex_str_vs).values)

# perform culling on any values that are already understood as 'narrow'
# without the variation-15 selector
wide_table = wide_tables[unicode_version].as_value_ranges()
table[unicode_version].values = {
ucs for ucs in table[unicode_version].values
if _bisearch(ucs, wide_table)
}

return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)


def cite_source_description(filename: str) -> tuple[str, str]:
"""Return unicode.org source data file's own description as citation."""
with open(filename, encoding='utf-8') as f:
Expand Down Expand Up @@ -496,9 +544,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
yield TableEntry(code_range, tuple(properties), comment)


def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
"""Parse emoji-variation-sequences.txt for codepoints that preceed 0xFE0F."""
hex_str_vs16 = 'FE0F'
def parse_vs_table(fp: Iterable[str], hex_str_vs: str = 'FE0F') -> Iterator[TableEntry]:
"""Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`"""
for line in fp:
data, _, comment = line.partition('#')
data_fields: Iterator[str] = (field.strip() for field in data.split(';'))
Expand All @@ -510,7 +557,7 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
yield TableEntry(None, tuple(properties), comment)
continue
code_points = code_points_str.split()
if len(code_points) == 2 and code_points[1] == hex_str_vs16:
if len(code_points) == 2 and code_points[1] == hex_str_vs:
# yield a single "code range" entry for a single value that precedes the variation selector
yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)

Expand Down Expand Up @@ -663,6 +710,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
UnicodeVersionPyRenderCtx(fetch_unicode_versions())
)
yield UnicodeTableRenderDef.new('table_vs16.py', fetch_table_vs16_data())
yield UnicodeTableRenderDef.new('table_vs15.py', fetch_table_vs15_data())
yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
yield UnicodeVersionRstRenderDef.new(fetch_source_headers())
Expand Down
5 changes: 2 additions & 3 deletions bin/verify-table-integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@


def bisearch_pair(ucs, table):
"""
A copy of wcwidth._bisearch() but also returns the range of matched values.
"""
"""A copy of wcwidth._bisearch() but also returns the range of matched values."""
lbound = 0
ubound = len(table) - 1

Expand All @@ -85,6 +83,7 @@ def bisearch_pair(ucs, table):


def main(log: logging.Logger):
# local
from wcwidth import ZERO_WIDTH, WIDE_EASTASIAN, list_versions

reversed_uni_versions = list(reversed(list_versions()))
Expand Down
4 changes: 4 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,10 @@ Other Languages
History
=======

0.2.14 *2024-02-14*
* **Bugfix** accounting of some kinds of emoji sequences using U+FE0E
Variation Selector 15 (`PR #120`_).

0.2.13 *2024-01-06*
* **Bugfix** zero-width support for Hangul Jamo (Korean)

Expand Down
4 changes: 4 additions & 0 deletions docs/specs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ Width of 1
String characters are measured width of 1 when they are not
measured as `Width of 0`_ or `Width of 2`_.

Any character in sequence with `U+FE0E`_ (Variation Selector 15) defined
by `emoji-variation-sequences.txt`_ as ``text style``.

Width of 2
----------

Expand All @@ -73,6 +76,7 @@ Any character in sequence with `U+FE0F`_ (Variation Selector 16) defined by
.. _`U+2029`: https://codepoints.net/U+2029
.. _`U+D7B0`: https://codepoints.net/U+D7B0
.. _`U+D7FF`: https://codepoints.net/U+D7FF
.. _`U+FE0E`: https://codepoints.net/U+FE0E
.. _`U+FE0F`: https://codepoints.net/U+FE0F
.. _`DerivedGeneralCategory.txt`: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
.. _`EastAsianWidth.txt`: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def main():
setuptools.setup(
name='wcwidth',
# NOTE: manually manage __version__ in wcwidth/__init__.py !
version='0.2.13',
version='0.2.14',
description=(
"Measures the displayed width of unicode strings in a terminal"),
long_description=codecs.open(
Expand Down
89 changes: 80 additions & 9 deletions tests/test_emojis.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_longer_emoji_zwj_sequence():
u"\u200d" # 'Cf', 'N' -- ZERO WIDTH JOINER
u"\U0001F9D1" # 'So', 'W' -- ADULT
u"\U0001F3FD" # 'Sk', 'W' -- EMOJI MODIFIER FITZPATRICK TYPE-4
) * 2
) * 2
# This test adapted from https://www.unicode.org/L2/L2023/23107-terminal-suppt.pdf
expect_length_each = (2, 0, 0, 1, 0, 0, 2, 0, 2, 0) * 2
expect_length_phrase = 4
Expand All @@ -148,8 +148,8 @@ def test_longer_emoji_zwj_sequence():
def read_sequences_from_file(filename):
fp = codecs.open(os.path.join(os.path.dirname(__file__), filename), 'r', encoding='utf-8')
lines = [line.strip()
for line in fp.readlines()
if not line.startswith('#') and line.strip()]
for line in fp.readlines()
if not line.startswith('#') and line.strip()]
fp.close()
sequences = [make_sequence_from_line(line) for line in lines]
return lines, sequences
Expand Down Expand Up @@ -184,7 +184,7 @@ def test_recommended_emoji_zwj_sequences():

def test_recommended_variation_16_sequences():
"""
Test wcswidth of all of the unicode.org-published emoji-variation-sequences.txt
Test wcswidth of vs-16 sequences from unicode.org's emoji-variation-sequences.txt
"""
# given,
lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')
Expand All @@ -210,12 +210,61 @@ def test_recommended_variation_16_sequences():
assert num >= 742


def test_recommended_variation_15_sequences():
"""
Test wcswidth of vs-15 sequences from unicode.org's emoji-variation-sequences.txt
"""
# given,
lines, sequences = read_sequences_from_file('emoji-variation-sequences.txt')

errors = []
num = 0
for sequence, line in zip(sequences, lines):
num += 1
if '\ufe0e' not in sequence:
# filter for only \uFE0E (VS-15)
continue
measured_width = wcwidth.wcswidth(sequence)
if measured_width != 1:
errors.append({
'expected_width': 1,
'line': line,
'measured_width': wcwidth.wcswidth(sequence),
'sequence': sequence,
})

# verify
assert errors == []
assert num >= 742


def test_unicode_9_vs16():
"""Verify effect of VS-16 on unicode_version 9.0 and later"""
phrase = (u"\u2640" # FEMALE SIGN
u"\uFE0F" # VARIATION SELECTOR-16
u"X" # ASCII Letter 'X'
u"\uFE0F") # VARIATION SELECTOR-16

expect_length_each = (1, 0)
expect_length_each = (1, 0, 1, 0)
expect_length_phrase = 3

# exercise,
length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='9.0') for w_char in phrase)
length_phrase = wcwidth.wcswidth(phrase, unicode_version='9.0')

# verify.
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase


def test_unicode_9_vs15():
"""Verify effect of VS-15 on unicode_version 9.0 and later"""
phrase = (u"\U0001f4da" # BOOKS
u"\uFE0E" # VARIATION SELECTOR-15
u"X" # ASCII Letter 'X'
u"\uFE0E") # VARIATION SELECTOR-15

expect_length_each = (2, 0, 1, 0)
expect_length_phrase = 2

# exercise,
Expand All @@ -226,18 +275,40 @@ def test_unicode_9_vs16():
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase


def test_unicode_8_vs16():
"""Verify that VS-16 has no effect on unicode_version 8.0 and earler"""
"""Verify that VS-16 has no effect on unicode_version 8.0 and earlier"""
phrase = (u"\u2640" # FEMALE SIGN
u"\uFE0F" # VARIATION SELECTOR-16
u"X" # ASCII Letter 'X'
u"\uFE0F") # VARIATION SELECTOR-16

expect_length_each = (1, 0)
expect_length_phrase = 1
expect_length_each = (1, 0, 1, 0)
expect_length_phrase = 2

# exercise,
length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='8.0') for w_char in phrase)
length_phrase = wcwidth.wcswidth(phrase, unicode_version='8.0')

# verify.
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase


def test_unicode_8_vs15():
"""Verify that VS-15 has no effect on unicode_version 8.0 and earlier"""
phrase = (u"\U0001f4da" # BOOKS
u"\uFE0E" # VARIATION SELECTOR-15
u"X" # ASCII Letter 'X'
u"\uFE0E") # VARIATION SELECTOR-15

expect_length_each = (1, 0, 1, 0)
expect_length_phrase = 2

# exercise,
length_each = tuple(wcwidth.wcwidth(w_char, unicode_version='8.0') for w_char in phrase)
length_phrase = wcwidth.wcswidth(phrase, unicode_version='8.0')

# verify.
assert length_each == expect_length_each
assert length_phrase == expect_length_phrase
assert length_phrase == expect_length_phrase
5 changes: 4 additions & 1 deletion tests/test_table_integrity.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
"""
Executes verify-table-integrity.py as a unit test.
"""
# std imports
import os
import sys
import subprocess

# 3rd party
import pytest


@pytest.mark.skipif(sys.version_info[:2] != (3, 12), reason='Test only with a single version of python')
def test_verify_table_integrity():
subprocess.check_output([sys.executable, os.path.join(os.path.dirname(__file__),
os.path.pardir,
'bin',
'verify-table-integrity.py')])
'verify-table-integrity.py')])
6 changes: 5 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,13 @@ basepython = python3.11
commands = {envbindir}/isort --quiet --apply --recursive wcwidth tests bin

[testenv:pylint]
# Files table_vs15.py and table_wide.py erroneously report "duplicate lines".
# Short of adding '# pylint: disable=duplicate-code' to the template files, we
# can choose only between disabling a specific check or ignoring specific
# files. We ignore the files.
basepython = python3.11
commands = {envbindir}/pylint --rcfile={toxinidir}/.pylintrc \
--ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox \
--ignore=tests,docs,setup.py,conf.py,build,distutils,.pyenv,.git,.tox,table_wide.py,table_vs15.py \

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO, this is for .pylintrc / pyproject.toml section for pylint

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[tool:pylint] may also be included in tox.ini, but I decided it was less complex to just include it here with the others

{posargs:{toxinidir}}/wcwidth

[testenv:flake8]
Expand Down
14 changes: 8 additions & 6 deletions wcwidth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
"""
# re-export all functions & definitions, even private ones, from top-level
# module path, to allow for 'from wcwidth import _private_func'. Of course,
# user beware that any _private function may disappear or change signature at
# any future version.
# user beware that any _private functions or variables not exported by __all__
# may disappear or change signature at any future version.

# local
from .wcwidth import ZERO_WIDTH # noqa
from .wcwidth import (WIDE_EASTASIAN,
VS15_WIDE_TO_NARROW,
VS16_NARROW_TO_WIDE,
wcwidth,
wcswidth,
Expand All @@ -23,7 +24,8 @@
# 'from wcwidth import *', but also to say, "This is the public API".
__all__ = ('wcwidth', 'wcswidth', 'list_versions')

# We also used pkg_resources to load unicode version tables from version.json,
# generated by bin/update-tables.py, but some environments are unable to
# import pkg_resources for one reason or another, yikes!
__version__ = '0.2.13'
# We previously used pkg_resources to load unicode version tables from
# 'version.json', generated by bin/update-tables.py, but some environments are
# unable to import pkg_resources for one reason or another, so the package
# version is MANUALLY DUPLICATED here and in setup.py
__version__ = '0.2.14'
Loading
Loading