Skip to content

Commit

Permalink
Implement zero-width support for Hangul Jamo (#111)
Browse files Browse the repository at this point in the history
  • Loading branch information
jquast authored Jan 6, 2024
1 parent 3af992a commit 8ed7f2c
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 21 deletions.
21 changes: 20 additions & 1 deletion bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@
MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '6'))
BACKOFF_FACTOR = float(os.environ.get('BACKOFF_FACTOR', '0.1'))

# Hangul Jamo is a decomposed form of Hangul Syllables, see
# https://www.unicode.org/faq/korean.html#3
# https://github.com/ridiculousfish/widecharwidth/pull/17
# https://github.com/jquast/ucs-detect/issues/9
# https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
# "Conjoining Jamo are divided into three classes: L, V, T (Leading
# consonant, Vowel, Trailing consonant). A Hangul Syllable consists of
# <LV> or <LVT> sequences."
# Flat tuple of every code point in the two end-exclusive ranges below.
HANGUL_JAMO_ZEROWIDTH = (
    # Hangul Jungseong Filler .. Hangul Jongseong Ssangnieun
    tuple(range(0x1160, 0x1200))
    # Hangul Jungseong O-Yeo .. Undefined Character of Hangul Jamo Extended-B
    + tuple(range(0xD7B0, 0xD800))
)


def _bisearch(ucs, table):
"""A copy of wcwwidth._bisearch, to prevent having issues when depending on code that imports
Expand Down Expand Up @@ -333,6 +346,9 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0).values)

# Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)

# finally, join with atypical 'wide' characters defined by category 'Sk',
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=2).values)
Expand All @@ -351,8 +367,11 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0)

# And, include NULL
# Include NULL
table[version].values.add(0)

# Add Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand Down
3 changes: 3 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,9 @@ Other Languages
=======
History
=======
Unreleased
* **Bugfix** zero-width support for Hangul Jamo (Korean)

0.2.12 *2023-11-21*
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.

Expand Down
43 changes: 37 additions & 6 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,17 +222,48 @@ def test_balinese_script():
assert length_phrase == expect_length_phrase


def test_kr_jamo():
    """
    Verify that a HANGUL CHOSEONG followed by a JUNGSEONG measures two cells in total.

    Example taken from Raymond Chen's blog post,
    https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
    """
    # Displayed alone, each of these two characters is "wide".
    #
    # The JUNGSEONG (vowel), however, is designed to combine with a
    # CHOSEONG (consonant).
    #
    # This wcwidth library reports its width only as part of that combination,
    # not as it would appear on its own -- the same treatment given to other
    # zero-width characters that only combine with an appropriate preceding
    # character.
    choseong = u"\u1100"  # ᄀ HANGUL CHOSEONG KIYEOK (consonant)
    jungseong = u"\u1161"  # ᅡ HANGUL JUNGSEONG A (vowel)
    phrase = choseong + jungseong

    # exercise,
    widths = tuple(wcwidth.wcwidth(char) for char in phrase)
    total = wcwidth.wcswidth(phrase)

    # verify.
    assert widths == (2, 0)
    assert total == 2


def test_kr_jamo_filler():
u"""
Jamo filler is 0 width.
According to https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf this character and others
like it, ``\uffa0``, ``\u115f``, ``\u1160``, are not commonly viewed with a terminal;
it seems not to matter whether they are implemented or not, as they are not typically used!
Example from https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf
"""
phrase = u"\u1100\u1160"
expect_length_each = (2, 1)
expect_length_phrase = 3
phrase = (
u"\u1100" # HANGUL CHOSEONG KIYEOK (consonant)
u"\u1160" # HANGUL JUNGSEONG FILLER (vowel)
)
expect_length_each = (2, 0)
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand Down
14 changes: 1 addition & 13 deletions wcwidth/table_wide.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Exports WIDE_EASTASIAN table keyed by supporting unicode version level.
This code generated by wcwidth/bin/update-tables.py on 2024-01-03 17:16:09 UTC.
This code generated by wcwidth/bin/update-tables.py on 2024-01-06 01:39:49 UTC.
"""
WIDE_EASTASIAN = {
'4.1.0': (
Expand Down Expand Up @@ -126,8 +126,6 @@
# Date: 2009-06-09, 17:47:00 PDT [KW]
#
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
Expand All @@ -149,8 +147,6 @@
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop
Expand All @@ -169,8 +165,6 @@
# Date: 2010-08-17, 12:17:00 PDT [KW]
#
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
Expand All @@ -192,8 +186,6 @@
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop
Expand All @@ -214,8 +206,6 @@
# Date: 2011-09-19, 18:46:00 GMT [KW]
#
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
Expand All @@ -237,8 +227,6 @@
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop
Expand Down
Loading

0 comments on commit 8ed7f2c

Please sign in to comment.