diff --git a/lib/std/text/unicode.kk b/lib/std/text/unicode.kk
index 7fa7de832..f8ca71531 100644
--- a/lib/std/text/unicode.kk
+++ b/lib/std/text/unicode.kk
@@ -37,6 +37,7 @@ pub fun is-combining( c : char ) : bool {
    (i >= 0x20D0 && i <= 0x20FF) ||
    (i >= 0xFE20 && i <= 0xFE2F) ||
    (i >= 0xFE00 && i <= 0xFE0F)) // Added variation selectors
+  // Should we instead add `zero-widths.force.contains(i)`?
 }
 
 // Join combining characters with their base into a grapheme.
@@ -124,21 +125,277 @@ pub fun string/width( s : string ) : int {
 
 //--------------------------------------------------------------
 // These characters are considered wide, i.e. 2 columns wide.
+// https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
+// See ranges with postfix ;W (wide) and ;F (fullwidth); both take 2 columns.
+//
+// Update with `python3 util/update-unicode.py -a`
+// NOTE: keep the ;F entries (0x3000, 0xFF01-0xFF60, 0xFFE0-0xFFE6) when regenerating;
+// without them fullwidth forms such as U+FF21 are measured as 1 column.
+// TODO: Handle 'unassigned' ranges: (Following is an excerpt from https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt )
+// - All code points, assigned or unassigned, that are not listed
+//   explicitly are given the value "N".
+// - The unassigned code points in the following blocks default to "W":
+//     CJK Unified Ideographs Extension A: U+3400..U+4DBF
+//     CJK Unified Ideographs: U+4E00..U+9FFF
+//     CJK Compatibility Ideographs: U+F900..U+FAFF
+// - All undesignated code points in Planes 2 and 3, whether inside or
+//   outside of allocated blocks, default to "W":
+//     Plane 2: U+20000..U+2FFFD
+//     Plane 3: U+30000..U+3FFFD
 val asian-wide : delayed = delay{
   build-rtree([
     single(0x1100,0x115F),
+    single(0x231A,0x231B),
     single(0x2329,0x2329),
     single(0x232A,0x232A),
-    single(0x2E80,0x303E),
-    single(0x3040,0xA4CF),
+    single(0x23E9,0x23EC),
+    single(0x23F0,0x23F0),
+    single(0x23F3,0x23F3),
+    single(0x25FD,0x25FE),
+    single(0x2614,0x2615),
+    single(0x2648,0x2653),
+    single(0x267F,0x267F),
+    single(0x2693,0x2693),
+    single(0x26A1,0x26A1),
+    single(0x26AA,0x26AB),
+    single(0x26BD,0x26BE),
+    single(0x26C4,0x26C5),
+    single(0x26CE,0x26CE),
+    single(0x26D4,0x26D4),
+    single(0x26EA,0x26EA),
+    single(0x26F2,0x26F3),
+    single(0x26F5,0x26F5),
+    single(0x26FA,0x26FA),
+    single(0x26FD,0x26FD),
+    single(0x2705,0x2705),
+    single(0x270A,0x270B),
+    single(0x2728,0x2728),
+    single(0x274C,0x274C),
+    single(0x274E,0x274E),
+    single(0x2753,0x2755),
+    single(0x2757,0x2757),
+    single(0x2795,0x2797),
+    single(0x27B0,0x27B0),
+    single(0x27BF,0x27BF),
+    single(0x2B1B,0x2B1C),
+    single(0x2B50,0x2B50),
+    single(0x2B55,0x2B55),
+    single(0x2E80,0x2E99),
+    single(0x2E9B,0x2EF3),
+    single(0x2F00,0x2FD5),
+    single(0x2FF0,0x2FFB),
+    single(0x3000,0x3000),
+    single(0x3001,0x3003),
+    single(0x3004,0x3004),
+    single(0x3005,0x3005),
+    single(0x3006,0x3006),
+    single(0x3007,0x3007),
+    single(0x3008,0x3008),
+    single(0x3009,0x3009),
+    single(0x300A,0x300A),
+    single(0x300B,0x300B),
+    single(0x300C,0x300C),
+    single(0x300D,0x300D),
+    single(0x300E,0x300E),
+    single(0x300F,0x300F),
+    single(0x3010,0x3010),
+    single(0x3011,0x3011),
+    single(0x3012,0x3013),
+    single(0x3014,0x3014),
+    single(0x3015,0x3015),
+    single(0x3016,0x3016),
+    single(0x3017,0x3017),
+    single(0x3018,0x3018),
+    single(0x3019,0x3019),
+    single(0x301A,0x301A),
+    single(0x301B,0x301B),
+    single(0x301C,0x301C),
+    single(0x301D,0x301D),
+    single(0x301E,0x301F),
+    single(0x3020,0x3020),
+    single(0x3021,0x3029),
+    single(0x302A,0x302D),
+    single(0x302E,0x302F),
+    single(0x3030,0x3030),
+    single(0x3031,0x3035),
+    single(0x3036,0x3037),
+    single(0x3038,0x303A),
+    single(0x303B,0x303B),
+    single(0x303C,0x303C),
+    single(0x303D,0x303D),
+    single(0x303E,0x303E),
+    single(0x3041,0x3096),
+    single(0x3099,0x309A),
+    single(0x309B,0x309C),
+    single(0x309D,0x309E),
+    single(0x309F,0x309F),
+    single(0x30A0,0x30A0),
+    single(0x30A1,0x30FA),
+    single(0x30FB,0x30FB),
+    single(0x30FC,0x30FE),
+    single(0x30FF,0x30FF),
+    single(0x3105,0x312F),
+    single(0x3131,0x318E),
+    single(0x3190,0x3191),
+    single(0x3192,0x3195),
+    single(0x3196,0x319F),
+    single(0x31A0,0x31BF),
+    single(0x31C0,0x31E3),
+    single(0x31F0,0x31FF),
+    single(0x3200,0x321E),
+    single(0x3220,0x3229),
+    single(0x322A,0x3247),
+    single(0x3250,0x3250),
+    single(0x3251,0x325F),
+    single(0x3260,0x327F),
+    single(0x3280,0x3289),
+    single(0x328A,0x32B0),
+    single(0x32B1,0x32BF),
+    single(0x32C0,0x32FF),
+    single(0x3300,0x33FF),
+    single(0x3400,0x4DBF),
+    single(0x4E00,0x9FFF),
+    single(0xA000,0xA014),
+    single(0xA015,0xA015),
+    single(0xA016,0xA48C),
+    single(0xA490,0xA4C6),
+    single(0xA960,0xA97C),
     single(0xAC00,0xD7A3),
-    single(0xF900,0xFAFF),
-    single(0xFE10,0xFE19),
-    single(0xFE30,0xFE6F),
-    single(0xFF00,0xFF60),
-    single(0xFFE0,0xFFE6),
-    single(0x20000,0x2FFFD),
-    single(0x30000,0x3FFFD),
+    single(0xF900,0xFA6D),
+    single(0xFA6E,0xFA6F),
+    single(0xFA70,0xFAD9),
+    single(0xFADA,0xFAFF),
+    single(0xFE10,0xFE16),
+    single(0xFE17,0xFE17),
+    single(0xFE18,0xFE18),
+    single(0xFE19,0xFE19),
+    single(0xFE30,0xFE30),
+    single(0xFE31,0xFE32),
+    single(0xFE33,0xFE34),
+    single(0xFE35,0xFE35),
+    single(0xFE36,0xFE36),
+    single(0xFE37,0xFE37),
+    single(0xFE38,0xFE38),
+    single(0xFE39,0xFE39),
+    single(0xFE3A,0xFE3A),
+    single(0xFE3B,0xFE3B),
+    single(0xFE3C,0xFE3C),
+    single(0xFE3D,0xFE3D),
+    single(0xFE3E,0xFE3E),
+    single(0xFE3F,0xFE3F),
+    single(0xFE40,0xFE40),
+    single(0xFE41,0xFE41),
+    single(0xFE42,0xFE42),
+    single(0xFE43,0xFE43),
+    single(0xFE44,0xFE44),
+    single(0xFE45,0xFE46),
+    single(0xFE47,0xFE47),
+    single(0xFE48,0xFE48),
+    single(0xFE49,0xFE4C),
+    single(0xFE4D,0xFE4F),
+    single(0xFE50,0xFE52),
+    single(0xFE54,0xFE57),
+    single(0xFE58,0xFE58),
+    single(0xFE59,0xFE59),
+    single(0xFE5A,0xFE5A),
+    single(0xFE5B,0xFE5B),
+    single(0xFE5C,0xFE5C),
+    single(0xFE5D,0xFE5D),
+    single(0xFE5E,0xFE5E),
+    single(0xFE5F,0xFE61),
+    single(0xFE62,0xFE62),
+    single(0xFE63,0xFE63),
+    single(0xFE64,0xFE66),
+    single(0xFE68,0xFE68),
+    single(0xFE69,0xFE69),
+    single(0xFE6A,0xFE6B),
+    single(0xFF01,0xFF60),
+    single(0xFFE0,0xFFE6),
+    single(0x16FE0,0x16FE1),
+    single(0x16FE2,0x16FE2),
+    single(0x16FE3,0x16FE3),
+    single(0x16FE4,0x16FE4),
+    single(0x16FF0,0x16FF1),
+    single(0x17000,0x187F7),
+    single(0x18800,0x18AFF),
+    single(0x18B00,0x18CD5),
+    single(0x18D00,0x18D08),
+    single(0x1AFF0,0x1AFF3),
+    single(0x1AFF5,0x1AFFB),
+    single(0x1AFFD,0x1AFFE),
+    single(0x1B000,0x1B0FF),
+    single(0x1B100,0x1B122),
+    single(0x1B132,0x1B132),
+    single(0x1B150,0x1B152),
+    single(0x1B155,0x1B155),
+    single(0x1B164,0x1B167),
+    single(0x1B170,0x1B2FB),
+    single(0x1F004,0x1F004),
+    single(0x1F0CF,0x1F0CF),
+    single(0x1F18E,0x1F18E),
+    single(0x1F191,0x1F19A),
+    single(0x1F200,0x1F202),
+    single(0x1F210,0x1F23B),
+    single(0x1F240,0x1F248),
+    single(0x1F250,0x1F251),
+    single(0x1F260,0x1F265),
+    single(0x1F300,0x1F320),
+    single(0x1F32D,0x1F335),
+    single(0x1F337,0x1F37C),
+    single(0x1F37E,0x1F393),
+    single(0x1F3A0,0x1F3CA),
+    single(0x1F3CF,0x1F3D3),
+    single(0x1F3E0,0x1F3F0),
+    single(0x1F3F4,0x1F3F4),
+    single(0x1F3F8,0x1F3FA),
+    single(0x1F3FB,0x1F3FF),
+    single(0x1F400,0x1F43E),
+    single(0x1F440,0x1F440),
+    single(0x1F442,0x1F4FC),
+    single(0x1F4FF,0x1F53D),
+    single(0x1F54B,0x1F54E),
+    single(0x1F550,0x1F567),
+    single(0x1F57A,0x1F57A),
+    single(0x1F595,0x1F596),
+    single(0x1F5A4,0x1F5A4),
+    single(0x1F5FB,0x1F5FF),
+    single(0x1F600,0x1F64F),
+    single(0x1F680,0x1F6C5),
+    single(0x1F6CC,0x1F6CC),
+    single(0x1F6D0,0x1F6D2),
+    single(0x1F6D5,0x1F6D7),
+    single(0x1F6DC,0x1F6DF),
+    single(0x1F6EB,0x1F6EC),
+    single(0x1F6F4,0x1F6FC),
+    single(0x1F7E0,0x1F7EB),
+    single(0x1F7F0,0x1F7F0),
+    single(0x1F90C,0x1F93A),
+    single(0x1F93C,0x1F945),
+    single(0x1F947,0x1F9FF),
+    single(0x1FA70,0x1FA7C),
+    single(0x1FA80,0x1FA88),
+    single(0x1FA90,0x1FABD),
+    single(0x1FABF,0x1FAC5),
+    single(0x1FACE,0x1FADB),
+    single(0x1FAE0,0x1FAE8),
+    single(0x1FAF0,0x1FAF8),
+    single(0x20000,0x2A6DF),
+    single(0x2A6E0,0x2A6FF),
+    single(0x2A700,0x2B739),
+    single(0x2B73A,0x2B73F),
+    single(0x2B740,0x2B81D),
+    single(0x2B81E,0x2B81F),
+    single(0x2B820,0x2CEA1),
+    single(0x2CEA2,0x2CEAF),
+    single(0x2CEB0,0x2EBE0),
+    single(0x2EBE1,0x2F7FF),
+    single(0x2F800,0x2FA1D),
+    single(0x2FA1E,0x2FA1F),
+    single(0x2FA20,0x2FFFD),
+    single(0x30000,0x3134A),
+    single(0x3134B,0x3134F),
+    single(0x31350,0x323AF),
+    single(0x323B0,0x3FFFD)
   ])
 }
 
diff --git a/test/lib/unicode.kk b/test/lib/unicode.kk
index 09b46cdfd..9d7466642 100644
--- a/test/lib/unicode.kk
+++ b/test/lib/unicode.kk
@@ -1,7 +1,11 @@
+// https://github.com/koka-lang/koka/issues/457
+// https://github.com/koka-lang/koka/issues/458
 import std/text/unicode
 
 fun main()
   // heart, variation, zero width join, fire
   // ['h','i','/u2764','/uFE0F','/u200D','/U01F525']
   "hi❤️‍🔥".list.println
-  "hi❤️‍🔥".graphemes.length.println
\ No newline at end of file
+  "hi❤️‍🔥".graphemes.length.println
+
+  println(width("👾"))
diff --git a/test/lib/unicode.kk.out b/test/lib/unicode.kk.out
index fff8009ca..5b4849494 100644
--- a/test/lib/unicode.kk.out
+++ b/test/lib/unicode.kk.out
@@ -1,2 +1,3 @@
 ['h','i','/u2764','/uFE0F','/u200D','/U01F525']
-3
\ No newline at end of file
+3
+2
\ No newline at end of file
diff --git a/util/update-unicode.py b/util/update-unicode.py
new file mode 100644
index
000000000..948b7f9c6
--- /dev/null
+++ b/util/update-unicode.py
@@ -0,0 +1,77 @@
+"""Print Koka range-table entries generated from the Unicode Character Database.
+
+Usage: python3 util/update-unicode.py -a
+
+The entries go to stdout so they can be pasted into the matching table in
+lib/std/text/unicode.kk; diagnostics go to stderr.
+"""
+
+import argparse
+import sys
+import urllib.request
+
+EAST_ASIAN_WIDTH_URL = "https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt"
+
+# East Asian Width classes rendered two columns wide: W(ide) and F(ullwidth).
+WIDE_CLASSES = frozenset({"W", "F"})
+
+
+class UnicodeDataError(ValueError):
+    """Raised when a data line does not have the expected UCD layout."""
+
+
+def wide_ranges(text):
+    """Return the inclusive (first, last) ranges classed as wide or fullwidth.
+
+    `text` is the content of EastAsianWidth.txt: `#` starts a comment and a
+    data line reads `code[..code];class`, optionally padded with whitespace.
+    Raises UnicodeDataError when the code point field is malformed.
+    """
+    ranges = []
+    for line in text.splitlines():
+        fields = line.split("#", 1)[0].split(";")
+        if len(fields) < 2 or fields[1].strip() not in WIDE_CLASSES:
+            continue
+        bounds = fields[0].strip().split("..")
+        if len(bounds) > 2:
+            raise UnicodeDataError(f"unsupported range {bounds}")
+        try:
+            ranges.append((int(bounds[0], 16), int(bounds[-1], 16)))
+        except ValueError as err:
+            raise UnicodeDataError(f"invalid code point in {line!r}") from err
+    return ranges
+
+
+def format_entries(ranges):
+    """Return one Koka `single(first,last),` table line per range."""
+    return [f"    single(0x{first:04X},0x{last:04X})," for first, last in ranges]
+
+
+def fetch(url, timeout=30):
+    """Download `url` and return its body decoded as UTF-8."""
+    with urllib.request.urlopen(url, timeout=timeout) as response:
+        return response.read().decode("utf-8")
+
+
+def main(argv=None):
+    """Run the command line tool and return its exit status."""
+    parser = argparse.ArgumentParser(
+        prog="update-unicode", description="prints updated unicode lists"
+    )
+    parser.add_argument("-a", "--asian_wide", default=False, action="store_true")
+    args = parser.parse_args(argv)
+    if not args.asian_wide:
+        return 0
+    try:
+        # urlopen raises on HTTP errors, so an error page is never parsed as data.
+        entries = format_entries(wide_ranges(fetch(EAST_ASIAN_WIDTH_URL)))
+    except (OSError, UnicodeDataError) as err:
+        print(f"Error: {err}", file=sys.stderr)
+        return 1
+    for entry in entries:
+        print(entry)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())