diff --git a/test/graphemetest.c b/test/graphemetest.c
index 22880fc..124be3a 100644
--- a/test/graphemetest.c
+++ b/test/graphemetest.c
@@ -81,6 +81,41 @@ void checkline(const char *_buf, bool verbose) {
         } while (i < si);
     }
 
+    if (si) { /* test calls to utf8proc_iterate_graphemes */
+        /* Record the byte offset of every '/' break marker, deleting each
+         * marker so that src ends up holding only the text under test. */
+        enum { MAX_BREAKS = 16 };
+        utf8proc_uint32_t breaks[MAX_BREAKS];
+        size_t nbreaks = 0, pos = 0, shift, k = 0;
+        utf8proc_int32_t read_bytes = 0;
+        utf8proc_uint32_t start = 0, end = 0;
+        utf8proc_ssize_t rc;
+
+        while (pos < si) {
+            if (src[pos] == '/') {
+                check(nbreaks + 1 < MAX_BREAKS, "too many break markers in test line");
+                breaks[nbreaks++] = (utf8proc_uint32_t)pos;
+                /* close the gap; src[si] is copied along with the tail */
+                for (shift = pos; shift < si; ++shift)
+                    src[shift] = src[shift+1];
+                --si;
+            }
+            ++pos;
+        }
+        breaks[nbreaks++] = (utf8proc_uint32_t)si; /* end of the final grapheme */
+
+        /* a negative return value is an error and must end the loop too */
+        while ((rc = utf8proc_iterate_graphemes(src, &read_bytes,
+                    (utf8proc_ssize_t)si, &start, &end)) > 0) {
+            check(k + 1 < nbreaks, "more graphemes found than expected");
+            check(breaks[k] == start, "expected grapheme start not found");
+            check(breaks[k+1] == end, "expected grapheme end not found");
+            k++;
+        }
+        check(rc == 0, "utf8proc_iterate_graphemes returned an error");
+        check(k + 1 == nbreaks, "fewer graphemes found than expected");
+    }
+
     if (verbose)
         printf("passed grapheme test: \"%s\"\n", (char*) src);
 }
diff --git a/utf8proc.c b/utf8proc.c
index 1af3456..53b5560 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -170,6 +170,41 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
   return 4;
 }
 
+/* Finds the extended grapheme cluster that begins at byte offset *read_bytes
+ * of str and advances *read_bytes past it. */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate_graphemes(
+  const utf8proc_uint8_t *str, utf8proc_int32_t *read_bytes, utf8proc_ssize_t strlen,
+  utf8proc_uint32_t *start, utf8proc_uint32_t *end
+) {
+  utf8proc_int32_t codepoint, prev_codepoint, break_state = 0;
+  utf8proc_ssize_t n;
+
+  if (*read_bytes >= strlen)
+    return 0; /* input exhausted (also covers an empty or negative length) */
+  *start = (utf8proc_uint32_t)*read_bytes;
+
+  /* The first codepoint always belongs to the cluster. */
+  n = utf8proc_iterate(str + *read_bytes, strlen - *read_bytes, &prev_codepoint);
+  if (n < 0)
+    return n;
+  *read_bytes += (utf8proc_int32_t)n;
+
+  /* Absorb codepoints until a break is permitted or the input ends.  Running
+   * out of input ends the cluster by itself, so no sentinel codepoint is
+   * needed and U+0000 is handled like any other codepoint. */
+  while (*read_bytes < strlen) {
+    n = utf8proc_iterate(str + *read_bytes, strlen - *read_bytes, &codepoint);
+    if (n < 0)
+      return n;
+    if (utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &break_state))
+      break; /* codepoint starts the next cluster; leave it unread */
+    *read_bytes += (utf8proc_int32_t)n;
+    prev_codepoint = codepoint;
+  }
+  *end = (utf8proc_uint32_t)*read_bytes; /* one past the cluster's last byte */
+  return 1;
+}
+
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
     return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
 }
diff --git a/utf8proc.h b/utf8proc.h
index 2e8a7ae..3d34ee9 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -613,6 +613,28 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
   utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
 
+/**
+ * Given the UTF-8 string `str`, produce the starting and ending byte indices of
+ * the next extended grapheme cluster (as defined by UAX#29).
+ *
+ * Call this function repeatedly with the same `read_bytes` to walk through the
+ * grapheme clusters of `str` in order.
+ *
+ * @param str The UTF-8 string to read from.
+ * @param read_bytes Keeps track of how many bytes have been read; it is
+ *   advanced past each cluster that is returned. Should initially be set to 0.
+ * @param strlen The maximum number of bytes read from `str`.
+ * @param start Receives the index of the first byte in the grapheme cluster.
+ * @param end Receives the index one past the last byte in the grapheme cluster.
+ *
+ * @return 1 if a grapheme cluster was found (`start` and `end` are then set),
+ *   0 once `strlen` bytes have been read, or a negative error code if decoding
+ *   `str` fails.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate_graphemes(
+  const utf8proc_uint8_t *str, utf8proc_int32_t *read_bytes, utf8proc_ssize_t strlen,
+  utf8proc_uint32_t *start, utf8proc_uint32_t *end);
+
 
 /**
  * Given a codepoint `c`, return the codepoint of the corresponding