Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add utf8proc_iterate_graphemes #213

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions test/graphemetest.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,35 @@ void checkline(const char *_buf, bool verbose) {
} while (i < si);
}

if (si) { /* test calls to utf8proc_iterate_graphemes */

/* note and remove break indicators */
/* breaks[] collects the byte offset of every '/' found in src. Each '/' is
 * deleted as soon as it is seen, so the recorded offsets index the string
 * with all earlier indicators already stripped.
 * NOTE(review): capacity is fixed at 16 and j is never bounds-checked. */
int breaks[16];
int a = 0, i = 0, j = 0;
while (i < si) {
if (src[i] == '/') {
breaks[j++] = i;
/* Delete src[i] by shifting the tail one byte to the left. The last
 * iteration copies src[si] (one past the counted bytes) -- presumably the
 * terminator; TODO confirm against the declaration of src. */
a = i;
while (a < si) {
src[a] = src[a+1];
a++;
}
/* a equals the old si here, so the logical length shrinks by one. */
si = a-1;
}
/* NOTE(review): the byte that was just shifted into position i is skipped
 * by this increment, so two adjacent '/' would not both be detected. */
i++;
}
/* The end of the stripped string closes the final cluster. */
breaks[j++] = si;
int k = 0;

int read_bytes = 0;
int start, end;
/* Each iteration expects the reported cluster to span breaks[k] (start) to
 * breaks[k+1] (end).
 * NOTE(review): the loop condition is true for any non-zero return value,
 * i.e. it does not distinguish a negative error code from 1, and k is not
 * compared against j after the loop. */
while ( utf8proc_iterate_graphemes(src, &read_bytes, si, &start, &end) ) {
check(breaks[k] == start, "expected grapheme start not found");
check(breaks[k+1] == end, "expected grapheme end not found");
k++;
}
}

if (verbose)
printf("passed grapheme test: \"%s\"\n", (char*) src);
}
Expand Down
29 changes: 29 additions & 0 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,35 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
return 4;
}

/*
 * Find the next extended grapheme cluster of `str`, starting at byte offset
 * `*read_bytes`.
 *
 * str        - UTF-8 input.
 * read_bytes - in/out cursor; set it to 0 before the first call. On success
 *              it is advanced to `*end`, so repeated calls walk the string.
 * strlen     - number of bytes of `str` to consider.
 * start      - out: offset of the first byte of the cluster.
 * end        - out: offset one past the last byte of the cluster; only
 *              written when 1 is returned.
 *
 * Returns 1 when a cluster was found, 0 when `*read_bytes` has already
 * reached `strlen`, and the negative error code reported by
 * utf8proc_iterate() when the input cannot be decoded.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate_graphemes(
  const utf8proc_uint8_t *str, utf8proc_int32_t *read_bytes, utf8proc_ssize_t strlen,
  utf8proc_uint32_t *start, utf8proc_uint32_t *end
) {
  utf8proc_ssize_t n;
  utf8proc_int32_t break_state = 0;
  utf8proc_int32_t codepoint, prev_codepoint = 0;
  /* Explicit "a codepoint has been consumed" flag. Using prev_codepoint == 0
   * as the sentinel would misclassify a genuine U+0000 in the input: it would
   * never be tested for a break, and a trailing U+0000 would loop forever. */
  int have_prev = 0;

  if (*read_bytes == strlen)
    return 0;
  *start = (utf8proc_uint32_t)*read_bytes;

  for (;;) {
    if (*read_bytes == strlen) {
      /* End of input always terminates the current cluster. */
      *end = (utf8proc_uint32_t)*read_bytes;
      return 1;
    }

    n = utf8proc_iterate(str + *read_bytes, strlen - *read_bytes, &codepoint);
    if (n < 0)
      return n;

    if (have_prev && utf8proc_grapheme_break_stateful(
          prev_codepoint, codepoint, &break_state)) {
      /* `codepoint` opens the next cluster: leave the cursor in front of it
       * so that the following call starts there. */
      *end = (utf8proc_uint32_t)*read_bytes;
      return 1;
    }

    *read_bytes += (utf8proc_int32_t)n;
    prev_codepoint = codepoint;
    have_prev = 1;
  }
}

/*
 * Report whether `uc` is a valid codepoint: it must lie below 0x110000 and
 * outside the surrogate range 0xD800..0xDFFF.
 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
  const utf8proc_uint32_t cp = (utf8proc_uint32_t)uc;

  /* Negative inputs become large unsigned values and are rejected here. */
  if (cp >= 0x110000)
    return 0;
  return cp < 0xd800 || cp > 0xdfff;
}
Expand Down
19 changes: 19 additions & 0 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,25 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);

/**
 * Given the UTF-8 string `str`, produce the starting and ending byte indices
 * of the next extended grapheme cluster (as defined by UAX#29). Calling the
 * function repeatedly with the same `read_bytes` yields each cluster in turn.
 *
 * @param str The UTF-8 string to read from.
 * @param read_bytes In/out: keeps track of how many bytes have been read.
 *                   Should initially be set to 0; when a cluster is found it
 *                   is advanced to `*end`.
 * @param strlen The maximum number of bytes read from `str`.
 * @param start Output: the index of the first byte in the grapheme cluster.
 * @param end Output: the index of the last byte (non-inclusive) in the
 *            grapheme cluster. Only written when 1 is returned.
 *
 * @return 1 as long as a grapheme cluster can be found, and 0 once `strlen`
 *         number of bytes has been read. A negative error code is returned
 *         in case of an error.
 */

UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate_graphemes(
const utf8proc_uint8_t *str, utf8proc_int32_t *read_bytes, utf8proc_ssize_t strlen,
utf8proc_uint32_t *start, utf8proc_uint32_t *end);

/**
* Given a codepoint `c`, return the codepoint of the corresponding
Expand Down