JuliaStrings · niblo · Jan 4, 2021
diff --git a/test/graphemetest.c b/test/graphemetest.c
@@ -81,6 +81,35 @@ void checkline(const char *_buf, bool verbose) {
  } while (i < si);
  }
 
+ if (si) { /* test calls to utf8proc_iterate_graphemes */
+
+ /* note and remove break indicators */
+ int breaks[16];
+ int a = 0, i = 0, j = 0;
+ while (i < si) {
+ if (src[i] == '/') {
+ breaks[j++] = i;
+ a = i;
+ while (a < si) {
+ src[a] = src[a+1];
+ a++;
+ }
+ si = a-1;
+ }
+ i++;
+ }
+ breaks[j++] = si;
+ int k = 0;
+
+ int read_bytes = 0;
+ int start, end;
+ while ( utf8proc_iterate_graphemes(src, &read_bytes, si, &start, &end) ) {
+ check(breaks[k] == start, "expected grapheme start not found");
+ check(breaks[k+1] == end, "expected grapheme end not found");
+ k++;
+ }
+ }
+
  if (verbose)
  printf("passed grapheme test: \"%s\"\n", (char*) src);
 }

diff --git a/utf8proc.c b/utf8proc.c
@@ -170,6 +170,35 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
  return 4;
 }
 
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate_graphemes(
+  const utf8proc_uint8_t *str, utf8proc_int32_t *read_bytes, utf8proc_ssize_t strlen,
+  utf8proc_uint32_t *start, utf8proc_uint32_t *end
+) {
+  /* Reports the next grapheme cluster of `str` as the byte range [*start, *end).
+   * *read_bytes is the cursor carried between calls (0 before the first call).
+   * Returns 1 for a cluster, 0 once the cursor has reached `strlen`, or the
+   * error code reported by utf8proc_iterate. */
+  utf8proc_ssize_t n;
+  utf8proc_int32_t codepoint, prev_codepoint = 0, break_state = 0;
+  utf8proc_bool have_prev = false;
+  if (*read_bytes >= strlen) /* >= so an overshot cursor cannot make the length negative */
+    return 0;
+  *start = (utf8proc_uint32_t)*read_bytes;
+  while (*read_bytes < strlen) {
+    n = utf8proc_iterate(str + *read_bytes, strlen - *read_bytes, &codepoint);
+    if (n < 0)
+      return n;
+    /* Track "have a previous codepoint" explicitly: U+0000 is a real codepoint. */
+    if (have_prev && utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &break_state))
+      break; /* `codepoint` opens the next cluster, so leave it unread */
+    *read_bytes += (utf8proc_int32_t)n;
+    prev_codepoint = codepoint;
+    have_prev = true;
+  }
+  *end = (utf8proc_uint32_t)*read_bytes; /* exclusive: break position or end of input */
+  return 1;
+}
+
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
  return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
 }

diff --git a/utf8proc.h b/utf8proc.h
@@ -613,6 +613,25 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
  utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
 
+/**
+ * Given the UTF-8 string `str`, produce the starting and ending byte indices of
+ * the next extended grapheme cluster (as defined by UAX#29). Call repeatedly,
+ * passing the same `read_bytes`, to visit each cluster in turn.
+ *
+ * @param str The UTF-8 string to read from.
+ * @param read_bytes In/out: keeps track of how many bytes have been read; the
+ *   function advances it on each call. Should initially be set to 0.
+ * @param strlen The maximum number of bytes read from `str`.
+ * @param start Out: the index of the first byte in the grapheme cluster.
+ * @param end Out: the index of the last byte (non-inclusive) in the cluster.
+ *
+ * @return 1 as long as a grapheme cluster can be found, 0 once `strlen` bytes
+ *   have been read, or a negative error code in case of an error.
+ */
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate_graphemes(
+ const utf8proc_uint8_t *str, utf8proc_int32_t *read_bytes, utf8proc_ssize_t strlen,
+ utf8proc_uint32_t *start, utf8proc_uint32_t *end);
 
 /**
  * Given a codepoint `c`, return the codepoint of the corresponding