From e3a6026a0e95e3b68d07e6eb4e487749bf23b625 Mon Sep 17 00:00:00 2001 From: Chanhee Lee <26643843+iamchanii@users.noreply.github.com> Date: Thu, 26 Dec 2024 18:02:42 +0900 Subject: [PATCH] fix(core): Address UTF-8 character boundary issues in LineBreaker (#3228) Co-authored-by: Tushar Mathur --- Cargo.lock | 1 + Cargo.toml | 1 + src/core/document.rs | 72 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5e0fd46574..e36f881d60 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5718,6 +5718,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "ttl_cache", + "unicode-segmentation", "update-informer", "url", "urlencoding", diff --git a/Cargo.toml b/Cargo.toml index 68f25713f8..9ee5d9b070 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -175,6 +175,7 @@ tailcall-valid = { workspace = true } dashmap = "6.1.0" urlencoding = "2.1.3" tailcall-chunk = "0.3.0" +unicode-segmentation = "1.12.0" # to build rquickjs bindings on systems without builtin bindings [target.'cfg(all(target_os = "windows", target_arch = "x86"))'.dependencies] diff --git a/src/core/document.rs b/src/core/document.rs index a65b307f7c..933c72ccbc 100644 --- a/src/core/document.rs +++ b/src/core/document.rs @@ -4,6 +4,7 @@ use std::fmt::Display; use async_graphql::parser::types::*; use async_graphql::Positioned; use async_graphql_value::ConstValue; +use unicode_segmentation::UnicodeSegmentation; use super::jit::Directive as JitDirective; use super::json::JsonLikeOwned; @@ -28,19 +29,35 @@ impl<'a> Iterator for LineBreaker<'a> { return None; } - let end_index = self - .string - .chars() - .skip(self.index + self.break_at) - .enumerate() - .find(|(_, ch)| ch.is_whitespace()) - .map(|(index, _)| self.index + self.break_at + index + 1) - .unwrap_or(self.string.len()); + let graphemes = self.string[self.index..].graphemes(true).peekable(); + let mut iter = graphemes; + let mut current_len = 0; + let mut last_valid_index = self.index; + + while let Some(grapheme) = iter.peek() { + let grapheme_len = grapheme.len(); + + if current_len + grapheme_len > self.break_at { + break; + } + + iter.next(); + current_len += grapheme_len; + last_valid_index += grapheme_len; + } + + for grapheme in iter { + if grapheme.chars().any(|ch| ch.is_whitespace()) { + last_valid_index += grapheme.len(); + break; + } + last_valid_index += grapheme.len(); + } let start_index = self.index; - self.index = end_index; + self.index = last_valid_index; - Some(&self.string[start_index..end_index]) + Some(&self.string[start_index..self.index]) } } @@ -456,3 +473,38 @@ impl<'a, Input: JsonLikeOwned + Display> From<&'a JitDirective> for Direc } } } + +#[cfg(test)] +mod tests { + use super::get_formatted_docs; + + #[test] + fn test_get_formatted_docs() { + let input = Some(String::from( + "This is a test string for get_formatted_docs function. You are typing a long sentence for testing. What a nice, long sentence!", + )); + let indent = 4; + + let result = get_formatted_docs(input, indent); + let expected = String::from( + " \"\"\"\n This is a test string for get_formatted_docs function. You are typing a long sentence \n for testing. What a nice, long sentence!\n \"\"\"\n", + ); + + assert_eq!(result, expected) + } + + #[test] + fn test_get_formatted_docs_utf8() { + let input = Some(String::from( + "get_formatted_docs 함수 테스트를 위한 문장입니다. 테스트를 위해 긴 문장을 입력하는 중 입니다. テストのために長い文章を入力しているところです。なんて素敵な長文です!", + )); + let indent = 4; + + let result = get_formatted_docs(input, indent); + let expected = String::from( + " \"\"\"\n get_formatted_docs 함수 테스트를 위한 문장입니다. 테스트를 위해 \n 긴 문장을 입력하는 중 입니다. テストのために長い文章を入力しているところです。なんて素敵な長文です!\n \"\"\"\n", + ); + + assert_eq!(result, expected) + } +}