Skip to content

Commit

Permalink
fix(core): Address UTF-8 character boundary issues in LineBreaker (#3228)
Browse files Browse the repository at this point in the history

Co-authored-by: Tushar Mathur <[email protected]>
  • Loading branch information
iamchanii and tusharmath authored Dec 26, 2024
1 parent 4cf48a1 commit e3a6026
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 10 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ tailcall-valid = { workspace = true }
dashmap = "6.1.0"
urlencoding = "2.1.3"
tailcall-chunk = "0.3.0"
unicode-segmentation = "1.12.0"

# to build rquickjs bindings on systems without builtin bindings
[target.'cfg(all(target_os = "windows", target_arch = "x86"))'.dependencies]
Expand Down
72 changes: 62 additions & 10 deletions src/core/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::fmt::Display;
use async_graphql::parser::types::*;
use async_graphql::Positioned;
use async_graphql_value::ConstValue;
use unicode_segmentation::UnicodeSegmentation;

use super::jit::Directive as JitDirective;
use super::json::JsonLikeOwned;
Expand All @@ -28,19 +29,35 @@ impl<'a> Iterator for LineBreaker<'a> {
return None;
}

let end_index = self
.string
.chars()
.skip(self.index + self.break_at)
.enumerate()
.find(|(_, ch)| ch.is_whitespace())
.map(|(index, _)| self.index + self.break_at + index + 1)
.unwrap_or(self.string.len());
let graphemes = self.string[self.index..].graphemes(true).peekable();
let mut iter = graphemes;
let mut current_len = 0;
let mut last_valid_index = self.index;

while let Some(grapheme) = iter.peek() {
let grapheme_len = grapheme.len();

if current_len + grapheme_len > self.break_at {
break;
}

iter.next();
current_len += grapheme_len;
last_valid_index += grapheme_len;
}

for grapheme in iter {
if grapheme.chars().any(|ch| ch.is_whitespace()) {
last_valid_index += grapheme.len();
break;
}
last_valid_index += grapheme.len();
}

let start_index = self.index;
self.index = end_index;
self.index = last_valid_index;

Some(&self.string[start_index..end_index])
Some(&self.string[start_index..self.index])
}
}

Expand Down Expand Up @@ -456,3 +473,38 @@ impl<'a, Input: JsonLikeOwned + Display> From<&'a JitDirective<Input>> for Direc
}
}
}

#[cfg(test)]
mod tests {
    use super::get_formatted_docs;

    /// Formats a long ASCII-only description and compares it against a
    /// triple-quoted block whose text is split across two lines.
    // NOTE(review): the indent passed is 4, yet the expected literal below shows
    // a single leading space per line — confirm the literal's whitespace.
    #[test]
    fn test_get_formatted_docs() {
        let docs = "This is a test string for get_formatted_docs function. You are typing a long sentence for testing. What a nice, long sentence!".to_string();

        let result = get_formatted_docs(Some(docs), 4);

        let expected = " \"\"\"\n This is a test string for get_formatted_docs function. You are typing a long sentence \n for testing. What a nice, long sentence!\n \"\"\"\n".to_string();
        assert_eq!(result, expected)
    }

    /// Same check with multi-byte input (Korean and Japanese text), so the
    /// formatter is exercised on characters wider than one byte.
    // NOTE(review): the indent passed is 4, yet the expected literal below shows
    // a single leading space per line — confirm the literal's whitespace.
    #[test]
    fn test_get_formatted_docs_utf8() {
        let docs = "get_formatted_docs 함수 테스트를 위한 문장입니다. 테스트를 위해 긴 문장을 입력하는 중 입니다. テストのために長い文章を入力しているところです。なんて素敵な長文です!".to_string();

        let result = get_formatted_docs(Some(docs), 4);

        let expected = " \"\"\"\n get_formatted_docs 함수 테스트를 위한 문장입니다. 테스트를 위해 \n 긴 문장을 입력하는 중 입니다. テストのために長い文章を入力しているところです。なんて素敵な長文です!\n \"\"\"\n".to_string();
        assert_eq!(result, expected)
    }
}

1 comment on commit e3a6026

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Running 30s test @ http://localhost:8000/graphql

4 threads and 100 connections

Thread Stats Avg Stdev Max +/- Stdev
Latency 4.19ms 2.08ms 32.15ms 80.75%
Req/Sec 6.17k 0.92k 9.78k 92.08%

736254 requests in 30.03s, 3.69GB read

Requests/sec: 24520.45

Transfer/sec: 125.86MB

Please sign in to comment.