Skip to content

Commit

Permalink
fix: items_to_regexp for chinese (#8368)
Browse files Browse the repository at this point in the history
  • Loading branch information
SyMind authored Nov 7, 2024
1 parent 2dfa301 commit 67a208e
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 41 deletions.
54 changes: 54 additions & 0 deletions crates/rspack_core/src/dependency/runtime_template.rs
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,7 @@ pub fn define_es_module_flag_statement(
#[allow(unused_imports)]
mod test_items_to_regexp {
use crate::items_to_regexp;

#[test]
fn basic() {
assert_eq!(
Expand Down Expand Up @@ -812,5 +813,58 @@ mod test_items_to_regexp {
),
"[1234a]".to_string()
);

assert_eq!(
items_to_regexp(
vec!["foo_js", "_js"]
.into_iter()
.map(String::from)
.collect::<Vec<_>>(),
),
"(|foo)_js".to_string()
);
}

#[test]
fn multibyte() {
  // Table of (input items, expected regexp source). Every item is made of
  // multi-byte characters, so these cases only pass when the implementation
  // counts characters rather than bytes. Within a character class the
  // expected strings list the single-character items in code point order
  // (🍐 before 🍓), which differs from the order they are passed in.
  let cases = [
    // Single-character items merge into one class next to a longer item.
    (vec!["🍉", "🍊", "🍓", "🍐", "🍍🫙"], "([🍉🍊🍐🍓]|🍍🫙)"),
    // A shared multi-byte prefix is factored out.
    (
      vec!["🫙🍉", "🫙🍊", "🫙🍓", "🫙🍐", "🍽🍍"],
      "(🫙[🍉🍊🍐🍓]|🍽🍍)",
    ),
    // A shared multi-byte suffix is factored out.
    (
      vec!["🍉🍭", "🍊🍭", "🍓🍭", "🍐🍭", "🍍🫙"],
      "([🍉🍊🍐🍓]🍭|🍍🫙)",
    ),
    // Only single-character items: a bare character class.
    (vec!["🍉", "🍊", "🍓", "🍐", "🫙"], "[🍉🍊🍐🍓🫙]"),
  ];

  for (raw_items, expected) in cases {
    let owned_items: Vec<String> = raw_items.into_iter().map(String::from).collect();
    assert_eq!(items_to_regexp(owned_items), expected.to_string());
  }
}
}
96 changes: 55 additions & 41 deletions crates/rspack_core/src/utils/compile_boolean_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,10 @@ pub(crate) fn items_to_regexp(items_arr: Vec<String>) -> String {
items_set.sort_unstable();

// Merge single char items: (a|b|c|d|ef) => ([abcd]|ef)
let count_of_single_char_items = items_set.iter().filter(|&item| item.len() == 1).count();
let count_of_single_char_items = items_set
.iter()
.filter(|&item| item.chars().count() == 1)
.count();

// Special case for only single char items
if count_of_single_char_items == items_set.len() {
Expand All @@ -108,7 +111,7 @@ pub(crate) fn items_to_regexp(items_arr: Vec<String>) -> String {
let mut single_char_items: String = String::new();
let mut new_items = BTreeSet::new();
for item in items {
if item.len() == 1 {
if item.chars().count() == 1 {
single_char_items += &item;
continue;
}
Expand All @@ -126,19 +129,19 @@ pub(crate) fn items_to_regexp(items_arr: Vec<String>) -> String {
if !prefix.is_empty() || !suffix.is_empty() {
return format!(
"{}{}{}",
quote_meta(&prefix),
quote_meta(prefix),
items_to_regexp(
items
.iter()
.map(|item| item
.strip_prefix(&prefix)
.strip_prefix(prefix)
.expect("should strip prefix")
.strip_suffix(&suffix)
.expect("should strip suffix")
.to_string())
.collect::<Vec<_>>()
),
quote_meta(&suffix)
quote_meta(suffix)
);
}
}
Expand Down Expand Up @@ -199,12 +202,12 @@ pub(crate) fn items_to_regexp(items_arr: Vec<String>) -> String {
let prefix = get_common_prefix(prefixed_items.iter().map(|item| item.as_str()));
finished_items.push(format!(
"{}{}",
quote_meta(&prefix),
quote_meta(prefix),
items_to_regexp(
prefixed_items
.iter()
.map(|item| item
.strip_prefix(&prefix)
.strip_prefix(prefix)
.expect("should strip prefix")
.to_string())
.collect::<Vec<_>>()
Expand All @@ -217,7 +220,13 @@ pub(crate) fn items_to_regexp(items_arr: Vec<String>) -> String {
&mut items,
|item| {
if !item.is_empty() {
Some(item[item.len() - 1..].to_string())
Some(
item
.chars()
.last()
.expect("should have at least one char")
.to_string(),
)
} else {
None
}
Expand Down Expand Up @@ -256,7 +265,7 @@ pub(crate) fn items_to_regexp(items_arr: Vec<String>) -> String {
.to_string())
.collect::<Vec<_>>()
),
quote_meta(&suffix)
quote_meta(suffix)
));
}

Expand Down Expand Up @@ -309,20 +318,18 @@ where
result
}

fn get_common_prefix<'a>(mut items: impl Iterator<Item = &'a str> + Clone) -> String {
if items.clone().count() == 0 {
return String::new();
}
fn get_common_prefix<'a>(mut items: impl Iterator<Item = &'a str> + Clone) -> &'a str {
let mut prefix = if let Some(prefix) = items.next() {
prefix
} else {
return "";
};

let mut prefix = items
.next()
.expect("should have at least one element")
.to_string();
for item in items {
for (p, c) in item.chars().enumerate() {
if let Some(prefix_char) = prefix.chars().nth(p) {
for (char_index, (byte_index, c)) in item.char_indices().enumerate() {
if let Some(prefix_char) = prefix.chars().nth(char_index) {
if c != prefix_char {
prefix = prefix[..p].to_string();
prefix = &prefix[..byte_index];
break;
}
} else {
Expand All @@ -334,37 +341,44 @@ fn get_common_prefix<'a>(mut items: impl Iterator<Item = &'a str> + Clone) -> St
prefix
}

/// Returns `true` when `c` can be the first byte of a UTF-8 encoded character.
///
/// That is the case for ASCII bytes (`0xxxxxxx`) and for the lead bytes of
/// two-, three- and four-byte sequences (`110xxxxx`, `1110xxxx`, `11110xxx`).
/// Continuation bytes (`10xxxxxx`) and `11111xxx` bytes yield `false`.
fn is_utf8_start_byte(c: u8) -> bool {
  c.is_ascii()
    || ((c & 0b1110_0000) == 0b1100_0000)
    || ((c & 0b1111_0000) == 0b1110_0000)
    || ((c & 0b1111_1000) == 0b1111_0000)
}

/// Returns the longest suffix shared by every string in `items`.
///
/// The result borrows from the first item, so no allocation takes place.
/// An empty iterator yields `""`.
///
/// The comparison walks the strings backwards byte by byte. Two different
/// multi-byte characters may share trailing bytes (for example `é` = `C3 A9`
/// and `©` = `C2 A9`), so the start of the match is moved forward to the next
/// character start before slicing. The returned suffix therefore never
/// begins in the middle of a character.
fn get_common_suffix<'a, I: Iterator<Item = &'a str>>(mut items: I) -> &'a str {
  let Some(first) = items.next() else {
    return "";
  };
  let first_bytes = first.as_bytes();

  // Number of trailing bytes of `first` shared with every item seen so far.
  let mut common_len = first_bytes.len();
  for item in items {
    // Compare only the current candidate suffix against the tail of `item`;
    // the count can therefore never exceed `common_len`.
    common_len = first_bytes[first_bytes.len() - common_len..]
      .iter()
      .rev()
      .zip(item.as_bytes().iter().rev())
      .take_while(|(a, b)| a == b)
      .count();
    if common_len == 0 {
      // Nothing in common any more; the remaining items cannot change that.
      break;
    }
  }

  // Skip leading continuation bytes so that `start` sits on a char boundary.
  let mut start = first_bytes.len() - common_len;
  while start < first_bytes.len() && !is_utf8_start_byte(first_bytes[start]) {
    start += 1;
  }

  // `start` is either the end of `first` or the index of a character start
  // byte, so this checked slice cannot panic and no `unsafe` is required.
  &first[start..]
}

2 comments on commit 67a208e

@rspack-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

📝 Benchmark detail: Open

Name Base (2024-11-07 52d7dcd) Current Change
10000_big_production-mode + exec 44.3 s ± 1.17 s 44.5 s ± 729 ms +0.59 %
10000_development-mode + exec 1.84 s ± 21 ms 1.83 s ± 10 ms -0.68 %
10000_development-mode_hmr + exec 659 ms ± 13 ms 644 ms ± 8 ms -2.23 %
10000_production-mode + exec 2.43 s ± 50 ms 2.43 s ± 40 ms -0.04 %
arco-pro_development-mode + exec 1.78 s ± 45 ms 1.74 s ± 33 ms -2.12 %
arco-pro_development-mode_hmr + exec 431 ms ± 1.5 ms 431 ms ± 2 ms -0.00 %
arco-pro_production-mode + exec 3.18 s ± 66 ms 3.22 s ± 80 ms +1.13 %
arco-pro_production-mode_generate-package-json-webpack-plugin + exec 3.24 s ± 66 ms 3.26 s ± 58 ms +0.49 %
threejs_development-mode_10x + exec 1.6 s ± 13 ms 1.6 s ± 19 ms -0.28 %
threejs_development-mode_10x_hmr + exec 777 ms ± 8.3 ms 775 ms ± 12 ms -0.25 %
threejs_production-mode_10x + exec 4.98 s ± 18 ms 4.98 s ± 43 ms +0.07 %

@rspack-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

📝 Ran ecosystem CI: Open

suite result
modernjs ✅ success
_selftest ✅ success
rspress ✅ success
rslib ✅ success
rsbuild ✅ success
examples ✅ success
devserver ✅ success

Please sign in to comment.