From c7554dacf70a8dd6dad599f6e4d9ebed41bef3a2 Mon Sep 17 00:00:00 2001 From: Cong-Cong Date: Thu, 7 Nov 2024 15:31:10 +0800 Subject: [PATCH 1/3] fix: items_to_regexp for chinese --- .../src/dependency/runtime_template.rs | 21 +++++ .../src/utils/compile_boolean_matcher.rs | 89 +++++++++++-------- 2 files changed, 71 insertions(+), 39 deletions(-) diff --git a/crates/rspack_core/src/dependency/runtime_template.rs b/crates/rspack_core/src/dependency/runtime_template.rs index 10b638f003a..bd7e5150ea0 100644 --- a/crates/rspack_core/src/dependency/runtime_template.rs +++ b/crates/rspack_core/src/dependency/runtime_template.rs @@ -771,6 +771,7 @@ pub fn define_es_module_flag_statement( #[allow(unused_imports)] mod test_items_to_regexp { use crate::items_to_regexp; + #[test] fn basic() { assert_eq!( @@ -812,5 +813,25 @@ mod test_items_to_regexp { ), "[1234a]".to_string() ); + + assert_eq!( + items_to_regexp( + vec!["西瓜汽水", "西瓜糖果", "西瓜冰沙"] + .into_iter() + .map(String::from) + .collect::>(), + ), + "西瓜(冰沙|汽水|糖果)".to_string() + ); + + assert_eq!( + items_to_regexp( + vec!["西瓜汽水", "苏打汽水", "橘子汽水"] + .into_iter() + .map(String::from) + .collect::>(), + ), + "(橘子|苏打|西瓜)汽水".to_string() + ); } } diff --git a/crates/rspack_core/src/utils/compile_boolean_matcher.rs b/crates/rspack_core/src/utils/compile_boolean_matcher.rs index dcb2741bfe1..45a353fa8a7 100644 --- a/crates/rspack_core/src/utils/compile_boolean_matcher.rs +++ b/crates/rspack_core/src/utils/compile_boolean_matcher.rs @@ -126,19 +126,19 @@ pub(crate) fn items_to_regexp(items_arr: Vec) -> String { if !prefix.is_empty() || !suffix.is_empty() { return format!( "{}{}{}", - quote_meta(&prefix), + quote_meta(prefix), items_to_regexp( items .iter() .map(|item| item - .strip_prefix(&prefix) + .strip_prefix(prefix) .expect("should strip prefix") .strip_suffix(&suffix) .expect("should strip suffix") .to_string()) .collect::>() ), - quote_meta(&suffix) + quote_meta(suffix) ); } } @@ -199,12 +199,12 @@ pub(crate) fn items_to_regexp(items_arr: Vec) -> String { let prefix = get_common_prefix(prefixed_items.iter().map(|item| item.as_str())); finished_items.push(format!( "{}{}", - quote_meta(&prefix), + quote_meta(prefix), items_to_regexp( prefixed_items .iter() .map(|item| item - .strip_prefix(&prefix) + .strip_prefix(prefix) .expect("should strip prefix") .to_string()) .collect::>() @@ -217,7 +217,13 @@ pub(crate) fn items_to_regexp(items_arr: Vec) -> String { &mut items, |item| { if !item.is_empty() { - Some(item[item.len() - 1..].to_string()) + Some( + item + .chars() + .last() + .expect("should have at least one char") + .to_string(), + ) } else { None } @@ -256,7 +262,7 @@ pub(crate) fn items_to_regexp(items_arr: Vec) -> String { .to_string()) .collect::>() ), - quote_meta(&suffix) + quote_meta(suffix) )); } @@ -309,20 +315,18 @@ where result } -fn get_common_prefix<'a>(mut items: impl Iterator + Clone) -> String { - if items.clone().count() == 0 { - return String::new(); - } +fn get_common_prefix<'a>(mut items: impl Iterator + Clone) -> &'a str { + let mut prefix = if let Some(prefix) = items.next() { + prefix + } else { + return ""; + }; - let mut prefix = items - .next() - .expect("should have at least one element") - .to_string(); for item in items { - for (p, c) in item.chars().enumerate() { - if let Some(prefix_char) = prefix.chars().nth(p) { + for (char_index, (byte_index, c)) in item.char_indices().enumerate() { + if let Some(prefix_char) = prefix.chars().nth(char_index) { if c != prefix_char { - prefix = prefix[..p].to_string(); + prefix = &prefix[..byte_index]; break; } } else { @@ -334,37 +338,44 @@ fn get_common_prefix<'a>(mut items: impl Iterator + Clone) -> St prefix } -fn get_common_suffix<'a, I: Iterator + Clone>(mut items: I) -> String { - if items.clone().count() == 0 { - return String::new(); - } +fn is_utf8_start_byte(c: u8) -> bool { + c.is_ascii() + || ((c & 0b1110_0000) == 0b1100_0000) + || ((c & 0b1111_0000) == 0b1110_0000) + || ((c & 0b1111_1000) == 0b1111_0000) +} + +fn get_common_suffix<'a, I: Iterator>(mut items: I) -> &'a str { + let mut suffix = if let Some(suffix) = items.next() { + suffix.as_bytes() + } else { + return ""; + }; - let mut suffix = items - .next() - .expect("should have at least one element") - .to_string(); for item in items { + let item = item.as_bytes(); + let mut p = item.len(); let mut s = suffix.len(); while s > 0 { s -= 1; - let Some(suffix_char) = suffix.chars().nth(s) else { - break; - }; + let suffix_byte = suffix[s]; - let item_char = if p > 0 { item.chars().nth(p - 1) } else { None }; - - if let Some(item_char) = item_char - && item_char == suffix_char - { - p -= 1; - } else { - suffix = suffix[s + 1..].to_string(); - break; + if p > 0 { + let item_byte = item[p - 1]; + if suffix_byte == item_byte { + p -= 1; + } else { + suffix = &suffix[s + 1..]; + break; + } } } } - suffix + while !suffix.is_empty() && !is_utf8_start_byte(suffix[0]) { + suffix = &suffix[1..] + } + unsafe { std::str::from_utf8_unchecked(suffix) } } From 2b04ae0c4dd42db2ae98e1173f35df157a78415a Mon Sep 17 00:00:00 2001 From: Cong-Cong Date: Thu, 7 Nov 2024 16:21:29 +0800 Subject: [PATCH 2/3] fix --- .../src/dependency/runtime_template.rs | 31 ++++++++++++++++--- .../src/utils/compile_boolean_matcher.rs | 7 +++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/crates/rspack_core/src/dependency/runtime_template.rs b/crates/rspack_core/src/dependency/runtime_template.rs index bd7e5150ea0..ac2ee38d5fa 100644 --- a/crates/rspack_core/src/dependency/runtime_template.rs +++ b/crates/rspack_core/src/dependency/runtime_template.rs @@ -813,25 +813,48 @@ mod test_items_to_regexp { ), "[1234a]".to_string() ); + } + + #[test] + fn multibyte() { + assert_eq!( + items_to_regexp( + vec!["🍉", "🍊", "🍓", "🍐", "🍍🫙"] + .into_iter() + .map(String::from) + .collect::>(), + ), + "([🍉🍊🍐🍓]|🍍🫙)".to_string() + ); + + assert_eq!( + items_to_regexp( + vec!["🫙🍉", "🫙🍊", "🫙🍓", "🫙🍐", "🍽🍍"] + .into_iter() + .map(String::from) + .collect::>(), + ), + "(🫙[🍉🍊🍐🍓]|🍽🍍)".to_string() + ); assert_eq!( items_to_regexp( - vec!["西瓜汽水", "西瓜糖果", "西瓜冰沙"] + vec!["🍉🍭", "🍊🍭", "🍓🍭", "🍐🍭", "🍍🫙"] .into_iter() .map(String::from) .collect::>(), ), - "西瓜(冰沙|汽水|糖果)".to_string() + "([🍉🍊🍐🍓]🍭|🍍🫙)".to_string() ); assert_eq!( items_to_regexp( - vec!["西瓜汽水", "苏打汽水", "橘子汽水"] + vec!["🍉", "🍊", "🍓", "🍐", "🫙"] .into_iter() .map(String::from) .collect::>(), ), - "(橘子|苏打|西瓜)汽水".to_string() + "[🍉🍊🍐🍓🫙]".to_string() ); } } diff --git a/crates/rspack_core/src/utils/compile_boolean_matcher.rs b/crates/rspack_core/src/utils/compile_boolean_matcher.rs index 45a353fa8a7..c9cdb9609a5 100644 --- a/crates/rspack_core/src/utils/compile_boolean_matcher.rs +++ b/crates/rspack_core/src/utils/compile_boolean_matcher.rs @@ -91,7 +91,10 @@ pub(crate) fn items_to_regexp(items_arr: Vec) -> String { items_set.sort_unstable(); // Merge single char items: (a|b|c|d|ef) => ([abcd]|ef) - let count_of_single_char_items = items_set.iter().filter(|&item| item.len() == 1).count(); + let count_of_single_char_items = items_set + .iter() + .filter(|&item| item.chars().count() == 1) + .count(); // Special case for only single char items if count_of_single_char_items == items_set.len() { @@ -108,7 +111,7 @@ pub(crate) fn items_to_regexp(items_arr: Vec) -> String { let mut single_char_items: String = String::new(); let mut new_items = BTreeSet::new(); for item in items { - if item.len() == 1 { + if item.chars().count() == 1 { single_char_items += &item; continue; } From d6b54d93aeb0a0684305d731f9749d9a24a8b6bd Mon Sep 17 00:00:00 2001 From: Cong-Cong Date: Thu, 7 Nov 2024 16:49:49 +0800 Subject: [PATCH 3/3] fix --- crates/rspack_core/src/dependency/runtime_template.rs | 10 ++++++++++ .../rspack_core/src/utils/compile_boolean_matcher.rs | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/crates/rspack_core/src/dependency/runtime_template.rs b/crates/rspack_core/src/dependency/runtime_template.rs index ac2ee38d5fa..ac72ceff8f0 100644 --- a/crates/rspack_core/src/dependency/runtime_template.rs +++ b/crates/rspack_core/src/dependency/runtime_template.rs @@ -813,6 +813,16 @@ mod test_items_to_regexp { ), "[1234a]".to_string() ); + + assert_eq!( + items_to_regexp( + vec!["foo_js", "_js"] + .into_iter() + .map(String::from) + .collect::>(), + ), + "(|foo)_js".to_string() + ); } #[test] diff --git a/crates/rspack_core/src/utils/compile_boolean_matcher.rs b/crates/rspack_core/src/utils/compile_boolean_matcher.rs index c9cdb9609a5..58fdece1b3b 100644 --- a/crates/rspack_core/src/utils/compile_boolean_matcher.rs +++ b/crates/rspack_core/src/utils/compile_boolean_matcher.rs @@ -369,11 +369,11 @@ fn get_common_suffix<'a, I: Iterator>(mut items: I) -> &'a str { let item_byte = item[p - 1]; if suffix_byte == item_byte { p -= 1; - } else { - suffix = &suffix[s + 1..]; - break; + continue; } } + suffix = &suffix[s + 1..]; + break; } }