Skip to content

Commit

Permalink
Merge pull request #61 from rutar-forks/handle-windows-newline
Browse files Browse the repository at this point in the history
  • Loading branch information
pascalkuthe authored Dec 7, 2024
2 parents 474edc7 + 21fc173 commit d17e29a
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 30 deletions.
15 changes: 11 additions & 4 deletions matcher/src/chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,17 @@ pub(crate) enum CharClass {
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
#[cfg(feature = "unicode-segmentation")]
let res = text.graphemes(true).map(|grapheme| {
grapheme
.chars()
.next()
.expect("graphemes must be non-empty")
// we need to special-case this check since `\r\n` is a single grapheme and is
// therefore the exception to the rule that normalization of a grapheme should
// map to the first character.
if grapheme == "\r\n" {
'\n'
} else {
grapheme
.chars()
.next()
.expect("graphemes must be non-empty")
}
});
#[cfg(not(feature = "unicode-segmentation"))]
let res = text.chars();
Expand Down
125 changes: 99 additions & 26 deletions matcher/src/utf32_str.rs
Original file line number Diff line number Diff line change
@@ -1,56 +1,115 @@
#[cfg(test)]
mod tests;

use std::borrow::Cow;
use std::ops::{Bound, RangeBounds};
use std::{fmt, slice};

use memchr::memmem;

use crate::chars;

/// A UTF32 encoded (char array) string that is used as an input to (fuzzy) matching.
/// Check if a given string can be represented internally as the `Ascii` variant in a
/// [`Utf32String`] or a [`Utf32Str`].
///
/// This returns true if the string is ASCII and does not contain a windows-style newline
/// `'\r'`.
/// The additional carriage return check is required since even for strings consisting only
/// of ASCII, the windows-style newline `\r\n` is treated as a single grapheme.
#[inline]
fn has_ascii_graphemes(string: &str) -> bool {
string.is_ascii() && memmem::find(string.as_bytes(), b"\r\n").is_none()
}

/// A UTF-32 encoded (char array) string that is used as an input to (fuzzy) matching.
///
/// This is mostly intended as an internal string type, but some methods are exposed for
/// convenience. We make the following API guarantees for `Utf32Str(ing)`s produced from a string
/// using one of its `From<T>` constructors for string types `T` or from the
/// [`Utf32Str::new`] method.
///
/// 1. The `Ascii` variant contains a byte buffer which is guaranteed to be a valid string
/// slice.
/// 2. It is guaranteed that the string slice internal to the `Ascii` variant is identical
/// to the original string.
/// 3. The length of a `Utf32Str(ing)` is exactly the number of graphemes in the original string.
///
/// Since `Utf32Str(ing)`s variants may be constructed directly, you **must not** make these
/// assumptions when handling `Utf32Str(ing)`s of unknown origin.
///
/// ## Caveats
/// Despite the name, this type is quite far from being a true string type. Here are some
/// examples demonstrating this.
///
/// Usually rusts' utf8 encoded strings are great. However during fuzzy matching
/// operates on codepoints (it should operate on graphemes but that's too much
/// hassle to deal with). We want to quickly iterate these codepoints between
/// (up to 5 times) during matching.
/// ### String conversions are not round-trip
/// In the presence of a multi-codepoint grapheme (e.g. `"u\u{0308}"` which is `u +
/// COMBINING_DIAERESIS`), the trailing codepoints are truncated.
/// ```
/// # use nucleo_matcher::Utf32String;
/// assert_eq!(Utf32String::from("u\u{0308}").to_string(), "u");
/// ```
///
/// ### Indexing is done by grapheme
/// Indexing into a string is done by grapheme rather than by codepoint.
/// ```
/// # use nucleo_matcher::Utf32String;
/// assert!(Utf32String::from("au\u{0308}").len() == 2);
/// ```
///
/// ### A `Unicode` variant may be produced by all-ASCII characters.
/// Since the windows-style newline `\r\n` is ASCII only but considered to be a single grapheme,
/// strings containing `\r\n` will still result in a `Unicode` variant.
/// ```
/// # use nucleo_matcher::Utf32String;
/// let s = Utf32String::from("\r\n");
/// assert!(!s.slice(..).is_ascii());
/// assert!(s.len() == 1);
/// assert!(s.slice(..).get(0) == '\n');
/// ```
///
/// ## Design rationale
/// Usually Rust's UTF-8 encoded strings are great. However, since fuzzy matching
/// operates on codepoints (ideally, it should operate on graphemes but that's too
/// much hassle to deal with), we want to quickly iterate over codepoints (up to 5
/// times) during matching.
///
/// Doing codepoint segmentation on the fly not only blows trough the cache
/// (lookuptables and Icache) but also has nontrivial runtime compared to the
/// matching itself. Furthermore there are a lot of exta optimizations available
/// for ascii only text (but checking during each match has too much overhead).
/// (lookup tables and I-cache) but also has nontrivial runtime compared to the
/// matching itself. Furthermore there are many extra optimizations available
/// for ASCII only text, but checking each match has too much overhead.
///
/// Ofcourse this comes at exta memory cost as we usually still need the ut8
/// encoded variant for rendering. In the (dominant) case of ascii-only text
/// Of course, this comes at extra memory cost as we usually still need the UTF-8
/// encoded variant for rendering. In the (dominant) case of ASCII-only text
/// we don't require a copy. Furthermore fuzzy matching usually is applied while
/// the user is typing on the fly so the same item is potentially matched many
/// times (making the the upfront cost more worth it). That means that its
/// basically always worth it to presegment the string.
/// times (making the the up-front cost more worth it). That means that its
/// basically always worth it to pre-segment the string.
///
/// For usecases that only match (a lot of) strings once its possible to keep
/// char buffer around that is filled with the presegmented chars
/// char buffer around that is filled with the presegmented chars.
///
/// Another advantage of this approach is that the matcher will naturally
/// produce char indices (instead of utf8 offsets) anyway. With a
/// produce grapheme indices (instead of utf8 offsets) anyway. With a
/// codepoint basic representation like this the indices can be used
/// directly
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum Utf32Str<'a> {
/// A string represented as ASCII encoded bytes.
/// Correctness invariant: must only contain valid ASCII (<=127)
/// Correctness invariant: must only contain valid ASCII (`<= 127`)
Ascii(&'a [u8]),
/// A string represented as an array of unicode codepoints (basically UTF-32).
Unicode(&'a [char]),
}

impl<'a> Utf32Str<'a> {
/// Convenience method to construct a `Utf32Str` from a normal utf8 str
/// Convenience method to construct a `Utf32Str` from a normal UTF-8 str
pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
if str.is_ascii() {
if has_ascii_graphemes(str) {
Utf32Str::Ascii(str.as_bytes())
} else {
buf.clear();
buf.extend(crate::chars::graphemes(str));
if buf.iter().all(|c| c.is_ascii()) {
return Utf32Str::Ascii(str.as_bytes());
}
Utf32Str::Unicode(&*buf)
Utf32Str::Unicode(buf)
}
}

Expand Down Expand Up @@ -107,7 +166,7 @@ impl<'a> Utf32Str<'a> {
}
}

/// Returns the number of leading whitespaces in this string
/// Returns the number of trailing whitespaces in this string
#[inline]
pub(crate) fn trailing_white_space(self) -> usize {
match self {
Expand Down Expand Up @@ -144,25 +203,36 @@ impl<'a> Utf32Str<'a> {
}
}

/// Returns whether this string only contains ascii text.
/// Returns whether this string only contains graphemes which are single ASCII chars.
///
/// This is almost equivalent to the string being ASCII, except with the additional requirement
/// that the string cannot contain a windows-style newline `\r\n` which is treated as a single
/// grapheme.
pub fn is_ascii(self) -> bool {
matches!(self, Utf32Str::Ascii(_))
}

/// Returns the `n`th character in this string.
/// Returns the `n`th character in this string, zero-indexed
pub fn get(self, n: u32) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
Utf32Str::Unicode(codepoints) => codepoints[n as usize],
}
}

/// Returns the last character in this string.
///
/// Panics if the string is empty.
pub(crate) fn last(self) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
}
}

/// Returns the first character in this string.
///
/// Panics if the string is empty.
pub(crate) fn first(self) -> char {
match self {
Utf32Str::Ascii(bytes) => bytes[0] as char,
Expand Down Expand Up @@ -204,6 +274,7 @@ pub enum Chars<'a> {
Ascii(slice::Iter<'a, u8>),
Unicode(slice::Iter<'a, char>),
}

impl Iterator for Chars<'_> {
type Item = char;

Expand All @@ -226,6 +297,8 @@ impl DoubleEndedIterator for Chars<'_> {

#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
/// An owned version of [`Utf32Str`].
///
/// See the API documentation for [`Utf32Str`] for more detail.
pub enum Utf32String {
/// A string represented as ASCII encoded bytes.
/// Correctness invariant: must only contain valid ASCII (<=127)
Expand Down Expand Up @@ -307,7 +380,7 @@ impl Utf32String {
impl From<&str> for Utf32String {
#[inline]
fn from(value: &str) -> Self {
if value.is_ascii() {
if has_ascii_graphemes(value) {
Self::Ascii(value.to_owned().into_boxed_str())
} else {
Self::Unicode(chars::graphemes(value).collect())
Expand All @@ -317,7 +390,7 @@ impl From<&str> for Utf32String {

impl From<Box<str>> for Utf32String {
fn from(value: Box<str>) -> Self {
if value.is_ascii() {
if has_ascii_graphemes(&value) {
Self::Ascii(value)
} else {
Self::Unicode(chars::graphemes(&value).collect())
Expand Down
44 changes: 44 additions & 0 deletions matcher/src/utf32_str/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
use crate::{Utf32Str, Utf32String};

#[test]
fn test_utf32str_ascii() {
/// Helper function for testing
fn expect_ascii(src: &str, is_ascii: bool) {
let mut buffer = Vec::new();
assert!(Utf32Str::new(src, &mut buffer).is_ascii() == is_ascii);
assert!(Utf32String::from(src).slice(..).is_ascii() == is_ascii);
assert!(Utf32String::from(src.to_owned()).slice(..).is_ascii() == is_ascii);
}

// ascii
expect_ascii("", true);
expect_ascii("a", true);
expect_ascii("a\nb", true);
expect_ascii("\n\r", true);

// not ascii
expect_ascii("aü", false);
expect_ascii("au\u{0308}", false);

// windows-style newline
expect_ascii("a\r\nb", false);
expect_ascii(\r\n", false);
expect_ascii("\r\n", false);
}

#[test]
fn test_grapheme_truncation() {
// ascii is preserved
let s = Utf32String::from("ab");
assert_eq!(s.slice(..).get(0), 'a');
assert_eq!(s.slice(..).get(1), 'b');

// windows-style newline is truncated to '\n'
let s = Utf32String::from("\r\n");
assert_eq!(s.slice(..).get(0), '\n');

// normal graphemes are truncated to the first character
let s = Utf32String::from("u\u{0308}\r\n");
assert_eq!(s.slice(..).get(0), 'u');
assert_eq!(s.slice(..).get(1), '\n');
}

0 comments on commit d17e29a

Please sign in to comment.