From 5a9944f779e7b6777841f9b5f48e1908fcf6fde4 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 1 Oct 2024 11:41:37 +0200 Subject: [PATCH] fix: recognize two more Unicode spaces. --- parser/src/tokenizer/mod.rs | 6 +++--- parser/src/tokenizer/tests.rs | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/parser/src/tokenizer/mod.rs b/parser/src/tokenizer/mod.rs index 529dc72b..0b15367f 100644 --- a/parser/src/tokenizer/mod.rs +++ b/parser/src/tokenizer/mod.rs @@ -550,7 +550,7 @@ enum NormalToken<'src> { // Space, tab, and many other Unicode characters that are considered spaces. // https://www.compart.com/en/unicode/bidiclass/WS - #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")] + #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] Whitespace, #[token("\n")] @@ -596,7 +596,7 @@ enum HexPatternToken { // Space, tab, and many other Unicode characters that are considered spaces. // https://www.compart.com/en/unicode/bidiclass/WS - #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")] + #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] Whitespace, #[token("\n")] @@ -652,7 +652,7 @@ enum HexJumpToken<'src> { // Space, tab, and many other Unicode characters that are considered spaces. // https://www.compart.com/en/unicode/bidiclass/WS - #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")] + #[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")] Whitespace, #[token("\n")] diff --git a/parser/src/tokenizer/tests.rs b/parser/src/tokenizer/tests.rs index c4e938cf..3ef194ce 100644 --- a/parser/src/tokenizer/tests.rs +++ b/parser/src/tokenizer/tests.rs @@ -324,6 +324,16 @@ fn whitespaces() { let mut lexer = super::Tokenizer::new(b"\xE2\x80\x8A"); assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3)))); assert_eq!(lexer.next_token(), None); + + // "Narrow No-Break Space" character (U+202f). + let mut lexer = super::Tokenizer::new(b"\xE2\x80\xAF"); + assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3)))); + assert_eq!(lexer.next_token(), None); + + // "Medium Mathematical Space" character (U+205f). + let mut lexer = super::Tokenizer::new(b"\xE2\x81\x9F"); + assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3)))); + assert_eq!(lexer.next_token(), None); } #[test]