Skip to content

Commit

Permalink
fix: recognize two more Unicode spaces.
Browse files Browse the repository at this point in the history
  • Loading branch information
plusvic committed Oct 1, 2024
1 parent bfee80e commit 5a9944f
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 3 deletions.
6 changes: 3 additions & 3 deletions parser/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ enum NormalToken<'src> {

// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")]
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
Whitespace,

#[token("\n")]
Expand Down Expand Up @@ -596,7 +596,7 @@ enum HexPatternToken {

// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")]
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
Whitespace,

#[token("\n")]
Expand Down Expand Up @@ -652,7 +652,7 @@ enum HexJumpToken<'src> {

// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")]
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
Whitespace,

#[token("\n")]
Expand Down
10 changes: 10 additions & 0 deletions parser/src/tokenizer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,16 @@ fn whitespaces() {
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x8A");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Narrow No-Break Space" character (U+202F).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\xAF");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Medium Mathematical Space" character (U+205F).
let mut lexer = super::Tokenizer::new(b"\xE2\x81\x9F");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);
}

#[test]
Expand Down

0 comments on commit 5a9944f

Please sign in to comment.