fix(parser): recognize multiple Unicode space characters
Besides the ASCII space character (0x20), there are many other Unicode characters that are considered spaces. See: https://www.compart.com/en/unicode/bidiclass/WS
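All of the characters this commit adds lie in the range U+2000 through U+200A, and Rust's standard library already classifies them as whitespace. A quick standalone sketch (not part of the commit) confirming this:

fn main() {
    // Every code point from U+2000 ("En Quad") to U+200A ("Hair Space")
    // has the Unicode White_Space property, so `char::is_whitespace` is true.
    for c in '\u{2000}'..='\u{200A}' {
        assert!(c.is_whitespace(), "U+{:04X} should be whitespace", c as u32);
    }
}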
plusvic committed Oct 1, 2024
1 parent c2ad525 commit bfee80e
Showing 2 changed files with 65 additions and 4 deletions.
14 changes: 10 additions & 4 deletions parser/src/tokenizer/mod.rs
@@ -232,6 +232,7 @@ impl<'src> Tokenizer<'src> {
}
};

// Truncate `unexpected` at the first whitespace if any.
let unexpected = unexpected.split(char::is_whitespace).next().unwrap();

// If `unexpected` is larger than the current token, bump the lexer to the
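For illustration (not part of this commit), the truncation idiom in the hunk above cuts at any character with the Unicode White_Space property, including the ones this commit adds to the tokenizer's regexes:

// `split(char::is_whitespace)` stops at the "En Quad" (U+2000), so only
// the text before it is kept.
let unexpected = "bad\u{2000}token";
assert_eq!(unexpected.split(char::is_whitespace).next().unwrap(), "bad");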
@@ -547,8 +548,9 @@ enum NormalToken<'src> {
#[regex(r#"//[^\n]*"#)]
Comment,

// /\*([^*]|\*[^/])*\*/
#[regex("[ \t]+")]
// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")]
Whitespace,

#[token("\n")]
@@ -592,7 +594,9 @@ enum HexPatternToken {
#[token("]")]
RBracket,

#[regex("[ \t]+")]
// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")]
Whitespace,

#[token("\n")]
@@ -646,7 +650,9 @@ enum HexJumpToken<'src> {
]
IntegerLit(&'src [u8]),

#[regex("[ \t]+")]
// Space, tab, and many other Unicode characters that are considered spaces.
// https://www.compart.com/en/unicode/bidiclass/WS
#[regex("[ \t\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}]+")]
Whitespace,

#[token("\n")]
55 changes: 55 additions & 0 deletions parser/src/tokenizer/tests.rs
@@ -269,6 +269,61 @@ fn whitespaces() {
let mut lexer = super::Tokenizer::new(" \t".as_bytes());
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..2))));
assert_eq!(lexer.next_token(), None);

// "En Quad" character (U+2000).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x80");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Em Quad" character (U+2001).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x81");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "En Space" character (U+2002).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x82");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Em Space" character (U+2003).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x83");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Three-Per-Em" character (U+2004).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x84");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Four-Per-Em" character (U+2005).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x85");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Six-Per-Em" character (U+2006).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x86");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Figure Space" character (U+2007).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x87");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Punctuation Space" character (U+2008).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x88");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Thin Space" character (U+2009).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x89");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);

// "Hair Space" character (U+200A).
let mut lexer = super::Tokenizer::new(b"\xE2\x80\x8A");
assert_eq!(lexer.next_token(), Some(Token::WHITESPACE(Span(0..3))));
assert_eq!(lexer.next_token(), None);
}
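A side note on the byte strings in this test (an illustration with a hypothetical test name, not part of the commit): each \xE2\x80\x8X sequence is simply the UTF-8 encoding of the corresponding code point, which is why every span covers three bytes:

// Not part of the commit: sanity check that the byte literals above are the
// UTF-8 encodings of the code points named in the comments.
#[test]
fn utf8_encoding_of_unicode_spaces() {
    // U+2000 ("En Quad") encodes to 0xE2 0x80 0x80 in UTF-8.
    assert_eq!("\u{2000}".as_bytes(), b"\xE2\x80\x80");
    // U+200A ("Hair Space") encodes to 0xE2 0x80 0x8A.
    assert_eq!("\u{200A}".as_bytes(), b"\xE2\x80\x8A");
}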

#[test]
