From 1fd3976e193ea9712326952f5087388cb12da831 Mon Sep 17 00:00:00 2001
From: "Victor M. Alvarez"
Date: Mon, 8 Jul 2024 16:32:33 +0200
Subject: [PATCH] feat: implement comments

---
 .../src/parser/tests/testdata/comments.in  |  34 +++++
 .../src/parser/tests/testdata/comments.out | 120 ++++++++++++++++++
 parser-ng/src/tokenizer/mod.rs             |  51 +++++++-
 parser-ng/src/tokenizer/tests.rs           |  20 +++
 4 files changed, 223 insertions(+), 2 deletions(-)
 create mode 100644 parser-ng/src/parser/tests/testdata/comments.in
 create mode 100644 parser-ng/src/parser/tests/testdata/comments.out

diff --git a/parser-ng/src/parser/tests/testdata/comments.in b/parser-ng/src/parser/tests/testdata/comments.in
new file mode 100644
index 000000000..1674a2006
--- /dev/null
+++ b/parser-ng/src/parser/tests/testdata/comments.in
@@ -0,0 +1,34 @@
+/*
+   This
+   is
+   a
+   multi-line
+   comment.
+*/
+
+rule test {
+  // Comment
+	condition:
+		1 + 2
+		// Comment
+		== // Comment
+		4 - 1   // Comment
+}
+
+rule test {
+  strings:
+    $ = {
+  	   // Comment
+  	   00 01
+  	   /* Comment */
+  	   02 03
+  	   /*
+  	      Comment
+
+  	   */
+  	   04 05
+    }
+
+	condition:
+		$a
+}
diff --git a/parser-ng/src/parser/tests/testdata/comments.out b/parser-ng/src/parser/tests/testdata/comments.out
new file mode 100644
index 000000000..61a81bb5a
--- /dev/null
+++ b/parser-ng/src/parser/tests/testdata/comments.out
@@ -0,0 +1,120 @@
+SOURCE_FILE@0..318
+  COMMENT@0..50 "/*\n   This\n   is\n   a ..."
+  NEWLINE@50..51 "\n"
+  NEWLINE@51..52 "\n"
+  RULE_DECL@52..148
+    RULE_KW@52..56 "rule"
+    WHITESPACE@56..57 " "
+    IDENT@57..61 "test"
+    WHITESPACE@61..62 " "
+    L_BRACE@62..63 "{"
+    NEWLINE@63..64 "\n"
+    WHITESPACE@64..66 "  "
+    COMMENT@66..76 "// Comment"
+    NEWLINE@76..77 "\n"
+    WHITESPACE@77..78 "\t"
+    CONDITION_BLK@78..133
+      CONDITION_KW@78..87 "condition"
+      COLON@87..88 ":"
+      NEWLINE@88..89 "\n"
+      WHITESPACE@89..91 "\t\t"
+      BOOLEAN_EXPR@91..133
+        BOOLEAN_TERM@91..133
+          EXPR@91..96
+            TERM@91..92
+              PRIMARY_EXPR@91..92
+                INTEGER_LIT@91..92 "1"
+            WHITESPACE@92..93 " "
+            ADD@93..94 "+"
+            WHITESPACE@94..95 " "
+            TERM@95..96
+              PRIMARY_EXPR@95..96
+                INTEGER_LIT@95..96 "2"
+          NEWLINE@96..97 "\n"
+          WHITESPACE@97..99 "\t\t"
+          COMMENT@99..109 "// Comment"
+          NEWLINE@109..110 "\n"
+          WHITESPACE@110..112 "\t\t"
+          EQ@112..114 "=="
+          WHITESPACE@114..115 " "
+          COMMENT@115..125 "// Comment"
+          NEWLINE@125..126 "\n"
+          WHITESPACE@126..128 "\t\t"
+          EXPR@128..133
+            TERM@128..129
+              PRIMARY_EXPR@128..129
+                INTEGER_LIT@128..129 "4"
+            WHITESPACE@129..130 " "
+            SUB@130..131 "-"
+            WHITESPACE@131..132 " "
+            TERM@132..133
+              PRIMARY_EXPR@132..133
+                INTEGER_LIT@132..133 "1"
+    WHITESPACE@133..136 "   "
+    COMMENT@136..146 "// Comment"
+    NEWLINE@146..147 "\n"
+    R_BRACE@147..148 "}"
+  NEWLINE@148..149 "\n"
+  NEWLINE@149..150 "\n"
+  RULE_DECL@150..317
+    RULE_KW@150..154 "rule"
+    WHITESPACE@154..155 " "
+    IDENT@155..159 "test"
+    WHITESPACE@159..160 " "
+    L_BRACE@160..161 "{"
+    NEWLINE@161..162 "\n"
+    WHITESPACE@162..164 "  "
+    PATTERNS_BLK@164..297
+      STRINGS_KW@164..171 "strings"
+      COLON@171..172 ":"
+      NEWLINE@172..173 "\n"
+      WHITESPACE@173..177 "    "
+      PATTERN_DEF@177..297
+        PATTERN_IDENT@177..178 "$"
+        WHITESPACE@178..179 " "
+        EQUAL@179..180 "="
+        WHITESPACE@180..181 " "
+        HEX_PATTERN@181..297
+          L_BRACE@181..182 "{"
+          NEWLINE@182..183 "\n"
+          WHITESPACE@183..189 "  \t   "
+          COMMENT@189..199 "// Comment"
+          NEWLINE@199..200 "\n"
+          WHITESPACE@200..206 "  \t   "
+          HEX_SUB_PATTERN@206..291
+            HEX_BYTE@206..208 "00"
+            WHITESPACE@208..209 " "
+            HEX_BYTE@209..211 "01"
+            NEWLINE@211..212 "\n"
+            WHITESPACE@212..218 "  \t   "
+            COMMENT@218..231 "/* Comment */"
+            NEWLINE@231..232 "\n"
+            WHITESPACE@232..238 "  \t   "
+            HEX_BYTE@238..240 "02"
+            WHITESPACE@240..241 " "
+            HEX_BYTE@241..243 "03"
+            NEWLINE@243..244 "\n"
+            WHITESPACE@244..250 "  \t   "
+            COMMENT@250..279 "/*\n  \t      Comment\n\n ..."
+            NEWLINE@279..280 "\n"
+            WHITESPACE@280..286 "  \t   "
+            HEX_BYTE@286..288 "04"
+            WHITESPACE@288..289 " "
+            HEX_BYTE@289..291 "05"
+          NEWLINE@291..292 "\n"
+          WHITESPACE@292..296 "    "
+          R_BRACE@296..297 "}"
+    NEWLINE@297..298 "\n"
+    NEWLINE@298..299 "\n"
+    WHITESPACE@299..300 "\t"
+    CONDITION_BLK@300..315
+      CONDITION_KW@300..309 "condition"
+      COLON@309..310 ":"
+      NEWLINE@310..311 "\n"
+      WHITESPACE@311..313 "\t\t"
+      BOOLEAN_EXPR@313..315
+        BOOLEAN_TERM@313..315
+          PATTERN_IDENT@313..315 "$a"
+    NEWLINE@315..316 "\n"
+    R_BRACE@316..317 "}"
+  NEWLINE@317..318 "\n"
diff --git a/parser-ng/src/tokenizer/mod.rs b/parser-ng/src/tokenizer/mod.rs
index 381641ec9..6b23d0d94 100644
--- a/parser-ng/src/tokenizer/mod.rs
+++ b/parser-ng/src/tokenizer/mod.rs
@@ -190,7 +190,7 @@ impl<'src> Tokenizer<'src> {
 #[derive(Debug)]
 enum Mode<'src> {
     Normal(logos::Lexer<'src, NormalToken<'src>>),
-    HexPattern(logos::Lexer<'src, HexPatternToken>),
+    HexPattern(logos::Lexer<'src, HexPatternToken<'src>>),
     HexJump(logos::Lexer<'src, HexJumpToken<'src>>),
 }
 
@@ -410,6 +410,24 @@ enum NormalToken<'src> {
     ]
     Regexp(&'src [u8]),
 
+    // Block comment.
+    #[regex(r#"(?x)          # allow comments in the regexp
+        /\*                  # starts with /*
+        (                    # one or more..
+          [^*]               # anything except asterisk
+          |                  # or..
+          \*[^/]             # asterisk followed by something that is not /
+        )*
+        \*/                  # ends with */
+
+        "#, |token| token.slice())]
+    BlockComment(&'src [u8]),
+
+    // Single-line comment
+    #[regex(r#"//[^\n]*"#, |token| token.slice())]
+    Comment(&'src [u8]),
+
+    // /\*([^*]|\*[^/])*\*/
     #[regex("[ \t]+")]
     Whitespace,
 
@@ -419,7 +437,7 @@
 
 #[derive(logos::Logos, Debug, PartialEq)]
 #[logos(source = [u8])]
-enum HexPatternToken {
+enum HexPatternToken<'src> {
     // A hex byte is an optional tilde ~, followed by two hex digits or
     // question marks. The following are valid tokens:
     //
@@ -451,6 +469,23 @@
 
     #[token("\n")]
     Newline,
+
+    // Block comment.
+    #[regex(r#"(?x)          # allow comments in the regexp
+        /\*                  # starts with /*
+        (                    # one or more..
+          [^*]               # anything except asterisk
+          |                  # or..
+          \*[^/]             # asterisk followed by something that is not /
+        )*
+        \*/                  # ends with */
+
+        "#, |token| token.slice())]
+    BlockComment(&'src [u8]),
+
+    // Single-line comment
+    #[regex(r#"//[^\n]*"#, |token| token.slice())]
+    Comment(&'src [u8]),
 }
 
 #[derive(logos::Logos, Debug, PartialEq)]
@@ -634,6 +669,12 @@ fn convert_normal_token(token: NormalToken, span: Span) -> Token {
                 Err(_) => unreachable!(),
             }
         }
+        NormalToken::BlockComment(c) | NormalToken::Comment(c) => {
+            return match from_utf8(c) {
+                Ok(_) => Token::COMMENT(span),
+                Err(_) => unreachable!(),
+            }
+        }
     }
 }
 
@@ -647,6 +688,12 @@ fn convert_hex_pattern_token(token: HexPatternToken, span: Span) -> Token {
         HexPatternToken::RParen => Token::R_PAREN(span),
         HexPatternToken::LBracket => Token::L_BRACKET(span),
         HexPatternToken::RBracket => Token::R_BRACKET(span),
+        HexPatternToken::BlockComment(c) | HexPatternToken::Comment(c) => {
+            return match from_utf8(c) {
+                Ok(_) => Token::COMMENT(span),
+                Err(_) => unreachable!(),
+            }
+        }
     }
 }
 
diff --git a/parser-ng/src/tokenizer/tests.rs b/parser-ng/src/tokenizer/tests.rs
index c70ec529a..9edfcd695 100644
--- a/parser-ng/src/tokenizer/tests.rs
+++ b/parser-ng/src/tokenizer/tests.rs
@@ -99,6 +99,26 @@ fn string_literals() {
     assert_eq!(lexer.next_token(), None);
 }
 
+#[test]
+fn comments() {
+    let mut lexer = super::Tokenizer::new(r#"/* comment */"#.as_bytes());
+    assert_eq!(lexer.next_token(), Some(Token::COMMENT(Span(0..13))));
+    assert_eq!(lexer.next_token(), None);
+
+    let mut lexer = super::Tokenizer::new(
+        r#"/*
+  comment * /
+*/"#
+        .as_bytes(),
+    );
+    assert_eq!(lexer.next_token(), Some(Token::COMMENT(Span(0..19))));
+    assert_eq!(lexer.next_token(), None);
+
+    let mut lexer = super::Tokenizer::new(r#"// comment "#.as_bytes());
+    assert_eq!(lexer.next_token(), Some(Token::COMMENT(Span(0..11))));
+    assert_eq!(lexer.next_token(), None);
+}
+
 #[test]
 fn regexps() {
     let mut lexer = super::Tokenizer::new(r#"/foobar/ /.*/"#.as_bytes());
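Note for reviewers (not part of the patch): a hypothetical extra test case, written in the same style and with the same API as the "comments" test added above, illustrating that the block-comment regex is not nested. An inner "/*" is just ordinary comment text, and the first "*/" terminates the token. The test name and the input string are illustrative assumptions; the asserted span assumes exactly the 26-byte input shown.

    // Hypothetical sketch only; mirrors Tokenizer::new / next_token usage
    // from tests.rs above. Block comments do not nest, so the whole input
    // below is a single COMMENT token.
    #[test]
    fn comments_do_not_nest() {
        let mut lexer =
            super::Tokenizer::new(r#"/* outer /* still inner */"#.as_bytes());
        assert_eq!(lexer.next_token(), Some(Token::COMMENT(Span(0..26))));
        assert_eq!(lexer.next_token(), None);
    }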