From ba2059dcfe83633faf6e62b18f2d8cd45415a6af Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 25 Sep 2024 19:22:55 +0200 Subject: [PATCH] feat: protect the parser against stack overflows As the parser uses recursive functions while producing the AST, we must limit the depth of the AST tree to some reasonable amount in order to avoid stack overflows. --- parser/src/ast/cst2ast.rs | 173 +++++++++++++++++++++++--------------- 1 file changed, 103 insertions(+), 70 deletions(-) diff --git a/parser/src/ast/cst2ast.rs b/parser/src/ast/cst2ast.rs index 2e3a321c..f8fbf97d 100644 --- a/parser/src/ast/cst2ast.rs +++ b/parser/src/ast/cst2ast.rs @@ -15,13 +15,24 @@ use crate::cst::{CSTStream, Event, SyntaxKind}; use crate::Span; #[derive(Debug)] -struct Abort; +enum BuilderError { + /// This error indicates that some rule could not be converted into its + /// AST and the builder aborted the generation of the AST for that rule. + /// The builder recovers from this error by continuing with the next rule. + Abort, + /// This error indicates that the AST builder has reached the maximum + /// allowed depth for the AST tree. By controlling the maximum depth for + /// AST tree we avoid stack overflows while traversing the AST with + /// recursive functions. + MaxDepthReached, +} /// Creates an Abstract Syntax Tree from a [`Parser`]. pub(super) struct Builder<'src> { source: &'src [u8], events: Peekable>, errors: Vec, + depth: usize, } impl<'src> Builder<'src> { @@ -29,6 +40,7 @@ impl<'src> Builder<'src> { Self { errors: Vec::new(), source: parser.source(), + depth: 0, events: parser .into_cst_stream() .whitespaces(false) @@ -52,7 +64,8 @@ impl<'src> Builder<'src> { // but we try to continue at the next rule declaration // or import statement. The `recover` function discards // everything until finding the next rule or import. - Err(Abort) => self.recover(), + Err(BuilderError::Abort) => self.recover(), + Err(BuilderError::MaxDepthReached) => {} }, Event::Begin(IMPORT_STMT) => match self.import_stmt() { Ok(import) => imports.push(import), @@ -60,7 +73,8 @@ impl<'src> Builder<'src> { // but we try to continue at the next rule declaration // or import statement. The `recover` function discards // everything until finding the next rule or import. - Err(Abort) => self.recover(), + Err(BuilderError::Abort) => self.recover(), + Err(BuilderError::MaxDepthReached) => {} }, Event::End(SOURCE_FILE) => break, _ => self.recover(), @@ -120,6 +134,8 @@ macro_rules! new_binary_expr { } impl<'src> Builder<'src> { + const MAX_AST_DEPTH: usize = 3000; + /// Consumes all events until finding the start of a rule, an import /// statement or the end of the file. /// @@ -169,12 +185,15 @@ impl<'src> Builder<'src> { /// Most of the tokens returned by the tokenizer are guaranteed to be valid /// UTF-8, but there are some exceptions, like literal strings, comments, /// regular expressions, and of course, the `INVALID_UTF8` token. - fn get_source_str(&mut self, span: &Span) -> Result<&'src str, Abort> { + fn get_source_str( + &mut self, + span: &Span, + ) -> Result<&'src str, BuilderError> { from_utf8(self.get_source(span)).map_err(|err| { self.errors.push(Error::InvalidUTF8( span.subspan(err.valid_up_to(), err.valid_up_to() + 1), )); - Abort + BuilderError::Abort }) } @@ -201,27 +220,35 @@ impl<'src> Builder<'src> { /// and it's not actually part of the syntax tree, while the latter is a /// CST node that contains portions of the syntax tree that were not /// correctly parsed. - fn next(&mut self) -> Result { + fn next(&mut self) -> Result { if let Event::Begin(ERROR) = self.peek() { - return Err(Abort); + return Err(BuilderError::Abort); } Ok(self.events.next().expect("unexpected end of events")) } - fn begin(&mut self, kind: SyntaxKind) -> Result<(), Abort> { + fn begin(&mut self, kind: SyntaxKind) -> Result<(), BuilderError> { assert_eq!(self.next()?, Event::Begin(kind)); + if self.depth == Self::MAX_AST_DEPTH { + return Err(BuilderError::MaxDepthReached); + } + self.depth += 1; Ok(()) } - fn end(&mut self, kind: SyntaxKind) -> Result<(), Abort> { + fn end(&mut self, kind: SyntaxKind) -> Result<(), BuilderError> { assert_eq!(self.next()?, Event::End(kind)); + self.depth -= 1; Ok(()) } /// Makes sure that the next non-error token is of the given kind. /// /// The token is consumed and the function returns the token's span. - fn expect(&mut self, expected_kind: SyntaxKind) -> Result { + fn expect( + &mut self, + expected_kind: SyntaxKind, + ) -> Result { match self.next()? { Event::Token { kind, span } => { if expected_kind != kind { @@ -262,9 +289,9 @@ impl<'src> Builder<'src> { /// [3]: https://abarker.github.io/typped/pratt_parsing_intro.html fn pratt_parser( &mut self, - parse_expr: fn(&mut Self) -> Result, Abort>, + parse_expr: fn(&mut Self) -> Result, BuilderError>, min_bp: u8, - ) -> Result, Abort> { + ) -> Result, BuilderError> { let mut lhs = parse_expr(self)?; // Operator precedence table. For each operator there's a tuple @@ -411,7 +438,7 @@ impl<'src> Builder<'src> { } impl<'src> Builder<'src> { - fn import_stmt(&mut self) -> Result, Abort> { + fn import_stmt(&mut self) -> Result, BuilderError> { self.begin(IMPORT_STMT)?; let span = self.expect(IMPORT_KW)?; let (module_name, module_name_span) = self.utf8_string_lit()?; @@ -419,7 +446,7 @@ impl<'src> Builder<'src> { Ok(Import { module_name, span: span.combine(&module_name_span) }) } - fn rule_decl(&mut self) -> Result, Abort> { + fn rule_decl(&mut self) -> Result, BuilderError> { self.begin(RULE_DECL)?; let flags = if let Event::Begin(RULE_MODS) = self.peek() { @@ -465,7 +492,7 @@ impl<'src> Builder<'src> { Ok(Rule { flags, identifier, tags, meta, patterns, condition }) } - fn rule_mods(&mut self) -> Result { + fn rule_mods(&mut self) -> Result { self.begin(RULE_MODS)?; let mut flags = RuleFlags::none(); @@ -486,7 +513,7 @@ impl<'src> Builder<'src> { Ok(flags) } - fn rule_tags(&mut self) -> Result>, Abort> { + fn rule_tags(&mut self) -> Result>, BuilderError> { self.begin(RULE_TAGS)?; self.expect(COLON)?; @@ -501,7 +528,7 @@ impl<'src> Builder<'src> { Ok(tags) } - fn meta_blk(&mut self) -> Result>, Abort> { + fn meta_blk(&mut self) -> Result>, BuilderError> { self.begin(META_BLK)?; self.expect(META_KW)?; self.expect(COLON)?; @@ -517,7 +544,7 @@ impl<'src> Builder<'src> { Ok(meta) } - fn patterns_blk(&mut self) -> Result>, Abort> { + fn patterns_blk(&mut self) -> Result>, BuilderError> { self.begin(PATTERNS_BLK)?; self.expect(STRINGS_KW)?; self.expect(COLON)?; @@ -533,7 +560,7 @@ impl<'src> Builder<'src> { Ok(patterns) } - fn meta_def(&mut self) -> Result, Abort> { + fn meta_def(&mut self) -> Result, BuilderError> { self.begin(META_DEF)?; let identifier = self.identifier()?; @@ -587,7 +614,7 @@ impl<'src> Builder<'src> { Ok(Meta { identifier, value }) } - fn pattern_def(&mut self) -> Result, Abort> { + fn pattern_def(&mut self) -> Result, BuilderError> { self.begin(PATTERN_DEF)?; let identifier = self.pattern_ident()?; self.expect(EQUAL)?; @@ -631,7 +658,9 @@ impl<'src> Builder<'src> { Ok(pattern) } - fn pattern_mods_opt(&mut self) -> Result, Abort> { + fn pattern_mods_opt( + &mut self, + ) -> Result, BuilderError> { if let Event::Begin(PATTERN_MODS) = self.peek() { self.pattern_mods() } else { @@ -639,7 +668,9 @@ impl<'src> Builder<'src> { } } - fn pattern_mods(&mut self) -> Result, Abort> { + fn pattern_mods( + &mut self, + ) -> Result, BuilderError> { self.begin(PATTERN_MODS)?; let mut modifiers = Vec::new(); @@ -725,7 +756,7 @@ impl<'src> Builder<'src> { Ok(PatternModifiers::new(modifiers)) } - fn hex_pattern(&mut self) -> Result { + fn hex_pattern(&mut self) -> Result { self.begin(HEX_PATTERN)?; self.expect(L_BRACE)?; @@ -737,7 +768,7 @@ impl<'src> Builder<'src> { Ok(sub_pattern) } - fn hex_sub_pattern(&mut self) -> Result { + fn hex_sub_pattern(&mut self) -> Result { self.begin(HEX_SUB_PATTERN)?; let mut sub_patterns = Vec::new(); @@ -795,7 +826,7 @@ impl<'src> Builder<'src> { Ok(HexTokens { tokens: sub_patterns }) } - fn hex_alternative(&mut self) -> Result { + fn hex_alternative(&mut self) -> Result { self.begin(HEX_ALTERNATIVE)?; let l_paren_span = self.expect(L_PAREN)?; @@ -815,7 +846,7 @@ impl<'src> Builder<'src> { }) } - fn hex_jump(&mut self) -> Result { + fn hex_jump(&mut self) -> Result { self.begin(HEX_JUMP)?; let l_bracket_span = self.expect(L_BRACKET)?; @@ -847,14 +878,14 @@ impl<'src> Builder<'src> { }) } - fn boolean_expr(&mut self) -> Result, Abort> { + fn boolean_expr(&mut self) -> Result, BuilderError> { self.begin(BOOLEAN_EXPR)?; let expr = self.pratt_parser(Self::boolean_term, 0)?; self.end(BOOLEAN_EXPR)?; Ok(expr) } - fn boolean_expr_tuple(&mut self) -> Result>, Abort> { + fn boolean_expr_tuple(&mut self) -> Result>, BuilderError> { self.begin(BOOLEAN_EXPR_TUPLE)?; self.expect(L_PAREN)?; @@ -871,7 +902,7 @@ impl<'src> Builder<'src> { Ok(exprs) } - fn boolean_term(&mut self) -> Result, Abort> { + fn boolean_term(&mut self) -> Result, BuilderError> { self.begin(BOOLEAN_TERM)?; let expr = match self.peek() { @@ -919,7 +950,7 @@ impl<'src> Builder<'src> { Ok(expr) } - fn for_expr(&mut self) -> Result, Abort> { + fn for_expr(&mut self) -> Result, BuilderError> { self.begin(FOR_EXPR)?; let for_span = self.expect(FOR_KW)?; @@ -990,7 +1021,7 @@ impl<'src> Builder<'src> { Ok(expr) } - fn of_expr(&mut self) -> Result, Abort> { + fn of_expr(&mut self) -> Result, BuilderError> { self.begin(OF_EXPR)?; let quantifier = self.quantifier()?; self.expect(OF_KW)?; @@ -1023,7 +1054,7 @@ impl<'src> Builder<'src> { Ok(Expr::Of(Box::new(Of { span, quantifier, items, anchor }))) } - fn with_expr(&mut self) -> Result, Abort> { + fn with_expr(&mut self) -> Result, BuilderError> { self.begin(WITH_EXPR)?; let mut span = self.expect(WITH_KW)?; @@ -1031,7 +1062,7 @@ impl<'src> Builder<'src> { self.begin(WITH_DECLS)?; let declaration = - |i: &mut Self| -> Result, Abort> { + |i: &mut Self| -> Result, BuilderError> { i.begin(WITH_DECL)?; let identifier = i.identifier()?; @@ -1066,7 +1097,7 @@ impl<'src> Builder<'src> { Ok(Expr::With(Box::new(With { span, declarations, condition }))) } - fn quantifier(&mut self) -> Result, Abort> { + fn quantifier(&mut self) -> Result, BuilderError> { self.begin(QUANTIFIER)?; let quantifier = match self.peek() { @@ -1093,7 +1124,7 @@ impl<'src> Builder<'src> { Ok(quantifier) } - fn iterable(&mut self) -> Result, Abort> { + fn iterable(&mut self) -> Result, BuilderError> { self.begin(ITERABLE)?; let iterable = match self.peek() { @@ -1110,7 +1141,7 @@ impl<'src> Builder<'src> { Ok(iterable) } - fn anchor(&mut self) -> Result>, Abort> { + fn anchor(&mut self) -> Result>, BuilderError> { match self.peek() { Event::Token { kind: AT_KW, .. } => { let at_span = self.expect(AT_KW)?; @@ -1132,7 +1163,7 @@ impl<'src> Builder<'src> { } } - fn range(&mut self) -> Result, Abort> { + fn range(&mut self) -> Result, BuilderError> { self.begin(RANGE)?; let l_paren_span = self.expect(L_PAREN)?; @@ -1153,14 +1184,14 @@ impl<'src> Builder<'src> { }) } - fn expr(&mut self) -> Result, Abort> { + fn expr(&mut self) -> Result, BuilderError> { self.begin(EXPR)?; let expr = self.pratt_parser(Self::term, 0)?; self.end(EXPR)?; Ok(expr) } - fn expr_tuple(&mut self) -> Result>, Abort> { + fn expr_tuple(&mut self) -> Result>, BuilderError> { self.begin(EXPR_TUPLE)?; self.expect(L_PAREN)?; @@ -1177,7 +1208,7 @@ impl<'src> Builder<'src> { Ok(exprs) } - fn term(&mut self) -> Result, Abort> { + fn term(&mut self) -> Result, BuilderError> { self.begin(TERM)?; let mut expr = self.primary_expr()?; @@ -1224,7 +1255,7 @@ impl<'src> Builder<'src> { Ok(expr) } - fn primary_expr(&mut self) -> Result, Abort> { + fn primary_expr(&mut self) -> Result, BuilderError> { self.begin(PRIMARY_EXPR)?; let expr = match self.peek() { @@ -1377,34 +1408,36 @@ impl<'src> Builder<'src> { Ok(expr) } - fn identifier(&mut self) -> Result, Abort> { + fn identifier(&mut self) -> Result, BuilderError> { let span = self.expect(IDENT)?; Ok(Ident { name: self.get_source_str(&span)?, span }) } - fn pattern_ident(&mut self) -> Result, Abort> { + fn pattern_ident(&mut self) -> Result, BuilderError> { let span = self.expect(PATTERN_IDENT)?; Ok(Ident { name: self.get_source_str(&span)?, span }) } fn pattern_ident_tuple( &mut self, - ) -> Result>, Abort> { + ) -> Result>, BuilderError> { self.begin(PATTERN_IDENT_TUPLE)?; self.expect(L_PAREN)?; - let item = |s: &mut Self| -> Result, Abort> { - let ident = s.pattern_ident()?; - let mut span = ident.span(); - let wildcard = - if matches!(s.peek(), Event::Token { kind: ASTERISK, .. }) { - span = span.combine(&s.expect(ASTERISK)?); - true - } else { - false - }; - Ok(PatternSetItem { span, identifier: ident.name, wildcard }) - }; + let item = + |s: &mut Self| -> Result, BuilderError> { + let ident = s.pattern_ident()?; + let mut span = ident.span(); + let wildcard = + if matches!(s.peek(), Event::Token { kind: ASTERISK, .. }) + { + span = span.combine(&s.expect(ASTERISK)?); + true + } else { + false + }; + Ok(PatternSetItem { span, identifier: ident.name, wildcard }) + }; let mut items = vec![item(self)?]; @@ -1419,7 +1452,7 @@ impl<'src> Builder<'src> { Ok(items) } - fn integer_lit(&mut self) -> Result<(T, &'src str, Span), Abort> + fn integer_lit(&mut self) -> Result<(T, &'src str, Span), BuilderError> where T: Num + Bounded + CheckedMul + FromPrimitive + std::fmt::Display, { @@ -1461,25 +1494,25 @@ impl<'src> Builder<'src> { // type T. let value = value.map_err(|_| { self.errors.push(build_error(&span)); - Abort + BuilderError::Abort })?; // The multiplier may not fit in type T. let multiplier = T::from_i32(multiplier).ok_or_else(|| { self.errors.push(build_error(&span)); - Abort + BuilderError::Abort })?; // The value after applying the multiplier may not fit in type T. let value = value.checked_mul(&multiplier).ok_or_else(|| { self.errors.push(build_error(&span)); - Abort + BuilderError::Abort })?; Ok((value, literal, span)) } - fn float_lit(&mut self) -> Result<(f64, &'src str, Span), Abort> { + fn float_lit(&mut self) -> Result<(f64, &'src str, Span), BuilderError> { let span = self.expect(FLOAT_LIT)?; let literal = self.get_source_str(&span)?; let value = literal.parse::().map_err(|err| { @@ -1487,13 +1520,13 @@ impl<'src> Builder<'src> { message: err.to_string(), span: span.clone(), }); - Abort + BuilderError::Abort })?; Ok((value, literal, span)) } - fn regexp(&mut self) -> Result, Abort> { + fn regexp(&mut self) -> Result, BuilderError> { let span = self.expect(REGEXP)?; let re = self.get_source_str(&span)?; @@ -1523,7 +1556,7 @@ impl<'src> Builder<'src> { span, }); - return Err(Abort); + return Err(BuilderError::Abort); } } } @@ -1539,7 +1572,7 @@ impl<'src> Builder<'src> { /// This function is similar [`string_lit`] but guarantees that the /// string is a valid UTF-8 string. - fn utf8_string_lit(&mut self) -> Result<(&'src str, Span), Abort> { + fn utf8_string_lit(&mut self) -> Result<(&'src str, Span), BuilderError> { // Call `string_lit` with `allow_escape_char` set to false. This // guarantees that the returned string is borrowed from the source code // and is valid UTF-8, therefore is safe to convert it to &str without @@ -1574,7 +1607,7 @@ impl<'src> Builder<'src> { fn string_lit( &mut self, allow_escape_char: bool, - ) -> Result<(Cow<'src, BStr>, &'src str, Span), Abort> { + ) -> Result<(Cow<'src, BStr>, &'src str, Span), BuilderError> { let span = self.expect(STRING_LIT)?; let literal = self.get_source_str(&span)?; @@ -1600,7 +1633,7 @@ impl<'src> Builder<'src> { { if !allow_escape_char { self.errors.push(Error::UnexpectedEscapeSequence(span)); - return Err(Abort); + return Err(BuilderError::Abort); } backslash_pos } else { @@ -1658,7 +1691,7 @@ impl<'src> Builder<'src> { .subspan(start, end + 1), } ); - return Err(Abort); + return Err(BuilderError::Abort); } } _ => { @@ -1673,7 +1706,7 @@ impl<'src> Builder<'src> { ), }); - return Err(Abort); + return Err(BuilderError::Abort); } }, _ => { @@ -1695,7 +1728,7 @@ impl<'src> Builder<'src> { ), }); - return Err(Abort); + return Err(BuilderError::Abort); } } }