Skip to content

Commit

Permalink
feat: improve error reporting and recovering
Browse files Browse the repository at this point in the history
  • Loading branch information
plusvic committed Jul 2, 2024
1 parent 18da5a8 commit 2d55a6b
Show file tree
Hide file tree
Showing 23 changed files with 356 additions and 86 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions parser-ng/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Deciding whether to use a CST or AST depends on the kind of problem you want to
solve.
*/

use std::fmt::{Display, Formatter};
use std::ops::Range;

mod parser;
Expand All @@ -39,6 +40,12 @@ impl From<logos::Span> for Span {
}
}

impl Display for Span {
    /// Writes the span as `[start..end]`, using the values returned by
    /// `start()` and `end()`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let (start, end) = (self.start(), self.end());
        write!(f, "[{start}..{end}]")
    }
}

impl Span {
const MAX: usize = u32::MAX as usize;

Expand Down
2 changes: 1 addition & 1 deletion parser-ng/src/parser/cst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ impl Debug for CST {
if !self.errors.is_empty() {
writeln!(f, "\nERRORS:")?;
for (span, err) in &self.errors {
writeln!(f, "- {:?}: {}", span, err)?;
writeln!(f, "- {}: {}", span, err)?;
}
}
Ok(())
Expand Down
5 changes: 3 additions & 2 deletions parser-ng/src/parser/cst/syntax_kind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,19 @@ pub enum SyntaxKind {
WHITESPACE,
NEWLINE,

ERROR,
IDENT,
IMPORT_STMT,
RULE_DECL,
RULE_MODS,
RULE_TAGS,
CONDITION,
CONDITION_BLK,
META_DEF,
META_DEFS,
SOURCE_FILE,
BOOLEAN_EXPR,
BOOLEAN_TERM,

ERROR,
}

impl From<SyntaxKind> for rowan::SyntaxKind {
Expand Down
146 changes: 127 additions & 19 deletions parser-ng/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,20 @@
The parser receives a sequence of tokens produced by the [`Tokenizer`], and
produces a Concrete Syntax-Tree ([`CST`]), also known as a lossless syntax
tree.
tree. The CST is initially represented as a stream of [events][`Event`], but
this stream is later converted to a tree using the [rowan][2] crate.
Under the hood, the parser uses the [rowan][2] crate.
This parser is error-tolerant: it is able to parse YARA code that contains
syntax errors. After each error, the parser recovers and keeps parsing the
remaining code. The resulting CST may contain error nodes containing portions
of the code that are not syntactically correct, but anything outside of those
error nodes is valid YARA code.
[1]: https://en.wikipedia.org/wiki/Parsing_expression_grammar
[2]: https://github.com/rust-analyzer/rowan
*/

use std::collections::HashMap;
use std::mem;

pub mod cst;
Expand Down Expand Up @@ -57,6 +63,9 @@ impl<'src> Parser<'src> {
/// Internal state of the parser.
///
/// Holds the input token stream, the output syntax stream, and the
/// bookkeeping used for error reporting and recovery.
struct InternalParser<'src> {
/// Stream of tokens consumed by the parser, built from the tokenizer.
tokens: TokenStream<'src>,
/// Output stream that receives the node begin/end markers and the errors
/// produced while parsing; the `Iterator` implementation pops from it.
output: SyntaxStream,
/// Error messages, with their spans, recorded by `expect` but not yet
/// sent to `output`. They are drained when `opt_depth` is zero, and only
/// the one whose span starts furthest into the source is reported.
pending_errors: Vec<(String, Span)>,
/// Names of the tokens that were expected at a given position, keyed by
/// the start of the span of the unexpected token found there. Used for
/// building "expecting ... found ..." messages.
expected_tokens: HashMap<usize, Vec<&'static str>>,
/// Nesting depth of productions whose failure is rolled back instead of
/// being reported: incremented before running such a production and
/// decremented right after.
opt_depth: usize,
/// Set when `expect` does not find one of the expected tokens. Cleared by
/// `recover` and after each top-level item.
failed: bool,
}

Expand All @@ -66,6 +75,9 @@ impl<'src> From<Tokenizer<'src>> for InternalParser<'src> {
Self {
tokens: TokenStream::new(tokenizer),
output: SyntaxStream::new(),
pending_errors: Vec::new(),
expected_tokens: HashMap::new(),
opt_depth: 0,
failed: false,
}
}
Expand All @@ -89,6 +101,7 @@ impl Iterator for InternalParser<'_> {
if self.output.is_empty() && self.tokens.has_more() {
let _ = self.ws();
let _ = self.top_level_item();
self.failed = false;
}
self.output.pop()
}
Expand Down Expand Up @@ -164,28 +177,85 @@ impl<'src> InternalParser<'src> {
self
}

/// Skips ahead until the next token is one of those in `tokens`.
///
/// Nothing is consumed when `peek` yields no token, or when the next
/// token already belongs to `tokens`. Otherwise, every token skipped is
/// placed under a single `ERROR` node in the output. In all cases the
/// `failed` flag is cleared before returning.
fn recover(&mut self, tokens: &TokenSet) -> &mut Self {
    // An ERROR node is opened only if there is at least one token to skip,
    // so no empty ERROR nodes are produced.
    let must_skip =
        matches!(self.peek(), Some(next) if !tokens.contains(next));

    if must_skip {
        self.output.begin(SyntaxKind::ERROR);
        // Stops at the first token that belongs to the set, or when there
        // are no more tokens to look at.
        while matches!(self.peek(), Some(next) if !tokens.contains(next)) {
            self.bump();
        }
        self.output.end();
    }

    self.failed = false;
    self
}

/// Like `expect`, but tries to recover when the expectation fails.
///
/// If `expect` leaves the parser in the failed state, `recover` is called
/// with the same token set, followed by a single `bump`.
///
/// NOTE(review): the `bump` presumably consumes the expected token at
/// which `recover` stopped. `recover` can also stop because there are no
/// more tokens; confirm that `bump` is harmless in that case.
fn expect_and_recover(&mut self, tokens: &TokenSet) -> &mut Self {
    if self.expect(tokens).failed {
        self.recover(tokens).bump();
    }
    self
}

/// Expects one of the tokens in `expected_tokens`.
///
/// If the next token is not one of the expected ones, the parser enters
/// the failed state.
fn expect(&mut self, expected_tokens: &[Token]) -> &mut Self {
fn expect(&mut self, tokens: &TokenSet) -> &mut Self {
let token = match self.peek() {
Some(token) => token,
None => {
self.failed = true;
return self;
}
Some(token) if tokens.contains(token) => {
self.bump();
return self;
}
Some(token) => token,
};
if expected_tokens.iter().any(|expected| {
mem::discriminant(expected) == mem::discriminant(token)
}) {
self.bump();

let span = token.span();
let token_str = token.as_str();

let expected_tokens =
self.expected_tokens.entry(span.start()).or_default();

expected_tokens.extend(tokens.iter().map(|t| t.as_str()));

let (last, all_except_last) = expected_tokens.split_last().unwrap();

let error_msg = if all_except_last.is_empty() {
format!("expecting {last}, found {}", token_str)
} else {
let span = token.span();
self.bump();
self.output.push_error("foo", span);
self.failed = true;
format!(
"expecting {} or {last}, found {}",
all_except_last.join(", "),
token_str,
)
};

self.pending_errors.push((error_msg, span));

if self.opt_depth == 0 {
if let Some((error, span)) = self
.pending_errors
.drain(0..)
.max_by_key(|(_, span)| span.start())
{
self.output.push_error(error, span);
}
}

self.failed = true;
self
}

Expand All @@ -203,7 +273,10 @@ impl<'src> InternalParser<'src> {
}

let bookmark = self.bookmark();

self.opt_depth += 1;
p(self);
self.opt_depth -= 1;

// Any error occurred while parsing the optional production is ignored.
if self.failed {
Expand Down Expand Up @@ -253,7 +326,9 @@ impl<'src> InternalParser<'src> {
if !self.failed {
loop {
let bookmark = self.bookmark();
self.opt_depth += 1;
p(self);
self.opt_depth -= 1;
if self.failed {
self.failed = false;
self.restore_bookmark(&bookmark);
Expand Down Expand Up @@ -300,7 +375,7 @@ use Token::*;

macro_rules! t {
($( $tokens:path )|*) => {
&[$( $tokens(Span::default()) ),*]
&TokenSet(&[$( $tokens(Span::default()) ),*])
};
}

Expand Down Expand Up @@ -413,17 +488,16 @@ impl<'src> InternalParser<'src> {
.ws()
.expect(t!(IDENT))
.ws()
.expect(t!(L_BRACE))
.opt(|p| p.rule_tags())
.ws()
.expect_and_recover(t!(L_BRACE))
.ws()
.opt(|p| p.meta_defs())
.ws()
.opt(|p| p.pattern_defs())
.ws()
.expect(t!(CONDITION_KW))
.ws()
.expect(t!(COLON))
.ws()
.one(|p| p.boolean_expr())
.recover(t!(CONDITION_KW))
.one(|p| p.condition_blk())
.ws()
.expect(t!(R_BRACE))
.end()
Expand All @@ -447,6 +521,13 @@ impl<'src> InternalParser<'src> {
.end()
}

/// Parses the tags of a rule, producing a `RULE_TAGS` node.
///
/// Expects a colon followed by one or more identifiers, with `ws` called
/// before each identifier.
fn rule_tags(&mut self) -> &mut Self {
    self.begin(SyntaxKind::RULE_TAGS);
    self.expect(t!(COLON));
    self.one_or_more(|p| p.ws().expect(t!(IDENT)));
    self.end()
}

/// Parses metadata definitions
///
/// ```text
Expand Down Expand Up @@ -500,6 +581,16 @@ impl<'src> InternalParser<'src> {
todo!()
}

/// Parses the condition block of a rule, producing a `CONDITION_BLK` node.
///
/// Expects, in this order: the `CONDITION_KW` token, a colon, and exactly
/// one boolean expression, with `ws` called after each of the first two.
fn condition_blk(&mut self) -> &mut Self {
    self.begin(SyntaxKind::CONDITION_BLK);
    self.expect(t!(CONDITION_KW)).ws();
    self.expect(t!(COLON)).ws();
    self.one(|p| p.boolean_expr());
    self.end()
}

fn hex_pattern(&mut self) -> &mut Self {
todo!()
}
Expand Down Expand Up @@ -564,6 +655,23 @@ struct Bookmark {
output: syntax_stream::Bookmark,
}

/// A borrowed set of tokens.
///
/// Membership is decided by the enum variant alone; the data carried by
/// each token is ignored when comparing.
struct TokenSet<'a>(&'a [Token]);

impl<'a> TokenSet<'a> {
    /// Returns `true` if the set has no tokens.
    #[inline]
    fn is_empty(&self) -> bool {
        let TokenSet(tokens) = self;
        tokens.is_empty()
    }

    /// Returns `true` if some token in the set is the same variant as
    /// `token`.
    fn contains(&self, token: &Token) -> bool {
        let wanted = mem::discriminant(token);
        self.0.iter().map(mem::discriminant).any(|found| found == wanted)
    }

    /// Iterates over the tokens in the set, in the order of the underlying
    /// slice.
    fn iter(&self) -> impl Iterator<Item = &'a Token> {
        let tokens: &'a [Token] = self.0;
        tokens.iter()
    }
}

struct Alt<'a, 'src> {
parser: &'a mut InternalParser<'src>,
matched: bool,
Expand Down
6 changes: 3 additions & 3 deletions parser-ng/src/parser/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ use crate::Parser;
fn test() {
let cst = CST::from(Parser::new(
r#"
rule test {
condition:
true and false
rule test : {
condition:
true
}
"#
.as_bytes(),
Expand Down
24 changes: 0 additions & 24 deletions parser-ng/src/parser/tests/testdata/2.out

This file was deleted.

29 changes: 0 additions & 29 deletions parser-ng/src/parser/tests/testdata/3.out

This file was deleted.

File renamed without changes.
Loading

0 comments on commit 2d55a6b

Please sign in to comment.