diff --git a/example.yar b/example.yar index 338e339..6a78cdf 100644 --- a/example.yar +++ b/example.yar @@ -1,12 +1,12 @@ +// Example of YARA rule for testing purposes + import "macho" include "test" -//Global comment //Rule comment rule test : bla test { //Rule block comment - meta: author = "Author" description = 20 diff --git a/src/bin.rs b/src/bin.rs index 5ebbe71..cbcaee8 100644 --- a/src/bin.rs +++ b/src/bin.rs @@ -1,6 +1,8 @@ use std::fs; use yara_parser::{AstNode, SourceFile}; +/// This is a simple binary that reads a file and prints the AST and errors. +/// It is used to test the parser. fn main() { let filename = std::env::args().nth(1).expect("No arguments provided"); let file_content = diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index c4aef90..91e6b31 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -26,6 +26,7 @@ impl fmt::Display for LexingError { } } +/// Root lexer for YARA language. #[derive(Logos, Debug, PartialEq)] #[logos(error = LexingError)] pub(crate) enum LogosToken { @@ -211,6 +212,7 @@ pub(crate) enum LogosToken { MultilineComment, } +/// Lexer for hexadecimal string. 
#[derive(Logos, Debug, PartialEq)] #[logos(error = LexingError)] pub(crate) enum HexLogosToken { @@ -267,9 +269,11 @@ pub fn tokenize(text: &str) -> (Vec, Vec) { let token_range = TextRange::at(offset.try_into().unwrap(), token_len); let syntaxkind = match token { Ok(token) => { + // Handle hexadecimal string token separately if let LogosToken::HexString(hex_string) = token { process_hex_string_token(hex_string, &mut tokens, &mut errors, &mut offset); continue; + // Handle regex string token separately } else if let LogosToken::Regexp(regex) = token { let detailed_tokens = process_regex_string_token(regex); for (kind, len) in detailed_tokens { @@ -384,6 +388,9 @@ fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind { } } +/// Process regex string token to generate detailed tokens +/// This is the representation that YARA-X uses, therefore for an +/// easier integration with YARA-X, we need to keep this representation fn process_regex_string_token(regex: String) -> Vec<(SyntaxKind, usize)> { let mut tokens = Vec::new(); let mut chars = regex.chars().peekable(); @@ -419,7 +426,9 @@ fn process_regex_string_token(regex: String) -> Vec<(SyntaxKind, usize)> { tokens } -// Process hexadecimal string token to generate detailed tokens +/// Process hexadecimal string token to generate detailed tokens +/// This is the representation that YARA-X uses, therefore for an +/// easier integration with YARA-X, we need to keep this representation fn process_hex_string_token( hex_string: String, tokens: &mut Vec, diff --git a/src/lib.rs b/src/lib.rs index 1b56a5f..f3de8a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,9 @@ -/// This library is used to create a parser for YARA language -/// It should provide also token for whitespaces -/// as we want full fidelity and error resilience.; +//! This library is used to create a parser for YARA language +//! It should provide also token for whitespaces +//! as we want full fidelity and error resilience.; +//! 
It is inspired by the Swift's libSyntax and the Rust's rowan. +//! +//! author: Tomáš Ďuriš use crate::syntax::{ syntax_error::SyntaxError, text_token_source::TextTokenSource, text_tree_sink::TextTreeSink, }; @@ -46,7 +49,6 @@ fn api_walktrough() { // SourceFile is the main entry point for any given file // it contains a `parse` method which returns a `Parse` struct // that contains AST and list of errors - let parse_struct = SourceFile::parse(source_code); assert!(parse_struct.errors().is_empty()); @@ -134,43 +136,49 @@ fn api_walktrough() { } // For the condition part, we can similarly get its body which is - // an `BOOLEAN_EXPR` node + // an `EXPRESSION` node let condition = block.condition().unwrap(); let expression_stmt = condition.expression_stmt().unwrap(); let expression = expression_stmt.expression().unwrap(); + // Expression can be either a `BOOLEAN_EXPR` or `BOOLEAN_TERM` + // In this example we have `BOOLEAN_EXPR` let boolean_expr = match &expression { Expression::BooleanExpr(e) => e, _ => unreachable!(), }; - // Now we can obtain `lhs`, `rhs` or `op` nodes for top level expression + // Now we can obtain `lhs`, `rhs` or `op` nodes for top level boolean expression // in this case we have `OR` operator assert!(boolean_expr.op_token().is_some()); assert!(boolean_expr.op_token().unwrap().kind() == SyntaxKind::OR_KW); - // On the left hand side we have a LITERAL token + // On the left hand side we have a `BOOLEAN_TERM` node let lhs = boolean_expr.lhs().unwrap(); let lhs_literal = match &lhs { Expression::BooleanTerm(l) => l, _ => unreachable!(), }; + // It contains a `VARIABLE` node + // which is essentially a variable token $a assert!(lhs_literal.variable_token().unwrap().kind() == SyntaxKind::VARIABLE); assert_eq!(lhs_literal.variable_token().unwrap().text(), "$a"); - // On the right hand side we have a `BOOLEAN_EXPT` node + // On the right hand side we have a `BOOLEAN_EXPR` node let rhs = boolean_expr.rhs().unwrap(); - - // It contains prefix expression 
which is essentially a `BOOLEAN_TERM` node - // in this case we have `NOT` node and nested `VARIABLE` node let rhs_literal = match &rhs { Expression::BooleanExpr(r) => r, _ => unreachable!(), }; + // As it is again an expression we can obtain the operator, + // left hand side and right hand side let lhs_of_rhs = rhs_literal.lhs().unwrap(); + // In this case we only have a left hand side + // which is a `BOOLEAN_TERM` node and contains a `NOT` operator + // followed by a `BOOLEAN_TERM` node with a `BOOL_LIT` token let lhs = match &lhs_of_rhs { Expression::BooleanTerm(l) => l, _ => unreachable!(), @@ -191,6 +199,7 @@ fn api_walktrough() { // Syntax node have also bunch of methods // for example we can obtain the parent node + // Parent node is whole condition block let parent = expression_stmt_syntax.parent().unwrap(); assert_eq!(parent.kind(), SyntaxKind::CONDITION); assert_eq!(parent.text().to_string(), "condition:\n $a or not true"); @@ -267,6 +276,9 @@ fn api_walktrough() { let parse_struct = SourceFile::parse(source_code); // There are some errors + // First three are for invalid pattern declaration in strings section + // The rest is for an invalid part of a condition + //(nor operator is invalid so also everything after it in condition block) assert!(!parse_struct.errors().is_empty()); assert!(parse_struct.errors().len() == 6); assert!( @@ -293,7 +305,7 @@ fn api_walktrough() { }; // The operator is wrong, therefore we only have - // a variable + // a variable in expression statement assert!(boolean_term.variable_token().unwrap().kind() == SyntaxKind::VARIABLE); // and we can obtain the error token @@ -304,6 +316,7 @@ fn api_walktrough() { .unwrap(); assert!(error_token.kind() == SyntaxKind::ERROR); + // it contains the token for wrong operator inside (as an identifier) assert!(error_token.as_node().unwrap().text() == "nor"); } // We can also search a token that produced the error diff --git a/src/parser/grammar/expressions.rs 
b/src/parser/grammar/expressions.rs index f4f9d67..f57b8f7 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -1,5 +1,6 @@ use super::*; +// Pattern modifiers const PATTERN_MODIFIERS_SET: TokenSet = TokenSet::new(&[ T![ascii], T![wide], @@ -28,8 +29,8 @@ pub(crate) fn block_expr(p: &mut Parser) { } /// Parse a rule body -/// A rule body consists of `strings` and `condition` blocks -/// `strings` part is optional but condition is required +/// A rule body consists of `meta`, `strings` and `condition` blocks +/// `strings` or `meta` part is optional but condition is required /// but each of them can be defined only once and have to be in order pub(super) fn rule_body(p: &mut Parser) { let mut has_strings = false; @@ -69,10 +70,6 @@ pub(super) fn rule_body(p: &mut Parser) { if has_condition { p.err_and_bump("invalid yara expression"); } else { - // It did not contain strings or condition in valid form - // but we can still try to parse their body and throw an error for parent - // for now it just looks at next 2 tokens to differenciate between valid strings - // body or condition body. This should probably be adjusted later p.err_and_bump("expected meta, strings or condition keyword"); } } @@ -327,7 +324,7 @@ enum Associativity { Right, } -/// Binding powers of operators for a Pratt parser. +/// Binding powers of operators in top level (boolean expression) for a Pratt parser. fn current_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { match p.current() { T![and] => (4, T![and], Associativity::Left), @@ -336,6 +333,7 @@ fn current_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { } } +/// Binding powers of operators in second level for a Pratt parser. 
fn expr_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { match p.current() { T![|] => (10, T![|], Associativity::Left), @@ -353,6 +351,7 @@ fn expr_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { } } +/// Binding powers of operators in third level for a Pratt parser. fn expr_stmt_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { match p.current() { T![==] => (6, T![==], Associativity::Left), @@ -379,6 +378,8 @@ fn expr_stmt_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { /// This is also used to reflect operator precedence and associativity /// It is inspired by Pratt parser used in rust-analyter /// +/// +/// There are multiple layers of Pratt parser used to parse different levels of expressions fn boolean_expr(p: &mut Parser, m: Option, bp: u8) -> Option { let m = m.unwrap_or_else(|| p.start()); let mut lhs = match boolean_term(p) { @@ -409,6 +410,8 @@ fn boolean_expr(p: &mut Parser, m: Option, bp: u8) -> Option Option { let m = p.start(); match p.current() { @@ -445,6 +448,7 @@ fn boolean_term(p: &mut Parser) -> Option { for_expr(p); } _ => { + // Calculate the length of primary expression let mut parentheses_count = 0; let mut primary_expr_len = primary_expr_length(p, 0, &mut parentheses_count); if parentheses_count != 0 { @@ -457,6 +461,8 @@ fn boolean_term(p: &mut Parser) -> Option { primary_expr_len += 1; } + // Decide if it is a primary expression, of expression or a statement + // If it is a primary expression, we need to check if it is followed by "of" keyword if p.at(T!['(']) && primary_expr_len == 0 { p.bump(T!['(']); boolean_expr(p, None, 1); @@ -476,6 +482,7 @@ fn boolean_term(p: &mut Parser) -> Option { Some(cm) } +/// Pratt parser for parsing expression statements layer fn expr_stmt(p: &mut Parser, m: Option, bp: u8) -> Option { let m = m.unwrap_or_else(|| p.start()); let mut lhs = match expr(p, None, bp) { @@ -504,6 +511,7 @@ fn expr_stmt(p: &mut Parser, m: Option, bp: u8) -> Option, bp: u8) -> Option { let m = 
m.unwrap_or_else(|| p.start()); let mut lhs = match term(p) { @@ -532,6 +540,8 @@ fn expr(p: &mut Parser, m: Option, bp: u8) -> Option { Some(lhs) } +/// Parse a term +/// It can be a primary expression, indexing expression or function call expression fn term(p: &mut Parser) -> Option { let m = p.start(); let pe = primary_expr(p); @@ -568,6 +578,9 @@ fn term(p: &mut Parser) -> Option { Some(cm) } +/// Parse a primary expression +/// It can be a float, int, string, variable count, variable offset, variable length, +/// filesize, entrypoint, regex pattern, unary expression or identifier fn primary_expr(p: &mut Parser) -> Option { let m = p.start(); match p.current() { @@ -639,6 +652,7 @@ fn primary_expr(p: &mut Parser) -> Option { Some(cm) } +/// Parse a variable count expression fn variable_count(p: &mut Parser) { let m = p.start(); p.bump(T![variable_count]); @@ -651,6 +665,7 @@ fn variable_count(p: &mut Parser) { m.complete(p, VARIABLE_COUNT); } +/// Parse a variable offset expression fn variable_offset(p: &mut Parser) { let m = p.start(); p.bump(T![variable_offset]); @@ -664,6 +679,7 @@ fn variable_offset(p: &mut Parser) { m.complete(p, VARIABLE_OFFSET); } +/// Parse a variable length expression fn variable_length(p: &mut Parser) { let m = p.start(); p.bump(T![variable_length]); @@ -677,6 +693,7 @@ fn variable_length(p: &mut Parser) { m.complete(p, VARIABLE_LENGTH); } +/// Parse a range expression fn range(p: &mut Parser) { let m = p.start(); p.expect(T!['(']); @@ -687,6 +704,7 @@ fn range(p: &mut Parser) { m.complete(p, RANGE); } +/// Parse an of expression fn of_expr(p: &mut Parser) { let m = p.start(); @@ -728,6 +746,7 @@ fn of_expr(p: &mut Parser) { m.complete(p, OF_EXPR); } +/// Parse a for expression fn for_expr(p: &mut Parser) { let m = p.start(); p.expect(T![for]); @@ -751,6 +770,7 @@ fn for_expr(p: &mut Parser) { m.complete(p, FOR_EXPR); } +/// Parse a quantifier expression fn quantifier(p: &mut Parser) { let m = p.start(); match p.current() { @@ -773,6 
+793,7 @@ fn quantifier(p: &mut Parser) { m.complete(p, QUANTIFIER); } +/// Parse an iterable expression fn iterable(p: &mut Parser) { match p.current() { T!['('] => { @@ -801,6 +822,7 @@ } } +/// Parse a boolean expression tuple fn boolean_expr_tuple(p: &mut Parser) { let m = p.start(); p.expect(T!['(']); @@ -813,6 +835,7 @@ m.complete(p, BOOLEAN_EXPR_TUPLE); } +/// Parse a pattern identifier tuple fn pattern_ident_tupple(p: &mut Parser) { let m = p.start(); p.expect(T!['(']); @@ -825,6 +848,7 @@ m.complete(p, PATTERN_IDENT_TUPLE); } +/// Parse an identifier tuple fn ident_tuple(p: &mut Parser) { let m = p.start(); p.expect(T![identifier]); @@ -834,6 +858,7 @@ } } +/// Parse an ident list expression fn ident_list(p: &mut Parser) { let m = p.start(); p.expect(T![,]); @@ -841,6 +866,7 @@ m.complete(p, IDENTIFIER_NODE); } +/// Parse a variable wildcard expression fn variable_wildcard(p: &mut Parser) { let m = p.start(); p.expect(T![variable]); @@ -850,6 +876,9 @@ m.complete(p, VARIABLE_WILDCARD); } +/// Calculate the length of primary expression +/// It is used to decide if the expression is a primary expression, of expression or a statement +/// and to determine if primary expression is in valid form fn primary_expr_length(p: &mut Parser, len: usize, parentheses_count: &mut i32) -> usize { match p.nth(len) { T![float_lit] | T![int_lit] | T![string_lit] | T![filesize] | T![entrypoint] => len + 1, @@ -870,6 +899,7 @@ } } +/// Calculate the length of regex pattern expression fn regex_pattern_length(p: &mut Parser, mut len: usize) -> usize { // Check if the pattern starts with `/` and ends with `/` if p.nth(len) == T![/] && p.nth(len + 1) == REGEX_LIT && p.nth(len + 2) == T![/] { @@ 
-884,6 +914,7 @@ fn regex_pattern_length(p: &mut Parser, mut len: usize) -> usize { len } +/// Calculate the length of variable count expression fn variable_count_length(p: &mut Parser, mut len: usize) -> usize { len += 1; if p.nth(len) == T![in] { @@ -893,6 +924,7 @@ fn variable_count_length(p: &mut Parser, mut len: usize) -> usize { len } +/// Calculate the length of range expression fn range_length(p: &mut Parser, mut len: usize) -> usize { len += 1; len = expr_length(p, len, &mut 0); @@ -901,6 +933,7 @@ fn range_length(p: &mut Parser, mut len: usize) -> usize { len + 1 } +/// Calculate the length of variable offset expression fn variable_offset_length(p: &mut Parser, mut len: usize) -> usize { len += 1; if p.nth(len) == T!['['] { @@ -911,6 +944,7 @@ fn variable_offset_length(p: &mut Parser, mut len: usize) -> usize { len } +/// Calculate the length of variable length expression fn variable_length_length(p: &mut Parser, mut len: usize) -> usize { len += 1; if p.nth(len) == T!['['] { @@ -921,6 +955,7 @@ fn variable_length_length(p: &mut Parser, mut len: usize) -> usize { len } +/// Calculate the length of term expression fn term_length(p: &mut Parser, mut len: usize, parentheses_count: &mut i32) -> usize { len = primary_expr_length(p, len, parentheses_count); @@ -948,6 +983,7 @@ fn term_length(p: &mut Parser, mut len: usize, parentheses_count: &mut i32) -> u } } +/// Calculate the length of expression fn expr_length(p: &mut Parser, mut len: usize, parentheses_count: &mut i32) -> usize { // Check if the expression starts with `(` if p.nth(len) == T!['('] { diff --git a/src/syntax/ast.rs b/src/syntax/ast.rs index 031cd4e..05bba19 100644 --- a/src/syntax/ast.rs +++ b/src/syntax/ast.rs @@ -85,6 +85,7 @@ impl Iterator for AstChildren { } } +/// Helper module to work with AST nodes and obtaining their children pub mod support { use super::{AstChildren, AstNode, SyntaxKind, SyntaxNode, SyntaxToken}; diff --git a/src/syntax/ast/expr_ext.rs b/src/syntax/ast/expr_ext.rs 
index c169c75..701e303 100644 --- a/src/syntax/ast/expr_ext.rs +++ b/src/syntax/ast/expr_ext.rs @@ -1,7 +1,11 @@ -//! Similary as `operators.rs` it contains various extensions -//! and methods for `ast::Expr` and `ast::Literal` nodes. -//! `LiteralKind` type will probably also be changed during integration -//! It is for now just to showcase its abilities +//! Extension traits for the numerous AST nodes in typed layer +//! These traits provide additional functionality to the AST nodes +//! that is not provided by the generated code. +//! This is the only place where we should add functionality +//! and methods that we want to use in the typed layer. +//! +//! It provides operators methods and methods for obtaining left-hand side and right-hand side +//! of expressions use crate::{ syntax::ast::{ @@ -12,20 +16,6 @@ use crate::{ SyntaxToken, T, }; -//impl ast::PrefixExpr { -// pub fn op_kind(&self) -> Option { -// let res = match self.op_token()?.kind() { -// T![not] => UnaryOp::Not, -// _ => return None, -// }; -// Some(res) -// } -// -// pub fn op_token(&self) -> Option { -// self.syntax().first_child_or_token()?.into_token() -// } -//} -// impl ast::ExprBody { pub fn op_details(&self) -> Option<(SyntaxToken, BinaryOp)> { self.syntax().children_with_tokens().filter_map(|it| it.into_token()).find_map(|c| { @@ -193,41 +183,3 @@ impl ast::VariableWildcard { } } } -// -//#[derive(Clone, Debug, PartialEq, Eq, Hash)] -//pub enum LiteralKind { -// String(ast::StringLit), -// Int(ast::IntLit), -// Float(ast::FloatLit), -// Variable(ast::Variable), -// Bool(ast::BoolLit), -//} - -//impl ast::Literal { -// pub fn token(&self) -> SyntaxToken { -// self.syntax() -// .children_with_tokens() -// .find(|e| !e.kind().is_trivia()) -// .and_then(|e| e.into_token()) -// .unwrap() -// } -// -// pub fn kind(&self) -> LiteralKind { -// let token = self.token(); -// -// if let Some(number) = ast::IntLit::cast(token.clone()) { -// LiteralKind::Int(number) -// } else if let Some(number) = 
ast::FloatLit::cast(token.clone()) { -// LiteralKind::Float(number) -// } else if let Some(variable) = ast::Variable::cast(token.clone()) { -// LiteralKind::Variable(variable) -// } else if let Some(string) = ast::StringLit::cast(token.clone()) { -// LiteralKind::String(string) -// } else if let Some(boolean) = ast::BoolLit::cast(token.clone()) { -// LiteralKind::Bool(boolean) -// } else { -// unreachable!("Unknown literal kind") -// } -// } -//} -// diff --git a/src/syntax/ast/operators.rs b/src/syntax/ast/operators.rs index 6ec0e32..ac465f0 100644 --- a/src/syntax/ast/operators.rs +++ b/src/syntax/ast/operators.rs @@ -1,6 +1,4 @@ //! Implementations of operators for the AST. -//! This uses own type and it can be reused for both AST and HIR -//! This will probably be change during parser integration into `YARA-X` use std::fmt; diff --git a/src/syntax/syntax_error.rs b/src/syntax/syntax_error.rs index cb6fab0..154f6e8 100644 --- a/src/syntax/syntax_error.rs +++ b/src/syntax/syntax_error.rs @@ -9,17 +9,22 @@ use text_size::{TextRange, TextSize}; pub struct SyntaxError(String, TextRange); impl SyntaxError { + /// Create a new error with a message and a range pub fn new(message: impl Into, range: TextRange) -> Self { Self(message.into(), range) } + + /// Create a new error with a message and an offset pub fn new_at_offset(message: impl Into, offset: TextSize) -> Self { Self(message.into(), TextRange::empty(offset)) } + /// Get the range pub fn range(&self) -> TextRange { self.1 } + /// Connect range to the error pub fn with_range(mut self, range: TextRange) -> Self { self.1 = range; self diff --git a/src/syntax/syntax_node.rs b/src/syntax/syntax_node.rs index b52ba63..a3afb80 100644 --- a/src/syntax/syntax_node.rs +++ b/src/syntax/syntax_node.rs @@ -18,10 +18,12 @@ pub enum YARALanguage {} impl Language for YARALanguage { type Kind = SyntaxKind; + /// Convert from raw kind to SyntaxKind fn kind_from_raw(raw: rowan_test::SyntaxKind) -> SyntaxKind { 
SyntaxKind::from(raw.0) } + /// Convert from SyntaxKind to raw kind fn kind_to_raw(kind: SyntaxKind) -> rowan_test::SyntaxKind { rowan_test::SyntaxKind(kind.into()) } @@ -43,25 +45,30 @@ pub struct SyntaxTreeBuilder { } impl SyntaxTreeBuilder { + /// Finish building the tree and return the result as a pair of GreenNode and list of errors pub(crate) fn finish_raw(self) -> (GreenNode, Vec) { let green = self.inner.finish(); (green, self.errors) } + /// Create a token pub fn token(&mut self, kind: SyntaxKind, text: &str) { let kind = YARALanguage::kind_to_raw(kind); self.inner.token(kind, text) } + /// Start a new node pub fn start_node(&mut self, kind: SyntaxKind) { let kind = YARALanguage::kind_to_raw(kind); self.inner.start_node(kind) } + /// Finish the current node pub fn finish_node(&mut self) { self.inner.finish_node() } + /// Add a new syntax error to the list of errors pub fn error(&mut self, error: parser::ParseError, text_pos: TextSize) { self.errors.push(SyntaxError::new_at_offset(error.0, text_pos)) } diff --git a/src/syntax/tests/ast_src.rs b/src/syntax/tests/ast_src.rs index 9f2ae83..f6ece2c 100644 --- a/src/syntax/tests/ast_src.rs +++ b/src/syntax/tests/ast_src.rs @@ -8,6 +8,7 @@ pub(crate) struct KindsSrc<'a> { pub(crate) nodes: &'a [&'a str], } +// Definition of tokens and nodes for AST generation pub(crate) const KINDS_SRC: KindsSrc = KindsSrc { punct: &[ (":", "COLON"), diff --git a/src/syntax/tests/sourcegen_ast.rs b/src/syntax/tests/sourcegen_ast.rs index f722ca5..99aa079 100644 --- a/src/syntax/tests/sourcegen_ast.rs +++ b/src/syntax/tests/sourcegen_ast.rs @@ -3,6 +3,7 @@ //! It uses `ungrammar` crate to parse `yara.ungram` file and generate AST //! It is not a grammar, it does not validate anything. Just generates methods //! and types for AST layer +//! 
It is inspired by rust-analyzer's code generation using ungrammar use std::{collections::HashSet, fmt::Write}; @@ -634,7 +635,6 @@ fn lower_comma_list( true } -//TODO: possible deduplication and enum extraction and struct traits, so far not needed fn extract_struct_traits(ast: &mut AstSrc) { let nodes_with_comments = ["SourceFile", "Rule", "BlockExpr", "Strings", "Condition"]; diff --git a/src/syntax/text_token_source.rs b/src/syntax/text_token_source.rs index f52df84..9b977d3 100644 --- a/src/syntax/text_token_source.rs +++ b/src/syntax/text_token_source.rs @@ -15,14 +15,17 @@ pub(crate) struct TextTokenSource<'t> { } impl<'t> TokenSource for TextTokenSource<'t> { + /// Returns the current token fn current(&self) -> parser::Token { self.curr.0 } + /// Lookahead `n` tokens fn lookahead_nth(&self, n: usize) -> parser::Token { mk_token(self.curr.1 + n, &self.token_offset_pairs) } + /// Bumps the current token fn bump(&mut self) { if self.curr.0.kind == EOF { return; @@ -32,6 +35,7 @@ impl<'t> TokenSource for TextTokenSource<'t> { self.curr = (mk_token(pos, &self.token_offset_pairs), pos); } + /// Check if the current token is specific `SyntaxKind` kind fn is_keyword(&self, kw: &str) -> bool { self.token_offset_pairs .get(self.curr.1) @@ -40,6 +44,7 @@ impl<'t> TokenSource for TextTokenSource<'t> { } } +/// Create a token from a position fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token { let (kind, is_jointed_to_next) = match token_offset_pairs.get(pos) { Some((token, offset)) => ( diff --git a/src/syntax/text_tree_sink.rs b/src/syntax/text_tree_sink.rs index 589e4f3..ef6e29d 100644 --- a/src/syntax/text_tree_sink.rs +++ b/src/syntax/text_tree_sink.rs @@ -25,6 +25,7 @@ enum State { } impl<'a> TreeSink for TextTreeSink<'a> { + /// Attach a token to the current node fn token(&mut self, kind: SyntaxKind, n_tokens: u8) { match mem::replace(&mut self.state, State::Normal) { State::PendingStart => unreachable!(), @@ -42,6 +43,8 @@ impl<'a> 
TreeSink for TextTreeSink<'a> { self.do_token(kind, len, n_tokens); } + /// Start a new node + /// This method also handles attaching trivia to the node fn start_node(&mut self, kind: SyntaxKind) { match mem::replace(&mut self.state, State::Normal) { State::PendingStart => { @@ -74,6 +77,7 @@ impl<'a> TreeSink for TextTreeSink<'a> { self.eat_n_trivias(n_attached_trivias); } + /// Finish the current node fn finish_node(&mut self) { match mem::replace(&mut self.state, State::PendingFinish) { State::PendingStart => unreachable!(), @@ -113,6 +117,7 @@ impl<'a> TextTreeSink<'a> { self.inner.finish_raw() } + /// Consumes trivias until the next non-trivia token fn eat_trivias(&mut self) { while let Some(&token) = self.tokens.get(self.token_pos) { if !token.kind.is_trivia() { @@ -122,6 +127,7 @@ impl<'a> TextTreeSink<'a> { } } + /// Consumes n trivias fn eat_n_trivias(&mut self, n: usize) { for _ in 0..n { let token = self.tokens[self.token_pos]; @@ -130,6 +136,7 @@ impl<'a> TextTreeSink<'a> { } } + /// Consumes a token and attaches it to the current node fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) { let range = TextRange::at(self.text_pos, len); let text = &self.text[range]; @@ -139,6 +146,9 @@ impl<'a> TextTreeSink<'a> { } } +/// Returns the number of attached trivias for the given node kind +/// Trivias are attached to the node if they are directly before the node +/// and there is no empty line between the trivia and the node fn n_attached_trivias<'a>( kind: SyntaxKind, trivias: impl Iterator, diff --git a/tests/test38.in b/tests/test_advanced_for_expression.in similarity index 100% rename from tests/test38.in rename to tests/test_advanced_for_expression.in diff --git a/tests/test38.out b/tests/test_advanced_for_expression.out similarity index 100% rename from tests/test38.out rename to tests/test_advanced_for_expression.out diff --git a/tests/test33.in b/tests/test_advanced_of_expression.in similarity index 100% rename from tests/test33.in 
rename to tests/test_advanced_of_expression.in diff --git a/tests/test33.out b/tests/test_advanced_of_expression.out similarity index 100% rename from tests/test33.out rename to tests/test_advanced_of_expression.out diff --git a/tests/test47.in b/tests/test_advanced_pattern_expression.in similarity index 100% rename from tests/test47.in rename to tests/test_advanced_pattern_expression.in diff --git a/tests/test47.out b/tests/test_advanced_pattern_expression.out similarity index 100% rename from tests/test47.out rename to tests/test_advanced_pattern_expression.out diff --git a/tests/test18.in b/tests/test_advanced_regex.in similarity index 100% rename from tests/test18.in rename to tests/test_advanced_regex.in diff --git a/tests/test18.out b/tests/test_advanced_regex.out similarity index 100% rename from tests/test18.out rename to tests/test_advanced_regex.out diff --git a/tests/test48.in b/tests/test_any_of_wildcard_expression.in similarity index 100% rename from tests/test48.in rename to tests/test_any_of_wildcard_expression.in diff --git a/tests/test48.out b/tests/test_any_of_wildcard_expression.out similarity index 100% rename from tests/test48.out rename to tests/test_any_of_wildcard_expression.out diff --git a/tests/test27.in b/tests/test_at_expression.in similarity index 100% rename from tests/test27.in rename to tests/test_at_expression.in diff --git a/tests/test27.out b/tests/test_at_expression.out similarity index 100% rename from tests/test27.out rename to tests/test_at_expression.out diff --git a/tests/test14.in b/tests/test_base64_custom_alphabet.in similarity index 100% rename from tests/test14.in rename to tests/test_base64_custom_alphabet.in diff --git a/tests/test14.out b/tests/test_base64_custom_alphabet.out similarity index 100% rename from tests/test14.out rename to tests/test_base64_custom_alphabet.out diff --git a/tests/test1.in b/tests/test_basic_string.in similarity index 100% rename from tests/test1.in rename to tests/test_basic_string.in 
diff --git a/tests/test1.out b/tests/test_basic_string.out similarity index 100% rename from tests/test1.out rename to tests/test_basic_string.out diff --git a/tests/test43.in b/tests/test_bitwise_negation_expression.in similarity index 100% rename from tests/test43.in rename to tests/test_bitwise_negation_expression.in diff --git a/tests/test43.out b/tests/test_bitwise_negation_expression.out similarity index 100% rename from tests/test43.out rename to tests/test_bitwise_negation_expression.out diff --git a/tests/test49.in b/tests/test_comments_inside_regex.in similarity index 100% rename from tests/test49.in rename to tests/test_comments_inside_regex.in diff --git a/tests/test49.out b/tests/test_comments_inside_regex.out similarity index 100% rename from tests/test49.out rename to tests/test_comments_inside_regex.out diff --git a/tests/test20.in b/tests/test_dot_expression.in similarity index 100% rename from tests/test20.in rename to tests/test_dot_expression.in diff --git a/tests/test20.out b/tests/test_dot_expression.out similarity index 100% rename from tests/test20.out rename to tests/test_dot_expression.out diff --git a/tests/test39.in b/tests/test_field_access_expression.in similarity index 100% rename from tests/test39.in rename to tests/test_field_access_expression.in diff --git a/tests/test39.out b/tests/test_field_access_expression.out similarity index 100% rename from tests/test39.out rename to tests/test_field_access_expression.out diff --git a/tests/test24.in b/tests/test_filesize_entrypoint.in similarity index 100% rename from tests/test24.in rename to tests/test_filesize_entrypoint.in diff --git a/tests/test24.out b/tests/test_filesize_entrypoint.out similarity index 100% rename from tests/test24.out rename to tests/test_filesize_entrypoint.out diff --git a/tests/test37.in b/tests/test_for_expression.in similarity index 100% rename from tests/test37.in rename to tests/test_for_expression.in diff --git a/tests/test37.out 
b/tests/test_for_expression.out similarity index 100% rename from tests/test37.out rename to tests/test_for_expression.out diff --git a/tests/test40.in b/tests/test_function_call_expression.in similarity index 100% rename from tests/test40.in rename to tests/test_function_call_expression.in diff --git a/tests/test40.out b/tests/test_function_call_expression.out similarity index 100% rename from tests/test40.out rename to tests/test_function_call_expression.out diff --git a/tests/test16.in b/tests/test_hex_string.in similarity index 100% rename from tests/test16.in rename to tests/test_hex_string.in diff --git a/tests/test16.out b/tests/test_hex_string.out similarity index 100% rename from tests/test16.out rename to tests/test_hex_string.out diff --git a/tests/test5.in b/tests/test_identifier_expr.in similarity index 100% rename from tests/test5.in rename to tests/test_identifier_expr.in diff --git a/tests/test5.out b/tests/test_identifier_expr.out similarity index 100% rename from tests/test5.out rename to tests/test_identifier_expr.out diff --git a/tests/test12.in b/tests/test_imports.in similarity index 100% rename from tests/test12.in rename to tests/test_imports.in diff --git a/tests/test12.out b/tests/test_imports.out similarity index 100% rename from tests/test12.out rename to tests/test_imports.out diff --git a/tests/test26.in b/tests/test_in_expression.in similarity index 100% rename from tests/test26.in rename to tests/test_in_expression.in diff --git a/tests/test26.out b/tests/test_in_expression.out similarity index 100% rename from tests/test26.out rename to tests/test_in_expression.out diff --git a/tests/test6.err b/tests/test_invalid_expr.err similarity index 100% rename from tests/test6.err rename to tests/test_invalid_expr.err diff --git a/tests/test6.in b/tests/test_invalid_expr.in similarity index 100% rename from tests/test6.in rename to tests/test_invalid_expr.in diff --git a/tests/test6.out b/tests/test_invalid_expr.out similarity index 100% 
rename from tests/test6.out rename to tests/test_invalid_expr.out diff --git a/tests/test4.err b/tests/test_invalid_string.err similarity index 100% rename from tests/test4.err rename to tests/test_invalid_string.err diff --git a/tests/test4.in b/tests/test_invalid_string.in similarity index 100% rename from tests/test4.in rename to tests/test_invalid_string.in diff --git a/tests/test4.out b/tests/test_invalid_string.out similarity index 100% rename from tests/test4.out rename to tests/test_invalid_string.out diff --git a/tests/test8.err b/tests/test_invalid_strings_section.err similarity index 100% rename from tests/test8.err rename to tests/test_invalid_strings_section.err diff --git a/tests/test8.in b/tests/test_invalid_strings_section.in similarity index 100% rename from tests/test8.in rename to tests/test_invalid_strings_section.in diff --git a/tests/test8.out b/tests/test_invalid_strings_section.out similarity index 100% rename from tests/test8.out rename to tests/test_invalid_strings_section.out diff --git a/tests/test9.in b/tests/test_meta.in similarity index 100% rename from tests/test9.in rename to tests/test_meta.in diff --git a/tests/test9.out b/tests/test_meta.out similarity index 100% rename from tests/test9.out rename to tests/test_meta.out diff --git a/tests/test3.in b/tests/test_multiple_boolean_expr.in similarity index 100% rename from tests/test3.in rename to tests/test_multiple_boolean_expr.in diff --git a/tests/test3.out b/tests/test_multiple_boolean_expr.out similarity index 100% rename from tests/test3.out rename to tests/test_multiple_boolean_expr.out diff --git a/tests/test22.in b/tests/test_multiple_expressions.in similarity index 100% rename from tests/test22.in rename to tests/test_multiple_expressions.in diff --git a/tests/test22.out b/tests/test_multiple_expressions.out similarity index 100% rename from tests/test22.out rename to tests/test_multiple_expressions.out diff --git a/tests/test42.in b/tests/test_negative_number_expression.in 
similarity index 100% rename from tests/test42.in rename to tests/test_negative_number_expression.in diff --git a/tests/test42.out b/tests/test_negative_number_expression.out similarity index 100% rename from tests/test42.out rename to tests/test_negative_number_expression.out diff --git a/tests/test41.in b/tests/test_nested_function_call_expression.in similarity index 100% rename from tests/test41.in rename to tests/test_nested_function_call_expression.in diff --git a/tests/test41.out b/tests/test_nested_function_call_expression.out similarity index 100% rename from tests/test41.out rename to tests/test_nested_function_call_expression.out diff --git a/tests/test23.in b/tests/test_not_defined.in similarity index 100% rename from tests/test23.in rename to tests/test_not_defined.in diff --git a/tests/test23.out b/tests/test_not_defined.out similarity index 100% rename from tests/test23.out rename to tests/test_not_defined.out diff --git a/tests/test29.in b/tests/test_of_expression.in similarity index 100% rename from tests/test29.in rename to tests/test_of_expression.in diff --git a/tests/test29.out b/tests/test_of_expression.out similarity index 100% rename from tests/test29.out rename to tests/test_of_expression.out diff --git a/tests/test34.in b/tests/test_of_expression_tuple.in similarity index 100% rename from tests/test34.in rename to tests/test_of_expression_tuple.in diff --git a/tests/test34.out b/tests/test_of_expression_tuple.out similarity index 100% rename from tests/test34.out rename to tests/test_of_expression_tuple.out diff --git a/tests/test30.in b/tests/test_of_pattern_tuple.in similarity index 100% rename from tests/test30.in rename to tests/test_of_pattern_tuple.in diff --git a/tests/test30.out b/tests/test_of_pattern_tuple.out similarity index 100% rename from tests/test30.out rename to tests/test_of_pattern_tuple.out diff --git a/tests/test32.in b/tests/test_of_percentage.in similarity index 100% rename from tests/test32.in rename to 
tests/test_of_percentage.in diff --git a/tests/test32.out b/tests/test_of_percentage.out similarity index 100% rename from tests/test32.out rename to tests/test_of_percentage.out diff --git a/tests/test2.in b/tests/test_or_expr.in similarity index 100% rename from tests/test2.in rename to tests/test_or_expr.in diff --git a/tests/test2.out b/tests/test_or_expr.out similarity index 100% rename from tests/test2.out rename to tests/test_or_expr.out diff --git a/tests/test44.in b/tests/test_pattern_count_expression.in similarity index 100% rename from tests/test44.in rename to tests/test_pattern_count_expression.in diff --git a/tests/test44.out b/tests/test_pattern_count_expression.out similarity index 100% rename from tests/test44.out rename to tests/test_pattern_count_expression.out diff --git a/tests/test46.in b/tests/test_pattern_length_expression.in similarity index 100% rename from tests/test46.in rename to tests/test_pattern_length_expression.in diff --git a/tests/test46.out b/tests/test_pattern_length_expression.out similarity index 100% rename from tests/test46.out rename to tests/test_pattern_length_expression.out diff --git a/tests/test45.in b/tests/test_pattern_offset_expression.in similarity index 100% rename from tests/test45.in rename to tests/test_pattern_offset_expression.in diff --git a/tests/test45.out b/tests/test_pattern_offset_expression.out similarity index 100% rename from tests/test45.out rename to tests/test_pattern_offset_expression.out diff --git a/tests/test28.in b/tests/test_precedence_advanced_expression.in similarity index 100% rename from tests/test28.in rename to tests/test_precedence_advanced_expression.in diff --git a/tests/test28.out b/tests/test_precedence_advanced_expression.out similarity index 100% rename from tests/test28.out rename to tests/test_precedence_advanced_expression.out diff --git a/tests/test7.in b/tests/test_precedence_expr.in similarity index 100% rename from tests/test7.in rename to tests/test_precedence_expr.in 
diff --git a/tests/test7.out b/tests/test_precedence_expr.out similarity index 100% rename from tests/test7.out rename to tests/test_precedence_expr.out diff --git a/tests/test19.in b/tests/test_primary_expression.in similarity index 100% rename from tests/test19.in rename to tests/test_primary_expression.in diff --git a/tests/test19.out b/tests/test_primary_expression.out similarity index 100% rename from tests/test19.out rename to tests/test_primary_expression.out diff --git a/tests/test35.in b/tests/test_primary_expression_precedence.in similarity index 100% rename from tests/test35.in rename to tests/test_primary_expression_precedence.in diff --git a/tests/test35.out b/tests/test_primary_expression_precedence.out similarity index 100% rename from tests/test35.out rename to tests/test_primary_expression_precedence.out diff --git a/tests/test36.in b/tests/test_primary_expression_precedence2.in similarity index 100% rename from tests/test36.in rename to tests/test_primary_expression_precedence2.in diff --git a/tests/test36.out b/tests/test_primary_expression_precedence2.out similarity index 100% rename from tests/test36.out rename to tests/test_primary_expression_precedence2.out diff --git a/tests/test21.in b/tests/test_primary_expression_standalone.in similarity index 100% rename from tests/test21.in rename to tests/test_primary_expression_standalone.in diff --git a/tests/test21.out b/tests/test_primary_expression_standalone.out similarity index 100% rename from tests/test21.out rename to tests/test_primary_expression_standalone.out diff --git a/tests/test31.in b/tests/test_primary_expression_tuple.in similarity index 100% rename from tests/test31.in rename to tests/test_primary_expression_tuple.in diff --git a/tests/test31.out b/tests/test_primary_expression_tuple.out similarity index 100% rename from tests/test31.out rename to tests/test_primary_expression_tuple.out diff --git a/tests/test17.in b/tests/test_regex.in similarity index 100% rename from 
tests/test17.in rename to tests/test_regex.in diff --git a/tests/test17.out b/tests/test_regex.out similarity index 100% rename from tests/test17.out rename to tests/test_regex.out diff --git a/tests/test25.in b/tests/test_regex_in_condition.in similarity index 100% rename from tests/test25.in rename to tests/test_regex_in_condition.in diff --git a/tests/test25.out b/tests/test_regex_in_condition.out similarity index 100% rename from tests/test25.out rename to tests/test_regex_in_condition.out diff --git a/tests/test10.in b/tests/test_rule_modifiers.in similarity index 100% rename from tests/test10.in rename to tests/test_rule_modifiers.in diff --git a/tests/test10.out b/tests/test_rule_modifiers.out similarity index 100% rename from tests/test10.out rename to tests/test_rule_modifiers.out diff --git a/tests/test11.in b/tests/test_rule_tags.in similarity index 100% rename from tests/test11.in rename to tests/test_rule_tags.in diff --git a/tests/test11.out b/tests/test_rule_tags.out similarity index 100% rename from tests/test11.out rename to tests/test_rule_tags.out diff --git a/tests/test13.in b/tests/test_string_pattern_modifiers.in similarity index 100% rename from tests/test13.in rename to tests/test_string_pattern_modifiers.in diff --git a/tests/test13.out b/tests/test_string_pattern_modifiers.out similarity index 100% rename from tests/test13.out rename to tests/test_string_pattern_modifiers.out diff --git a/tests/test15.in b/tests/test_xor_modifier.in similarity index 100% rename from tests/test15.in rename to tests/test_xor_modifier.in diff --git a/tests/test15.out b/tests/test_xor_modifier.out similarity index 100% rename from tests/test15.out rename to tests/test_xor_modifier.out diff --git a/yara.ungram b/yara.ungram index 2b0f462..c689e02 100644 --- a/yara.ungram +++ b/yara.ungram @@ -1,3 +1,11 @@ +// This is the ungrammar for the YARA language +// It follows the syntax of the YARA language as described in the official documentation +// and PEST grammar 
used by current YARA-X parser: https://github.com/VirusTotal/yara-x/blob/main/parser/src/parser/grammar.pest +// It is inspired by ungrammar for Rust Language: https://github.com/rust-analyzer/ungrammar/blob/master/rust.ungram +// Some aspects are modified in order to provide easier integration to YARA-X or usage in language server +// This grammar is only used for creation of typed layer on top of the AST. It does not have to be unambiguous, +// as the parsing is not following this structure. + SourceFile = ImportStmt* | IncludeStmt* |Rule* diff --git a/yara_subset.grammar b/yara_subset.grammar deleted file mode 100644 index d0b1894..0000000 --- a/yara_subset.grammar +++ /dev/null @@ -1,14 +0,0 @@ -SOURCE -> RULE | eps. -RULE -> rule identifier lbrace RULEBODY rbrace. -RULEBODY -> STRINGS CONDITION | CONDITION . -STRINGS -> string colon STRINGSBODY. -CONDITION -> condition colon EXPRESSION. -STRINGSBODY -> variable assign string STRINGSBODY | eps. -EXPRESSION -> LITERAL EXPRESSION_2 | NOTOPERATOR EXPRESSION. -EXPRESSION_2 -> OPERATOR EXPRESSION EXPRESSION_2 | eps. -LITERAL -> variable | BOOLEAN. -BOOLEAN -> true | false. -OPERATOR -> and | or. -NOTOPERATOR -> not. - -// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+CONDITION+%7C+CONDITION+.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+EXPRESSION.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0AEXPRESSION+-%3E+LITERAL+EXPRESSION_2+%7C+NOTOPERATOR+EXPRESSION.%0D%0AEXPRESSION_2+-%3E+OPERATOR+EXPRESSION+EXPRESSION_2+%7C+eps.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or.%0D%0ANOTOPERATOR+-%3E+not. \ No newline at end of file