Skip to content

Commit

Permalink
add comments, rename tests
Browse files Browse the repository at this point in the history
  • Loading branch information
TommYDeeee committed Apr 19, 2024
1 parent b3022c3 commit 53164c7
Show file tree
Hide file tree
Showing 117 changed files with 128 additions and 95 deletions.
4 changes: 2 additions & 2 deletions example.yar
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
// Example of YARA rule for testing purposes

import "macho"
include "test"
//Global comment

//Rule comment
rule test : bla test
{
//Rule block comment

meta:
author = "Author"
description = 20
Expand Down
2 changes: 2 additions & 0 deletions src/bin.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::fs;
use yara_parser::{AstNode, SourceFile};

/// This is a simple binary that reads a file and prints the AST and errors.
/// It is used to test the parser.
fn main() {
let filename = std::env::args().nth(1).expect("No arguments provided");
let file_content =
Expand Down
11 changes: 10 additions & 1 deletion src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ impl fmt::Display for LexingError {
}
}

/// Root lexer for YARA language.
#[derive(Logos, Debug, PartialEq)]
#[logos(error = LexingError)]
pub(crate) enum LogosToken {
Expand Down Expand Up @@ -211,6 +212,7 @@ pub(crate) enum LogosToken {
MultilineComment,
}

/// Lexer for hexadecimal string.
#[derive(Logos, Debug, PartialEq)]
#[logos(error = LexingError)]
pub(crate) enum HexLogosToken {
Expand Down Expand Up @@ -267,9 +269,11 @@ pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
let token_range = TextRange::at(offset.try_into().unwrap(), token_len);
let syntaxkind = match token {
Ok(token) => {
// Handle hexadecimal string token separately
if let LogosToken::HexString(hex_string) = token {
process_hex_string_token(hex_string, &mut tokens, &mut errors, &mut offset);
continue;
// Handle regex string token separately
} else if let LogosToken::Regexp(regex) = token {
let detailed_tokens = process_regex_string_token(regex);
for (kind, len) in detailed_tokens {
Expand Down Expand Up @@ -384,6 +388,9 @@ fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind {
}
}

/// Process regex string token to generate detailed tokens
/// This is the representation that YARA-X uses, therefore for an
/// easier integration with YARA-X, we need to keep this representation
fn process_regex_string_token(regex: String) -> Vec<(SyntaxKind, usize)> {
let mut tokens = Vec::new();
let mut chars = regex.chars().peekable();
Expand Down Expand Up @@ -419,7 +426,9 @@ fn process_regex_string_token(regex: String) -> Vec<(SyntaxKind, usize)> {
tokens
}

// Process hexadecimal string token to generate detailed tokens
/// Process hexadecimal string token to generate detailed tokens
/// This is the representation that YARA-X uses, therefore for an
/// easier integration with YARA-X, we need to keep this representation
fn process_hex_string_token(
hex_string: String,
tokens: &mut Vec<Token>,
Expand Down
37 changes: 25 additions & 12 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
/// This library is used to create a parser for YARA language
/// It should provide also token for whitespaces
/// as we want full fidelity and error resilience.;
//! This library is used to create a parser for the YARA language.
//! It should also provide tokens for whitespace,
//! as we want full fidelity and error resilience.
//! It is inspired by Swift's libSyntax and Rust's rowan.
//!
//! author: Tomáš Ďuriš
use crate::syntax::{
syntax_error::SyntaxError, text_token_source::TextTokenSource, text_tree_sink::TextTreeSink,
};
Expand Down Expand Up @@ -46,7 +49,6 @@ fn api_walktrough() {
// SourceFile is the main entry point for any given file
// it contains a `parse` method which returns a `Parse` struct
// that contains AST and list of errors

let parse_struct = SourceFile::parse(source_code);
assert!(parse_struct.errors().is_empty());

Expand Down Expand Up @@ -134,43 +136,49 @@ fn api_walktrough() {
}

// For the condition part, we can similarly get its body which is
// an `BOOLEAN_EXPR` node
// an `EXPRESSION` node
let condition = block.condition().unwrap();
let expression_stmt = condition.expression_stmt().unwrap();

let expression = expression_stmt.expression().unwrap();

// Expression can be either a `BOOLEAN_EXPR` or `BOOLEAN_TERM`
// In this example we have `BOOLEAN_EXPR`
let boolean_expr = match &expression {
Expression::BooleanExpr(e) => e,
_ => unreachable!(),
};

// Now we can obtain `lhs`, `rhs` or `op` nodes for top level expression
// Now we can obtain `lhs`, `rhs` or `op` nodes for top level boolean expression
// in this case we have `OR` operator
assert!(boolean_expr.op_token().is_some());
assert!(boolean_expr.op_token().unwrap().kind() == SyntaxKind::OR_KW);

// On the left hand side we have a LITERAL token
// On the left hand side we have a `BOOLEAN_TERM` node
let lhs = boolean_expr.lhs().unwrap();
let lhs_literal = match &lhs {
Expression::BooleanTerm(l) => l,
_ => unreachable!(),
};
// It contains a `VARIABLE` node
// which is essentially a variable token $a
assert!(lhs_literal.variable_token().unwrap().kind() == SyntaxKind::VARIABLE);
assert_eq!(lhs_literal.variable_token().unwrap().text(), "$a");

// On the right hand side we have a `BOOLEAN_EXPT` node
// On the right hand side we have a `BOOLEAN_EXPR` node
let rhs = boolean_expr.rhs().unwrap();

// It contains a prefix expression, which is essentially a `BOOLEAN_TERM` node;
// in this case we have a `NOT` operator and a nested boolean literal
let rhs_literal = match &rhs {
Expression::BooleanExpr(r) => r,
_ => unreachable!(),
};

// As it is again an expression we can obtain the operator,
// left hand side and right hand side
let lhs_of_rhs = rhs_literal.lhs().unwrap();

// In this case we only have a left hand side
// which is a `BOOLEAN_TERM` node and contains a `NOT` operator
// followed by a `BOOLEAN_TERM` node with a `BOOL_LIT` token
let lhs = match &lhs_of_rhs {
Expression::BooleanTerm(l) => l,
_ => unreachable!(),
Expand All @@ -191,6 +199,7 @@ fn api_walktrough() {

// Syntax nodes also have a bunch of methods
// for example we can obtain the parent node
// Parent node is whole condition block
let parent = expression_stmt_syntax.parent().unwrap();
assert_eq!(parent.kind(), SyntaxKind::CONDITION);
assert_eq!(parent.text().to_string(), "condition:\n $a or not true");
Expand Down Expand Up @@ -267,6 +276,9 @@ fn api_walktrough() {
let parse_struct = SourceFile::parse(source_code);

// There are some errors
// The first three are for invalid pattern declarations in the strings section
// The rest are for the invalid part of the condition
// (the `nor` operator is invalid, and so is everything after it in the condition block)
assert!(!parse_struct.errors().is_empty());
assert!(parse_struct.errors().len() == 6);
assert!(
Expand All @@ -293,7 +305,7 @@ fn api_walktrough() {
};

// The operator is wrong, therefore we only have
// a variable
// a variable in expression statement
assert!(boolean_term.variable_token().unwrap().kind() == SyntaxKind::VARIABLE);

// and we can obtain the error token
Expand All @@ -304,6 +316,7 @@ fn api_walktrough() {
.unwrap();

assert!(error_token.kind() == SyntaxKind::ERROR);
// it contains the token for the wrong operator inside (as an identifier)
assert!(error_token.as_node().unwrap().text() == "nor");
}
// We can also search for the token that produced the error
Expand Down
Loading

0 comments on commit 53164c7

Please sign in to comment.