diff --git a/Cargo.lock b/Cargo.lock index 8591c6a..8094f17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -14,30 +23,204 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata 0.1.10", +] + +[[package]] +name = "bstr" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "windows-sys", +] + [[package]] name = "countme" version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7704b5fdd17b18ae31c4c1da5a2e0305a2bf17b5249300a9ee9ed7b72114c636" +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + [[package]] name = "drop_bomb" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9bda8e21c04aca2ae33ffc2fd8c23134f3cac46db123ba97bd9d3f3b8a4a85e1" +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "globset" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57da3b9b5b85bd66f31093f8c408b90a74431672542466497dcbdfdc02034be1" +dependencies = [ + "aho-corasick", + "bstr 1.9.0", + "log", + "regex-automata 0.4.4", + "regex-syntax 0.8.2", +] + +[[package]] +name = "globwalk" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" +dependencies = [ + "bitflags 2.4.2", + "ignore", + "walkdir", +] + +[[package]] +name = "goldenfile" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a67453a3b358bd8213aedafd4feed75eecab9fb04bed26ba6fdf94694be560" +dependencies = [ + "scopeguard", + "similar-asserts", + "tempfile", + "yansi", +] + [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +[[package]] +name = "ignore" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b46810df39e66e925525d6e38ce1e7f6e1d208f72dc39757880fcb66e2c58af1" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.4", + "same-file", + "walkdir", + "winapi-util", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + [[package]] name = "logos" version = "0.13.0" @@ -57,7 +240,7 @@ dependencies = [ "fnv", "proc-macro2", "quote", - "regex-syntax", + "regex-syntax 0.6.29", "syn", ] @@ -70,6 +253,12 @@ dependencies = [ "logos-codegen", ] +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + [[package]] name = "memoffset" version = "0.9.0" @@ -97,12 +286,44 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-automata" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + [[package]] name = "regex-syntax" version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + [[package]] name = "rowan-test" version = "0.1.0" @@ -121,6 +342,74 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustix" +version = "0.38.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" +dependencies = [ + "bitflags 2.4.2", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.195" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.195" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "similar" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32fea41aca09ee824cc9724996433064c89f7777e60762749a4170a14abbfa21" +dependencies = [ + "bstr 0.2.17", + "unicode-segmentation", +] + +[[package]] +name = "similar-asserts" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e041bb827d1bfca18f213411d51b665309f1afb37a04a5d1464530e13779fc0f" +dependencies = [ + "console", + "similar", +] + [[package]] name = "syn" version = "2.0.46" @@ -132,6 +421,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", +] + [[package]] name = "text-size" version = "1.1.1" @@ -144,11 +446,132 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "yansi" +version = "1.0.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1367295b8f788d371ce2dbc842c7b709c73ee1364d30351dd300ec2203b12377" + [[package]] name = "yara-parser" version = "0.1.0" dependencies = [ "drop_bomb", + "globwalk", + "goldenfile", "logos", "rowan-test", "text-size", diff --git a/Cargo.toml b/Cargo.toml index 594d4bc..2a3d280 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,3 +10,7 @@ logos = "0.13.0" rowan-test = { git = "https://github.com/avast/avast-rowan.git" } text-size = "1.1.1" drop_bomb = "0.1.5" + +[dev-dependencies] +goldenfile = "1.6.0" +globwalk = "0.9.1" diff --git a/example.yar b/example.yar index 12f2d0b..9f68a9b 100644 --- a/example.yar +++ b/example.yar @@ -10,6 +10,6 @@ rule test $a = "foo" $b = "bar" condition: - $a and - $b + $a or + $b and true } diff --git a/src/main.rs b/src/main.rs index cf5820f..6bd6dda 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ /// This library is used to create a parser for YARA language /// It should provide also token for whitespaces /// as we want full fidelity and error resilience.; -use std::{env::args, fs, path::Path}; +use std::{env::args, fs, io::Write, path::Path}; use rowan_test::{GreenNode, NodeOrToken}; @@ -42,27 +42,68 @@ fn parse_text(text: &str) -> (GreenNode, Vec) { println!(); let indent = 0; - print(indent, syntax_tree.into()); - //for child in syntax_tree.children() { - // print!("{:indent$}", "", indent = indent); - // println!("{:?}", child.kind()); - // println!("{:?}", child.green().children()); - //} + let result = print(indent, syntax_tree.into()); + + print!("{}", result); (tree, parser_errors) } -fn print(indent: usize, element: SyntaxElement) { - let kind: SyntaxKind = element.kind().into(); - print!("{:indent$}", "", indent = indent); +fn print(indent: usize, element: SyntaxElement) -> String { + let mut result = String::new(); + let kind: SyntaxKind = element.kind(); + result.push_str(&format!("{:indent$}", "", indent = indent)); match element { NodeOrToken::Node(node) => { - println!("- {:?}", kind); + result.push_str(&format!("- {:?}\n", kind)); for child in node.children_with_tokens() { - print(indent + 2, child); + result.push_str(&print(indent + 2, child)); } } - NodeOrToken::Token(token) => println!("- {:?} {:?}", token.text(), kind), + NodeOrToken::Token(token) => { + result.push_str(&format!("- {:?} {:?}\n", token.text(), kind)); + } + } + result +} + +#[test] +fn test_parse_text() { + let mut mint = goldenfile::Mint::new("."); + + for entry in globwalk::glob("tests/*.in").unwrap().flatten() { + // Path to the .in.zip file. + let path = entry.into_path(); + let display_path = path.display(); + + let input = fs::read_to_string(&path) + .unwrap_or_else(|_| panic!("Failed to read input file {:?}", display_path)); + + let (tree, errors) = parse_text(&input); + + let out_path = path.with_extension("").with_extension("out"); + let syntax_tree = SyntaxNode::new_root(tree.clone()); + + let output = print(0, syntax_tree.into()); + + let mut output_file = mint.new_goldenfile(out_path).unwrap(); + + write!(output_file, "{}", output).unwrap(); + + // Check errors + let err_path = path.with_extension("").with_extension("err"); + if err_path.exists() { + let expected_errors = fs::read_to_string(&err_path) + .unwrap_or_else(|_| panic!("Failed to read error file {:?}", err_path.display())); + let actual_errors = errors + .iter() + .map(|error| format!("{:?}", error)) + .collect::>() + .join("\n"); + assert_eq!(actual_errors, expected_errors); + } else { + assert!(errors.is_empty(), "Unexpected errors: {:?}", errors); + } } } diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index c47c99d..da80b41 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -45,25 +45,34 @@ fn condition(p: &mut Parser) { m.complete(p, CONDITION); } +const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE]); + pub(super) fn strings_body(p: &mut Parser) { // add support for meta also while !p.at(EOF) && !p.at(STRINGS) && !p.at(CONDITION) && !p.at(RBRACE) { - assert!(p.at(VARIABLE)); let m = p.start(); - p.bump(VARIABLE); + if p.at(VARIABLE) { + let m = p.start(); + p.bump(VARIABLE); + m.complete(p, VARIABLE); + } else { + p.err_recover("expected a variable", VARIABLE_RECOVERY_SET); + } p.expect(ASSIGN); // so far only strings are supported, later add match for hex strings and regex string(p); - m.complete(p, VARIABLE); + m.complete(p, VARIABLE_STMT); } } -// do the same for hex and regex strings +// add support for hex and regex strings later on fn string(p: &mut Parser) { - assert!(p.at(STRING)); let m = p.start(); - p.bump(STRING); - // add plain string modifiers + match p.current() { + STRING => p.bump(STRING), + _ => p.err_and_bump("expected a string"), + } + // add string modifiers m.complete(p, STRING); } @@ -96,10 +105,7 @@ fn current_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { fn expression(p: &mut Parser, m: Option, bp: u8) -> Option { let m = m.unwrap_or_else(|| p.start()); let mut lhs = match lhs(p) { - Some(lhs) => { - let lhs = lhs.extend_to(p, m); - lhs - } + Some(lhs) => lhs.extend_to(p, m), None => { m.abandon(p); return None; diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs index 8e383ae..18f227f 100644 --- a/src/parser/grammar/expressions/atom.rs +++ b/src/parser/grammar/expressions/atom.rs @@ -13,11 +13,21 @@ pub(crate) fn literal(p: &mut Parser) -> Option { Some(m.complete(p, LITERAL)) } +const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE, TRUE, FALSE, AND, OR, NOT]); + // add support for while/for loops, if/else statements, etc. pub(super) fn atom_expr(p: &mut Parser) -> Option { if let Some(m) = literal(p) { return Some(m); - } else { - todo!("add support for other atoms") } + + // This will be extended to support more expressions later + #[allow(clippy::match_single_binding)] + match p.current() { + _ => { + p.err_recover("expected expression", EXPR_RECOVERY_SET); + #[allow(clippy::needless_return)] + return None; + } + }; } diff --git a/src/parser/grammar/items.rs b/src/parser/grammar/items.rs index c344898..ff500ad 100644 --- a/src/parser/grammar/items.rs +++ b/src/parser/grammar/items.rs @@ -8,7 +8,7 @@ pub(super) const RULE_RECOVERY_SET: TokenSet = TokenSet::new( ); pub(super) fn mod_content(p: &mut Parser, stop_on_r_brace: bool) { - while !p.at(EOF) && !(p.at(RBRACE) && stop_on_r_brace) { + while !(p.at(EOF) || p.at(RBRACE) && stop_on_r_brace) { process_top_level(p, stop_on_r_brace); } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 1cdc340..5585781 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3,13 +3,14 @@ pub mod syntaxkind; pub use syntaxkind::SyntaxKind; mod event; mod grammar; +#[allow(clippy::module_inception)] mod parser; mod token_set; use grammar::parse_source_file; #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct ParseError(pub Box); +pub struct ParseError(pub String); pub trait TokenSource { fn current(&self) -> Token; diff --git a/src/parser/parser.rs b/src/parser/parser.rs index 53b1e49..d6afde2 100644 --- a/src/parser/parser.rs +++ b/src/parser/parser.rs @@ -94,7 +94,7 @@ impl<'t> Parser<'t> { } pub(crate) fn error>(&mut self, message: T) { - let msg = ParseError(Box::new(message.into())); + let msg = ParseError(message.into()); self.push_event(Event::Error { msg }); } @@ -111,15 +111,8 @@ impl<'t> Parser<'t> { } pub(crate) fn err_recover(&mut self, message: &str, recovery: TokenSet) { - match self.current() { - LBRACE | RBRACE => { - self.error(message); - return; - } - _ => (), - } - if self.at_ts(recovery) { + println!("recovery: {:?}", self.current()); self.error(message); return; } diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs index 36c5eeb..fc47c2c 100644 --- a/src/parser/syntaxkind.rs +++ b/src/parser/syntaxkind.rs @@ -1,3 +1,5 @@ +#![allow(clippy::upper_case_acronyms)] + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] #[repr(u16)] pub enum SyntaxKind { @@ -31,6 +33,7 @@ pub enum SyntaxKind { LITERAL, EXPRESSION, EXPRESSION_STMT, + VARIABLE_STMT, __LAST, } diff --git a/src/syntax/syntax_node.rs b/src/syntax/syntax_node.rs index a84a52d..8d762d0 100644 --- a/src/syntax/syntax_node.rs +++ b/src/syntax/syntax_node.rs @@ -63,6 +63,6 @@ impl SyntaxTreeBuilder { pub fn error(&mut self, error: parser::ParseError, text_pos: TextSize) { self.errors - .push(SyntaxError::new_at_offset(*error.0, text_pos)) + .push(SyntaxError::new_at_offset(error.0, text_pos)) } } diff --git a/src/syntax/text_tree_sink.rs b/src/syntax/text_tree_sink.rs index 264a6ce..63f2992 100644 --- a/src/syntax/text_tree_sink.rs +++ b/src/syntax/text_tree_sink.rs @@ -148,9 +148,9 @@ fn n_attached_trivias<'a>( match kind { SyntaxKind::RULE | SyntaxKind::BLOCK_EXPR | SyntaxKind::STRINGS | SyntaxKind::CONDITION => { let mut res = 0; - let mut trivias = trivias.enumerate().peekable(); + let trivias = trivias.enumerate().peekable(); - while let Some((i, (kind, text))) = trivias.next() { + for (i, (kind, text)) in trivias { match kind { SyntaxKind::WHITESPACE if text.contains("\n\n") => { break; diff --git a/tests/test1.in b/tests/test1.in new file mode 100644 index 0000000..69ed034 --- /dev/null +++ b/tests/test1.in @@ -0,0 +1,7 @@ +rule test +{ + strings: + $a = "foo" + condition: + $a +} diff --git a/tests/test1.out b/tests/test1.out new file mode 100644 index 0000000..f4c65e8 --- /dev/null +++ b/tests/test1.out @@ -0,0 +1,33 @@ +- SOURCE_FILE + - RULE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - STRINGS + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - LITERAL + - "$a" VARIABLE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test2.in b/tests/test2.in new file mode 100644 index 0000000..4e26293 --- /dev/null +++ b/tests/test2.in @@ -0,0 +1,9 @@ +rule test +{ + strings: + $a = "foo" + $b = "bar" + condition: + $a or + $b +} diff --git a/tests/test2.out b/tests/test2.out new file mode 100644 index 0000000..68899c1 --- /dev/null +++ b/tests/test2.out @@ -0,0 +1,48 @@ +- SOURCE_FILE + - RULE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - STRINGS + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - LITERAL + - "$b" VARIABLE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test3.in b/tests/test3.in new file mode 100644 index 0000000..9f68a9b --- /dev/null +++ b/tests/test3.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + $a = "foo" + $b = "bar" + condition: + $a or + $b and true +} diff --git a/tests/test3.out b/tests/test3.out new file mode 100644 index 0000000..1d407f9 --- /dev/null +++ b/tests/test3.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - EXPRESSION + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test4.err b/tests/test4.err new file mode 100644 index 0000000..bf9ebfc --- /dev/null +++ b/tests/test4.err @@ -0,0 +1 @@ +SyntaxError("expected a variable", 98..98) \ No newline at end of file diff --git a/tests/test4.in b/tests/test4.in new file mode 100644 index 0000000..8f0a414 --- /dev/null +++ b/tests/test4.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + a = "foo" + $b = "bar" + condition: + $a or + $b and true +} diff --git a/tests/test4.out b/tests/test4.out new file mode 100644 index 0000000..699f73c --- /dev/null +++ b/tests/test4.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - EXPRESSION + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test5.err b/tests/test5.err new file mode 100644 index 0000000..af68e68 --- /dev/null +++ b/tests/test5.err @@ -0,0 +1 @@ +SyntaxError("expected expression", 144..144) \ No newline at end of file diff --git a/tests/test5.in b/tests/test5.in new file mode 100644 index 0000000..75bfc9f --- /dev/null +++ b/tests/test5.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + $a = "foo" + $b = "bar" + condition: + $a or + b and true +} diff --git a/tests/test5.out b/tests/test5.out new file mode 100644 index 0000000..5753b77 --- /dev/null +++ b/tests/test5.out @@ -0,0 +1,62 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - IDENTIFIER + - "test" IDENTIFIER + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - STRINGS + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - "strings" STRINGS + - ":" COLON + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$a" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"foo\"" STRING + - "\n\t\t" WHITESPACE + - VARIABLE_STMT + - VARIABLE + - "$b" VARIABLE + - " " WHITESPACE + - "=" ASSIGN + - " " WHITESPACE + - STRING + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - EXPRESSION + - EXPRESSION + - LITERAL + - "$a" VARIABLE + - " " WHITESPACE + - "or" OR + - "\n\t\t" WHITESPACE + - ERROR + - "b" IDENTIFIER + - " " WHITESPACE + - "and" AND + - " " WHITESPACE + - LITERAL + - "true" TRUE + - "\n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/tests/test6.err b/tests/test6.err new file mode 100644 index 0000000..b6080fd --- /dev/null +++ b/tests/test6.err @@ -0,0 +1,13 @@ +SyntaxError("expected a name", 38..38) +SyntaxError("expected strings or condition", 92..92) +SyntaxError("expected strings or condition", 98..98) +SyntaxError("expected strings or condition", 102..102) +SyntaxError("expected strings or condition", 104..104) +SyntaxError("expected strings or condition", 106..106) +SyntaxError("expected strings or condition", 114..114) +SyntaxError("expected strings or condition", 117..117) +SyntaxError("expected strings or condition", 119..119) +SyntaxError("expected expression", 139..139) +SyntaxError("expected expression", 141..141) +SyntaxError("expected expression", 150..150) +SyntaxError("Invalid character", 98..99) \ No newline at end of file diff --git a/tests/test6.in b/tests/test6.in new file mode 100644 index 0000000..cc3cb4e --- /dev/null +++ b/tests/test6.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule condition +{ + //Rule block comment + + //String comment + string* + a = 00000 + $b = "bar" + condition: + a ord + $b ant +} diff --git a/tests/test6.out b/tests/test6.out new file mode 100644 index 0000000..9aeedac --- /dev/null +++ b/tests/test6.out @@ -0,0 +1,60 @@ +- SOURCE_FILE + - "//Global comment" COMMENT + - "\n\n" WHITESPACE + - RULE + - "//Rule comment" COMMENT + - "\n" WHITESPACE + - "rule" RULE + - " " WHITESPACE + - ERROR + - "condition" CONDITION + - "\n" WHITESPACE + - BLOCK_EXPR + - "{" LBRACE + - "\n\t" WHITESPACE + - "//Rule block comment" COMMENT + - "\n\n\t" WHITESPACE + - "//String comment" COMMENT + - "\n\t" WHITESPACE + - ERROR + - "string" IDENTIFIER + - ERROR + - "*" ERROR + - "\n\t\t" WHITESPACE + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - ERROR + - "=" ASSIGN + - " " WHITESPACE + - ERROR + - "00000" NUMBER + - "\n\t\t" WHITESPACE + - ERROR + - "$b" VARIABLE + - " " WHITESPACE + - ERROR + - "=" ASSIGN + - " " WHITESPACE + - ERROR + - "\"bar\"" STRING + - "\n\t" WHITESPACE + - CONDITION + - "condition" CONDITION + - ":" COLON + - "\n\t\t" WHITESPACE + - ERROR + - "a" IDENTIFIER + - " " WHITESPACE + - ERROR + - "ord" IDENTIFIER + - "\n\t\t" WHITESPACE + - EXPRESSION_STMT + - LITERAL + - "$b" VARIABLE + - " " WHITESPACE + - ERROR + - "ant" IDENTIFIER + - " \n" WHITESPACE + - "}" RBRACE + - "\n" WHITESPACE diff --git a/yara_subset.grammar b/yara_subset.grammar new file mode 100644 index 0000000..9adf063 --- /dev/null +++ b/yara_subset.grammar @@ -0,0 +1,12 @@ +SOURCE -> RULE | eps. +RULE -> rule identifier lbrace RULEBODY rbrace. +RULEBODY -> STRINGS RULEBODY | CONDITION RULEBODY | eps. +STRINGS -> string colon STRINGSBODY. +CONDITION -> condition colon CONDITIONBODY. +STRINGSBODY -> variable assign string STRINGSBODY | eps. +CONDITIONBODY -> LITERAL CONDITIONBODY | OPERATOR CONDITIONBODY | eps. +LITERAL -> variable | BOOLEAN. +BOOLEAN -> true | false. +OPERATOR -> and | or | not. + +// https://smlweb.cpsc.ucalgary.ca/vital-stats.php?grammar=SOURCE+-%3E+RULE+%7C+eps.%0D%0ARULE+-%3E+rule+identifier+lbrace+RULEBODY+rbrace.%0D%0ARULEBODY+-%3E+STRINGS+%7C+CONDITION+%7C+eps.%0D%0ASTRINGS+-%3E+string+colon+STRINGSBODY.%0D%0ACONDITION+-%3E+condition+colon+CONDITIONBODY.%0D%0ASTRINGSBODY+-%3E+variable+assign+string+STRINGSBODY+%7C+eps.%0D%0ACONDITIONBODY+-%3E+LITERAL+EXPRESSION+%7C+OPERATOR+EXPRESSION+%7C+eps.%0D%0ALITERAL+-%3E+variable+%7C+BOOLEAN.%0D%0ABOOLEAN+-%3E+true+%7C+false.%0D%0AOPERATOR+-%3E+and+%7C+or+%7C+not. \ No newline at end of file