From fbde32a39a8176aad14ee0c0a4ff6b9fafa71bf8 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 16 Jan 2024 10:56:55 +0100 Subject: [PATCH 01/28] feat: implement `console` module. --- .github/workflows/tests.yaml | 2 +- yara-x-py/src/lib.rs | 19 ++++ yara-x-py/tests/test_api.py | 14 +++ yara-x/Cargo.toml | 7 +- yara-x/src/modules/console.rs | 112 ++++++++++++++++++++++++ yara-x/src/modules/modules.rs | 30 ++++--- yara-x/src/modules/protos/console.proto | 14 +++ yara-x/src/scanner/context.rs | 8 ++ yara-x/src/scanner/mod.rs | 16 ++++ yara-x/src/wasm/builder.rs | 24 ++--- 10 files changed, 217 insertions(+), 29 deletions(-) create mode 100644 yara-x/src/modules/console.rs create mode 100644 yara-x/src/modules/protos/console.proto diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 1ce21b6ce..9ae1e015f 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -56,7 +56,7 @@ jobs: - build: no-default-features os: ubuntu-latest rust: stable - args: "--package yara-x --no-default-features --features=test_proto2-module,test_proto3-module,time-module,hash-module,macho-module,math-module,lnk-module,elf-module,pe-module,dotnet-module" + args: "--package yara-x --no-default-features --features=test_proto2-module,test_proto3-module,time-module,hash-module,macho-module,math-module,lnk-module,elf-module,pe-module,dotnet-module,console-module" steps: - name: Checkout sources diff --git a/yara-x-py/src/lib.rs b/yara-x-py/src/lib.rs index fea7a2925..c5acb1176 100644 --- a/yara-x-py/src/lib.rs +++ b/yara-x-py/src/lib.rs @@ -188,6 +188,25 @@ impl Scanner { self.inner.timeout(Duration::from_secs(seconds)); } + /// Sets a callback that is invoked every time a YARA rule calls the + /// `console` module. + /// + /// The `callback` function is invoked with a string representing the + /// message being logged. The function can print the message to stdout, + /// append it to a file, etc. 
If no callback is set these messages are + /// ignored. + fn console_log(&mut self, callback: PyObject) -> PyResult<()> { + if !Python::with_gil(|py| callback.as_ref(py).is_callable()) { + return Err(PyValueError::new_err("callback is not callable")); + } + self.inner.console_log(move |msg| { + let _ = Python::with_gil(|py| -> PyResult { + callback.call1(py, (msg,)) + }); + }); + Ok(()) + } + /// Scans in-memory data. #[pyo3(signature = (data))] fn scan(&mut self, data: &[u8]) -> PyResult> { diff --git a/yara-x-py/tests/test_api.py b/yara-x-py/tests/test_api.py index 50c32838e..0b95b6344 100644 --- a/yara-x-py/tests/test_api.py +++ b/yara-x-py/tests/test_api.py @@ -103,3 +103,17 @@ def test_scanner_timeout(): scanner.timeout(1) with pytest.raises(Exception, match='timeout'): scanner.scan(b'foobar') + + +def test_console_log(): + ok = False + def callback(msg): + nonlocal ok + if msg == 'foo': + ok = True + compiler = yara_x.Compiler() + compiler.add_source('import "console" rule foo {condition: console.log("foo")}') + scanner = yara_x.Scanner(compiler.build()) + scanner.console_log(callback) + scanner.scan(b'') + assert ok \ No newline at end of file diff --git a/yara-x/Cargo.toml b/yara-x/Cargo.toml index ae187078c..8a4981702 100644 --- a/yara-x/Cargo.toml +++ b/yara-x/Cargo.toml @@ -36,7 +36,7 @@ logging = ["dep:log"] # Enables rules profiling. When this is enabled together with `logging` the # logs will contain information about the most expensive rules after each -# scan. Notice that profiling itself has an noticeable impact on performance. +# scan. Notice that profiling itself has a noticeable impact on performance. rules-profiling = ["logging"] # Features for enabling/disabling modules. @@ -45,10 +45,12 @@ rules-profiling = ["logging"] # a given module is built or not. For instance, if the feature `foo-module` is # enabled, the module `foo` will be built into YARA. +# The `console` module exports functions for printing text from YARA rules. 
+console-module = [] + # The `dotnet` module parsers .NET files. dotnet-module = [] - # The `elf` module parses ELF files. elf-module = [ "dep:tlsh-fixed" @@ -107,6 +109,7 @@ default = [ "constant-folding", "exact-atoms", "fast-regexp", + "console-module", "dotnet-module", "elf-module", "macho-module", diff --git a/yara-x/src/modules/console.rs b/yara-x/src/modules/console.rs new file mode 100644 index 000000000..d555429a7 --- /dev/null +++ b/yara-x/src/modules/console.rs @@ -0,0 +1,112 @@ +use crate::modules::prelude::*; +use crate::modules::protos::console::*; + +#[module_main] +fn main(_data: &[u8]) -> Console { + // Nothing to do, but we have to return our protobuf + Console::new() +} + +#[module_export(name = "log")] +fn log_str(ctx: &mut ScanContext, string: RuntimeString) -> bool { + ctx.console_log(format!("{}", string.as_bstr(ctx))); + true +} + +#[module_export(name = "log")] +fn log_msg_str( + ctx: &mut ScanContext, + message: RuntimeString, + string: RuntimeString, +) -> bool { + ctx.console_log(format!( + "{}{}", + message.as_bstr(ctx), + string.as_bstr(ctx) + )); + true +} + +#[module_export(name = "log")] +fn log_int(ctx: &mut ScanContext, i: i64) -> bool { + ctx.console_log(format!("{}", i)); + true +} + +#[module_export(name = "log")] +fn log_msg_int(ctx: &mut ScanContext, message: RuntimeString, i: i64) -> bool { + ctx.console_log(format!("{}{}", message.as_bstr(ctx), i)); + true +} + +#[module_export(name = "log")] +fn log_float(ctx: &mut ScanContext, f: f64) -> bool { + ctx.console_log(format!("{}", f)); + true +} + +#[module_export(name = "log")] +fn log_msg_float( + ctx: &mut ScanContext, + message: RuntimeString, + f: f64, +) -> bool { + ctx.console_log(format!("{}{}", message.as_bstr(ctx), f)); + true +} + +#[module_export(name = "hex")] +fn log_hex(ctx: &mut ScanContext, i: i64) -> bool { + ctx.console_log(format!("0x{:x}", i)); + true +} + +#[module_export(name = "hex")] +fn log_msg_hex(ctx: &mut ScanContext, message: RuntimeString, i: 
i64) -> bool { + ctx.console_log(format!("{}0x{:x}", message.as_bstr(ctx), i)); + true +} + +#[cfg(test)] +mod tests { + + #[test] + fn log() { + let rules = crate::compile( + r#" + import "console" + rule test { + condition: + console.log("foo") and + console.log("bar: ", 1) and + console.log("baz: ", 3.14) and + console.log(10) and + console.log(6.28) and + console.hex(10) and + console.hex("qux: ", 255) + } + "#, + ) + .unwrap(); + + let mut messages = vec![]; + + crate::scanner::Scanner::new(&rules) + .console_log(|message| messages.push(message)) + .scan(b"") + .expect("scan should not fail"); + + assert_eq!( + messages, + vec![ + "foo", + "bar: 1", + "baz: 3.14", + "10", + "6.28", + "0xa", + "qux: 0xff" + ] + ); + } +} diff --git a/yara-x/src/modules/modules.rs b/yara-x/src/modules/modules.rs index 8d7dcce17..28db9baf6 100644 --- a/yara-x/src/modules/modules.rs +++ b/yara-x/src/modules/modules.rs @@ -1,25 +1,27 @@ // File generated automatically by build.rs. Do not edit. -#[cfg(feature = "test_proto2-module")] -mod test_proto2; +#[cfg(feature = "string-module")] +mod string; #[cfg(feature = "macho-module")] mod macho; +#[cfg(feature = "pe-module")] +mod pe; +#[cfg(feature = "elf-module")] +mod elf; +#[cfg(feature = "text-module")] +mod text; +#[cfg(feature = "dotnet-module")] +mod dotnet; #[cfg(feature = "lnk-module")] mod lnk; #[cfg(feature = "hash-module")] mod hash; -#[cfg(feature = "text-module")] -mod text; +#[cfg(feature = "math-module")] +mod math; +#[cfg(feature = "test_proto2-module")] +mod test_proto2; #[cfg(feature = "time-module")] mod time; -#[cfg(feature = "dotnet-module")] -mod dotnet; #[cfg(feature = "test_proto3-module")] mod test_proto3; -#[cfg(feature = "pe-module")] -mod pe; -#[cfg(feature = "string-module")] -mod string; -#[cfg(feature = "elf-module")] -mod elf; -#[cfg(feature = "math-module")] -mod math; \ No newline at end of file +#[cfg(feature = "console-module")] +mod console; \ No newline at end of file diff --git 
a/yara-x/src/modules/protos/console.proto b/yara-x/src/modules/protos/console.proto new file mode 100644 index 000000000..05ce59fc7 --- /dev/null +++ b/yara-x/src/modules/protos/console.proto @@ -0,0 +1,14 @@ +syntax = "proto2"; +import "yara.proto"; + +package console; + +option (yara.module_options) = { + name : "console" + root_message: "console.Console" + rust_module: "console" +}; + +message Console { + // This module contains only exported functions, and doesn't return any data +} \ No newline at end of file diff --git a/yara-x/src/scanner/context.rs b/yara-x/src/scanner/context.rs index f08c36ef0..21145ade0 100644 --- a/yara-x/src/scanner/context.rs +++ b/yara-x/src/scanner/context.rs @@ -104,6 +104,8 @@ pub(crate) struct ScanContext<'r> { /// is evaluated, it is compiled the first time and stored in this hash /// map. pub regexp_cache: RefCell>, + /// Callback invoked every time a YARA rule calls `console.log`. + pub console_log: Option>, /// Hash map that tracks the time spend on each pattern. Keys are pattern /// PatternIds and values are the cumulative time spent on verifying each /// pattern. @@ -206,6 +208,12 @@ impl ScanContext<'_> { info!("Started rule evaluation: {}:{}", rule_namespace, rule_name); } + pub(crate) fn console_log(&mut self, message: String) { + if let Some(console_log) = &mut self.console_log { + console_log(message) + } + } + pub(crate) fn store_struct( &mut self, s: Rc, diff --git a/yara-x/src/scanner/mod.rs b/yara-x/src/scanner/mod.rs index a4a1be86f..710a6b3ef 100644 --- a/yara-x/src/scanner/mod.rs +++ b/yara-x/src/scanner/mod.rs @@ -120,6 +120,7 @@ impl<'r> Scanner<'r> { wasm_store: NonNull::dangling(), runtime_objects: IndexMap::new(), compiled_rules: rules, + console_log: None, current_struct: None, root_struct: rules.globals().make_root(), scanned_data: null(), @@ -276,6 +277,21 @@ impl<'r> Scanner<'r> { self } + /// Sets a callback that is invoked every time a YARA rule calls the + /// `console` module. 
+ /// + /// The `callback` function is invoked with a string representing the + /// message being logged. The function can print the message to stdout, + /// append it to a file, etc. If no callback is set these messages are + /// ignored. + pub fn console_log(&mut self, callback: F) -> &mut Self + where + F: FnMut(String) + 'r, + { + self.wasm_store.data_mut().console_log = Some(Box::new(callback)); + self + } + /// Scans a file. pub fn scan_file<'a, P>( &'a mut self, diff --git a/yara-x/src/wasm/builder.rs b/yara-x/src/wasm/builder.rs index c6de801ea..a1ad03272 100644 --- a/yara-x/src/wasm/builder.rs +++ b/yara-x/src/wasm/builder.rs @@ -470,38 +470,38 @@ mod tests { assert_eq!( text, r#"(module - (func (;152;) (type 1) (result i32) + (func (;160;) (type 1) (result i32) i32.const 0 global.set 2 i32.const 0 global.set 3 - call 153 - call 154 + call 161 + call 162 global.get 3 ) - (func (;153;) (type 0) + (func (;161;) (type 0) block ;; label = @1 - call 155 + call 163 end block ;; label = @1 - call 156 + call 164 end ) - (func (;154;) (type 0) + (func (;162;) (type 0) block ;; label = @1 - call 157 + call 165 end ) - (func (;155;) (type 0) + (func (;163;) (type 0) i32.const 4 ) - (func (;156;) (type 0) + (func (;164;) (type 0) i32.const 5 ) - (func (;157;) (type 0) + (func (;165;) (type 0) i32.const 6 ) - (export "main" (func 152)) + (export "main" (func 160)) )"# ); } From bd246399e7233c8e5324a6d17c3f44af0259075d Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 16 Jan 2024 17:02:02 +0100 Subject: [PATCH 02/28] chore: `dotnet` module depends on `pe` module. --- yara-x/Cargo.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yara-x/Cargo.toml b/yara-x/Cargo.toml index 8a4981702..4f0a42c2f 100644 --- a/yara-x/Cargo.toml +++ b/yara-x/Cargo.toml @@ -49,7 +49,9 @@ rules-profiling = ["logging"] console-module = [] # The `dotnet` module parsers .NET files. 
-dotnet-module = [] +dotnet-module = [ + "pe-module" +] # The `elf` module parses ELF files. elf-module = [ From 2e87ede8e8e8cfa52bde97d9e716980fca1e6dab Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 16 Jan 2024 17:15:44 +0100 Subject: [PATCH 03/28] ci: fix test cases for `yara-x-py` by linking `openssl` statically. --- .github/workflows/python.yaml | 37 +++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 3a45dc36f..860f6ad89 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -1,14 +1,10 @@ -# This file is autogenerated by maturin v1.0.1 -# To update, run -# -# maturin generate-ci github -# name: Python extension on: push: paths: - 'yara-x-py/**' + - '.github/workflows/python.yaml' permissions: contents: read @@ -115,22 +111,39 @@ jobs: - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Test Python - Non-Windows - if: runner.os != 'Windows' + + # Windows + - name: Install OpenSSL + if: runner.os == 'Windows' + id: vcpkg + uses: johnwason/vcpkg-action@v5 + with: + pkgs: openssl + triplet: x64-windows-static + token: ${{ github.token }} + - name: Set OPENSSL_DIR environment variable + if: runner.os == 'Windows' + shell: bash + run: echo "OPENSSL_DIR=${{ github.workspace }}\\vcpkg\\installed\\x64-windows-static" >> $GITHUB_ENV + - name: Test Python - Windows + if: runner.os == 'Windows' run: | pip install virtualenv - virtualenv venv; source venv/bin/activate + virtualenv venv; venv\Scripts\activate.ps1 + pwd python -m pip install --upgrade pip maturin python -m pip install pytest maturin develop --manifest-path yara-x-py/Cargo.toml pytest yara-x-py - - name: Test Python - Windows - if: runner.os == 'Windows' + + # Non-windows + - name: Test Python - Non-Windows + if: runner.os != 'Windows' run: | pip install virtualenv - virtualenv venv; venv\Scripts\activate.ps1 - 
pwd + virtualenv venv; source venv/bin/activate python -m pip install --upgrade pip maturin python -m pip install pytest maturin develop --manifest-path yara-x-py/Cargo.toml pytest yara-x-py + From cc3409299c6f511d81b5d39ff4367a48c1f0bb0a Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 16 Jan 2024 19:21:36 +0100 Subject: [PATCH 04/28] ci: don't build the release Python wheels After adding the dependency to `openssl` these builds are broken because the `PyO3/maturin-action@v1` action uses `manylinux` images that don't have `openssl` installed. We need to figure out a way for automatically building the Python wheels. --- .github/workflows/python.yaml | 90 +---------------------------------- 1 file changed, 2 insertions(+), 88 deletions(-) diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 860f6ad89..430117328 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -10,94 +10,6 @@ permissions: contents: read jobs: - linux: - runs-on: ubuntu-latest - strategy: - matrix: - target: [x86_64, aarch64] - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.target }} - args: --release --out dist --find-interpreter - working-directory: yara-x-py - sccache: "true" - manylinux: auto - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - windows: - runs-on: windows-latest - strategy: - matrix: - target: [x64] - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 - with: - python-version: "3.10" - architecture: ${{ matrix.target }} - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.target }} - args: --release --out dist --find-interpreter - working-directory: yara-x-py - sccache: "true" - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - 
macos: - runs-on: macos-latest - strategy: - matrix: - target: [x86_64, aarch64] - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.target }} - args: --release --out dist --find-interpreter - working-directory: yara-x-py - sccache: "true" - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - release: - runs-on: ubuntu-latest - if: "startsWith(github.ref, 'refs/tags/')" - needs: [linux, windows, macos] - steps: - - uses: actions/download-artifact@v3 - with: - name: wheels - - name: Publish to PyPI - uses: PyO3/maturin-action@v1 - env: - MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} - with: - command: upload - args: --skip-existing * - working-directory: yara-x-py - test: strategy: fail-fast: false @@ -121,10 +33,12 @@ jobs: pkgs: openssl triplet: x64-windows-static token: ${{ github.token }} + - name: Set OPENSSL_DIR environment variable if: runner.os == 'Windows' shell: bash run: echo "OPENSSL_DIR=${{ github.workspace }}\\vcpkg\\installed\\x64-windows-static" >> $GITHUB_ENV + - name: Test Python - Windows if: runner.os == 'Windows' run: | From dbb1cb7672905cc8def51d2cd4c2e3b009dd3933 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 16 Jan 2024 19:38:13 +0100 Subject: [PATCH 05/28] chore: fix warning --- yara-x/src/scanner/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/yara-x/src/scanner/mod.rs b/yara-x/src/scanner/mod.rs index 710a6b3ef..765166bd4 100644 --- a/yara-x/src/scanner/mod.rs +++ b/yara-x/src/scanner/mod.rs @@ -36,7 +36,6 @@ use crate::wasm::{ENGINE, MATCHING_RULES_BITMAP_BASE}; use crate::{modules, wasm, Variable}; pub(crate) use crate::scanner::context::*; -pub use crate::scanner::matches::*; mod context; mod matches; From 6be1371d0eb625f55f9618d2e935ccdbc9881126 Mon Sep 17 00:00:00 2001 From: "Victor M. 
Alvarez" Date: Wed, 17 Jan 2024 13:11:16 +0100 Subject: [PATCH 06/28] feat: print console logs in CLI --- yara-x-cli/src/commands/check.rs | 4 ++-- yara-x-cli/src/commands/scan.rs | 21 +++++++++++++++++++-- yara-x-cli/src/walk.rs | 16 ++++++++-------- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/yara-x-cli/src/commands/check.rs b/yara-x-cli/src/commands/check.rs index 24698400e..44b2f94de 100644 --- a/yara-x-cli/src/commands/check.rs +++ b/yara-x-cli/src/commands/check.rs @@ -70,8 +70,8 @@ pub fn exec_check(args: &ArgMatches) -> anyhow::Result<()> { w.walk( rules_path, CheckState::new(), - || {}, - |file_path, state, output, _| { + |_, _| {}, + |state, output, file_path, _| { let src = fs::read(file_path.clone()) .with_context(|| { format!("can not read `{}`", file_path.display()) diff --git a/yara-x-cli/src/commands/scan.rs b/yara-x-cli/src/commands/scan.rs index 667912b7c..d5da95cba 100644 --- a/yara-x-cli/src/commands/scan.rs +++ b/yara-x-cli/src/commands/scan.rs @@ -52,6 +52,10 @@ pub fn scan() -> Command { arg!(-D --"dump-module-output") .help("Dumps the data produced by modules") ) + .arg( + arg!(--"disable-console-logs") + .help("Disable printing console log messages") + ) .arg( arg!(-n - -"negate") .help("Print non-satisfied rules only") @@ -105,6 +109,7 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> { let skip_larger = args.get_one::("skip-larger"); let negate = args.get_flag("negate"); let dump_module_output = args.get_flag("dump-module-output"); + let disable_console_logs = args.get_flag("disable-console-logs"); let timeout = args.get_one::("timeout"); let mut external_vars: Option> = args @@ -174,8 +179,17 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> { w.walk( path, state, - || { + // Initialization + |_, output| { let mut scanner = Scanner::new(rules_ref); + + if !disable_console_logs { + let output = output.clone(); + scanner.console_log(move |msg| { + output.send(Message::Info(msg)).unwrap(); + }); + } 
+ if let Some(ref vars) = external_vars { for (ident, value) in vars { // It's ok to use `unwrap()`, this can not fail because @@ -183,9 +197,11 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> { scanner.set_global(ident.as_str(), value).unwrap(); } } + scanner }, - |file_path, state, output, scanner| { + // File handler. Called for every file found while walking the path. + |state, output, file_path, scanner| { let elapsed_time = Instant::elapsed(&start_time); if let Some(timeout) = timeout.checked_sub(elapsed_time) { @@ -260,6 +276,7 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> { Ok(()) }, + // Error handler |err, output| { let _ = output.send(Message::Error(format!( "{} {}: {}", diff --git a/yara-x-cli/src/walk.rs b/yara-x-cli/src/walk.rs index cb473de01..295ff5f16 100644 --- a/yara-x-cli/src/walk.rs +++ b/yara-x-cli/src/walk.rs @@ -225,20 +225,20 @@ impl<'a> DirWalker<'a> { /// walker.walk( /// // The path to be walked. /// "." -/// // The first argument is the initial state. This must have some type -/// // `S` that implements the `Component` trait. +/// // The initial state. This must have some type `S` that implements the +/// // `Component` trait. /// state /// // This is the thread initialization function. This is called once /// // per thread, and each thread will own the value returned by this /// // function. A mutable reference to this value is passed as the /// // last argument to the next function. -/// || { +/// |state, output| { /// scanner.Scanner::new(rules) /// }, /// // This function is called for each file, `state` is a reference to /// // the initial state (it's type is `&S`), `output` is of type /// // `Sender`. 
-/// |file_path, state, output, scanner| { +/// |state, output, file_path, scanner| { /// scanner.scan_file(file_path); /// } /// // This function is called with every error that occurs during the @@ -308,8 +308,8 @@ impl<'a> ParDirWalker<'a> { ) -> thread::Result<()> where S: Component + Send + Sync, - I: Fn() -> T + Send + Copy + Sync, - F: Fn(PathBuf, &S, &Sender, &mut T) -> anyhow::Result<()> + I: Fn(&S, &Sender) -> T + Send + Copy + Sync, + F: Fn(&S, &Sender, PathBuf, &mut T) -> anyhow::Result<()> + Send + Sync + Copy, @@ -347,12 +347,12 @@ impl<'a> ParDirWalker<'a> { let msg_send = msg_send.clone(); let state = state.clone(); threads.push(s.spawn(move |_| { - let mut per_thread_obj = init(); + let mut per_thread_obj = init(&state, &msg_send); for path in paths_recv { let res = func( - path.to_path_buf(), &state, &msg_send, + path.to_path_buf(), &mut per_thread_obj, ); if let Err(err) = res { From cfe04403a7f389ad7165daf9aea61c3536aad825 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 17 Jan 2024 13:22:49 +0100 Subject: [PATCH 07/28] feat: print console logs to stderr in a yellow color --- yara-x-cli/src/commands/scan.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yara-x-cli/src/commands/scan.rs b/yara-x-cli/src/commands/scan.rs index d5da95cba..16c66a2c9 100644 --- a/yara-x-cli/src/commands/scan.rs +++ b/yara-x-cli/src/commands/scan.rs @@ -186,7 +186,9 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> { if !disable_console_logs { let output = output.clone(); scanner.console_log(move |msg| { - output.send(Message::Info(msg)).unwrap(); + output + .send(Message::Error(format!("{}", Yellow.paint(msg)))) + .unwrap(); }); } From 7c16093c6f01dac7b712a1aea57b883047b6519c Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 17 Jan 2024 13:24:30 +0100 Subject: [PATCH 08/28] chore: remove the `--dump-module-output` option from `scan` command. Now we have the dedicated command `dump` which does the same. 
--- yara-x-cli/src/commands/scan.rs | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/yara-x-cli/src/commands/scan.rs b/yara-x-cli/src/commands/scan.rs index 16c66a2c9..587086294 100644 --- a/yara-x-cli/src/commands/scan.rs +++ b/yara-x-cli/src/commands/scan.rs @@ -48,10 +48,6 @@ pub fn scan() -> Command { .help("Print matching patterns, limited to the first N bytes") .value_parser(value_parser!(usize)) ) - .arg( - arg!(-D --"dump-module-output") - .help("Dumps the data produced by modules") - ) .arg( arg!(--"disable-console-logs") .help("Disable printing console log messages") @@ -108,7 +104,6 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> { let path_as_namespace = args.get_flag("path-as-namespace"); let skip_larger = args.get_one::("skip-larger"); let negate = args.get_flag("negate"); - let dump_module_output = args.get_flag("dump-module-output"); let disable_console_logs = args.get_flag("disable-console-logs"); let timeout = args.get_one::("timeout"); @@ -256,24 +251,6 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> { ); }; - if dump_module_output { - for (mod_name, mod_output) in scan_results.module_outputs() { - output - .send(Message::Info(format!( - ">>> {} {}\n{}<<<", - Yellow.paint(mod_name).bold(), - file_path.display(), - indent_all_by( - 4, - protobuf::text_format::print_to_string_pretty( - mod_output, - ) - ), - ))) - .unwrap(); - } - } - state.num_scanned_files.fetch_add(1, Ordering::Relaxed); Ok(()) From 92c460dd0c96fcd8ecf5154e02f00093ec1e72db Mon Sep 17 00:00:00 2001 From: "Victor M. 
Alvarez" Date: Wed, 17 Jan 2024 13:54:44 +0100 Subject: [PATCH 09/28] chore: remove unused import --- yara-x-cli/src/commands/scan.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/yara-x-cli/src/commands/scan.rs b/yara-x-cli/src/commands/scan.rs index 587086294..39c731791 100644 --- a/yara-x-cli/src/commands/scan.rs +++ b/yara-x-cli/src/commands/scan.rs @@ -9,7 +9,6 @@ use std::time::{Duration, Instant}; use anyhow::{bail, Context, Error}; use clap::{arg, value_parser, Arg, ArgAction, ArgMatches, Command}; use crossbeam::channel::Sender; -use indent::indent_all_by; use superconsole::style::Stylize; use superconsole::{Component, Line, Lines, Span}; use yansi::Color::{Cyan, Red, Yellow}; From ba1e882dd1c51346c1b1f758180ff6155d3562a7 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 17 Jan 2024 18:12:49 +0100 Subject: [PATCH 10/28] style: fix clippy warnings --- yara-x/benches/benches.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yara-x/benches/benches.rs b/yara-x/benches/benches.rs index 7f3002282..c77307e9d 100644 --- a/yara-x/benches/benches.rs +++ b/yara-x/benches/benches.rs @@ -18,7 +18,7 @@ macro_rules! gen_bench { let mut scanner = rules.scanner().unwrap(); b.iter(|| { - scanner.scan_mem($data).unwrap(); + let _ = scanner.scan_mem($data).unwrap(); }) }); @@ -27,7 +27,7 @@ macro_rules! gen_bench { let mut scanner = yara_x::Scanner::new(&rules); b.iter(|| { - scanner.scan($data); + let _ = scanner.scan($data); }); }); } From 6440b31eb3dd70f3a8a4e3c0460916c10366c22c Mon Sep 17 00:00:00 2001 From: "Victor M. 
Alvarez" Date: Wed, 17 Jan 2024 19:33:23 +0100 Subject: [PATCH 11/28] refactor: expose `range`, `data` and `xor_key` in `Match` as methods instead of fields --- yara-x-cli/src/commands/scan.rs | 9 ++++++--- yara-x/src/scanner/mod.rs | 31 ++++++++++++++++++++----------- yara-x/src/scanner/tests.rs | 15 +++++++-------- yara-x/src/tests/mod.rs | 2 +- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/yara-x-cli/src/commands/scan.rs b/yara-x-cli/src/commands/scan.rs index 39c731791..4100b120e 100644 --- a/yara-x-cli/src/commands/scan.rs +++ b/yara-x-cli/src/commands/scan.rs @@ -314,14 +314,17 @@ fn print_matching_rules( let limit = print_strings_limit.unwrap_or(&120); for p in matching_rule.patterns() { for m in p.matches() { + let match_range = m.range(); + let match_data = m.data(); + let mut msg = format!( "{:#x}:{}:{}: ", - m.range.start, - m.range.len(), + match_range.start, + match_range.len(), p.identifier(), ); - for b in &m.data[..min(m.data.len(), *limit)] { + for b in &match_data[..min(match_data.len(), *limit)] { for c in b.escape_ascii() { msg.push_str(format!("{}", c as char).as_str()); } diff --git a/yara-x/src/scanner/mod.rs b/yara-x/src/scanner/mod.rs index 765166bd4..fb1473250 100644 --- a/yara-x/src/scanner/mod.rs +++ b/yara-x/src/scanner/mod.rs @@ -895,13 +895,7 @@ impl<'a> Iterator for Matches<'a> { fn next(&mut self) -> Option { if let Some(iter) = &mut self.iterator { - let match_ = iter.next()?; - Some(Match { - range: match_.range.clone(), - data: &self.data.as_ref() - [match_.range.start..match_.range.end], - xor_key: match_.xor_key, - }) + Some(Match { inner: iter.next()?, data: self.data }) } else { None } @@ -909,13 +903,28 @@ impl<'a> Iterator for Matches<'a> { } /// Represents a match. -#[derive(PartialEq, Debug)] pub struct Match<'a> { + inner: &'a matches::Match, + data: &'a ScannedData<'a>, +} + +impl<'a> Match<'a> { /// Range within the original data where the match occurred. 
- pub range: Range, + #[inline] + pub fn range(&self) -> Range { + self.inner.range.clone() + } + /// Slice containing the data that matched. - pub data: &'a [u8], + #[inline] + pub fn data(&self) -> &'a [u8] { + self.data.as_ref().get(self.inner.range.clone()).unwrap() + } + /// XOR key used for decrypting the data if the pattern had the `xor` /// modifier, or `None` if otherwise. - pub xor_key: Option, + #[inline] + pub fn xor_key(&self) -> Option { + self.inner.xor_key + } } diff --git a/yara-x/src/scanner/tests.rs b/yara-x/src/scanner/tests.rs index c4ba266c2..db72b2e60 100644 --- a/yara-x/src/scanner/tests.rs +++ b/yara-x/src/scanner/tests.rs @@ -1,7 +1,6 @@ use pretty_assertions::assert_eq; use protobuf::MessageDyn; -use crate::scanner; use crate::scanner::Scanner; use crate::variables::VariableError; @@ -64,7 +63,7 @@ fn matches() { matches.extend( pattern .matches() - .map(|x| (pattern.identifier(), x.range, x.data)), + .map(|x| (pattern.identifier(), x.range(), x.data())), ) } } @@ -100,7 +99,7 @@ fn xor_matches() { matches.extend( pattern .matches() - .map(|x| (pattern.identifier(), x.range, x.xor_key)), + .map(|x| (pattern.identifier(), x.range(), x.xor_key())), ) } } @@ -461,10 +460,10 @@ fn max_matches_per_pattern() { // Only one match is returned for pattern $a because the limit has been set // to 1. - assert_eq!( - matches.next(), - Some(scanner::Match { range: (0..3), data: b"foo", xor_key: None }) - ); + let match_ = matches.next().unwrap(); + + assert_eq!(match_.range(), (0..3)); + assert_eq!(match_.data(), b"foo"); - assert_eq!(matches.next(), None); + assert!(matches.next().is_none()); } diff --git a/yara-x/src/tests/mod.rs b/yara-x/src/tests/mod.rs index 19fca9159..2729c361a 100644 --- a/yara-x/src/tests/mod.rs +++ b/yara-x/src/tests/mod.rs @@ -132,7 +132,7 @@ macro_rules! 
pattern_match { .matches() .next() .unwrap() - .data; + .data(); assert_eq!( matching_data, $expected_result, From d564a1d47f1705c6809656a486b55bdaa8132997 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 17 Jan 2024 19:36:53 +0100 Subject: [PATCH 12/28] style: minor code simplification --- yara-x/src/scanner/mod.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/yara-x/src/scanner/mod.rs b/yara-x/src/scanner/mod.rs index fb1473250..e02058e67 100644 --- a/yara-x/src/scanner/mod.rs +++ b/yara-x/src/scanner/mod.rs @@ -894,11 +894,8 @@ impl<'a> Iterator for Matches<'a> { type Item = Match<'a>; fn next(&mut self) -> Option { - if let Some(iter) = &mut self.iterator { - Some(Match { inner: iter.next()?, data: self.data }) - } else { - None - } + let iter = self.iterator.as_mut()?; + Some(Match { inner: iter.next()?, data: self.data }) } } From 3b200358a054240b4b4216f0117b4d8a4189e946 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 18 Jan 2024 15:33:13 +0100 Subject: [PATCH 13/28] feat: expose information about matching patterns and module outputs in the Python extension API --- Cargo.lock | 1 + Cargo.toml | 1 + yara-x-cli/Cargo.toml | 2 +- yara-x-py/Cargo.toml | 2 + yara-x-py/src/lib.rs | 212 ++++++++++++++++++++++++++++++------ yara-x-py/tests/test_api.py | 105 ++++++++++++------ 6 files changed, 256 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 760b94f17..be89acd4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4469,6 +4469,7 @@ dependencies = [ name = "yara-x-py" version = "0.1.0" dependencies = [ + "protobuf-json-mapping", "pyo3", "pyo3-build-config", "pyo3-file", diff --git a/Cargo.toml b/Cargo.toml index bc9e4744d..20928aa61 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,6 +74,7 @@ pest_derive = "2.7.5" pretty_assertions = "1.4.0" protobuf = "3.3.0" protobuf-codegen = "3.3.0" +protobuf-json-mapping = "3.3.0" protobuf-parse = "3.3.0" regex-syntax = { git = 
"https://github.com/plusvic/regex.git", rev="423493d" } regex-automata = { git = "https://github.com/plusvic/regex.git", rev="423493d" } diff --git a/yara-x-cli/Cargo.toml b/yara-x-cli/Cargo.toml index 29d65d292..dd89cab73 100644 --- a/yara-x-cli/Cargo.toml +++ b/yara-x-cli/Cargo.toml @@ -42,7 +42,7 @@ enable-ansi-support = { workspace = true } env_logger = { workspace = true , optional = true } log = { workspace = true, optional = true } protobuf = { workspace = true } -protobuf-json-mapping = "3.3.0" +protobuf-json-mapping = { workspace = true } serde_json = { workspace = true, features = ["preserve_order"] } yansi = { workspace = true } yara-x = { workspace = true } diff --git a/yara-x-py/Cargo.toml b/yara-x-py/Cargo.toml index 4b352fc38..e7bf70061 100644 --- a/yara-x-py/Cargo.toml +++ b/yara-x-py/Cargo.toml @@ -12,6 +12,8 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.19.2", features = ["abi3", "abi3-py38", "extension-module"]} pyo3-file = "0.7.0" + +protobuf-json-mapping = { workspace = true } yara-x = { workspace = true } [build-dependencies] diff --git a/yara-x-py/src/lib.rs b/yara-x-py/src/lib.rs index c5acb1176..33223a8e8 100644 --- a/yara-x-py/src/lib.rs +++ b/yara-x-py/src/lib.rs @@ -18,13 +18,20 @@ use std::ops::Deref; use std::pin::Pin; use std::time::Duration; +use protobuf_json_mapping::print_to_string; use pyo3::exceptions::{PyIOError, PySyntaxError, PyTypeError, PyValueError}; use pyo3::prelude::*; -use pyo3::types::{PyBool, PyBytes, PyFloat, PyInt, PyString, PyTuple}; +use pyo3::types::{ + PyBool, PyBytes, PyDict, PyFloat, PyInt, PyString, PyTuple, +}; use pyo3_file::PyFileLikeObject; use ::yara_x as yrx; +/// Compiles a YARA source code producing a set of compiled [`Rules`]. +/// +/// This function allows compiling simple rules that don't depend on external +/// variables. For more complex use cases you will need to use a [`Compiler`]. 
#[pyfunction] fn compile(src: &str) -> PyResult { Ok(Rules { @@ -208,34 +215,121 @@ impl Scanner { } /// Scans in-memory data. - #[pyo3(signature = (data))] - fn scan(&mut self, data: &[u8]) -> PyResult> { - let scan_results = self - .inner - .scan(data) - .map_err(|err| PyValueError::new_err(err.to_string()))?; - - Ok(matching_rules_to_py(scan_results.matching_rules())) + fn scan(&mut self, data: &[u8]) -> PyResult> { + Python::with_gil(|py| { + scan_results_to_py( + py, + self.inner + .scan(data) + .map_err(|err| PyValueError::new_err(err.to_string()))?, + ) + }) + } +} + +/// Results produced by a scan operation. +#[pyclass] +struct ScanResults { + /// Vector that contains all the rules that matched during the scan. + matching_rules: Vec>, + /// Dictionary where keys are module names and values are other + /// dictionaries with the information produced by the corresponding module. + module_outputs: Py, +} + +#[pymethods] +impl ScanResults { + #[getter] + /// Rules that matched during the scan. + fn matching_rules(&self) -> Py { + Python::with_gil(|py| PyTuple::new(py, &self.matching_rules).into()) + } + + #[getter] + /// Rules that matched during the scan. + fn module_outputs<'py>(&'py self, py: Python<'py>) -> &'py PyDict { + self.module_outputs.as_ref(py) } } +/// Represents a rule that matched while scanning some data. #[pyclass] -struct MatchingRule { +struct Rule { name: String, namespace: String, + patterns: Vec>, } #[pymethods] -impl MatchingRule { +impl Rule { #[getter] + /// Returns the rule's name. fn name(&self) -> &str { self.name.as_str() } + /// Returns the rule's namespace. #[getter] fn namespace(&self) -> &str { self.namespace.as_str() } + + /// Patterns defined by the rule. + #[getter] + fn patterns(&self) -> Py { + Python::with_gil(|py| PyTuple::new(py, &self.patterns).into()) + } +} + +/// Represents a pattern in a YARA rule. 
+#[pyclass] +struct Pattern { + identifier: String, + matches: Vec>, +} + +#[pymethods] +impl Pattern { + /// Pattern identifier (e.g: '$a', '$foo'). + #[getter] + fn identifier(&self) -> &str { + self.identifier.as_str() + } + + /// Matches found for this pattern. + #[getter] + fn matches(&self) -> Py { + Python::with_gil(|py| PyTuple::new(py, &self.matches).into()) + } +} + +#[pyclass] +struct Match { + offset: usize, + length: usize, + xor_key: Option, +} + +#[pymethods] +impl Match { + /// Offset where the match occurred. + #[getter] + fn offset(&self) -> usize { + self.offset + } + + /// Length of the match in bytes. + #[getter] + fn length(&self) -> usize { + self.length + } + + /// XOR key used for decrypting the data if the pattern had the xor + /// modifier, or None if otherwise. + #[getter] + fn xor_key(&self) -> Option { + self.xor_key + } } /// A set of YARA rules in compiled form. @@ -254,15 +348,16 @@ struct PinnedRules { #[pymethods] impl Rules { /// Scans in-memory data with these rules. 
- #[pyo3(signature = (data))] - fn scan(&self, data: &[u8]) -> PyResult> { + fn scan(&self, data: &[u8]) -> PyResult> { let mut scanner = yrx::Scanner::new(&self.inner.rules); - - let scan_results = scanner - .scan(data) - .map_err(|err| PyValueError::new_err(err.to_string()))?; - - Ok(matching_rules_to_py(scan_results.matching_rules())) + Python::with_gil(|py| { + scan_results_to_py( + py, + scanner + .scan(data) + .map_err(|err| PyValueError::new_err(err.to_string()))?, + ) + }) } fn serialize_into(&self, file: PyObject) -> PyResult<()> { @@ -278,20 +373,67 @@ impl Rules { } } -fn matching_rules_to_py(matching_rules: yrx::MatchingRules) -> Py { - Python::with_gil(|py| { - PyTuple::new( - py, - matching_rules.map(|rule| { - MatchingRule { - name: rule.name().to_string(), - namespace: rule.namespace().to_string(), - } - .into_py(py) - }), - ) - .into() - }) +fn scan_results_to_py( + py: Python, + scan_results: yrx::ScanResults, +) -> PyResult> { + let matching_rules = scan_results + .matching_rules() + .map(|rule| rule_to_py(py, rule)) + .collect::>>()?; + + let json = PyModule::import(py, "json")?; + let json_loads = json.getattr("loads")?; + + let module_outputs = PyDict::new(py); + for (module, output) in scan_results.module_outputs() { + let module_output_json = print_to_string(output).unwrap(); + let module_output = json_loads.call((module_output_json,), None)?; + module_outputs.set_item(module, module_output)?; + } + + Py::new( + py, + ScanResults { matching_rules, module_outputs: module_outputs.into() }, + ) +} + +fn rule_to_py(py: Python, rule: yrx::Rule) -> PyResult> { + Py::new( + py, + Rule { + name: rule.name().to_string(), + namespace: rule.namespace().to_string(), + patterns: rule + .patterns() + .map(|pattern| pattern_to_py(py, pattern)) + .collect::, _>>()?, + }, + ) +} + +fn pattern_to_py(py: Python, pattern: yrx::Pattern) -> PyResult> { + Py::new( + py, + Pattern { + identifier: pattern.identifier().to_string(), + matches: pattern + .matches() + 
.map(|match_| match_to_py(py, match_)) + .collect::, _>>()?, + }, + ) +} + +fn match_to_py(py: Python, match_: yrx::Match) -> PyResult> { + Py::new( + py, + Match { + offset: match_.range().start, + length: match_.range().len(), + xor_key: match_.xor_key(), + }, + ) } /// Python module for compiling YARA rules and scanning data with them. @@ -308,6 +450,8 @@ fn yara_x(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } diff --git a/yara-x-py/tests/test_api.py b/yara-x-py/tests/test_api.py index 0b95b6344..40df7b98f 100644 --- a/yara-x-py/tests/test_api.py +++ b/yara-x-py/tests/test_api.py @@ -20,16 +20,16 @@ def test_int_globals(): rules = compiler.build() scanner = yara_x.Scanner(rules) - matches = scanner.scan(b'') - assert len(matches) == 1 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 1 scanner.set_global('some_int', 2) - matches = scanner.scan(b'') - assert len(matches) == 0 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 0 scanner.set_global('some_int', 1) - matches = scanner.scan(b'') - assert len(matches) == 1 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 1 def test_float_globals(): @@ -39,16 +39,16 @@ def test_float_globals(): rules = compiler.build() scanner = yara_x.Scanner(rules) - matches = scanner.scan(b'') - assert len(matches) == 1 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 1 scanner.set_global('some_float', 2.0) - matches = scanner.scan(b'') - assert len(matches) == 0 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 0 scanner.set_global('some_float', 1.0) - matches = scanner.scan(b'') - assert len(matches) == 1 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 1 def test_str_globals(): @@ -58,42 
+58,77 @@ def test_str_globals(): rules = compiler.build() scanner = yara_x.Scanner(rules) - matches = scanner.scan(b'') - assert len(matches) == 1 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 1 scanner.set_global('some_str', 'bar') - matches = scanner.scan(b'') - assert len(matches) == 0 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 0 scanner.set_global('some_str', 'foo') - matches = scanner.scan(b'') - assert len(matches) == 1 + matching_rules = scanner.scan(b'').matching_rules + assert len(matching_rules) == 1 def test_namespaces(): compiler = yara_x.Compiler() compiler.new_namespace('foo') - compiler.add_source('rule foo {strings: $a = "foo" condition: $a}') + compiler.add_source('rule foo {strings: $foo = "foo" condition: $foo}') compiler.new_namespace('bar') - compiler.add_source('rule bar {strings: $a = "bar" condition: $a}') - scanner = yara_x.Scanner(compiler.build()) - matches = scanner.scan(b'foobar') - assert len(matches) == 2 - + compiler.add_source('rule bar {strings: $bar = "bar" condition: $bar}') + rules = compiler.build() + matching_rules = rules.scan(b'foobar').matching_rules + + assert len(matching_rules) == 2 + + assert matching_rules[0].name == 'foo' + assert matching_rules[0].namespace == 'foo' + assert len(matching_rules[0].patterns) == 1 + assert matching_rules[0].patterns[0].identifier == '$foo' + assert len(matching_rules[0].patterns[0].matches) == 1 + assert matching_rules[0].patterns[0].matches[0].offset == 0 + assert matching_rules[0].patterns[0].matches[0].length == 3 + assert matching_rules[0].patterns[0].matches[0].xor_key is None + + assert matching_rules[1].name == 'bar' + assert matching_rules[1].namespace == 'bar' + assert len(matching_rules[1].patterns) == 1 + assert matching_rules[1].patterns[0].identifier == '$bar' + assert len(matching_rules[1].patterns[0].matches) == 1 + assert matching_rules[1].patterns[0].matches[0].offset == 3 + assert 
matching_rules[1].patterns[0].matches[0].length == 3 + assert matching_rules[1].patterns[0].matches[0].xor_key is None def test_compile_and_scan(): rules = yara_x.compile('rule foo {strings: $a = "foo" condition: $a}') - matches = rules.scan(b'foobar') - assert len(matches) == 1 - assert matches[0].name == 'foo' + matching_rules = rules.scan(b'foobar').matching_rules + assert len(matching_rules) == 1 + assert matching_rules[0].name == 'foo' + assert matching_rules[0].namespace == 'default' + assert len(matching_rules[0].patterns) == 1 + assert matching_rules[0].patterns[0].identifier == '$a' def test_compiler_and_scanner(): - compiler = yara_x.Compiler() - compiler.add_source('rule foo {strings: $a = "foo" condition: $a}') - scanner = yara_x.Scanner(compiler.build()) - matches = scanner.scan(b'foobar') - assert len(matches) == 1 + rules = yara_x.compile('rule foo {strings: $a = "foo" condition: $a}') + matching_rules = rules.scan(b'foobar').matching_rules + assert len(matching_rules) == 1 + assert matching_rules[0].name == 'foo' + assert matching_rules[0].namespace == 'default' + assert len(matching_rules[0].patterns) == 1 + assert matching_rules[0].patterns[0].identifier == '$a' + + +def test_xor_key(): + rules = yara_x.compile('rule foo {strings: $a = "foo" xor condition: $a}') + matching_rules = rules.scan(b'\xCC\xC5\xC5').matching_rules + assert len(matching_rules) == 1 + assert matching_rules[0].name == 'foo' + assert matching_rules[0].namespace == 'default' + assert len(matching_rules[0].patterns) == 1 + assert matching_rules[0].patterns[0].identifier == '$a' + assert len(matching_rules[0].patterns[0].matches) == 1 + assert matching_rules[0].patterns[0].matches[0].xor_key == 0xAA def test_scanner_timeout(): @@ -105,6 +140,12 @@ def test_scanner_timeout(): scanner.scan(b'foobar') +def test_module_outputs(): + rules = yara_x.compile('import "test_proto2" rule foo {condition: false}') + module_outputs = rules.scan(b'').module_outputs + assert 
module_outputs['test_proto2']['int32One'] == 1 + + def test_console_log(): ok = False def callback(msg): From 45b446d296fa053fd8bddbeaa66eabd0c530c197 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 18 Jan 2024 15:37:46 +0100 Subject: [PATCH 14/28] chore: remove unused dependencies --- Cargo.lock | 8 -------- yara-x-cli/Cargo.toml | 1 - yara-x-fmt/Cargo.toml | 1 - 3 files changed, 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index be89acd4d..0b7ccd5b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1566,12 +1566,6 @@ dependencies = [ "quote", ] -[[package]] -name = "indent" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f1a0777d972970f204fdf8ef319f1f4f8459131636d7e3c96c5d59570d0fa6" - [[package]] name = "indenter" version = "0.3.3" @@ -4380,7 +4374,6 @@ dependencies = [ "enable-ansi-support", "env_logger", "globwalk", - "indent", "log", "pprof", "protobuf", @@ -4400,7 +4393,6 @@ dependencies = [ name = "yara-x-fmt" version = "0.1.0" dependencies = [ - "anyhow", "bitmask", "lazy_static", "pretty_assertions", diff --git a/yara-x-cli/Cargo.toml b/yara-x-cli/Cargo.toml index dd89cab73..3e9c0ef7d 100644 --- a/yara-x-cli/Cargo.toml +++ b/yara-x-cli/Cargo.toml @@ -53,7 +53,6 @@ yara-x-fmt = { workspace = true } colored_json = "4.0.0" crossbeam = "0.8.2" crossterm = "0.27.0" -indent = "0.1.1" pprof = { version = "0.13.0", features = ["flamegraph"], optional=true } strum_macros = "0.25" superconsole = "0.2.0" diff --git a/yara-x-fmt/Cargo.toml b/yara-x-fmt/Cargo.toml index b8433b3a2..3d9689fdf 100644 --- a/yara-x-fmt/Cargo.toml +++ b/yara-x-fmt/Cargo.toml @@ -13,7 +13,6 @@ rust-version.workspace = true crate-type = ["cdylib", "rlib"] [dependencies] -anyhow = { workspace = true } bitmask = { workspace = true } lazy_static = { workspace = true } thiserror = { workspace = true } From d94f7fddd87d9d272c4e641b8c243ef4c3610de0 Mon Sep 17 00:00:00 2001 From: "Victor M. 
Alvarez" Date: Thu, 18 Jan 2024 15:51:13 +0100 Subject: [PATCH 15/28] fix: `yara-x-fmt` needs `anyhow` crate for tests. --- Cargo.lock | 1 + yara-x-fmt/Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 0b7ccd5b7..bce3a9107 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4393,6 +4393,7 @@ dependencies = [ name = "yara-x-fmt" version = "0.1.0" dependencies = [ + "anyhow", "bitmask", "lazy_static", "pretty_assertions", diff --git a/yara-x-fmt/Cargo.toml b/yara-x-fmt/Cargo.toml index 3d9689fdf..1cd8682a2 100644 --- a/yara-x-fmt/Cargo.toml +++ b/yara-x-fmt/Cargo.toml @@ -19,4 +19,5 @@ thiserror = { workspace = true } yara-x-parser = { workspace = true } [dev-dependencies] +anyhow = { workspace = true } pretty_assertions = { workspace = true } \ No newline at end of file From e9094b9bade142777237320df9e5d3d22ee44e56 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 18 Jan 2024 15:56:09 +0100 Subject: [PATCH 16/28] fix: add missing constants to `pe` module --- yara-x/src/modules/protos/pe.proto | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/yara-x/src/modules/protos/pe.proto b/yara-x/src/modules/protos/pe.proto index 518a0a2f2..37204d27b 100644 --- a/yara-x/src/modules/protos/pe.proto +++ b/yara-x/src/modules/protos/pe.proto @@ -415,4 +415,19 @@ enum SectionCharacteristics { SECTION_MEM_READ = 32 [(yara.enum_value).i64 = 0x40000000]; SECTION_MEM_WRITE = 33 [(yara.enum_value).i64 = 0x80000000]; SECTION_SCALE_INDEX = 34 [(yara.enum_value).i64 = 0x00000001]; +} + +enum DllCharacteristics { + option (yara.enum_options).inline = true; + HIGH_ENTROPY_VA = 0x0020; + DYNAMIC_BASE = 0x0040; + FORCE_INTEGRITY = 0x0080; + NX_COMPAT = 0x0100; + NO_ISOLATION = 0x0200; + NO_SEH = 0x0400; + NO_BIND = 0x0800; + APPCONTAINER = 0x1000; + WDM_DRIVER = 0x2000; + GUARD_CF = 0x4000; + TERMINAL_SERVER_AWARE = 0x8000; } \ No newline at end of file From bec858b982d23c928eeb265855c4ff5b869c3bfe Mon Sep 17 00:00:00 2001 From: 
"Victor M. Alvarez" Date: Thu, 18 Jan 2024 18:53:08 +0100 Subject: [PATCH 17/28] feat: implement the `Rules::deserialize_from` API. --- yara-x-py/src/lib.rs | 37 +++++++++++++++++++++++------------- yara-x-py/tests/test_api.py | 10 ++++++++++ yara-x/src/compiler/rules.rs | 36 +++++++++++++++++++++++------------ 3 files changed, 58 insertions(+), 25 deletions(-) diff --git a/yara-x-py/src/lib.rs b/yara-x-py/src/lib.rs index 33223a8e8..c4eab0917 100644 --- a/yara-x-py/src/lib.rs +++ b/yara-x-py/src/lib.rs @@ -34,13 +34,10 @@ use ::yara_x as yrx; /// variables. For more complex use cases you will need to use a [`Compiler`]. #[pyfunction] fn compile(src: &str) -> PyResult { - Ok(Rules { - inner: Box::pin(PinnedRules { - rules: yrx::compile(src) - .map_err(|err| PyValueError::new_err(err.to_string()))?, - _pinned: PhantomPinned, - }), - }) + let rules = yrx::compile(src) + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(Rules::new(rules)) } /// Compiles YARA source code producing a set of compiled [`Rules`]. @@ -112,12 +109,7 @@ impl Compiler { /// to its initial empty state. fn build(&mut self) -> Rules { let compiler = mem::replace(&mut self.inner, yrx::Compiler::new()); - Rules { - inner: Box::pin(PinnedRules { - rules: compiler.build(), - _pinned: PhantomPinned, - }), - } + Rules::new(compiler.build()) } } @@ -345,6 +337,14 @@ struct PinnedRules { _pinned: PhantomPinned, } +impl Rules { + fn new(rules: yrx::Rules) -> Self { + Rules { + inner: Box::pin(PinnedRules { rules, _pinned: PhantomPinned }), + } + } +} + #[pymethods] impl Rules { /// Scans in-memory data with these rules. @@ -360,6 +360,7 @@ impl Rules { }) } + /// Serializes the rules into a file-like object. fn serialize_into(&self, file: PyObject) -> PyResult<()> { let f = PyFileLikeObject::with_requirements(file, false, true, false)?; self.inner @@ -368,6 +369,16 @@ impl Rules { .map_err(|err| PyIOError::new_err(err.to_string())) } + /// Deserializes rules from a file-like object. 
+ #[staticmethod] + fn deserialize_from(file: PyObject) -> PyResult> { + let f = PyFileLikeObject::with_requirements(file, true, false, false)?; + let rules = yrx::Rules::deserialize_from(f) + .map_err(|err| PyIOError::new_err(err.to_string()))?; + + Python::with_gil(|py| Py::new(py, Rules::new(rules))) + } + fn warnings(&self) -> Vec { self.inner.rules.warnings().iter().map(|w| w.to_string()).collect() } diff --git a/yara-x-py/tests/test_api.py b/yara-x-py/tests/test_api.py index 40df7b98f..1a86d152c 100644 --- a/yara-x-py/tests/test_api.py +++ b/yara-x-py/tests/test_api.py @@ -1,3 +1,4 @@ +import io import pytest import yara_x @@ -146,6 +147,15 @@ def test_module_outputs(): assert module_outputs['test_proto2']['int32One'] == 1 +def test_serialization(): + rules = yara_x.compile('rule foo {condition: true}') + f = io.BytesIO() + rules.serialize_into(f) + f.seek(0) + rules = yara_x.Rules.deserialize_from(f) + assert len(rules.scan(b'').matching_rules) == 1 + + def test_console_log(): ok = False def callback(msg): diff --git a/yara-x/src/compiler/rules.rs b/yara-x/src/compiler/rules.rs index a3bfd10b6..021bae78a 100644 --- a/yara-x/src/compiler/rules.rs +++ b/yara-x/src/compiler/rules.rs @@ -1,4 +1,4 @@ -use std::io::{BufWriter, Write}; +use std::io::{BufWriter, Read, Write}; #[cfg(feature = "logging")] use std::time::Instant; @@ -126,6 +126,16 @@ impl Rules { self.warnings.as_slice() } + /// Serializes the rules as a sequence of bytes. + /// + /// The [`Rules`] can be restored back by passing the bytes to + /// [`Rules::deserialize`]. + pub fn serialize(&self) -> Result, SerializationError> { + let mut bytes = Vec::new(); + self.serialize_into(&mut bytes)?; + Ok(bytes) + } + /// Deserializes the rules from a sequence of bytes produced by /// [`Rules::serialize`]. pub fn deserialize(bytes: B) -> Result @@ -155,17 +165,7 @@ impl Rules { Ok(rules) } - /// Serializes the rules as a sequence of bytes. 
- /// - /// The [`Rules`] can be restored back by passing the bytes to - /// [`Rules::deserialize`]. - pub fn serialize(&self) -> Result, SerializationError> { - let mut bytes = Vec::new(); - self.serialize_into(&mut bytes)?; - Ok(bytes) - } - - /// Serializes the rules and writes the bytes into a `writer`. + /// Serializes the rules into a `writer`. pub fn serialize_into( &self, writer: W, @@ -184,6 +184,18 @@ impl Rules { .serialize_into(writer, self)?) } + /// Deserializes the rules from a `reader`. + pub fn deserialize_from( + mut reader: R, + ) -> Result + where + R: Read, + { + let mut bytes = Vec::new(); + let _ = reader.read_to_end(&mut bytes)?; + Self::deserialize(bytes) + } + /// Returns a [`RuleInfo`] given its [`RuleId`]. /// /// # Panics From d4748103812d49b2d1ddbc9b641401dfd5c8b5e0 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 19 Jan 2024 10:52:06 +0100 Subject: [PATCH 18/28] feat(py): add `Scanner.scan_file` API. --- yara-x-py/src/lib.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/yara-x-py/src/lib.rs b/yara-x-py/src/lib.rs index c4eab0917..27db1b0a6 100644 --- a/yara-x-py/src/lib.rs +++ b/yara-x-py/src/lib.rs @@ -15,6 +15,7 @@ matches = rules.scan(b'some dummy data') use std::marker::PhantomPinned; use std::mem; use std::ops::Deref; +use std::path::PathBuf; use std::pin::Pin; use std::time::Duration; @@ -217,6 +218,18 @@ impl Scanner { ) }) } + + /// Scans a file. + fn scan_file(&mut self, path: PathBuf) -> PyResult> { + Python::with_gil(|py| { + scan_results_to_py( + py, + self.inner + .scan_file(path) + .map_err(|err| PyValueError::new_err(err.to_string()))?, + ) + }) + } } /// Results produced by a scan operation. From 9b5330688036e5a40b371687e4b5caa790584b3b Mon Sep 17 00:00:00 2001 From: "Victor M. 
Alvarez" Date: Thu, 18 Jan 2024 23:45:32 +0100 Subject: [PATCH 19/28] docs: fix typo in documentation for `yara-x-proto-yaml` crate --- yara-x-proto-yaml/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yara-x-proto-yaml/src/lib.rs b/yara-x-proto-yaml/src/lib.rs index ab24de4ce..012f45d64 100644 --- a/yara-x-proto-yaml/src/lib.rs +++ b/yara-x-proto-yaml/src/lib.rs @@ -37,7 +37,7 @@ import "yaml.proto"; message MyMessage { optional int32 some_field = 1 [(yaml.field).fmt = "x"]; - optional int64 some_timestamp = 2 [(yaml.field).fmt = "x"]; + optional int64 some_timestamp = 2 [(yaml.field).fmt = "t"]; } ``` From d749b3e9f0f9807a5ecc3c43ae2f892352dfc7ef Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 19 Jan 2024 15:49:54 +0100 Subject: [PATCH 20/28] feat: implement `Debug` trait for `Rules` --- yara-x/src/compiler/rules.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/yara-x/src/compiler/rules.rs b/yara-x/src/compiler/rules.rs index 021bae78a..9a5678ed9 100644 --- a/yara-x/src/compiler/rules.rs +++ b/yara-x/src/compiler/rules.rs @@ -1,3 +1,4 @@ +use std::fmt; use std::io::{BufWriter, Read, Write}; #[cfg(feature = "logging")] use std::time::Instant; @@ -431,6 +432,30 @@ where } } +impl fmt::Debug for Rules { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for (id, rule) in self.rules.iter().enumerate() { + let name = self.ident_pool.get(rule.ident_id).unwrap(); + let namespace = + self.ident_pool.get(rule.namespace_ident_id).unwrap(); + writeln!(f, "RuleId({})", id)?; + writeln!(f, " namespace: {}", namespace)?; + writeln!(f, " name: {}", name)?; + writeln!(f, " patterns:")?; + for (pattern_ident_id, pattern_id) in &rule.patterns { + let ident = self.ident_pool.get(*pattern_ident_id).unwrap(); + writeln!(f, " {:?} {} ", pattern_id, ident)?; + } + } + + for (id, (pattern_id, _)) in self.sub_patterns.iter().enumerate() { + writeln!(f, "SubPatternId({}) -> {:?}", id, pattern_id)?; + } + 
+ Ok(()) + } +} + /// Information about each of the individual rules included in [`Rules`]. #[derive(Serialize, Deserialize)] pub(crate) struct RuleInfo { From d60d3a72f56c21a2db078d4c9c6e5daa48ec0b78 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 19 Jan 2024 19:23:35 +0100 Subject: [PATCH 21/28] fix: bug while computing `imphash` in `pe` module The PE parser was storing all the functions imported by each DLL in a hash where the key was DLL name, resulting in all imports for "foo.dll" added to a common list of functions. This however doesn't represent well the structure of PE imports, where imports from "foo.dll" can be split into multiple thunk arrays (i.e: there's not a single list of functions imported from "foo.dll" but many of them). In fact, some PE files import a set of functions from "foo.dll", then import some set of functions from "bar.dll", and then import another set of functions from "foo.dll". Putting all the functions imported from "foo.dll" together makes the `imphash` incorrect. 
--- yara-x/src/modules/pe/parser.rs | 31 ++- ...173e10a4086fc9d7606fac6e945c0c15ca2.in.zip | Bin 0 -> 7989 bytes ...609173e10a4086fc9d7606fac6e945c0c15ca2.out | 261 ++++++++++++++++++ 3 files changed, 277 insertions(+), 15 deletions(-) create mode 100644 yara-x/src/modules/pe/tests/testdata/ee82699133743266399721dadb609173e10a4086fc9d7606fac6e945c0c15ca2.in.zip create mode 100644 yara-x/src/modules/pe/tests/testdata/ee82699133743266399721dadb609173e10a4086fc9d7606fac6e945c0c15ca2.out diff --git a/yara-x/src/modules/pe/parser.rs b/yara-x/src/modules/pe/parser.rs index 785ab9da0..00573d1e5 100644 --- a/yara-x/src/modules/pe/parser.rs +++ b/yara-x/src/modules/pe/parser.rs @@ -13,7 +13,6 @@ use authenticode_parser::{ }; use bstr::{BStr, ByteSlice}; use byteorder::{ByteOrder, LE}; -use indexmap::IndexMap; use itertools::Itertools; use memchr::memmem; use nom::branch::{alt, permutation}; @@ -84,16 +83,15 @@ pub struct PE<'a> { /// Path to PDB file containing debug information for the PE. pdb_path: OnceCell>, - /// Map that with the DLLs imported by this PE file. Keys are DLL names, - /// and values are vectors of [`ImportedFunc`] that contain information - /// about each function imported from the DLL. We use an [`IndexMap`] - /// instead of a [`HashMap`] because we want to maintain the order in - /// which the DLLs appear in the import table, which is important while - /// computing the imphash. - imports: OnceCell>>>, + /// Vector with the DLLs imported by this PE file. Each item in the vector + /// is a tuple composed of a DLL name and a vector of [`ImportedFunc`] that + /// contains information about each function imported from the DLL. The + /// vector can contain multiple entries for the same DLL, each with a + /// subset of the functions imported by from that DLL. + imports: OnceCell)>>>, /// Similar to `imports` but contains the delayed imports. - delayed_imports: OnceCell>>>, + delayed_imports: OnceCell)>>>, /// Export information about this PE file. 
exports: OnceCell>>, @@ -1463,7 +1461,7 @@ impl<'a> PE<'a> { } /// Parses PE imports. - fn parse_imports(&self) -> Option>> { + fn parse_imports(&self) -> Option)>> { let (_, _, import_data) = self.get_dir_entry_data(Self::IMAGE_DIRECTORY_ENTRY_IMPORT)?; self.parse_import_impl(import_data, Self::parse_import_descriptor) @@ -1472,7 +1470,7 @@ impl<'a> PE<'a> { /// Parses PE delayed imports. fn parse_delayed_imports( &self, - ) -> Option>> { + ) -> Option)>> { let (_, _, import_data) = self.get_dir_entry_data(Self::IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT)?; self.parse_import_impl(import_data, Self::parse_delay_load_descriptor) @@ -1508,7 +1506,7 @@ impl<'a> PE<'a> { &self, input: &'a [u8], descriptor_parser: P, - ) -> Option>> + ) -> Option)>> where P: FnMut(&'a [u8]) -> IResult<&'a [u8], ImportDescriptor>, { @@ -1528,8 +1526,7 @@ impl<'a> PE<'a> { let is_32_bits = self.optional_hdr.magic == Self::IMAGE_NT_OPTIONAL_HDR32_MAGIC; - let mut imported_funcs: IndexMap<&str, Vec> = - IndexMap::new(); + let mut imported_funcs = Vec::new(); for mut descriptor in import_descriptors { // If the values in the descriptor are virtual addresses, convert @@ -1583,6 +1580,8 @@ impl<'a> PE<'a> { verify(uint(is_32_bits), |thunk| *thunk != 0), ); + let mut funcs = Vec::new(); + for (i, mut thunk) in &mut thunks.enumerate() { // If the most significant bit is set, this is an import by // ordinal. 
The most significant bit depends on whether this @@ -1623,9 +1622,11 @@ impl<'a> PE<'a> { } if func.ordinal.is_some() || func.name.is_some() { - imported_funcs.entry(dll_name).or_default().push(func) + funcs.push(func); } } + + imported_funcs.push((dll_name, funcs)); } Some(imported_funcs) diff --git a/yara-x/src/modules/pe/tests/testdata/ee82699133743266399721dadb609173e10a4086fc9d7606fac6e945c0c15ca2.in.zip b/yara-x/src/modules/pe/tests/testdata/ee82699133743266399721dadb609173e10a4086fc9d7606fac6e945c0c15ca2.in.zip new file mode 100644 index 0000000000000000000000000000000000000000..538d0c104f46d99d762e08aa0907873e2fd1ad16 GIT binary patch literal 7989 zcmch6WmFtNv?e-ufB?bWf)m``-QC>>hatEHcMmYQyGs(>WpF1*(BQ!d150-I?R)!g z-=D2IU)8Oudro&(-`joa`_vTS5pZFU->PFaryk7zF14mGFp@A7FxJ+596bE|pEx;r zxi~p^csTj_c{x5=nOj-#u=9W7<+T39ZqCKd$75s3Z^g^Q&SPV4$z#pW#cj!M`H9=o zoP*WgMOzC421&8I%3A3S|9UT9G#FR}gl;$(1Wi*|n0O=ojCFxH=!J(L+pf|b`3RJ) zl8sKkPV3L!Ag$pKDsikqDY@4}R?twoUlHa{mF<8_a<-?g>(>{^D|Xh;ViQxg{;rot zk@isN?N+@P1K%_8NLoU%*>87Gk@ii(D+dn{Qom8|O{MXffMg$61FpwZEkaOw^@G2A zpJ`@h&mYFhaEt9YpJ=`_54e@gG~R@Vua+u9f0@!JdLvlvKwp9*mlp%j_Y-0L5SPI{ z&wm()V_4XfQ8ESLV6gBC*RQ4XJ7jI#^Je_k(w?_wG z2q1|Uo1=1bR~PK{d0L2vV56rZofjKpow{e(hik7eYcUCQJ)Hc37FZqXU-3fpj{0Tt zQTn%RX%#Z=R1glOdwg1s^3bi(W?qh;qs^R`#GoJO&tHW`uvFuy+HTg=dnZRgoI4^S z+_zq7kQZRH3|l9asr+qoInL1>?Kg}Cpxb*B7h@d}A>zkaBdtL%h8QTeO}=A4hb5rT?ja)`hG7!X+qu z#gaudsMV#CK1%`Y#XMzm>EHTX3wN6233#cJoe!Dq-Fk_9U4$N0fd^9G zo!lKx7*2>r{8`Sxc(xVnQFGaT&9G%(=ndk-@A(!Jvh_f`+FO6q{Li>A`F6z{8e)2E z3*R#b0R)zUF`kFF*w5NW;}ii~+DR%Q~(&?PZ$}D#A{vl$6I|_CO3bJ1}axR^H4{OUDT`6g15& z+r)Y%%-SQw+0;~wryCJ}&mJHY#H0K{L6JCUl5qAk?Z7BZ!I%mZ2opJNxq`tMp^fDW zWJwjOPnI!A5<-tj3p47-T$vz{uUvr3vR}*# zTf)=%)gSsz#}n0LW1r+b`{HaBzwYN=%b_Q)R>KJzNU7K;90ShGCbmAqruTIA0;vno zTCSyu(b@0}&jR&nM1!4J59xcOA#j+dMgrYy-p_jXTT=pQ`zD) z-B@Uj(jTE`M)Y3nc|)r;935^NKx}U=j=}x%m}*oaT;FPxmxxW<3^lKQgE(FI4WKfl zph${}$6*dp(ND^5Csxag#Qq}Rn|PaD@L^^G#nBVP0Z9=Ct+Tpr;yQ&y7b8g)9ikje 
zB4+Ybuwu1V!VGA3DN9t$blCwivmq*~_MHMBu(9Nlkasp0;F3xOl52EZVcmwej{-(5a^|n&rR7al9sxo;r^aTF*dhKcj#se~Tthyn{tcQueHwIgM5QaZ<{^a>Rc%LK7o^ z64a)nB0SG!e9N&QeAl>Al_Yeyk@qQaFB6>;B>--SFVcz&=c28g78ZWnO}_&q;!WlC zC@=S+Ed)315aA^1f;M`VUwCv3c^GIQ$eU>@c;0OW4^yeW;`@~mEL0G^UXLa)tQRHY zM*3;4Qs(htlM7QG(_YyCpOeOsm0C%?_a+OF+uSGCi3exMfq;(as`D#9>n_kLYe&?O z_KLpM+{GX7Gs0sRpK|qXLTo)`Nby$2I_THxd!-^{r#+TdhTQDt*;GAR14_`RZ1 z4PE*CzU4@BZn?%{bTK>Pt@<&r*+#~@hCjQ(S?2r$<%#bA@i$HIf*YO!@(!8~Ph$c( zaYJ!oRoIx0#|oADwwJLu=U?MLv@RCGmmo%1YugV8-dTm{mEugFIO$KkbnlzhI4*Ha zvZ_BJL9?nVL$$i2l9zb~z^VC9vxtVPTNUF);_rl?H8rrVrsGI7znT{^7z_XqdR3P zoj*&fj@FG&ua4oqM&mx+iZ)DPc2_R}3u}x*u{88~(F*gT|Bu{MBR7bd@uN91T|4Fv zNs)I8F`@iqr|sXV@ZYz1$2Sy)?eyvkR3{6e`9#YvZ!}25J?6G9Zc63%d@Pm!X*%!1 zLSMQ-%Y(LHK&wv|AY6q-umICKiI!ezjm?7m*7+@O&R7lvyU zkGvH?d!^trR~daRIAD>Pxa5KU%1M=LMg65Zp5kxdqkTn6mJQMA=6?mY1vV3q9O zb{>|uk_~5+$5SQX3ER++C`e9>L?(6^o-4%)!;rS}Z+iwjc$V9;Yw51j!&=&34qs8& z7j+S*?WD{Mr+jV-jZ_~0(4fMC<^VO(VLx%GB^qlU7#G`9l)O&Mbk#>E19k6JSe0gH z!!GI00xvnvLs&l^4A$MVvt!?<%1|;TS&9#{uMnMn(@m!32xvk<$A_~uGAbVaPN*;=v&OsbfvW!S-<;L_=XEJGo zGtEh@+YCTwQ>lg=aUOThK>ARmrc4lZ-`D-tMy~{YDoDp!LR8RKxU%@Glw-RhIAMjGT1&8eu3W=q1tIpz-hdVJ9+25mMaJ zKSxSyP*R3!wjvavq?nf~{>NXhQS3`W-o*SLymaKDaV3&rjA(P-ryPbI=f*`E&3qR?# z?OX)0|LkgW^`!^vqmRp}XQcbUgVLW|7nks_#J7F%78@(7E_@j=0g6M6j>EIenS%QF zl@mi_H6F=zSWm~C!|t9MOV)nHAO&&MD1gvDXU1SIqt`@0I2b1n)f5{+%oRTwx5hda zsZovcAu5R8mRKla2H`->7^x#Pv~UE=OJU^0bT9U|qg;M^#S~hY3X2eS{(0%-bp88# z82S9XcYywlLD=xSg+2`{V7G7VKFCCh#oDk?dcHPKD?nTEnKc2aQ&J(2Pse&eByUIN zvTXsn>rdJ)H)g_nK^pGKvPN;q^?}=YVz+OikFH*IDI7`GT<*^_@yUv{)fTdY4=VX! 
zgS>yRucR1=%QWKk=k6r-ll{})S+f{O>Gh7bKd87$U$vnQ55j!q6$DN=0 zyRm?MX-mtgWM*b#63lm4!Rhvo?l25_N`MMpaAOLdzYE#~Q+K_LaknsoMa;t8H>GRf z&hhkda*2&@bCPB0)JMXv9p0!L`r)WoevE6LA2+758fxG-nna_-oaRhB5~y+Y=0+q2 zq-6kO9&0$*6w%*ZMIU7V6ky0|4$)S%;oQ*k6FN4s4Z0>Qz z{`BD9YhcFo+U^fmiED2~GZgPSI)qMLZBvqnFvr%r0e#ayW`1dqrNPqK!8)ge((h=^ z+&dlepBJR)pk7Yxp1frV?HxI7o7~nqQ$8HGt%8sOw3Mw;nZq!P7)x9sy;F1Q3mvw_ zN>|w0P$lkH8;}WA{s+H!^HdcT$>4#~ZEK=qgo$8v)CLZ>Yz=5yhBo^}xzxuD*e$q1 z8bY#<*%E|s6%kUd>r|_<_%9g&;0L)Y@uI&4wEo8>>=_v02cy=&OA}6|l>JBA_T54a z0fV2JRy%f`JgCSFe0)qyQ?h&6I^3ACP2Tcsu|KbLsJe7rJDkBMB);-7Q6!297^F2j zT&`Bw5kV54buHLAm0!;3W0Fs%zJ(54Z9h}d7`GJ`lC*RCmYAgN?z#|(s!hR{m0Svh zq-A-yGne+w?^s%a8OS|Nnsxqw~<{)RD}Z7O8gH_rG2Q1k5F zn^;$<*OD}Ea?qTeA}<2^0p&|}WSL`ZTnt%Cf(i5!J_=2m<4_yY!xO%PFae^p&m)J7 z&#US=jb~=>Rwk=SH>*V@=U_*ebwrDZ+s*>pnpoXicICa0)->EDl{k%EVBtm#GWq~h z?h& zjWRdoF`shg3F=%`3&K^m{cudKf#RuN0BX4JRUuja`LRmRD*E`E`cyIl1jnuznM9In z5PVc0Dxc4fD23DCU)TMT$5Xv?i z`(gKhbBvQJ)ZuS2@pSA;W@7E);@QH5LLW>CWWT7Q>|KvHqQ{SSF?;P}FyG_lwXhdc zG>Q2AKVwgde~qsZ4KiKG0Tp-PuK1!jM#66y7{3%Np!*{*(gXVxu7Nqm55gl> zh`y9_M!_}HGPXKB?Obk_VjBb@;|eyuS#ezFVMy7!Bv|nwoeNyjs*Dm?zxXK{ld<%S zD1`Sdu;p8zqyBE+2DSRL<+_tymku>W`C+sV8S56_Ab%1on<-yK+QG4d%f3|^vI*ADtR^yrVH&*KKO(@bC)3N z)3`* zmJVvxsO+SUv^SX81q#(J1a>%DaJg{I(elf%T<~ef19ESiWF${`NtQ*oR8X9vTK|sm zQ>m#7614nCsG`c7eF&>@c!yVg4(1}#uN~hvpgit)$*w+r`ZCP+cIY)?Soz~6MSXY| zw9u8pl;_+9zAf7FH{k`hE-Xbn(t{5npL~l!!Vllb{eO`8pAW}FQz6%VhcHY46VO-j zYcQwtd=oMqGYyWmPIg^o_RN#5myA<+V2zlz2Hz}{^%yW;FG3kU>kZ9Hcc0PyC~ai1 z7Ou{Ifmt`(HST(|0pCNUZLiKFWi5FE2H9qg^GTL?GwEkAkc{xpdwo;Bk~~W6`GKgw z+xhl%e@(+b+6ECqGyOIA9`xt3{4E_He@#da-t)mZhN5lf-V5<(!%uFK_8EI)Lq1g2g`!&e@yQKLHU-``nHB@L40+c`2Me@Z@&Prt*P@Z_p-s~JPh71 zb8~{{G{Q%9Uw%Cu4&Iw&W*M4zvhi#N7&+%-%)*V6_RDOdm zGWKM9w}m3GsFWxt{k794xJL>jKU4+^&1`00VS3xNHaw;a5fjotBHGdHSob=L?Ot&A zyB{GMuO!!#*B2AL5{b_W`SEeOf1I}e6E)ks0>f1T#-M*3nt~|tLwX?3B@^_`<>`OX zZU2*GOZhKEn?nAZZ4Zr9->Imso8Yi1YfKgRWEt9^&YE-e zm+HIV%Hq~u=Wl66HV429eyihzwEU)GWzfsM&c-1V!GkX7d^ZimHd5vn=BOO{l65oU 
zf9y8yMgRu&tQ$nRpX!UxPBUfgff#*V_+8G9kE9oSnIX69_&4KSN$C+hVX7d&)U&0I z+H5iH_;1bpuc6&Py_VAD zDOqy3n!5y}yQyU<__<>QMe5>Wf6f_M$&U#$fK9XWl6Y5ioZ-Xmav7*FJwkRvfjCSlUqA7clPteI*NHp0u%?OduV59|+Rt5KYr|XCluzf~JZ9n99Cp`*DpA#K$gO6FR z^b+e#MbwW>jVa{fP8`$VloFG=@%%^l(ZID`GTpN12-0_il0!6X{x*u1{4F==_44l| z`9efQvSPM|Te}aTC*V)AjbaKeJNkp05*87P#k}}$b30o)Il}KrYH;{eI)#@@_(PzI z`SvOH7Xpplo2P(MbF-hS<{tv`390ahB*V=Ck{k6kXQ>Bpr5BtuZ+$XCky% zEAThA;^uvM5A0)hgpf~8!U%@;!Kb?y#0Ah_x5Gsqj$x#SR<+w)%cV*o4`{Qn&o5y0 zhQ@sW6lHg*29~JhjP>n?fh4|nR-JHlK{imc*wp1;wxN1sM|Xtl4m`ib{-3wx7!P_&jQ`gO#0luS+RBgF>$Q(4FhdDZ;ov16 zVWWK4)ky4RQ6&5qhDw9XlLID@jC;;HJb!C&HKowaqR0`+(hK?2LqP?E8z) zH5XxNCFTTt9N3h18R-d*;msrM!gNm~+ zE&vs<;pgV=l};>b#{jtc?Jt-nK!A_YtN$oWXRyuoYAMz(GVCv8#4V)h^)|s}Q3y%@ zZ%QYaiVp*`Dm(9@mbMG#W&xwzw8DciskMS`YL4 z6{%v$Y`px3ayjTV$`tK4AG<(JdD@?N7kETvKkSF<*EJGxiL45=jeb;#e?a}cLF93498Nw z;a^7aT)z_{19pW_&u>bi9{80+;1I`2M3K4Y*K2H zU8rDs_YsfSo7Kto%QJ%`>hS%El>%30cj6((;m+b1%c7Rrr{jw_=EaMfS9#}L`o6#} zIoqTYaO2$_p}Ax`*?W7_fxf+5z5bB6e5;yY0p}zX>Av&dOE`07F!4ukfMXFv1dKWq4qqDQ%tOa{f(3BAgoaz1z5*Q1??Jx+O$= zSw0l~40;|@POf5mJ-G&lg`OqH{LX#Y0>29FWe2BqmiNPN`X~EwR@D@yu`Q;J7Pstk zYHO#V2>NHUlzRQ@znH)(sN%#lyc%O=ykliaG%-8Ma|{b}{>3S9gq ztwH*FfzP(UZQhrE+T-gZ$G?W)AwKXYAg~zx7%ld+s?Ph0siug8{BF8_6G{pL^G|bD tQ-p=Xh5t`I`oFII)1&{Rl!W}V{y(VGYKn+}f0u%NtI2PD0@=UO{{Sg)l6e3C literal 0 HcmV?d00001 diff --git a/yara-x/src/modules/pe/tests/testdata/ee82699133743266399721dadb609173e10a4086fc9d7606fac6e945c0c15ca2.out b/yara-x/src/modules/pe/tests/testdata/ee82699133743266399721dadb609173e10a4086fc9d7606fac6e945c0c15ca2.out new file mode 100644 index 000000000..254390bce --- /dev/null +++ b/yara-x/src/modules/pe/tests/testdata/ee82699133743266399721dadb609173e10a4086fc9d7606fac6e945c0c15ca2.out @@ -0,0 +1,261 @@ +is_pe: true +machine: MACHINE_I386 +subsystem: SUBSYSTEM_WINDOWS_GUI +os_version: + major: 4 + minor: 0 +subsystem_version: + major: 4 + minor: 0 +image_version: + major: 0 + minor: 0 +linker_version: + major: 2 + minor: 25 +opthdr_magic: 
IMAGE_NT_OPTIONAL_HDR32_MAGIC +characteristics: 41358 +dll_characteristics: 1 +timestamp: 708992537 +image_base: 8388608 +checksum: 0 +base_of_code: 4096 +base_of_data: 12288 +entry_point: 5272 +entry_point_raw: 8344 +section_alignment: 4096 +file_alignment: 512 +loader_flags: 0 +size_of_optional_header: 224 +size_of_code: 4608 +size_of_initialized_data: 3072 +size_of_uninitialized_data: 0 +size_of_image: 36864 +size_of_headers: 1024 +size_of_stack_reserve: 0 +size_of_stack_commit: 0 +size_of_heap_reserve: 1048576 +size_of_heap_commit: 4096 +pointer_to_symbol_table: 0 +win32_version_value: 0 +number_of_symbols: 0 +number_of_rva_and_sizes: 16 +number_of_sections: 7 +number_of_imported_functions: 28 +number_of_delayed_imported_functions: 0 +number_of_resources: 3 +number_of_version_infos: 0 +number_of_imports: 5 +number_of_delayed_imports: 0 +number_of_exports: 0 +number_of_signatures: 0 +sections: + - name: "CODE" + full_name: "CODE" + characteristics: 1610612768 + raw_data_size: 4608 + raw_data_offset: 1024 + virtual_address: 4096 + virtual_size: 4288 + pointer_to_relocations: 0 + pointer_to_line_numbers: 0 + number_of_relocations: 0 + number_of_line_numbers: 0 + - name: "DATA" + full_name: "DATA" + characteristics: 3221225536 + raw_data_size: 512 + raw_data_offset: 5632 + virtual_address: 12288 + virtual_size: 164 + pointer_to_relocations: 0 + pointer_to_line_numbers: 0 + number_of_relocations: 0 + number_of_line_numbers: 0 + - name: "BSS" + full_name: "BSS" + characteristics: 3221225472 + raw_data_size: 0 + raw_data_offset: 6144 + virtual_address: 16384 + virtual_size: 2037 + pointer_to_relocations: 0 + pointer_to_line_numbers: 0 + number_of_relocations: 0 + number_of_line_numbers: 0 + - name: ".idata" + full_name: ".idata" + characteristics: 3221225536 + raw_data_size: 1024 + raw_data_offset: 6144 + virtual_address: 20480 + virtual_size: 786 + pointer_to_relocations: 0 + pointer_to_line_numbers: 0 + number_of_relocations: 0 + number_of_line_numbers: 0 + - name: 
".rdata" + full_name: ".rdata" + characteristics: 1342177344 + raw_data_size: 512 + raw_data_offset: 7168 + virtual_address: 24576 + virtual_size: 19 + pointer_to_relocations: 0 + pointer_to_line_numbers: 0 + number_of_relocations: 0 + number_of_line_numbers: 0 + - name: ".reloc" + full_name: ".reloc" + characteristics: 1342177344 + raw_data_size: 512 + raw_data_offset: 7680 + virtual_address: 28672 + virtual_size: 472 + pointer_to_relocations: 0 + pointer_to_line_numbers: 0 + number_of_relocations: 0 + number_of_line_numbers: 0 + - name: ".rsrc" + full_name: ".rsrc" + characteristics: 1342177344 + raw_data_size: 512 + raw_data_offset: 8192 + virtual_address: 32768 + virtual_size: 512 + pointer_to_relocations: 0 + pointer_to_line_numbers: 0 + number_of_relocations: 0 + number_of_line_numbers: 0 +data_directories: + - virtual_address: 0 + size: 0 + - virtual_address: 20480 + size: 786 + - virtual_address: 32768 + size: 512 + - virtual_address: 0 + size: 0 + - virtual_address: 0 + size: 0 + - virtual_address: 28672 + size: 472 + - virtual_address: 0 + size: 0 + - virtual_address: 24576 + size: 19 + - virtual_address: 0 + size: 0 + - virtual_address: 0 + size: 0 + - virtual_address: 0 + size: 0 + - virtual_address: 0 + size: 0 + - virtual_address: 0 + size: 0 + - virtual_address: 0 + size: 0 + - virtual_address: 0 + size: 0 + - virtual_address: 0 + size: 0 +resource_timestamp: 1116228524 # 2005-05-16 07:28:44 UTC +resource_version: + major: 0 + minor: 0 +resources: + - length: 40 + rva: 33016 + offset: 8440 + type: RESOURCE_TYPE_RCDATA + language: 0 + name_string: "D\000E\000S\000C\000R\000I\000P\000T\000I\000O\000N\000" + - length: 16 + rva: 33056 + offset: 8480 + type: RESOURCE_TYPE_RCDATA + language: 0 + name_string: "D\000V\000C\000L\000A\000L\000" + - length: 60 + rva: 33072 + offset: 8496 + type: RESOURCE_TYPE_RCDATA + language: 0 + name_string: "P\000A\000C\000K\000A\000G\000E\000I\000N\000F\000O\000" +import_details: + - library_name: "kernel32.dll" + 
number_of_functions: 15 + functions: + - name: "DeleteCriticalSection" + rva: 20600 + - name: "LeaveCriticalSection" + rva: 20604 + - name: "EnterCriticalSection" + rva: 20608 + - name: "VirtualFree" + rva: 20612 + - name: "LocalFree" + rva: 20616 + - name: "GetCurrentThreadId" + rva: 20620 + - name: "GetStartupInfoA" + rva: 20624 + - name: "GetCommandLineA" + rva: 20628 + - name: "FreeLibrary" + rva: 20632 + - name: "ExitProcess" + rva: 20636 + - name: "WriteFile" + rva: 20640 + - name: "UnhandledExceptionFilter" + rva: 20644 + - name: "RtlUnwind" + rva: 20648 + - name: "RaiseException" + rva: 20652 + - name: "GetStdHandle" + rva: 20656 + - library_name: "user32.dll" + number_of_functions: 2 + functions: + - name: "GetKeyboardType" + rva: 20664 + - name: "MessageBoxA" + rva: 20668 + - library_name: "advapi32.dll" + number_of_functions: 3 + functions: + - name: "RegQueryValueExA" + rva: 20676 + - name: "RegOpenKeyExA" + rva: 20680 + - name: "RegCloseKey" + rva: 20684 + - library_name: "kernel32.dll" + number_of_functions: 6 + functions: + - name: "TlsSetValue" + rva: 20692 + - name: "TlsGetValue" + rva: 20696 + - name: "TlsFree" + rva: 20700 + - name: "TlsAlloc" + rva: 20704 + - name: "LocalFree" + rva: 20708 + - name: "LocalAlloc" + rva: 20712 + - library_name: "kernel32.dll" + number_of_functions: 2 + functions: + - name: "LoadLibraryA" + rva: 20720 + - name: "GetModuleFileNameA" + rva: 20724 +is_signed: false +overlay: + offset: 0 + size: 0 \ No newline at end of file From da986008ce733e74bc28fcd112300cc4a0519313 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 19 Jan 2024 19:38:39 +0100 Subject: [PATCH 22/28] fix: clippy error due to type too complex. 
--- yara-x/src/modules/pe/parser.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yara-x/src/modules/pe/parser.rs b/yara-x/src/modules/pe/parser.rs index 00573d1e5..aabf2ee68 100644 --- a/yara-x/src/modules/pe/parser.rs +++ b/yara-x/src/modules/pe/parser.rs @@ -32,6 +32,10 @@ use crate::modules::protos; type Error<'a> = nom::error::Error<&'a [u8]>; +/// Vector of tuples, each containing a DLL name and a vector with the +/// functions imported from that DLL. +type DllImports<'a> = Vec<(&'a str, Vec)>; + /// The initialization token needed by the authenticode_parser library must be /// created only once per process, and it must be done in a thread-safe way. static AUTHENTICODE_INIT_TOKEN: OnceLock< @@ -88,10 +92,10 @@ pub struct PE<'a> { /// contains information about each function imported from the DLL. The /// vector can contain multiple entries for the same DLL, each with a /// subset of the functions imported by from that DLL. - imports: OnceCell)>>>, + imports: OnceCell>>, /// Similar to `imports` but contains the delayed imports. - delayed_imports: OnceCell)>>>, + delayed_imports: OnceCell>>, /// Export information about this PE file. exports: OnceCell>>, From 923d42f54f1cd97dbb210831032726206d048add Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 19 Jan 2024 22:05:52 +0100 Subject: [PATCH 23/28] refactor: call `simplify_seq` from `seq_to_atoms` Also dedup the sequence before checking if its length is 256.
--- yara-x/src/re/thompson/compiler.rs | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/yara-x/src/re/thompson/compiler.rs b/yara-x/src/re/thompson/compiler.rs index fb429865c..08c605b7d 100644 --- a/yara-x/src/re/thompson/compiler.rs +++ b/yara-x/src/re/thompson/compiler.rs @@ -867,9 +867,7 @@ impl hir::Visitor for &mut Compiler { return Ok(()); } - let best_atoms = seq_to_atoms(simplify_seq( - self.lit_extractor.extract(hir), - )); + let best_atoms = seq_to_atoms(self.lit_extractor.extract(hir)); (best_atoms, code_loc) } @@ -883,9 +881,7 @@ impl hir::Visitor for &mut Compiler { return Ok(()); } - let best_atoms = seq_to_atoms(simplify_seq( - self.lit_extractor.extract(hir), - )); + let best_atoms = seq_to_atoms(self.lit_extractor.extract(hir)); (best_atoms, code_loc) } @@ -903,9 +899,7 @@ impl hir::Visitor for &mut Compiler { return Ok(()); } - let best_atoms = seq_to_atoms(simplify_seq( - self.lit_extractor.extract(hir), - )); + let best_atoms = seq_to_atoms(self.lit_extractor.extract(hir)); (best_atoms, code_loc) } @@ -960,9 +954,7 @@ impl hir::Visitor for &mut Compiler { return Ok(()); } - let best_atoms = seq_to_atoms(simplify_seq( - self.lit_extractor.extract(hir), - )); + let best_atoms = seq_to_atoms(self.lit_extractor.extract(hir)); (best_atoms, code_loc) } @@ -976,9 +968,7 @@ impl hir::Visitor for &mut Compiler { return Ok(()); } - let best_atoms = seq_to_atoms(simplify_seq( - self.lit_extractor.extract(hir), - )); + let best_atoms = seq_to_atoms(self.lit_extractor.extract(hir)); (best_atoms, code_loc) } @@ -1529,6 +1519,7 @@ impl Display for InstrSeq { } fn simplify_seq(mut seq: Seq) -> Seq { + seq.dedup(); // If the literal extractor produced exactly 256 atoms, and those atoms // have a common prefix that is one byte shorter than the longest atom, // we are in the case where we have 256 atoms that differ only in the @@ -1547,7 +1538,6 @@ fn simplify_seq(mut seq: Seq) -> Seq { } } } - seq.dedup(); seq } @@ 
-1620,7 +1610,9 @@ fn concat_seq(seqs: &[Seq]) -> Option { } fn seq_to_atoms(seq: Seq) -> Option> { - seq.literals().map(|literals| literals.iter().map(Atom::from).collect()) + simplify_seq(seq) + .literals() + .map(|literals| literals.iter().map(Atom::from).collect()) } /// A list of [`RegexpAtom`] that contains additional information, like the From 06db4f4e296c131db82bfd7f2f3e2295f61013b0 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Sun, 21 Jan 2024 22:16:41 +0100 Subject: [PATCH 24/28] fix: issue with detection of duplicate patterns. When a pattern was first declared by some rule and used with the `at` operator, and then declared again in second rule but used without the `at` operator, it wasn't added to the Aho-Corasick automata causing false negatives for the second rule. --- yara-x/src/compiler/context.rs | 31 ++++---- yara-x/src/compiler/emit.rs | 56 ++++++++------ yara-x/src/compiler/ir/ast2ir.rs | 71 +++++++++--------- yara-x/src/compiler/ir/mod.rs | 125 ++++++++++++++++++++++++------- yara-x/src/compiler/mod.rs | 113 +++++++++++++++------------- yara-x/src/tests/mod.rs | 19 +++++ 6 files changed, 262 insertions(+), 153 deletions(-) diff --git a/yara-x/src/compiler/context.rs b/yara-x/src/compiler/context.rs index 62da8617e..0511f2ba1 100644 --- a/yara-x/src/compiler/context.rs +++ b/yara-x/src/compiler/context.rs @@ -1,10 +1,12 @@ +use itertools::Itertools; use std::mem::size_of; use std::rc::Rc; use yara_x_parser::report::ReportBuilder; use yara_x_parser::Warning; -use crate::compiler::{ir, IdentId, PatternId, RuleId, RuleInfo}; +use crate::compiler::ir::PatternIdx; +use crate::compiler::{ir, IdentId, RuleId, RuleInfo}; use crate::string_pool::StringPool; use crate::symbols::{StackedSymbolTable, SymbolLookup}; use crate::types::Type; @@ -28,10 +30,9 @@ pub(in crate::compiler) struct CompileContext<'a, 'src, 'sym> { /// Information about the rules compiled so far. 
pub rules: &'a Vec, - /// A vector that contains the IR for the patterns declared in the current - /// rule, accompanied by their corresponding [`PatternId`]. - pub current_rule_patterns: - &'a mut Vec<(PatternId, ir::PatternInRule<'src>)>, + /// A slice that contains the IR for the patterns declared in the current + /// rule. + pub current_rule_patterns: &'a mut [ir::PatternInRule<'src>], /// Warnings generated during the compilation. pub warnings: &'a mut Vec, @@ -63,7 +64,7 @@ impl<'a, 'src, 'sym> CompileContext<'a, 'src, 'sym> { } /// Given a pattern identifier (e.g. `$a`, `#a`, `@a`) search for it in - /// the current rule and return its [`PatternID`]. + /// the current rule and return its position. /// /// Notice that this function accepts identifiers with any of the valid /// prefixes `$`, `#`, `@` and `!`. @@ -71,7 +72,7 @@ impl<'a, 'src, 'sym> CompileContext<'a, 'src, 'sym> { /// # Panics /// /// Panics if the current rule does not have the requested pattern. - pub fn get_pattern_id(&self, ident: &str) -> PatternId { + pub fn get_pattern_index(&self, ident: &str) -> PatternIdx { // Make sure that identifier starts with `$`, `#`, `@` or `!`. debug_assert!("$#@!".contains( ident @@ -80,15 +81,13 @@ impl<'a, 'src, 'sym> CompileContext<'a, 'src, 'sym> { .expect("identifier must be at least 1 character long") )); - for (pattern_id, pattern) in self.current_rule_patterns.iter() { - // Ignore the first character (`$`, `#`, `@` or `!`) while - // comparing the identifiers. - if pattern.identifier()[1..] == ident[1..] { - return *pattern_id; - } - } + let (position, _) = self + .current_rule_patterns + .iter() + .find_position(|pattern| pattern.identifier()[1..] == ident[1..]) + .expect("pattern not found"); - panic!("pattern `{}` not found", ident); + position.into() } /// Given a pattern identifier (e.g. 
`$a`, `#a`, `@a`) search for it in @@ -113,7 +112,7 @@ impl<'a, 'src, 'sym> CompileContext<'a, 'src, 'sym> { .expect("identifier must be at least 1 character long") )); - for (_, pattern) in self.current_rule_patterns.iter_mut() { + for pattern in self.current_rule_patterns.iter_mut() { if pattern.identifier()[1..] == ident[1..] { return pattern; } diff --git a/yara-x/src/compiler/emit.rs b/yara-x/src/compiler/emit.rs index 2b355a2cf..d0732fb3e 100644 --- a/yara-x/src/compiler/emit.rs +++ b/yara-x/src/compiler/emit.rs @@ -20,9 +20,12 @@ use yara_x_parser::ast::{RuleFlag, RuleFlags}; use crate::compiler::context::VarStack; use crate::compiler::ir::{ - Expr, ForIn, ForOf, Iterable, MatchAnchor, Of, OfItems, Quantifier, + Expr, ForIn, ForOf, Iterable, MatchAnchor, Of, OfItems, PatternIdx, + Quantifier, +}; +use crate::compiler::{ + LiteralId, PatternId, RegexpId, RuleId, RuleInfo, Var, VarStackFrame, }; -use crate::compiler::{LiteralId, RegexpId, RuleId, Var, VarStackFrame}; use crate::scanner::RuntimeObjectHandle; use crate::string_pool::{BStringPool, StringPool}; use crate::symbols::SymbolKind; @@ -171,6 +174,9 @@ pub(in crate::compiler) struct EmitContext<'a> { /// this tells which specific signature must be used. pub current_signature: Option, + /// Information about the rule whose condition is being emitted. + pub current_rule: &'a RuleInfo, + /// Table with all the symbols (functions, variables) used by WASM. pub wasm_symbols: &'a WasmSymbols, @@ -217,6 +223,14 @@ impl<'a> EmitContext<'a> { panic!("can't find function `{}`", fn_mangled_name) }) } + + /// Given the index of a pattern in a rule, returns its [`PatternId`]. + /// + /// The index of a pattern is the position of the pattern in the `strings` + /// section of the rule. + pub fn pattern_id(&self, index: PatternIdx) -> PatternId { + self.current_rule.patterns[index.as_usize()].1 + } } /// Emits WASM code of a rule. 
@@ -969,8 +983,8 @@ fn emit_pattern_match( let anchor = match expr { // When the pattern ID is known, simply push the ID into the stack. - Expr::PatternMatch { pattern_id, anchor } => { - instr.i32_const((*pattern_id).into()); + Expr::PatternMatch { pattern, anchor } => { + instr.i32_const(ctx.pattern_id(*pattern).into()); anchor } // When the pattern ID is not known, the ID is taken from a variable. @@ -1022,8 +1036,8 @@ fn emit_pattern_count( let range = match expr { // Cases where the pattern ID is known, simply push the ID into the // stack. - Expr::PatternCount { pattern_id, range } => { - instr.i32_const((*pattern_id).into()); + Expr::PatternCount { pattern, range } => { + instr.i32_const(ctx.pattern_id(*pattern).into()); range } Expr::PatternCountVar { symbol, range } => { @@ -1066,8 +1080,8 @@ fn emit_pattern_offset( let index = match expr { // Cases where the pattern ID is known, simply push the ID into the // stack. - Expr::PatternOffset { pattern_id, index } => { - instr.i32_const((*pattern_id).into()); + Expr::PatternOffset { pattern, index } => { + instr.i32_const(ctx.pattern_id(*pattern).into()); index } Expr::PatternOffsetVar { symbol, index } => { @@ -1114,8 +1128,8 @@ fn emit_pattern_length( let index = match expr { // Cases where the pattern ID is known, simply push the ID into the // stack. 
- Expr::PatternLength { pattern_id, index } => { - instr.i32_const((*pattern_id).into()); + Expr::PatternLength { pattern, index } => { + instr.i32_const(ctx.pattern_id(*pattern).into()); index } Expr::PatternLengthVar { symbol, index } => { @@ -1393,12 +1407,12 @@ fn emit_of_pattern_set( instr: &mut InstrSeqBuilder, of: &mut Of, ) { - let pattern_ids = cast!(&mut of.items, OfItems::PatternSet); + let patterns = cast!(&mut of.items, OfItems::PatternSet); - debug_assert!(!pattern_ids.is_empty()); + debug_assert!(!patterns.is_empty()); - let num_patterns = pattern_ids.len(); - let mut pattern_ids = pattern_ids.iter().cloned(); + let num_patterns = patterns.len(); + let mut patterns = patterns.iter().cloned(); let next_pattern_id = of.stack_frame.new_var(Type::Integer); // Make sure the pattern search phase is executed, as the `of` statement @@ -1421,9 +1435,9 @@ fn emit_of_pattern_set( // Get the i-th pattern ID, and store it in `next_pattern_id`. set_var(ctx, instr, next_pattern_id, |ctx, instr| { load_var(ctx, instr, i); - emit_switch(ctx, I64, instr, |_, instr| { - if let Some(pattern_id) = pattern_ids.next() { - instr.i64_const(pattern_id.into()); + emit_switch(ctx, I64, instr, |ctx, instr| { + if let Some(pattern) = patterns.next() { + instr.i64_const(ctx.pattern_id(pattern).into()); return true; } false @@ -1512,7 +1526,7 @@ fn emit_for_of_pattern_set( for_of: &mut ForOf, ) { let num_patterns = for_of.pattern_set.len(); - let mut pattern_ids = for_of.pattern_set.iter(); + let mut patterns = for_of.pattern_set.iter(); let next_pattern_id = for_of.variable; emit_for( @@ -1531,9 +1545,9 @@ fn emit_for_of_pattern_set( // Get the i-th pattern ID, and store it in `next_pattern_id`. 
set_var(ctx, instr, next_pattern_id, |ctx, instr| { load_var(ctx, instr, i); - emit_switch(ctx, I64, instr, |_, instr| { - if let Some(pattern_id) = pattern_ids.next() { - instr.i64_const((*pattern_id).into()); + emit_switch(ctx, I64, instr, |ctx, instr| { + if let Some(pattern) = patterns.next() { + instr.i64_const(ctx.pattern_id(*pattern).into()); return true; } false diff --git a/yara-x/src/compiler/ir/ast2ir.rs b/yara-x/src/compiler/ir/ast2ir.rs index ffac96b48..884898433 100644 --- a/yara-x/src/compiler/ir/ast2ir.rs +++ b/yara-x/src/compiler/ir/ast2ir.rs @@ -15,11 +15,9 @@ use crate::compiler::ir::hex2hir::hex_pattern_hir_from_ast; use crate::compiler::ir::{ Expr, ForIn, ForOf, FuncCall, Iterable, LiteralPattern, Lookup, MatchAnchor, Of, OfItems, Pattern, PatternFlagSet, PatternFlags, - PatternInRule, Quantifier, Range, RegexpPattern, -}; -use crate::compiler::{ - CompileContext, CompileError, CompileErrorInfo, PatternId, + PatternIdx, PatternInRule, Quantifier, Range, RegexpPattern, }; +use crate::compiler::{CompileContext, CompileError, CompileErrorInfo}; use crate::modules::BUILTIN_MODULES; use crate::re; use crate::re::parser::Error; @@ -104,12 +102,13 @@ pub(in crate::compiler) fn text_pattern_from_ast<'src>( Ok(PatternInRule { identifier: pattern.identifier.name, - anchored_at: None, pattern: Pattern::Literal(LiteralPattern { flags, xor_range, base64_alphabet: base64_alphabet.map(String::from), base64wide_alphabet: base64wide_alphabet.map(String::from), + anchored_at: None, + text: pattern.text.as_ref().into(), }), }) @@ -121,10 +120,10 @@ pub(in crate::compiler) fn hex_pattern_from_ast<'src>( ) -> Result, CompileError> { Ok(PatternInRule { identifier: pattern.identifier.name, - anchored_at: None, pattern: Pattern::Regexp(RegexpPattern { flags: PatternFlagSet::from(PatternFlags::Ascii), hir: re::hir::Hir::from(hex_pattern_hir_from_ast(pattern)), + anchored_at: None, }), }) } @@ -195,8 +194,11 @@ pub(in crate::compiler) fn regexp_pattern_from_ast<'src>( 
Ok(PatternInRule { identifier: pattern.identifier.name, - anchored_at: None, - pattern: Pattern::Regexp(RegexpPattern { flags, hir }), + pattern: Pattern::Regexp(RegexpPattern { + flags, + hir, + anchored_at: None, + }), }) } @@ -422,7 +424,7 @@ pub(in crate::compiler) fn expr_from_ast( } Ok(Expr::PatternMatch { - pattern_id: ctx.get_pattern_id(p.identifier.name), + pattern: ctx.get_pattern_index(p.identifier.name), anchor, }) } @@ -449,14 +451,14 @@ pub(in crate::compiler) fn expr_from_ast( (_, Some(range)) => { ctx.get_pattern_mut(p.name).make_non_anchorable(); Ok(Expr::PatternCount { - pattern_id: ctx.get_pattern_id(p.name), + pattern: ctx.get_pattern_index(p.name), range: Some(range_from_ast(ctx, range)?), }) } (_, None) => { ctx.get_pattern_mut(p.name).make_non_anchorable(); Ok(Expr::PatternCount { - pattern_id: ctx.get_pattern_id(p.name), + pattern: ctx.get_pattern_index(p.name), range: None, }) } @@ -487,7 +489,7 @@ pub(in crate::compiler) fn expr_from_ast( (_, Some(index)) => { ctx.get_pattern_mut(p.name).make_non_anchorable(); Ok(Expr::PatternOffset { - pattern_id: ctx.get_pattern_id(p.name), + pattern: ctx.get_pattern_index(p.name), index: Some(Box::new(integer_in_range_from_ast( ctx, index, @@ -498,7 +500,7 @@ pub(in crate::compiler) fn expr_from_ast( (_, None) => { ctx.get_pattern_mut(p.name).make_non_anchorable(); Ok(Expr::PatternOffset { - pattern_id: ctx.get_pattern_id(p.name), + pattern: ctx.get_pattern_index(p.name), index: None, }) } @@ -529,7 +531,7 @@ pub(in crate::compiler) fn expr_from_ast( (_, Some(index)) => { ctx.get_pattern_mut(p.name).make_non_anchorable(); Ok(Expr::PatternLength { - pattern_id: ctx.get_pattern_id(p.name), + pattern: ctx.get_pattern_index(p.name), index: Some(Box::new(integer_in_range_from_ast( ctx, index, @@ -540,7 +542,7 @@ pub(in crate::compiler) fn expr_from_ast( (_, None) => { ctx.get_pattern_mut(p.name).make_non_anchorable(); Ok(Expr::PatternLength { - pattern_id: ctx.get_pattern_id(p.name), + pattern: 
ctx.get_pattern_index(p.name), index: None, }) } @@ -645,9 +647,9 @@ fn of_expr_from_ast( } // `x of them`, `x of ($a*, $b)` ast::OfItems::PatternSet(pattern_set) => { - let pattern_ids = pattern_set_from_ast(ctx, pattern_set)?; - let num_patterns = pattern_ids.len(); - (OfItems::PatternSet(pattern_ids), num_patterns) + let pattern_indexes = pattern_set_from_ast(ctx, pattern_set)?; + let num_patterns = pattern_indexes.len(); + (OfItems::PatternSet(pattern_indexes), num_patterns) } }; @@ -1032,17 +1034,16 @@ fn quantifier_from_ast( fn pattern_set_from_ast( ctx: &mut CompileContext, pattern_set: &ast::PatternSet, -) -> Result, CompileError> { - let pattern_ids = match pattern_set { +) -> Result, CompileError> { + let pattern_indexes = match pattern_set { // `x of them` ast::PatternSet::Them { span } => { - let pattern_ids: Vec = ctx - .current_rule_patterns - .iter() - .map(|(pattern_id, _)| *pattern_id) - .collect(); + let pattern_indexes: Vec = + (0..ctx.current_rule_patterns.len()) + .map(|i| i.into()) + .collect(); - if pattern_ids.is_empty() { + if pattern_indexes.is_empty() { return Err(CompileError::from( CompileErrorInfo::empty_pattern_set( ctx.report_builder, @@ -1056,11 +1057,11 @@ fn pattern_set_from_ast( } // Make all the patterns in the set non-anchorable. 
- for (_, pattern) in ctx.current_rule_patterns.iter_mut() { + for pattern in ctx.current_rule_patterns.iter_mut() { pattern.make_non_anchorable(); } - pattern_ids + pattern_indexes } // `x of ($a*, $b)` ast::PatternSet::Set(ref set) => { @@ -1068,7 +1069,7 @@ fn pattern_set_from_ast( if !ctx .current_rule_patterns .iter() - .any(|(_, p)| item.matches(p.identifier())) + .any(|pattern| item.matches(pattern.identifier())) { return Err(CompileError::from( CompileErrorInfo::empty_pattern_set( @@ -1082,21 +1083,23 @@ fn pattern_set_from_ast( )); } } - let mut pattern_ids = Vec::new(); - for (pattern_id, pattern) in ctx.current_rule_patterns.iter_mut() { + let mut pattern_indexes = Vec::new(); + for (i, pattern) in + ctx.current_rule_patterns.iter_mut().enumerate() + { // Iterate over the patterns in the set (e.g: $foo, $foo*) and // check if some of them matches the identifier. if set.iter().any(|p| p.matches(pattern.identifier())) { - pattern_ids.push(*pattern_id); + pattern_indexes.push(i.into()); // All the patterns in the set are made non-anchorable. pattern.make_non_anchorable(); } } - pattern_ids + pattern_indexes } }; - Ok(pattern_ids) + Ok(pattern_indexes) } fn func_call_from_ast( diff --git a/yara-x/src/compiler/ir/mod.rs b/yara-x/src/compiler/ir/mod.rs index 4be1c6d35..9611b8fdc 100644 --- a/yara-x/src/compiler/ir/mod.rs +++ b/yara-x/src/compiler/ir/mod.rs @@ -37,7 +37,6 @@ use bstr::BString; use serde::{Deserialize, Serialize}; use crate::compiler::context::{Var, VarStackFrame}; -use crate::compiler::PatternId; use crate::symbols::Symbol; use crate::types::{Type, TypeValue, Value}; @@ -87,7 +86,6 @@ bitmask! 
{ pub(in crate::compiler) struct PatternInRule<'src> { identifier: &'src str, pattern: Pattern, - anchored_at: Option, } impl<'src> PatternInRule<'src> { @@ -96,11 +94,6 @@ impl<'src> PatternInRule<'src> { self.identifier } - #[inline] - pub fn anchored_at(&self) -> Option { - self.anchored_at - } - #[inline] pub fn into_pattern(self) -> Pattern { self.pattern @@ -111,9 +104,14 @@ impl<'src> PatternInRule<'src> { &self.pattern } + #[inline] + pub fn anchored_at(&self) -> Option { + self.pattern.anchored_at() + } + /// Anchor the pattern to a given offset. This means that the pattern can /// match only at that offset and nowhere else. This is a no-op for - /// regexp patterns, and for patterns that are flagged as non-anchorable. + /// patterns that are flagged as non-anchorable. /// /// Also, if this function is called twice with different offsets, the /// pattern becomes non-anchorable because it can't be anchored to two @@ -123,19 +121,7 @@ impl<'src> PatternInRule<'src> { /// in order to indicate that the pattern (the `$a` pattern in this case) /// can match only at a fixed offset. pub fn anchor_at(&mut self, offset: usize) { - match self.anchored_at { - Some(o) if o != offset => { - self.anchored_at = None; - self.pattern.flags_mut().set(PatternFlags::NonAnchorable); - } - None => { - if !self.pattern.flags().contains(PatternFlags::NonAnchorable) - { - self.anchored_at = Some(offset); - } - } - _ => {} - } + self.pattern.anchor_at(offset); } /// Make the pattern non-anchorable. Any existing anchor is removed and @@ -148,8 +134,7 @@ impl<'src> PatternInRule<'src> { /// the number of occurrences of `$a`), makes `$a` non-anchorable because /// we need to find all occurrences of `$a`.
pub fn make_non_anchorable(&mut self) { - self.pattern.flags_mut().set(PatternFlags::NonAnchorable); - self.anchored_at = None; + self.pattern.make_non_anchorable(); } } @@ -188,12 +173,72 @@ impl Pattern { Pattern::Regexp(regexp) => &mut regexp.flags, } } + + #[inline] + pub fn anchored_at(&self) -> Option { + match self { + Pattern::Literal(literal) => literal.anchored_at, + Pattern::Regexp(regexp) => regexp.anchored_at, + } + } + + /// Anchor the pattern to a given offset. This means that the pattern can + /// match only at that offset and nowhere else. This is a no-op for + /// patterns that are flagged as non-anchorable. + /// + /// Also, if this function is called twice with different offsets, the + /// pattern becomes non-anchorable because it can't be anchored to two + /// different offsets. + /// + /// This is used when the condition contains an expression like `$a at 0` + /// in order to indicate that the pattern (the `$a` pattern in this case) + /// can match only at a fixed offset. + pub fn anchor_at(&mut self, offset: usize) { + let is_anchorable = + !self.flags().contains(PatternFlags::NonAnchorable); + + let anchored_at = match self { + Pattern::Literal(literal) => &mut literal.anchored_at, + Pattern::Regexp(regexp) => &mut regexp.anchored_at, + }; + + match anchored_at { + Some(o) if *o != offset => { + *anchored_at = None; + self.flags_mut().set(PatternFlags::NonAnchorable); + } + None => { + if is_anchorable { + *anchored_at = Some(offset); + } + } + _ => {} + } + } + + /// Make the pattern non-anchorable. Any existing anchor is removed and + /// future calls to [`PatternInRule::anchor_at`] are ignored. + /// + /// This function is used to indicate that a certain pattern can't be + /// anchored at any fixed offset because it is used in ways that require + /// finding all the possible matches.
For example, in a condition + /// like `#a > 0 and $a at 0`, the use of `#a` (which returns + /// the number of occurrences of `$a`), makes `$a` non-anchorable because + /// we need to find all occurrences of `$a`. + pub fn make_non_anchorable(&mut self) { + match self { + Pattern::Literal(literal) => literal.anchored_at = None, + Pattern::Regexp(regexp) => regexp.anchored_at = None, + }; + self.flags_mut().set(PatternFlags::NonAnchorable); + } } #[derive(Clone, Eq, Hash, PartialEq)] pub(in crate::compiler) struct LiteralPattern { pub flags: PatternFlagSet, pub text: BString, + pub anchored_at: Option, pub xor_range: Option>, pub base64_alphabet: Option, pub base64wide_alphabet: Option, @@ -203,6 +248,28 @@ pub(in crate::compiler) struct LiteralPattern { pub(in crate::compiler) struct RegexpPattern { pub flags: PatternFlagSet, pub hir: re::hir::Hir, + pub anchored_at: Option, +} + +/// The index of a pattern in the rule that declares it. +/// +/// The first pattern in the rule has index 0, the second has index 1, and +/// so on. +#[derive(Debug, Clone, Copy)] +pub(in crate::compiler) struct PatternIdx(usize); + +impl PatternIdx { + #[inline] + pub fn as_usize(&self) -> usize { + self.0 + } +} + +impl From for PatternIdx { + #[inline] + fn from(value: usize) -> Self { + Self(value) + } } /// Intermediate representation (IR) for an expression. @@ -397,7 +464,7 @@ pub(in crate::compiler) enum Expr { /// Pattern match expression (e.g. `$a`) PatternMatch { - pattern_id: PatternId, + pattern: PatternIdx, anchor: MatchAnchor, }, @@ -409,7 +476,7 @@ pub(in crate::compiler) enum Expr { /// Pattern count expression (e.g. `#a`, `#a in (0..10)`) PatternCount { - pattern_id: PatternId, + pattern: PatternIdx, range: Option, }, @@ -421,7 +488,7 @@ pub(in crate::compiler) enum Expr { /// Pattern offset expression (e.g.
`@a`, `@a[1]`) PatternOffset { - pattern_id: PatternId, + pattern: PatternIdx, index: Option>, }, @@ -433,7 +500,7 @@ pub(in crate::compiler) enum Expr { /// Pattern length expression (e.g. `!a`, `!a[1]`) PatternLength { - pattern_id: PatternId, + pattern: PatternIdx, index: Option>, }, @@ -498,7 +565,7 @@ pub(in crate::compiler) struct Of { pub(in crate::compiler) struct ForOf { pub quantifier: Quantifier, pub variable: Var, - pub pattern_set: Vec, + pub pattern_set: Vec, pub condition: Expr, pub stack_frame: VarStackFrame, } @@ -557,7 +624,7 @@ impl MatchAnchor { /// Items in a `of` expression. #[derive(Debug)] pub(in crate::compiler) enum OfItems { - PatternSet(Vec), + PatternSet(Vec), BoolExprTuple(Vec), } diff --git a/yara-x/src/compiler/mod.rs b/yara-x/src/compiler/mod.rs index b97eacb3e..362ecd449 100644 --- a/yara-x/src/compiler/mod.rs +++ b/yara-x/src/compiler/mod.rs @@ -17,6 +17,7 @@ use std::{fmt, iter, u32}; use bincode::Options; use bitmask::bitmask; use bstr::ByteSlice; +use itertools::izip; #[cfg(feature = "logging")] use log::*; use regex_syntax::hir; @@ -538,8 +539,7 @@ impl<'a> Compiler<'a> { // Sub-patterns that are anchored at some fixed offset are not added to // the Aho-Corasick automata. Instead their IDs are added to the - // sub_patterns_anchored_at_0 list, together with the offset they are - // anchored to. + // anchored_sub_patterns list. if let SubPattern::Literal { anchored_at: Some(_), .. } = sub_pattern { self.anchored_sub_patterns.push(sub_pattern_id); } else { @@ -714,59 +714,34 @@ impl<'a> Compiler<'a> { let snapshot = self.take_snapshot(); // Convert the patterns from AST to IR. - let patterns_in_rule = + let mut patterns_in_rule = patterns_from_ast(&self.report_builder, rule.patterns.as_ref())?; - // Create vector with pairs (IdentId, PatternId). - let mut ident_and_pattern_ids = - Vec::with_capacity(patterns_in_rule.len()); - - // Create vector with pairs (PatternId, Pattern). 
- let mut patterns_with_ids = Vec::with_capacity(patterns_in_rule.len()); - let mut pending_patterns = HashSet::new(); - - for pattern in patterns_in_rule { - // Check if this pattern has been declared before, in this rule or - // in some other rule. In such cases the pattern ID is re-used, we - // don't need to process (i.e: extract atoms and add them to - // Aho-Corasick automaton) the pattern again. - let pattern_id = - match self.patterns.entry(pattern.pattern().clone()) { - // The pattern already exists, return the existing ID. - Entry::Occupied(entry) => *entry.get(), - // The pattern didn't exist. - Entry::Vacant(entry) => { - let pattern_id = self.next_pattern_id; - self.next_pattern_id.incr(1); - pending_patterns.insert(pattern_id); - entry.insert(pattern_id); - pattern_id - } - }; - // Save pattern identifier (e.g: $a) in the pool of identifiers - // or reuse the IdentId if the identifier has been used already. - ident_and_pattern_ids.push(( - self.ident_pool.get_or_intern(pattern.identifier()), - pattern_id, - )); - - patterns_with_ids.push((pattern_id, pattern)); - } - + // The RuleId for the new rule is the current length of `self.rules`. The + // first rule has RuleId = 0. let rule_id = RuleId(self.rules.len() as i32); + // Add the new rule to `self.rules`. The only information about the + // rule that we don't have right now is the PatternId corresponding to + // each pattern, that's why the `patterns` field is initialized as + // an empty vector. The PatternId corresponding to each pattern can't + // be determined until `bool_expr_from_ast` processes the condition + // and determines which patterns are anchored, because this information + // is required for detecting duplicate patterns that can share the same + // PatternId.
self.rules.push(RuleInfo { namespace_id: self.current_namespace.id, namespace_ident_id: self.current_namespace.ident_id, ident_id: self.ident_pool.get_or_intern(rule.identifier.name), ident_span: rule.identifier.span, - patterns: ident_and_pattern_ids, + patterns: vec![], is_global: rule.flags.contains(RuleFlag::Global), is_private: rule.flags.contains(RuleFlag::Private), }); // Convert the rule condition's AST to the intermediate representation - // (IR). + // (IR). Also updates the patterns with information about whether they + // are anchored or not. let condition = bool_expr_from_ast( &mut CompileContext { current_symbol_table: None, @@ -774,7 +749,7 @@ impl<'a> Compiler<'a> { ident_pool: &mut self.ident_pool, report_builder: &self.report_builder, rules: &self.rules, - current_rule_patterns: &mut patterns_with_ids, + current_rule_patterns: patterns_in_rule.as_mut_slice(), warnings: &mut self.warnings, vars: VarStack::new(), }, @@ -809,19 +784,52 @@ impl<'a> Compiler<'a> { // No other symbol with the same identifier should exist. assert!(existing_symbol.is_none()); - let patterns_with_ids_and_span = iter::zip( - patterns_with_ids, - rule.patterns.iter().flatten().map(|p| p.span()), - ); + let mut pattern_ids = Vec::with_capacity(patterns_in_rule.len()); + let mut pending_patterns = HashSet::new(); + + let current_rule = self.rules.last_mut().unwrap(); + + for pattern in &patterns_in_rule { + // Check if this pattern has been declared before, in this rule or + // in some other rule. In such cases the pattern ID is re-used and + // we don't need to process (i.e: extract atoms and add them to + // Aho-Corasick automaton) the pattern again. Two patterns are + // considered equal if they are exactly the same, including any + // modifiers associated to the pattern, and both are non-anchored + // or anchored at the same file offset. + let pattern_id = + match self.patterns.entry(pattern.pattern().clone()) { + // The pattern already exists, return the existing ID. 
+ Entry::Occupied(entry) => *entry.get(), + // The pattern didn't exist. + Entry::Vacant(entry) => { + let pattern_id = self.next_pattern_id; + self.next_pattern_id.incr(1); + pending_patterns.insert(pattern_id); + entry.insert(pattern_id); + pattern_id + } + }; + + current_rule.patterns.push(( + self.ident_pool.get_or_intern(pattern.identifier()), + pattern_id, + )); + + pattern_ids.push(pattern_id); + } // Process the patterns in the rule. This extract the best atoms // from each pattern, adding them to the `self.atoms` vector, it // also creates one or more sub-patterns per pattern and add them // to `self.sub_patterns` - for ((pattern_id, pattern), span) in patterns_with_ids_and_span { - let pending = pending_patterns.contains(&pattern_id); - if pending || pattern.anchored_at().is_some() { - self.current_pattern_id = pattern_id; + for (pattern_id, pattern, span) in izip!( + pattern_ids.iter(), + patterns_in_rule.into_iter(), + rule.patterns.iter().flatten().map(|p| p.span()) + ) { + if pending_patterns.contains(pattern_id) { + self.current_pattern_id = *pattern_id; let anchored_at = pattern.anchored_at(); match pattern.into_pattern() { Pattern::Literal(pattern) => { @@ -836,9 +844,7 @@ impl<'a> Compiler<'a> { } } }; - if pending { - pending_patterns.remove(&pattern_id); - } + pending_patterns.remove(pattern_id); } } @@ -848,6 +854,7 @@ impl<'a> Compiler<'a> { // that if this function fails after emitting the code, some code debris // will remain in the WASM module. 
let mut ctx = EmitContext { + current_rule: self.rules.last_mut().unwrap(), current_signature: None, lit_pool: &mut self.lit_pool, regexp_pool: &mut self.regexp_pool, diff --git a/yara-x/src/tests/mod.rs b/yara-x/src/tests/mod.rs index 2729c361a..0e660146e 100644 --- a/yara-x/src/tests/mod.rs +++ b/yara-x/src/tests/mod.rs @@ -1652,6 +1652,25 @@ fn match_at() { b"fofofofo" ); + rule_true!( + r#" + rule test1 { + strings: + $a = "bar" + condition: + $a at 0 + } + + rule test2 { + strings: + $a = "bar" + condition: + $a + } + "#, + b"foobar" + ); + #[cfg(feature = "test_proto2-module")] rule_false!( r#" From 219eb3cfc554b771719ea7f5a01b93d829c15e4b Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Mon, 22 Jan 2024 14:04:48 +0100 Subject: [PATCH 25/28] fix(cli): print error when the target path for the `scan` command does not exist --- yara-x-cli/src/walk.rs | 50 ++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/yara-x-cli/src/walk.rs b/yara-x-cli/src/walk.rs index 295ff5f16..463e8291e 100644 --- a/yara-x-cli/src/walk.rs +++ b/yara-x-cli/src/walk.rs @@ -108,32 +108,29 @@ impl<'a> DirWalker<'a> { F: FnMut(&Path) -> anyhow::Result<()>, E: FnMut(anyhow::Error) -> anyhow::Result<()>, { - if path.is_file() { - match path - .metadata() - .with_context(|| format!("can't open {}", path.display())) - { - Ok(metadata) => { - if self.pass_metadata_filter(metadata) { - if let Err(err) = f(path) { - return e(err); - } - } - } - Err(err) => { + let metadata = match path + .metadata() + .with_context(|| format!("can't open {}", path.display())) + { + Ok(metadata) => metadata, + Err(err) => { + return e(err); + } + }; + + if metadata.is_file() { + if self.pass_metadata_filter(metadata) { + if let Err(err) = f(path) { return e(err); } }; return Ok(()); } - let path = match path - .canonicalize() - .with_context(|| format!("can't open {}", path.display())) - { + let path = match path.canonicalize() { Ok(path) => path, 
Err(err) => { - return e(err); + return e(err.into()); } }; @@ -365,12 +362,6 @@ impl<'a> ParDirWalker<'a> { })); } - // Drop the `msg_send` so that `msg_recv` is closed. This won't - // happen at this point, because there are scan threads retaining - // copies of `msg_send`, however, when all the scan threads end - // `msg_recv` is closed. - drop(msg_send); - // Span a thread that walks the directory and puts file paths in // the channel. threads.push(s.spawn(move |_| { @@ -383,8 +374,15 @@ impl<'a> ParDirWalker<'a> { if err.is::>() { return Err(err); } - // For other types of error (e.g: permission denied) - // keep walking the directory tree. + + // Invoke the error callback and abort the walk if the + // callback returns error. + if let Err(err) = e(err, &msg_send) { + let _ = msg_send.send(Message::Abort); + return Err(err); + } + + // Keep walking the directory tree. Ok(()) }, ); From 89c8e2461071f00e5211bf2914701b2f8546c628 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Jan 2024 23:54:28 +0100 Subject: [PATCH 26/28] chore(deps): bump shlex from 1.2.0 to 1.3.0 (#74) Bumps [shlex](https://github.com/comex/rust-shlex) from 1.2.0 to 1.3.0. - [Changelog](https://github.com/comex/rust-shlex/blob/master/CHANGELOG.md) - [Commits](https://github.com/comex/rust-shlex/commits) --- updated-dependencies: - dependency-name: shlex dependency-type: indirect ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bce3a9107..6ab8a0246 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2940,9 +2940,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" From 42dd9d4e5c910f531ec7fc7cde26b7b691d91985 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 23 Jan 2024 14:01:15 +0100 Subject: [PATCH 27/28] fix: issue when 0 or negative number is passed as index in expressions like `@a[index]` and `!a[index]` --- yara-x/src/tests/mod.rs | 38 ++++++++++++++++++++++++++++++++++++++ yara-x/src/wasm/mod.rs | 12 ++++++------ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/yara-x/src/tests/mod.rs b/yara-x/src/tests/mod.rs index 0e660146e..142d0080c 100644 --- a/yara-x/src/tests/mod.rs +++ b/yara-x/src/tests/mod.rs @@ -2035,6 +2035,25 @@ fn match_offset() { "#, b"foobarfoobar" ); + + #[cfg(feature = "test_proto2-module")] + rule_true!( + r#" + import "test_proto2" + + rule test { + strings: + $a = "foo" + condition: + // The index in @a[] must be 1 or more, if not + // the result must be undefined. We use test_proto2.add(0,0) + // because using a literal causes a compilation error when + // the compiler notices that the index is 0. 
+ not defined @a[test_proto2.add(0,0)] + } + "#, + b"foo" + ); } #[test] @@ -2112,6 +2131,25 @@ fn match_length() { "#, b"foobarfoobar" ); + + #[cfg(feature = "test_proto2-module")] + rule_true!( + r#" + import "test_proto2" + + rule test { + strings: + $a = "foo" + condition: + // The index in !a[] must be 1 or more, if not + // the result must be undefined. We use test_proto2.add(0,0) + // because using a literal causes a compilation error when + // the compiler notices that the index is 0. + not defined !a[test_proto2.add(0,0)] + } + "#, + b"foo" + ); } #[test] diff --git a/yara-x/src/wasm/mod.rs b/yara-x/src/wasm/mod.rs index 6ece5037f..6016cbc41 100644 --- a/yara-x/src/wasm/mod.rs +++ b/yara-x/src/wasm/mod.rs @@ -852,9 +852,9 @@ pub(crate) fn pat_length( index: i64, ) -> Option { if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) { - // Make sure that index >= 1. - debug_assert!(index >= 1); - let m = matches.get(index as usize - 1)?; + let index: usize = index.try_into().ok()?; + // Index is 1-based, convert it to 0-based before calling `matches.get` + let m = matches.get(index.checked_sub(1)?)?; Some(ExactSizeIterator::len(&m.range) as i64) } else { None @@ -873,9 +873,9 @@ pub(crate) fn pat_offset( index: i64, ) -> Option { if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) { - // Make sure that index >= 1. - debug_assert!(index >= 1); - let m = matches.get(index as usize - 1)?; + let index: usize = index.try_into().ok()?; + // Index is 1-based, convert it to 0-based before calling `matches.get` + let m = matches.get(index.checked_sub(1)?)?; Some(m.range.start as i64) } else { None From c94d41a0a23167a4a18da9c67bed8b725e1cfc0d Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 23 Jan 2024 14:19:23 +0100 Subject: [PATCH 28/28] refactor: use the `Rules::deserialize_from` API in `exec_scan`. 
--- yara-x-cli/src/commands/scan.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/yara-x-cli/src/commands/scan.rs b/yara-x-cli/src/commands/scan.rs index 4100b120e..aaaa77de0 100644 --- a/yara-x-cli/src/commands/scan.rs +++ b/yara-x-cli/src/commands/scan.rs @@ -1,6 +1,5 @@ use std::cmp::min; use std::fs::File; -use std::io::Read; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; @@ -120,16 +119,10 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> { let rules_path = rules_path.next().unwrap(); - let mut file = File::open(rules_path) + let file = File::open(rules_path) .with_context(|| format!("can not open {:?}", &rules_path))?; - let mut data = Vec::new(); - - File::read_to_end(&mut file, &mut data) - .with_context(|| format!("can not read {:?}", &rules_path))?; - - // TODO: implement Rules::deserialize_from reader - let rules = Rules::deserialize(data.as_slice())?; + let rules = Rules::deserialize_from(file)?; // If the user is defining external variables, make sure that these // variables are valid. A scanner is created only with the purpose