From cc4e6849f23e392fe7c19f621dddc5435a9dfc9f Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Sun, 22 Sep 2024 12:12:17 +0200 Subject: [PATCH] feat: add `native-code-serialization` feature When this feature is enabled, serialized rules include pre-compiled code for the current platform. This reduces the load-time for the compiled rules, but increases the size of serialized rules. --- lib/Cargo.toml | 12 ++++++ lib/src/compiler/errors.rs | 4 ++ lib/src/compiler/rules.rs | 82 +++++++++++++++++++++++++++++++++----- 3 files changed, 87 insertions(+), 11 deletions(-) diff --git a/lib/Cargo.toml b/lib/Cargo.toml index abdb70c8..0279c862 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -52,6 +52,18 @@ protoc = [] # Enables debug logs. logging = ["dep:log"] +# When enabled, the serialization of compiled rules includes native code for +# the platform in which the rules were compiled. This reduces the load time, +# as the native code is already included in the serialized rules and doesn't +# need to be generated. On the other hand, it increases the size of the +# serialized rules. If rules that were serialized with native code for one +# platform are deserialized on a different platform, the native code included +# in the serialized rules is ignored and generated again for the current +# platform. +# +# This feature is disabled by default. +native-code-serialization = [] + # Enables parallel compilation of WASM code. When compiling large number of # rules this noticeable reduces compilation time. However, this creates new # threads, which can be problematic in some scenarios. See: diff --git a/lib/src/compiler/errors.rs b/lib/src/compiler/errors.rs index ddedc22a..1510952f 100644 --- a/lib/src/compiler/errors.rs +++ b/lib/src/compiler/errors.rs @@ -27,6 +27,10 @@ pub enum SerializationError { /// I/O error while trying to read or write serialized data. #[error(transparent)] IoError(#[from] io::Error), + + /// Error occurred while deserializing WASM code. 
+ #[error("invalid YARA-X compiled rules file")] + InvalidWASM(#[from] anyhow::Error), } /// Error returned by [`crate::Compiler::emit_wasm_file`]. diff --git a/lib/src/compiler/rules.rs b/lib/src/compiler/rules.rs index 39df195d..aeb58a9e 100644 --- a/lib/src/compiler/rules.rs +++ b/lib/src/compiler/rules.rs @@ -9,7 +9,7 @@ use bincode::Options; #[cfg(feature = "logging")] use log::*; use regex_automata::meta::Regex; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::compiler::atoms::Atom; use crate::compiler::errors::SerializationError; @@ -52,7 +52,12 @@ pub struct Rules { pub(in crate::compiler) wasm_mod: Vec, /// WASM module already compiled into native code for the current platform. - #[serde(skip)] + /// When the rules are serialized, the compiled module is included only if + /// the `native-code-serialization` feature is enabled. + #[serde( + serialize_with = "serialize_wasm_mod", + deserialize_with = "deserialize_wasm_mod" + )] pub(in crate::compiler) compiled_wasm_mod: Option, /// Vector with the names of all the imported modules. The vector contains @@ -163,17 +168,24 @@ impl Rules { .with_varint_encoding() .deserialize::(&bytes[magic.len()..])?; - // Compile the WASM module for the current platform. This panics - // if the WASM code is invalid, which should not happen as the code is - // emitted by YARA itself. If this ever happens is probably because - // wrong WASM code is being emitted. - rules.compiled_wasm_mod = Some( - wasmtime::Module::from_binary( + // `rules.compiled_wasm_mod` can be `None` for two reasons: + // + // 1- The rules were serialized without compiled rules (i.e. the + // `native-code-serialization` feature was disabled, which is + // the default). + // + // 2- The rules were serialized with compiled rules, but they were + // compiled for a different platform, and `deserialize_wasm_mod` + // returned `None`. 
+ // + // In both cases we try to build the module again from the data in + // `rules.wasm_mod`. + if rules.compiled_wasm_mod.is_none() { + rules.compiled_wasm_mod = Some(wasmtime::Module::from_binary( &crate::wasm::ENGINE, rules.wasm_mod.as_slice(), - ) - .expect("WASM module is not valid"), - ); + )?); + } #[cfg(feature = "logging")] info!("Deserialization time: {:?}", Instant::elapsed(&start)); @@ -437,6 +449,54 @@ impl Rules { } } +#[cfg(feature = "native-code-serialization")] +fn serialize_wasm_mod( + wasm_mod: &Option, + serializer: S, +) -> Result +where + S: Serializer, +{ + if let Some(wasm_mod) = wasm_mod { + let bytes = wasm_mod + .serialize() + .map_err(|err| serde::ser::Error::custom(err.to_string()))?; + + serializer.serialize_some(bytes.as_slice()) + } else { + serializer.serialize_none() + } +} + +#[cfg(not(feature = "native-code-serialization"))] +fn serialize_wasm_mod( + _wasm_mod: &Option, + serializer: S, +) -> Result +where + S: Serializer, +{ + serializer.serialize_none() +} + +pub fn deserialize_wasm_mod<'de, D>( + deserializer: D, +) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let bytes: Option<&[u8]> = Deserialize::deserialize(deserializer)?; + let module = if let Some(bytes) = bytes { + unsafe { + wasmtime::Module::deserialize(&crate::wasm::ENGINE, bytes).ok() + } + } else { + None + }; + + Ok(module) +} + /// Iterator that yields the of the compiled rules. pub struct RulesIter<'a> { rules: &'a Rules,