Skip to content

Commit

Permalink
perf: put compiled regular expressions in a cache and reuse them
Browse files Browse the repository at this point in the history
  • Loading branch information
plusvic committed Aug 17, 2023
1 parent c4f3a41 commit c2e9703
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 7 deletions.
2 changes: 1 addition & 1 deletion yara-x/src/compiler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1497,7 +1497,7 @@ impl From<RuleId> for usize {
}

/// ID associated to each regexp used in a rule condition.
#[derive(Copy, Clone, Debug)]
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) struct RegexpId(i32);

impl From<i32> for RegexpId {
Expand Down
24 changes: 19 additions & 5 deletions yara-x/src/scanner/context.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::cell::RefCell;
use std::collections::VecDeque;
use std::ops::{Range, RangeInclusive};
use std::ptr::NonNull;
Expand Down Expand Up @@ -96,6 +97,12 @@ pub(crate) struct ScanContext<'r> {
/// When [`HEARTBEAT_COUNTER`] is larger than this value, the scan is
/// aborted due to a timeout.
pub deadline: u64,
/// Hash map that serves as a cache for regexps used in expressions like
/// `some_var matches /foobar/`. Compiling a regexp is an expensive
/// operation. Instead of compiling the regexp each time the expression
/// is evaluated, it is compiled the first time and stored in this hash
/// map.
pub regexp_cache: RefCell<FxHashMap<RegexpId, Regex>>,
}

impl ScanContext<'_> {
Expand All @@ -109,11 +116,18 @@ impl ScanContext<'_> {
}
}

/// Returns a regular expression given its [`RegexpId`].
pub(crate) fn get_regexp(&self, regexp_id: RegexpId) -> Regex {
// TODO: put the regular expressions in a cache and call
// `compiled_rules.get_regexp` only if not found in the cache.
self.compiled_rules.get_regexp(regexp_id)
/// Returns `true` if the regexp identified by the given [`RegexpId`]
/// matches `haystack`.
///
/// The regexp is looked up in `regexp_cache` first. On a cache miss it is
/// obtained through `compiled_rules.get_regexp` and stored in the cache, so
/// later evaluations with the same [`RegexpId`] reuse the stored value.
pub(crate) fn regexp_matches(
    &self,
    regexp_id: RegexpId,
    haystack: &[u8],
) -> bool {
    // The mutable borrow of the cache lasts until the end of this function.
    let mut cache = self.regexp_cache.borrow_mut();

    // Fetch the cached regexp, inserting it first if this is the first time
    // this `regexp_id` is seen.
    let regexp = cache
        .entry(regexp_id)
        .or_insert_with(|| self.compiled_rules.get_regexp(regexp_id));

    regexp.is_match(haystack)
}

/// Returns the protobuf struct produced by a module.
Expand Down
2 changes: 2 additions & 0 deletions yara-x/src/scanner/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
The scanner takes the rules produced by the compiler and scans data with them.
*/

use std::cell::RefCell;
use std::io::Read;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
Expand Down Expand Up @@ -128,6 +129,7 @@ impl<'r> Scanner<'r> {
deadline: 0,
limit_reached: BitVec::repeat(false, num_patterns as usize),
max_matches_per_pattern: Self::DEFAULT_MAX_MATCHES_PER_PATTERN,
regexp_cache: RefCell::new(FxHashMap::default()),
},
));

Expand Down
2 changes: 1 addition & 1 deletion yara-x/src/wasm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1367,7 +1367,7 @@ pub(crate) fn str_matches(
rhs: RegexpId,
) -> bool {
let ctx = caller.data();
ctx.get_regexp(rhs).is_match(lhs.as_bstr(ctx))
ctx.regexp_matches(rhs, lhs.as_bstr(ctx))
}

macro_rules! gen_xint_fn {
Expand Down

0 comments on commit c2e9703

Please sign in to comment.