diff --git a/lib/src/scanner/context.rs b/lib/src/scanner/context.rs index db7ecfa76..357441cd7 100644 --- a/lib/src/scanner/context.rs +++ b/lib/src/scanner/context.rs @@ -29,7 +29,7 @@ use crate::compiler::{ use crate::re::fast::fastvm::FastVM; use crate::re::thompson::pikevm::PikeVM; use crate::re::Action; -use crate::scanner::matches::{Match, MatchList, UnconfirmedMatch}; +use crate::scanner::matches::{Match, PatternMatches, UnconfirmedMatch}; use crate::scanner::HEARTBEAT_COUNTER; use crate::types::{Array, Map, Struct}; use crate::wasm::MATCHING_RULES_BITMAP_BASE; @@ -83,7 +83,7 @@ pub(crate) struct ScanContext<'r> { /// Hash map that tracks the matches occurred during a scan. The keys /// are the PatternId of the matching pattern, and values are a list /// of matches. - pub pattern_matches: FxHashMap<PatternId, MatchList>, + pub pattern_matches: PatternMatches, /// Hash map that tracks the unconfirmed matches for chained patterns. When /// a pattern is split into multiple chained pieces, each piece is handled /// as an individual pattern, but the match of one of the pieces doesn't @@ -95,8 +95,6 @@ pub(crate) struct ScanContext<'r> { /// Set that contains the PatternId for those patterns that have reached /// the maximum number of matches indicated by `max_matches_per_pattern`. pub limit_reached: FxHashSet<PatternId>, - /// Maximum number of matches per pattern. - pub max_matches_per_pattern: usize, /// When [`HEARTBEAT_COUNTER`] is larger than this value, the scan is /// aborted due to a timeout. 
pub deadline: u64, @@ -339,11 +337,7 @@ impl ScanContext<'_> { bits.set(pattern_id.into(), true); - let matches_list = self.pattern_matches.entry(pattern_id).or_default(); - - if matches_list.len() < self.max_matches_per_pattern { - matches_list.add(match_, replace); - } else { + if !self.pattern_matches.add(pattern_id, match_, replace) { self.limit_reached.insert(pattern_id); } } diff --git a/lib/src/scanner/matches.rs b/lib/src/scanner/matches.rs index 3cc455a68..296a97ecd 100644 --- a/lib/src/scanner/matches.rs +++ b/lib/src/scanner/matches.rs @@ -1,4 +1,7 @@ +use crate::compiler::PatternId; use core::slice::Iter; +use rustc_hash::FxHashMap; +use std::collections::hash_map::Entry; use std::ops::{Range, RangeInclusive}; /// Represents the match of a pattern. @@ -22,8 +25,11 @@ pub struct MatchList { } impl MatchList { - pub fn new() -> Self { - Self { matches: Vec::new() } + /// Creates a new [`MatchList`] that can hold at least `capacity` items + /// without relocating. The capacity will increase if [`MatchList::add`] + /// is called and there's no capacity to store the new item. + pub fn with_capacity(capacity: usize) -> Self { + Self { matches: Vec::with_capacity(capacity) } } /// Adds a new match to the list while keeping the matches sorted by @@ -118,6 +124,11 @@ impl MatchList { } } + #[inline] + pub fn capacity(&self) -> usize { + self.matches.capacity() + } + #[inline] pub fn first(&self) -> Option<&Match> { self.matches.first() @@ -187,6 +198,92 @@ pub struct UnconfirmedMatch { pub chain_length: usize, } +/// A hash map that tracks matches for each pattern. +/// +/// Keys in this map are a [`PatternId`], and values are a [`MatchList`] that +/// contains the matches for that pattern. 
+pub struct PatternMatches { + matches: FxHashMap<PatternId, MatchList>, + max_matches_per_pattern: usize, + capacity: usize, +} + +impl PatternMatches { + const DEFAULT_MAX_MATCHES_PER_PATTERN: usize = 1_000_000; + + pub fn new() -> Self { + Self { + matches: FxHashMap::default(), + max_matches_per_pattern: Self::DEFAULT_MAX_MATCHES_PER_PATTERN, + capacity: 0, + } + } + + pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self { + self.max_matches_per_pattern = n; + self + } + + pub fn get(&self, pattern_id: PatternId) -> Option<&MatchList> { + self.matches.get(&pattern_id) + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.matches.is_empty() + } + + pub fn clear(&mut self) { + if self.capacity > self.max_matches_per_pattern.saturating_mul(10) { + self.matches.clear(); + self.capacity = 0; + } else { + for (_, matches) in self.matches.iter_mut() { + matches.clear(); + } + } + } + + /// Adds a new match to the pattern identified by the given [`PatternId`]. + /// + /// If a match at the same offset already exists, the `replace_if_longer` + /// argument indicates what to do. If this argument is `true` and the new + /// match is longer than the existing one, the existing match will be + /// replaced. If the argument is `false` the new match will be ignored and + /// the existing one will remain. + /// + /// This function returns `true` if the new match was added, or `false` + /// if the pattern already reached the maximum number of matches and + /// therefore the new match was not added. 
+ pub fn add( + &mut self, + pattern_id: PatternId, + m: Match, + replace_if_longer: bool, + ) -> bool { + match self.matches.entry(pattern_id) { + Entry::Occupied(mut entry) => { + let matches = entry.get_mut(); + if matches.len() < self.max_matches_per_pattern { + self.capacity -= matches.capacity(); + matches.add(m, replace_if_longer); + self.capacity += matches.capacity(); + true + } else { + false + } + } + Entry::Vacant(entry) => { + let mut matches = MatchList::with_capacity(8); + self.capacity += matches.capacity(); + matches.add(m, replace_if_longer); + entry.insert(matches); + true + } + } + } +} + #[cfg(test)] mod test { use crate::scanner::matches::{Match, MatchList}; @@ -194,7 +291,7 @@ mod test { #[test] fn match_list() { - let mut ml = MatchList::new(); + let mut ml = MatchList::with_capacity(5); ml.add(Match { range: (2..10), xor_key: None }, false); ml.add(Match { range: (1..10), xor_key: None }, false); diff --git a/lib/src/scanner/mod.rs b/lib/src/scanner/mod.rs index 675bb64f2..33477596d 100644 --- a/lib/src/scanner/mod.rs +++ b/lib/src/scanner/mod.rs @@ -36,6 +36,7 @@ use crate::wasm::{ENGINE, MATCHING_RULES_BITMAP_BASE}; use crate::{modules, wasm, Variable}; pub(crate) use crate::scanner::context::*; +use crate::scanner::matches::PatternMatches; mod context; mod matches; @@ -101,7 +102,6 @@ pub struct Scanner<'r> { } impl<'r> Scanner<'r> { - const DEFAULT_MAX_MATCHES_PER_PATTERN: usize = 1_000_000; const DEFAULT_SCAN_TIMEOUT: u64 = 315_360_000; /// Creates a new scanner. 
@@ -135,11 +135,10 @@ impl<'r> Scanner<'r> { main_memory: None, module_outputs: FxHashMap::default(), user_provided_module_outputs: FxHashMap::default(), - pattern_matches: FxHashMap::default(), + pattern_matches: PatternMatches::new(), unconfirmed_matches: FxHashMap::default(), deadline: 0, limit_reached: FxHashSet::default(), - max_matches_per_pattern: Self::DEFAULT_MAX_MATCHES_PER_PATTERN, regexp_cache: RefCell::new(FxHashMap::default()), #[cfg(feature = "rules-profiling")] time_spent_in_pattern: FxHashMap::default(), @@ -272,7 +271,7 @@ impl<'r> Scanner<'r> { /// When some pattern reaches the maximum number of patterns it won't /// produce more matches. pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self { - self.wasm_store.data_mut().max_matches_per_pattern = n; + self.wasm_store.data_mut().pattern_matches.max_matches_per_pattern(n); self } @@ -691,19 +690,7 @@ impl<'r> Scanner<'r> { ctx.limit_reached.clear(); // Clear the unconfirmed matches. - // - // We could use `unconfirmed_matches.clear()` for clearing the whole - // hash map, but that would cause that all the vectors are deallocated. - // Instead, each vector is cleared individually, which removes the items - // while maintaining the vector capacity. This way the vector may be - // reused in later scans without memory allocations. However, need keep - // the size of those vector under control by calling `shrink_to`, if - // this map have too many large vectors the overall memory consumption - // would be too high. - for (_, matches) in ctx.unconfirmed_matches.iter_mut() { - matches.clear(); - matches.shrink_to(32); - } + ctx.unconfirmed_matches.clear(); // If some pattern or rule matched, clear the matches. 
Notice that a // rule may match without any pattern being matched, because there @@ -713,11 +700,7 @@ impl<'r> Scanner<'r> { || !ctx.non_private_matching_rules.is_empty() || !ctx.private_matching_rules.is_empty() { - for (_, matches) in ctx.pattern_matches.iter_mut() { - matches.clear(); - matches.shrink_to(32); - } - + ctx.pattern_matches.clear(); ctx.non_private_matching_rules.clear(); ctx.private_matching_rules.clear(); @@ -1007,8 +990,8 @@ impl<'a, 'r> Pattern<'a, 'r> { iterator: self .ctx .pattern_matches - .get(&self.pattern_id) - .map(|match_list| match_list.iter()), + .get(self.pattern_id) + .map(|matches| matches.iter()), } } } diff --git a/lib/src/wasm/mod.rs b/lib/src/wasm/mod.rs index d251f42a0..141440796 100644 --- a/lib/src/wasm/mod.rs +++ b/lib/src/wasm/mod.rs @@ -791,7 +791,7 @@ pub(crate) fn is_pat_match_at( if offset < 0 { return false; } - if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) { + if let Some(matches) = caller.data().pattern_matches.get(pattern_id) { matches.search(offset.try_into().unwrap()).is_ok() } else { false @@ -810,7 +810,7 @@ pub(crate) fn is_pat_match_in( lower_bound: i64, upper_bound: i64, ) -> bool { - if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) { + if let Some(matches) = caller.data().pattern_matches.get(pattern_id) { matches .matches_in_range(lower_bound as isize..=upper_bound as isize) .is_positive() @@ -825,7 +825,7 @@ pub(crate) fn pat_matches( caller: &mut Caller<'_, ScanContext>, pattern_id: PatternId, ) -> i64 { - if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) { + if let Some(matches) = caller.data().pattern_matches.get(pattern_id) { matches.len().try_into().unwrap() } else { 0 @@ -844,7 +844,7 @@ pub(crate) fn pat_matches_in( lower_bound: i64, upper_bound: i64, ) -> i64 { - if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) { + if let Some(matches) = caller.data().pattern_matches.get(pattern_id) { 
matches.matches_in_range(lower_bound as isize..=upper_bound as isize) } else { 0 @@ -862,7 +862,7 @@ pub(crate) fn pat_length( pattern_id: PatternId, index: i64, ) -> Option { - if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) { + if let Some(matches) = caller.data().pattern_matches.get(pattern_id) { let index: usize = index.try_into().ok()?; // Index is 1-based, convert it to 0-based before calling `matches.get` let m = matches.get(index.checked_sub(1)?)?; @@ -883,7 +883,7 @@ pub(crate) fn pat_offset( pattern_id: PatternId, index: i64, ) -> Option { - if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) { + if let Some(matches) = caller.data().pattern_matches.get(pattern_id) { let index: usize = index.try_into().ok()?; // Index is 1-based, convert it to 0-based before calling `matches.get` let m = matches.get(index.checked_sub(1)?)?;