Skip to content

Commit

Permalink
refactor: implement a new type PatternMatches that tracks matches f…
Browse files Browse the repository at this point in the history
…or each pattern.
  • Loading branch information
plusvic committed May 2, 2024
1 parent af34adc commit b1a9702
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 42 deletions.
12 changes: 3 additions & 9 deletions lib/src/scanner/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ use crate::compiler::{
use crate::re::fast::fastvm::FastVM;
use crate::re::thompson::pikevm::PikeVM;
use crate::re::Action;
use crate::scanner::matches::{Match, MatchList, UnconfirmedMatch};
use crate::scanner::matches::{Match, PatternMatches, UnconfirmedMatch};
use crate::scanner::HEARTBEAT_COUNTER;
use crate::types::{Array, Map, Struct};
use crate::wasm::MATCHING_RULES_BITMAP_BASE;
Expand Down Expand Up @@ -83,7 +83,7 @@ pub(crate) struct ScanContext<'r> {
/// Hash map that tracks the matches that occurred during a scan. The keys
/// are the PatternId of the matching pattern, and values are a list
/// of matches.
pub pattern_matches: FxHashMap<PatternId, MatchList>,
pub pattern_matches: PatternMatches,
/// Hash map that tracks the unconfirmed matches for chained patterns. When
/// a pattern is split into multiple chained pieces, each piece is handled
/// as an individual pattern, but the match of one of the pieces doesn't
Expand All @@ -95,8 +95,6 @@ pub(crate) struct ScanContext<'r> {
/// Set that contains the PatternId for those patterns that have reached
/// the maximum number of matches indicated by `max_matches_per_pattern`.
pub limit_reached: FxHashSet<PatternId>,
/// Maximum number of matches per pattern.
pub max_matches_per_pattern: usize,
/// When [`HEARTBEAT_COUNTER`] is larger than this value, the scan is
/// aborted due to a timeout.
pub deadline: u64,
Expand Down Expand Up @@ -339,11 +337,7 @@ impl ScanContext<'_> {

bits.set(pattern_id.into(), true);

let matches_list = self.pattern_matches.entry(pattern_id).or_default();

if matches_list.len() < self.max_matches_per_pattern {
matches_list.add(match_, replace);
} else {
if !self.pattern_matches.add(pattern_id, match_, replace) {
self.limit_reached.insert(pattern_id);
}
}
Expand Down
103 changes: 100 additions & 3 deletions lib/src/scanner/matches.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use crate::compiler::PatternId;
use core::slice::Iter;
use rustc_hash::FxHashMap;
use std::collections::hash_map::Entry;
use std::ops::{Range, RangeInclusive};

/// Represents the match of a pattern.
Expand All @@ -22,8 +25,11 @@ pub struct MatchList {
}

impl MatchList {
pub fn new() -> Self {
Self { matches: Vec::new() }
/// Creates a new, empty [`MatchList`] that can hold at least `capacity`
/// items without reallocating. The capacity will increase if
/// [`MatchList::add`] is called and there's no capacity left to store the
/// new item.
pub fn with_capacity(capacity: usize) -> Self {
Self { matches: Vec::with_capacity(capacity) }
}

/// Adds a new match to the list while keeping the matches sorted by
Expand Down Expand Up @@ -118,6 +124,11 @@ impl MatchList {
}
}

/// Returns the number of matches this list can hold without reallocating,
/// as reported by the underlying vector.
#[inline]
pub fn capacity(&self) -> usize {
self.matches.capacity()
}

#[inline]
pub fn first(&self) -> Option<&Match> {
self.matches.first()
Expand Down Expand Up @@ -187,14 +198,100 @@ pub struct UnconfirmedMatch {
pub chain_length: usize,
}

/// A hash map that tracks matches for each pattern.
///
/// Keys in this map are a [`PatternId`], and values are a [`MatchList`] that
/// contains the matches for that pattern.
pub struct PatternMatches {
/// Matches found so far, keyed by the pattern that produced them.
matches: FxHashMap<PatternId, MatchList>,
/// Upper bound on the number of matches kept for a single pattern. Once a
/// pattern's list reaches this length, `add` rejects further matches.
max_matches_per_pattern: usize,
/// Running sum of the capacities of every [`MatchList`] stored in
/// `matches`. It is updated by `add` and consulted by `clear` to decide
/// whether the lists are worth keeping around for reuse.
capacity: usize,
}

impl PatternMatches {
    /// Limit on the number of matches per pattern used by
    /// [`PatternMatches::new`].
    const DEFAULT_MAX_MATCHES_PER_PATTERN: usize = 1_000_000;

    /// Capacity of the [`MatchList`] allocated when a pattern gets its first
    /// match.
    const INITIAL_MATCH_LIST_CAPACITY: usize = 8;

    /// [`PatternMatches::clear`] releases every [`MatchList`] once the total
    /// tracked capacity exceeds `max_matches_per_pattern` times this factor.
    const CAPACITY_RESET_FACTOR: usize = 10;

    /// Creates an empty [`PatternMatches`] that uses the default limit of
    /// matches per pattern.
    pub fn new() -> Self {
        Self {
            matches: FxHashMap::default(),
            max_matches_per_pattern: Self::DEFAULT_MAX_MATCHES_PER_PATTERN,
            capacity: 0,
        }
    }

    /// Sets the maximum number of matches stored for each pattern.
    ///
    /// Returns `self` so that the call can be chained. Lists that already
    /// hold more than `n` matches are not truncated; they simply stop
    /// accepting new matches.
    pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self {
        self.max_matches_per_pattern = n;
        self
    }

    /// Returns the [`MatchList`] associated to the given [`PatternId`], or
    /// `None` if the pattern has no entry in the map.
    ///
    /// The returned list can be empty, because [`PatternMatches::clear`]
    /// may empty the lists while keeping their entries in the map.
    pub fn get(&self, pattern_id: PatternId) -> Option<&MatchList> {
        self.matches.get(&pattern_id)
    }

    /// Returns `true` if the map has no entries at all.
    ///
    /// NOTE: this checks the map itself, not the lists it contains. After a
    /// call to [`PatternMatches::clear`] that kept the (now empty) lists for
    /// reuse, this function still returns `false`.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.matches.is_empty()
    }

    /// Removes all the matches.
    ///
    /// When the total capacity tracked for the lists is within the allowed
    /// threshold, each list is cleared individually and stays in the map, so
    /// that it can be reused by later scans (this assumes that
    /// `MatchList::clear` keeps the list's allocation, which is why the
    /// tracked capacity is left untouched in that case). When the threshold
    /// is exceeded the whole map is cleared, dropping every list, in order
    /// to keep memory usage under control.
    pub fn clear(&mut self) {
        // A saturating multiplication is used because the limit is chosen
        // by the user and can be arbitrarily large (e.g. `usize::MAX`); a
        // plain `*` would panic on overflow in debug builds and wrap around
        // in release builds.
        let threshold = self
            .max_matches_per_pattern
            .saturating_mul(Self::CAPACITY_RESET_FACTOR);

        if self.capacity > threshold {
            self.matches.clear();
            self.capacity = 0;
        } else {
            for matches in self.matches.values_mut() {
                matches.clear();
            }
        }
    }

    /// Adds a new match to the pattern identified by the given [`PatternId`]
    ///
    /// If a match at the same offset already exists, the `replace_if_longer`
    /// argument indicates what to do. If this argument is `true` and the new
    /// match is longer than the existing one, the existing match will be
    /// replaced. If the argument is `false` the new match will be ignored and
    /// the existing one will remain.
    ///
    /// This function returns `true` if the new match was added, or `false`
    /// if the pattern already reached the maximum number of matches and
    /// therefore the new match was not added.
    pub fn add(
        &mut self,
        pattern_id: PatternId,
        m: Match,
        replace_if_longer: bool,
    ) -> bool {
        match self.matches.entry(pattern_id) {
            Entry::Occupied(mut entry) => {
                let matches = entry.get_mut();
                if matches.len() >= self.max_matches_per_pattern {
                    return false;
                }
                // Adding the match may grow the list, replace the list's
                // old capacity with the new one in the running total.
                self.capacity -= matches.capacity();
                matches.add(m, replace_if_longer);
                self.capacity += matches.capacity();
                true
            }
            Entry::Vacant(entry) => {
                // The limit also applies to the very first match. With a
                // limit of zero nothing must be stored.
                if self.max_matches_per_pattern == 0 {
                    return false;
                }
                let mut matches = MatchList::with_capacity(
                    Self::INITIAL_MATCH_LIST_CAPACITY,
                );
                matches.add(m, replace_if_longer);
                // Measure the capacity after the insertion, so that the
                // total reflects what the list actually holds.
                self.capacity += matches.capacity();
                entry.insert(matches);
                true
            }
        }
    }
}

impl Default for PatternMatches {
    /// Equivalent to [`PatternMatches::new`].
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod test {
use crate::scanner::matches::{Match, MatchList};
use std::ops::Range;

#[test]
fn match_list() {
let mut ml = MatchList::new();
let mut ml = MatchList::with_capacity(5);

ml.add(Match { range: (2..10), xor_key: None }, false);
ml.add(Match { range: (1..10), xor_key: None }, false);
Expand Down
31 changes: 7 additions & 24 deletions lib/src/scanner/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ use crate::wasm::{ENGINE, MATCHING_RULES_BITMAP_BASE};
use crate::{modules, wasm, Variable};

pub(crate) use crate::scanner::context::*;
use crate::scanner::matches::PatternMatches;

mod context;
mod matches;
Expand Down Expand Up @@ -101,7 +102,6 @@ pub struct Scanner<'r> {
}

impl<'r> Scanner<'r> {
const DEFAULT_MAX_MATCHES_PER_PATTERN: usize = 1_000_000;
const DEFAULT_SCAN_TIMEOUT: u64 = 315_360_000;

/// Creates a new scanner.
Expand Down Expand Up @@ -135,11 +135,10 @@ impl<'r> Scanner<'r> {
main_memory: None,
module_outputs: FxHashMap::default(),
user_provided_module_outputs: FxHashMap::default(),
pattern_matches: FxHashMap::default(),
pattern_matches: PatternMatches::new(),
unconfirmed_matches: FxHashMap::default(),
deadline: 0,
limit_reached: FxHashSet::default(),
max_matches_per_pattern: Self::DEFAULT_MAX_MATCHES_PER_PATTERN,
regexp_cache: RefCell::new(FxHashMap::default()),
#[cfg(feature = "rules-profiling")]
time_spent_in_pattern: FxHashMap::default(),
Expand Down Expand Up @@ -272,7 +271,7 @@ impl<'r> Scanner<'r> {
/// When some pattern reaches the maximum number of matches it won't
/// produce more matches.
pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self {
self.wasm_store.data_mut().max_matches_per_pattern = n;
self.wasm_store.data_mut().pattern_matches.max_matches_per_pattern(n);
self
}

Expand Down Expand Up @@ -691,19 +690,7 @@ impl<'r> Scanner<'r> {
ctx.limit_reached.clear();

// Clear the unconfirmed matches.
//
// We could use `unconfirmed_matches.clear()` for clearing the whole
// hash map, but that would cause that all the vectors are deallocated.
// Instead, each vector is cleared individually, which removes the items
// while maintaining the vector capacity. This way the vector may be
// reused in later scans without memory allocations. However, need keep
// the size of those vector under control by calling `shrink_to`, if
// this map have too many large vectors the overall memory consumption
// would be too high.
for (_, matches) in ctx.unconfirmed_matches.iter_mut() {
matches.clear();
matches.shrink_to(32);
}
ctx.unconfirmed_matches.clear();

// If some pattern or rule matched, clear the matches. Notice that a
// rule may match without any pattern being matched, because there
Expand All @@ -713,11 +700,7 @@ impl<'r> Scanner<'r> {
|| !ctx.non_private_matching_rules.is_empty()
|| !ctx.private_matching_rules.is_empty()
{
for (_, matches) in ctx.pattern_matches.iter_mut() {
matches.clear();
matches.shrink_to(32);
}

ctx.pattern_matches.clear();
ctx.non_private_matching_rules.clear();
ctx.private_matching_rules.clear();

Expand Down Expand Up @@ -1007,8 +990,8 @@ impl<'a, 'r> Pattern<'a, 'r> {
iterator: self
.ctx
.pattern_matches
.get(&self.pattern_id)
.map(|match_list| match_list.iter()),
.get(self.pattern_id)
.map(|matches| matches.iter()),
}
}
}
Expand Down
12 changes: 6 additions & 6 deletions lib/src/wasm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -791,7 +791,7 @@ pub(crate) fn is_pat_match_at(
if offset < 0 {
return false;
}
if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) {
if let Some(matches) = caller.data().pattern_matches.get(pattern_id) {
matches.search(offset.try_into().unwrap()).is_ok()
} else {
false
Expand All @@ -810,7 +810,7 @@ pub(crate) fn is_pat_match_in(
lower_bound: i64,
upper_bound: i64,
) -> bool {
if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) {
if let Some(matches) = caller.data().pattern_matches.get(pattern_id) {
matches
.matches_in_range(lower_bound as isize..=upper_bound as isize)
.is_positive()
Expand All @@ -825,7 +825,7 @@ pub(crate) fn pat_matches(
caller: &mut Caller<'_, ScanContext>,
pattern_id: PatternId,
) -> i64 {
if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) {
if let Some(matches) = caller.data().pattern_matches.get(pattern_id) {
matches.len().try_into().unwrap()
} else {
0
Expand All @@ -844,7 +844,7 @@ pub(crate) fn pat_matches_in(
lower_bound: i64,
upper_bound: i64,
) -> i64 {
if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) {
if let Some(matches) = caller.data().pattern_matches.get(pattern_id) {
matches.matches_in_range(lower_bound as isize..=upper_bound as isize)
} else {
0
Expand All @@ -862,7 +862,7 @@ pub(crate) fn pat_length(
pattern_id: PatternId,
index: i64,
) -> Option<i64> {
if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) {
if let Some(matches) = caller.data().pattern_matches.get(pattern_id) {
let index: usize = index.try_into().ok()?;
// Index is 1-based, convert it to 0-based before calling `matches.get`
let m = matches.get(index.checked_sub(1)?)?;
Expand All @@ -883,7 +883,7 @@ pub(crate) fn pat_offset(
pattern_id: PatternId,
index: i64,
) -> Option<i64> {
if let Some(matches) = caller.data().pattern_matches.get(&pattern_id) {
if let Some(matches) = caller.data().pattern_matches.get(pattern_id) {
let index: usize = index.try_into().ok()?;
// Index is 1-based, convert it to 0-based before calling `matches.get`
let m = matches.get(index.checked_sub(1)?)?;
Expand Down

0 comments on commit b1a9702

Please sign in to comment.