Skip to content

Commit

Permalink
feat(scanner): allow to specify a maximum number of matches per pattern.
Browse files Browse the repository at this point in the history
  • Loading branch information
plusvic committed Aug 16, 2023
1 parent 1444450 commit 40ef687
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 10 deletions.
35 changes: 29 additions & 6 deletions yara-x/src/scanner/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use std::time::Instant;
use base64::Engine;
use bitvec::order::Lsb0;
use bitvec::slice::BitSlice;
use bitvec::vec::BitVec;
use bstr::ByteSlice;
use protobuf::{MessageDyn, MessageFull};
use regex::bytes::Regex;
Expand Down Expand Up @@ -45,8 +46,8 @@ pub(crate) struct ScanContext<'r> {
/// namespace are evaluated, the global rules that matched are moved
/// to this vector.
pub non_private_matching_rules: Vec<RuleId>,
// Vector containing the IDs of the private rules that matched, including
// both blogan and non-global ones.
/// Vector containing the IDs of the private rules that matched, including
/// both global and non-global ones.
pub private_matching_rules: Vec<RuleId>,
/// Map containing the IDs of the global rules that matched.
pub global_matching_rules: FxHashMap<NamespaceId, Vec<RuleId>>,
Expand Down Expand Up @@ -86,6 +87,12 @@ pub(crate) struct ScanContext<'r> {
/// here until they can be confirmed or discarded.
pub unconfirmed_matches:
FxHashMap<SubPatternId, VecDeque<UnconfirmedMatch>>,
/// Bit vector that contains one bit per pattern. The N-th bit is set if
/// pattern with PatternId = N has reached the maximum number of matches
/// indicated by `max_matches_per_pattern`.
pub limit_reached: BitVec,
/// Maximum number of matches per pattern.
pub max_matches_per_pattern: usize,
/// When [`HEARTBEAT_COUNTER`] is larger than this value, the scan is
/// aborted due to a timeout.
pub deadline: u64,
Expand Down Expand Up @@ -204,10 +211,13 @@ impl ScanContext<'_> {

bits.set(pattern_id.into(), true);

self.pattern_matches
.entry(pattern_id)
.or_default()
.add(match_, replace)
let matches_list = self.pattern_matches.entry(pattern_id).or_default();

if matches_list.len() < self.max_matches_per_pattern {
matches_list.add(match_, replace)
} else {
self.limit_reached.set(pattern_id.into(), true);
}
}

/// Search for patterns in the data.
Expand Down Expand Up @@ -271,6 +281,19 @@ impl ScanContext<'_> {
let (pattern_id, sub_pattern) =
&self.compiled_rules.get_sub_pattern(sub_pattern_id);

// Check if the potentially matching pattern has reached the
// maximum number of allowed matches. In that case continue without
// verifying the match. `get_unchecked` is used for performance
// reasons, the number of bits in the bit vector is guaranteed to
// be the number of patterns.
if unsafe {
*self
.limit_reached
.get_unchecked::<usize>((*pattern_id).into())
} {
continue;
}

// If the atom is exact no further verification is needed, except
// for making sure that the fullword requirements are met. An exact
// atom is enough to guarantee that the whole sub-pattern matched.
Expand Down
22 changes: 18 additions & 4 deletions yara-x/src/scanner/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,13 @@ pub struct Scanner<'r> {
}

impl<'r> Scanner<'r> {
const DEFAULT_MAX_MATCHES_PER_PATTERN: usize = 10_000;

/// Creates a new scanner.
pub fn new(rules: &'r Rules) -> Self {
let num_rules = rules.rules().len() as u32;
let num_patterns = rules.num_patterns() as u32;

// The ScanContext structure belongs to the WASM store, but at the same
// time it must have a reference to the store because it is required
// for accessing the WASM memory from code that only has a reference
Expand Down Expand Up @@ -121,6 +126,8 @@ impl<'r> Scanner<'r> {
pattern_matches: FxHashMap::default(),
unconfirmed_matches: FxHashMap::default(),
deadline: 0,
limit_reached: BitVec::repeat(false, num_patterns as usize),
max_matches_per_pattern: Self::DEFAULT_MAX_MATCHES_PER_PATTERN,
},
));

Expand Down Expand Up @@ -148,9 +155,6 @@ impl<'r> Scanner<'r> {
)
.unwrap();

let num_rules = rules.rules().len() as u32;
let num_patterns = rules.num_patterns() as u32;

// Compute the base offset for the bitmap that contains matching
// information for patterns. This bitmap has 1 bit per pattern,
// the N-th bit is set if pattern with PatternId = N matched. The
Expand Down Expand Up @@ -226,11 +230,20 @@ impl<'r> Scanner<'r> {
/// in some cases, particularly with rules containing only a few patterns,
/// the scanner could potentially continue running for a longer period than
/// the specified timeout.
pub fn set_timeout(&mut self, timeout: Duration) -> &mut Self {
pub fn timeout(&mut self, timeout: Duration) -> &mut Self {
self.timeout = Some(timeout);
self
}

/// Sets the maximum number of matches per pattern.
///
/// Once a pattern has reached this number of matches it won't produce
/// any more matches. The scanner itself is returned, so that calls can
/// be chained.
pub fn max_matches_per_pattern(&mut self, n: usize) -> &mut Self {
    // The limit is stored in the scan context owned by the WASM store.
    let ctx = self.wasm_store.data_mut();
    ctx.max_matches_per_pattern = n;
    self
}

/// Scans a file.
pub fn scan_file<'a, P>(
&'a mut self,
Expand Down Expand Up @@ -777,6 +790,7 @@ impl<'a> Iterator for Matches<'a> {
}

/// Represents a match.
#[derive(PartialEq, Debug)]
pub struct Match<'a> {
/// Range within the original data where the match occurred.
pub range: Range<usize>,
Expand Down
49 changes: 49 additions & 0 deletions yara-x/src/scanner/tests.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
use pretty_assertions::assert_eq;

use crate::scanner;
use crate::scanner::matches::Match;
use crate::scanner::Scanner;
use crate::variables::VariableError;

Expand Down Expand Up @@ -360,3 +364,48 @@ fn private_rules() {
// Only the non-matching, non-private rules should be reported.
assert_eq!(scan_results.non_matching_rules().len(), 0);
}

/// Checks that a pattern stops producing matches once the limit configured
/// with `Scanner::max_matches_per_pattern` has been reached.
#[test]
fn max_matches_per_pattern() {
    // A single rule with a single pattern that occurs three times in the
    // scanned data.
    let source = r#"
rule test_3 {
strings:
$a = "foo"
condition:
$a
}
"#;

    let mut compiler = crate::Compiler::new();
    compiler.add_source(source).unwrap();
    let rules = compiler.build();

    // Allow at most one match per pattern.
    let mut scanner = Scanner::new(&rules);
    scanner.max_matches_per_pattern(1);

    let scan_results =
        scanner.scan(b"foofoofoo").expect("scan should not fail");

    // The rule still matches, the limit only affects the number of matches
    // recorded for each pattern.
    assert_eq!(scan_results.matching_rules().len(), 1);

    let rule = scan_results.matching_rules().next().unwrap();
    let pattern = rule.patterns().next().unwrap();
    let mut matches = pattern.matches();

    // Only the first occurrence of $a is reported because the limit has
    // been set to 1.
    let first = scanner::Match { range: (0..3), data: b"foo", xor_key: None };
    assert_eq!(matches.next(), Some(first));

    // The remaining two occurrences are not reported.
    assert_eq!(matches.next(), None);
}

0 comments on commit 40ef687

Please sign in to comment.