diff --git a/yara-x/src/compiler/ir/mod.rs b/yara-x/src/compiler/ir/mod.rs index 2e609e16b..502bb9920 100644 --- a/yara-x/src/compiler/ir/mod.rs +++ b/yara-x/src/compiler/ir/mod.rs @@ -108,8 +108,8 @@ impl<'src> Pattern<'src> { /// in order to indicate that the pattern (the `$a` pattern in this case) /// can match only at a fixed offset. pub fn anchor_at(&mut self, offset: usize) { - if let Pattern::Literal(p) = self { - match p.anchored_at { + match self { + Pattern::Literal(p) => match p.anchored_at { Some(o) if o != offset => { p.anchored_at = None; p.flags.set(PatternFlags::NonAnchorable); @@ -120,7 +120,19 @@ impl<'src> Pattern<'src> { } } _ => {} - } + }, + Pattern::Regexp(p) => match p.anchored_at { + Some(o) if o != offset => { + p.anchored_at = None; + p.flags.set(PatternFlags::NonAnchorable); + } + None => { + if !p.flags.contains(PatternFlags::NonAnchorable) { + p.anchored_at = Some(offset); + } + } + _ => {} + }, } } diff --git a/yara-x/src/compiler/mod.rs b/yara-x/src/compiler/mod.rs index 4e9ebe0ee..20c681187 100644 --- a/yara-x/src/compiler/mod.rs +++ b/yara-x/src/compiler/mod.rs @@ -738,6 +738,7 @@ impl<'a> Compiler<'a> { SubPattern::Literal { pattern: pattern_lit_id, flags: flags | SubPatternFlags::Nocase, + anchored_at: None, }, best_atom.case_combinations(), SubPatternAtom::from_atom, @@ -827,14 +828,10 @@ impl<'a> Compiler<'a> { } } else { self.add_sub_pattern( - if let Some(offset) = pattern.anchored_at { - SubPattern::LiteralAnchored { - pattern: pattern_lit_id, - anchored_at: offset, - flags, - } - } else { - SubPattern::Literal { pattern: pattern_lit_id, flags } + SubPattern::Literal { + pattern: pattern_lit_id, + anchored_at: pattern.anchored_at, + flags, }, iter::once(best_atom), SubPatternAtom::from_atom, @@ -868,7 +865,11 @@ impl<'a> Compiler<'a> { // /foo|bar|baz/ // { 01 02 03 } // { (01 02 03 | 04 05 06 ) } - self.process_alternation_literal(head, pattern.flags); + self.process_alternation_literal( + head, + pattern.anchored_at, + 
pattern.flags, + ); return Ok(()); } @@ -918,6 +919,7 @@ impl<'a> Compiler<'a> { fn process_alternation_literal( &mut self, hir: re::hir::Hir, + anchored_at: Option<usize>, flags: PatternFlagSet, ) { let ascii = flags.contains(PatternFlags::Ascii); @@ -944,24 +946,24 @@ impl<'a> Compiler<'a> { self.lit_pool.get_bytes(pattern_lit_id).unwrap(), ); - let sp = SubPattern::Literal { + let flags = + if wide { flags | SubPatternFlags::Wide } else { flags }; + + let sub_pattern = SubPattern::Literal { pattern: pattern_lit_id, - flags: if wide { - flags | SubPatternFlags::Wide - } else { - flags - }, + anchored_at, + flags, }; if case_insensitive { self.add_sub_pattern( - sp, + sub_pattern, best_atom.case_combinations(), SubPatternAtom::from_atom, ); } else { self.add_sub_pattern( - sp, + sub_pattern, iter::once(best_atom), SubPatternAtom::from_atom, ); @@ -1386,7 +1388,7 @@ impl<'a> Compiler<'a> { // the Aho-Corasick automata. Instead their IDs are added to the // sub_patterns_anchored_at_0 list, together with the offset they are // anchored to. - if let SubPattern::LiteralAnchored { .. } = sub_pattern { + if let SubPattern::Literal { anchored_at: Some(_), .. } = sub_pattern { self.anchored_sub_patterns.push(sub_pattern_id); } else { for atom in atoms { @@ -1657,12 +1659,7 @@ bitmask! 
{ pub(crate) enum SubPattern { Literal { pattern: LiteralId, - flags: SubPatternFlagSet, - }, - - LiteralAnchored { - pattern: LiteralId, - anchored_at: usize, + anchored_at: Option<usize>, flags: SubPatternFlagSet, }, diff --git a/yara-x/src/compiler/rules.rs b/yara-x/src/compiler/rules.rs index 51a79e965..860338457 100644 --- a/yara-x/src/compiler/rules.rs +++ b/yara-x/src/compiler/rules.rs @@ -283,7 +283,7 @@ impl Rules { info!("Number of rules: {}", self.rules.len()); info!("Number of patterns: {}", self.num_patterns); info!( - "Number of patterns anchored sub-patterns: {}", + "Number of anchored sub-patterns: {}", self.anchored_sub_patterns.len() ); info!("Number of atoms: {}", self.atoms.len()); diff --git a/yara-x/src/scanner/context.rs b/yara-x/src/scanner/context.rs index a6b12709c..a3ed113a3 100644 --- a/yara-x/src/scanner/context.rs +++ b/yara-x/src/scanner/context.rs @@ -388,12 +388,6 @@ impl ScanContext<'_> { let verification_start = Instant::now(); match sub_pattern { - // Anchored patterns should not be found by the Aho-Corasick - // automata because they are not added to the automata in - // the first place. - SubPattern::LiteralAnchored { .. } => unreachable!( - "anchored pattern found by the Aho-Corasick automata" - ), SubPattern::Literal { pattern, flags, .. } | SubPattern::LiteralChainHead { pattern, flags, .. } | SubPattern::LiteralChainTail { pattern, flags, .. } => { @@ -558,10 +552,10 @@ impl ScanContext<'_> { .map(|id| (id, self.compiled_rules.get_sub_pattern(*id))) { match sub_pattern { - SubPattern::LiteralAnchored { + SubPattern::Literal { pattern, flags, - anchored_at, + anchored_at: Some(offset), .. } => { if let Some(match_) = verify_literal_match( @@ -570,7 +564,7 @@ impl ScanContext<'_> { .get_bytes(*pattern) .unwrap(), self.scanned_data(), - *anchored_at, + *offset, *flags, ) { self.handle_sub_pattern_match( @@ -595,7 +589,6 @@ impl ScanContext<'_> { ) { match sub_pattern { SubPattern::Literal { .. } - | SubPattern::LiteralAnchored { .. 
} | SubPattern::Xor { .. } | SubPattern::Base64 { .. } | SubPattern::Base64Wide { .. }