From 4c867898bf5e07e9765ee59faa3fd3d32a0a5e0b Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 17 Aug 2023 14:01:10 +0200 Subject: [PATCH] perf: some adjustments in the heuristics for computing atom quality --- yara-x/src/compiler/atoms/quality.rs | 84 +++++++++++++++++++++++++--- yara-x/src/re/tests.rs | 7 +++ 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/yara-x/src/compiler/atoms/quality.rs b/yara-x/src/compiler/atoms/quality.rs index 3fa310b3b..092dbaf3b 100644 --- a/yara-x/src/compiler/atoms/quality.rs +++ b/yara-x/src/compiler/atoms/quality.rs @@ -22,9 +22,13 @@ where match *byte { // Common values contribute less to the quality than the // rest of values. - 0x00 | 0x20 | 0x90 | 0xcc | 0xff => { + 0x20 | 0x90 | 0xcc | 0xff => { q += 12; } + // Zeroes are specially bad and contribute less. + 0x00 => { + q += 6; + } // Bytes in the ASCII ranges a-z and A-Z have a slightly // lower quality than the rest. We want to favor atoms that // don't contain too many letters, as they generate less @@ -72,23 +76,37 @@ where pub(crate) struct SeqQuality { seq_len: u32, min_atom_len: u32, + min_atom_quality: i32, } impl SeqQuality { pub fn min() -> Self { - Self { seq_len: u32::MAX, min_atom_len: 0 } + Self { seq_len: u32::MAX, min_atom_len: 0, min_atom_quality: i32::MIN } } } impl PartialOrd for SeqQuality { fn partial_cmp(&self, other: &Self) -> Option { + // This sequence is better than the other if its worst atom is better + // the other's worst atom. + if self.min_atom_quality > other.min_atom_quality { + return Some(Ordering::Greater); + } // If the shortest atom in both sequences have the same length, the - // best sequence is the shortest one. + // best sequence is the one that has the higher min_atom_quality. If + // both have the same min_atom_quality, then the shorter sequence is + // the best. if self.min_atom_len == other.min_atom_len { - return if self.seq_len < other.seq_len { - Some(Ordering::Greater) - } else { - Some(Ordering::Less) + return match (self.min_atom_quality, other.min_atom_quality) { + (q1, q2) if q1 == q2 => { + if self.seq_len < other.seq_len { + Some(Ordering::Greater) + } else { + Some(Ordering::Less) + } + } + (q1, q2) if q1 > q2 => Some(Ordering::Greater), + _ => Some(Ordering::Less), }; } // If the minimum atom length for this sequence is exactly one byte @@ -115,7 +133,9 @@ impl PartialOrd for SeqQuality { // In general, this sequence is better than the other only if // its minimum atom length is greater. - if self.min_atom_len > other.min_atom_len { + if self.min_atom_quality > other.min_atom_quality + || self.min_atom_len > other.min_atom_len + { Some(Ordering::Greater) } else { Some(Ordering::Less) @@ -126,7 +146,14 @@ impl PartialOrd for SeqQuality { pub(crate) fn seq_quality(seq: &Seq) -> Option { seq.len().map(|len| SeqQuality { seq_len: len as u32, - min_atom_len: seq.min_literal_len().unwrap() as u32, + min_atom_len: seq.min_literal_len().unwrap_or(0) as u32, + min_atom_quality: seq + .literals() + .unwrap_or(&[]) + .iter() + .map(|l| atom_quality(l.as_bytes())) + .min() + .unwrap_or(i32::MIN), }) } @@ -190,6 +217,10 @@ mod test { assert!(q_ab > q_01); assert!(q_aa > q_01); assert!(q_ab > q_aa); + dbg!(q_ab); + dbg!(q_00000001); + + assert!(q_ab > q_000001); } #[test] @@ -227,5 +258,40 @@ mod test { Literal::inexact("fgh") ])) ); + + assert!( + seq_quality(&Seq::new(vec![Literal::inexact("abcd"),])) + > seq_quality(&Seq::new(vec![Literal::inexact( + "\x00\x00\x00\x00" + ),])) + ); + + assert!( + seq_quality(&Seq::new(vec![Literal::inexact("abc"),])) + > seq_quality(&Seq::new(vec![Literal::inexact( + "\x00\x00\x00\x00" + ),])) + ); + + assert!( + seq_quality(&Seq::new(vec![Literal::inexact("abc"),])) + > seq_quality(&Seq::new(vec![Literal::inexact( + "\x00\x00\x00\x01" + ),])) + ); + + assert!( + seq_quality(&Seq::new(vec![Literal::inexact("\x01\0x02\0x03"),])) + > seq_quality(&Seq::new(vec![Literal::inexact( + "\x00\x00\x00\x01" + ),])) + ); + + assert!( + seq_quality(&Seq::new(vec![Literal::inexact("ab"),])) + > seq_quality(&Seq::new(vec![Literal::inexact( + "\x00\x00\x00\x00" + ),])) + ); } } diff --git a/yara-x/src/re/tests.rs b/yara-x/src/re/tests.rs index 81a0b22cf..9c0b7295b 100644 --- a/yara-x/src/re/tests.rs +++ b/yara-x/src/re/tests.rs @@ -1051,6 +1051,8 @@ fn re_atoms() { assert_re_atoms!(r#"(?i)abc.*123"#, vec![Atom::inexact(b"123")]); + assert_re_atoms!("\x00\x00\x00\x00.{2,3}abc", vec![Atom::inexact(b"abc")]); + assert_re_atoms!( r#"(?s)a.b.c.d"#, // Atoms a\x00b, a\x01b, a\x02b, .... up to a\xffb @@ -1066,3 +1068,8 @@ fn re_atoms() { vec![Atom::inexact(b"a")] ); } + +#[test] +fn issue() { + assert_re_atoms!("\x00\x00\x00\x00.{2,3}abc", vec![Atom::inexact(b"abc")]); +}