From f5e569074e4acb4e1c42e9e923596fb8546b4c0c Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Tue, 5 Sep 2023 18:07:48 +0200 Subject: [PATCH] feat: implement jumps that skip over every byte except the newline character This allows using `FastVM` with regular expressions like `/abc.*def/`, where `.` matches everything except newlines. --- yara-x/src/re/fast/compiler.rs | 116 ++++++++++++++-------- yara-x/src/re/fast/fastvm.rs | 172 +++++++++++++++++++++++++++++---- yara-x/src/re/fast/instr.rs | 40 +++++++- yara-x/src/re/hir.rs | 30 ++++++ yara-x/src/tests/mod.rs | 22 +++++ 5 files changed, 316 insertions(+), 64 deletions(-) diff --git a/yara-x/src/re/fast/compiler.rs b/yara-x/src/re/fast/compiler.rs index b9e895acb..c4c9e218b 100644 --- a/yara-x/src/re/fast/compiler.rs +++ b/yara-x/src/re/fast/compiler.rs @@ -98,9 +98,9 @@ impl Compiler { } } } - PatternPiece::JumpExact(_) - | PatternPiece::Jump(_, _) - | PatternPiece::JumpGreedy(_, _) => { + PatternPiece::JumpExact(..) + | PatternPiece::Jump(..) + | PatternPiece::JumpGreedy(..) => { piece_atoms.push((None, None, None, i32::MIN)) } }; @@ -206,13 +206,18 @@ impl Compiler { PatternPiece::Alternation(alt) => { instr.emit_alternation(alt); } - PatternPiece::JumpExact(len) => instr.emit_jump_exact(*len), - PatternPiece::Jump(min, max) - | PatternPiece::JumpGreedy(min, max) => { + PatternPiece::JumpExact(len, accept_newlines) => { + instr.emit_jump_exact(*len as u16, *accept_newlines) + } + PatternPiece::Jump(min, max, accept_newlines) + | PatternPiece::JumpGreedy(min, max, accept_newlines) => { instr.emit_jump( - *min, - *max, - matches!(piece, PatternPiece::JumpGreedy(_, _)), + *min as u16, + // TODO: implement a different type of jump for those cases + // that don't have an upper bound. 
+ max.unwrap_or(u16::MAX as u32) as u16, + matches!(piece, PatternPiece::JumpGreedy(..)), + *accept_newlines, ); } } @@ -229,15 +234,18 @@ impl Compiler { /// /// ```text /// Pattern(Literal([01, 02, 03])) -/// Jump(0,2) +/// Jump(0,2, false) /// Pattern(Masked([04, 00, 06], [FF, F0, FF])) /// ``` +/// +/// The `bool` field in all jump variants indicates whether newline characters +/// are accepted in the data being skipped or not. enum PatternPiece { Pattern(Pattern), Alternation(Vec), - Jump(u16, u16), - JumpGreedy(u16, u16), - JumpExact(u16), + Jump(u32, Option, bool), + JumpGreedy(u32, Option, bool), + JumpExact(u32, bool), } enum Pattern { @@ -330,46 +338,53 @@ impl Visitor for PatternSplitter { if self.in_repetition || self.in_alternation { return Err(Error::FastIncompatible); } - if !re::hir::any_byte(rep.sub.kind()) { + + let any_byte = re::hir::any_byte(rep.sub.kind()); + let any_byte_except_newline = + re::hir::any_byte_except_newline(rep.sub.kind()); + + if !any_byte && !any_byte_except_newline { return Err(Error::FastIncompatible); } + + let accept_newlines = !any_byte_except_newline; + match (rep.min, rep.max) { - // When the jump has a fixed size <= 8 treat it as a - // sequence of ?? wildcards. It's more efficient to - // treat short fixed size jumps as a sequence of - // wildcards than breaking the pattern into more - // pieces. - (min, Some(max)) if min == max && max <= 8 => { + // When the jump has a fixed size <= 8 and accepts newlines, + // treat it as a sequence of ?? wildcards. It's more + // efficient to treat short fixed size jumps as a sequence + // of wildcards than breaking the pattern into more pieces. 
+ (min, Some(max)) + if min == max && max <= 8 && accept_newlines => + { for _ in 0..max { self.bytes.push(0); self.mask.push(0); } } - (min, Some(max)) => { + (min, max) => { if let Some(pattern) = self.finish_literal() { self.pieces.push(PatternPiece::Pattern(pattern)); } - if min == max { - self.pieces - .push(PatternPiece::JumpExact(min as u16)); + if Some(min) == max { + self.pieces.push(PatternPiece::JumpExact( + min, + accept_newlines, + )); } else if rep.greedy { self.pieces.push(PatternPiece::JumpGreedy( - min as u16, max as u16, + min, + max, + accept_newlines, )); } else { self.pieces.push(PatternPiece::Jump( - min as u16, max as u16, + min, + max, + accept_newlines, )); } } - // This should not happen. Regexp patterns are split - // into multiple chained patterns by calling - // re::hir::Hir::split_at_large_gaps before being passed - // to this compiler. Therefore patterns should not - // contain unbounded jumps when are compiled. - (_, None) => { - unreachable!() - } } self.in_repetition = true; } @@ -438,16 +453,35 @@ impl InstrSeq { self.seq.write_all(&[Instr::MATCH]).unwrap(); } - pub fn emit_jump_exact(&mut self, len: u16) { - self.seq.write_all(&[Instr::JUMP_EXACT]).unwrap(); + pub fn emit_jump_exact(&mut self, len: u16, accept_newlines: bool) { + if accept_newlines { + self.seq.write_all(&[Instr::JUMP_EXACT]).unwrap(); + } else { + self.seq.write_all(&[Instr::JUMP_EXACT_NO_NEWLINE]).unwrap(); + } self.seq.write_all(len.to_le_bytes().as_slice()).unwrap(); } - pub fn emit_jump(&mut self, min: u16, max: u16, greedy: bool) { - if greedy { - self.seq.write_all(&[Instr::JUMP_GREEDY]).unwrap(); - } else { - self.seq.write_all(&[Instr::JUMP]).unwrap(); + pub fn emit_jump( + &mut self, + min: u16, + max: u16, + greedy: bool, + accept_newlines: bool, + ) { + match (greedy, accept_newlines) { + (true, true) => { + self.seq.write_all(&[Instr::JUMP_GREEDY]).unwrap(); + } + (true, false) => { + self.seq.write_all(&[Instr::JUMP_GREEDY_NO_NEWLINE]).unwrap(); + } 
+ (false, true) => { + self.seq.write_all(&[Instr::JUMP]).unwrap(); + } + (false, false) => { + self.seq.write_all(&[Instr::JUMP_NO_NEWLINE]).unwrap(); + } } self.seq.write_all(min.to_le_bytes().as_slice()).unwrap(); self.seq.write_all(max.to_le_bytes().as_slice()).unwrap(); diff --git a/yara-x/src/re/fast/fastvm.rs b/yara-x/src/re/fast/fastvm.rs index 8ed122f02..590d35ee4 100644 --- a/yara-x/src/re/fast/fastvm.rs +++ b/yara-x/src/re/fast/fastvm.rs @@ -201,11 +201,7 @@ impl<'r> FastVM<'r> { } // The only valid instructions in alternatives // are literals. - Instr::Match - | Instr::Alternation(_) - | Instr::JumpExact(_) - | Instr::JumpGreedy(_) - | Instr::Jump(_) => { + _ => { unreachable!() } } @@ -217,8 +213,30 @@ impl<'r> FastVM<'r> { next_positions.insert(position + step * jump as usize); } } - Instr::Jump(ref range) | Instr::JumpGreedy(ref range) => { - let greedy = matches!(instr, Instr::JumpGreedy(_)); + Instr::JumpExactNoNewline(jump) => { + for position in &self.positions { + let jump_range = + *position..*position + step * jump as usize; + if memchr::memchr(0x0A, &input[jump_range]).is_none() { + next_positions + .insert(position + step * jump as usize); + } + } + } + Instr::Jump(ref range) + | Instr::JumpGreedy(ref range) + | Instr::JumpNoNewline(ref range) + | Instr::JumpGreedyNoNewline(ref range) => { + let greedy = matches!( + instr, + Instr::JumpGreedy(_) | Instr::JumpGreedyNoNewline(_) + ); + + let accept_newlines = !matches!( + instr, + Instr::JumpNoNewline(_) + | Instr::JumpGreedyNoNewline(_) + ); match InstrParser::decode_instr(&self.code[ip..]) { (Instr::Literal(literal), _) if backwards => { @@ -231,6 +249,7 @@ impl<'r> FastVM<'r> { literal, wide, greedy, + accept_newlines, range, *position, &mut next_positions, @@ -247,6 +266,7 @@ impl<'r> FastVM<'r> { literal, wide, greedy, + accept_newlines, range, *position, &mut next_positions, @@ -265,6 +285,7 @@ impl<'r> FastVM<'r> { literal, wide, greedy, + accept_newlines, range, *position, &mut 
next_positions, @@ -283,6 +304,7 @@ impl<'r> FastVM<'r> { literal, wide, greedy, + accept_newlines, range, *position, &mut next_positions, @@ -291,7 +313,29 @@ impl<'r> FastVM<'r> { } _ => { for position in mem::take(&mut self.positions) { + if accept_newlines { + let jmp_min_range = position + ..position + *range.start() as usize; + match input.get(jmp_min_range) { + Some(r) => { + if memchr::memchr(0x0A, r) + .is_some() + { + continue; + } + } + None => continue, + } + } for i in range.clone() { + if accept_newlines { + match input + .get(position + step * i as usize) + { + Some(0x0A) | None => continue, + _ => {} + } + } next_positions .insert(position + step * i as usize); } @@ -436,6 +480,7 @@ impl FastVM<'_> { literal: &[u8], wide: bool, greedy: bool, + accept_newlines: bool, range: &RangeInclusive, position: usize, next_positions: &mut IndexSet, @@ -452,6 +497,13 @@ impl FastVM<'_> { return; } + // If newlines are not accepted in the data being skipped by the jump + // let's make sure that the range that goes from the current position + // to position + n doesn't contain any newlines. + if !accept_newlines && memchr::memchr(0x0A, &input[..n]).is_some() { + return; + } + if let Some(jmp_range) = input.get(range_min..range_max) { let lit = *literal.first().unwrap(); let mut on_match_found = |offset| { @@ -466,13 +518,51 @@ impl FastVM<'_> { next_positions.insert(position + n + offset); } }; - if greedy { - for offset in memchr::memrchr_iter(lit, jmp_range) { - on_match_found(offset) + match (greedy, accept_newlines) { + // Non-greedy, newlines accepted. + (false, true) => { + for offset in memchr::memchr_iter(lit, jmp_range) { + on_match_found(offset) + } } - } else { - for offset in memchr::memchr_iter(lit, jmp_range) { - on_match_found(offset) + // Non-greedy, newlines not accepted. + (false, false) => { + // Search for the literal byte and the newline at the same + // time. 
Any offset found before the newline is a position + // that needs to be verified, but once the newline is found + // no more positions will match and we can return. + for offset in memchr::memchr2_iter(lit, 0x0A, jmp_range) { + if jmp_range[offset] == 0x0A { + return; + } + on_match_found(offset) + } + } + // Greedy, newlines accepted. + (true, true) => { + for offset in memchr::memrchr_iter(lit, jmp_range) { + on_match_found(offset) + } + } + // Greedy, newlines not accepted. + (true, false) => { + // Search for the newline character in the range covered by + // the jump. If found, truncate the range at the point where + // the newline was found. + let jmp_range = if let Some(newline) = + memchr::memchr(0x0A, jmp_range) + { + &jmp_range[..newline] + } else { + jmp_range + }; + // Now search for the literal byte from right to left (in + // the opposite direction to the forward jump). As this is a + // greedy jump we want the higher offsets to be inserted + // in `next_positions` first. + for offset in memchr::memrchr_iter(lit, jmp_range) { + on_match_found(offset) + } } } } @@ -485,6 +575,7 @@ impl FastVM<'_> { literal: &[u8], wide: bool, greedy: bool, + accept_newlines: bool, range: &RangeInclusive, position: usize, next_positions: &mut IndexSet, @@ -521,6 +612,15 @@ impl FastVM<'_> { return; } + // If newlines are not accepted in the data being skipped by the jump + // let's make sure that the range that goes from the current position + // to position + n doesn't contain any newlines. + if !accept_newlines + && memchr::memchr(0x0A, &input[range_max..]).is_some() + { + return; + } + if let Some(jmp_range) = input.get(range_min..range_max) { let lit = *literal.last().unwrap(); let mut on_match_found = |offset| { @@ -539,13 +639,47 @@ impl FastVM<'_> { ); } }; - if greedy { - for offset in memchr::memchr_iter(lit, jmp_range) { - on_match_found(offset) + match (greedy, accept_newlines) { + // Non-greedy, newlines accepted. 
+ (false, true) => { + for offset in memchr::memrchr_iter(lit, jmp_range) { + on_match_found(offset) + } + } + // Non-greedy, newlines not accepted. + (false, false) => { + for offset in memchr::memrchr2_iter(lit, 0x0A, jmp_range) { + if jmp_range[offset] == 0x0A { + return; + } + on_match_found(offset) + } } - } else { - for offset in memchr::memrchr_iter(lit, jmp_range) { - on_match_found(offset) + // Greedy, newlines accepted. + (true, true) => { + for offset in memchr::memchr_iter(lit, jmp_range) { + on_match_found(offset) + } + } + // Greedy, newlines not accepted. + (true, false) => { + // Search for the newline character in the range covered by + // the jump. If found, truncate the range at the point where + // the newline was found. + let jmp_range = if let Some(newline) = + memchr::memchr(0x0A, jmp_range) + { + &jmp_range[newline + 1..] + } else { + jmp_range + }; + // Now search for the literal byte from left to right (in + // the opposite direction to the backward jump). As this is + // a greedy jump we want the lower offsets to be inserted + // in `next_positions` first. + for offset in memchr::memchr_iter(lit, jmp_range) { + on_match_found(offset) + } } } } diff --git a/yara-x/src/re/fast/instr.rs b/yara-x/src/re/fast/instr.rs index 3b41c5688..f588f83a3 100644 --- a/yara-x/src/re/fast/instr.rs +++ b/yara-x/src/re/fast/instr.rs @@ -2,7 +2,8 @@ use std::mem::size_of; use std::ops::RangeInclusive; use crate::re::fast::instr::Instr::{ - Alternation, Jump, JumpExact, JumpGreedy, Literal, MaskedLiteral, Match, + Alternation, Jump, JumpExact, JumpExactNoNewline, JumpGreedy, + JumpGreedyNoNewline, JumpNoNewline, Literal, MaskedLiteral, Match, }; /// Instructions supported by the Fast VM. @@ -23,13 +24,27 @@ pub enum Instr<'a> { /// as part of an alternation. Alternation(InstrParser<'a>), - /// Matches any string of a fixed length. + /// Matches all strings of a given length. JumpExact(u16), - /// Matches any string with a length in a given range. 
+ /// Matches all strings of a given length, but the string can't contain + /// newline characters. + JumpExactNoNewline(u16), + + /// Matches any string with a length in a given range. This is a + /// non-greedy match, shorter strings are preferred. Jump(RangeInclusive), + /// Matches any string with a length in a given range, but the string can't + /// contain newline characters. This is a non-greedy match, shorter strings + /// are preferred. + JumpNoNewline(RangeInclusive), + + /// Exactly like Jump, but greedy. JumpGreedy(RangeInclusive), + + /// Exactly like JumpNoNewline, but greedy. + JumpGreedyNoNewline(RangeInclusive), } impl<'a> Instr<'a> { @@ -39,7 +54,10 @@ impl<'a> Instr<'a> { pub const JUMP_EXACT: u8 = 0x03; pub const JUMP: u8 = 0x04; pub const JUMP_GREEDY: u8 = 0x05; - pub const ALTERNATION: u8 = 0x06; + pub const JUMP_EXACT_NO_NEWLINE: u8 = 0x06; + pub const JUMP_NO_NEWLINE: u8 = 0x07; + pub const JUMP_GREEDY_NO_NEWLINE: u8 = 0x08; + pub const ALTERNATION: u8 = 0x09; } /// Parses a slice of bytes that contains Fast VM instructions, returning @@ -99,6 +117,20 @@ impl<'a> InstrParser<'a> { let max = Self::decode_u16(&code[1 + size_of::()..]); (JumpGreedy(min..=max), 1 + 2 * size_of::()) } + [Instr::JUMP_EXACT_NO_NEWLINE, ..] => { + let len = Self::decode_u16(&code[1..]); + (JumpExactNoNewline(len), 1 + size_of::()) + } + [Instr::JUMP_NO_NEWLINE, ..] => { + let min = Self::decode_u16(&code[1..]); + let max = Self::decode_u16(&code[1 + size_of::()..]); + (JumpNoNewline(min..=max), 1 + 2 * size_of::()) + } + [Instr::JUMP_GREEDY_NO_NEWLINE, ..] => { + let min = Self::decode_u16(&code[1..]); + let max = Self::decode_u16(&code[1 + size_of::()..]); + (JumpGreedyNoNewline(min..=max), 1 + 2 * size_of::()) + } [Instr::MATCH, ..] => (Match, 1), [opcode, ..] 
=> { unreachable!("unknown opcode for FastVM: {}", opcode) diff --git a/yara-x/src/re/hir.rs b/yara-x/src/re/hir.rs index b11511684..065bd1a99 100644 --- a/yara-x/src/re/hir.rs +++ b/yara-x/src/re/hir.rs @@ -9,6 +9,7 @@ use crate::utils::cast; pub use regex_syntax::hir::Class; pub use regex_syntax::hir::ClassBytes; pub use regex_syntax::hir::HirKind; +use regex_syntax::hir::{ClassBytesRange, ClassUnicode, ClassUnicodeRange}; #[derive(Debug, PartialEq)] pub(crate) struct ChainedPattern { @@ -229,6 +230,35 @@ pub fn any_byte(hir_kind: &HirKind) -> bool { } } +/// Returns true if `hir_kind` is a byte class containing all possible bytes +/// except newline. +/// +/// For example `.` in a regexp that doesn't use the `/s` modifier +/// (i.e: `dot_matches_new_line` is false). +pub fn any_byte_except_newline(hir_kind: &HirKind) -> bool { + match hir_kind { + HirKind::Class(Class::Bytes(class)) => { + // The class must contain two ranges, one that contains all bytes + // in the range 0x00-0x09, and the other that contains all bytes + // in the range 0x0B-0xFF. Only 0x0A (ASCII code for line-feed) is + // excluded. + let all_bytes_except_newline = ClassBytes::new([ + ClassBytesRange::new(0x00, 0x09), + ClassBytesRange::new(0x0B, 0xFF), + ]); + all_bytes_except_newline.eq(class) + } + HirKind::Class(Class::Unicode(class)) => { + let all_bytes_except_newline = ClassUnicode::new([ + ClassUnicodeRange::new(0x00 as char, 0x09 as char), + ClassUnicodeRange::new(0x0B as char, char::MAX), + ]); + all_bytes_except_newline.eq(class) + } + _ => false, + } +} + /// Returns [`Some(HexByte)`] if the given [`ClassBytes`] represents a /// masked byte. 
/// diff --git a/yara-x/src/tests/mod.rs b/yara-x/src/tests/mod.rs index e0e3fae53..b145f4191 100644 --- a/yara-x/src/tests/mod.rs +++ b/yara-x/src/tests/mod.rs @@ -830,6 +830,23 @@ fn regexp_patterns_1() { pattern_match!(r#"/(a|b|c|d|e)f/"#, b"ef", b"ef"); pattern_match!(r#"/a|b/"#, b"a", b"a"); + pattern_match!(r#"/abcd.*ef/"#, b"abcdef", b"abcdef"); + pattern_match!(r#"/ab.*cdef/"#, b"abcdef", b"abcdef"); + pattern_match!(r#"/abcd.*ef/"#, b"abcdxef", b"abcdxef"); + pattern_match!(r#"/ab.*cdef/"#, b"abxcdef", b"abxcdef"); + pattern_false!(r#"/abcd.*ef/"#, b"abcd\nef"); + pattern_false!(r#"/ab.*cdef/"#, b"ab\ncdef"); + pattern_false!(r#"/abcd.{3}aaa/"#, b"abcd\naaaaaa"); + pattern_false!(r#"/ab.{3}aaa/"#, b"ab\naaaaaa"); + pattern_match!(r#"/abcd.*ef/s"#, b"abcd\nef", b"abcd\nef"); + pattern_match!(r#"/ab.*cdef/s"#, b"ab\ncdef", b"ab\ncdef"); + pattern_match!(r#"/abcd.{3}aaa/s"#, b"abcd\naaaaaaaaa", b"abcd\naaaaa"); + pattern_match!(r#"/ab.{3}aaa/s"#, b"ab\naaaaaaaaa", b"ab\naaaaa"); + pattern_false!(r#"/abcd.{1,2}ef/"#, b"abcdef"); + pattern_false!(r#"/ab.{1,2}cdef/"#, b"abcdef"); + pattern_match!(r#"/abcd.{1,2}ef/"#, b"abcdxef", b"abcdxef"); + pattern_match!(r#"/ab.{1,2}cdef/"#, b"abxcdef", b"abxcdef"); + // TODO: known issue related to exact atoms. The matching string // should be "abbb" and not "abb". pattern_match!(r#"/a(bb|b)b/"#, b"abbbbbbbb", b"abb"); @@ -848,6 +865,11 @@ fn regexp_patterns_1() { ); } +#[test] +fn issue() { + pattern_false!(r#"/abcd.{3}aaa/"#, b"abcd\naaaaaa"); +} + #[test] fn regexp_patterns_2() { pattern_match!(r#"/.b{2}/"#, b"abb", b"abb");