Skip to content

Commit

Permalink
refactor: simplify the implementation of greedy regexps in FastVM
Browse files Browse the repository at this point in the history
Instead of having specific opcodes for greedy jumps, all jumps are now handled as non-greedy. For greedy regexps, the search is not aborted after the first match; it keeps searching for the longest possible match.
  • Loading branch information
plusvic committed Sep 6, 2023
1 parent f5e5690 commit 925695d
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 160 deletions.
40 changes: 7 additions & 33 deletions yara-x/src/re/fast/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,7 @@ impl Compiler {
}
}
}
PatternPiece::JumpExact(..)
| PatternPiece::Jump(..)
| PatternPiece::JumpGreedy(..) => {
PatternPiece::JumpExact(..) | PatternPiece::Jump(..) => {
piece_atoms.push((None, None, None, i32::MIN))
}
};
Expand Down Expand Up @@ -209,14 +207,12 @@ impl Compiler {
PatternPiece::JumpExact(len, accept_newlines) => {
instr.emit_jump_exact(*len as u16, *accept_newlines)
}
PatternPiece::Jump(min, max, accept_newlines)
| PatternPiece::JumpGreedy(min, max, accept_newlines) => {
PatternPiece::Jump(min, max, accept_newlines) => {
instr.emit_jump(
*min as u16,
// TODO: implement a different type of jump for those cases
// that don't have an upper bound.
max.unwrap_or(u16::MAX as u32) as u16,
matches!(piece, PatternPiece::JumpGreedy(..)),
*accept_newlines,
);
}
Expand Down Expand Up @@ -244,7 +240,6 @@ enum PatternPiece {
Pattern(Pattern),
Alternation(Vec<Pattern>),
Jump(u32, Option<u32>, bool),
JumpGreedy(u32, Option<u32>, bool),
JumpExact(u32, bool),
}

Expand Down Expand Up @@ -371,12 +366,6 @@ impl Visitor for PatternSplitter {
min,
accept_newlines,
));
} else if rep.greedy {
self.pieces.push(PatternPiece::JumpGreedy(
min,
max,
accept_newlines,
));
} else {
self.pieces.push(PatternPiece::Jump(
min,
Expand Down Expand Up @@ -462,26 +451,11 @@ impl InstrSeq {
self.seq.write_all(len.to_le_bytes().as_slice()).unwrap();
}

pub fn emit_jump(
&mut self,
min: u16,
max: u16,
greedy: bool,
accept_newlines: bool,
) {
match (greedy, accept_newlines) {
(true, true) => {
self.seq.write_all(&[Instr::JUMP_GREEDY]).unwrap();
}
(true, false) => {
self.seq.write_all(&[Instr::JUMP_GREEDY_NO_NEWLINE]).unwrap();
}
(false, true) => {
self.seq.write_all(&[Instr::JUMP]).unwrap();
}
(false, false) => {
self.seq.write_all(&[Instr::JUMP_NO_NEWLINE]).unwrap();
}
pub fn emit_jump(&mut self, min: u16, max: u16, accept_newlines: bool) {
if accept_newlines {
self.seq.write_all(&[Instr::JUMP]).unwrap();
} else {
self.seq.write_all(&[Instr::JUMP_NO_NEWLINE]).unwrap();
}
self.seq.write_all(min.to_le_bytes().as_slice()).unwrap();
self.seq.write_all(max.to_le_bytes().as_slice()).unwrap();
Expand Down
125 changes: 23 additions & 102 deletions yara-x/src/re/fast/fastvm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,20 +223,9 @@ impl<'r> FastVM<'r> {
}
}
}
Instr::Jump(ref range)
| Instr::JumpGreedy(ref range)
| Instr::JumpNoNewline(ref range)
| Instr::JumpGreedyNoNewline(ref range) => {
let greedy = matches!(
instr,
Instr::JumpGreedy(_) | Instr::JumpGreedyNoNewline(_)
);

let accept_newlines = !matches!(
instr,
Instr::JumpNoNewline(_)
| Instr::JumpGreedyNoNewline(_)
);
Instr::Jump(ref range) | Instr::JumpNoNewline(ref range) => {
let accept_newlines =
!matches!(instr, Instr::JumpNoNewline(_));

match InstrParser::decode_instr(&self.code[ip..]) {
(Instr::Literal(literal), _) if backwards => {
Expand All @@ -248,7 +237,6 @@ impl<'r> FastVM<'r> {
&input[..input.len() - position],
literal,
wide,
greedy,
accept_newlines,
range,
*position,
Expand All @@ -265,7 +253,6 @@ impl<'r> FastVM<'r> {
&input[*position..],
literal,
wide,
greedy,
accept_newlines,
range,
*position,
Expand All @@ -284,7 +271,6 @@ impl<'r> FastVM<'r> {
&input[..input.len() - position],
literal,
wide,
greedy,
accept_newlines,
range,
*position,
Expand All @@ -303,7 +289,6 @@ impl<'r> FastVM<'r> {
&input[*position..],
literal,
wide,
greedy,
accept_newlines,
range,
*position,
Expand Down Expand Up @@ -479,7 +464,6 @@ impl FastVM<'_> {
input: &[u8],
literal: &[u8],
wide: bool,
greedy: bool,
accept_newlines: bool,
range: &RangeInclusive<u16>,
position: usize,
Expand Down Expand Up @@ -518,51 +502,20 @@ impl FastVM<'_> {
next_positions.insert(position + n + offset);
}
};
match (greedy, accept_newlines) {
// Non-greedy, newlines accepted.
(false, true) => {
for offset in memchr::memchr_iter(lit, jmp_range) {
on_match_found(offset)
}
if accept_newlines {
for offset in memchr::memchr_iter(lit, jmp_range) {
on_match_found(offset)
}
// Non-greedy, newlines not accepted.
(false, false) => {
// Search for the literal byte and the newline at the same
// time. Any offset found before the newline is a position
// that needs to be verified, but once the newline is found
// no more positions will match and we can return.
for offset in memchr::memchr2_iter(lit, 0x0A, jmp_range) {
if jmp_range[offset] == 0x0A {
return;
}
on_match_found(offset)
}
}
// Greedy, newlines accepted.
(true, true) => {
for offset in memchr::memrchr_iter(lit, jmp_range) {
on_match_found(offset)
}
}
// Greedy, newlines not accepted.
(true, false) => {
// Search for the newline character in the range covered by
// the jump. If found, truncate the range at the point where
// the newline was found.
let jmp_range = if let Some(newline) =
memchr::memchr(0x0A, jmp_range)
{
&jmp_range[..newline]
} else {
jmp_range
};
// Now search for the literal byte from right to left (in
// opposite direction to the forward jump). As this is a
// greedy jump we want the higher offsets to be inserted
// in `next_positions` first.
for offset in memchr::memrchr_iter(lit, jmp_range) {
on_match_found(offset)
} else {
// Search for the literal byte and the newline at the same
// time. Any offset found before the newline is a position
// that needs to be verified, but once the newline is found
// no more positions will match and we can return.
for offset in memchr::memchr2_iter(lit, 0x0A, jmp_range) {
if jmp_range[offset] == 0x0A {
return;
}
on_match_found(offset)
}
}
}
Expand All @@ -574,7 +527,6 @@ impl FastVM<'_> {
input: &[u8],
literal: &[u8],
wide: bool,
greedy: bool,
accept_newlines: bool,
range: &RangeInclusive<u16>,
position: usize,
Expand Down Expand Up @@ -639,47 +591,16 @@ impl FastVM<'_> {
);
}
};
match (greedy, accept_newlines) {
// Non-greedy, newlines accepted.
(false, true) => {
for offset in memchr::memrchr_iter(lit, jmp_range) {
on_match_found(offset)
}
}
// Non-greedy, newlines not accepted.
(false, false) => {
for offset in memchr::memrchr2_iter(lit, 0x0A, jmp_range) {
if jmp_range[offset] == 0x0A {
return;
}
on_match_found(offset)
}
}
// Greedy, newlines accepted.
(true, true) => {
for offset in memchr::memchr_iter(lit, jmp_range) {
on_match_found(offset)
}
if accept_newlines {
for offset in memchr::memrchr_iter(lit, jmp_range) {
on_match_found(offset)
}
// Greedy, newlines not accepted.
(true, false) => {
// Search for the newline character in the range covered by
// the jump. If found, truncate the range at the point where
// the newline was found.
let jmp_range = if let Some(newline) =
memchr::memchr(0x0A, jmp_range)
{
&jmp_range[newline + 1..]
} else {
jmp_range
};
// Now search for the literal byte from left to right (in
// the opposite direction to the backward jump). As this is
// a greedy jump we want the lower offsets to be inserted
// in `next_positions` first.
for offset in memchr::memchr_iter(lit, jmp_range) {
on_match_found(offset)
} else {
for offset in memchr::memrchr2_iter(lit, 0x0A, jmp_range) {
if jmp_range[offset] == 0x0A {
return;
}
on_match_found(offset)
}
}
}
Expand Down
28 changes: 5 additions & 23 deletions yara-x/src/re/fast/instr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use std::mem::size_of;
use std::ops::RangeInclusive;

use crate::re::fast::instr::Instr::{
Alternation, Jump, JumpExact, JumpExactNoNewline, JumpGreedy,
JumpGreedyNoNewline, JumpNoNewline, Literal, MaskedLiteral, Match,
Alternation, Jump, JumpExact, JumpExactNoNewline, JumpNoNewline, Literal,
MaskedLiteral, Match,
};

/// Instructions supported by the Fast VM.
Expand Down Expand Up @@ -39,12 +39,6 @@ pub enum Instr<'a> {
/// contain newline characters. This is a non-greedy match, shorter strings
/// are preferred.
JumpNoNewline(RangeInclusive<u16>),

/// Exactly like Jump, but greedy.
JumpGreedy(RangeInclusive<u16>),

/// Exactly like JumpNoNewline, but greedy.
JumpGreedyNoNewline(RangeInclusive<u16>),
}

impl<'a> Instr<'a> {
Expand All @@ -53,11 +47,9 @@ impl<'a> Instr<'a> {
pub const MASKED_LITERAL: u8 = 0x02;
pub const JUMP_EXACT: u8 = 0x03;
pub const JUMP: u8 = 0x04;
pub const JUMP_GREEDY: u8 = 0x05;
pub const JUMP_EXACT_NO_NEWLINE: u8 = 0x06;
pub const JUMP_NO_NEWLINE: u8 = 0x07;
pub const JUMP_GREEDY_NO_NEWLINE: u8 = 0x08;
pub const ALTERNATION: u8 = 0x09;
pub const JUMP_EXACT_NO_NEWLINE: u8 = 0x05;
pub const JUMP_NO_NEWLINE: u8 = 0x06;
pub const ALTERNATION: u8 = 0x07;
}

/// Parses a slice of bytes that contains Fast VM instructions, returning
Expand Down Expand Up @@ -112,11 +104,6 @@ impl<'a> InstrParser<'a> {
let max = Self::decode_u16(&code[1 + size_of::<u16>()..]);
(Jump(min..=max), 1 + 2 * size_of::<u16>())
}
[Instr::JUMP_GREEDY, ..] => {
let min = Self::decode_u16(&code[1..]);
let max = Self::decode_u16(&code[1 + size_of::<u16>()..]);
(JumpGreedy(min..=max), 1 + 2 * size_of::<u16>())
}
[Instr::JUMP_EXACT_NO_NEWLINE, ..] => {
let len = Self::decode_u16(&code[1..]);
(JumpExactNoNewline(len), 1 + size_of::<u16>())
Expand All @@ -126,11 +113,6 @@ impl<'a> InstrParser<'a> {
let max = Self::decode_u16(&code[1 + size_of::<u16>()..]);
(JumpNoNewline(min..=max), 1 + 2 * size_of::<u16>())
}
[Instr::JUMP_GREEDY_NO_NEWLINE, ..] => {
let min = Self::decode_u16(&code[1..]);
let max = Self::decode_u16(&code[1 + size_of::<u16>()..]);
(JumpGreedyNoNewline(min..=max), 1 + 2 * size_of::<u16>())
}
[Instr::MATCH, ..] => (Match, 1),
[opcode, ..] => {
unreachable!("unknown opcode for FastVM: {}", opcode)
Expand Down
6 changes: 5 additions & 1 deletion yara-x/src/scanner/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -838,7 +838,11 @@ fn verify_regexp_match(
flags.contains(SubPatternFlags::Wide),
|match_len| {
fwd_match_len = Some(match_len);
Action::Stop
if flags.contains(SubPatternFlags::GreedyRegexp) {
Action::Continue
} else {
Action::Stop
}
},
);
} else {
Expand Down
2 changes: 1 addition & 1 deletion yara-x/src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -867,7 +867,7 @@ fn regexp_patterns_1() {

#[test]
fn issue() {
pattern_false!(r#"/abcd.{3}aaa/"#, b"abcd\naaaaaa");
pattern_match!(r#"/abcd.*aaa/"#, b"abcdxxxxaaa", b"abcdxxxxaaa");
}

#[test]
Expand Down

0 comments on commit 925695d

Please sign in to comment.