Skip to content

Commit

Permalink
fix: issue while matching regexps with FastVM
Browse files Browse the repository at this point in the history
When case-insensitive regexps contained jumps that didn't allow newline characters, there were cases in which false-positive matches were returned.
  • Loading branch information
plusvic committed Mar 10, 2024
1 parent 3da6fc8 commit 7ca768d
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 39 deletions.
96 changes: 57 additions & 39 deletions lib/src/re/fast/fastvm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ impl<'r> FastVM<'r> {
}
Self::jump_bck(
&input[..input.len() - position],
literal,
literal.last().copied(),
flags,
&range,
*position,
Expand All @@ -287,7 +287,7 @@ impl<'r> FastVM<'r> {
}
Self::jump_fwd(
&input[*position..],
literal,
literal.first().copied(),
flags,
&range,
*position,
Expand All @@ -304,7 +304,7 @@ impl<'r> FastVM<'r> {
}
Self::jump_bck(
&input[..input.len() - position],
literal,
literal.last().copied(),
flags,
&range,
*position,
Expand All @@ -321,7 +321,7 @@ impl<'r> FastVM<'r> {
}
Self::jump_fwd(
&input[*position..],
literal,
literal.first().copied(),
flags,
&range,
*position,
Expand All @@ -337,7 +337,7 @@ impl<'r> FastVM<'r> {
if backwards {
Self::jump_bck(
&input[..input.len() - position],
&[],
None,
flags,
&range,
*position,
Expand All @@ -346,7 +346,7 @@ impl<'r> FastVM<'r> {
} else {
Self::jump_fwd(
&input[*position..],
&[],
None,
flags,
&range,
*position,
Expand Down Expand Up @@ -538,7 +538,7 @@ impl FastVM<'_> {
#[inline]
fn jump_fwd(
input: &[u8],
literal: &[u8],
expected_after_jump: Option<u8>,
flags: JumpFlagSet,
range: &RangeInclusive<u16>,
position: usize,
Expand Down Expand Up @@ -580,14 +580,10 @@ impl FastVM<'_> {
}
};

// If the literal is non-empty, the next positions are those where
// the byte in the data matches the first byte in the literal.
if let Some(lit) = literal.first() {
if flags.contains(JumpFlags::AcceptNewlines) {
for offset in memchr::memchr_iter(*lit, jmp_range) {
on_match_found(offset)
}
} else {
let accept_newlines = flags.contains(JumpFlags::AcceptNewlines);

match expected_after_jump {
Some(b) if !accept_newlines => {
// Search for the literal byte and the newline at the same
// time. Any offset found before the newline is a position
// that needs to be verified, but once the newline is found
Expand All @@ -596,27 +592,36 @@ impl FastVM<'_> {
// There's an edge case when the literal byte is also a
// newline, in such cases any newline found most also be
// verified.
for offset in memchr::memchr2_iter(*lit, 0x0A, jmp_range) {
if *lit != 0x0A && jmp_range[offset] == 0x0A {
for offset in memchr::memchr2_iter(b, 0x0A, jmp_range) {
if b != 0x0A && jmp_range[offset] == 0x0A {
return;
}
on_match_found(offset)
}
}
}
// If the literal is empty, every position within the jump range is a
// a match.
else {
for offset in 0..jmp_range.len() {
on_match_found(offset)
Some(b) if accept_newlines => {
// If newlines are accepted, we can search for the literal
// byte alone. There are potential matches only at the
// positions where the byte is found.
for offset in memchr::memchr_iter(b, jmp_range) {
on_match_found(offset)
}
}
_ => {
for (offset, byte) in jmp_range.iter().enumerate() {
if !accept_newlines && *byte == 0x0A {
return;
}
on_match_found(offset)
}
}
}
}

#[inline]
fn jump_bck(
input: &[u8],
literal: &[u8],
expected_after_jump: Option<u8>,
flags: JumpFlagSet,
range: &RangeInclusive<u16>,
position: usize,
Expand Down Expand Up @@ -681,29 +686,42 @@ impl FastVM<'_> {
}
};

// If the literal is non-empty, the next positions are those where
// the byte in the data matches the first byte in the literal.
if let Some(lit) = literal.last() {
if flags.contains(JumpFlags::AcceptNewlines) {
for offset in memchr::memrchr_iter(*lit, jmp_range) {
let accept_newlines = flags.contains(JumpFlags::AcceptNewlines);

match expected_after_jump {
Some(b) if !accept_newlines => {
// Search for the literal byte and the newline at the same
// time. Any offset found before the newline is a position
// that needs to be verified, but once the newline is found
// no more positions will match and we can return.
//
// There's an edge case when the literal byte is also a
// newline, in such cases any newline found most also be
// verified.
for offset in memchr::memrchr2_iter(b, 0x0A, jmp_range) {
if b != 0x0A && jmp_range[offset] == 0x0A {
return;
}
on_match_found(offset)
}
} else {
for offset in memchr::memrchr2_iter(*lit, 0x0A, jmp_range) {
if *lit != 0x0A && jmp_range[offset] == 0x0A {
}
Some(b) if accept_newlines => {
// If newlines are accepted, we can search for the literal
// byte alone. There are potential matches only at the
// positions where the byte is found.
for offset in memchr::memrchr_iter(b, jmp_range) {
on_match_found(offset)
}
}
_ => {
for (offset, byte) in jmp_range.iter().enumerate().rev() {
if !accept_newlines && *byte == 0x0A {
return;
}
on_match_found(offset)
}
}
}
// If the literal is empty, every position within the jump range is a
// a match.
else {
for offset in (0..jmp_range.len()).rev() {
on_match_found(offset)
}
}
}
}

Expand Down
20 changes: 20 additions & 0 deletions lib/src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1676,6 +1676,26 @@ fn hex_large_jumps() {
}"#,
JUMPS_DATA.as_bytes()
);

rule_true!(
r#"rule test {
strings:
$a = /dddd.{0,28}?DDDD.{0,28}?dddd/si
condition:
$a
}"#,
JUMPS_DATA.as_bytes()
);

// Newline characters not allowed in jump.
rule_false!(
r#"rule test {
strings:
$a = /dddd.{0,28}?DDDD.{0,28}?dddd/i
condition:
$a
}"#
);
}

#[test]
Expand Down
Binary file modified lib/src/tests/testdata/jumps.bin
Binary file not shown.

0 comments on commit 7ca768d

Please sign in to comment.