From 7ca768d99f30f7ca7f0f5b2029fad335a5abd965 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Sun, 10 Mar 2024 21:03:29 +0100 Subject: [PATCH] fix: issue while matching regexps with FastVM When case-insensitive regexps contained jumps that didn't allow newline characters, there were cases in which false-positive matches were returned. --- lib/src/re/fast/fastvm.rs | 96 ++++++++++++++++++------------- lib/src/tests/mod.rs | 20 +++++++ lib/src/tests/testdata/jumps.bin | Bin 1664 -> 1664 bytes 3 files changed, 77 insertions(+), 39 deletions(-) diff --git a/lib/src/re/fast/fastvm.rs b/lib/src/re/fast/fastvm.rs index f89f48c4c..8b7353d74 100644 --- a/lib/src/re/fast/fastvm.rs +++ b/lib/src/re/fast/fastvm.rs @@ -272,7 +272,7 @@ impl<'r> FastVM<'r> { } Self::jump_bck( &input[..input.len() - position], - literal, + literal.last().copied(), flags, &range, *position, @@ -287,7 +287,7 @@ impl<'r> FastVM<'r> { } Self::jump_fwd( &input[*position..], - literal, + literal.first().copied(), flags, &range, *position, @@ -304,7 +304,7 @@ impl<'r> FastVM<'r> { } Self::jump_bck( &input[..input.len() - position], - literal, + literal.last().copied(), flags, &range, *position, @@ -321,7 +321,7 @@ impl<'r> FastVM<'r> { } Self::jump_fwd( &input[*position..], - literal, + literal.first().copied(), flags, &range, *position, @@ -337,7 +337,7 @@ impl<'r> FastVM<'r> { if backwards { Self::jump_bck( &input[..input.len() - position], - &[], + None, flags, &range, *position, @@ -346,7 +346,7 @@ impl<'r> FastVM<'r> { } else { Self::jump_fwd( &input[*position..], - &[], + None, flags, &range, *position, @@ -538,7 +538,7 @@ impl FastVM<'_> { #[inline] fn jump_fwd( input: &[u8], - literal: &[u8], + expected_after_jump: Option, flags: JumpFlagSet, range: &RangeInclusive, position: usize, @@ -580,14 +580,10 @@ impl FastVM<'_> { } }; - // If the literal is non-empty, the next positions are those where - // the byte in the data matches the first byte in the literal. - if let Some(lit) = literal.first() { - if flags.contains(JumpFlags::AcceptNewlines) { - for offset in memchr::memchr_iter(*lit, jmp_range) { - on_match_found(offset) - } - } else { + let accept_newlines = flags.contains(JumpFlags::AcceptNewlines); + + match expected_after_jump { + Some(b) if !accept_newlines => { // Search for the literal byte and the newline at the same // time. Any offset found before the newline is a position // that needs to be verified, but once the newline is found @@ -596,19 +592,28 @@ impl FastVM<'_> { // There's an edge case when the literal byte is also a // newline, in such cases any newline found most also be // verified. - for offset in memchr::memchr2_iter(*lit, 0x0A, jmp_range) { - if *lit != 0x0A && jmp_range[offset] == 0x0A { + for offset in memchr::memchr2_iter(b, 0x0A, jmp_range) { + if b != 0x0A && jmp_range[offset] == 0x0A { return; } on_match_found(offset) } } - } - // If the literal is empty, every position within the jump range is a - // a match. - else { - for offset in 0..jmp_range.len() { - on_match_found(offset) + Some(b) if accept_newlines => { + // If newlines are accepted, we can search for the literal + // byte alone. There are potential matches only at the + // positions where the byte is found. + for offset in memchr::memchr_iter(b, jmp_range) { + on_match_found(offset) + } + } + _ => { + for (offset, byte) in jmp_range.iter().enumerate() { + if !accept_newlines && *byte == 0x0A { + return; + } + on_match_found(offset) + } } } } @@ -616,7 +621,7 @@ impl FastVM<'_> { #[inline] fn jump_bck( input: &[u8], - literal: &[u8], + expected_after_jump: Option, flags: JumpFlagSet, range: &RangeInclusive, position: usize, @@ -681,29 +686,42 @@ impl FastVM<'_> { } }; - // If the literal is non-empty, the next positions are those where - // the byte in the data matches the first byte in the literal. - if let Some(lit) = literal.last() { - if flags.contains(JumpFlags::AcceptNewlines) { - for offset in memchr::memrchr_iter(*lit, jmp_range) { + let accept_newlines = flags.contains(JumpFlags::AcceptNewlines); + + match expected_after_jump { + Some(b) if !accept_newlines => { + // Search for the literal byte and the newline at the same + // time. Any offset found before the newline is a position + // that needs to be verified, but once the newline is found + // no more positions will match and we can return. + // + // There's an edge case when the literal byte is also a + // newline, in such cases any newline found most also be + // verified. + for offset in memchr::memrchr2_iter(b, 0x0A, jmp_range) { + if b != 0x0A && jmp_range[offset] == 0x0A { + return; + } on_match_found(offset) } - } else { - for offset in memchr::memrchr2_iter(*lit, 0x0A, jmp_range) { - if *lit != 0x0A && jmp_range[offset] == 0x0A { + } + Some(b) if accept_newlines => { + // If newlines are accepted, we can search for the literal + // byte alone. There are potential matches only at the + // positions where the byte is found. + for offset in memchr::memrchr_iter(b, jmp_range) { + on_match_found(offset) + } + } + _ => { + for (offset, byte) in jmp_range.iter().enumerate().rev() { + if !accept_newlines && *byte == 0x0A { return; } on_match_found(offset) } } } - // If the literal is empty, every position within the jump range is a - // a match. - else { - for offset in (0..jmp_range.len()).rev() { - on_match_found(offset) - } - } } } diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs index 299c3eacb..8e5a538b4 100644 --- a/lib/src/tests/mod.rs +++ b/lib/src/tests/mod.rs @@ -1676,6 +1676,26 @@ fn hex_large_jumps() { }"#, JUMPS_DATA.as_bytes() ); + + rule_true!( + r#"rule test { + strings: + $a = /dddd.{0,28}?DDDD.{0,28}?dddd/si + condition: + $a + }"#, + JUMPS_DATA.as_bytes() + ); + + // Newline characters not allowed in jump. + rule_false!( + r#"rule test { + strings: + $a = /dddd.{0,28}?DDDD.{0,28}?dddd/i + condition: + $a + }"# + ); } #[test] diff --git a/lib/src/tests/testdata/jumps.bin b/lib/src/tests/testdata/jumps.bin index fe4fb80aacad729d16cc2da28adb66dfceefdf80..9e13c6059a2e6f3da29058d05c953318455450ea 100644 GIT binary patch delta 13 UcmZqRZQ$J?#Ky=qS(q&s02fOG2LJ#7 delta 14 VcmZqRZQ$J?#5OTnXtD}h3;-XC1Nr~}