From d2f662685e678a07c242bba2aaa9736fd7a652c5 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 28 Feb 2024 09:27:32 +0100 Subject: [PATCH 01/19] feat(capi): improve error handling when strings are not valid UTF-8. --- capi/include/yara-x.h | 1 + capi/src/lib.rs | 1 + capi/src/scanner.rs | 24 +++++++++++++++--------- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/capi/include/yara-x.h b/capi/include/yara-x.h index a4b07befb..09dd4fa03 100644 --- a/capi/include/yara-x.h +++ b/capi/include/yara-x.h @@ -23,6 +23,7 @@ typedef enum YRX_RESULT { SCAN_TIMEOUT, INVALID_IDENTIFIER, INVALID_ARGUMENT, + INVALID_UTF8, SERIALIZATION_ERROR, } YRX_RESULT; diff --git a/capi/src/lib.rs b/capi/src/lib.rs index ade9e2ada..80c4b5380 100644 --- a/capi/src/lib.rs +++ b/capi/src/lib.rs @@ -43,6 +43,7 @@ pub enum YRX_RESULT { SCAN_TIMEOUT, INVALID_IDENTIFIER, INVALID_ARGUMENT, + INVALID_UTF8, SERIALIZATION_ERROR, } diff --git a/capi/src/scanner.rs b/capi/src/scanner.rs index e1d3edc89..32a319fb5 100644 --- a/capi/src/scanner.rs +++ b/capi/src/scanner.rs @@ -193,7 +193,10 @@ pub unsafe extern "C" fn yrx_scanner_set_module_output( let module_name = match CStr::from_ptr(name).to_str() { Ok(name) => name, - Err(_) => return YRX_RESULT::INVALID_ARGUMENT, + Err(err) => { + LAST_ERROR.set(Some(CString::new(err.to_string()).unwrap())); + return YRX_RESULT::INVALID_UTF8; + } }; let data = match slice_from_ptr_and_len(data, len) { @@ -228,7 +231,10 @@ unsafe extern "C" fn yrx_scanner_set_global< let ident = match CStr::from_ptr(ident).to_str() { Ok(ident) => ident, - Err(_) => return YRX_RESULT::INVALID_ARGUMENT, + Err(err) => { + LAST_ERROR.set(Some(CString::new(err.to_string()).unwrap())); + return YRX_RESULT::INVALID_UTF8; + } }; let scanner = scanner.as_mut().unwrap(); @@ -252,13 +258,13 @@ pub unsafe extern "C" fn yrx_scanner_set_global_str( ident: *const c_char, value: *const c_char, ) -> YRX_RESULT { - let value = if let Ok(value) = CStr::from_ptr(value).to_str() { - value - } else { - return YRX_RESULT::INVALID_ARGUMENT; - }; - - yrx_scanner_set_global(scanner, ident, value) + match CStr::from_ptr(value).to_str() { + Ok(value) => yrx_scanner_set_global(scanner, ident, value), + Err(err) => { + LAST_ERROR.set(Some(CString::new(err.to_string()).unwrap())); + YRX_RESULT::INVALID_UTF8 + } + } } /// Sets the value of a global variable of type bool. From e158260bb46a92b67c5d3f3c89e9e485debba9b3 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 28 Feb 2024 14:14:46 +0100 Subject: [PATCH 02/19] fix: issue with constant folding Constant folding for boolean AND and OR operations was broken. Boolean variables with known values were being folded, even if they were not constant. --- lib/src/compiler/ir/mod.rs | 28 ++++++------- lib/src/scanner/tests.rs | 23 ++++++++--- lib/src/types/mod.rs | 85 ++++++++++++++++++-------------------- 3 files changed, 70 insertions(+), 66 deletions(-) diff --git a/lib/src/compiler/ir/mod.rs b/lib/src/compiler/ir/mod.rs index 8d9f77644..96b1d5146 100644 --- a/lib/src/compiler/ir/mod.rs +++ b/lib/src/compiler/ir/mod.rs @@ -786,15 +786,13 @@ impl Expr { pub fn fold(self) -> Self { match self { Expr::And { mut operands } => { - // Retain the operands whose value is unknown or false, and - // remove those that are known to be true. True values in - // the list of operands don't alter the result of the AND - // operation. + // Retain the operands whose value is not constant, or is + // constant but false, remove those that are known to be + // true. True values in the list of operands don't alter + // the result of the AND operation. operands.retain(|op| { - !op.type_value() - .cast_to_bool() - .try_as_bool() - .unwrap_or(false) + let type_value = op.type_value().cast_to_bool(); + !type_value.is_const() || !type_value.as_bool() }); // No operands left, all were true and therefore the AND is @@ -817,15 +815,13 @@ impl Expr { Expr::And { operands } } Expr::Or { mut operands } => { - // Retain the operands whose value is unknown or true, and - // remove those that are known to be false. False values in - // the list of operands don't alter the result of the OR - // operation. + // Retain the operands whose value is not constant, or is + // constant but true, remove those that are known to be false. + // False values in the list of operands don't alter the result + // of the OR operation. operands.retain(|op| { - op.type_value() - .cast_to_bool() - .try_as_bool() - .unwrap_or(true) + let type_value = op.type_value().cast_to_bool(); + !type_value.is_const() || type_value.as_bool() }); // No operands left, all were false and therefore the OR is diff --git a/lib/src/scanner/tests.rs b/lib/src/scanner/tests.rs index 501ba9366..67eabe84d 100644 --- a/lib/src/scanner/tests.rs +++ b/lib/src/scanner/tests.rs @@ -283,13 +283,16 @@ fn variables_2() { let mut compiler = crate::Compiler::new(); compiler - .define_global("some_int", 0) + .define_global("some_bool", true) + .unwrap() + .define_global("some_str", "") .unwrap() .add_source( r#" rule test { condition: - some_int == 1 + some_bool and + some_str == "foo" } "#, ) @@ -307,17 +310,17 @@ fn variables_2() { 0 ); - scanner.set_global("some_int", 1).unwrap(); + scanner.set_global("some_bool", false).unwrap(); assert_eq!( scanner .scan(&[]) .expect("scan should not fail") .matching_rules() .len(), - 1 + 0 ); - scanner.set_global("some_int", 2).unwrap(); + scanner.set_global("some_str", "foo").unwrap(); assert_eq!( scanner .scan(&[]) @@ -326,6 +329,16 @@ fn variables_2() { .len(), 0 ); + + scanner.set_global("some_bool", true).unwrap(); + assert_eq!( + scanner + .scan(&[]) + .expect("scan should not fail") + .matching_rules() + .len(), + 1 + ); } #[test] diff --git a/lib/src/types/mod.rs b/lib/src/types/mod.rs index 8cb780d25..b0afca073 100644 --- a/lib/src/types/mod.rs +++ b/lib/src/types/mod.rs @@ -17,7 +17,7 @@ pub(crate) use func::*; pub(crate) use map::*; pub(crate) use structure::*; -/// The type of a YARA expression or identifier. +/// The type of YARA expression or identifier. #[derive(Clone, Copy, PartialEq)] pub(crate) enum Type { Unknown, @@ -295,20 +295,6 @@ impl TypeValue { } } - pub fn as_string(&self) -> Rc { - if let TypeValue::String(value) = self { - value - .extract() - .cloned() - .expect("TypeValue doesn't have an associated value") - } else { - panic!( - "called `as_string` on a TypeValue that is not TypeValue::String, it is: {:?}", - self - ) - } - } - pub fn as_array(&self) -> Rc { if let TypeValue::Array(array) = self { array.clone() @@ -353,61 +339,70 @@ impl TypeValue { } } + #[inline] + pub fn as_bool(&self) -> bool { + self.try_as_bool().expect("TypeValue doesn't have an associated value") + } + + #[inline] pub fn as_integer(&self) -> i64 { - if let TypeValue::Integer(value) = self { - value - .extract() - .cloned() - .expect("TypeValue doesn't have an associated value") - } else { - panic!( - "called `as_integer` on a TypeValue that is not TypeValue::Integer, it is: {:?}", - self - ) - } + self.try_as_integer() + .expect("TypeValue doesn't have an associated value") } + #[inline] pub fn as_float(&self) -> f64 { - if let TypeValue::Float(value) = self { - value - .extract() - .cloned() - .expect("TypeValue doesn't have an associated value") + self.try_as_float() + .expect("TypeValue doesn't have an associated value") + } + + #[inline] + pub fn as_string(&self) -> Rc { + self.try_as_string() + .expect("TypeValue doesn't have an associated value") + } + + pub fn try_as_bool(&self) -> Option { + if let TypeValue::Bool(value) = self { + value.extract().cloned() } else { panic!( - "called `as_float` on a TypeValue that is not TypeValue::Float, it is: {:?}", + "called `try_as_bool` on a TypeValue that is not TypeValue::Bool, it is: {:?}", self ) } } - pub fn as_bool(&self) -> bool { - if let TypeValue::Bool(value) = self { - value - .extract() - .cloned() - .expect("TypeValue doesn't have an associated value") + pub fn try_as_integer(&self) -> Option { + if let TypeValue::Integer(value) = self { + value.extract().cloned() } else { panic!( - "called `as_bool` on a TypeValue that is not TypeValue::Bool, it is: {:?}", + "called `try_as_integer` on a TypeValue that is not TypeValue::Integer, it is: {:?}", self ) } } - pub fn try_as_bool(&self) -> Option { - if let TypeValue::Bool(value) = self { + pub fn try_as_float(&self) -> Option { + if let TypeValue::Float(value) = self { value.extract().cloned() } else { - None + panic!( + "called `try_as_float` on a TypeValue that is not TypeValue::Float, it is: {:?}", + self + ) } } - pub fn try_as_integer(&self) -> Option { - if let TypeValue::Integer(value) = self { + pub fn try_as_string(&self) -> Option> { + if let TypeValue::String(value) = self { value.extract().cloned() } else { - None + panic!( + "called `as_string` on a TypeValue that is not TypeValue::String, it is: {:?}", + self + ) } } From 9d93d1448c715bed40fc825466994015d19d9390 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 28 Feb 2024 15:33:16 +0100 Subject: [PATCH 03/19] docs(capi): add documentation of error codes --- capi/include/yara-x.h | 16 +++++++++++++--- capi/src/lib.rs | 14 ++++++++++++-- capi/src/scanner.rs | 4 ++-- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/capi/include/yara-x.h b/capi/include/yara-x.h index 09dd4fa03..b5d8cf7c7 100644 --- a/capi/include/yara-x.h +++ b/capi/include/yara-x.h @@ -15,15 +15,25 @@ typedef enum YRX_RESULT { + // Everything was OK. SUCCESS, - PANIC, + // A syntax error occurred while compiling YARA rules. SYNTAX_ERROR, + // An error occurred while defining or setting a global variable. This may + // happen when a variable is defined twice and when you try to set a value + // that doesn't correspond to the variable's type. VARIABLE_ERROR, + // An error occurred during a scan operation. SCAN_ERROR, + // A scan operation was aborted due to a timeout. SCAN_TIMEOUT, - INVALID_IDENTIFIER, + // An error indicating that some of the arguments passed to a function is + // invalid. Usually indicates a nil pointer to a scanner or compiler. INVALID_ARGUMENT, + // An error indicating that some of the strings passed to a function is + // not valid UTF-8. INVALID_UTF8, + // An error occurred while serializing/deserializing YARA rules. SERIALIZATION_ERROR, } YRX_RESULT; @@ -289,7 +299,7 @@ enum YRX_RESULT yrx_scanner_on_matching_rule(struct YRX_SCANNER *scanner, // // The `name` argument is either a YARA module name (i.e: "pe", "elf", "dotnet", // etc.) or the fully-qualified name of the protobuf message associated to -// the module. +// the module. It must be a valid UTF-8 string. enum YRX_RESULT yrx_scanner_set_module_output(struct YRX_SCANNER *scanner, const char *name, const uint8_t *data, diff --git a/capi/src/lib.rs b/capi/src/lib.rs index 80c4b5380..a960b2a68 100644 --- a/capi/src/lib.rs +++ b/capi/src/lib.rs @@ -35,15 +35,25 @@ thread_local! { #[repr(C)] pub enum YRX_RESULT { + /// Everything was OK. SUCCESS, - PANIC, + /// A syntax error occurred while compiling YARA rules. SYNTAX_ERROR, + /// An error occurred while defining or setting a global variable. This may + /// happen when a variable is defined twice and when you try to set a value + /// that doesn't correspond to the variable's type. VARIABLE_ERROR, + /// An error occurred during a scan operation. SCAN_ERROR, + /// A scan operation was aborted due to a timeout. SCAN_TIMEOUT, - INVALID_IDENTIFIER, + /// An error indicating that some of the arguments passed to a function is + /// invalid. Usually indicates a nil pointer to a scanner or compiler. INVALID_ARGUMENT, + /// An error indicating that some of the strings passed to a function is + /// not valid UTF-8. INVALID_UTF8, + /// An error occurred while serializing/deserializing YARA rules. SERIALIZATION_ERROR, } diff --git a/capi/src/scanner.rs b/capi/src/scanner.rs index 32a319fb5..8f4ee5df8 100644 --- a/capi/src/scanner.rs +++ b/capi/src/scanner.rs @@ -179,7 +179,7 @@ pub unsafe extern "C" fn yrx_scanner_on_matching_rule( /// /// The `name` argument is either a YARA module name (i.e: "pe", "elf", "dotnet", /// etc.) or the fully-qualified name of the protobuf message associated to -/// the module. +/// the module. It must be a valid UTF-8 string. #[no_mangle] pub unsafe extern "C" fn yrx_scanner_set_module_output( scanner: *mut YRX_SCANNER, @@ -246,7 +246,7 @@ unsafe extern "C" fn yrx_scanner_set_global< } Err(err) => { LAST_ERROR.set(Some(CString::new(err.to_string()).unwrap())); - YRX_RESULT::SCAN_ERROR + YRX_RESULT::VARIABLE_ERROR } } } From 9cd712daf2f127ff2a9cb951b57554d3d9f7266f Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 28 Feb 2024 15:34:04 +0100 Subject: [PATCH 04/19] chore: fix typos --- lib/src/scanner/mod.rs | 4 ++-- lib/src/wasm/mod.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/src/scanner/mod.rs b/lib/src/scanner/mod.rs index 0646f7d76..53c8d3bec 100644 --- a/lib/src/scanner/mod.rs +++ b/lib/src/scanner/mod.rs @@ -685,8 +685,8 @@ impl<'r> Scanner<'r> { // If some pattern or rule matched, clear the matches. Notice that a // rule may match without any pattern being matched, because there - // there are rules without patterns, or that match if the pattern is - // not found. + // are rules without patterns, or that match if the pattern is not + // found. if !ctx.pattern_matches.is_empty() || !ctx.non_private_matching_rules.is_empty() || !ctx.private_matching_rules.is_empty() diff --git a/lib/src/wasm/mod.rs b/lib/src/wasm/mod.rs index 6016cbc41..c44a7af38 100644 --- a/lib/src/wasm/mod.rs +++ b/lib/src/wasm/mod.rs @@ -920,7 +920,7 @@ pub(crate) fn map_len(_: &mut Caller<'_, ScanContext>, map: Rc) -> i64 { /// reach the inner `integer_field`. /// /// The initial structure is the one passed in the `structure` argument, or the -/// the root structure if this argument is `None`. +/// root structure if this argument is `None`. /// /// The sequence of indexes is stored in WASM main memory, starting at /// `LOOKUP_INDEXES_START`, and the number of indexes is indicated by the From 117cc3ce6cb7807359558942f5d7721d2aae48a7 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 28 Feb 2024 15:54:04 +0100 Subject: [PATCH 05/19] chore: remove trailing space --- lib/src/scanner/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/src/scanner/mod.rs b/lib/src/scanner/mod.rs index 53c8d3bec..de0322749 100644 --- a/lib/src/scanner/mod.rs +++ b/lib/src/scanner/mod.rs @@ -685,7 +685,7 @@ impl<'r> Scanner<'r> { // If some pattern or rule matched, clear the matches. Notice that a // rule may match without any pattern being matched, because there - // are rules without patterns, or that match if the pattern is not + // are rules without patterns, or that match if the pattern is not // found. if !ctx.pattern_matches.is_empty() || !ctx.non_private_matching_rules.is_empty() From 786442fae6d0323fd973891758a099ac95ebf85e Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 28 Feb 2024 16:30:45 +0100 Subject: [PATCH 06/19] fix(go): allow passing `int32` and `int64` to `Compiler.DefineGlobal` and `Scanner.SetGlobal` --- go/compiler.go | 4 ++++ go/scanner.go | 4 ++++ go/scanner_test.go | 22 ++++++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/go/compiler.go b/go/compiler.go index d760b1423..b5ea54fb8 100644 --- a/go/compiler.go +++ b/go/compiler.go @@ -63,6 +63,10 @@ func (c *Compiler) DefineGlobal(ident string, value interface{}) error { switch v := value.(type) { case int: ret = C.int(C.yrx_compiler_define_global_int(c.cCompiler, cIdent, C.int64_t(v))) + case int32: + ret = C.int(C.yrx_compiler_define_global_int(c.cCompiler, cIdent, C.int64_t(v))) + case int64: + ret = C.int(C.yrx_compiler_define_global_int(c.cCompiler, cIdent, C.int64_t(v))) case bool: ret = C.int(C.yrx_compiler_define_global_bool(c.cCompiler, cIdent, C.bool(v))) case string: diff --git a/go/scanner.go b/go/scanner.go index b7eb0b24c..61061c4df 100644 --- a/go/scanner.go +++ b/go/scanner.go @@ -109,6 +109,10 @@ func (s *Scanner) SetGlobal(ident string, value interface{}) error { switch v := value.(type) { case int: ret = C.int(C.yrx_scanner_set_global_int(s.cScanner, cIdent, C.int64_t(v))) + case int32: + ret = C.int(C.yrx_scanner_set_global_int(s.cScanner, cIdent, C.int64_t(v))) + case int64: + ret = C.int(C.yrx_scanner_set_global_int(s.cScanner, cIdent, C.int64_t(v))) case bool: ret = C.int(C.yrx_scanner_set_global_bool(s.cScanner, cIdent, C.bool(v))) case string: diff --git a/go/scanner_test.go b/go/scanner_test.go index 11c394ccf..51df39747 100644 --- a/go/scanner_test.go +++ b/go/scanner_test.go @@ -51,6 +51,28 @@ func TestScanner3(t *testing.T) { assert.Len(t, matchingRules, 0) } +func TestScanner4(t *testing.T) { + r, _ := Compile( + `rule t { condition: var_int == 1}`, + GlobalVars(map[string]interface{}{"var_int": 0})) + + s := NewScanner(r) + matchingRules, _ := s.Scan([]byte{}) + assert.Len(t, matchingRules, 0) + + assert.NoError(t, s.SetGlobal("var_int", 1)) + matchingRules, _ = s.Scan([]byte{}) + assert.Len(t, matchingRules, 1) + + assert.NoError(t, s.SetGlobal("var_int", int32(1))) + matchingRules, _ = s.Scan([]byte{}) + assert.Len(t, matchingRules, 1) + + assert.NoError(t, s.SetGlobal("var_int", int64(1))) + matchingRules, _ = s.Scan([]byte{}) + assert.Len(t, matchingRules, 1) +} + func TestScannerTimeout(t *testing.T) { r, _ := Compile("rule t { strings: $a = /a(.*)*a/ condition: $a }") s := NewScanner(r) From 889a7cda17b04836e8f0be232d7c7bade63948ca Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 29 Feb 2024 10:21:16 +0100 Subject: [PATCH 07/19] refactor: make `catch_undef` more flexible Now `catch_undef` can be used with blocks that return any type (including blocks that return nothing), and the result returned by the `catch_undef` block in case of exception can be customized. --- lib/src/compiler/emit.rs | 156 +++++++++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 54 deletions(-) diff --git a/lib/src/compiler/emit.rs b/lib/src/compiler/emit.rs index 54b9540a9..296af5133 100644 --- a/lib/src/compiler/emit.rs +++ b/lib/src/compiler/emit.rs @@ -13,7 +13,9 @@ use bstr::ByteSlice; use itertools::Itertools; use rustc_hash::FxHashMap; use walrus::ir::ExtendedLoad::ZeroExtend; -use walrus::ir::{BinaryOp, InstrSeqId, LoadKind, MemArg, StoreKind, UnaryOp}; +use walrus::ir::{ + BinaryOp, InstrSeqId, InstrSeqType, LoadKind, MemArg, StoreKind, UnaryOp, +}; use walrus::ValType::{I32, I64}; use walrus::{FunctionId, InstrSeqBuilder, ValType}; use yara_x_parser::ast::{RuleFlag, RuleFlags}; @@ -164,6 +166,8 @@ macro_rules! emit_bitwise_op { }}; } +type ExceptionHandler = Box; + /// Structure that contains information used while emitting the code that /// corresponds to the condition of a YARA rule. pub(in crate::compiler) struct EmitContext<'a> { @@ -191,7 +195,9 @@ pub(in crate::compiler) struct EmitContext<'a> { pub lit_pool: &'a mut BStringPool, /// Stack of installed exception handlers for catching undefined values. - pub exception_handler_stack: Vec<(ValType, InstrSeqId)>, + /// When an exception occurs the execution flow will jump out of the block + /// identified by `InstrSeqId`. + pub exception_handler_stack: Vec<(InstrSeqId, ExceptionHandler)>, /// Stack of variables. These are local variables used during the /// evaluation of rule conditions, for example for storing loop variables. @@ -269,9 +275,17 @@ pub(super) fn emit_rule_condition( } // Emit WASM code for the rule's condition. - catch_undef(ctx, &mut instr, |ctx, instr| { - emit_bool_expr(ctx, instr, condition); - }); + catch_undef( + ctx, + Some(I32), + &mut instr, + |ctx, instr| { + emit_bool_expr(ctx, instr, condition); + }, + |instr| { + instr.i32_const(0); + }, + ); // Check if the result from the condition is zero (false). instr.unop(UnaryOp::I32Eqz); @@ -699,17 +713,25 @@ fn emit_defined( // false // } // - catch_undef(ctx, instr, |ctx, instr| { - emit_bool_expr(ctx, instr, operand); - // Drop the operand's value as we are not interested in the - // value, we are interested only in whether it's defined or - // not. - instr.drop(); - // Push a 1 in the stack indicating that the operand is - // defined. This point is not reached if the operand calls - // `throw_undef`. - instr.i32_const(1); - }); + catch_undef( + ctx, + Some(I32), + instr, + |ctx, instr| { + emit_bool_expr(ctx, instr, operand); + // Drop the operand's value as we are not interested in the + // value, we are interested only in whether it's defined or + // not. + instr.drop(); + // Push a 1 in the stack indicating that the operand is + // defined. This point is not reached if the operand calls + // `throw_undef`. + instr.i32_const(1); + }, + |instr| { + instr.i32_const(0); + }, + ); } /// Emits the code for `not` operations. @@ -773,9 +795,17 @@ fn emit_and( |block| { let block_id = block.id(); for operand in operands { - catch_undef(ctx, block, |ctx, instr| { - emit_bool_expr(ctx, instr, operand); - }); + catch_undef( + ctx, + Some(I32), + block, + |ctx, instr| { + emit_bool_expr(ctx, instr, operand); + }, + |instr| { + instr.i32_const(0); + }, + ); // If the operand is `false`, exit from the block // with a `false` result. block.if_else( @@ -829,9 +859,17 @@ fn emit_or( |block| { let block_id = block.id(); for operand in operands { - catch_undef(ctx, block, |ctx, instr| { - emit_bool_expr(ctx, instr, operand); - }); + catch_undef( + ctx, + Some(I32), + block, + |ctx, instr| { + emit_bool_expr(ctx, instr, operand); + }, + |instr| { + instr.i32_const(0); + }, + ); // If the operand is `true`, exit from the block // with a `true` result. block.if_else( @@ -1575,6 +1613,7 @@ fn emit_for_in_range( instr, &mut for_in.stack_frame, &mut for_in.quantifier, + // Loop initialization |ctx, instr, n, loop_end| { // Set n = upper_bound - lower_bound + 1; set_var(ctx, instr, n, |ctx, instr| { @@ -1714,7 +1753,7 @@ fn emit_for_in_map( instr: &mut InstrSeqBuilder, for_in: &mut ForIn, ) { - // A `for` loop in an map has exactly two variables, one for the key + // A `for` loop in a map has exactly two variables, one for the key // and the other for the value. assert_eq!(for_in.variables.len(), 2); @@ -1963,9 +2002,17 @@ fn emit_for( // capturing any undefined exception produced by the condition // because we don't want to abort the loop in such cases. When the // condition is undefined it's handled as a false. - catch_undef(ctx, block, |ctx, block| { - condition(ctx, block); - }); + catch_undef( + ctx, + Some(I32), + block, + |ctx, block| { + condition(ctx, block); + }, + |instr| { + instr.i32_const(0); + }, + ); // At the top of the stack we have the i32 with the result from // the loop condition. Decide what to do depending on the @@ -1984,7 +2031,7 @@ fn emit_for( incr_i_and_repeat(ctx, else_, n, i, loop_start); // If this point is reached is because all the - // the range was iterated without the condition + // range was iterated without the condition // returning true, this means that the whole "for" // statement is true. else_.i32_const(1); @@ -1999,7 +2046,7 @@ fn emit_for( incr_i_and_repeat(ctx, then_, n, i, loop_start); // If this point is reached is because all the - // the range was iterated without the condition + // range was iterated without the condition // returning false, this means that the whole "for" // statement is true. then_.i32_const(1); @@ -2026,7 +2073,7 @@ fn emit_for( incr_i_and_repeat(ctx, else_, n, i, loop_start); // If this point is reached is because all the - // the range was iterated without the condition + // range was iterated without the condition // returning true, this means that the whole "for" // statement is false. else_.i32_const(0); @@ -2083,7 +2130,7 @@ fn emit_for( // range 0..n. If `max_count` is zero this means that all // iterations returned false and therefore the loop must // return true. If `max_count` is non-zero it means that - // `counter` didn't reached `max_count` and the loop must + // `counter` didn't reach `max_count` and the loop must // return false. load_var(ctx, block, max_count); block.unop(UnaryOp::I64Eqz); @@ -2405,7 +2452,7 @@ fn emit_bool_expr( /// Emit function call. fn emit_func_call( - ctx: &EmitContext, + ctx: &mut EmitContext, instr: &mut InstrSeqBuilder, func: &Func, ) { @@ -2436,7 +2483,7 @@ fn emit_func_call( /// is undefined, and throws an exception if that's the case (see: /// [`throw_undef`]) fn emit_call_and_handle_undef( - ctx: &EmitContext, + ctx: &mut EmitContext, instr: &mut InstrSeqBuilder, fn_id: walrus::FunctionId, ) { @@ -2577,29 +2624,32 @@ fn emit_lookup_object(ctx: &mut EmitContext, instr: &mut InstrSeqBuilder) { /// Emits code for catching exceptions caused by undefined values. /// /// This function emits WebAssembly code that behaves similarly to an exception -/// handler. The code inside the catch block must return an `i32`, which is left -/// at the top of the stack. However, at any point inside this block you can use -/// [`throw_undef`] for throwing an exception when an undefined value is detected -/// In that case the execution flow will be interrupted at the point where -/// [`throw_undef`] was found, and the control transferred to the instruction -/// that follows after the `catch_undef` block, leaving a zero value of type -/// `i32` at the top of the stack. -/// -/// In other words, [`catch_undef`] protects a block that returns an `i32` from -/// exceptions caused by undefined values, replacing the block's result with a -/// zero in case such exception occurs. +/// handler. The code in `expr` must return a value of type `ty` which is left +/// at the top of the stack. However, at any point inside this block you can +/// use [`throw_undef`] for throwing an exception when an undefined value is +/// detected. In that case the execution flow will be interrupted at the point +/// where [`throw_undef`] was found, the code in `catch` is executed, leaving +/// its result in the stack, and the control transferred to the instruction +/// that follows after the `catch_undef` block. Notice that `expr` and `catch` +/// must return values of the same type. In a normal execution the result from +/// the `catch_undef` block is the result from `expr`, but when an exception +/// occurs the value is provided by the `catch` block. /// /// [`catch_undef`] blocks can be nested, and in such cases the control will -/// transferred to the end of the innermost block. +/// be transferred to the end of the innermost block. /// /// # Example /// /// ```text /// catch_undef(ctx, instr, -/// |block| { +/// |ctx, block| { /// throw_undef(ctx, block); // The exception is raised here ... /// block.i32_const(1); // ... and this is not executed. /// }, +/// |catch| { +/// catch.i32_const(0); // If an exception is raised, the result +/// // from `catch_undef` will be 0. +/// } /// ); /// // ... at this point we have a zero value of type i32 at the top of the /// // stack. @@ -2607,15 +2657,17 @@ fn emit_lookup_object(ctx: &mut EmitContext, instr: &mut InstrSeqBuilder) { /// fn catch_undef( ctx: &mut EmitContext, + ty: impl Into, instr: &mut InstrSeqBuilder, expr: impl FnOnce(&mut EmitContext, &mut InstrSeqBuilder), + catch: impl Fn(&mut InstrSeqBuilder) + 'static, ) { // Create a new block containing `expr`. When an exception is raised from // within `expr`, the control flow will jump out of this block via a `br` // instruction. - instr.block(I32, |block| { + instr.block(ty, |block| { // Push the type and ID of the current block in the handlers stack. - ctx.exception_handler_stack.push((I32, block.id())); + ctx.exception_handler_stack.push((block.id(), Box::new(catch))); expr(ctx, block); }); @@ -2627,7 +2679,7 @@ fn catch_undef( /// /// For more information see [`catch_undef`]. fn throw_undef(ctx: &EmitContext, instr: &mut InstrSeqBuilder) { - let innermost_handler = *ctx + let innermost_handler = ctx .exception_handler_stack .last() .expect("calling `raise` from outside `try` block"); @@ -2667,14 +2719,10 @@ fn throw_undef(ctx: &EmitContext, instr: &mut InstrSeqBuilder) { // `br $outer`, because that is going to be the result for the $outer // block. // - match innermost_handler.0 { - I32 => instr.i32_const(0), - I64 => instr.i64_const(0), - _ => unreachable!(), - }; + innermost_handler.1(instr); // Jump to the exception handler. - instr.br(innermost_handler.1); + instr.br(innermost_handler.0); } /// Similar to [`throw_undef`], but throws the exception if the top of the From ce6f7e87f265b2c6a57c4475a1716a269d1d8fc0 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 29 Feb 2024 10:40:40 +0100 Subject: [PATCH 08/19] fix: issue with `for .. in ..` loop returning undefined value instead of false. Closes #87 --- lib/src/compiler/emit.rs | 30 +++++++++++++++++++++--------- lib/src/tests/mod.rs | 6 ++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/lib/src/compiler/emit.rs b/lib/src/compiler/emit.rs index 296af5133..1582adf20 100644 --- a/lib/src/compiler/emit.rs +++ b/lib/src/compiler/emit.rs @@ -1617,17 +1617,29 @@ fn emit_for_in_range( |ctx, instr, n, loop_end| { // Set n = upper_bound - lower_bound + 1; set_var(ctx, instr, n, |ctx, instr| { - emit_expr(ctx, instr, &mut range.upper_bound); - emit_expr(ctx, instr, &mut range.lower_bound); + // Catch undefined values in upper_bound and lower_bound + // expressions. In such cases n = 0. + catch_undef( + ctx, + I64, + instr, + |ctx, instr| { + emit_expr(ctx, instr, &mut range.upper_bound); + emit_expr(ctx, instr, &mut range.lower_bound); - // Store lower_bound in temp variable, without removing - // it from the stack. - instr.local_tee(ctx.wasm_symbols.i64_tmp); + // Store lower_bound in temp variable, without removing + // it from the stack. + instr.local_tee(ctx.wasm_symbols.i64_tmp); - // Compute upper_bound - lower_bound + 1. - instr.binop(BinaryOp::I64Sub); - instr.i64_const(1); - instr.binop(BinaryOp::I64Add); + // Compute upper_bound - lower_bound + 1. + instr.binop(BinaryOp::I64Sub); + instr.i64_const(1); + instr.binop(BinaryOp::I64Add); + }, + |instr| { + instr.i64_const(0); + }, + ) }); // If n <= 0, exit from the loop. diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs index 6e0f10855..f298e1c92 100644 --- a/lib/src/tests/mod.rs +++ b/lib/src/tests/mod.rs @@ -470,6 +470,12 @@ fn for_in() { condition_true!(r#"for 2 s in ("foo", "bar", "baz") : (s contains "ba")"#); condition_true!(r#"for all x in (1.0, 2.0, 3.0) : (x >= 1.0)"#); condition_true!(r#"for none x in (1.0, 2.0, 3.0) : (x > 4.0)"#); + + // https://github.com/VirusTotal/yara-x/issues/87 + #[cfg(feature = "test_proto2-module")] + condition_true!( + r#"not for any i in (0..test_proto2.int64_undef) : (true)"# + ); } #[test] From 7b5e994cfa99159e3ae66d23a654d7ae373a5d43 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 29 Feb 2024 20:21:39 +0100 Subject: [PATCH 09/19] fix: show more explicative error when built-in module is not found --- lib/src/scanner/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/src/scanner/mod.rs b/lib/src/scanner/mod.rs index de0322749..717bde55e 100644 --- a/lib/src/scanner/mod.rs +++ b/lib/src/scanner/mod.rs @@ -544,7 +544,11 @@ impl<'r> Scanner<'r> { for module_name in ctx.compiled_rules.imports() { // Lookup the module in the list of built-in modules. - let module = modules::BUILTIN_MODULES.get(module_name).unwrap(); + let module = + modules::BUILTIN_MODULES.get(module_name).unwrap_or_else( + || panic!("module `{}` not found", module_name), + ); + let root_struct_name = module.root_struct_descriptor.full_name(); // If the user already provided some output for the module by From a4dcbce1ff06fa67923c082137c9b2e1c847557f Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 29 Feb 2024 21:43:43 +0100 Subject: [PATCH 10/19] chore: fix typos --- lib/src/compiler/emit.rs | 6 +++--- lib/src/scanner/context.rs | 2 +- lib/src/wasm/mod.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/src/compiler/emit.rs b/lib/src/compiler/emit.rs index 1582adf20..3bdfe185f 100644 --- a/lib/src/compiler/emit.rs +++ b/lib/src/compiler/emit.rs @@ -1215,7 +1215,7 @@ fn emit_check_for_rule_match( // at MATCHING_RULES_BITMAP_BASE. // // The first thing is loading the byte where the bit - // resides.. + // resides. instr.i32_const(rule_id.0 / 8); instr.load( ctx.wasm_symbols.main_memory, @@ -1296,8 +1296,8 @@ fn emit_check_for_pattern_match( /// Emits the code that gets an array item by index. /// /// This function must be called right after emitting the code that leaves the -/// the index in the stack. The code emitted by this function assumes that the -/// top of the stack is an i64 with the index. +/// index in the stack. The code emitted by this function assumes that the top +/// of the stack is an i64 with the index. fn emit_array_indexing( ctx: &mut EmitContext, instr: &mut InstrSeqBuilder, diff --git a/lib/src/scanner/context.rs b/lib/src/scanner/context.rs index 5684c1471..28a4e55c9 100644 --- a/lib/src/scanner/context.rs +++ b/lib/src/scanner/context.rs @@ -249,7 +249,7 @@ impl ScanContext<'_> { /// Called during the scan process when a global rule didn't match. /// - /// When this happen any other global rule in the same namespace that + /// When this happens any other global rule in the same namespace that /// matched previously is reset to a non-matching state. pub(crate) fn track_global_rule_no_match(&mut self, rule_id: RuleId) { let wasm_store = unsafe { self.wasm_store.as_mut() }; diff --git a/lib/src/wasm/mod.rs b/lib/src/wasm/mod.rs index c44a7af38..a675bc057 100644 --- a/lib/src/wasm/mod.rs +++ b/lib/src/wasm/mod.rs @@ -722,8 +722,8 @@ pub(crate) fn new_linker<'r>() -> Linker> { } /// Invoked from WASM before starting the evaluation of the rule identified -/// by the given [`RuleId`]. This only happens when the the "logging" feature -/// is enabled. +/// by the given [`RuleId`]. This only happens when the "logging" feature is +/// enabled. #[wasm_export] #[cfg(feature = "logging")] pub(crate) fn log_rule_eval_start( From 5b7e79f82d1a8bdb4bdd891bddbcb2063601bfe2 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 29 Feb 2024 23:53:16 +0100 Subject: [PATCH 11/19] fix: issues when number of rules is multiple of 8. --- lib/src/scanner/context.rs | 53 +++++++++++++++++++++++++------------ lib/src/scanner/mod.rs | 4 +-- lib/src/tests/mod.rs | 54 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 19 deletions(-) diff --git a/lib/src/scanner/context.rs b/lib/src/scanner/context.rs index 28a4e55c9..5c7d92332 100644 --- a/lib/src/scanner/context.rs +++ b/lib/src/scanner/context.rs @@ -252,29 +252,31 @@ impl ScanContext<'_> { /// When this happens any other global rule in the same namespace that /// matched previously is reset to a non-matching state. pub(crate) fn track_global_rule_no_match(&mut self, rule_id: RuleId) { - let wasm_store = unsafe { self.wasm_store.as_mut() }; - let main_mem = self.main_memory.unwrap().data_mut(wasm_store); - - let base = MATCHING_RULES_BITMAP_BASE as usize; - let bits = BitSlice::::from_slice_mut(&mut main_mem[base..]); - let rule = self.compiled_rules.get(rule_id); // This function must be called only for global rules. debug_assert!(rule.is_global); // All the global rules that matched previously, and are in the same - // namespace than the non-matching rule, must be removed from the + // namespace as the non-matching rule, must be removed from the // `global_matching_rules` map. Also, their corresponding bits in // the matching rules bitmap must be cleared. if let Some(rules) = self.global_matching_rules.get_mut(&rule.namespace_id) { - for rule_id in rules.iter() { - bits.set((*rule_id).into(), false); - } + let wasm_store = unsafe { self.wasm_store.as_mut() }; + let main_mem = self.main_memory.unwrap().data_mut(wasm_store); + + let base = MATCHING_RULES_BITMAP_BASE as usize; + let num_rules = self.compiled_rules.rules().len(); + + let bits = BitSlice::::from_slice_mut( + &mut main_mem[base..base + num_rules.div_ceil(8)], + ); - rules.clear() + for rule_id in rules.drain(0..) { + bits.set(rule_id.into(), false); + } } } @@ -283,6 +285,17 @@ impl ScanContext<'_> { pub(crate) fn track_rule_match(&mut self, rule_id: RuleId) { let rule = self.compiled_rules.get(rule_id); + #[cfg(feature = "logging")] + info!( + "Rule match: {}:{} {:?}", + self.compiled_rules + .ident_pool() + .get(rule.namespace_ident_id) + .unwrap(), + self.compiled_rules.ident_pool().get(rule.ident_id).unwrap(), + rule_id, + ); + if rule.is_global { self.global_matching_rules .entry(rule.namespace_id) @@ -295,10 +308,13 @@ impl ScanContext<'_> { } let wasm_store = unsafe { self.wasm_store.as_mut() }; - let main_mem = self.main_memory.unwrap().data_mut(wasm_store); + let mem = self.main_memory.unwrap().data_mut(wasm_store); + let num_rules = self.compiled_rules.rules().len(); let base = MATCHING_RULES_BITMAP_BASE as usize; - let bits = BitSlice::::from_slice_mut(&mut main_mem[base..]); + let bits = BitSlice::::from_slice_mut( + &mut mem[base..base + num_rules.div_ceil(8)], + ); // The RuleId-th bit in the `rule_matches` bit vector is set to 1. bits.set(rule_id.into(), true); @@ -313,11 +329,14 @@ impl ScanContext<'_> { replace: bool, ) { let wasm_store = unsafe { self.wasm_store.as_mut() }; - let main_mem = self.main_memory.unwrap().data_mut(wasm_store); + let mem = self.main_memory.unwrap().data_mut(wasm_store); let num_rules = self.compiled_rules.rules().len(); - - let base = MATCHING_RULES_BITMAP_BASE as usize + num_rules / 8 + 1; - let bits = BitSlice::::from_slice_mut(&mut main_mem[base..]); + let num_patterns = self.compiled_rules.num_patterns(); + + let base = MATCHING_RULES_BITMAP_BASE as usize + num_rules.div_ceil(8); + let bits = BitSlice::::from_slice_mut( + &mut mem[base..base + num_patterns.div_ceil(8)], + ); bits.set(pattern_id.into(), true); diff --git a/lib/src/scanner/mod.rs b/lib/src/scanner/mod.rs index 717bde55e..d4bb8443a 100644 --- a/lib/src/scanner/mod.rs +++ b/lib/src/scanner/mod.rs @@ -722,8 +722,8 @@ impl<'r> Scanner<'r> { let base = MATCHING_RULES_BITMAP_BASE as usize; let bitmap = BitSlice::<_, Lsb0>::from_slice_mut( &mut mem[base..base - + (num_rules / 8 + 1) - + (num_patterns / 8 + 1)], + + num_rules.div_ceil(8) + + num_patterns.div_ceil(8)], ); // Set to zero all bits in the bitmap. diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs index f298e1c92..b798a4e6b 100644 --- a/lib/src/tests/mod.rs +++ b/lib/src/tests/mod.rs @@ -3008,6 +3008,60 @@ fn rule_reuse_2() { ); } +#[test] +fn eight_rules() { + let rules = crate::compile( + r#" + rule rule_1 { + strings: + $a = "foo" + condition: + $a + } + rule rule_2 { + condition: + false + } + rule rule_3 { + condition: + true + } + rule rule_4 { + condition: + false + } + rule rule_5 { + condition: + true + } + rule rule_6 { + condition: + false + } + rule rule_7 { + condition: + true + } + rule rule_8 { + condition: + false + } + "#, + ) + .unwrap(); + + let mut scanner = crate::scanner::Scanner::new(&rules); + + assert_eq!( + scanner + .scan(b"foo") + .expect("scan should not fail") + .matching_rules() + .len(), + 4 + ); +} + #[test] fn test_defined_1() { condition_true!(r#"defined 1"#); From 0ea820466a85a8df1a065e5aacc2d6d2db6ebcbb Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Thu, 29 Feb 2024 23:55:03 +0100 Subject: [PATCH 12/19] style: remove trailing spaces --- lib/src/compiler/emit.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/src/compiler/emit.rs b/lib/src/compiler/emit.rs index 3bdfe185f..91b369224 100644 --- a/lib/src/compiler/emit.rs +++ b/lib/src/compiler/emit.rs @@ -1296,7 +1296,7 @@ fn emit_check_for_pattern_match( /// Emits the code that gets an array item by index. /// /// This function must be called right after emitting the code that leaves the -/// index in the stack. The code emitted by this function assumes that the top +/// index in the stack. The code emitted by this function assumes that the top /// of the stack is an i64 with the index. fn emit_array_indexing( ctx: &mut EmitContext, From 432d83ce29fddb46903e5e17ebd027f9138b8995 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 1 Mar 2024 00:05:18 +0100 Subject: [PATCH 13/19] style: add `num_rules()` method to `Rules` --- lib/src/compiler/rules.rs | 9 +++++++-- lib/src/scanner/context.rs | 10 +++++----- lib/src/scanner/mod.rs | 8 ++++---- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/lib/src/compiler/rules.rs b/lib/src/compiler/rules.rs index 61336b391..9643d5994 100644 --- a/lib/src/compiler/rules.rs +++ b/lib/src/compiler/rules.rs @@ -309,6 +309,11 @@ impl Rules { self.re_code.as_slice() } + #[inline] + pub(crate) fn num_rules(&self) -> usize { + self.rules.len() + } + #[inline] pub(crate) fn num_patterns(&self) -> usize { self.num_patterns @@ -376,8 +381,8 @@ impl Rules { Instant::elapsed(&start) ); - info!("Number of rules: {}", self.rules.len()); - info!("Number of patterns: {}", self.num_patterns); + info!("Number of rules: {}", self.num_rules()); + info!("Number of patterns: {}", self.num_patterns()); info!( "Number of anchored sub-patterns: {}", self.anchored_sub_patterns.len() diff --git a/lib/src/scanner/context.rs b/lib/src/scanner/context.rs index 5c7d92332..9b8a4bd76 100644 --- a/lib/src/scanner/context.rs +++ b/lib/src/scanner/context.rs @@ -119,7 +119,7 @@ pub(crate) struct ScanContext<'r> { #[cfg(feature = "rules-profiling")] impl<'r> ScanContext<'r> { pub fn most_expensive_rules(&self) -> Vec<(&'r str, &'r str, Duration)> { - let mut result = Vec::with_capacity(self.compiled_rules.rules().len()); + let mut result = Vec::with_capacity(self.compiled_rules.num_rules()); for r in self.compiled_rules.rules() { let mut rule_time = Duration::default(); @@ -268,7 +268,7 @@ impl ScanContext<'_> { let main_mem = self.main_memory.unwrap().data_mut(wasm_store); let base = MATCHING_RULES_BITMAP_BASE as usize; - let num_rules = self.compiled_rules.rules().len(); + let num_rules = self.compiled_rules.num_rules(); let bits = BitSlice::::from_slice_mut( &mut main_mem[base..base + num_rules.div_ceil(8)], @@ -309,7 +309,7 @@ impl ScanContext<'_> { let wasm_store = unsafe { self.wasm_store.as_mut() }; let mem = self.main_memory.unwrap().data_mut(wasm_store); - let num_rules = self.compiled_rules.rules().len(); + let num_rules = self.compiled_rules.num_rules(); let base = MATCHING_RULES_BITMAP_BASE as usize; let bits = BitSlice::::from_slice_mut( @@ -330,9 +330,9 @@ impl ScanContext<'_> { ) { let wasm_store = unsafe { self.wasm_store.as_mut() }; let mem = self.main_memory.unwrap().data_mut(wasm_store); - let num_rules = self.compiled_rules.rules().len(); + let num_rules = self.compiled_rules.num_rules(); let num_patterns = self.compiled_rules.num_patterns(); - + let base = MATCHING_RULES_BITMAP_BASE as usize + num_rules.div_ceil(8); let bits = BitSlice::::from_slice_mut( &mut mem[base..base + num_patterns.div_ceil(8)], diff --git a/lib/src/scanner/mod.rs b/lib/src/scanner/mod.rs index d4bb8443a..db4e0c4ad 100644 --- a/lib/src/scanner/mod.rs +++ b/lib/src/scanner/mod.rs @@ -106,7 +106,7 @@ impl<'r> Scanner<'r> { /// Creates a new scanner. pub fn new(rules: &'r Rules) -> Self { - let num_rules = rules.rules().len() as u32; + let num_rules = rules.num_rules() as u32; let num_patterns = rules.num_patterns() as u32; // The ScanContext structure belongs to the WASM store, but at the same @@ -675,7 +675,7 @@ impl<'r> Scanner<'r> { /// scan. This clears all the information generated the previous scan. fn reset(&mut self) { let ctx = self.wasm_store.data_mut(); - let num_rules = ctx.compiled_rules.rules().len(); + let num_rules = ctx.compiled_rules.num_rules(); let num_patterns = ctx.compiled_rules.num_patterns(); // Clear the array that tracks the patterns that reached the maximum @@ -824,7 +824,7 @@ pub struct NonMatchingRules<'a, 'r> { impl<'a, 'r> NonMatchingRules<'a, 'r> { fn new(ctx: &'a ScanContext<'r>, data: &'a ScannedData<'a>) -> Self { - let num_rules = ctx.compiled_rules.rules().len(); + let num_rules = ctx.compiled_rules.num_rules(); let main_memory = ctx.main_memory.unwrap().data(unsafe { ctx.wasm_store.as_ref() }); @@ -849,7 +849,7 @@ impl<'a, 'r> NonMatchingRules<'a, 'r> { // The number of non-matching rules is the total number of rules // minus the number of matching rules, both private and // non-private. - len: ctx.compiled_rules.rules().len() + len: ctx.compiled_rules.num_rules() - ctx.private_matching_rules.len() - ctx.non_private_matching_rules.len(), } From fc7aa805a9f093be0b865432987f480bd4efdf89 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 1 Mar 2024 00:06:59 +0100 Subject: [PATCH 14/19] chore: remove unused method --- lib/src/compiler/rules.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lib/src/compiler/rules.rs b/lib/src/compiler/rules.rs index 9643d5994..05807403c 100644 --- a/lib/src/compiler/rules.rs +++ b/lib/src/compiler/rules.rs @@ -206,12 +206,6 @@ impl Rules { self.rules.get(rule_id.0 as usize).unwrap() } - /// Returns a slice with the individual rules that were compiled. - #[inline] - pub(crate) fn rules(&self) -> &[RuleInfo] { - self.rules.as_slice() - } - /// Returns a regular expression by [`RegexpId`]. /// /// # Panics From 8bf8ce8e033fe13fa7f36e541e39cdfdadecc512 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 1 Mar 2024 09:29:52 +0100 Subject: [PATCH 15/19] fix: compilation error when logging is enabled --- lib/src/compiler/rules.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/src/compiler/rules.rs b/lib/src/compiler/rules.rs index 05807403c..16c5e8cea 100644 --- a/lib/src/compiler/rules.rs +++ b/lib/src/compiler/rules.rs @@ -278,7 +278,7 @@ impl Rules { sub_pattern_id: SubPatternId, ) -> Option<(RuleId, IdentId)> { let (target_pattern_id, _) = self.get_sub_pattern(sub_pattern_id); - for (rule_id, rule) in self.rules().iter().enumerate() { + for (rule_id, rule) in self.rules.iter().enumerate() { for (ident_id, pattern_id) in &rule.patterns { if pattern_id == target_pattern_id { return Some((rule_id.into(), *ident_id)); From 4767ae103fc33150da623ec7cde4d6eec0f16ff8 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 1 Mar 2024 10:47:24 +0100 Subject: [PATCH 16/19] fix: compiling error when rules profiling is enabled --- lib/src/compiler/rules.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/src/compiler/rules.rs b/lib/src/compiler/rules.rs index 16c5e8cea..0caaefcc5 100644 --- a/lib/src/compiler/rules.rs +++ b/lib/src/compiler/rules.rs @@ -288,6 +288,12 @@ impl Rules { None } + #[cfg(feature = "rules-profiling")] + #[inline] + pub(crate) fn rules(&self) -> &[RuleInfo] { + self.rules.as_slice() + } + #[inline] pub(crate) fn atoms(&self) -> &[SubPatternAtom] { self.atoms.as_slice() From dc2c9ea99753e7fbb00a4cf38a368aba0cc4d4bd Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 1 Mar 2024 17:35:56 +0100 Subject: [PATCH 17/19] fix: issue with wide regexps causing false positives Strings matching wide regexps must contain interleaved zeroes, but we were not making sure that these bytes were actually zero when matching the regexp with `PikeVM`. --- lib/src/re/fast/fastvm.rs | 2 +- lib/src/re/thompson/pikevm.rs | 134 +++++++++++++++++++++++++++++++--- lib/src/tests/mod.rs | 34 +++++++++ 3 files changed, 157 insertions(+), 13 deletions(-) diff --git a/lib/src/re/fast/fastvm.rs b/lib/src/re/fast/fastvm.rs index d6c1f664f..80ed030e9 100644 --- a/lib/src/re/fast/fastvm.rs +++ b/lib/src/re/fast/fastvm.rs @@ -51,7 +51,7 @@ impl<'r> FastVM<'r> { /// VM before aborting. /// /// This sets a limit on the number of bytes that the VM will read from the - /// input while trying find a match. Without a limit, the VM will can incur + /// input while trying to find a match. Without a limit, the VM can incur /// in excessive execution time for regular expressions that are unbounded, /// like `foo.*bar`. For inputs that starts with `foo`, this regexp will /// try to scan the whole input, and that would take a long time if the diff --git a/lib/src/re/thompson/pikevm.rs b/lib/src/re/thompson/pikevm.rs index 5232c5a21..b842105a3 100644 --- a/lib/src/re/thompson/pikevm.rs +++ b/lib/src/re/thompson/pikevm.rs @@ -1,3 +1,4 @@ +use std::cell::Cell; use std::mem; use bitvec::array::BitArray; @@ -90,23 +91,41 @@ impl<'r> PikeVM<'r> { self.try_match_impl(start, right.iter(), left.iter().rev(), f) } // Going forward, wide. - (false, true) => self.try_match_impl( - start, - right.iter().step_by(2), - left.iter().rev().skip(1).step_by(2), - |match_len| f(match_len * 2), - ), + (false, true) => { + let invalid_wide = Cell::new(false); + self.try_match_impl( + start, + WideIter::non_zero_first(right.iter(), &invalid_wide), + WideIter::zero_first(left.iter().rev(), &invalid_wide), + |match_len| { + if invalid_wide.get() { + Action::Stop + } else { + f(match_len * 2) + } + }, + ) + } // Going backward, not wide. (true, false) => { self.try_match_impl(start, left.iter().rev(), right.iter(), f) } // Going backward, wide. - (true, true) => self.try_match_impl( - start, - left.iter().rev().skip(1).step_by(2), - right.iter().step_by(2), - |match_len| f(match_len * 2), - ), + (true, true) => { + let invalid_wide = Cell::new(false); + self.try_match_impl( + start, + WideIter::zero_first(left.iter().rev(), &invalid_wide), + WideIter::non_zero_first(right.iter(), &invalid_wide), + |match_len| { + if invalid_wide.get() { + Action::Stop + } else { + f(match_len * 2) + } + }, + ) + } } } @@ -385,3 +404,94 @@ pub(crate) fn epsilon_closure( } } } + +/// WideIter is an iterator that takes a byte iterator and consumes it two +/// bytes at a time, returning one of the bytes and making sure that other +/// is zero. Which of the two bytes is returned and which is zero depends +/// on the kind of iterator you create. With [`WideIter::non_zero_first`] +/// you create an iterator that expects the first byte of each pair to be +/// non-zero, and the second one to be zero. +/// +/// In the other hand, [`WideIter::zero_first`] expect the first byte of +/// each pair to be zero, and the second one to be the non-zero byte +/// returned by the iterator. +/// +/// ```ignore +/// let error = Cell::new(false); +/// let v = vec![1,0,2,0,3,0]; +/// // The non-zero values are expected to be the first of each pair. +/// let i = WideIter::non_zero_first(v.iter(), &error); +/// assert_eq!(i.collect(), vec![1,2,3]); +/// // No error. +/// assert!(!error_flag.get()); +/// ``` +/// +/// ```ignore +/// let error = Cell::new(false); +/// let v = vec![1,100,2,0,3,0]; +/// let i = WideIter::non_zero_first(v.iter(), &error); +/// assert_eq!(i.collect(), vec![1,2,3]); +/// // Error! between 1 and 2 there's a non-zero value 100 +/// assert!(error_flag.get()); +/// ``` +/// +/// ```ignore +/// let error = Cell::new(false); +/// let v = vec![0,1,0,2,0,3]; +/// // The zero values are expected to be the first of each pair. +/// let i = WideIter::zero_first(v.iter(), &error); +/// assert_eq!(i.collect(), vec![1,2,3]); +/// // No error +/// assert!(!error_flag.get()); +/// ``` +struct WideIter<'a, I> +where + I: Iterator, +{ + iter: I, + error_flag: &'a Cell, + zero_first: bool, +} + +impl<'a, I> WideIter<'a, I> +where + I: Iterator, +{ + pub fn non_zero_first(iter: I, error_flag: &'a Cell) -> Self + where + I: Iterator, + { + WideIter { iter, error_flag, zero_first: false } + } + + pub fn zero_first(iter: I, error_flag: &'a Cell) -> Self + where + I: Iterator, + { + WideIter { iter, error_flag, zero_first: true } + } +} + +impl<'a, I> Iterator for WideIter<'a, I> +where + I: Iterator, +{ + type Item = I::Item; + + fn next(&mut self) -> Option { + let first_byte = self.iter.next()?; + let second_byte = self.iter.next()?; + + if self.zero_first { + if *first_byte != 0_u8 { + self.error_flag.replace(true); + } + Some(second_byte) + } else { + if *second_byte != 0_u8 { + self.error_flag.replace(true); + } + Some(first_byte) + } + } +} diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs index b798a4e6b..3ee9406bd 100644 --- a/lib/src/tests/mod.rs +++ b/lib/src/tests/mod.rs @@ -1480,6 +1480,40 @@ fn regexp_wide() { b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00", b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00" ); + + pattern_false!( + r#"/foobar.[A-Z]{1}/ wide"#, + b"f\x00o\x00o\x00b\x00a\x00r\x00\x00\x15W\x00" + ); + + pattern_false!( + r#"/.[A-Z]{1}foobar/ wide"#, + b"\x00\x15W\x00f\x00o\x00o\x00b\x00a\x00r\x00" + ); + + pattern_false!( + r#"/\bfoobar/ wide"#, + b"x\x00f\x00o\x00o\x00b\x00a\x00r\x00" + ); + + pattern_true!( + r#"/\Bfoobar/ wide"#, + b"x\x00f\x00o\x00o\x00b\x00a\x00r\x00" + ); + + pattern_false!( + r#"/foobar\b/ wide"#, + b"f\x00o\x00o\x00b\x00a\x00r\x00x\x00" + ); + + pattern_true!( + r#"/foobar\B/ wide"#, + b"f\x00o\x00o\x00b\x00a\x00r\x00x\x00" + ); + + pattern_true!(r#"/foobar\b/ wide"#, b"f\x00o\x00o\x00b\x00a\x00r\x00x"); + pattern_false!(r#"/foobar\B/ wide"#, b"f\x00o\x00o\x00b\x00a\x00r\x00x"); + pattern_true!(r#"/foobar$/ wide"#, b"f\x00o\x00o\x00b\x00a\x00r\x00x"); } #[test] From cd87d883788b4fd0928fc857f7e788af8d9da66c Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 1 Mar 2024 19:48:03 +0100 Subject: [PATCH 18/19] fix: issue introduced in dc2c9ea99753e7fbb00a4cf38a368aba0cc4d4bd The previous commit fixed part of the issue with wide regexps, but didn't fix it correctly. --- lib/src/re/thompson/pikevm.rs | 89 ++++++++++++++++++----------------- lib/src/tests/mod.rs | 6 +++ 2 files changed, 53 insertions(+), 42 deletions(-) diff --git a/lib/src/re/thompson/pikevm.rs b/lib/src/re/thompson/pikevm.rs index b842105a3..ad24d1547 100644 --- a/lib/src/re/thompson/pikevm.rs +++ b/lib/src/re/thompson/pikevm.rs @@ -92,17 +92,15 @@ impl<'r> PikeVM<'r> { } // Going forward, wide. (false, true) => { - let invalid_wide = Cell::new(false); + let error_fwd = Cell::new(None); + let error_bck = Cell::new(None); self.try_match_impl( start, - WideIter::non_zero_first(right.iter(), &invalid_wide), - WideIter::zero_first(left.iter().rev(), &invalid_wide), - |match_len| { - if invalid_wide.get() { - Action::Stop - } else { - f(match_len * 2) - } + WideIter::non_zero_first(right.iter(), &error_fwd), + WideIter::zero_first(left.iter().rev(), &error_bck), + |match_len| match error_fwd.get() { + Some(pos) if pos < match_len => Action::Stop, + _ => f(match_len * 2), }, ) } @@ -112,17 +110,15 @@ impl<'r> PikeVM<'r> { } // Going backward, wide. (true, true) => { - let invalid_wide = Cell::new(false); + let error_fwd = Cell::new(None); + let error_bck = Cell::new(None); self.try_match_impl( start, - WideIter::zero_first(left.iter().rev(), &invalid_wide), - WideIter::non_zero_first(right.iter(), &invalid_wide), - |match_len| { - if invalid_wide.get() { - Action::Stop - } else { - f(match_len * 2) - } + WideIter::zero_first(left.iter().rev(), &error_fwd), + WideIter::non_zero_first(right.iter(), &error_bck), + |match_len| match error_fwd.get() { + Some(pos) if pos < match_len => Action::Stop, + _ => f(match_len * 2), }, ) } @@ -409,47 +405,54 @@ pub(crate) fn epsilon_closure( /// bytes at a time, returning one of the bytes and making sure that other /// is zero. Which of the two bytes is returned and which is zero depends /// on the kind of iterator you create. With [`WideIter::non_zero_first`] -/// you create an iterator that expects the first byte of each pair to be -/// non-zero, and the second one to be zero. +/// the iterator expects the first byte of each pair to be non-zero, and +/// the second one to be zero. /// -/// In the other hand, [`WideIter::zero_first`] expect the first byte of -/// each pair to be zero, and the second one to be the non-zero byte -/// returned by the iterator. +/// In the other hand, with [`WideIter::zero_first`] the iterator expects +/// the first byte of each pair to be zero, and the second one to be the +/// non-zero byte. +/// +/// When the iterator finds a byte that is expected to be zero, but it's +/// not, it saves the number of valid pairs that were consumed before +/// finding this invalid pair. /// /// ```ignore -/// let error = Cell::new(false); +/// let error_pos = Cell::new(None); /// let v = vec![1,0,2,0,3,0]; /// // The non-zero values are expected to be the first of each pair. -/// let i = WideIter::non_zero_first(v.iter(), &error); +/// let i = WideIter::non_zero_first(v.iter(), &error_pos); /// assert_eq!(i.collect(), vec![1,2,3]); /// // No error. -/// assert!(!error_flag.get()); +/// assert_eq!(error_pos.get(), None); /// ``` /// /// ```ignore -/// let error = Cell::new(false); +/// let error_pos = Cell::new(None); /// let v = vec![1,100,2,0,3,0]; -/// let i = WideIter::non_zero_first(v.iter(), &error); +/// let i = WideIter::non_zero_first(v.iter(), &error_pos); /// assert_eq!(i.collect(), vec![1,2,3]); -/// // Error! between 1 and 2 there's a non-zero value 100 -/// assert!(error_flag.get()); +/// // Error! the 1 is followed by 100 instead of 0. The +/// // error position is 0 because no valid pairs were found +/// // before this error. +/// assert!(error_pos.get(), Some(0)); /// ``` /// /// ```ignore -/// let error = Cell::new(false); +/// let error_pos = Cell::new(None); /// let v = vec![0,1,0,2,0,3]; /// // The zero values are expected to be the first of each pair. -/// let i = WideIter::zero_first(v.iter(), &error); +/// let i = WideIter::zero_first(v.iter(), &error_pos); /// assert_eq!(i.collect(), vec![1,2,3]); /// // No error -/// assert!(!error_flag.get()); +/// assert!(error_pos.get(), None); /// ``` struct WideIter<'a, I> where I: Iterator, { iter: I, - error_flag: &'a Cell, + error_pos: &'a Cell>, + valid_pairs: usize, zero_first: bool, } @@ -457,18 +460,18 @@ impl<'a, I> WideIter<'a, I> where I: Iterator, { - pub fn non_zero_first(iter: I, error_flag: &'a Cell) -> Self + pub fn non_zero_first(iter: I, error_pos: &'a Cell>) -> Self where I: Iterator, { - WideIter { iter, error_flag, zero_first: false } + WideIter { iter, error_pos, valid_pairs: 0, zero_first: false } } - pub fn zero_first(iter: I, error_flag: &'a Cell) -> Self + pub fn zero_first(iter: I, error_pos: &'a Cell>) -> Self where I: Iterator, { - WideIter { iter, error_flag, zero_first: true } + WideIter { iter, error_pos, valid_pairs: 0, zero_first: true } } } @@ -483,14 +486,16 @@ where let second_byte = self.iter.next()?; if self.zero_first { - if *first_byte != 0_u8 { - self.error_flag.replace(true); + if *first_byte != 0_u8 && self.error_pos.get().is_none() { + self.error_pos.set(Some(self.valid_pairs)); } + self.valid_pairs += 1; Some(second_byte) } else { - if *second_byte != 0_u8 { - self.error_flag.replace(true); + if *second_byte != 0_u8 && self.error_pos.get().is_none() { + self.error_pos.set(Some(self.valid_pairs)); } + self.valid_pairs += 1; Some(first_byte) } } diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs index 3ee9406bd..c255e10a9 100644 --- a/lib/src/tests/mod.rs +++ b/lib/src/tests/mod.rs @@ -1481,6 +1481,12 @@ fn regexp_wide() { b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00" ); + pattern_match!( + r#"/https?:\/\/.{5,128}\.png/ wide"#, + b"\xcc\xcch\x00t\x00t\x00p\x00s\x00:\x00/\x00/\x00f\x00o\x00o\x00b\x00a\x00r\x00/\x00b\x00a\x00z\x00.\x00p\x00n\x00g\x00\xcc\xcc", + b"h\x00t\x00t\x00p\x00s\x00:\x00/\x00/\x00f\x00o\x00o\x00b\x00a\x00r\x00/\x00b\x00a\x00z\x00.\x00p\x00n\x00g\x00" + ); + pattern_false!( r#"/foobar.[A-Z]{1}/ wide"#, b"f\x00o\x00o\x00b\x00a\x00r\x00\x00\x15W\x00" From 9ad76ce73bd25f27fef1eb2c77ecc8b1a77235ef Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Fri, 1 Mar 2024 20:36:41 +0100 Subject: [PATCH 19/19] style: fix clippy warnings --- lib/src/tests/mod.rs | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs index c255e10a9..d2ee8a109 100644 --- a/lib/src/tests/mod.rs +++ b/lib/src/tests/mod.rs @@ -1482,7 +1482,7 @@ fn regexp_wide() { ); pattern_match!( - r#"/https?:\/\/.{5,128}\.png/ wide"#, + r"/https?:\/\/.{5,128}\.png/ wide", b"\xcc\xcch\x00t\x00t\x00p\x00s\x00:\x00/\x00/\x00f\x00o\x00o\x00b\x00a\x00r\x00/\x00b\x00a\x00z\x00.\x00p\x00n\x00g\x00\xcc\xcc", b"h\x00t\x00t\x00p\x00s\x00:\x00/\x00/\x00f\x00o\x00o\x00b\x00a\x00r\x00/\x00b\x00a\x00z\x00.\x00p\x00n\x00g\x00" ); @@ -1497,29 +1497,13 @@ fn regexp_wide() { b"\x00\x15W\x00f\x00o\x00o\x00b\x00a\x00r\x00" ); - pattern_false!( - r#"/\bfoobar/ wide"#, - b"x\x00f\x00o\x00o\x00b\x00a\x00r\x00" - ); - - pattern_true!( - r#"/\Bfoobar/ wide"#, - b"x\x00f\x00o\x00o\x00b\x00a\x00r\x00" - ); - - pattern_false!( - r#"/foobar\b/ wide"#, - b"f\x00o\x00o\x00b\x00a\x00r\x00x\x00" - ); - - pattern_true!( - r#"/foobar\B/ wide"#, - b"f\x00o\x00o\x00b\x00a\x00r\x00x\x00" - ); - - pattern_true!(r#"/foobar\b/ wide"#, b"f\x00o\x00o\x00b\x00a\x00r\x00x"); - pattern_false!(r#"/foobar\B/ wide"#, b"f\x00o\x00o\x00b\x00a\x00r\x00x"); - pattern_true!(r#"/foobar$/ wide"#, b"f\x00o\x00o\x00b\x00a\x00r\x00x"); + pattern_false!(r"/\bfoobar/ wide", b"x\x00f\x00o\x00o\x00b\x00a\x00r\x00"); + pattern_true!(r"/\Bfoobar/ wide", b"x\x00f\x00o\x00o\x00b\x00a\x00r\x00"); + pattern_false!(r"/foobar\b/ wide", b"f\x00o\x00o\x00b\x00a\x00r\x00x\x00"); + pattern_true!(r"/foobar\B/ wide", b"f\x00o\x00o\x00b\x00a\x00r\x00x\x00"); + pattern_true!(r"/foobar\b/ wide", b"f\x00o\x00o\x00b\x00a\x00r\x00x"); + pattern_false!(r"/foobar\B/ wide", b"f\x00o\x00o\x00b\x00a\x00r\x00x"); + pattern_true!(r"/foobar$/ wide", b"f\x00o\x00o\x00b\x00a\x00r\x00x"); } #[test]