feat: interpret the sequences \< and \> in regexp as literals ins…

…tead of word boundaries. The `regex_syntax` crate interprets `\<` and `\>` as start-of-word and end-of-word boundaries, respectively. However, YARA has always interpreted them as the escaped form of the `<` and `>` literals. In order to keep backward compatibility, this commit introduces the `Transformer` type, which receives the AST produced by `regex_syntax` and replaces the word boundary tokens with literal ones.
VirusTotal · Mar 21, 2024 · 0c6672d · 0c6672d
1 parent d5a5430
commit 0c6672d
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 8 deletions.
diff --git a/lib/src/compiler/ir/ast2ir.rs b/lib/src/compiler/ir/ast2ir.rs
@@ -178,7 +178,7 @@ pub(in crate::compiler) fn regexp_pattern_from_ast<'src>(
     // we can know the overall greediness of the regexp, and decide whether we
     // should aim for the longest, or the shortest possible match when multiple
     // matches that start at the same offset are found while scanning backwards
-    // (right-to-left). However, if the regexp contains a mix of greedy an
+    // (right-to-left). However, if the regexp contains a mix of greedy and
     // non-greedy repetitions the decision becomes impossible.
     let hir = re::parser::Parser::new()
         .force_case_insensitive(flags.contains(PatternFlags::Nocase))

diff --git a/lib/src/re/parser.rs b/lib/src/re/parser.rs
@@ -1,6 +1,10 @@
+use std::collections::VecDeque;
 use std::fmt::{Debug, Display, Formatter};
+use std::mem;
+use std::mem::replace;
 
 use regex_syntax as re;
+use regex_syntax::ast::{AssertionKind, Ast, Literal, LiteralKind};
 use thiserror::Error;
 
 use crate::re::hir::Hir;
@@ -121,6 +125,7 @@ impl Parser {
             }
         })?;
 
+        let ast = Transformer::new().transform(ast);
         let greedy = Validator::new().validate(&ast);
 
         // `greedy` is set to Some(true) if all regexp quantifiers are greedy,
@@ -168,7 +173,7 @@ impl Validator {
         Self { first_rep: None }
     }
 
-    fn validate(&mut self, ast: &re::ast::Ast) -> Result<Option<bool>, Error> {
+    fn validate(&mut self, ast: &Ast) -> Result<Option<bool>, Error> {
         re::ast::visit(ast, self)
     }
 }
@@ -181,8 +186,8 @@ impl re::ast::Visitor for &mut Validator {
         Ok(self.first_rep.map(|rep| rep.0))
     }
 
-    fn visit_pre(&mut self, ast: &re::ast::Ast) -> Result<(), Self::Err> {
-        if let re::ast::Ast::Repetition(rep) = ast {
+    fn visit_pre(&mut self, ast: &Ast) -> Result<(), Self::Err> {
+        if let Ast::Repetition(rep) = ast {
             if let Some(first_rep) = self.first_rep {
                 if rep.greedy != first_rep.0 {
                     return Err(Error::MixedGreediness {
@@ -199,3 +204,109 @@ impl re::ast::Visitor for &mut Validator {
         Ok(())
     }
 }
+
+/// Rewrites the regexp AST so that it follows YARA's semantics.
+///
+/// The AST produced by the `regex_syntax` crate is taken by value and
+/// returned with the changes required for keeping regexps compatible with
+/// YARA.
+///
+/// Currently a single change is applied: AST nodes `WordBoundaryStartAngle`
+/// and `WordBoundaryEndAngle` are replaced with the literals `<` and `>`,
+/// respectively. The `regex_syntax` crate interprets the sequences `\<` and
+/// `\>` as word start and word end boundaries, equivalent to `\b{start}`
+/// and `\b{end}` (see the [documentation][1]). YARA, on the other hand,
+/// interprets these sequences as the escaped form of the literals `<` and
+/// `>`.
+///
+/// [1]: https://docs.rs/regex/latest/regex/#empty-matches
+struct Transformer {}
+
+impl Transformer {
+    /// Creates a new transformer.
+    pub fn new() -> Self {
+        Transformer {}
+    }
+
+    /// Consumes `ast` and returns it after applying the transformations.
+    pub fn transform(&self, ast: Ast) -> Ast {
+        let mut transformed = ast;
+        self.traverse(&mut transformed);
+        transformed
+    }
+}
+
+impl Transformer {
+    /// Walks the whole AST, replacing every `\<` and `\>` word boundary
+    /// assertion with the corresponding `<` or `>` literal.
+    ///
+    /// The traversal is iterative, using an explicit work queue instead of
+    /// recursion. Each node is checked when it is taken from the queue, so
+    /// the root node is handled exactly like any other node. This matters
+    /// for regexps like `\<`, where the assertion is the root of the AST
+    /// and is not a child of any concat, alternation, group or repetition.
+    fn traverse(&self, ast: &mut Ast) {
+        let mut pending = VecDeque::new();
+
+        pending.push_back(ast);
+
+        while let Some(node) = pending.pop_front() {
+            // If the node is replaced it becomes a literal, which has no
+            // children and falls into the leaf arm below.
+            self.replace_word_boundary_assertions(node);
+
+            match node {
+                Ast::Repetition(rep) => pending.push_back(rep.ast.as_mut()),
+                Ast::Group(group) => pending.push_back(group.ast.as_mut()),
+                Ast::Alternation(alternation) => {
+                    pending.extend(alternation.asts.iter_mut())
+                }
+                Ast::Concat(concat) => pending.extend(concat.asts.iter_mut()),
+                // Nodes without sub-expressions that need to be visited.
+                Ast::Empty(_)
+                | Ast::Flags(_)
+                | Ast::Literal(_)
+                | Ast::Dot(_)
+                | Ast::Assertion(_)
+                | Ast::ClassUnicode(_)
+                | Ast::ClassPerl(_)
+                | Ast::ClassBracketed(_) => {}
+            }
+        }
+    }
+
+    /// Replaces `ast` in place with a verbatim literal if it is one of the
+    /// angle-bracket word boundary assertions.
+    ///
+    /// `WordBoundaryStartAngle` becomes the literal `<` and
+    /// `WordBoundaryEndAngle` becomes the literal `>`. The new literal
+    /// keeps the span of the assertion it replaces. Any other node is left
+    /// untouched.
+    fn replace_word_boundary_assertions(&self, ast: &mut Ast) {
+        let (span, c) = match &*ast {
+            Ast::Assertion(assertion) => match assertion.kind {
+                AssertionKind::WordBoundaryStartAngle => {
+                    (assertion.span, '<')
+                }
+                AssertionKind::WordBoundaryEndAngle => (assertion.span, '>'),
+                _ => return,
+            },
+            _ => return,
+        };
+
+        // The assertion node that was in `ast` is returned by `replace`
+        // and dropped right away.
+        let _ = replace(
+            ast,
+            Ast::literal(Literal { span, kind: LiteralKind::Verbatim, c }),
+        );
+    }
+}
diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs
@@ -1288,12 +1288,14 @@ fn regexp_patterns_4() {
     pattern_match!(r"/\w\w\w\B/", b"abcd", b"abc");
     pattern_match!(r"/\B\w\w\w/", b"abcd", b"bcd");
     pattern_false!(r"/\B\w\w\w\B/", b"abcd");
-    pattern_match!(r"/\<abc/", b"abc", b"abc");
-    pattern_match!(r"/abc\>/", b"abc", b"abc");
+    pattern_match!(r"/\<abc/", b"<abc", b"<abc");
+    pattern_match!(r"/abc\>/", b"abc>", b"abc>");
     pattern_match!(r"/\b{start}abc/", b"abc", b"abc");
     pattern_match!(r"/abc\b{end}/", b"abc", b"abc");
-    pattern_match!(r"/\<abc/", b" abc", b"abc");
-    pattern_match!(r"/abc\>/", b"abc ", b"abc");
+    pattern_match!(r"/\b{start}abc/", b" abc", b"abc");
+    pattern_match!(r"/abc\b{end}/", b"abc ", b"abc");
+    pattern_false!(r"/\<abc/", b" abc");
+    pattern_false!(r"/abc\>/", b"abc ");
     pattern_false!(r"/\<abc/", b"1abc");
     pattern_false!(r"/abc\>/", b"abc1");