Skip to content

Commit

Permalink
feat: interpret the sequences \< and \> in regexp as literals ins…
Browse files Browse the repository at this point in the history
…tead of word boundaries

The `regex_syntax` crate interprets `\<` and `\>` as start-of-word and end-of-word boundaries, respectively. However, YARA has always interpreted them as the escaped forms of the literals `<` and `>`. To keep backward compatibility, this commit introduces the `Transformer` type, which receives the AST produced by `regex_syntax` and replaces the word boundary tokens with literal ones.
  • Loading branch information
plusvic committed Mar 21, 2024
1 parent d5a5430 commit 0c6672d
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 8 deletions.
2 changes: 1 addition & 1 deletion lib/src/compiler/ir/ast2ir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ pub(in crate::compiler) fn regexp_pattern_from_ast<'src>(
// we can know the overall greediness of the regexp, and decide whether we
// should aim for the longest, or the shortest possible match when multiple
// matches that start at the same offset are found while scanning backwards
// (right-to-left). However, if the regexp contains a mix of greedy an
// (right-to-left). However, if the regexp contains a mix of greedy and
// non-greedy repetitions the decision becomes impossible.
let hir = re::parser::Parser::new()
.force_case_insensitive(flags.contains(PatternFlags::Nocase))
Expand Down
117 changes: 114 additions & 3 deletions lib/src/re/parser.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
use std::collections::VecDeque;
use std::fmt::{Debug, Display, Formatter};
use std::mem;
use std::mem::replace;

use regex_syntax as re;
use regex_syntax::ast::{AssertionKind, Ast, Literal, LiteralKind};
use thiserror::Error;

use crate::re::hir::Hir;
Expand Down Expand Up @@ -121,6 +125,7 @@ impl Parser {
}
})?;

let ast = Transformer::new().transform(ast);
let greedy = Validator::new().validate(&ast);

// `greedy` is set to Some(true) if all regexp quantifiers are greedy,
Expand Down Expand Up @@ -168,7 +173,7 @@ impl Validator {
Self { first_rep: None }
}

fn validate(&mut self, ast: &re::ast::Ast) -> Result<Option<bool>, Error> {
fn validate(&mut self, ast: &Ast) -> Result<Option<bool>, Error> {
re::ast::visit(ast, self)
}
}
Expand All @@ -181,8 +186,8 @@ impl re::ast::Visitor for &mut Validator {
Ok(self.first_rep.map(|rep| rep.0))
}

fn visit_pre(&mut self, ast: &re::ast::Ast) -> Result<(), Self::Err> {
if let re::ast::Ast::Repetition(rep) = ast {
fn visit_pre(&mut self, ast: &Ast) -> Result<(), Self::Err> {
if let Ast::Repetition(rep) = ast {
if let Some(first_rep) = self.first_rep {
if rep.greedy != first_rep.0 {
return Err(Error::MixedGreediness {
Expand All @@ -199,3 +204,109 @@ impl re::ast::Visitor for &mut Validator {
Ok(())
}
}

/// Performs some transformations to the regexp AST.
///
/// This type takes an AST produced by the `regex_syntax` crate and returns
/// it with some changes that are necessary to ensure that regexps are
/// compatible with YARA.
///
/// At this moment the only change applied is the replacement of AST nodes
/// `WordBoundaryStartAngle` and `WordBoundaryEndAngle` with literals `<` and
/// `>` respectively. This is necessary because the `regex_syntax` crate
/// interprets sequences `\<` and `\>` as word start and word end boundaries,
/// equivalent to `\b{start}` and `\b{end}`, respectively. See the
/// [documentation][1].
///
/// YARA, on the other hand, interprets these sequences as the escaped form
/// of the literals `<` and `>`.
///
/// [1]: https://docs.rs/regex/latest/regex/#empty-matches
struct Transformer {}

impl Transformer {
pub fn new() -> Self {
Self {}
}

pub fn transform(&self, mut ast: Ast) -> Ast {
self.traverse(&mut ast);
ast
}
}

impl Transformer {
    /// Visits every node reachable from `ast` and rewrites the `\<` and `\>`
    /// word boundary assertions into the literals `<` and `>`.
    ///
    /// The traversal is iterative: nodes waiting to be visited are kept in an
    /// explicit queue instead of relying on recursion. Bracketed classes are
    /// treated as leaves and are not descended into.
    fn traverse(&self, ast: &mut Ast) {
        let mut pending = VecDeque::new();

        pending.push_back(ast);

        while let Some(node) = pending.pop_front() {
            // Rewrite the node itself before looking at its children. Doing
            // it when the node is dequeued, rather than only for the children
            // of composite nodes, also covers the case where the root of the
            // AST is the assertion (i.e. the whole regexp is `\<` or `\>`).
            self.replace_word_boundary_assertions(node);

            match node {
                // Leaf nodes, nothing to descend into.
                Ast::Empty(_)
                | Ast::Flags(_)
                | Ast::Literal(_)
                | Ast::Dot(_)
                | Ast::Assertion(_)
                | Ast::ClassUnicode(_)
                | Ast::ClassPerl(_)
                | Ast::ClassBracketed(_) => {}
                // Composite nodes, enqueue their children.
                Ast::Repetition(rep) => pending.push_back(rep.ast.as_mut()),
                Ast::Group(group) => pending.push_back(group.ast.as_mut()),
                Ast::Alternation(alternation) => {
                    pending.extend(alternation.asts.iter_mut());
                }
                Ast::Concat(concat) => {
                    pending.extend(concat.asts.iter_mut());
                }
            }
        }
    }

    /// If `ast` is a `WordBoundaryStartAngle` or `WordBoundaryEndAngle`
    /// assertion, replaces it in place with a verbatim literal `<` or `>`
    /// that keeps the span of the original assertion.
    ///
    /// Any other node is left untouched. This function does not look at the
    /// children of `ast`.
    fn replace_word_boundary_assertions(&self, ast: &mut Ast) {
        // Work out the replacement first, so that the shared borrow of the
        // node has ended by the time the node is overwritten.
        let (span, c) = match &*ast {
            Ast::Assertion(assertion) => match assertion.kind {
                AssertionKind::WordBoundaryStartAngle => (assertion.span, '<'),
                AssertionKind::WordBoundaryEndAngle => (assertion.span, '>'),
                _ => return,
            },
            _ => return,
        };

        let _ = replace(
            ast,
            Ast::literal(Literal { span, kind: LiteralKind::Verbatim, c }),
        );
    }
}
10 changes: 6 additions & 4 deletions lib/src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1288,12 +1288,14 @@ fn regexp_patterns_4() {
pattern_match!(r"/\w\w\w\B/", b"abcd", b"abc");
pattern_match!(r"/\B\w\w\w/", b"abcd", b"bcd");
pattern_false!(r"/\B\w\w\w\B/", b"abcd");
pattern_match!(r"/\<abc/", b"abc", b"abc");
pattern_match!(r"/abc\>/", b"abc", b"abc");
pattern_match!(r"/\<abc/", b"<abc", b"<abc");
pattern_match!(r"/abc\>/", b"abc>", b"abc>");
pattern_match!(r"/\b{start}abc/", b"abc", b"abc");
pattern_match!(r"/abc\b{end}/", b"abc", b"abc");
pattern_match!(r"/\<abc/", b" abc", b"abc");
pattern_match!(r"/abc\>/", b"abc ", b"abc");
pattern_match!(r"/\b{start}abc/", b" abc", b"abc");
pattern_match!(r"/abc\b{end}/", b"abc ", b"abc");
pattern_false!(r"/\<abc/", b" abc");
pattern_false!(r"/abc\>/", b"abc ");
pattern_false!(r"/\<abc/", b"1abc");
pattern_false!(r"/abc\>/", b"abc1");

Expand Down

0 comments on commit 0c6672d

Please sign in to comment.