diff --git a/lib/src/compiler/tests/testdata/errors/138.in b/lib/src/compiler/tests/testdata/errors/138.in new file mode 100644 index 00000000..6c5ea655 --- /dev/null +++ b/lib/src/compiler/tests/testdata/errors/138.in @@ -0,0 +1,6 @@ +rule test { + strings: + $a = /abcd((efg){0,10000}){0,10000}/ + condition: + $a +} \ No newline at end of file diff --git a/lib/src/compiler/tests/testdata/errors/138.out b/lib/src/compiler/tests/testdata/errors/138.out new file mode 100644 index 00000000..ecb7929d --- /dev/null +++ b/lib/src/compiler/tests/testdata/errors/138.out @@ -0,0 +1,6 @@ +error[E014]: invalid regular expression + --> line:3:3 + | +3 | $a = /abcd((efg){0,10000}){0,10000}/ + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ regexp is too large + | \ No newline at end of file diff --git a/lib/src/re/bitmapset.rs b/lib/src/re/bitmapset.rs index 0e8ed5cd..d82ce111 100644 --- a/lib/src/re/bitmapset.rs +++ b/lib/src/re/bitmapset.rs @@ -1,89 +1,105 @@ use bitvec::vec::BitVec; +use rustc_hash::FxHashSet; +use std::hash::Hash; -/// A high-performance set of `usize` values. +/// A high-performance set of (`usize`, T) pairs. /// -/// As in any set, the values are guaranteed to be unique, the `insert` -/// operation is a no-op if the new value already exists in the set. -/// Additionally, this type supports iterating the values in insertion order. +/// As in any set, the pairs are guaranteed to be unique; the `insert` +/// operation is a no-op if the new pair already exists in the set. +/// Additionally, this type supports iterating the pairs in insertion order. /// /// The distinguishing feature of this set lies in its utilization of bitmaps -/// for efficient membership checks. However, practical limitations prevent -/// having a bitmap with one bit per possible `usize` value, spanning from 0 to -/// `usize::MAX`. Instead, positions in the bitmap are determined relative to -/// the initial value inserted in the set. For instance, if the first value is -/// `1234`, the first bitmap bit corresponds to `1234`, the second to `1235`, -/// the third to `1236`, and so on. A separate bitmap is maintained for values -/// lower than the initial one, with `1233` represented as the first bit in -/// this other bitmap. Both bitmaps dynamically expand to accommodate newly -/// inserted values. +/// for checking if the `usize` key in a pair already exists in the set. +/// However, practical limitations prevent having a bitmap with one bit per +/// possible `usize` value, spanning from 0 to `usize::MAX`. Instead, positions +/// in the bitmap are determined relative to the initial key inserted in the +/// set. For instance, if the first value is (`1234`, T), the first bitmap bit +/// corresponds to key `1234`, the second to key `1235`, the third to key +/// `1236`, and so on. A separate bitmap is maintained for keys lower than +/// the initial one, with `1233` represented as the first bit in this other +/// bitmap. Both bitmaps dynamically expand to accommodate newly inserted +/// values. /// -/// `BitmapSet` works well with values that are close to each other. Outliers +/// `BitmapSet` works well with keys that are close to each other. Outliers /// can make the memory required for storing the bitmaps grow very quickly. /// Another property of this type is that values inserted in the set can be /// iterated in insertion order. -#[derive(Debug, PartialEq, Default)] -pub(crate) struct BitmapSet { - // Vector that contains the values in the set, in insertion order. - values: Vec<usize>, - // First value inserted in the set.
- initial_value: usize, - // Bitmap for values that are > initial_value. +#[derive(Debug, Default)] +pub(crate) struct BitmapSet<T> +where + T: Default + Copy + PartialEq + Eq + Hash, +{ + // Vector that contains the (key,value) pairs in the set, in insertion + // order. + items: Vec<(usize, T)>, + // Set that contains the (key,value) pairs. + set: FxHashSet<(usize, T)>, + // Bitmap for keys that are > initial_key. p_bitmap: BitVec, - // Bitmap for values that are < initial_value. + // Bitmap for keys that are < initial_key. n_bitmap: BitVec, } -impl BitmapSet { +impl<T> BitmapSet<T> +where + T: Default + Copy + PartialEq + Eq + Hash, +{ pub const MAX_OFFSET: usize = 524288; pub fn new() -> Self { Self { - values: Vec::new(), - initial_value: 0, + items: Vec::new(), + set: FxHashSet::default(), p_bitmap: BitVec::repeat(false, 1024), n_bitmap: BitVec::repeat(false, 1024), } } - /// Adds a value to the set. + /// Adds a (key,value) pair to the set. /// - /// Returns `true` if the value didn't exist in the set and was added, and - /// `false` if the value already existed. + /// Returns `true` if the (key,value) pair didn't exist in the set and was + /// added, and `false` if the pair already existed. /// /// # Panics /// - /// If `value` is too far from the first value added to the set. - /// Specifically, it panics when `abs(value - initial_value) >= MAX_OFFSET` + /// If `key` is too far from the first key added to the set. + /// Specifically, it panics when `abs(key - initial_key) >= MAX_OFFSET` /// #[inline] - pub fn insert(&mut self, value: usize) -> bool { - // Special case when the set is totally empty. - if self.values.is_empty() { - self.initial_value = value; - self.values.push(value); - return true; - } - // Special case where the new value is equal to the first value - // added to the set. We don't need to spare a bit on this value. - if self.initial_value == value { + pub fn insert(&mut self, key: usize, value: T) -> bool { + let first = match self.items.first() { + Some(first) => first, + None => { + // The set is empty; store the first item and return. + self.items.push((key, value)); + return true; + } + }; + + // Special case when the new (key,value) pair is equal to the + // first one added to the set. + if first.0 == key && first.1 == value { return false; } - let offset = value as isize - self.initial_value as isize; + let offset = key as isize - first.0 as isize; match offset { offset if offset < 0 => { - let offset = -offset as usize; + let offset = (-offset as usize) - 1; unsafe { if self.n_bitmap.len() <= offset { assert!(offset < Self::MAX_OFFSET); self.n_bitmap.resize(offset + 1, false); self.n_bitmap.set_unchecked(offset, true); - self.values.push(value); - true + self.items.push((key, value)); + self.set.insert((key, value)) } else if !*self.n_bitmap.get_unchecked(offset) { self.n_bitmap.set_unchecked(offset, true); - self.values.push(value); + self.items.push((key, value)); + self.set.insert((key, value)) + } else if self.set.insert((key, value)) { + self.items.push((key, value)); true } else { false @@ -91,19 +107,20 @@ impl BitmapSet { } } offset => { - // At this point `offset` cannot be zero, it's safe to subtract - // 1 so that the first bit in the `p_bitmap` is used.
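For illustration, the offset arithmetic used by the new `insert` boils down to the following standalone sketch (not part of the patch; `bit_position` is a hypothetical helper): keys below the first key map into `n_bitmap`, keys at or above it map into `p_bitmap`, with the first key itself occupying bit 0.

fn bit_position(first_key: usize, key: usize) -> (bool, usize) {
    // Returns (use_n_bitmap, bit index), mirroring the match on `offset`
    // in `BitmapSet::insert` above.
    let offset = key as isize - first_key as isize;
    if offset < 0 {
        (true, (-offset as usize) - 1)
    } else {
        (false, offset as usize)
    }
}

fn main() {
    assert_eq!(bit_position(1234, 1233), (true, 0)); // first bit of n_bitmap
    assert_eq!(bit_position(1234, 1234), (false, 0));
    assert_eq!(bit_position(1234, 1236), (false, 2)); // third bit of p_bitmap
}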
- let offset = offset as usize - 1; + let offset = offset as usize; unsafe { if self.p_bitmap.len() <= offset { assert!(offset < Self::MAX_OFFSET); self.p_bitmap.resize(offset + 1, false); self.p_bitmap.set_unchecked(offset, true); - self.values.push(value); - true + self.items.push((key, value)); + self.set.insert((key, value)) } else if !*self.p_bitmap.get_unchecked(offset) { self.p_bitmap.set_unchecked(offset, true); - self.values.push(value); + self.items.push((key, value)); + self.set.insert((key, value)) + } else if self.set.insert((key, value)) { + self.items.push((key, value)); true } else { false @@ -115,39 +132,35 @@ #[inline] pub fn is_empty(&self) -> bool { - self.values.is_empty() + self.items.is_empty() } /// Removes all values in the set. #[inline] pub fn clear(&mut self) { - for thread in self.values.drain(0..) { - let offset = thread as isize - self.initial_value as isize; + let first_key = match self.items.first() { + Some(first) => first.0, + None => return, + }; + for (key, _) in self.items.drain(0..) { + let offset = key as isize - first_key as isize; match offset { - offset if offset > 0 => { - self.p_bitmap.set((offset - 1) as usize, false); - } offset if offset < 0 => { - self.n_bitmap.set((-offset) as usize, false); + self.n_bitmap.set(((-offset) as usize) - 1, false); } - _ => { - // when `offset` is 0 there's no bit to clear, the initial - // value doesn't have a bit in neither of the bitmaps. + offset => { + self.p_bitmap.set(offset as usize, false); } } } + self.set.clear(); } /// Returns an iterator for the items in the set. /// /// Items are returned in insertion order. - pub fn iter(&self) -> impl Iterator<Item = &usize> { - self.values.iter() - } - - #[cfg(test)] - pub fn into_vec(self) -> Vec<usize> { - self.values + pub fn iter(&self) -> impl Iterator<Item = &(usize, T)> { + self.items.iter() } } @@ -159,28 +172,41 @@ mod tests { fn thread_set() { let mut s = BitmapSet::new(); - assert!(s.insert(4)); - assert!(s.insert(2)); - assert!(s.insert(10)); - assert!(s.insert(0)); - assert!(s.insert(2000)); - - assert!(!s.insert(4)); - assert!(!s.insert(2)); - assert!(!s.insert(10)); - assert!(!s.insert(0)); - assert!(!s.insert(2000)); - - assert_eq!(s.values, vec![4, 2, 10, 0, 2000]); + assert!(s.insert(4, 0)); + assert!(s.insert(2, 0)); + assert!(s.insert(3, 0)); + assert!(s.insert(10, 0)); + assert!(s.insert(0, 0)); + assert!(s.insert(2000, 0)); + + assert!(!s.insert(4, 0)); + assert!(!s.insert(2, 0)); + assert!(!s.insert(3, 0)); + assert!(!s.insert(10, 0)); + assert!(!s.insert(0, 0)); + assert!(!s.insert(2000, 0)); + assert!(s.insert(4, 1)); + assert!(!s.insert(4, 1)); + + assert_eq!( + s.items, + vec![(4, 0), (2, 0), (3, 0), (10, 0), (0, 0), (2000, 0), (4, 1)] + ); s.clear(); - assert!(s.insert(200)); - assert!(s.insert(2)); - assert!(s.insert(10)); - assert!(s.insert(300)); - assert!(s.insert(250)); + assert_eq!(s.p_bitmap.count_ones(), 0); + assert_eq!(s.n_bitmap.count_ones(), 0); + + assert!(s.insert(200, 0)); + assert!(s.insert(3, 0)); + assert!(s.insert(10, 0)); + assert!(s.insert(300, 0)); + assert!(s.insert(250, 0)); - assert_eq!(s.values, vec![200, 2, 10, 300, 250]); + assert_eq!( + s.items, + vec![(200, 0), (3, 0), (10, 0), (300, 0), (250, 0)] + ); } } diff --git a/lib/src/re/fast/fastvm.rs b/lib/src/re/fast/fastvm.rs index d3b09a8f..1f58e198 100644 --- a/lib/src/re/fast/fastvm.rs +++ b/lib/src/re/fast/fastvm.rs @@ -31,10 +31,10 @@ pub(crate) struct FastVM<'r> { /// is faster than `HashSet`, at the price of higher memory usage when /// the values in the set are not close to
each other. However, the /// positions stored in this set are relatively close to each other. - positions: BitmapSet, + positions: BitmapSet<()>, /// The set that will replace `positions` in the next iteration of the /// VM loop. - next_positions: BitmapSet, + next_positions: BitmapSet<()>, } impl<'r> FastVM<'r> { @@ -85,7 +85,7 @@ impl<'r> FastVM<'r> { let step = if wide { 2 } else { 1 }; - self.positions.insert(0); + self.positions.insert(0, ()); let mut flags = JumpFlagSet::none(); @@ -102,7 +102,7 @@ impl<'r> FastVM<'r> { match instr { Instr::Match => { let mut stop = false; - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { match f(*position) { Action::Stop => { stop = true; @@ -117,7 +117,7 @@ } } Instr::Literal(literal) => { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { if *position >= input.len() { continue; } @@ -136,12 +136,12 @@ }; if is_match { self.next_positions - .insert(position + step * literal.len()); + .insert(position + step * literal.len(), ()); } } } Instr::MaskedLiteral(literal, mask) => { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { if *position >= input.len() { continue; } @@ -162,13 +162,13 @@ }; if is_match { self.next_positions - .insert(position + step * literal.len()); + .insert(position + step * literal.len(), ()); } } } Instr::Alternation(alternatives) => { for alt in alternatives { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { if *position >= input.len() { continue; } @@ -190,6 +190,7 @@ if is_match { self.next_positions.insert( position + step * literal.len(), + (), ); } } @@ -212,6 +213,7 @@ if is_match { self.next_positions.insert( position + step * literal.len(), + (), ); } } @@ -225,19 +227,21 @@ } } Instr::JumpExact(jump) => { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { self.next_positions - .insert(position + step * jump as usize); + .insert(position + step * jump as usize, ()); } } Instr::JumpExactNoNewline(jump) => { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { let jump_range = *position..*position + step * jump as usize; if let Some(jump_range) = input.get(jump_range) { if memchr::memchr(0x0A, jump_range).is_none() { - self.next_positions - .insert(position + step * jump as usize); + self.next_positions.insert( + position + step * jump as usize, + (), + ); } } } @@ -266,7 +270,7 @@ match InstrParser::decode_instr(&self.code[ip..]) { (Instr::Literal(literal), _) if backwards => { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { if *position >= input.len() { continue; } @@ -281,7 +285,7 @@ } } (Instr::Literal(literal), _) if !backwards => { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { if *position >= input.len() { continue; } @@ -298,7 +302,7 @@ (Instr::MaskedLiteral(literal, mask), _) if backwards && mask.last() == Some(&0xff) => { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { if *position >= input.len() { continue; } @@ -315,7 +319,7 @@ (Instr::MaskedLiteral(literal, mask), _) if !backwards && mask.first() == Some(&0xff) => { - for position in
self.positions.iter() { + for (position, _) in self.positions.iter() { if *position >= input.len() { continue; } @@ -330,7 +334,7 @@ } } _ => { - for position in self.positions.iter() { + for (position, _) in self.positions.iter() { if *position >= input.len() { continue; } @@ -544,7 +548,7 @@ impl FastVM<'_> { flags: JumpFlagSet, range: &RangeInclusive, position: usize, - next_positions: &mut BitmapSet, + next_positions: &mut BitmapSet<()>, ) { let step = if flags.contains(JumpFlags::Wide) { 2 } else { 1 }; @@ -570,7 +574,7 @@ impl FastVM<'_> { let jmp_range = &input[range_min..range_max]; let mut on_match_found = |offset| { - next_positions.insert(position + range_min + offset); + next_positions.insert(position + range_min + offset, ()); }; let accept_newlines = flags.contains(JumpFlags::AcceptNewlines); @@ -618,7 +622,7 @@ impl FastVM<'_> { flags: JumpFlagSet, range: &RangeInclusive, position: usize, - next_positions: &mut BitmapSet, + next_positions: &mut BitmapSet<()>, ) { let step = if flags.contains(JumpFlags::Wide) { 2 } else { 1 }; @@ -665,7 +669,7 @@ let mut on_match_found = |offset| { next_positions - .insert(position + n + jmp_range.len() - offset - step); + .insert(position + n + jmp_range.len() - offset - step, ()); }; let accept_newlines = flags.contains(JumpFlags::AcceptNewlines); diff --git a/lib/src/re/thompson/compiler.rs b/lib/src/re/thompson/compiler.rs index b871d471..eb76627f 100644 --- a/lib/src/re/thompson/compiler.rs +++ b/lib/src/re/thompson/compiler.rs @@ -45,6 +45,9 @@ pub(crate) struct CodeLoc { impl CodeLoc { fn sub(&self, rhs: &Self) -> Result<CodeLocOffset, Error> { + // Code locations can be subtracted if they belong to the same + // instruction sequence. + assert_eq!(self.bck_seq_id, rhs.bck_seq_id); Ok(CodeLocOffset { fwd: (self.fwd as isize - rhs.fwd as isize) .try_into() .map_err @@ -122,11 +125,19 @@ pub(crate) struct Compiler { /// that could be zero-length, and the same happens with `b(cd)e`. The /// value of `zero_rep_depth` when visiting `cd` is 2. /// - /// This used for determining whether to extract atoms from certain nodes + /// This is used for determining whether to extract atoms from certain nodes /// in the HIR or not. Extracting atoms from a subtree under a zero-length /// repetition doesn't make sense; atoms must be extracted from portions of /// the pattern that are required to be present in any matching string. zero_rep_depth: u32, + + /// Stack that indicates whether repetitions are represented by + /// [`Instr::RepeatGreedy`] or [`Instr::RepeatNonGreedy`] instructions. While + /// traversing the HIR the depth of this stack indicates the number of nested + /// repetitions. A value in the stack will be `true` if the corresponding + /// repetition is represented using the REPEAT instructions, or `false` + /// otherwise.
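The stack discipline for `repeats` is simple enough to state in isolation. A hedged sketch (the `RepetitionTracker` type and its method names are illustrative only, not part of the patch): every pre-visit of a repetition pushes a flag, the matching post-visit pops it, and a repetition may only use the REPEAT instructions when it is not nested inside another repetition that already does.

struct RepetitionTracker {
    repeats: Vec<bool>,
}

impl RepetitionTracker {
    // Entering a repetition node: REPEAT is only an option when the
    // repetition count is large and no enclosing repetition already uses
    // REPEAT, mirroring the `nested_rep` check in the compiler.
    fn enter(&mut self, large_rep_count: bool) -> bool {
        let nested = self.repeats.last().copied().unwrap_or(false);
        let use_repeat_instr = large_rep_count && !nested;
        self.repeats.push(use_repeat_instr);
        use_repeat_instr
    }

    // Leaving the node: the popped flag tells the post-visit which
    // compilation strategy the pre-visit chose.
    fn leave(&mut self) -> bool {
        self.repeats.pop().unwrap()
    }
}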
+ repeats: Vec<bool>, } impl Compiler { @@ -162,6 +173,7 @@ impl Compiler { best_atoms_stack: vec![RegexpAtoms::empty()], depth: 0, zero_rep_depth: 0, + repeats: Vec::new(), } } @@ -207,6 +219,8 @@ impl Compiler { } impl Compiler { + const REPEAT_INSTR_THRESHOLD: u32 = 10; + pub(super) fn compile_internal( self, hir: &re::hir::Hir, @@ -251,6 +265,78 @@ impl Compiler { self.backward_code_chunks.last_mut().unwrap_or(&mut self.backward_code) } + #[inline] + fn start_backward_code_chunk(&mut self) { + self.bookmarks.push(self.location()); + self.backward_code_chunks.push(self.backward_code().next()); + } + + fn reverse_backward_code_chunks(&mut self, n: usize) -> Vec<CodeLoc> { + // Split `backward_code_chunks` in two halves, [0, len-n) and + // [len-n, len). The first half stays in `backward_code_chunks` while + // the second half is stored in `last_n_chunks`. + let last_n_chunks = self + .backward_code_chunks + .split_off(self.backward_code_chunks.len() - n); + + // Obtain a reference to the backward code chunk that remains at the + // top of the `backward_code_chunks` stack after removing n of them. + // It would be better to use `self.backward_code_mut()`, but it causes + // a mutable borrow on `self`, while this other way borrows + // `self.backward_code_chunks` or `self.backward_code` but not `self`. + let backward_code = self + .backward_code_chunks + .last_mut() + .unwrap_or(&mut self.backward_code); + + // Update the split ID for the backward code chunk at the top of the + // stack. If any of the n chunks previously removed contains a split + // instruction, and therefore incremented its split_id, this increment + // must be reflected in the chunk that remains at the top of the stack, + // so that any other node emitted later doesn't reuse an already existing + // split ID. + if let Some(last_chunks) = last_n_chunks.last() { + backward_code.split_id = last_chunks.split_id; + } + + // The top N bookmarks correspond to the beginning of each chunk. + let mut locations = self.bookmarks.split_off(self.bookmarks.len() - n); + + // Both `locations` and `last_n_chunks` have the same length N. + debug_assert_eq!(locations.len(), last_n_chunks.len()); + + // All chunks in `last_n_chunks` will be appended to the backward code + // in reverse order. The offset where each chunk resides in the backward + // code is stored in the hash map. + let mut chunk_locations = HashMap::new(); + + for (location, chunk) in + zip(locations.iter_mut(), last_n_chunks.iter()).rev() + { + chunk_locations.insert(chunk.seq_id(), backward_code.location()); + backward_code.append(chunk); + + location.bck_seq_id = backward_code.seq_id(); + location.bck = backward_code.location(); + } + + // Atoms may be pointing to some code located in one of the chunks that + // were written to backward code in a different order; the backward code + // location for those atoms needs to be adjusted accordingly.
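The reordering step above is the heart of backward-code generation, so here is a minimal standalone sketch of it (simplified types: plain byte buffers instead of `InstrSeq`; the helper name is made up). The last n chunks are detached and re-appended right-to-left, recording where each chunk lands so that bookmarks and atoms can be re-targeted:

fn append_reversed(
    chunks: &mut Vec<Vec<u8>>,
    backward_code: &mut Vec<u8>,
    n: usize,
) -> Vec<usize> {
    // Detach the last n chunks, then append them in reverse order.
    let last_n = chunks.split_off(chunks.len() - n);
    let mut offsets = Vec::with_capacity(n);
    for chunk in last_n.iter().rev() {
        offsets.push(backward_code.len());
        backward_code.extend_from_slice(chunk);
    }
    // Restore child order so that offsets[i] belongs to the i-th child.
    offsets.reverse();
    offsets
}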
+ let best_atoms = self.best_atoms_stack.last_mut().unwrap(); + + for atom in best_atoms.iter_mut() { + if let Some(adjustment) = + chunk_locations.get(&atom.code_loc.bck_seq_id) + { + atom.code_loc.bck_seq_id = backward_code.seq_id(); + atom.code_loc.bck += adjustment; + } + } + + locations + } + fn location(&self) -> CodeLoc { CodeLoc { fwd: self.forward_code().location(), @@ -275,6 +361,14 @@ }) } + fn emit_repeat(&mut self, min: u32, max: u32, greedy: bool) -> CodeLoc { + CodeLoc { + fwd: self.forward_code_mut().emit_repeat(min, max, greedy), + bck_seq_id: self.backward_code().seq_id(), + bck: self.backward_code_mut().emit_repeat(min, max, greedy), + } + } + fn emit_masked_byte(&mut self, b: HexByte) -> CodeLoc { CodeLoc { fwd: self.forward_code_mut().emit_masked_byte(b), @@ -304,10 +398,31 @@ start: CodeLoc, end: CodeLoc, ) -> Result<CodeLoc, Error> { + // Both `start` and `end` must point to some location within the same + // instruction sequence. + assert_eq!(start.bck_seq_id, end.bck_seq_id); + + let bck = if self.backward_code().seq_id() == start.bck_seq_id { + self.backward_code_mut().emit_clone(start.bck, end.bck)? + } else { + let cloned_code = self + .backward_code_chunks + .iter() + .rev() + .find(|chunk| chunk.seq_id == start.bck_seq_id) + .unwrap_or(&self.backward_code) + .as_slice() + .get(start.bck..end.bck) + .unwrap() + .to_vec(); // TODO: avoid copy? + + self.backward_code_mut().emit_bytes(cloned_code.as_slice())? + }; + Ok(CodeLoc { fwd: self.forward_code_mut().emit_clone(start.fwd, end.fwd)?, bck_seq_id: self.backward_code().seq_id(), - bck: self.backward_code_mut().emit_clone(start.bck, end.bck)?, + bck, }) } @@ -400,83 +515,19 @@ } fn visit_pre_concat(&mut self) { - self.bookmarks.push(self.location()); // A new child of a `Concat` node is about to be processed, // create the chunk that will receive the code for this child. - self.backward_code_chunks.push(self.backward_code().next()); + self.start_backward_code_chunk(); } fn visit_post_concat(&mut self, expressions: &[Hir]) -> Vec<CodeLoc> { // We are here because all the children of a `Concat` node have been - // processed. The last N chunks in `backward_code_chunks` contain the - // code produced for each of the N children, but the nodes where - // processed left-to-right, and we want the chunks right-to-left, so - // these last N chunks will be copied into backward code in reverse + // processed. The last N chunks in the `backward_code_chunks` stack + // contain the code produced for each of the N children, but the nodes + // were processed left-to-right, and we want the chunks right-to-left, + // so these last N chunks will be copied into backward code in reverse // order. - let n = expressions.len(); - - // Split `backward_code_chunks` in two halves, [0, len-n) and - // [len-n, len). The first half stays in `backward_code_chunks` while - // the second half is stored in `last_n_chunks`. - let last_n_chunks = self - .backward_code_chunks - .split_off(self.backward_code_chunks.len() - n); - - // Obtain a reference to the backward code corresponding to the `Concat` - // node. It would be better to use `self.backward_code_mut()`, but it - // causes a mutable borrow on `self`, while the code below borrows - // `self.backward_code_chunks` or `self.backward_code` but not `self. - let backward_code = self - .backward_code_chunks - .last_mut() - .unwrap_or(&mut self.backward_code); - - // Update the split ID for the `Concat` node.
If any of the children - // emitted a split instruction, and therefore incremented its split_id, - // this increment must be reflected in the parent node (`Concat`), so - // that any other node emitted after the parent doesn't reuse an already - // existing split ID. - if let Some(last_chunks) = last_n_chunks.last() { - backward_code.split_id = last_chunks.split_id; - } - - // The top N bookmarks corresponds to the beginning of the code for - // each expression in the concatenation. - let mut locations = self.bookmarks.split_off(self.bookmarks.len() - n); - - // Both `locations` and `last_n_chunks` have the same length N. - debug_assert_eq!(locations.len(), last_n_chunks.len()); - - // All chunks in `last_n_chucks` will be appended to the backward code - // in reverse order. The offset where each chunk resides in the backward - // code is stored in the hash map. - let mut chunk_locations = HashMap::new(); - - for (location, chunk) in - zip(locations.iter_mut(), last_n_chunks.iter()).rev() - { - chunk_locations.insert(chunk.seq_id(), backward_code.location()); - backward_code.append(chunk); - - location.bck_seq_id = backward_code.seq_id(); - location.bck = backward_code.location(); - } - - // Atoms may be pointing to some code located in one of the chunks that - // were written to backward code in a different order, the backward code - // location for those atoms needs to be adjusted accordingly. - let best_atoms = self.best_atoms_stack.last_mut().unwrap(); - - for atom in best_atoms.iter_mut() { - if let Some(adjustment) = - chunk_locations.get(&atom.code_loc.bck_seq_id) - { - atom.code_loc.bck_seq_id = backward_code.seq_id(); - atom.code_loc.bck += adjustment; - } - } - - locations + self.reverse_backward_code_chunks(expressions.len()) } fn visit_pre_alternation( @@ -574,39 +625,80 @@ } fn visit_pre_repetition(&mut self, rep: &Repetition) -> Result<(), Error> { - match (rep.min, rep.max, rep.greedy) { + let nested_rep = self.repeats.last().cloned().unwrap_or(false); + + match (rep.min, rep.max, rep.greedy, nested_rep) { // e* and e*? // // l1: split_a l3 ( split_b for the non-greedy e*? ) // ... code for e ... // l2: jump l1 // l3: - (0, None, greedy) => { + (0, None, greedy, _) => { let l1 = self.emit_instr(if greedy { Instr::SPLIT_A } else { Instr::SPLIT_B })?; + self.repeats.push(false); self.bookmarks.push(l1); - self.zero_rep_depth += 1; } // e+ and e+? // // l1: ... code for e ... // l2: split_b l1 ( split_a for the non-greedy e+? ) // l3: - (1, None, _) => { + (1, None, _, _) => { let l1 = self.location(); + self.repeats.push(false); self.bookmarks.push(l1); } // e{min,} min > 1 // - // ... code for e repeated min times + // ... code for e repeated min - 2 times + // l1: ... code for e ... + // l2: split_b l1 ( split_a for the non-greedy e{min,}? ) + // ... code for e + (_, None, _, _) => { + self.repeats.push(false); + self.bookmarks.push(self.location()); + } + // e{0,max} (not inside repetition_start/repetition_end yet) + // + // l1: split_a l4 ( split_b for the non-greedy e{0,max}? ) + // l2: ... code for e ... + // l3: repeat l2, 0, max + // l4: + // + (0, Some(max), greedy, false) + if max > Self::REPEAT_INSTR_THRESHOLD => + { + let l1 = self.emit_instr(if greedy { + Instr::SPLIT_A + } else { + Instr::SPLIT_B + })?; + let l2 = self.location(); + self.bookmarks.push(l2); + self.bookmarks.push(l1); + self.repeats.push(true); + } + // e{min,max} min > 0 (not inside repetition_start/repetition_end yet) // - (_, None, _) => { + // ... code for e ... + // l1: ...
code for e ... + // l2: repeat l1, min-1, max-1 + // l3: + // + (min, Some(max), _, false) + if min > Self::REPEAT_INSTR_THRESHOLD + || max > Self::REPEAT_INSTR_THRESHOLD => + { + self.start_backward_code_chunk(); + self.repeats.push(true); self.bookmarks.push(self.location()); } - // e{min,max} + // e{min,max} min > 0 // // ... code for e ... -+ // ... code for e ... | min times @@ -616,8 +708,7 @@ // split end | // ... code for e ... -+ // end: // - (min, Some(_), greedy) => { + (min, Some(_), greedy, _) => { if min == 0 { let split = self.emit_instr(if greedy { Instr::SPLIT_A @@ -625,8 +716,8 @@ Instr::SPLIT_B })?; self.bookmarks.push(split); - self.zero_rep_depth += 1; } + self.repeats.push(false); self.bookmarks.push(self.location()); } } @@ -638,29 +729,28 @@ &mut self, rep: &Repetition, ) -> Result<CodeLoc, Error> { - match (rep.min, rep.max, rep.greedy) { + match (rep.min, rep.max, rep.greedy, self.repeats.pop().unwrap()) { // e* and e*? // - // l1: split_a l3 ( split_b for the non-greedy e*? ) - // ... code for e ... + // l1: split_a l3 (split_b for the non-greedy e*?, + // emitted by visit_pre_repetition) + // ... code for e ... (emitted while visiting child nodes) // l2: jump l1 // l3: - (0, None, _) => { + (0, None, _, _) => { let l1 = self.bookmarks.pop().unwrap(); let l2 = self.emit_instr(Instr::JUMP)?; let l3 = self.location(); self.patch_instr(&l1, l3.sub(&l1)?); self.patch_instr(&l2, l1.sub(&l2)?); - self.zero_rep_depth -= 1; - Ok(l1) } // e+ and e+? // - // l1: ... code for e ... - // l2: split_b l1 ( split_a for the non-greedy e+? ) + // l1: ... code for e ... (emitted while visiting child nodes) + // l2: split_b l1 (split_a for the non-greedy e+?) // l3: - (1, None, greedy) => { + (1, None, greedy, _) => { let l1 = self.bookmarks.pop().unwrap(); let l2 = self.emit_instr(if greedy { Instr::SPLIT_B @@ -668,16 +758,15 @@ Instr::SPLIT_A })?; self.patch_instr(&l2, l1.sub(&l2)?); - Ok(l1) } - // e{min,} min > 1 + // e{min,} min > 1 // // ... code for e repeated min - 2 times // l1: ... code for e ... // l2: split_b l1 ( split_a for the non-greedy e{min,}? ) // ... code for e - (min, None, greedy) => { + (min, None, greedy, _) => { assert!(min >= 2); // min == 0 and min == 1 handled above. // `start` and `end` are the locations where the code for `e` @@ -736,8 +825,88 @@ Ok(start) } + // e{0,max} + // + // l1: split_a l4 (emitted by visit_pre_repetition) + // l2: ... code for e ... (emitted while visiting child nodes) + // l3: repeat l2, 0, max + // l4: + // + (0, Some(max), greedy, true) + if max > Self::REPEAT_INSTR_THRESHOLD => + { + debug_assert!(max > 0); + + let l1 = self.bookmarks.pop().unwrap(); + let l2 = self.bookmarks.pop().unwrap(); + let l3 = self.emit_repeat(0, max, greedy); + let l4 = self.location(); + + self.patch_instr(&l1, l4.sub(&l1)?); + self.patch_instr(&l3, l2.sub(&l3)?); + + Ok(l1) + } // e{min,max} // + // if min == 1: + // + // ... code for e ... (emitted while visiting child nodes) + // split: split_a l3 + // l1: ... code for e ... + // l2: repeat l1, min-1, max-1 + // l3: + // + // if min > 1: + // + // ... code for e ... (emitted while visiting child nodes) + // l1: ... code for e ...
+ // l2: repeat l1, min-1, max-1 + // l3: + // + (min, Some(max), greedy, true) + if min > Self::REPEAT_INSTR_THRESHOLD + || max > Self::REPEAT_INSTR_THRESHOLD => + { + debug_assert!(min > 0); + debug_assert!(max >= min); + + let start = self.bookmarks.pop().unwrap(); + let end = self.location(); + + self.start_backward_code_chunk(); + + let split = if min == 1 { + Some(self.emit_instr(if greedy { + Instr::SPLIT_A + } else { + Instr::SPLIT_B + })?) + } else { + None + }; + + let l1 = self.emit_clone(start, end)?; + let l2 = self.emit_repeat(min - 1, max - 1, greedy); + let l3 = self.location(); + + self.patch_instr(&l2, l1.sub(&l2)?); + + if let Some(split) = split { + self.patch_instr(&split, l3.sub(&split)?); + } + + let locations = self.reverse_backward_code_chunks(2); + + Ok(locations[0]) + } + // This is the approach used when the repetition can't be expressed + // using a REPEAT instruction. This happens when we have nested + // repetitions, or when the number of repetitions is not large + // enough. + // + // e{min,max} min >= 0 + // // ... code for e ... -+ // ... code for e ... | min times // ... code for e ... -+ @@ -747,7 +916,7 @@ impl Compiler { // ... code for e ... -+ // end: // - (min, Some(max), greedy) => { + (min, Some(max), greedy, _) => { debug_assert!(min <= max); // `start` and `end` are the locations where the code for `e` @@ -798,10 +967,6 @@ impl Compiler { self.patch_instr(&split, end.sub(&split)?); } - if min == 0 { - self.zero_rep_depth -= 1; - } - Ok(start) } } @@ -838,8 +1003,11 @@ impl hir::Visitor for Compiler { } self.visit_pre_alternation(alternatives)?; } - HirKind::Repetition(rep) => { - self.visit_pre_repetition(rep)?; + HirKind::Repetition(repetition) => { + if repetition.min == 0 { + self.zero_rep_depth += 1; + } + self.visit_pre_repetition(repetition)?; } } @@ -1005,13 +1173,23 @@ impl hir::Visitor for Compiler { (best_atoms, code_loc) } - HirKind::Repetition(repeated) => { - let mut code_loc = self.visit_post_repetition(repeated)?; + HirKind::Repetition(repetition) => { + let mut code_loc = self.visit_post_repetition(repetition)?; code_loc.bck_seq_id = self.backward_code().seq_id(); code_loc.bck = self.backward_code().location(); - if self.zero_rep_depth > 0 { + if repetition.min == 0 { + self.zero_rep_depth -= 1; + } + + // If the minimum number of repetitions is zero (because this + // repetition can be repeated zero times, or because we are inside + // some other repetition that can be repeated zero times) we don't + // extract atoms from the repeated expression. It doesn't make sense + // to extract atoms from a portion of the regexp that may not appear + // in the scanned data. + if repetition.min == 0 || self.zero_rep_depth > 0 { return Ok(()); } @@ -1097,11 +1275,9 @@ impl hir::Visitor for Compiler { } fn visit_concat_in(&mut self) -> Result<(), Self::Err> { - self.bookmarks.push(self.location()); // A new child of a `Concat` node is about to be processed, // create the chunk that will receive the code for this child. - self.backward_code_chunks.push(self.backward_code().next()); - + self.start_backward_code_chunk(); Ok(()) } } @@ -1152,6 +1328,11 @@ impl InstrSeq { } } + /// Returns the code in the [`InstrSeq`] as a slice of raw bytes. + pub fn as_slice(&self) -> &[u8] { + self.seq.get_ref().as_slice() + } + /// Consumes the [`InstrSeq`] and returns the inner vector that contains /// the code. 
pub fn into_inner(self) -> Vec<u8> { @@ -1215,7 +1396,9 @@ impl InstrSeq { } Instr::JUMP => { // Jump instructions are followed by a 32-bit offset that is - // relative to the start of the instruction. + // relative to the start of the instruction. This offset is + // initially set to 0, but later updated with `patch_instr`, + // when the target offset is known. self.seq .write_all(&[0x00; size_of::<instr::Offset>()]) .unwrap(); @@ -1249,6 +1432,30 @@ Ok(location) } + /// Adds a [`Instr::RepeatGreedy`] or [`Instr::RepeatNonGreedy`] + /// instruction at the end of the sequence and returns the location + /// where the newly added instruction resides. + pub fn emit_repeat(&mut self, min: u32, max: u32, greedy: bool) -> usize { + let location = self.location(); + + self.seq + .write_all(&[ + OPCODE_PREFIX, + if greedy { + Instr::REPEAT_GREEDY + } else { + Instr::REPEAT_NON_GREEDY + }, + ]) + .unwrap(); + + self.seq.write_all(&[0x00; size_of::<instr::Offset>()]).unwrap(); + self.seq.write_all(min.to_le_bytes().as_slice()).unwrap(); + self.seq.write_all(max.to_le_bytes().as_slice()).unwrap(); + + location + } + /// Adds a [`Instr::MaskedByte`] instruction at the end of the sequence and /// returns the location where the newly added instruction resides. pub fn emit_masked_byte(&mut self, b: HexByte) -> usize { @@ -1281,7 +1488,7 @@ } } else { // Create a bitmap where the N-th bit is set if byte N is part of - // any of the ranges in the class. + // some of the ranges in the class. let mut bitmap: BitArray<_, Lsb0> = BitArray::new([0_u8; 32]); for range in c.ranges() { let range = range.start() as usize..=range.end() as usize; @@ -1313,6 +1520,13 @@ location } + pub fn emit_bytes(&mut self, bytes: &[u8]) -> Result<usize, Error> { + let location = self.location(); + self.seq.write_all(bytes).unwrap(); + self.update_split_ids(location)?; + Ok(location) + } + /// Emits a clone of the code that goes from `start` to `end`, both /// inclusive. /// @@ -1330,35 +1544,41 @@ // Extend the code by cloning the ranges that go from `start` to `end`. self.seq.get_mut().extend_from_within(start..end); - // Create two slices, one that covers all the previously existing code - // and another that covers the newly cloned code. - let (original_code, cloned_code) = - self.seq.get_mut().as_mut_slice().split_at_mut(location); - // Every split instruction has an ID, we don't want the split - // instructions in the cloned code to have the same IDs than - // in the original code, those IDs need to be updated. + // instructions in the cloned code to have the same IDs as in the + // original code; those IDs need to be updated. + self.update_split_ids(location)?; + + self.seq.seek(SeekFrom::Current(end as i64 - start as i64)).unwrap(); + + Ok(location) + } + + /// Update the ID of split instructions after the given location. + /// + /// The ID of every split instruction that appears after the given location + /// is replaced with a new one that has not been used yet. This function is + /// used after cloning a portion of the code to make sure that the cloned + /// code doesn't have split instructions with the same IDs as in the original + /// code. + fn update_split_ids(&mut self, location: usize) -> Result<(), Error> { + let code = + self.seq.get_mut().as_mut_slice().get_mut(location..).unwrap(); + + let mut offsets = Vec::new(); + + // First parse the code and store the offsets of the split IDs that + // must be updated.
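To make the REPEAT encoding concrete: as emitted above, the instruction occupies the opcode prefix, the opcode byte, a 32-bit offset placeholder that is patched later, and the `min` and `max` counters in little-endian order. A standalone sketch of that layout (0xAA is the opcode prefix and 0x10/0x11 are the REPEAT opcodes defined in instr.rs; the function itself is illustrative, not part of the patch):

fn encode_repeat(min: u32, max: u32, greedy: bool) -> Vec<u8> {
    // Prefix and opcode, as written by `emit_repeat`.
    let mut buf = vec![0xAAu8, if greedy { 0x10 } else { 0x11 }];
    // Offset placeholder, overwritten later by `patch_instr`.
    buf.extend_from_slice(&0i32.to_le_bytes());
    buf.extend_from_slice(&min.to_le_bytes());
    buf.extend_from_slice(&max.to_le_bytes());
    buf // 14 bytes in total: 2 + 4 + 4 + 4
}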
+ for (instr, offset) in InstrParser::new(code) { match instr { Instr::SplitA(_, _) | Instr::SplitB(_, _) | Instr::SplitN(_) => { - debug_assert_eq!( - cloned_code[offset], - original_code[start + offset] - ); - debug_assert_eq!( - cloned_code[offset + 1], - original_code[start + offset + 1] - ); - // Update the split ID, which is at `offset + 2` because - // `offset` is the offset where the opcode starts, and the - // first two bytes are the prefix and the opcode itself. - cloned_code[offset + 2..offset + 2 + size_of::<SplitId>()] - .copy_from_slice( - self.split_id.to_le_bytes().as_slice(), - ); - + // The offset of the SplitId is offset + 2 because the + // opcode starts with the prefix 0xAA followed by the + // byte that identifies the split instruction, and then + // the SplitId. + offsets.push((offset + 2, self.split_id)); if let Some(incremented) = self.split_id.add(1) { self.split_id = incremented } else { @@ -1369,17 +1589,22 @@ } } - self.seq.seek(SeekFrom::Current(end as i64 - start as i64)).unwrap(); + // Update the split IDs + for (offset, split_id) in offsets { + code[offset..offset + size_of::<SplitId>()] + .copy_from_slice(split_id.to_le_bytes().as_slice()); + } - Ok(location) + Ok(()) } /// Patches the offset of the instruction that starts at the given location. /// /// # Panics /// - /// If the instruction at `location` is not one that have an offset as its - /// argument, like [`Instr::Jump`], [`Instr::SplitA`] or [`Instr::SplitB`]. + /// If the instruction at `location` is not one that has an offset among + /// its arguments, like [`Instr::Jump`], [`Instr::SplitA`], [`Instr::SplitB`], + /// etc. pub fn patch_instr(&mut self, location: usize, offset: instr::Offset) { // Save the current position for the forward code in order to restore // it later. @@ -1396,10 +1621,10 @@ assert_eq!(buf[0], OPCODE_PREFIX); match buf[1] { - Instr::JUMP => {} + Instr::JUMP | Instr::REPEAT_GREEDY | Instr::REPEAT_NON_GREEDY => {} Instr::SPLIT_A | Instr::SPLIT_B => { - // Skip the split ID. self.seq + // Skip the split ID. .seek(SeekFrom::Current(size_of::<SplitId>() as i64)) .unwrap(); } @@ -1410,9 +1635,7 @@ // Write the given offset after the instruction opcode. This will // overwrite any existing offsets, usually initialized with 0. - self.seq - .write_all(instr::Offset::to_le_bytes(offset).as_slice()) - .unwrap(); + self.seq.write_all(offset.to_le_bytes().as_slice()).unwrap(); // Restore to the previous current position.
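The patching scheme is the same for JUMP, SPLIT and the new REPEAT instructions: the offset placeholder always sits right after the two opcode bytes (plus the SplitId in the case of splits). A simplified sketch of the seek-write-restore sequence, assuming a bare `Cursor<Vec<u8>>` and ignoring the SplitId case:

use std::io::{Cursor, Seek, SeekFrom, Write};

fn patch_offset(seq: &mut Cursor<Vec<u8>>, location: usize, offset: i32) {
    // Save the current write position so that emission can resume
    // afterwards, exactly like `patch_instr` does.
    let saved = seq.position();
    // Skip the opcode prefix and the opcode itself, then overwrite the
    // placeholder with the final offset, encoded little-endian.
    seq.seek(SeekFrom::Start(location as u64 + 2)).unwrap();
    seq.write_all(&offset.to_le_bytes()).unwrap();
    seq.seek(SeekFrom::Start(saved)).unwrap();
}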
self.seq.seek(SeekFrom::Start(saved_loc as u64)).unwrap(); @@ -1461,10 +1684,7 @@ for _ in 0..n { self.seq - .write_all( - instr::Offset::to_le_bytes(offsets.next().unwrap()) - .as_slice(), - ) + .write_all(offsets.next().unwrap().to_le_bytes().as_slice()) .unwrap(); } @@ -1496,16 +1716,16 @@ impl Display for InstrSeq { writeln!(f, "{:05x}: CASE_INSENSITIVE {:#04x}", addr, c)?; } Instr::ClassRanges(class) => { - write!(f, "{:05x}: CLASS_RANGES ", addr)?; + write!(f, "{:05x}: CLASS_RANGES", addr)?; for range in class.ranges() { - write!(f, "[{:#04x}-{:#04x}] ", range.0, range.1)?; + write!(f, " [{:#04x}-{:#04x}]", range.0, range.1)?; } writeln!(f)?; } Instr::ClassBitmap(class) => { - write!(f, "{:05x}: CLASS_BITMAP ", addr)?; + write!(f, "{:05x}: CLASS_BITMAP", addr)?; for byte in class.bytes() { - write!(f, "{:#04x} ", byte)?; + write!(f, " {:#04x}", byte)?; } writeln!(f)?; } @@ -1514,7 +1734,7 @@ f, "{:05x}: JUMP {:05x}", addr, - addr as isize + offset as isize, + (addr as isize).saturating_add(offset.into()), )?; } Instr::SplitA(id, offset) => { @@ -1523,7 +1743,7 @@ "{:05x}: SPLIT_A({}) {:05x}", addr, id, - addr as isize + offset as isize, + (addr as isize).saturating_add(offset.into()), )?; } Instr::SplitB(id, offset) => { @@ -1532,16 +1752,40 @@ "{:05x}: SPLIT_B({}) {:05x}", addr, id, - addr as isize + offset as isize, + (addr as isize).saturating_add(offset.into()), )?; } Instr::SplitN(split) => { write!(f, "{:05x}: SPLIT_N({})", addr, split.id())?; for offset in split.offsets() { - write!(f, " {:05x}", addr as isize + offset as isize)?; + write!( + f, + " {:05x}", + (addr as isize).saturating_add(offset.into()) + )?; } writeln!(f)?; } + Instr::RepeatGreedy { min, max, offset } => { + writeln!( + f, + "{:05x}: REPEAT_GREEDY {:05x} {}-{}", + addr, + (addr as isize).saturating_add(offset.into()), + min, + max + )?; + } + Instr::RepeatNonGreedy { min, max, offset } => { + writeln!( + f, + "{:05x}: REPEAT_NON_GREEDY {:05x} {}-{}", + addr, + (addr as isize).saturating_add(offset.into()), + min, + max + )?; + } Instr::Start => { writeln!(f, "{:05x}: START", addr)?; } diff --git a/lib/src/re/thompson/instr.rs b/lib/src/re/thompson/instr.rs index 3692af67..a64bfc8e 100644 --- a/lib/src/re/thompson/instr.rs +++ b/lib/src/re/thompson/instr.rs @@ -53,6 +53,7 @@ solely matches the `0xAA` byte. use std::fmt::{Display, Formatter}; use std::mem::size_of; +use std::num::TryFromIntError; use bitvec::order::Lsb0; use bitvec::slice::{BitSlice, IterOnes}; @@ -111,7 +112,49 @@ impl SplitId { /// Offset for jump and split instructions. The offset is always relative to /// the address where the instruction starts. -pub type Offset = i32; +pub struct Offset(i32); + +impl From<Offset> for i32 { + #[inline] + fn from(value: Offset) -> Self { + value.0 + } +} + +impl From<Offset> for isize { + #[inline] + fn from(value: Offset) -> Self { + value.0 as isize + } +} + +impl From<usize> for Offset { + #[inline] + fn from(value: usize) -> Self { + Self(i32::try_from(value).unwrap()) + } +} + +impl TryFrom<isize> for Offset { + type Error = TryFromIntError; + + #[inline] + fn try_from(value: isize) -> Result<Self, Self::Error> { + Ok(Self(i32::try_from(value)?)) + } +} + +impl Offset { + #[inline] + pub fn from_le_bytes(bytes: [u8; size_of::<i32>()]) -> Offset { + Offset(i32::from_le_bytes(bytes)) + } + + #[inline] + pub fn to_le_bytes(&self) -> [u8; size_of::<i32>()] { + self.0.to_le_bytes() + } +} /// Instructions supported by the Pike VM.
pub enum Instr<'a> { @@ -168,6 +211,28 @@ pub enum Instr<'a> { /// continue at the remaining locations. SplitN(SplitN<'a>), + /// Jumps back to the beginning of some repetition, continues with the + /// code that comes after the repetition, or both, depending on the + /// repetition count for the current thread and the minimum and maximum + /// number of repetitions indicated by `min` and `max`. The logic goes + /// as follows: + /// - If the repetition count for the current thread (`rep_count`) is + /// less than `min`, the thread only jumps to the beginning of the + /// repetition, as the minimum number of repetitions has not been + /// reached yet. + /// - If `rep_count` is between `min` and `max`, the current thread jumps + /// to the beginning of the repetition, and a new thread continues + /// executing the code after the repetition. + /// - If `rep_count` has reached `max`, the current thread continues + /// executing the code after the repetition. + RepeatGreedy { offset: Offset, min: u32, max: u32 }, + + /// Similar to [`RepeatGreedy`], the only difference resides in the priority + /// of newly created threads vs the existing thread. In the greedy version + /// of this instruction, going back to the start of the repetition has higher + /// priority, while for this instruction continuing with the code after the + /// repetition has higher priority. + RepeatNonGreedy { offset: Offset, min: u32, max: u32 }, + /// Relative jump. The opcode is followed by an offset; the location /// of the target instruction is computed by adding this offset to the /// location of the jump opcode. @@ -218,6 +283,8 @@ impl<'a> Instr<'a> { pub const WORD_BOUNDARY_NEG: u8 = 0x0D; pub const WORD_START: u8 = 0x0E; pub const WORD_END: u8 = 0x0F; + pub const REPEAT_GREEDY: u8 = 0x10; + pub const REPEAT_NON_GREEDY: u8 = 0x11; } /// Parses a slice of bytes that contains Pike VM instructions, returning @@ -285,6 +352,32 @@ impl<'a> InstrParser<'a> { + size_of::<Offset>() * n as usize, ) } + [OPCODE_PREFIX, Instr::REPEAT_GREEDY, ..] => { + let offset = Self::decode_offset(&code[2..]); + let min = Self::decode_u32(&code[2 + size_of::<Offset>()..]); + let max = Self::decode_u32( + &code[2 + size_of::<Offset>() + size_of::<u32>()..], + ); + ( + Instr::RepeatGreedy { offset, min, max }, + 2 + size_of::<Offset>() + + size_of::<u32>() + + size_of::<u32>(), + ) + } + [OPCODE_PREFIX, Instr::REPEAT_NON_GREEDY, ..] => { + let offset = Self::decode_offset(&code[2..]); + let min = Self::decode_u32(&code[2 + size_of::<Offset>()..]); + let max = Self::decode_u32( + &code[2 + size_of::<Offset>() + size_of::<u32>()..], + ); + ( + Instr::RepeatNonGreedy { offset, min, max }, + 2 + size_of::<Offset>() + + size_of::<u32>() + + size_of::<u32>(), + ) + } [OPCODE_PREFIX, Instr::CLASS_RANGES, ..] => { let n = *unsafe { code.get_unchecked(2) } as usize; let ranges = @@ -318,6 +411,13 @@ } } + fn decode_u32(slice: &[u8]) -> u32 { + let bytes: &[u8; size_of::<u32>()] = + unsafe { &*(slice.as_ptr() as *const [u8; size_of::<u32>()]) }; + + u32::from_le_bytes(*bytes) + } + fn decode_offset(slice: &[u8]) -> Offset { let bytes: &[u8; size_of::<Offset>()] = unsafe { &*(slice.as_ptr() as *const [u8; size_of::<Offset>()]) }; @@ -454,7 +554,7 @@ impl<'a> ClassBitmap<'a> { /// Returns the length of the code emitted for the given literal. /// -/// Usually the code emitted for a literal has the same length than the literal +/// Usually the code emitted for a literal has the same length as the literal /// itself, because each byte in the literal corresponds to one byte in the /// code.
However, this is not true if the literal contains one or more bytes /// equal to [`OPCODE_PREFIX`]. In such cases the code is longer than the diff --git a/lib/src/re/thompson/mod.rs b/lib/src/re/thompson/mod.rs index 4c66571a..baf59b03 100644 --- a/lib/src/re/thompson/mod.rs +++ b/lib/src/re/thompson/mod.rs @@ -1,7 +1,25 @@ -/*! A regexp compiler using the [Thompson's construction][1] algorithm that +/*! A regexp compiler based on the [Thompson's construction][1] algorithm that produces code for the Pike VM described in Russ Cox's article [Regular Expression Matching: the Virtual Machine Approach][2]. +The only fundamental difference from the algorithms described in the cited +articles is the way in which repetitions are handled. In the original +algorithm a repetition like `(abc){3}` is actually implemented by repeating +the pattern three times, as in `abcabcabc`. A pattern like `(abc){2,4}` is +expressed like `abcabc(abc)?(abc)?`. + +This approach is simple, but the size of the code produced for the Pike VM +is very large when the number of repetitions is large. Also, the number of +active threads can become very large, which has a significant impact on +performance. + +In this implementation we introduce two new instructions for the Pike VM: +REPEAT_GREEDY and REPEAT_NON_GREEDY, which are used for expressing some of the +repetitions found in the regular expression, particularly those that are +repeated a large number of times. This also implies that each thread has not +only an instruction pointer but also a repetition count. Both the +instruction pointer and the repetition count are part of the thread's state. + [1]: https://en.wikipedia.org/wiki/Thompson%27s_construction [2]: https://swtch.com/~rsc/regexp/regexp2.html */ diff --git a/lib/src/re/thompson/pikevm.rs b/lib/src/re/thompson/pikevm.rs index 6d146865..cbe37506 100644 --- a/lib/src/re/thompson/pikevm.rs +++ b/lib/src/re/thompson/pikevm.rs @@ -3,7 +3,7 @@ use std::mem; use bitvec::array::BitArray; -use super::instr::{Instr, InstrParser}; +use super::instr::{Instr, InstrParser, Offset}; use crate::re::bitmapset::BitmapSet; use crate::re::thompson::instr::SplitId; use crate::re::{Action, CodeLoc, WideIter, DEFAULT_SCAN_LIMIT}; @@ -17,10 +17,10 @@ pub(crate) struct PikeVM<'r> { /// position within the VM code, pointing to some VM instruction. Each item /// in the set is unique; the VM guarantees that there aren't two active /// threads at the same VM instruction. - threads: BitmapSet, + threads: BitmapSet<u32>, /// The set of threads that will become the active threads when the next /// byte is read from the input. - next_threads: BitmapSet, + next_threads: BitmapSet<u32>, /// Maximum number of bytes to scan. The VM will abort after ingesting /// this number of bytes from the input. scan_limit: u16, @@ -174,6 +174,7 @@ impl<'r> PikeVM<'r> { epsilon_closure( self.code, start, + 0, curr_byte, bck_input.next(), &mut self.cache, @@ -183,8 +184,8 @@ while !self.threads.is_empty() { let next_byte = fwd_input.next(); - for ip in self.threads.iter() { - let (instr, size) = InstrParser::decode_instr(unsafe { + for (ip, rep_count) in self.threads.iter() { + let (instr, instr_size) = InstrParser::decode_instr(unsafe { self.code.get_unchecked(*ip..)
}); @@ -215,7 +216,8 @@ if is_match { epsilon_closure( self.code, - C::from(*ip + size), + C::from(*ip + instr_size), + *rep_count, next_byte, curr_byte, &mut self.cache, @@ -242,7 +244,9 @@ /// its state during the computation of an epsilon closure. See the /// documentation of [`epsilon_closure`] for details. pub struct EpsilonClosureState { - threads: Vec<usize>, + /// Pairs (instruction pointer, repetition count) describing the existing + /// threads. + threads: Vec<(usize, u32)>, /// This bit array has one bit per possible value of SplitId. If the /// split instruction with SplitId = N is executed, the N-th bit in the /// array is set to 1. @@ -308,20 +312,24 @@ impl EpsilonClosureState { pub(crate) fn epsilon_closure<C: CodeLoc>( code: &[u8], start: C, + rep_count: u32, curr_byte: Option<&u8>, prev_byte: Option<&u8>, state: &mut EpsilonClosureState, - closure: &mut BitmapSet, + closure: &mut BitmapSet<u32>, ) { - state.threads.push(start.location()); + state.threads.push((start.location(), rep_count)); state.dirty = true; let is_word_char = |c: u8| c == b'_' || c.is_ascii_alphanumeric(); - while let Some(ip) = state.threads.pop() { - let (instr, size) = + let apply_offset = |ip: usize, offset: Offset| -> usize { + (ip as isize).saturating_add(offset.into()).try_into().unwrap() + }; + + while let Some((ip, mut rep_count)) = state.threads.pop() { + let (instr, instr_size) = InstrParser::decode_instr(unsafe { code.get_unchecked(ip..) }); - let next = ip + size; match instr { Instr::AnyByte | Instr::Byte(_) @@ -330,54 +338,88 @@ | Instr::ClassBitmap(_) | Instr::ClassRanges(_) | Instr::Match => { - closure.insert(ip); + closure.insert(ip, rep_count); } Instr::SplitA(id, offset) => { if !state.executed(id) { - state - .threads - .push((ip as i64 + offset as i64).try_into().unwrap()); - state.threads.push(next); + state.threads.push((apply_offset(ip, offset), rep_count)); + state.threads.push(( + apply_offset(ip, instr_size.into()), + rep_count, + )); } } Instr::SplitB(id, offset) => { if !state.executed(id) { - state.threads.push(next); - state - .threads - .push((ip as i64 + offset as i64).try_into().unwrap()); + state.threads.push(( + apply_offset(ip, instr_size.into()), + rep_count, + )); + state.threads.push((apply_offset(ip, offset), rep_count)); } } Instr::SplitN(split) => { if !state.executed(split.id()) { for offset in split.offsets().rev() { - state.threads.push( - (ip as i64 + offset as i64).try_into().unwrap(), - ); + state + .threads + .push((apply_offset(ip, offset), rep_count)); } } } + Instr::RepeatGreedy { offset, min, max } => { + rep_count += 1; + if rep_count >= min { + state + .threads + .push((apply_offset(ip, instr_size.into()), 0)); + } + if rep_count < max { + state.threads.push((apply_offset(ip, offset), rep_count)); + } + } + Instr::RepeatNonGreedy { offset, min, max } => { + rep_count += 1; + if rep_count < max { + state.threads.push((apply_offset(ip, offset), rep_count)); + } + if rep_count >= min { + state + .threads + .push((apply_offset(ip, instr_size.into()), 0)); + } + } Instr::Jump(offset) => { - state - .threads - .push((ip as i64 + offset as i64).try_into().unwrap()); + state.threads.push((apply_offset(ip, offset), rep_count)); } Instr::Start => { if start.backwards() { if curr_byte.is_none() { - state.threads.push(next); + state.threads.push(( + apply_offset(ip, instr_size.into()), + rep_count, + )); } } else if prev_byte.is_none() { - state.threads.push(next); + state.threads.push(( + apply_offset(ip,
instr_size.into()), + rep_count, + )); } } Instr::End => { if start.backwards() { if prev_byte.is_none() { - state.threads.push(next); + state.threads.push(( + apply_offset(ip, instr_size.into()), + rep_count, + )); } } else if curr_byte.is_none() { - state.threads.push(next); + state.threads.push(( + apply_offset(ip, instr_size.into()), + rep_count, + )); } } Instr::WordStart => { @@ -392,7 +434,10 @@ _ => false, }; if is_match { - state.threads.push(next) + state.threads.push(( + apply_offset(ip, instr_size.into()), + rep_count, + )); } } Instr::WordEnd => { @@ -407,7 +452,10 @@ _ => false, }; if is_match { - state.threads.push(next) + state.threads.push(( + apply_offset(ip, instr_size.into()), + rep_count, + )); } } Instr::WordBoundary | Instr::WordBoundaryNeg => { @@ -422,7 +470,10 @@ } if is_match { - state.threads.push(next) + state.threads.push(( + apply_offset(ip, instr_size.into()), + rep_count, + )); } } } diff --git a/lib/src/re/thompson/tests.rs b/lib/src/re/thompson/tests.rs index 7ea1c21f..45086e15 100644 --- a/lib/src/re/thompson/tests.rs +++ b/lib/src/re/thompson/tests.rs @@ -24,31 +24,39 @@ macro_rules! assert_re_code { assert_eq!($bck, bck_code.to_string()); assert_eq!($atoms, atoms); - let mut fwd_closure = BitmapSet::new(); + let mut fwd_closure = BitmapSet::<u32>::new(); let mut cache = EpsilonClosureState::new(); epsilon_closure( fwd_code.as_ref(), FwdCodeLoc::try_from(0_usize).unwrap(), + 0, None, None, &mut cache, &mut fwd_closure, ); - assert_eq!($fwd_closure, fwd_closure.into_vec()); + assert_eq!( + $fwd_closure, + fwd_closure.iter().map(|(ip, _)| *ip).collect::<Vec<_>>() + ); - let mut bck_closure = BitmapSet::new(); + let mut bck_closure = BitmapSet::<u32>::new(); epsilon_closure( bck_code.as_ref(), BckCodeLoc::try_from(0_usize).unwrap(), + 0, None, None, &mut cache, &mut bck_closure, ); - assert_eq!($bck_closure, bck_closure.into_vec()); + assert_eq!( + $bck_closure, + bck_closure.iter().map(|(ip, _)| *ip).collect::<Vec<_>>() + ); }}; } @@ -440,7 +448,7 @@ fn re_code_9() { 00000: LIT 0x61 00001: LIT 0x62 00002: LIT 0x63 -00003: CLASS_RANGES [0x30-0x32] [0x78-0x79] +00003: CLASS_RANGES [0x30-0x32] [0x78-0x79] 0000a: LIT 0x64 0000b: LIT 0x65 0000c: LIT 0x66 @@ -451,7 +459,7 @@ 00000: LIT 0x66 00001: LIT 0x65 00002: LIT 0x64 -00003: CLASS_RANGES [0x30-0x32] [0x78-0x79] +00003: CLASS_RANGES [0x30-0x32] [0x78-0x79] 0000a: LIT 0x63 0000b: LIT 0x62 0000c: LIT 0x61 @@ -497,7 +505,7 @@ fn re_code_10() { 00001: LIT 0x62 00002: LIT 0x63 00003: LIT 0x64 -00004: CLASS_BITMAP 0x30 0x32 0x34 0x61 0x63 0x65 0x67 0x69 0x6b 0x6d 0x6f 0x71 0x73 0x75 0x77 0x79 +00004: CLASS_BITMAP 0x30 0x32 0x34 0x61 0x63 0x65 0x67 0x69 0x6b 0x6d 0x6f 0x71 0x73 0x75 0x77 0x79 00026: LIT 0x65 00027: LIT 0x66 00028: MATCH "#, r#" 00000: LIT 0x66 00001: LIT 0x65 -00002: CLASS_BITMAP 0x30 0x32 0x34 0x61 0x63 0x65 0x67 0x69 0x6b 0x6d 0x6f 0x71 0x73 0x75 0x77 0x79 +00002: CLASS_BITMAP 0x30 0x32 0x34 0x61 0x63 0x65 0x67 0x69 0x6b 0x6d 0x6f 0x71 0x73 0x75 0x77 0x79 00024: LIT 0x64 00025: LIT 0x63 00026: LIT 0x62 @@ -742,16 +750,10 @@ fn re_code_14() { 00016: MATCH "#, // Atoms - vec![ - RegexpAtom { - atom: Atom::inexact(vec![0x61, 0x62, 0x63]), - code_loc: CodeLoc { fwd: 0x08, bck_seq_id: 0, bck: 0x16 } - }, - RegexpAtom { - atom: Atom::exact(vec![]), - code_loc: CodeLoc { fwd: 0x08, bck_seq_id: 0, bck: 0x16 } - } - ], + vec![RegexpAtom { + atom: Atom::inexact(vec![]), + code_loc:
CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x00 } + }], // Epsilon closure starting at forward code 0. vec![0x08, 0x16], // Epsilon closure starting at backward code 0. @@ -786,20 +788,10 @@ fn re_code_15() { 0002b: MATCH "#, // Atoms - vec![ - RegexpAtom { - atom: Atom::inexact(vec![0x61]), - code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x2b } - }, - RegexpAtom { - atom: Atom::inexact(vec![0x62]), - code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x2b } - }, - RegexpAtom { - atom: Atom::exact(vec![]), - code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x2b } - } - ], + vec![RegexpAtom { + atom: Atom::inexact(vec![]), + code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x00 } + }], // Epsilon closure starting at forward code 0. vec![0x15, 0x24, 0x2b], // Epsilon closure starting at backward code 0. @@ -1029,9 +1021,9 @@ fn re_code_21() { r#"(?is)[a-z]{1,2}ab"#, // Forward code r#" -00000: CLASS_RANGES [0x41-0x5a] [0x61-0x7a] +00000: CLASS_RANGES [0x41-0x5a] [0x61-0x7a] 00007: SPLIT_A(0) 00016 -0000f: CLASS_RANGES [0x41-0x5a] [0x61-0x7a] +0000f: CLASS_RANGES [0x41-0x5a] [0x61-0x7a] 00016: MASKED_BYTE 0x41 0xdf 0001a: MASKED_BYTE 0x42 0xdf 0001e: MATCH @@ -1040,9 +1032,9 @@ fn re_code_21() { r#" 00000: MASKED_BYTE 0x42 0xdf 00004: MASKED_BYTE 0x41 0xdf -00008: CLASS_RANGES [0x41-0x5a] [0x61-0x7a] +00008: CLASS_RANGES [0x41-0x5a] [0x61-0x7a] 0000f: SPLIT_A(0) 0001e -00017: CLASS_RANGES [0x41-0x5a] [0x61-0x7a] +00017: CLASS_RANGES [0x41-0x5a] [0x61-0x7a] 0001e: MATCH "#, // Atoms @@ -1126,6 +1118,104 @@ fn re_code_22() { ); } +#[test] +fn re_code_23() { + assert_re_code!( + r#"(abc+d+){200}"#, + // Forward code + r#" +00000: LIT 0x61 +00001: LIT 0x62 +00002: LIT 0x63 +00003: SPLIT_B(0) 00002 +0000b: LIT 0x64 +0000c: SPLIT_B(1) 0000b +00014: LIT 0x61 +00015: LIT 0x62 +00016: LIT 0x63 +00017: SPLIT_B(2) 00016 +0001f: LIT 0x64 +00020: SPLIT_B(3) 0001f +00028: REPEAT_GREEDY 00014 199-199 +00036: MATCH +"#, + // Backward code + r#" +00000: LIT 0x64 +00001: SPLIT_B(2) 00000 +00009: LIT 0x63 +0000a: SPLIT_B(3) 00009 +00012: LIT 0x62 +00013: LIT 0x61 +00014: REPEAT_GREEDY 00000 199-199 +00022: LIT 0x64 +00023: SPLIT_B(1) 00022 +0002b: LIT 0x63 +0002c: SPLIT_B(0) 0002b +00034: LIT 0x62 +00035: LIT 0x61 +00036: MATCH +"#, + // Atoms + vec![RegexpAtom { + atom: Atom::inexact(vec![0x61, 0x62, 0x63]), + code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x36 } + },], + // Epsilon closure starting at forward code 0. + vec![0x00], + // Epsilon closure starting at backward code 0. + vec![0x00] + ); +} + +#[test] +fn re_code_24() { + assert_re_code!( + r#"(a{2,3}?b){1,13}?"#, + // Forward code + r#" +00000: LIT 0x61 +00001: LIT 0x61 +00002: SPLIT_B(0) 0000b +0000a: LIT 0x61 +0000b: LIT 0x62 +0000c: SPLIT_B(1) 0002e +00014: LIT 0x61 +00015: LIT 0x61 +00016: SPLIT_B(2) 0001f +0001e: LIT 0x61 +0001f: LIT 0x62 +00020: REPEAT_NON_GREEDY 00014 0-12 +0002e: MATCH +"#, + // Backward code + r#" +00000: SPLIT_B(1) 00022 +00008: LIT 0x62 +00009: LIT 0x61 +0000a: LIT 0x61 +0000b: SPLIT_B(2) 00014 +00013: LIT 0x61 +00014: REPEAT_NON_GREEDY 00008 0-12 +00022: LIT 0x62 +00023: LIT 0x61 +00024: LIT 0x61 +00025: SPLIT_B(0) 0002e +0002d: LIT 0x61 +0002e: MATCH +"#, + // Atoms + vec![RegexpAtom { + atom: Atom::inexact(vec![0x61, 0x61]), + code_loc: CodeLoc { fwd: 0x00, bck_seq_id: 0, bck: 0x2e } + },], + // Epsilon closure starting at forward code 0. + vec![0x00], + // Epsilon closure starting at backward code 0. 
+ vec![0x22, 0x08] + ); +} + #[rustfmt::skip] #[test] fn re_atoms() { @@ -1167,6 +1257,11 @@ fn re_atoms() { ] ); + assert_re_atoms!( + r#"ab{0,2}cd"#, + vec![Atom::inexact(b"cd")] + ); + assert_re_atoms!( r#"ab.*cde"#, vec![Atom::inexact(b"cde")] diff --git a/lib/src/tests/mod.rs b/lib/src/tests/mod.rs index 72e00183..01008a27 100644 --- a/lib/src/tests/mod.rs +++ b/lib/src/tests/mod.rs @@ -1255,6 +1255,74 @@ fn regexp_patterns_2() { #[test] fn regexp_patterns_3() { + pattern_match!(r#"/.b{15}/"#, b"abbbbbbbbbbbbbbb", b"abbbbbbbbbbbbbbb"); + pattern_match!( + r#"/.b{15,16}/"#, + b"abbbbbbbbbbbbbbbb", + b"abbbbbbbbbbbbbbbb" + ); + pattern_match!( + r#"/.b{15,16}?/"#, + b"abbbbbbbbbbbbbbbb", + b"abbbbbbbbbbbbbbb" + ); + pattern_match!( + r#"/ab{15,16}?c/"#, + b"abbbbbbbbbbbbbbbc", + b"abbbbbbbbbbbbbbbc" + ); + pattern_match!( + r#"/.b{15,16}cccc/"#, + b"abbbbbbbbbbbbbbbbcccc", + b"abbbbbbbbbbbbbbbbcccc" + ); + pattern_match!( + r#"/.b{15,16}?cccc/"#, + b"abbbbbbbbbbbbbbbcccc", + b"abbbbbbbbbbbbbbbcccc" + ); + pattern_match!( + r#"/a.b{15,16}cccc/"#, + b"aabbbbbbbbbbbbbbbcccc", + b"aabbbbbbbbbbbbbbbcccc" + ); + pattern_match!( + r#"/abcd.{0,11}efgh.{0,11}ijk/"#, + b"abcd123456789ABefgh123456789ABijk", + b"abcd123456789ABefgh123456789ABijk" + ); + pattern_match!( + r#"/abcd.{0,11}?efgh.{0,11}?ijk/"#, + b"abcd123456789ABefgh123456789ABijk", + b"abcd123456789ABefgh123456789ABijk" + ); + pattern_match!(r#"/abcd.{0,11}?abcd/"#, b"abcdabcdabcd", b"abcdabcd"); + pattern_match!(r#"/abcd.{0,11}abcd/"#, b"abcdabcdabcd", b"abcdabcdabcd"); + pattern_match!(r#"/ab{2,15}c/"#, b"abbbc", b"abbbc"); + pattern_match!(r#"/ab{2,15}?c/"#, b"abbbc", b"abbbc"); + pattern_match!(r#"/ab{0,15}?c/"#, b"abc", b"abc"); + pattern_match!(r#"/ab{,15}?c/"#, b"abc", b"abc"); + pattern_match!(r#"/a{0,15}bc/"#, b"bbc", b"bc"); + pattern_match!(r#"/a{0,15}?bc/"#, b"abc", b"abc"); + pattern_match!(r#"/a{0,15}?bc/"#, b"bc", b"bc"); + pattern_match!(r#"/aa{0,15}?bc/"#, b"abc", b"abc"); + pattern_match!(r#"/aa{0,15}bc/"#, b"abc", b"abc"); + pattern_match!(r#"/ab{11}c/"#, b"abbbbbbbbbbbc", b"abbbbbbbbbbbc"); + pattern_false!(r#"/ab{11}c/"#, b"ac"); + pattern_match!(r#"/ab{11,}c/"#, b"abbbbbbbbbbbbc", b"abbbbbbbbbbbbc"); + pattern_false!(r#"/ab{11,}b/"#, b"abbbbbbbbbbb"); + pattern_match!(r#"/ab{0,11}c/"#, b"abbbbbbbbbc", b"abbbbbbbbbc"); + pattern_match!(r#"/(a{2,13}b){2,13}/"#, b"aabaaabaab", b"aabaaabaab"); + pattern_match!(r#"/(a{2,13}?b){2,13}?/"#, b"aabaaabaab", b"aabaaab"); + pattern_match!( + r#"/(a{4,5}b){4,15}/"#, + b"aaaabaaaabaaaaabaaaaab", + b"aaaabaaaabaaaaabaaaaab" + ); +} + +#[test] +fn regexp_patterns_4() { pattern_match!(r#"/a[bx]c/"#, b"abc", b"abc"); pattern_match!(r#"/a[bx]c/"#, b"axc", b"axc"); pattern_match!(r#"/a[0-9]*b/"#, b"ab", b"ab"); @@ -1375,7 +1443,7 @@ fn regexp_patterns_3() { } #[test] -fn regexp_patterns_4() { +fn regexp_patterns_5() { pattern_match!(r"/\\/", b"\\", b"\\"); pattern_match!(r"/\babc/", b"abc", b"abc"); pattern_match!(r"/abc\b/", b"abc", b"abc"); @@ -1445,7 +1513,7 @@ fn regexp_patterns_4() { } #[test] -fn regexp_patterns_5() { +fn regexp_patterns_6() { rule_true!( r#"rule test { strings: