From 94940db2e3580427f85d9a879b9a085c25c42afd Mon Sep 17 00:00:00 2001
From: Matthew Herzl
Date: Wed, 25 Sep 2024 14:32:16 -0500
Subject: [PATCH] Add range-based iteration

---
 roaring/Cargo.toml                          |   1 +
 roaring/src/bitmap/iter.rs                  |   7 +-
 roaring/src/bitmap/mod.rs                   |   1 +
 roaring/src/bitmap/range.rs                 | 125 ++++++++++++
 roaring/src/bitmap/store/array_store/mod.rs |  13 ++
 roaring/src/bitmap/store/bitmap_store.rs    | 215 ++++++++++++++++++++
 roaring/src/bitmap/store/mod.rs             |  46 +++++
 roaring/src/bitmap/util.rs                  |  20 +-
 roaring/tests/range.rs                      |  70 +++++++
 9 files changed, 487 insertions(+), 11 deletions(-)
 create mode 100644 roaring/src/bitmap/range.rs
 create mode 100644 roaring/tests/range.rs

diff --git a/roaring/Cargo.toml b/roaring/Cargo.toml
index 3c81d047..34af5310 100644
--- a/roaring/Cargo.toml
+++ b/roaring/Cargo.toml
@@ -19,6 +19,7 @@ license = "MIT OR Apache-2.0"
 bytemuck = { workspace = true, optional = true }
 byteorder = { workspace = true, optional = true }
 serde = { workspace = true, optional = true }
+num = { version = "0.4", default-features = false }
 
 [features]
 default = ["std"]
diff --git a/roaring/src/bitmap/iter.rs b/roaring/src/bitmap/iter.rs
index 59463a21..6ae5d3ed 100644
--- a/roaring/src/bitmap/iter.rs
+++ b/roaring/src/bitmap/iter.rs
@@ -20,11 +20,14 @@ pub struct IntoIter {
     size_hint: u64,
 }
 
-impl Iter<'_> {
-    fn new(containers: &[Container]) -> Iter {
+impl<'a> Iter<'a> {
+    pub(super) fn new(containers: &[Container]) -> Iter {
         let size_hint = containers.iter().map(|c| c.len()).sum();
         Iter { inner: containers.iter().flatten(), size_hint }
     }
+    pub(super) fn empty() -> Iter<'a> {
+        Iter { inner: [].iter().flatten(), size_hint: 0 }
+    }
 }
 
 impl IntoIter {
diff --git a/roaring/src/bitmap/mod.rs b/roaring/src/bitmap/mod.rs
index ed5567d5..5417617a 100644
--- a/roaring/src/bitmap/mod.rs
+++ b/roaring/src/bitmap/mod.rs
@@ -14,6 +14,7 @@ mod iter;
 mod ops;
 #[cfg(feature = "std")]
 mod ops_with_serialized;
+mod range;
 #[cfg(feature = "serde")]
 mod serde;
 #[cfg(feature = "std")]
diff --git a/roaring/src/bitmap/range.rs b/roaring/src/bitmap/range.rs
new file mode 100644
index 00000000..82cbeb0e
--- /dev/null
+++ b/roaring/src/bitmap/range.rs
@@ -0,0 +1,125 @@
+use core::ops::RangeBounds;
+use core::ops::RangeInclusive;
+
+use super::container::Container;
+use super::iter;
+use super::store;
+use super::util;
+use crate::RoaringBitmap;
+
+/// Iterator over a consecutive subsequence of a bitmap.
+/// Efficient; O( log[n] + k ),
+/// where n is the bitmap's length
+/// and k is the subsequence's length.
+pub struct RangeIter<'a> {
+    first: store::StorePartIter<'a>,
+    between: iter::Iter<'a>,
+    last: store::StorePartIter<'a>,
+}
+
+impl<'a> RangeIter<'a> {
+    pub fn new<R>(containers: &'a [Container], range: R) -> RangeIter<'a>
+    where
+        R: RangeBounds<u32>,
+    {
+        let (start, end) = match util::convert_range_to_inclusive(range) {
+            Some(range) => (*range.start(), *range.end()),
+            None => return RangeIter::empty(),
+        };
+
+        let (start_key, start_low) = util::split(start);
+        let (end_key, end_low) = util::split(end);
+
+        let s = containers.binary_search_by_key(&start_key, |c| c.key);
+        let e = containers.binary_search_by_key(&end_key, |c| c.key);
+
+        if s == e {
+            // single container
+            return match s {
+                Ok(i) => RangeIter {
+                    first: Self::container_part(&containers[i], start_low..=end_low, start_key),
+                    between: iter::Iter::empty(),
+                    last: store::StorePartIter::empty(),
+                },
+                Err(_) => RangeIter::empty(), // nothing to iterate over
+            };
+        }
+
+        // multiple containers
+        let (first, inner_start) = match s {
+            Ok(i) => (Self::container_part(&containers[i], start_low..=u16::MAX, start_key), i + 1),
+            Err(i) => (store::StorePartIter::empty(), i),
+        };
+        let (last, inner_stop) = match e {
+            Ok(i) => (Self::container_part(&containers[i], u16::MIN..=end_low, end_key), i),
+            Err(i) => (store::StorePartIter::empty(), i),
+        };
+        let between = iter::Iter::new(&containers[inner_start..inner_stop]);
+
+        RangeIter { first, between, last }
+    }
+    fn container_part(
+        container: &Container,
+        range: RangeInclusive<u16>,
+        key: u16,
+    ) -> store::StorePartIter {
+        store::StorePartIter::new(key, &container.store, range)
+    }
+    fn empty() -> RangeIter<'a> {
+        RangeIter {
+            first: store::StorePartIter::empty(),
+            between: iter::Iter::empty(),
+            last: store::StorePartIter::empty(),
+        }
+    }
+}
+
+impl<'a> Iterator for RangeIter<'a> {
+    type Item = u32;
+
+    fn next(&mut self) -> Option<u32> {
+        if let f @ Some(_) = self.first.next() {
+            return f;
+        }
+        if let b @ Some(_) = self.between.next() {
+            return b;
+        }
+        self.last.next()
+    }
+}
+
+impl RoaringBitmap {
+    /// Efficiently obtains an iterator over the specified range.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use roaring::RoaringBitmap;
+    ///
+    /// let rb = RoaringBitmap::from_sorted_iter(10..5000).unwrap();
+    ///
+    /// let expected = vec![10, 11, 12];
+    /// let actual: Vec<u32> = rb.range(0..13).collect();
+    /// assert_eq!(expected, actual);
+    /// ```
+    pub fn range<R>(&self, range: R) -> RangeIter
+    where
+        R: RangeBounds<u32>,
+    {
+        RangeIter::new(&self.containers, range)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_range_bitmap() {
+        let rb = RoaringBitmap::from_sorted_iter(10..5000).unwrap();
+
+        let expected = vec![10, 11, 12];
+        let actual: Vec<u32> = rb.range(0..13).collect();
+        assert_eq!(expected, actual);
+    }
+}
diff --git a/roaring/src/bitmap/store/array_store/mod.rs b/roaring/src/bitmap/store/array_store/mod.rs
index 883db31f..7996139d 100644
--- a/roaring/src/bitmap/store/array_store/mod.rs
+++ b/roaring/src/bitmap/store/array_store/mod.rs
@@ -235,6 +235,19 @@ impl ArrayStore {
         &self.vec
     }
 
+    pub fn range_iter(&self, range: RangeInclusive<u16>) -> core::slice::Iter<u16> {
+        let start_index = match self.vec.binary_search(range.start()) {
+            Ok(i) => i,
+            Err(i) => i,
+        };
+        let end_index = match self.vec.binary_search(range.end()) {
+            Ok(i) => i + 1,
+            Err(i) => i,
+        };
+        let r = start_index..end_index;
+        self.vec[r].iter()
+    }
+
     /// Retains only the elements specified by the predicate.
     pub fn retain(&mut self, mut f: impl FnMut(u16) -> bool) {
         // Idea to avoid branching from "Engineering Fast Indexes for Big Data
diff --git a/roaring/src/bitmap/store/bitmap_store.rs b/roaring/src/bitmap/store/bitmap_store.rs
index f349a2aa..9ca7d5c9 100644
--- a/roaring/src/bitmap/store/bitmap_store.rs
+++ b/roaring/src/bitmap/store/bitmap_store.rs
@@ -1,8 +1,10 @@
 use core::borrow::Borrow;
 use core::cmp::Ordering;
 use core::fmt::{Display, Formatter};
+use core::ops::RangeBounds;
 use core::ops::{BitAndAssign, BitOrAssign, BitXorAssign, RangeInclusive, SubAssign};
 
+use super::super::util;
 use super::ArrayStore;
 
 #[cfg(not(feature = "std"))]
@@ -308,6 +310,10 @@ impl BitmapStore {
         BitmapIter::new(self.bits)
     }
 
+    pub fn range_iter<'a>(&'a self, range: RangeInclusive<u16>) -> BlockRangeIter<'a> {
+        BlockRangeIter::new(&self.bits, range)
+    }
+
     pub fn as_array(&self) -> &[u64; BITMAP_LENGTH] {
         &self.bits
     }
@@ -403,6 +409,161 @@ impl Display for Error {
 #[cfg(feature = "std")]
 impl std::error::Error for Error {}
 
+/// Iterator over a consecutive subsequence of a BitmapStore.
+/// A 'Block' is one of the 1024 u64s forming the BitmapStore.
+pub struct BlockRangeIter<'a> {
+    first: BlockPartIter,
+    between: BlockSeqIter<'a>,
+    last: BlockPartIter,
+}
+
+impl<'a> Iterator for BlockRangeIter<'a> {
+    type Item = u16;
+    fn next(&mut self) -> Option<u16> {
+        if let f @ Some(_) = self.first.next() {
+            return f;
+        }
+        if let b @ Some(_) = self.between.next() {
+            return b;
+        }
+        self.last.next()
+    }
+}
+
+impl<'a> BlockRangeIter<'a> {
+    pub(crate) fn new<R>(bits: &'a [u64; BITMAP_LENGTH], range: R) -> BlockRangeIter<'a>
+    where
+        R: RangeBounds<u16>,
+    {
+        let inc_range = match util::convert_range_to_inclusive(range) {
+            Some(range) => range,
+            None => return BlockRangeIter::empty(),
+        };
+        let (start, end) = (*inc_range.start(), *inc_range.end());
+
+        let (start_key, start_bit) = (key(start), bit(start));
+        let (end_key, end_bit) = (key(end), bit(end));
+
+        if start_key == end_key {
+            // single u64
+            let block_iter =
+                Self::partial_block_iter(start_key, bits[start_key], start_bit..=end_bit);
+            return BlockRangeIter::single_block(block_iter);
+        }
+
+        let first = Self::partial_block_iter(start_key, bits[start_key], start_bit..=63);
+        let start_key_p1 = start_key + 1;
+        let between = BlockSeqIter::new(start_key_p1, &bits[start_key_p1..end_key]);
+        let last = Self::partial_block_iter(end_key, bits[end_key], 0..=end_bit);
+
+        BlockRangeIter { first, between, last }
+    }
+    #[inline]
+    fn start_mask(s: usize) -> u64 {
+        match s {
+            0 => u64::MAX,
+            _ if s >= 64 => u64::MIN,
+            _ => !((1u64 << s) - 1),
+        }
+    }
+    #[inline]
+    fn end_mask(e: usize) -> u64 {
+        match e {
+            _ if e >= 63 => u64::MAX,
+            _ => (1u64 << (e + 1)) - 1,
+        }
+    }
+    fn partial_block_iter(key: usize, block: u64, range: RangeInclusive<usize>) -> BlockPartIter {
+        let (start, end) = (*range.start(), *range.end());
+        let start_mask = Self::start_mask(start);
+        let end_mask = Self::end_mask(end);
+        BlockPartIter::new(key, block & start_mask & end_mask)
+    }
+    fn empty() -> BlockRangeIter<'a> {
+        BlockRangeIter {
+            first: BlockPartIter::empty(),
+            between: BlockSeqIter::empty(),
+            last: BlockPartIter::empty(),
+        }
+    }
+    fn single_block(block: BlockPartIter) -> BlockRangeIter<'a> {
+        BlockRangeIter {
+            first: block,
+            between: BlockSeqIter::empty(),
+            last: BlockPartIter::empty(),
+        }
+    }
+}
+
+struct BlockPartIter {
+    key: usize,
+    value: u64,
+}
+
+impl BlockPartIter {
+    fn new(key: usize, block: u64) -> BlockPartIter {
+        BlockPartIter { key, value: block }
+    }
+    fn empty() -> BlockPartIter {
+        Self::new(0, 0)
+    }
+}
+
+impl Iterator for BlockPartIter {
+    type Item = u16;
+    fn next(&mut self) -> Option<u16> {
+        if self.value == 0 {
+            return None;
+        }
+        let index = self.value.trailing_zeros() as usize;
+        self.value &= self.value - 1;
+        Some((64 * self.key + index) as u16)
+    }
+}
+
+/// Iterator over the set bits of a run of consecutive, whole blocks.
+///
+/// The block currently being drained is kept in `current`, so every bit
+/// that is yielded is consumed and iteration always makes progress.
+struct BlockSeqIter<'a> {
+    /// Bits of the current block that have not been yielded yet.
+    current: BlockPartIter,
+    /// Key of the block that will be loaded from `remaining` next.
+    next_key: usize,
+    /// Blocks that have not been loaded into `current` yet.
+    remaining: &'a [u64],
+}
+
+impl<'a> BlockSeqIter<'a> {
+    fn new(start_key: usize, block_range: &'a [u64]) -> BlockSeqIter<'a> {
+        BlockSeqIter {
+            current: BlockPartIter::empty(),
+            next_key: start_key,
+            remaining: block_range,
+        }
+    }
+    fn empty() -> BlockSeqIter<'a> {
+        Self::new(0, &[])
+    }
+}
+
+impl<'a> Iterator for BlockSeqIter<'a> {
+    type Item = u16;
+    fn next(&mut self) -> Option<u16> {
+        loop {
+            if let low @ Some(_) = self.current.next() {
+                return low;
+            }
+            // The current block is exhausted: load the next one, or stop
+            // once every block has been consumed.
+            let (&block, rest) = self.remaining.split_first()?;
+            self.current = BlockPartIter::new(self.next_key, block);
+            self.next_key += 1;
+            self.remaining = rest;
+        }
+    }
+}
+
 pub struct BitmapIter<B: Borrow<[u64; BITMAP_LENGTH]>> {
     key: usize,
     value: u64,
@@ -556,6 +717,60 @@ impl BitXorAssign<&ArrayStore> for BitmapStore {
 mod tests {
     use super::*;
 
+    #[test]
+    fn partial_block_iter() {
+        let b = 0b1111111111111111111111111111111111111111111111111111111111111111;
+        let r = 0..=63;
+        let expected: Vec<u16> = (0..64).collect();
+        let actual: Vec<u16> = BlockRangeIter::partial_block_iter(0, b, r).collect();
+        assert_eq!(expected, actual);
+        let b = 0b0000000000000000000000000000000000000000000000000000000000111111;
+        let r = 0..=63;
+        let expected: Vec<u16> = (0..=5).collect();
+        let actual: Vec<u16> = BlockRangeIter::partial_block_iter(0, b, r).collect();
+        assert_eq!(expected, actual);
+        let b = 0b0000000000000000000000000000000000000000000000000000000000101010;
+        let r = 0..=63;
+        let expected: Vec<u16> = vec![1, 3, 5];
+        let actual: Vec<u16> = BlockRangeIter::partial_block_iter(0, b, r).collect();
+        assert_eq!(expected, actual);
+        let b = 0b0000000000000000000000000000000000000000000000000000000000101010;
+        let r = 2..=63;
+        let expected: Vec<u16> = vec![3, 5];
+        let actual: Vec<u16> = BlockRangeIter::partial_block_iter(0, b, r).collect();
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn start_mask() {
+        let e = 0b1111111111111111111111111111111111111111111111111111111111111111;
+        assert_eq!(e, BlockRangeIter::start_mask(0));
+        let e = 0b1111111111111111111111111111111111111111111111111111111111111110;
+        assert_eq!(e, BlockRangeIter::start_mask(1));
+        let e = 0b1000000000000000000000000000000000000000000000000000000000000000;
+        assert_eq!(e, BlockRangeIter::start_mask(63));
+        let e = 0b0000000000000000000000000000000000000000000000000000000000000000;
+        assert_eq!(e, BlockRangeIter::start_mask(64));
+        let e = 0b0000000000000000000000000000000000000000000000000000000000000000;
+        assert_eq!(e, BlockRangeIter::start_mask(65));
+    }
+
+    #[test]
+    fn end_mask() {
+        let e = 0b0000000000000000000000000000000000000000000000000000000000000001;
+        assert_eq!(e, BlockRangeIter::end_mask(0));
+        let e = 0b0000000000000000000000000000000000000000000000000000000000000011;
+        assert_eq!(e, BlockRangeIter::end_mask(1));
+        let e = 0b0111111111111111111111111111111111111111111111111111111111111111;
+        assert_eq!(e, BlockRangeIter::end_mask(62));
+        let e = 0b1111111111111111111111111111111111111111111111111111111111111111;
+        assert_eq!(e, BlockRangeIter::end_mask(63));
+        let e = 0b1111111111111111111111111111111111111111111111111111111111111111;
+        assert_eq!(e, BlockRangeIter::end_mask(64));
+        let e = 0b1111111111111111111111111111111111111111111111111111111111111111;
+        assert_eq!(e, BlockRangeIter::end_mask(65));
+    }
+
     #[test]
     fn test_bitmap_remove_smallest() {
         let mut store = BitmapStore::new();
diff --git a/roaring/src/bitmap/store/mod.rs b/roaring/src/bitmap/store/mod.rs
index bb0d5822..3777e310 100644
--- a/roaring/src/bitmap/store/mod.rs
+++ b/roaring/src/bitmap/store/mod.rs
@@ -10,6 +10,7 @@ use core::slice;
 
 pub use self::bitmap_store::BITMAP_LENGTH;
 use self::Store::{Array, Bitmap};
+use super::util;
 
 pub use self::array_store::ArrayStore;
 pub use self::bitmap_store::{BitmapIter, BitmapStore};
@@ -32,6 +33,51 @@ pub enum Iter<'a> {
     BitmapOwned(BitmapIter<Box<[u64; BITMAP_LENGTH]>>),
 }
 
+/// Iterator over a consecutive subsequence of a Store,
+/// whether the Store contains an ArrayStore or a BitmapStore.
+pub enum StorePartIter<'a> {
+    Array { key: u16, values: slice::Iter<'a, u16> },
+    Bitmap { key: u16, values: bitmap_store::BlockRangeIter<'a> },
+}
+
+impl<'a> StorePartIter<'a> {
+    pub fn new(key: u16, store: &'a Store, range: RangeInclusive<u16>) -> StorePartIter<'a> {
+        match store {
+            Array(array_store) => {
+                StorePartIter::Array { key, values: array_store.range_iter(range) }
+            }
+            Bitmap(bitmap_store) => {
+                StorePartIter::Bitmap { key, values: bitmap_store.range_iter(range) }
+            }
+        }
+    }
+    pub fn empty() -> StorePartIter<'a> {
+        StorePartIter::Array { key: 0, values: [].iter() }
+    }
+}
+
+impl<'a> Iterator for StorePartIter<'a> {
+    type Item = u32;
+    fn next(&mut self) -> Option<u32> {
+        match self {
+            StorePartIter::Array { key, values } => {
+                if let Some(low) = values.next().cloned() {
+                    Some(util::join(*key, low))
+                } else {
+                    None
+                }
+            }
+            StorePartIter::Bitmap { key, values } => {
+                if let Some(low) = values.next() {
+                    Some(util::join(*key, low))
+                } else {
+                    None
+                }
+            }
+        }
+    }
+}
+
 impl Store {
     pub fn new() -> Store {
         Store::Array(ArrayStore::new())
diff --git a/roaring/src/bitmap/util.rs b/roaring/src/bitmap/util.rs
index 3565c34a..dcc04f82 100644
--- a/roaring/src/bitmap/util.rs
+++ b/roaring/src/bitmap/util.rs
@@ -1,4 +1,5 @@
 use core::ops::{Bound, RangeBounds, RangeInclusive};
+use num::{Bounded, CheckedAdd, CheckedSub, Integer};
 
 /// Returns the container key and the index
 /// in this container for a given integer.
@@ -14,20 +15,21 @@ pub fn join(high: u16, low: u16) -> u32 {
     (u32::from(high) << 16) + u32::from(low)
 }
 
-/// Convert a `RangeBounds<u32>` object to `RangeInclusive<u32>`,
-pub fn convert_range_to_inclusive<R>(range: R) -> Option<RangeInclusive<u32>>
+/// Convert a `RangeBounds<T>` object to `RangeInclusive<T>`,
+pub fn convert_range_to_inclusive<T, R>(range: R) -> Option<RangeInclusive<T>>
 where
-    R: RangeBounds<u32>,
+    R: RangeBounds<T>,
+    T: Integer + CheckedAdd + CheckedSub + Bounded + PartialOrd + Copy,
 {
-    let start: u32 = match range.start_bound() {
+    let start: T = match range.start_bound() {
         Bound::Included(&i) => i,
-        Bound::Excluded(&i) => i.checked_add(1)?,
-        Bound::Unbounded => 0,
+        Bound::Excluded(&i) => i.checked_add(&T::one())?,
+        Bound::Unbounded => T::zero(),
     };
-    let end: u32 = match range.end_bound() {
+    let end: T = match range.end_bound() {
         Bound::Included(&i) => i,
-        Bound::Excluded(&i) => i.checked_sub(1)?,
-        Bound::Unbounded => u32::MAX,
+        Bound::Excluded(&i) => i.checked_sub(&T::one())?,
+        Bound::Unbounded => T::max_value(),
     };
     if end < start {
         return None;
diff --git a/roaring/tests/range.rs b/roaring/tests/range.rs
new file mode 100644
index 00000000..afb75e41
--- /dev/null
+++ b/roaring/tests/range.rs
@@ -0,0 +1,70 @@
+extern crate roaring;
+
+use proptest::collection::btree_set;
+use proptest::prelude::*;
+use roaring::RoaringBitmap;
+
+#[test]
+fn range_array() {
+    let mut rb = RoaringBitmap::new();
+    rb.insert(0);
+    rb.insert(1);
+    rb.insert(10);
+    rb.insert(100_000);
+    rb.insert(999_999);
+    rb.insert(1_000_000);
+
+    let expected = vec![1, 10, 100_000, 999_999];
+    let actual: Vec<u32> = rb.range(1..=999_999).collect();
+    assert_eq!(expected, actual);
+}
+
+#[test]
+fn range_bitmap() {
+    let rb = RoaringBitmap::from_sorted_iter(10..5000).unwrap();
+
+    let expected = vec![10, 11, 12];
+    let actual: Vec<u32> = rb.range(0..13).collect();
+    assert_eq!(expected, actual);
+}
+
+#[test]
+fn range_bitmap_multiple_blocks() {
+    let rb = RoaringBitmap::from_sorted_iter(10..5000).unwrap();
+
+    let expected: Vec<u32> = (10..300).collect();
+    let actual: Vec<u32> = rb.range(0..300).collect();
+    assert_eq!(expected, actual);
+}
+
+#[test]
+fn range_none() {
+    let rb = RoaringBitmap::from_sorted_iter(10..5000).unwrap();
+
+    let expected: Vec<u32> = vec![];
+    let actual: Vec<u32> = rb.range(13..0).collect();
+    assert_eq!(expected, actual);
+}
+
+proptest! {
+    #[test]
+    fn proptest_range(
+        values in btree_set(..=262_143_u32, ..=1000),
+        range_a in 0u32..262_143,
+        range_b in 0u32..262_143,
+    ){
+        let range = if range_a <= range_b {
+            range_a..=range_b
+        } else {
+            range_b..=range_a
+        };
+
+        let bitmap = RoaringBitmap::from_sorted_iter(values.iter().cloned()).unwrap();
+        let expected: Vec<u32> = values.iter().cloned()
+            .filter(|&x| range.contains(&x))
+            .collect();
+        let actual: Vec<u32> = bitmap.range(range.clone()).collect();
+
+        assert_eq!(expected, actual);
+    }
+}