From 88b848b84cf7c8cc8d2ea02dfff77b5a54d822ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 10:37:41 -0400 Subject: [PATCH] Use containers offsets when available --- src/bitmap/ops_with_serialized.rs | 105 +++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index e9404e9d..6711b1b0 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -8,7 +8,7 @@ use std::ops::RangeInclusive; use crate::bitmap::container::Container; use crate::bitmap::serialization::{ - NO_OFFSET_THRESHOLD, OFFSET_BYTES, SERIAL_COOKIE, SERIAL_COOKIE_NO_RUNCONTAINER, + NO_OFFSET_THRESHOLD, SERIAL_COOKIE, SERIAL_COOKIE_NO_RUNCONTAINER, }; use crate::RoaringBitmap; @@ -93,21 +93,35 @@ impl RoaringBitmap { } // Read the container descriptions - let mut description_bytes = vec![[0u16; 2]; size]; - reader.read_exact(cast_slice_mut(&mut description_bytes))?; - description_bytes.iter_mut().for_each(|[ref mut key, ref mut len]| { + let mut descriptions = vec![[0; 2]; size]; + reader.read_exact(cast_slice_mut(&mut descriptions))?; + descriptions.iter_mut().for_each(|[ref mut key, ref mut len]| { *key = u16::from_le(*key); *len = u16::from_le(*len); }); - if has_offsets { - // I could use these offsets but I am a lazy developer (for now) - reader.seek(SeekFrom::Current((size * OFFSET_BYTES) as i64))?; + let mut offsets = vec![0; size]; + reader.read_exact(cast_slice_mut(&mut offsets))?; + offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset)); + + // Loop on the materialized containers if there + // are less or as many of them than serialized ones. + if self.containers.len() <= size { + return self.intersection_with_serialized_impl_with_offsets( + reader, + a, + b, + &descriptions, + &offsets, + run_container_bitmap.as_deref(), + ); + } } // Read each container and skip the useless ones - for (i, &[key, len_minus_one]) in description_bytes.iter().enumerate() { + let mut containers = Vec::new(); + for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() { let container = match self.containers.binary_search_by_key(&key, |c| c.key) { Ok(index) => self.containers.get(index), Err(_) => None, @@ -191,6 +205,81 @@ impl RoaringBitmap { Ok(RoaringBitmap { containers }) } + + fn intersection_with_serialized_impl_with_offsets( + &self, + mut reader: R, + a: A, + b: B, + descriptions: &[[u16; 2]], + offsets: &[u32], + run_container_bitmap: Option<&[u8]>, + ) -> io::Result + where + R: io::Read + io::Seek, + A: Fn(Vec) -> Result, + AErr: Error + Send + Sync + 'static, + B: Fn(u64, Box<[u64; 1024]>) -> Result, + BErr: Error + Send + Sync + 'static, + { + let mut containers = Vec::new(); + for container in &self.containers { + let i = match descriptions.binary_search_by_key(&container.key, |[k, _]| *k) { + Ok(index) => index, + Err(_) => continue, + }; + + // Seek to the bytes of the container we want. + reader.seek(SeekFrom::Start(offsets[i] as u64))?; + + let [key, len_minus_one] = descriptions[i]; + let cardinality = u64::from(len_minus_one) + 1; + + // If the run container bitmap is present, check if this container is a run container + let is_run_container = + run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0); + + let store = if is_run_container { + let runs = reader.read_u16::().unwrap(); + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals)).unwrap(); + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); + + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + })?; + store + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; + reader.read_exact(cast_slice_mut(&mut values)).unwrap(); + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) + } else { + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_exact(cast_slice_mut(&mut values[..])).unwrap(); + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + }; + + let mut other_container = Container { key, store }; + other_container &= container; + if !other_container.is_empty() { + containers.push(other_container); + } + } + + Ok(RoaringBitmap { containers }) + } } #[cfg(test)]