From ebc52a032e55a54415bf74ab2d19d18a80731365 Mon Sep 17 00:00:00 2001 From: Josh Casale Date: Tue, 23 Apr 2024 14:33:47 +0100 Subject: [PATCH 1/5] migrate --- Cargo.lock | 37 ++++ Cargo.toml | 2 +- bench-vortex/Cargo.toml | 2 +- bench-vortex/src/lib.rs | 5 +- pyvortex/Cargo.toml | 2 +- pyvortex/src/array.rs | 21 +- pyvortex/src/lib.rs | 4 +- .../src/array/bool/compute/as_contiguous.rs | 4 +- vortex-array/src/array/bool/compute/fill.rs | 4 +- vortex-array/src/array/primitive/mod.rs | 4 +- vortex-array/src/compute/as_contiguous.rs | 6 +- vortex-array/src/data.rs | 4 +- vortex-array/src/ptype.rs | 3 +- vortex-roaring/Cargo.toml | 2 + vortex-roaring/src/boolean/compress.rs | 26 +-- vortex-roaring/src/boolean/compute.rs | 73 +++---- vortex-roaring/src/boolean/mod.rs | 203 ++++++++---------- vortex-roaring/src/boolean/serde.rs | 53 ----- vortex-roaring/src/boolean/stats.rs | 111 ---------- vortex-roaring/src/downcast.rs | 46 ---- vortex-roaring/src/integer/compress.rs | 50 +++-- vortex-roaring/src/integer/compute.rs | 8 +- vortex-roaring/src/integer/mod.rs | 162 +++++--------- vortex-roaring/src/integer/serde.rs | 57 ----- vortex-roaring/src/integer/stats.rs | 22 -- vortex-roaring/src/lib.rs | 9 - vortex-roaring/src/serde_tests.rs | 30 +-- 27 files changed, 319 insertions(+), 631 deletions(-) delete mode 100644 vortex-roaring/src/boolean/serde.rs delete mode 100644 vortex-roaring/src/boolean/stats.rs delete mode 100644 vortex-roaring/src/downcast.rs delete mode 100644 vortex-roaring/src/integer/serde.rs delete mode 100644 vortex-roaring/src/integer/stats.rs diff --git a/Cargo.lock b/Cargo.lock index cc7cc731cb..30acb68d80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1155,6 +1155,7 @@ dependencies = [ "vortex-fastlanes", "vortex-ipc", "vortex-ree", + "vortex-roaring", "vortex-schema", ] @@ -1653,6 +1654,25 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "croaring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7266f0a7275b00ce4c4f4753e8c31afdefe93828101ece83a06e2ddab1dd1010" +dependencies = [ + "byteorder", + "croaring-sys", +] + +[[package]] +name = "croaring-sys" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e47112498c394a7067949ebc07ef429b7384a413cf0efcf675846a47bcd307fb" +dependencies = [ + "cc", +] + [[package]] name = "crossbeam-channel" version = "0.5.12" @@ -4303,6 +4323,7 @@ dependencies = [ "vortex-error", "vortex-fastlanes", "vortex-ree", + "vortex-roaring", "vortex-schema", ] @@ -5774,6 +5795,22 @@ dependencies = [ "vortex-schema", ] +[[package]] +name = "vortex-roaring" +version = "0.1.0" +dependencies = [ + "arrow-buffer 51.0.0", + "croaring", + "linkme", + "log", + "num-traits", + "paste", + "serde", + "vortex-array", + "vortex-error", + "vortex-schema", +] + [[package]] name = "vortex-schema" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index e0e474eea3..e6936a9084 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ members = [ "vortex-flatbuffers", "vortex-ipc", "vortex-ree", - #"vortex-roaring", + "vortex-roaring", "vortex-schema", #"vortex-zigzag", ] diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml index 204fbd82c1..973b4dc199 100644 --- a/bench-vortex/Cargo.toml +++ b/bench-vortex/Cargo.toml @@ -16,7 +16,7 @@ workspace = true [dependencies] #vortex-alp = { path = "../vortex-alp" } -#vortex-roaring = { path = "../vortex-roaring" } +vortex-roaring = { path = "../vortex-roaring" } #vortex-zigzag = { path = "../vortex-zigzag" } arrow = { workspace = true } arrow-array = { workspace = true } diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs index b0e4dc79cf..17765e0804 100644 --- a/bench-vortex/src/lib.rs +++ b/bench-vortex/src/lib.rs @@ -20,6 +20,7 @@ use vortex_datetime_parts::DateTimePartsEncoding; use vortex_dict::DictEncoding; use vortex_fastlanes::{BitPackedEncoding, FoREncoding}; use vortex_ree::REEEncoding; +use vortex_roaring::RoaringBoolEncoding; use vortex_schema::DType; use crate::data_downloads::FileType; @@ -114,8 +115,8 @@ pub fn enumerate_arrays() -> Vec { &DateTimePartsEncoding, // &DeltaEncoding, Blows up the search space too much. &REEEncoding, - //&RoaringBoolEncoding, - // RoaringIntEncoding, + &RoaringBoolEncoding, + // &RoaringIntEncoding, // Doesn't offer anything more than FoR really // ZigZagEncoding, ] diff --git a/pyvortex/Cargo.toml b/pyvortex/Cargo.toml index 556bcd9c2d..ec8cb2c5f1 100644 --- a/pyvortex/Cargo.toml +++ b/pyvortex/Cargo.toml @@ -26,7 +26,7 @@ vortex-dict = { path = "../vortex-dict" } vortex-error = { path = "../vortex-error" } vortex-fastlanes = { path = "../vortex-fastlanes" } vortex-ree = { path = "../vortex-ree" } -#vortex-roaring = { path = "../vortex-roaring" } +vortex-roaring = { path = "../vortex-roaring" } vortex-schema = { path = "../vortex-schema" } #vortex-zigzag = { path = "../vortex-zigzag" } itertools = { workspace = true } diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index e2324ab71c..4d43ef6967 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -23,6 +23,11 @@ use vortex_fastlanes::{ FoREncoding, OwnedBitPackedArray, OwnedDeltaArray, OwnedFoRArray, }; use vortex_ree::{OwnedREEArray, REEArray, REEEncoding, REE}; +use vortex_roaring::{ + OwnedRoaringBoolArray, OwnedRoaringIntArray, RoaringBool, RoaringBoolArray, + RoaringBoolEncoding, RoaringInt, RoaringIntArray, RoaringIntEncoding, +}; + use crate::dtype::PyDType; use crate::error::PyVortexError; @@ -74,8 +79,8 @@ pyarray!(FoREncoding, FoRArray, "FoRArray"); pyarray!(DeltaEncoding, DeltaArray, "DeltaArray"); pyarray!(DictEncoding, DictArray, "DictArray"); pyarray!(REEEncoding, REEArray, "REEArray"); -// pyarray!(RoaringBoolEncoding, RoaringBoolArray, "RoaringBoolArray"); -// pyarray!(RoaringIntEncoding, RoaringIntArray, "RoaringIntArray"); +pyarray!(RoaringBoolEncoding, RoaringBoolArray, "RoaringBoolArray"); +pyarray!(RoaringIntEncoding, RoaringIntArray, "RoaringIntArray"); // pyarray!(ZigZagEncoding, ZigZagArray, "ZigZagArray"); impl PyArray { @@ -162,6 +167,18 @@ impl PyArray { OwnedALPArray::try_from(inner.into_array()).map_err(PyVortexError::map_err)?, )? .extract(py), + RoaringBool::ID => PyBitPackedArray::wrap( + py, + OwnedBitPackedArray::try_from(inner.into_array()) + .map_err(PyVortexError::map_err)?, + )? + .extract(py), + RoaringInt::ID => PyBitPackedArray::wrap( + py, + OwnedBitPackedArray::try_from(inner.into_array()) + .map_err(PyVortexError::map_err)?, + )? + .extract(py), _ => Py::new( py, Self { diff --git a/pyvortex/src/lib.rs b/pyvortex/src/lib.rs index 964ddca02e..fb3d08b160 100644 --- a/pyvortex/src/lib.rs +++ b/pyvortex/src/lib.rs @@ -40,8 +40,8 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - // m.add_class::()?; - // m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/vortex-array/src/array/bool/compute/as_contiguous.rs b/vortex-array/src/array/bool/compute/as_contiguous.rs index da29c00446..969381e266 100644 --- a/vortex-array/src/array/bool/compute/as_contiguous.rs +++ b/vortex-array/src/array/bool/compute/as_contiguous.rs @@ -4,10 +4,10 @@ use vortex_error::VortexResult; use crate::array::bool::BoolArray; use crate::compute::as_contiguous::AsContiguousFn; use crate::validity::Validity; -use crate::{Array, ArrayDType, IntoArray}; +use crate::{Array, ArrayDType, IntoArray, OwnedArray}; impl AsContiguousFn for BoolArray<'_> { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult> { + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult { let validity = if self.dtype().is_nullable() { Validity::from_iter(arrays.iter().map(|a| a.with_dyn(|a| a.logical_validity()))) } else { diff --git a/vortex-array/src/array/bool/compute/fill.rs b/vortex-array/src/array/bool/compute/fill.rs index 2efeb0f88e..7987020dd5 100644 --- a/vortex-array/src/array/bool/compute/fill.rs +++ b/vortex-array/src/array/bool/compute/fill.rs @@ -4,10 +4,10 @@ use vortex_schema::Nullability; use crate::array::bool::BoolArray; use crate::compute::fill::FillForwardFn; use crate::validity::ArrayValidity; -use crate::{Array, ArrayDType, IntoArray, ToArrayData}; +use crate::{ArrayDType, IntoArray, OwnedArray, ToArrayData}; impl FillForwardFn for BoolArray<'_> { - fn fill_forward(&self) -> VortexResult> { + fn fill_forward(&self) -> VortexResult { if self.dtype().nullability() == Nullability::NonNullable { return Ok(self.to_array_data().into_array()); } diff --git a/vortex-array/src/array/primitive/mod.rs b/vortex-array/src/array/primitive/mod.rs index d80689ea9e..f37354bef8 100644 --- a/vortex-array/src/array/primitive/mod.rs +++ b/vortex-array/src/array/primitive/mod.rs @@ -8,7 +8,7 @@ use crate::buffer::Buffer; use crate::ptype::{NativePType, PType}; use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; use crate::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::{impl_encoding, ArrayDType}; +use crate::{impl_encoding, ArrayDType, OwnedArray}; use crate::{match_each_native_ptype, ArrayFlatten}; mod accessor; @@ -129,7 +129,7 @@ impl From> for PrimitiveArray<'_> { } impl IntoArray<'static> for Vec { - fn into_array(self) -> Array<'static> { + fn into_array(self) -> OwnedArray { PrimitiveArray::from(self).into_array() } } diff --git a/vortex-array/src/compute/as_contiguous.rs b/vortex-array/src/compute/as_contiguous.rs index 6115e5611a..2b6af58d6b 100644 --- a/vortex-array/src/compute/as_contiguous.rs +++ b/vortex-array/src/compute/as_contiguous.rs @@ -1,13 +1,13 @@ use itertools::Itertools; use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use crate::{Array, ArrayDType}; +use crate::{Array, ArrayDType, OwnedArray}; pub trait AsContiguousFn { - fn as_contiguous(&self, arrays: &[Array]) -> VortexResult>; + fn as_contiguous(&self, arrays: &[Array]) -> VortexResult; } -pub fn as_contiguous(arrays: &[Array]) -> VortexResult> { +pub fn as_contiguous(arrays: &[Array]) -> VortexResult { if arrays.is_empty() { vortex_bail!(ComputeError: "No arrays to concatenate"); } diff --git a/vortex-array/src/data.rs b/vortex-array/src/data.rs index 9ff1dcf0d6..5e6e791500 100644 --- a/vortex-array/src/data.rs +++ b/vortex-array/src/data.rs @@ -9,7 +9,7 @@ use crate::encoding::EncodingRef; use crate::scalar::Scalar; use crate::stats::Stat; use crate::stats::Statistics; -use crate::{Array, ArrayMetadata, IntoArray, ToArray}; +use crate::{Array, ArrayMetadata, IntoArray, OwnedArray, ToArray}; #[derive(Clone, Debug)] pub struct ArrayData { @@ -134,7 +134,7 @@ impl ToArray for ArrayData { } impl IntoArray<'static> for ArrayData { - fn into_array(self) -> Array<'static> { + fn into_array(self) -> OwnedArray { Array::Data(self) } } diff --git a/vortex-array/src/ptype.rs b/vortex-array/src/ptype.rs index 2989c87e3b..bd34934ba0 100644 --- a/vortex-array/src/ptype.rs +++ b/vortex-array/src/ptype.rs @@ -5,6 +5,7 @@ use arrow_array::types::*; use arrow_buffer::ArrowNativeType; use half::f16; use num_traits::{Num, NumCast}; +use serde::{Deserialize, Serialize}; use vortex_error::{vortex_err, VortexError, VortexResult}; use vortex_schema::DType::*; use vortex_schema::{DType, FloatWidth, IntWidth}; @@ -12,7 +13,7 @@ use vortex_schema::{DType, FloatWidth, IntWidth}; use crate::scalar::{PScalar, Scalar}; #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)] pub enum PType { U8, U16, diff --git a/vortex-roaring/Cargo.toml b/vortex-roaring/Cargo.toml index 68b60f7bfa..8921cef7e6 100644 --- a/vortex-roaring/Cargo.toml +++ b/vortex-roaring/Cargo.toml @@ -20,6 +20,8 @@ linkme = { workspace = true } croaring = { workspace = true } num-traits = { workspace = true } log = { workspace = true } +serde = { workspace = true } +paste = { workspace = true } [lints] workspace = true diff --git a/vortex-roaring/src/boolean/compress.rs b/vortex-roaring/src/boolean/compress.rs index b1bff86b4b..ea5e3ebe15 100644 --- a/vortex-roaring/src/boolean/compress.rs +++ b/vortex-roaring/src/boolean/compress.rs @@ -1,22 +1,22 @@ use croaring::Bitmap; -use vortex::array::bool::{BoolArray, BoolEncoding}; -use vortex::array::downcast::DowncastArrayBuiltin; -use vortex::array::{Array, ArrayRef}; +use vortex::array::bool::BoolArray; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; +use vortex::{Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray, OwnedArray}; use vortex_error::VortexResult; use vortex_schema::DType; use vortex_schema::Nullability::NonNullable; -use crate::boolean::{RoaringBoolArray, RoaringBoolEncoding}; +use crate::boolean::RoaringBoolArray; +use crate::{OwnedRoaringBoolArray, RoaringBool, RoaringBoolEncoding}; impl EncodingCompression for RoaringBoolEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // Only support bool enc arrays - if array.encoding().id() != BoolEncoding::ID { + if array.encoding().id() != RoaringBool::ID { return None; } @@ -34,19 +34,19 @@ impl EncodingCompression for RoaringBoolEncoding { fn compress( &self, - array: &dyn Array, - _like: Option<&dyn Array>, + array: &Array, + _like: Option<&Array>, _ctx: CompressCtx, - ) -> VortexResult { - Ok(roaring_encode(array.as_bool()).into_array()) + ) -> VortexResult { + roaring_encode(array.clone().flatten_bool()?).map(move |a| a.into_array()) } } -pub fn roaring_encode(bool_array: &BoolArray) -> RoaringBoolArray { +pub fn roaring_encode(bool_array: BoolArray) -> VortexResult { let mut bitmap = Bitmap::new(); bitmap.extend( bool_array - .buffer() + .boolean_buffer() .iter() .enumerate() .filter(|(_, b)| *b) @@ -55,5 +55,5 @@ pub fn roaring_encode(bool_array: &BoolArray) -> RoaringBoolArray { bitmap.run_optimize(); bitmap.shrink_to_fit(); - RoaringBoolArray::new(bitmap, bool_array.buffer().len()) + RoaringBoolArray::try_new(bitmap, bool_array.len()) } diff --git a/vortex-roaring/src/boolean/compute.rs b/vortex-roaring/src/boolean/compute.rs index bc741fe8e3..fa869f53c4 100644 --- a/vortex-roaring/src/boolean/compute.rs +++ b/vortex-roaring/src/boolean/compute.rs @@ -1,22 +1,17 @@ -use arrow_buffer::{BooleanBuffer, Buffer}; use croaring::Bitmap; -use vortex::array::bool::BoolArray; -use vortex::array::{Array, ArrayRef}; -use vortex::compute::flatten::{FlattenFn, FlattenedArray}; use vortex::compute::scalar_at::ScalarAtFn; use vortex::compute::slice::SliceFn; use vortex::compute::ArrayCompute; -use vortex::scalar::{AsBytes, Scalar}; -use vortex::validity::Validity; -use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::Nullability; +use vortex::scalar::Scalar; +use vortex::{IntoArray, OwnedArray}; +use vortex_error::VortexResult; use crate::RoaringBoolArray; -impl ArrayCompute for RoaringBoolArray { - fn flatten(&self) -> Option<&dyn FlattenFn> { - Some(self) - } +impl ArrayCompute for RoaringBoolArray<'_> { + // fn flatten(&self) -> Option<&dyn FlattenFn> { + // Some(self) + // } fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) @@ -27,30 +22,30 @@ impl ArrayCompute for RoaringBoolArray { } } -impl FlattenFn for RoaringBoolArray { - fn flatten(&self) -> VortexResult { - // TODO(ngates): benchmark the fastest conversion from BitMap. - // Via bitset requires two copies. - let bitset = self - .bitmap - .to_bitset() - .ok_or(vortex_err!("Failed to convert RoaringBitmap to Bitset"))?; - - let bytes = &bitset.as_slice().as_bytes()[0..bitset.size_in_bytes()]; - let buffer = Buffer::from_slice_ref(bytes); - Ok(FlattenedArray::Bool(BoolArray::new( - BooleanBuffer::new(buffer, 0, bitset.size_in_bits()), - match self.nullability() { - Nullability::NonNullable => None, - Nullability::Nullable => Some(Validity::Valid(self.len())), - }, - ))) - } -} - -impl ScalarAtFn for RoaringBoolArray { +// impl FlattenFn for RoaringBoolArray { +// fn flatten(&self) -> VortexResult { +// // TODO(ngates): benchmark the fastest conversion from BitMap. +// // Via bitset requires two copies. +// let bitset = self +// .bitmap +// .to_bitset() +// .ok_or(vortex_err!("Failed to convert RoaringBitmap to Bitset"))?; +// +// let bytes = &bitset.as_slice().as_bytes()[0..bitset.size_in_bytes()]; +// let buffer = Buffer::from_slice_ref(bytes); +// Ok(FlattenedArray::Bool(BoolArray::new( +// BooleanBuffer::new(buffer, 0, bitset.size_in_bits()), +// match self.nullability() { +// Nullability::NonNullable => None, +// Nullability::Nullable => Some(Validity::Valid(self.len())), +// }, +// ))) +// } +// } + +impl ScalarAtFn for RoaringBoolArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { - if self.bitmap.contains(index as u32) { + if self.bitmap().contains(index as u32) { Ok(true.into()) } else { Ok(false.into()) @@ -58,11 +53,11 @@ impl ScalarAtFn for RoaringBoolArray { } } -impl SliceFn for RoaringBoolArray { - fn slice(&self, start: usize, stop: usize) -> VortexResult { +impl SliceFn for RoaringBoolArray<'_> { + fn slice(&self, start: usize, stop: usize) -> VortexResult { let slice_bitmap = Bitmap::from_range(start as u32..stop as u32); - let bitmap = self.bitmap.and(&slice_bitmap).add_offset(-(start as i64)); + let bitmap = self.bitmap().and(&slice_bitmap).add_offset(-(start as i64)); - Ok(RoaringBoolArray::new(bitmap, stop - start).into_array()) + RoaringBoolArray::try_new(bitmap, stop - start).map(|a| a.into_array()) } } diff --git a/vortex-roaring/src/boolean/mod.rs b/vortex-roaring/src/boolean/mod.rs index cbef387c7d..d508c6f865 100644 --- a/vortex-roaring/src/boolean/mod.rs +++ b/vortex-roaring/src/boolean/mod.rs @@ -1,160 +1,131 @@ -use std::sync::{Arc, RwLock}; - +use arrow_buffer::BooleanBuffer; +use arrow_buffer::Buffer as ArrowBuffer; use compress::roaring_encode; -use croaring::{Bitmap, Native}; -use vortex::array::{Array, ArrayKind, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stats, StatsSet}; -use vortex::validity::ArrayValidity; -use vortex::validity::Validity; -use vortex::{impl_array, ArrayWalker}; -use vortex_error::{vortex_err, VortexResult}; -use vortex_schema::DType; +use croaring::{Bitmap, Portable}; +use serde::{Deserialize, Serialize}; +use vortex::array::bool::{Bool, BoolArray}; +use vortex::buffer::Buffer; +use vortex::scalar::AsBytes; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayDType, ArrayFlatten, OwnedArray}; +use vortex_error::{vortex_bail, vortex_err, VortexResult}; +use vortex_schema::Nullability; use vortex_schema::Nullability::NonNullable; +use Nullability::Nullable; mod compress; mod compute; -mod serde; -mod stats; -#[derive(Debug, Clone)] -pub struct RoaringBoolArray { - bitmap: Bitmap, +impl_encoding!("vortex.roaring_bool", RoaringBool); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RoaringBoolMetadata { + // NB: this is stored because we want to avoid the overhead of deserializing the bitmap + // on every len() call. It's CRITICAL that this is kept up-to date. length: usize, - stats: Arc>, } -impl RoaringBoolArray { - pub fn new(bitmap: Bitmap, length: usize) -> Self { - Self { - bitmap, - length, - stats: Arc::new(RwLock::new(StatsSet::new())), +impl RoaringBoolArray<'_> { + pub fn try_new(bitmap: Bitmap, length: usize) -> VortexResult { + if length > bitmap.cardinality() as usize { + vortex_bail!("RoaringBoolArray length is greater than bitmap cardinality") + } else { + Ok(Self { + typed: TypedArray::try_from_parts( + DType::Bool(NonNullable), + RoaringBoolMetadata { length }, + Some(Buffer::Owned(bitmap.serialize::().into())), + vec![].into(), + HashMap::default(), + )?, + }) } } - pub fn bitmap(&self) -> &Bitmap { - &self.bitmap + pub fn bitmap(&self) -> Bitmap { + //TODO(@jdcasale): figure out a way to avoid this deserialization per-call + Bitmap::deserialize::( + self.array() + .buffer() + .expect("RoaringBoolArray buffer is missing") + .as_slice(), + ) } - pub fn encode(array: &dyn Array) -> VortexResult { - match ArrayKind::from(array) { - ArrayKind::Bool(p) => Ok(roaring_encode(p)), - _ => Err(vortex_err!("RoaringBool can only encode bool arrays")), + pub fn encode(array: OwnedArray) -> VortexResult { + if array.encoding().id() == Bool::ID { + roaring_encode(BoolArray::try_from(array)?).map(|a| a.into_array()) + } else { + Err(vortex_err!("RoaringInt can only encode boolean arrays")) } } } - -impl Array for RoaringBoolArray { - impl_array!(); - #[inline] - fn len(&self) -> usize { - self.length - } - - #[inline] - fn is_empty(&self) -> bool { - self.length == 0 - } - - #[inline] - fn dtype(&self) -> &DType { - &DType::Bool(NonNullable) - } - - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &RoaringBoolEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - // TODO(ngates): do we want Native serializer? Or portable? Or frozen? - self.bitmap.get_serialized_size_in_bytes::() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { - // TODO(ngates): should we store a buffer in memory? Or delay serialization? - // Or serialize into metadata? The only reason we support buffers is so we can write to - // the wire without copying into FlatBuffers. But if we need to allocate to serialize - // the bitmap anyway, then may as well shove it into metadata. +impl AcceptArrayVisitor for RoaringBoolArray<'_> { + fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { todo!() } } -impl ArrayValidity for RoaringBoolArray { - fn logical_validity(&self) -> Validity { - Validity::Valid(self.len()) - } - - fn is_valid(&self, _index: usize) -> bool { - true - } -} - -impl ArrayDisplay for RoaringBoolArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("bitmap", format!("{:?}", self.bitmap())) +impl ArrayTrait for RoaringBoolArray<'_> { + fn len(&self) -> usize { + self.metadata().length } } -#[derive(Debug)] -pub struct RoaringBoolEncoding; - -impl RoaringBoolEncoding { - pub const ID: EncodingId = EncodingId::new("roaring.bool"); -} +impl ArrayStatisticsCompute for RoaringBoolArray<'_> {} -impl Encoding for RoaringBoolEncoding { - fn id(&self) -> EncodingId { - Self::ID +impl ArrayValidity for RoaringBoolArray<'_> { + fn logical_validity(&self) -> LogicalValidity { + LogicalValidity::AllValid(self.len()) } - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) + fn is_valid(&self, _index: usize) -> bool { + true } +} - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl ArrayFlatten for RoaringBoolArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + // TODO(ngates): benchmark the fastest conversion from BitMap. + // Via bitset requires two copies. + let bitset = self + .bitmap() + .to_bitset() + .ok_or(vortex_err!("Failed to convert RoaringBitmap to Bitset"))?; + + let bytes = &bitset.as_slice().as_bytes()[0..bitset.size_in_bytes()]; + let buffer = ArrowBuffer::from_slice_ref(bytes); + Ok(Flattened::Bool(BoolArray::try_new( + BooleanBuffer::new(buffer, 0, bitset.size_in_bits()), + match self.dtype().nullability() { + NonNullable => Validity::NonNullable, + Nullable => Validity::AllValid, + }, + )?)) } } #[cfg(test)] mod test { use vortex::array::bool::BoolArray; - use vortex::array::Array; use vortex::compute::scalar_at::scalar_at; use vortex::scalar::Scalar; + use vortex::IntoArray; use vortex_error::VortexResult; use crate::RoaringBoolArray; #[test] pub fn iter() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![true, false, true, true]); - let array = RoaringBoolArray::encode(bool)?; - - let values = array.bitmap().to_vec(); + let bool: BoolArray = BoolArray::from(vec![true, false, true, true]); + let array = RoaringBoolArray::encode(bool.into_array())?; + let round_trip = RoaringBoolArray::try_from(array.clone())?; + let values = round_trip.bitmap().to_vec(); assert_eq!(values, vec![0, 2, 3]); Ok(()) @@ -162,8 +133,8 @@ mod test { #[test] pub fn test_scalar_at() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![true, false, true, true]); - let array = RoaringBoolArray::encode(bool)?; + let bool: BoolArray = BoolArray::from(vec![true, false, true, true]); + let array = RoaringBoolArray::encode(bool.into_array())?; let truthy: Scalar = true.into(); let falsy: Scalar = false.into(); diff --git a/vortex-roaring/src/boolean/serde.rs b/vortex-roaring/src/boolean/serde.rs deleted file mode 100644 index ed246594ad..0000000000 --- a/vortex-roaring/src/boolean/serde.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::io; -use std::io::ErrorKind; - -use croaring::{Bitmap, Portable}; -use vortex::array::{Array, ArrayRef}; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex_error::VortexResult; - -use crate::{RoaringBoolArray, RoaringBoolEncoding}; - -impl ArraySerde for RoaringBoolArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - ctx.write_usize(self.len())?; - let mut data = Vec::new(); - self.bitmap().serialize_into::(&mut data); - ctx.write_slice(data.as_slice()) - } - - fn metadata(&self) -> VortexResult>> { - todo!() - } -} - -impl EncodingSerde for RoaringBoolEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let len = ctx.read_usize()?; - let bitmap_data = ctx.read_slice()?; - Ok(RoaringBoolArray::new( - Bitmap::try_deserialize::(bitmap_data.as_slice()) - .ok_or(io::Error::new(ErrorKind::InvalidData, "invalid bitmap"))?, - len, - ) - .into_array()) - } -} - -#[cfg(test)] -mod test { - use croaring::Bitmap; - - use crate::downcast::DowncastRoaring; - use crate::serde_tests::test::roundtrip_array; - use crate::RoaringBoolArray; - - #[test] - fn roundtrip() { - let arr = RoaringBoolArray::new(Bitmap::from_range(245..63000), 65536); - let read_arr = roundtrip_array(&arr).unwrap(); - - let read_roaring = read_arr.as_roaring_bool(); - assert_eq!(arr.bitmap(), read_roaring.bitmap()); - } -} diff --git a/vortex-roaring/src/boolean/stats.rs b/vortex-roaring/src/boolean/stats.rs deleted file mode 100644 index aefcdbf82c..0000000000 --- a/vortex-roaring/src/boolean/stats.rs +++ /dev/null @@ -1,111 +0,0 @@ -use vortex::array::Array; -use vortex::stats::{Stat, StatsCompute, StatsSet}; -use vortex_error::VortexResult; - -use crate::boolean::RoaringBoolArray; - -impl StatsCompute for RoaringBoolArray { - fn compute(&self, stat: &Stat) -> VortexResult { - let cardinality = self.bitmap().cardinality() as usize; - if let Some(value) = match stat { - Stat::IsConstant => Some((cardinality == self.len() || cardinality == 0).into()), - Stat::Max => { - if self.len() > 0 { - Some((cardinality > 0).into()) - } else { - None - } - } - Stat::Min => { - if self.len() > 0 { - Some((cardinality == self.len()).into()) - } else { - None - } - } - Stat::TrueCount => Some(cardinality.into()), - Stat::NullCount => Some(0.into()), - _ => None, - } { - Ok(StatsSet::of(*stat, value)) - } else { - Ok(StatsSet::default()) - } - } -} - -#[cfg(test)] -mod test { - use vortex::array::bool::BoolArray; - use vortex::array::Array; - use vortex::stats::Stat::*; - use vortex_error::VortexResult; - - use crate::RoaringBoolArray; - - #[test] - pub fn stats_all_true() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![true, true]); - let array = RoaringBoolArray::encode(bool)?; - - assert_eq!( - array.stats().get_or_compute_as::(&IsConstant), - Some(true) - ); - assert_eq!(array.stats().get_or_compute_as::(&Min), Some(true)); - assert_eq!(array.stats().get_or_compute_as::(&Max), Some(true)); - assert_eq!( - array - .stats() - .get_or_compute_cast::(&TrueCount) - .unwrap(), - 2 - ); - - Ok(()) - } - - #[test] - pub fn stats_all_false() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![false, false]); - let array = RoaringBoolArray::encode(bool)?; - - assert_eq!( - array.stats().get_or_compute_as::(&IsConstant), - Some(true) - ); - assert_eq!(array.stats().get_or_compute_as::(&Min), Some(false)); - assert_eq!(array.stats().get_or_compute_as::(&Max), Some(false)); - assert_eq!( - array - .stats() - .get_or_compute_cast::(&TrueCount) - .unwrap(), - 0 - ); - - Ok(()) - } - - #[test] - pub fn stats_mixed() -> VortexResult<()> { - let bool: &dyn Array = &BoolArray::from(vec![false, true, true]); - let array = RoaringBoolArray::encode(bool)?; - - assert_eq!( - array.stats().get_or_compute_as::(&IsConstant), - Some(false) - ); - assert_eq!(array.stats().get_or_compute_as::(&Min), Some(false)); - assert_eq!(array.stats().get_or_compute_as::(&Max), Some(true)); - assert_eq!( - array - .stats() - .get_or_compute_cast::(&TrueCount) - .unwrap(), - 2 - ); - - Ok(()) - } -} diff --git a/vortex-roaring/src/downcast.rs b/vortex-roaring/src/downcast.rs deleted file mode 100644 index 2ad13158e5..0000000000 --- a/vortex-roaring/src/downcast.rs +++ /dev/null @@ -1,46 +0,0 @@ -use vortex::array::{Array, ArrayRef}; - -use crate::{RoaringBoolArray, RoaringIntArray}; - -mod private { - pub trait Sealed {} -} - -#[allow(dead_code)] -pub trait DowncastRoaring: private::Sealed { - fn maybe_roaring_int(&self) -> Option<&RoaringIntArray>; - - fn as_roaring_int(&self) -> &RoaringIntArray { - self.maybe_roaring_int().unwrap() - } - - fn maybe_roaring_bool(&self) -> Option<&RoaringBoolArray>; - - fn as_roaring_bool(&self) -> &RoaringBoolArray { - self.maybe_roaring_bool().unwrap() - } -} - -impl private::Sealed for dyn Array {} - -impl DowncastRoaring for dyn Array { - fn maybe_roaring_int(&self) -> Option<&RoaringIntArray> { - self.as_any().downcast_ref() - } - - fn maybe_roaring_bool(&self) -> Option<&RoaringBoolArray> { - self.as_any().downcast_ref() - } -} - -impl private::Sealed for ArrayRef {} - -impl DowncastRoaring for ArrayRef { - fn maybe_roaring_int(&self) -> Option<&RoaringIntArray> { - self.as_any().downcast_ref() - } - - fn maybe_roaring_bool(&self) -> Option<&RoaringBoolArray> { - self.as_any().downcast_ref() - } -} diff --git a/vortex-roaring/src/integer/compress.rs b/vortex-roaring/src/integer/compress.rs index ec978f3db6..e342c400a9 100644 --- a/vortex-roaring/src/integer/compress.rs +++ b/vortex-roaring/src/integer/compress.rs @@ -1,27 +1,26 @@ use croaring::Bitmap; use log::debug; use num_traits::NumCast; -use vortex::array::downcast::DowncastArrayBuiltin; -use vortex::array::primitive::{PrimitiveArray, PrimitiveEncoding}; -use vortex::array::{Array, ArrayRef}; +use vortex::array::primitive::PrimitiveArray; use vortex::compress::{CompressConfig, CompressCtx, EncodingCompression}; use vortex::ptype::{NativePType, PType}; -use vortex::stats::Stat; +use vortex::stats::{ArrayStatistics, Stat}; +use vortex::{Array, ArrayDType, ArrayDef, IntoArray, OwnedArray, ToStatic}; use vortex_error::VortexResult; use vortex_schema::DType; use vortex_schema::Nullability::NonNullable; use vortex_schema::Signedness::Unsigned; -use crate::{RoaringIntArray, RoaringIntEncoding}; +use crate::{OwnedRoaringIntArray, RoaringInt, RoaringIntArray, RoaringIntEncoding}; impl EncodingCompression for RoaringIntEncoding { fn can_compress( &self, - array: &dyn Array, + array: &Array, _config: &CompressConfig, ) -> Option<&dyn EncodingCompression> { // Only support primitive enc arrays - if array.encoding().id() != PrimitiveEncoding::ID { + if array.encoding().id() != RoaringInt::ID { return None; } @@ -33,14 +32,20 @@ impl EncodingCompression for RoaringIntEncoding { // Only support sorted unique arrays if !array - .stats() - .get_or_compute_or(false, &Stat::IsStrictSorted) + .statistics() + .compute_as(Stat::IsStrictSorted) + .unwrap_or(false) { debug!("Skipping roaring int, not strict sorted"); return None; } - if array.stats().get_or_compute_or(0usize, &Stat::Max) > u32::MAX as usize { + if array + .statistics() + .compute_as(Stat::Max) + .map(|s: usize| s > u32::MAX as usize) + .unwrap_or(false) + { debug!("Skipping roaring int, max is larger than {}", u32::MAX); return None; } @@ -51,25 +56,26 @@ impl EncodingCompression for RoaringIntEncoding { fn compress( &self, - array: &dyn Array, - _like: Option<&dyn Array>, + array: &Array, + _like: Option<&Array>, _ctx: CompressCtx, - ) -> VortexResult { - Ok(roaring_encode(array.as_primitive()).into_array()) + ) -> VortexResult { + let parray = array.clone().flatten_primitive()?; + Ok(roaring_encode(parray).into_array().to_static()) } } -pub fn roaring_encode(primitive_array: &PrimitiveArray) -> RoaringIntArray { - match primitive_array.ptype() { - PType::U8 => roaring_encode_primitive::(primitive_array.buffer().typed_data()), - PType::U16 => roaring_encode_primitive::(primitive_array.buffer().typed_data()), - PType::U32 => roaring_encode_primitive::(primitive_array.buffer().typed_data()), - PType::U64 => roaring_encode_primitive::(primitive_array.buffer().typed_data()), - _ => panic!("Unsupported ptype {}", primitive_array.ptype()), +pub fn roaring_encode(parray: PrimitiveArray) -> RoaringIntArray { + match parray.ptype() { + PType::U8 => roaring_encode_primitive::(parray.typed_data()), + PType::U16 => roaring_encode_primitive::(parray.typed_data()), + PType::U32 => roaring_encode_primitive::(parray.typed_data()), + PType::U64 => roaring_encode_primitive::(parray.typed_data()), + _ => panic!("Unsupported ptype {}", parray.ptype()), } } -fn roaring_encode_primitive(values: &[T]) -> RoaringIntArray { +fn roaring_encode_primitive(values: &[T]) -> OwnedRoaringIntArray { let mut bitmap = Bitmap::new(); bitmap.extend(values.iter().map(|i| i.to_u32().unwrap())); bitmap.run_optimize(); diff --git a/vortex-roaring/src/integer/compute.rs b/vortex-roaring/src/integer/compute.rs index 9ca725c680..69d87863c5 100644 --- a/vortex-roaring/src/integer/compute.rs +++ b/vortex-roaring/src/integer/compute.rs @@ -6,17 +6,17 @@ use vortex_error::VortexResult; use crate::RoaringIntArray; -impl ArrayCompute for RoaringIntArray { +impl ArrayCompute for RoaringIntArray<'_> { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } } -impl ScalarAtFn for RoaringIntArray { +impl ScalarAtFn for RoaringIntArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { // Unwrap since we know the index is valid - let bitmap_value = self.bitmap.select(index as u32).unwrap(); - let scalar: Scalar = match self.ptype { + let bitmap_value = self.bitmap().select(index as u32).unwrap(); + let scalar: Scalar = match self.metadata().ptype { PType::U8 => (bitmap_value as u8).into(), PType::U16 => (bitmap_value as u16).into(), PType::U32 => bitmap_value.into(), diff --git a/vortex-roaring/src/integer/mod.rs b/vortex-roaring/src/integer/mod.rs index 9a9c7c78a0..d542211041 100644 --- a/vortex-roaring/src/integer/mod.rs +++ b/vortex-roaring/src/integer/mod.rs @@ -1,34 +1,28 @@ -use std::sync::{Arc, RwLock}; - use compress::roaring_encode; -use croaring::{Bitmap, Native}; -use vortex::array::{Array, ArrayKind, ArrayRef}; -use vortex::compress::EncodingCompression; -use vortex::compute::ArrayCompute; -use vortex::encoding::{Encoding, EncodingId, EncodingRef}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; +use croaring::{Bitmap, Portable}; +use serde::{Deserialize, Serialize}; +use vortex::array::primitive::{Primitive, PrimitiveArray}; +use vortex::buffer::Buffer; use vortex::ptype::PType; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stats, StatsSet}; -use vortex::validity::ArrayValidity; -use vortex::validity::Validity; -use vortex::{impl_array, ArrayWalker}; +use vortex::stats::ArrayStatisticsCompute; +use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; +use vortex::{impl_encoding, ArrayFlatten, OwnedArray}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; -use vortex_schema::DType; +use vortex_schema::Nullability::NonNullable; mod compress; mod compute; -mod serde; -mod stats; -#[derive(Debug, Clone)] -pub struct RoaringIntArray { - bitmap: Bitmap, +impl_encoding!("vortex.roaring_int", RoaringInt); + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RoaringIntMetadata { ptype: PType, - stats: Arc>, + length: usize, } -impl RoaringIntArray { +impl RoaringIntArray<'_> { pub fn new(bitmap: Bitmap, ptype: PType) -> Self { Self::try_new(bitmap, ptype).unwrap() } @@ -37,81 +31,46 @@ impl RoaringIntArray { if !ptype.is_unsigned_int() { vortex_bail!("RoaringInt expected unsigned int"); } - Ok(Self { - bitmap, - ptype, - stats: Arc::new(RwLock::new(StatsSet::new())), + typed: TypedArray::try_from_parts( + DType::Bool(NonNullable), + RoaringIntMetadata { + ptype, + length: bitmap.statistics().cardinality as usize, + }, + Some(Buffer::Owned(bitmap.serialize::().into())), + vec![].into(), + HashMap::default(), + )?, }) } - pub fn bitmap(&self) -> &Bitmap { - &self.bitmap + pub fn bitmap(&self) -> Bitmap { + //TODO(@jdcasale): figure out a way to avoid this deserialization per-call + Bitmap::deserialize::( + self.array() + .buffer() + .expect("RoaringBoolArray buffer is missing") + .as_slice(), + ) } pub fn ptype(&self) -> PType { - self.ptype + self.metadata().ptype } - pub fn encode(array: &dyn Array) -> VortexResult { - match ArrayKind::from(array) { - ArrayKind::Primitive(p) => Ok(roaring_encode(p)), - _ => Err(vortex_err!("RoaringInt can only encode primitive arrays")), + pub fn encode(array: OwnedArray) -> VortexResult { + if array.encoding().id() == Primitive::ID { + Ok(roaring_encode(PrimitiveArray::try_from(array)?).into_array()) + } else { + Err(vortex_err!("RoaringInt can only encode primitive arrays")) } } } -impl Array for RoaringIntArray { - impl_array!(); - #[inline] - fn len(&self) -> usize { - self.bitmap.cardinality() as usize - } - - #[inline] - fn is_empty(&self) -> bool { - self.bitmap().is_empty() - } - - #[inline] - fn dtype(&self) -> &DType { - self.ptype.into() - } - - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &RoaringIntEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - self.bitmap.get_serialized_size_in_bytes::() - } - - #[inline] - fn with_compute_mut( - &self, - f: &mut dyn FnMut(&dyn ArrayCompute) -> VortexResult<()>, - ) -> VortexResult<()> { - f(self) - } - - fn serde(&self) -> Option<&dyn ArraySerde> { - Some(self) - } - - fn walk(&self, _walker: &mut dyn ArrayWalker) -> VortexResult<()> { - todo!() - } -} - -impl ArrayValidity for RoaringIntArray { - fn logical_validity(&self) -> Validity { - Validity::Valid(self.len()) +impl ArrayValidity for RoaringIntArray<'_> { + fn logical_validity(&self) -> LogicalValidity { + LogicalValidity::AllValid(self.bitmap().iter().count()) } fn is_valid(&self, _index: usize) -> bool { @@ -119,30 +78,26 @@ impl ArrayValidity for RoaringIntArray { } } -impl ArrayDisplay for RoaringIntArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.property("bitmap", format!("{:?}", self.bitmap())) +impl ArrayFlatten for RoaringIntArray<'_> { + fn flatten<'a>(self) -> VortexResult> + where + Self: 'a, + { + todo!() } } -#[derive(Debug)] -pub struct RoaringIntEncoding; - -impl RoaringIntEncoding { - pub const ID: EncodingId = EncodingId::new("roaring.int"); -} - -impl Encoding for RoaringIntEncoding { - fn id(&self) -> EncodingId { - Self::ID +impl AcceptArrayVisitor for RoaringIntArray<'_> { + fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + todo!() } +} - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } +impl ArrayStatisticsCompute for RoaringIntArray<'_> {} - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) +impl ArrayTrait for RoaringIntArray<'_> { + fn len(&self) -> usize { + self.metadata().length } } @@ -150,14 +105,15 @@ impl Encoding for RoaringIntEncoding { mod test { use vortex::array::primitive::PrimitiveArray; use vortex::compute::scalar_at::scalar_at; + use vortex::IntoArray; use vortex_error::VortexResult; use crate::RoaringIntArray; #[test] pub fn test_scalar_at() -> VortexResult<()> { - let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]); - let array = RoaringIntArray::encode(&ints)?; + let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]).into_array(); + let array = RoaringIntArray::encode(ints)?; assert_eq!(scalar_at(&array, 0).unwrap(), 2u32.into()); assert_eq!(scalar_at(&array, 1).unwrap(), 12u32.into()); diff --git a/vortex-roaring/src/integer/serde.rs b/vortex-roaring/src/integer/serde.rs deleted file mode 100644 index f30cf0dcb1..0000000000 --- a/vortex-roaring/src/integer/serde.rs +++ /dev/null @@ -1,57 +0,0 @@ -use std::io; -use std::io::ErrorKind; - -use croaring::{Bitmap, Portable}; -use vortex::array::{Array, ArrayRef}; -use vortex::ptype::PType; -use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use vortex_error::VortexResult; - -use crate::{RoaringIntArray, RoaringIntEncoding}; - -impl ArraySerde for RoaringIntArray { - fn write(&self, ctx: &mut WriteCtx) -> VortexResult<()> { - let mut data = Vec::new(); - self.bitmap().serialize_into::(&mut data); - ctx.write_slice(data.as_slice()) - } - - fn metadata(&self) -> VortexResult>> { - Ok(None) - } -} - -impl EncodingSerde for RoaringIntEncoding { - fn read(&self, ctx: &mut ReadCtx) -> VortexResult { - let bitmap_data = ctx.read_slice()?; - let ptype: PType = ctx - .schema() - .try_into() - .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?; - Ok(RoaringIntArray::new( - Bitmap::try_deserialize::(bitmap_data.as_slice()) - .ok_or(io::Error::new(ErrorKind::InvalidData, "invalid bitmap"))?, - ptype, - ) - .into_array()) - } -} - -#[cfg(test)] -mod test { - use croaring::Bitmap; - use vortex::ptype::PType; - - use crate::downcast::DowncastRoaring; - use crate::serde_tests::test::roundtrip_array; - use crate::RoaringIntArray; - - #[test] - fn roundtrip() { - let arr = RoaringIntArray::new(Bitmap::from_range(245..63000), PType::U32); - let read_arr = roundtrip_array(&arr).unwrap(); - let read_roaring = read_arr.as_roaring_int(); - assert_eq!(arr.ptype(), read_roaring.ptype()); - assert_eq!(arr.bitmap(), read_roaring.bitmap()); - } -} diff --git a/vortex-roaring/src/integer/stats.rs b/vortex-roaring/src/integer/stats.rs deleted file mode 100644 index d4c69f4a7d..0000000000 --- a/vortex-roaring/src/integer/stats.rs +++ /dev/null @@ -1,22 +0,0 @@ -use vortex::stats::{Stat, StatsCompute, StatsSet}; -use vortex_error::VortexResult; - -use crate::RoaringIntArray; - -impl StatsCompute for RoaringIntArray { - fn compute(&self, stat: &Stat) -> VortexResult { - if let Some(value) = match stat { - Stat::IsConstant => Some((self.bitmap.cardinality() <= 1).into()), - Stat::IsSorted => Some(true.into()), - Stat::IsStrictSorted => Some(true.into()), - Stat::Max => self.bitmap.minimum().map(|v| v.into()), - Stat::Min => self.bitmap.maximum().map(|v| v.into()), - Stat::NullCount => Some(0.into()), - _ => None, - } { - Ok(StatsSet::of(*stat, value)) - } else { - Ok(StatsSet::default()) - } - } -} diff --git a/vortex-roaring/src/lib.rs b/vortex-roaring/src/lib.rs index dd7e25b519..aa058cbbd0 100644 --- a/vortex-roaring/src/lib.rs +++ b/vortex-roaring/src/lib.rs @@ -1,15 +1,6 @@ pub use boolean::*; pub use integer::*; -use linkme::distributed_slice; -use vortex::encoding::{EncodingRef, ENCODINGS}; mod boolean; -mod downcast; mod integer; mod serde_tests; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_ROARING_BOOL: EncodingRef = &RoaringBoolEncoding; - -#[distributed_slice(ENCODINGS)] -static ENCODINGS_ROARING_INT: EncodingRef = &RoaringIntEncoding; diff --git a/vortex-roaring/src/serde_tests.rs b/vortex-roaring/src/serde_tests.rs index 88935c0e11..fbd44935af 100644 --- a/vortex-roaring/src/serde_tests.rs +++ b/vortex-roaring/src/serde_tests.rs @@ -1,15 +1,15 @@ -#[cfg(test)] -pub mod test { - use vortex::array::{Array, ArrayRef}; - use vortex::serde::{ReadCtx, WriteCtx}; - use vortex_error::VortexResult; - - pub fn roundtrip_array(array: &dyn Array) -> VortexResult { - let mut buf = Vec::::new(); - let mut write_ctx = WriteCtx::new(&mut buf); - write_ctx.write(array)?; - let mut read = buf.as_slice(); - let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); - read_ctx.read() - } -} +// #[cfg(test)] +// pub mod test { +// use vortex::array::{Array, ArrayRef}; +// use vortex::serde::{ReadCtx, WriteCtx}; +// use vortex_error::VortexResult; +// +// pub fn roundtrip_array(array: &dyn Array) -> VortexResult { +// let mut buf = Vec::::new(); +// let mut write_ctx = WriteCtx::new(&mut buf); +// write_ctx.write(array)?; +// let mut read = buf.as_slice(); +// let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); +// read_ctx.read() +// } +// } From e7a7ff4101dbfbfc4d5f9cea7c85d46352f4ce8e Mon Sep 17 00:00:00 2001 From: Josh Casale Date: Tue, 23 Apr 2024 14:58:23 +0100 Subject: [PATCH 2/5] nit --- pyvortex/src/array.rs | 1 - vortex-roaring/src/boolean/mod.rs | 10 ++++++---- vortex-roaring/src/integer/mod.rs | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 4d43ef6967..034c0c7892 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -28,7 +28,6 @@ use vortex_roaring::{ RoaringBoolEncoding, RoaringInt, RoaringIntArray, RoaringIntEncoding, }; - use crate::dtype::PyDType; use crate::error::PyVortexError; use crate::vortex_arrow; diff --git a/vortex-roaring/src/boolean/mod.rs b/vortex-roaring/src/boolean/mod.rs index d508c6f865..80f4b65701 100644 --- a/vortex-roaring/src/boolean/mod.rs +++ b/vortex-roaring/src/boolean/mod.rs @@ -22,15 +22,13 @@ impl_encoding!("vortex.roaring_bool", RoaringBool); #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RoaringBoolMetadata { - // NB: this is stored because we want to avoid the overhead of deserializing the bitmap - // on every len() call. It's CRITICAL that this is kept up-to date. length: usize, } impl RoaringBoolArray<'_> { pub fn try_new(bitmap: Bitmap, length: usize) -> VortexResult { - if length > bitmap.cardinality() as usize { - vortex_bail!("RoaringBoolArray length is greater than bitmap cardinality") + if length < bitmap.cardinality() as usize { + vortex_bail!("RoaringBoolArray length is less than bitmap cardinality") } else { Ok(Self { typed: TypedArray::try_from_parts( @@ -64,6 +62,10 @@ impl RoaringBoolArray<'_> { } impl AcceptArrayVisitor for RoaringBoolArray<'_> { fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + // TODO(ngates): should we store a buffer in memory? Or delay serialization? + // Or serialize into metadata? The only reason we support buffers is so we can write to + // the wire without copying into FlatBuffers. But if we need to allocate to serialize + // the bitmap anyway, then may as well shove it into metadata. todo!() } } diff --git a/vortex-roaring/src/integer/mod.rs b/vortex-roaring/src/integer/mod.rs index d542211041..af3c3bb34a 100644 --- a/vortex-roaring/src/integer/mod.rs +++ b/vortex-roaring/src/integer/mod.rs @@ -19,6 +19,8 @@ impl_encoding!("vortex.roaring_int", RoaringInt); #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RoaringIntMetadata { ptype: PType, + // NB: this is stored because we want to avoid the overhead of deserializing the bitmap + // on every len() call. It's CRITICAL that this is kept up-to date. length: usize, } From 6878730f430263b376cff94130c1a70c4a317506 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 23 Apr 2024 16:35:36 +0100 Subject: [PATCH 3/5] remove commented --- vortex-roaring/src/boolean/compute.rs | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/vortex-roaring/src/boolean/compute.rs b/vortex-roaring/src/boolean/compute.rs index fa869f53c4..272a9aff11 100644 --- a/vortex-roaring/src/boolean/compute.rs +++ b/vortex-roaring/src/boolean/compute.rs @@ -9,10 +9,6 @@ use vortex_error::VortexResult; use crate::RoaringBoolArray; impl ArrayCompute for RoaringBoolArray<'_> { - // fn flatten(&self) -> Option<&dyn FlattenFn> { - // Some(self) - // } - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -22,27 +18,6 @@ impl ArrayCompute for RoaringBoolArray<'_> { } } -// impl FlattenFn for RoaringBoolArray { -// fn flatten(&self) -> VortexResult { -// // TODO(ngates): benchmark the fastest conversion from BitMap. -// // Via bitset requires two copies. -// let bitset = self -// .bitmap -// .to_bitset() -// .ok_or(vortex_err!("Failed to convert RoaringBitmap to Bitset"))?; -// -// let bytes = &bitset.as_slice().as_bytes()[0..bitset.size_in_bytes()]; -// let buffer = Buffer::from_slice_ref(bytes); -// Ok(FlattenedArray::Bool(BoolArray::new( -// BooleanBuffer::new(buffer, 0, bitset.size_in_bits()), -// match self.nullability() { -// Nullability::NonNullable => None, -// Nullability::Nullable => Some(Validity::Valid(self.len())), -// }, -// ))) -// } -// } - impl ScalarAtFn for RoaringBoolArray<'_> { fn scalar_at(&self, index: usize) -> VortexResult { if self.bitmap().contains(index as u32) { From 79484e2a12f3c861481421236698c80106653b2b Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 23 Apr 2024 16:38:15 +0100 Subject: [PATCH 4/5] remove commented --- vortex-roaring/src/serde_tests.rs | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 vortex-roaring/src/serde_tests.rs diff --git a/vortex-roaring/src/serde_tests.rs b/vortex-roaring/src/serde_tests.rs deleted file mode 100644 index fbd44935af..0000000000 --- a/vortex-roaring/src/serde_tests.rs +++ /dev/null @@ -1,15 +0,0 @@ -// #[cfg(test)] -// pub mod test { -// use vortex::array::{Array, ArrayRef}; -// use vortex::serde::{ReadCtx, WriteCtx}; -// use vortex_error::VortexResult; -// -// pub fn roundtrip_array(array: &dyn Array) -> VortexResult { -// let mut buf = Vec::::new(); -// let mut write_ctx = WriteCtx::new(&mut buf); -// write_ctx.write(array)?; -// let mut read = buf.as_slice(); -// let mut read_ctx = ReadCtx::new(array.dtype(), &mut read); -// read_ctx.read() -// } -// } From dfeeafcfaa2ee187ee1bcbd8b5821fa292131b78 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 23 Apr 2024 16:38:20 +0100 Subject: [PATCH 5/5] remove commented --- vortex-roaring/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/vortex-roaring/src/lib.rs b/vortex-roaring/src/lib.rs index aa058cbbd0..8f1ab0eed3 100644 --- a/vortex-roaring/src/lib.rs +++ b/vortex-roaring/src/lib.rs @@ -3,4 +3,3 @@ pub use integer::*; mod boolean; mod integer; -mod serde_tests;