From 056f1b44cd91278621d7a330736bf4ac25395469 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Wed, 24 Jul 2024 14:49:35 +0100 Subject: [PATCH] Added bool iterators index and slice and filtering across some array types (#505) Implemented for (bool, primitive, varbin and constant). --- encodings/byte-bool/src/lib.rs | 10 +- encodings/roaring/src/boolean/mod.rs | 10 +- encodings/runend-bool/src/array.rs | 10 +- vortex-array/src/array/bool/compute/filter.rs | 102 +++++++ vortex-array/src/array/bool/compute/mod.rs | 1 + vortex-array/src/array/bool/mod.rs | 38 ++- vortex-array/src/array/chunked/variants.rs | 10 +- vortex-array/src/array/constant/compute.rs | 21 +- vortex-array/src/array/constant/variants.rs | 48 +++- .../src/array/primitive/compute/filter.rs | 81 ++++++ .../src/array/primitive/compute/mod.rs | 1 + vortex-array/src/array/sparse/variants.rs | 10 +- vortex-array/src/array/varbin/builder.rs | 13 + .../src/array/varbin/compute/filter.rs | 262 ++++++++++++++++++ vortex-array/src/array/varbin/compute/mod.rs | 1 + vortex-array/src/compute/filter.rs | 4 +- vortex-array/src/compute/unary/mod.rs | 10 +- vortex-array/src/implementation.rs | 20 +- vortex-array/src/lib.rs | 7 + vortex-array/src/validity.rs | 9 +- vortex-array/src/variants.rs | 25 +- 21 files changed, 673 insertions(+), 20 deletions(-) create mode 100644 vortex-array/src/array/bool/compute/filter.rs create mode 100644 vortex-array/src/array/primitive/compute/filter.rs create mode 100644 vortex-array/src/array/varbin/compute/filter.rs diff --git a/encodings/byte-bool/src/lib.rs b/encodings/byte-bool/src/lib.rs index 72292a0111..63a1cebc01 100644 --- a/encodings/byte-bool/src/lib.rs +++ b/encodings/byte-bool/src/lib.rs @@ -81,7 +81,15 @@ impl ArrayVariants for ByteBoolArray { } } -impl BoolArrayTrait for ByteBoolArray {} +impl BoolArrayTrait for ByteBoolArray { + fn maybe_null_indices_iter<'a>(&'a self) -> Box + 'a> { + todo!() + } + + fn maybe_null_slices_iter<'a>(&'a self) -> Box + 'a> { + todo!() + } +} impl From> for ByteBoolArray { fn from(value: Vec) -> Self { diff --git a/encodings/roaring/src/boolean/mod.rs b/encodings/roaring/src/boolean/mod.rs index 4adc6c52e7..8c07638c31 100644 --- a/encodings/roaring/src/boolean/mod.rs +++ b/encodings/roaring/src/boolean/mod.rs @@ -73,7 +73,15 @@ impl ArrayVariants for RoaringBoolArray { } } -impl BoolArrayTrait for RoaringBoolArray {} +impl BoolArrayTrait for RoaringBoolArray { + fn maybe_null_indices_iter<'a>(&'a self) -> Box + 'a> { + todo!() + } + + fn maybe_null_slices_iter<'a>(&'a self) -> Box + 'a> { + todo!() + } +} impl AcceptArrayVisitor for RoaringBoolArray { fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { diff --git a/encodings/runend-bool/src/array.rs b/encodings/runend-bool/src/array.rs index 10b78b5b23..82a360e184 100644 --- a/encodings/runend-bool/src/array.rs +++ b/encodings/runend-bool/src/array.rs @@ -98,7 +98,15 @@ impl RunEndBoolArray { } } -impl BoolArrayTrait for RunEndBoolArray {} +impl BoolArrayTrait for RunEndBoolArray { + fn maybe_null_indices_iter<'a>(&'a self) -> Box + 'a> { + todo!() + } + + fn maybe_null_slices_iter<'a>(&'a self) -> Box + 'a> { + todo!() + } +} impl ArrayVariants for RunEndBoolArray { fn as_bool_array(&self) -> Option<&dyn BoolArrayTrait> { diff --git a/vortex-array/src/array/bool/compute/filter.rs b/vortex-array/src/array/bool/compute/filter.rs new file mode 100644 index 0000000000..aa37610445 --- /dev/null +++ b/vortex-array/src/array/bool/compute/filter.rs @@ -0,0 +1,102 @@ +use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; +use vortex_error::{vortex_err, VortexResult}; + +use crate::array::bool::BoolArray; +use crate::compute::FilterFn; +use crate::validity::filter_validity; +use crate::variants::BoolArrayTrait; +use crate::{Array, IntoArray}; + +impl FilterFn for BoolArray { + fn filter(&self, predicate: &Array) -> VortexResult { + filter_select_bool(self, predicate).map(|a| a.into_array()) + } +} + +fn filter_select_bool(arr: &BoolArray, predicate: &Array) -> VortexResult { + predicate.with_dyn(|b| { + let validity = filter_validity(arr.validity(), predicate)?; + let predicate = b.as_bool_array().ok_or(vortex_err!( + NotImplemented: "as_bool_array", + predicate.encoding().id() + ))?; + let selection_count = predicate.true_count(); + let out = if selection_count * 2 > arr.len() { + filter_select_bool_by_slice(&arr.boolean_buffer(), predicate, selection_count) + } else { + filter_select_bool_by_index(&arr.boolean_buffer(), predicate, selection_count) + }; + BoolArray::try_new(out, validity) + }) +} + +fn filter_select_bool_by_slice( + values: &BooleanBuffer, + predicate: &dyn BoolArrayTrait, + selection_count: usize, +) -> BooleanBuffer { + let mut out_buf = BooleanBufferBuilder::new(selection_count); + predicate.maybe_null_slices_iter().for_each(|(start, end)| { + out_buf.append_buffer(&values.slice(start, end - start)); + }); + out_buf.finish() +} + +fn filter_select_bool_by_index( + values: &BooleanBuffer, + predicate: &dyn BoolArrayTrait, + selection_count: usize, +) -> BooleanBuffer { + let mut out_buf = BooleanBufferBuilder::new(selection_count); + predicate + .maybe_null_indices_iter() + .for_each(|idx| out_buf.append(values.value(idx))); + out_buf.finish() +} + +#[cfg(test)] +mod test { + use itertools::Itertools; + + use crate::array::bool::compute::filter::{ + filter_select_bool, filter_select_bool_by_index, filter_select_bool_by_slice, + }; + use crate::array::bool::BoolArray; + use crate::ToArray; + + #[test] + fn filter_bool_test() { + let arr = BoolArray::from(vec![true, true, false]); + let filter = BoolArray::from(vec![true, false, true]); + + let filtered = filter_select_bool(&arr, &filter.to_array()).unwrap(); + assert_eq!(2, filtered.len()); + + assert_eq!( + vec![true, false], + filtered.boolean_buffer().iter().collect_vec() + ) + } + + #[test] + fn filter_bool_by_slice_test() { + let arr = BoolArray::from(vec![true, true, false]); + let filter = BoolArray::from(vec![true, false, true]); + + let filtered = filter_select_bool_by_slice(&arr.boolean_buffer(), &filter, 2); + assert_eq!(2, filtered.len()); + + assert_eq!(vec![true, false], filtered.iter().collect_vec()) + } + + #[test] + fn filter_bool_by_index_test() { + let arr = BoolArray::from(vec![true, true, false]); + let filter = BoolArray::from(vec![true, false, true]); + + let filtered = filter_select_bool_by_index(&arr.boolean_buffer(), &filter, 2); + assert_eq!(2, filtered.len()); + + assert_eq!(vec![true, false], filtered.iter().collect_vec()) + } +} diff --git a/vortex-array/src/array/bool/compute/mod.rs b/vortex-array/src/array/bool/compute/mod.rs index c6e1187a45..e442d03a98 100644 --- a/vortex-array/src/array/bool/compute/mod.rs +++ b/vortex-array/src/array/bool/compute/mod.rs @@ -5,6 +5,7 @@ use crate::compute::{ArrayCompute, CompareFn, SliceFn, TakeFn}; mod boolean; mod compare; mod fill; +mod filter; mod flatten; mod scalar_at; mod slice; diff --git a/vortex-array/src/array/bool/mod.rs b/vortex-array/src/array/bool/mod.rs index a5a5c13d5a..57ff469c8b 100644 --- a/vortex-array/src/array/bool/mod.rs +++ b/vortex-array/src/array/bool/mod.rs @@ -1,3 +1,4 @@ +use arrow_buffer::bit_iterator::{BitIndexIterator, BitSliceIterator}; use arrow_buffer::BooleanBuffer; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -83,7 +84,15 @@ impl ArrayVariants for BoolArray { } } -impl BoolArrayTrait for BoolArray {} +impl BoolArrayTrait for BoolArray { + fn maybe_null_indices_iter<'a>(&'a self) -> Box + 'a> { + Box::new(BitIndexIterator::new(self.buffer(), 0, self.len())) + } + + fn maybe_null_slices_iter<'a>(&'a self) -> Box + 'a> { + Box::new(BitSliceIterator::new(self.buffer(), 0, self.len())) + } +} impl From for BoolArray { fn from(value: BooleanBuffer) -> Self { @@ -139,8 +148,11 @@ impl AcceptArrayVisitor for BoolArray { #[cfg(test)] mod tests { + use itertools::Itertools; + use crate::array::bool::BoolArray; use crate::compute::unary::scalar_at; + use crate::variants::BoolArrayTrait; use crate::IntoArray; #[test] @@ -170,4 +182,28 @@ mod tests { let scalar = scalar_at(&arr, 4).unwrap(); assert!(scalar.is_null()); } + + #[test] + fn constant_iter_true_test() { + let arr = BoolArray::from(vec![true, true, true]); + assert_eq!(vec![0, 1, 2], arr.maybe_null_indices_iter().collect_vec()); + assert_eq!(vec![(0, 3)], arr.maybe_null_slices_iter().collect_vec()); + } + + #[test] + fn constant_iter_true_false_test() { + let arr = BoolArray::from(vec![true, false, true]); + assert_eq!(vec![0, 2], arr.maybe_null_indices_iter().collect_vec()); + assert_eq!( + vec![(0, 1), (2, 3)], + arr.maybe_null_slices_iter().collect_vec() + ); + } + + #[test] + fn constant_iter_false_test() { + let arr = BoolArray::from(vec![false, false, false]); + assert_eq!(0, arr.maybe_null_indices_iter().collect_vec().len()); + assert_eq!(0, arr.maybe_null_slices_iter().collect_vec().len()); + } } diff --git a/vortex-array/src/array/chunked/variants.rs b/vortex-array/src/array/chunked/variants.rs index 02dfff90ef..44d9932616 100644 --- a/vortex-array/src/array/chunked/variants.rs +++ b/vortex-array/src/array/chunked/variants.rs @@ -76,7 +76,15 @@ impl ArrayVariants for ChunkedArray { impl NullArrayTrait for ChunkedArray {} -impl BoolArrayTrait for ChunkedArray {} +impl BoolArrayTrait for ChunkedArray { + fn maybe_null_indices_iter(&self) -> Box> { + todo!() + } + + fn maybe_null_slices_iter(&self) -> Box> { + todo!() + } +} impl PrimitiveArrayTrait for ChunkedArray {} diff --git a/vortex-array/src/array/constant/compute.rs b/vortex-array/src/array/constant/compute.rs index c7a19eb95b..056c01a4c0 100644 --- a/vortex-array/src/array/constant/compute.rs +++ b/vortex-array/src/array/constant/compute.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use arrow_array::Datum; use vortex_dtype::Nullability; -use vortex_error::{vortex_bail, VortexResult}; +use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_expr::Operator; use vortex_scalar::Scalar; @@ -11,7 +11,7 @@ use crate::array::constant::ConstantArray; use crate::arrow::FromArrowArray; use crate::compute::unary::{scalar_at, ScalarAtFn}; use crate::compute::{ - scalar_cmp, AndFn, ArrayCompute, CompareFn, OrFn, SearchResult, SearchSortedFn, + scalar_cmp, AndFn, ArrayCompute, CompareFn, FilterFn, OrFn, SearchResult, SearchSortedFn, SearchSortedSide, SliceFn, TakeFn, }; use crate::stats::{ArrayStatistics, Stat}; @@ -65,6 +65,23 @@ impl SliceFn for ConstantArray { } } +impl FilterFn for ConstantArray { + fn filter(&self, predicate: &Array) -> VortexResult { + Ok(Self::new( + self.scalar().clone(), + predicate.with_dyn(|p| { + p.as_bool_array() + .ok_or(vortex_err!( + NotImplemented: "as_bool_array", + predicate.encoding().id() + )) + .map(|x| x.true_count()) + })?, + ) + .into_array()) + } +} + impl SearchSortedFn for ConstantArray { fn search_sorted(&self, value: &Scalar, side: SearchSortedSide) -> VortexResult { match self.scalar().partial_cmp(value).unwrap_or(Ordering::Less) { diff --git a/vortex-array/src/array/constant/variants.rs b/vortex-array/src/array/constant/variants.rs index 28c9a22509..c8fd644848 100644 --- a/vortex-array/src/array/constant/variants.rs +++ b/vortex-array/src/array/constant/variants.rs @@ -1,3 +1,5 @@ +use std::iter; + use vortex_dtype::DType; use vortex_scalar::StructScalar; @@ -77,7 +79,27 @@ impl ArrayVariants for ConstantArray { impl NullArrayTrait for ConstantArray {} -impl BoolArrayTrait for ConstantArray {} +impl BoolArrayTrait for ConstantArray { + fn maybe_null_indices_iter(&self) -> Box> { + let value = self.scalar().value().as_bool().unwrap(); + if value.unwrap_or(false) { + Box::new(0..self.len()) + } else { + Box::new(iter::empty()) + } + } + + fn maybe_null_slices_iter(&self) -> Box> { + // Must be a boolean scalar + let value = self.scalar().value().as_bool().unwrap(); + + if value.unwrap_or(false) { + Box::new(iter::once((0, self.len()))) + } else { + Box::new(iter::empty()) + } + } +} impl PrimitiveArrayTrait for ConstantArray {} @@ -97,3 +119,27 @@ impl StructArrayTrait for ConstantArray { impl ListArrayTrait for ConstantArray {} impl ExtensionArrayTrait for ConstantArray {} + +#[cfg(test)] +mod test { + use itertools::Itertools; + use vortex_dtype::Nullability; + use vortex_scalar::Scalar; + + use crate::array::constant::ConstantArray; + use crate::variants::BoolArrayTrait; + + #[test] + fn constant_iter_true_test() { + let arr = ConstantArray::new(Scalar::bool(true, Nullability::NonNullable), 3); + assert_eq!(vec![0, 1, 2], arr.maybe_null_indices_iter().collect_vec()); + assert_eq!(vec![(0, 3)], arr.maybe_null_slices_iter().collect_vec()); + } + + #[test] + fn constant_iter_false_test() { + let arr = ConstantArray::new(Scalar::bool(false, Nullability::NonNullable), 3); + assert_eq!(0, arr.maybe_null_indices_iter().collect_vec().len()); + assert_eq!(0, arr.maybe_null_slices_iter().collect_vec().len()); + } +} diff --git a/vortex-array/src/array/primitive/compute/filter.rs b/vortex-array/src/array/primitive/compute/filter.rs new file mode 100644 index 0000000000..de4c2ccb01 --- /dev/null +++ b/vortex-array/src/array/primitive/compute/filter.rs @@ -0,0 +1,81 @@ +use vortex_dtype::{match_each_native_ptype, NativePType}; +use vortex_error::{vortex_err, VortexResult}; + +use crate::array::primitive::PrimitiveArray; +use crate::compute::FilterFn; +use crate::validity::filter_validity; +use crate::variants::BoolArrayTrait; +use crate::{Array, IntoArray}; + +impl FilterFn for PrimitiveArray { + fn filter(&self, predicate: &Array) -> VortexResult { + filter_select_primitive(self, predicate).map(|a| a.into_array()) + } +} + +fn filter_select_primitive( + arr: &PrimitiveArray, + predicate: &Array, +) -> VortexResult { + predicate.with_dyn(|b| { + let validity = filter_validity(arr.validity(), predicate)?; + let predicate = b.as_bool_array().ok_or_else(||vortex_err!( + NotImplemented: "as_bool_array", + predicate.encoding().id() + ))?; + let selection_count = predicate.true_count(); + match_each_native_ptype!(arr.ptype(), |$T| { + let slice = arr.maybe_null_slice::<$T>(); + Ok(PrimitiveArray::from_vec(filter_primitive_slice(slice, predicate, selection_count), validity)) + }) + }) +} + +pub fn filter_primitive_slice( + arr: &[T], + predicate: &dyn BoolArrayTrait, + selection_count: usize, +) -> Vec { + let mut chunks = Vec::with_capacity(selection_count); + if selection_count * 2 > predicate.len() { + predicate.maybe_null_slices_iter().for_each(|(start, end)| { + chunks.extend_from_slice(&arr[start..end]); + }); + } else { + chunks.extend(predicate.maybe_null_indices_iter().map(|idx| arr[idx])); + } + chunks +} + +#[cfg(test)] +mod test { + use itertools::Itertools; + + use crate::array::bool::BoolArray; + use crate::array::primitive::compute::filter::filter_select_primitive; + use crate::array::primitive::PrimitiveArray; + + #[test] + fn filter_run_variant_mixed_test() { + let filter = vec![true, true, false, true, true, true, false, true]; + let arr = PrimitiveArray::from(vec![1u32, 24, 54, 2, 3, 2, 3, 2]); + + let filtered = + filter_select_primitive(&arr, BoolArray::from(filter.clone()).array()).unwrap(); + assert_eq!( + filtered.len(), + filter.iter().filter(|x| **x).collect_vec().len() + ); + + let rust_arr = arr.maybe_null_slice::(); + assert_eq!( + filtered.maybe_null_slice::().to_vec(), + filter + .iter() + .enumerate() + .filter(|(_idx, b)| **b) + .map(|m| rust_arr[m.0]) + .collect_vec() + ) + } +} diff --git a/vortex-array/src/array/primitive/compute/mod.rs b/vortex-array/src/array/primitive/compute/mod.rs index c72bd5c1a9..914ad12b55 100644 --- a/vortex-array/src/array/primitive/compute/mod.rs +++ b/vortex-array/src/array/primitive/compute/mod.rs @@ -5,6 +5,7 @@ use crate::compute::{ArrayCompute, CompareFn, FilterIndicesFn, SearchSortedFn, S mod cast; mod compare; mod fill; +mod filter; mod filter_indices; mod scalar_at; mod search_sorted; diff --git a/vortex-array/src/array/sparse/variants.rs b/vortex-array/src/array/sparse/variants.rs index 298c3cefa3..27b22336fa 100644 --- a/vortex-array/src/array/sparse/variants.rs +++ b/vortex-array/src/array/sparse/variants.rs @@ -77,7 +77,15 @@ impl ArrayVariants for SparseArray { impl NullArrayTrait for SparseArray {} -impl BoolArrayTrait for SparseArray {} +impl BoolArrayTrait for SparseArray { + fn maybe_null_indices_iter(&self) -> Box> { + Box::new(self.resolved_indices().into_iter()) + } + + fn maybe_null_slices_iter(&self) -> Box> { + todo!() + } +} impl PrimitiveArrayTrait for SparseArray {} diff --git a/vortex-array/src/array/varbin/builder.rs b/vortex-array/src/array/varbin/builder.rs index c5dea80bb4..121f4db87f 100644 --- a/vortex-array/src/array/varbin/builder.rs +++ b/vortex-array/src/array/varbin/builder.rs @@ -1,5 +1,6 @@ use arrow_buffer::NullBufferBuilder; use bytes::BytesMut; +use num_traits::AsPrimitive; use vortex_dtype::{DType, NativePType}; use crate::array::primitive::PrimitiveArray; @@ -46,6 +47,18 @@ impl VarBinBuilder { self.validity.append_null(); } + #[inline] + pub fn push_values(&mut self, values: &[u8], end_offsets: impl Iterator, num: usize) + where + O: 'static, + usize: AsPrimitive, + { + self.offsets + .extend(end_offsets.map(|offset| offset + self.data.len().as_())); + self.data.extend_from_slice(values); + self.validity.append_n_non_nulls(num); + } + pub fn finish(mut self, dtype: DType) -> VarBinArray { let offsets = PrimitiveArray::from(self.offsets); let data = PrimitiveArray::from_bytes(self.data.freeze(), Validity::NonNullable); diff --git a/vortex-array/src/array/varbin/compute/filter.rs b/vortex-array/src/array/varbin/compute/filter.rs new file mode 100644 index 0000000000..ea89d554c6 --- /dev/null +++ b/vortex-array/src/array/varbin/compute/filter.rs @@ -0,0 +1,262 @@ +use itertools::Itertools; +use num_traits::{AsPrimitive, Zero}; +use vortex_dtype::{match_each_integer_ptype, DType, NativePType}; +use vortex_error::{vortex_err, VortexResult}; + +use crate::array::varbin::builder::VarBinBuilder; +use crate::array::varbin::VarBinArray; +use crate::compute::FilterFn; +use crate::validity::Validity; +use crate::variants::BoolArrayTrait; +use crate::{Array, ArrayDType, IntoArray, IntoArrayVariant}; + +impl FilterFn for VarBinArray { + fn filter(&self, predicate: &Array) -> VortexResult { + filter_select_var_bin(self, predicate).map(|a| a.into_array()) + } +} + +fn filter_select_var_bin(arr: &VarBinArray, predicate: &Array) -> VortexResult { + predicate.with_dyn(|p| { + let predicate = p.as_bool_array().ok_or_else(|| { + vortex_err!( + NotImplemented: "as_bool_array", + predicate.encoding().id() + ) + })?; + let selection_count = predicate.true_count(); + if selection_count * 2 > predicate.len() { + filter_select_var_bin_by_slice(arr, predicate, selection_count) + } else { + filter_select_var_bin_by_index(arr, predicate, selection_count) + } + }) +} + +fn filter_select_var_bin_by_slice( + values: &VarBinArray, + predicate: &dyn BoolArrayTrait, + selection_count: usize, +) -> VortexResult { + let offsets = values.offsets().as_primitive(); + match_each_integer_ptype!(offsets.ptype(), |$O| { + filter_select_var_bin_by_slice_primitive_offset( + values.dtype().clone(), + offsets.maybe_null_slice::<$O>(), + values.bytes().into_primitive()?.maybe_null_slice::(), + predicate, + values.validity(), + selection_count + ) + }) +} + +fn filter_select_var_bin_by_slice_primitive_offset( + dtype: DType, + offsets: &[O], + data: &[u8], + predicate: &dyn BoolArrayTrait, + validity: Validity, + selection_count: usize, +) -> VortexResult +where + O: NativePType + 'static + Zero, + usize: AsPrimitive, +{ + let logical_validity = validity.to_logical(offsets.len() - 1); + if let Some(val) = logical_validity.to_null_buffer()? { + let mut builder = VarBinBuilder::::with_capacity(selection_count); + + predicate.maybe_null_slices_iter().for_each(|(start, end)| { + let null_sl = val.slice(start, end - start); + if null_sl.null_count() == 0 { + update_non_nullable_slice(data, offsets, &mut builder, start, end) + } else { + null_sl.iter().enumerate().for_each(|(idx, valid)| { + if valid { + let (s, e) = ( + offsets[idx + start].to_usize().unwrap(), + offsets[idx + start + 1].to_usize().unwrap(), + ); + builder.push_value(&data[s..e]) + } else { + builder.push_null() + } + }) + } + }); + + return Ok(builder.finish(dtype)); + } + + let mut builder = VarBinBuilder::::with_capacity(selection_count); + + predicate.maybe_null_slices_iter().for_each(|(start, end)| { + update_non_nullable_slice(data, offsets, &mut builder, start, end) + }); + + Ok(builder.finish(dtype)) +} + +fn update_non_nullable_slice( + data: &[u8], + offsets: &[O], + builder: &mut VarBinBuilder, + start: usize, + end: usize, +) where + O: NativePType + 'static + Zero + Copy, + usize: AsPrimitive, +{ + let (offset_start, offset_end) = (&offsets[start], &offsets[end]); + let new_data = &data[offset_start.to_usize().unwrap()..offset_end.to_usize().unwrap()]; + let new_offsets = offsets[start..end + 1] + .iter() + .map(|o| *o - *offset_start) + .dropping(1); + builder.push_values(new_data, new_offsets, end - start) +} + +fn filter_select_var_bin_by_index( + values: &VarBinArray, + predicate: &dyn BoolArrayTrait, + selection_count: usize, +) -> VortexResult { + let offsets = values.offsets().as_primitive(); + match_each_integer_ptype!(offsets.ptype(), |$O| { + filter_select_var_bin_by_index_primitive_offset( + values.dtype().clone(), + offsets.maybe_null_slice::<$O>(), + values.bytes().into_primitive()?.maybe_null_slice::(), + predicate, + values.validity(), + selection_count + ) + }) +} + +fn filter_select_var_bin_by_index_primitive_offset( + dtype: DType, + offsets: &[O], + data: &[u8], + predicate: &dyn BoolArrayTrait, + validity: Validity, + selection_count: usize, +) -> VortexResult { + let mut builder = VarBinBuilder::::with_capacity(selection_count); + predicate.maybe_null_indices_iter().for_each(|idx| { + if validity.is_valid(idx) { + let (start, end) = ( + offsets[idx].to_usize().unwrap(), + offsets[idx + 1].to_usize().unwrap(), + ); + builder.push(Some(&data[start..end])) + } else { + builder.push_null() + } + }); + Ok(builder.finish(dtype)) +} + +#[cfg(test)] +mod test { + use itertools::Itertools; + use vortex_dtype::DType; + use vortex_dtype::Nullability::{NonNullable, Nullable}; + use vortex_scalar::Scalar; + + use crate::array::bool::BoolArray; + use crate::array::primitive::PrimitiveArray; + use crate::array::varbin::compute::filter::{ + filter_select_var_bin_by_index, filter_select_var_bin_by_slice, + }; + use crate::array::varbin::VarBinArray; + use crate::compute::unary::scalar_at; + use crate::validity::Validity; + use crate::ToArray; + + fn nullable_scalar_str(s: &str) -> Scalar { + Scalar::utf8(s.to_owned(), Nullable) + } + + #[test] + fn filter_var_bin_test() { + let arr = VarBinArray::from_vec( + vec![ + b"hello".as_slice(), + b"world".as_slice(), + b"filter".as_slice(), + ], + DType::Utf8(NonNullable), + ); + let filter = BoolArray::from(vec![true, false, true]); + + let buf = filter_select_var_bin_by_index(&arr, &filter, 2) + .unwrap() + .to_array(); + + assert_eq!(buf.len(), 2); + assert_eq!(scalar_at(&buf, 0).unwrap(), "hello".into()); + assert_eq!(scalar_at(&buf, 1).unwrap(), "filter".into()); + } + + #[test] + fn filter_var_bin_slice_test() { + let arr = VarBinArray::from_vec( + vec![ + b"hello".as_slice(), + b"world".as_slice(), + b"filter".as_slice(), + b"filter2".as_slice(), + b"filter3".as_slice(), + ], + DType::Utf8(NonNullable), + ); + let filter = BoolArray::from(vec![true, false, true, false, true]); + + let buf = filter_select_var_bin_by_slice(&arr, &filter, 3) + .unwrap() + .to_array(); + + assert_eq!(buf.len(), 3); + assert_eq!(scalar_at(&buf, 0).unwrap(), "hello".into()); + assert_eq!(scalar_at(&buf, 1).unwrap(), "filter".into()); + assert_eq!(scalar_at(&buf, 2).unwrap(), "filter3".into()); + } + + #[test] + fn filter_var_bin_slice_null_test() { + let x = vec![ + b"one".as_slice(), + b"two".as_slice(), + b"three".as_slice(), + b"four".as_slice(), + b"five".as_slice(), + b"six".as_slice(), + ] + .into_iter() + .flat_map(|x| x.iter().cloned()) + .collect_vec(); + + let bytes = PrimitiveArray::from(x).to_array(); + + let offsets = PrimitiveArray::from(vec![0, 3, 6, 11, 15, 19, 22]).to_array(); + let validity = + Validity::Array(BoolArray::from(vec![true, false, true, true, true, true]).to_array()); + let arr = VarBinArray::try_new(offsets, bytes, DType::Utf8(Nullable), validity).unwrap(); + let filter = BoolArray::from(vec![true, true, true, false, true, true]); + + let buf = filter_select_var_bin_by_slice(&arr, &filter, 5) + .unwrap() + .to_array(); + + let null = Scalar::null(DType::Utf8(Nullable)); + assert_eq!(buf.len(), 5); + + assert_eq!(scalar_at(&buf, 0).unwrap(), nullable_scalar_str("one")); + assert_eq!(scalar_at(&buf, 1).unwrap(), null); + assert_eq!(scalar_at(&buf, 2).unwrap(), nullable_scalar_str("three")); + assert_eq!(scalar_at(&buf, 3).unwrap(), nullable_scalar_str("five")); + assert_eq!(scalar_at(&buf, 4).unwrap(), nullable_scalar_str("six")); + } +} diff --git a/vortex-array/src/array/varbin/compute/mod.rs b/vortex-array/src/array/varbin/compute/mod.rs index 312699da2f..e56dfac773 100644 --- a/vortex-array/src/array/varbin/compute/mod.rs +++ b/vortex-array/src/array/varbin/compute/mod.rs @@ -7,6 +7,7 @@ use crate::compute::{ArrayCompute, SliceFn, TakeFn}; use crate::validity::ArrayValidity; use crate::ArrayDType; +mod filter; mod slice; mod take; diff --git a/vortex-array/src/compute/filter.rs b/vortex-array/src/compute/filter.rs index 05f4348fdd..230ae3f609 100644 --- a/vortex-array/src/compute/filter.rs +++ b/vortex-array/src/compute/filter.rs @@ -7,7 +7,7 @@ use crate::{Array, ArrayDType, ArrayData, IntoArray, IntoCanonical}; pub trait FilterFn { /// Filter an array by the provided predicate. - fn filter(&self, predicate: &Array) -> Array; + fn filter(&self, predicate: &Array) -> VortexResult; } /// Return a new array by applying a boolean predicate to select items from a base Array. @@ -34,7 +34,7 @@ pub fn filter(array: &Array, predicate: &Array) -> VortexResult { array.with_dyn(|a| { if let Some(filter_fn) = a.filter() { - Ok(filter_fn.filter(array)) + filter_fn.filter(predicate) } else { // Fallback: implement using Arrow kernels. let array_ref = array.clone().into_canonical()?.into_arrow(); diff --git a/vortex-array/src/compute/unary/mod.rs b/vortex-array/src/compute/unary/mod.rs index 488ba26cc3..d59959b99f 100644 --- a/vortex-array/src/compute/unary/mod.rs +++ b/vortex-array/src/compute/unary/mod.rs @@ -1,9 +1,9 @@ -mod cast; -mod fill_forward; -mod scalar_at; -mod scalar_subtract; - pub use cast::{try_cast, CastFn}; pub use fill_forward::{fill_forward, FillForwardFn}; pub use scalar_at::{scalar_at, ScalarAtFn}; pub use scalar_subtract::{subtract_scalar, SubtractScalarFn}; + +mod cast; +mod fill_forward; +mod scalar_at; +mod scalar_subtract; diff --git a/vortex-array/src/implementation.rs b/vortex-array/src/implementation.rs index d8b0562a62..4cbaa086c7 100644 --- a/vortex-array/src/implementation.rs +++ b/vortex-array/src/implementation.rs @@ -6,8 +6,8 @@ use crate::encoding::{ArrayEncoding, ArrayEncodingExt, ArrayEncodingRef, Encodin use crate::stats::{ArrayStatistics, Statistics}; use crate::visitor::ArrayVisitor; use crate::{ - Array, ArrayDType, ArrayData, ArrayMetadata, ArrayTrait, AsArray, GetArrayMetadata, IntoArray, - IntoArrayData, ToArrayData, TryDeserializeArrayMetadata, + Array, ArrayDType, ArrayData, ArrayLen, ArrayMetadata, ArrayTrait, AsArray, GetArrayMetadata, + IntoArray, IntoArrayData, ToArrayData, TryDeserializeArrayMetadata, }; /// Trait the defines the set of types relating to an array. @@ -173,6 +173,22 @@ impl ArrayDType for T { } } +impl ArrayLen for T { + fn len(&self) -> usize { + match self.as_array_ref() { + Array::Data(d) => d.len(), + Array::View(v) => v.len(), + } + } + + fn is_empty(&self) -> bool { + match self.as_array_ref() { + Array::Data(d) => d.is_empty(), + Array::View(v) => v.is_empty(), + } + } +} + impl ArrayStatistics for T { fn statistics(&self) -> &(dyn Statistics + '_) { match self.as_array_ref() { diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 07d1aab079..5c7f4f3c79 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -231,6 +231,7 @@ pub trait ArrayTrait: ArrayEncodingRef + ArrayCompute + ArrayDType + + ArrayLen + ArrayVariants + IntoCanonical + ArrayValidity @@ -251,6 +252,12 @@ pub trait ArrayDType { fn dtype(&self) -> &DType; } +pub trait ArrayLen { + fn len(&self) -> usize; + + fn is_empty(&self) -> bool; +} + struct NBytesVisitor(usize); impl ArrayVisitor for NBytesVisitor { diff --git a/vortex-array/src/validity.rs b/vortex-array/src/validity.rs index dc1e53d64c..741745b515 100644 --- a/vortex-array/src/validity.rs +++ b/vortex-array/src/validity.rs @@ -5,7 +5,7 @@ use vortex_error::{vortex_bail, VortexResult}; use crate::array::bool::BoolArray; use crate::compute::unary::scalar_at; -use crate::compute::{slice, take}; +use crate::compute::{filter, slice, take}; use crate::stats::ArrayStatistics; use crate::{Array, IntoArray, IntoArrayVariant}; @@ -288,3 +288,10 @@ impl IntoArray for LogicalValidity { } } } + +pub fn filter_validity(validity: Validity, predicate: &Array) -> VortexResult { + match validity { + v @ (Validity::NonNullable | Validity::AllValid | Validity::AllInvalid) => Ok(v), + Validity::Array(arr) => Ok(Validity::Array(filter(&arr, predicate)?)), + } +} diff --git a/vortex-array/src/variants.rs b/vortex-array/src/variants.rs index c6a3aa0038..710074dd9e 100644 --- a/vortex-array/src/variants.rs +++ b/vortex-array/src/variants.rs @@ -74,7 +74,30 @@ pub trait ArrayVariants { pub trait NullArrayTrait: ArrayTrait {} -pub trait BoolArrayTrait: ArrayTrait {} +pub trait BoolArrayTrait: ArrayTrait { + fn true_count(&self) -> usize { + self.statistics() + .compute_true_count() + .unwrap_or_else(|| self.maybe_null_indices_iter().count()) + } + + // An iterator over the sorted indices of set values in the underlying boolean array + // good to array with low number of set values. + fn maybe_null_indices_iter<'a>(&'a self) -> Box + 'a>; + + // An iterator over the sorted disjoint contiguous range set values in the underlying boolean + // array good for arrays with only long runs of set values. + fn maybe_null_slices_iter<'a>(&'a self) -> Box + 'a>; + + // Other possible iterators include: + // - True(usize) | False(usize) | Mixed(BooleanBuffer) where True/False are long runs of either + // true or false values and mixed + // is everything else + // - T|F + [(usize, BooleanBuffer)] where usize represents an offset into the original array + // and the buffer is a slice of that array, omitted slices + // could be either true or false signified by the initial + // value returned. +} pub trait PrimitiveArrayTrait: ArrayTrait {}