diff --git a/bench-vortex/src/reader.rs b/bench-vortex/src/reader.rs index d3d0779690..5251000fe9 100644 --- a/bench-vortex/src/reader.rs +++ b/bench-vortex/src/reader.rs @@ -123,6 +123,7 @@ async fn take_vortex( LayoutContext::default().into(), ), ) + .with_io_dispatcher(DISPATCHER.clone()) .with_indices(ArrayData::from(indices.to_vec())) .build() .await? diff --git a/encodings/datetime-parts/src/compute/filter.rs b/encodings/datetime-parts/src/compute/filter.rs new file mode 100644 index 0000000000..c7c337a986 --- /dev/null +++ b/encodings/datetime-parts/src/compute/filter.rs @@ -0,0 +1,17 @@ +use vortex_array::compute::{filter, FilterFn, FilterMask}; +use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; +use vortex_error::VortexResult; + +use crate::{DateTimePartsArray, DateTimePartsEncoding}; + +impl FilterFn for DateTimePartsEncoding { + fn filter(&self, array: &DateTimePartsArray, mask: FilterMask) -> VortexResult { + Ok(DateTimePartsArray::try_new( + array.dtype().clone(), + filter(array.days().as_ref(), mask.clone())?, + filter(array.seconds().as_ref(), mask.clone())?, + filter(array.subsecond().as_ref(), mask)?, + )? + .into_array()) + } +} diff --git a/encodings/datetime-parts/src/compute.rs b/encodings/datetime-parts/src/compute/mod.rs similarity index 92% rename from encodings/datetime-parts/src/compute.rs rename to encodings/datetime-parts/src/compute/mod.rs index aab4b28169..a1863e28b8 100644 --- a/encodings/datetime-parts/src/compute.rs +++ b/encodings/datetime-parts/src/compute/mod.rs @@ -1,7 +1,10 @@ +mod filter; +mod take; + use itertools::Itertools as _; use vortex_array::array::{PrimitiveArray, TemporalArray}; use vortex_array::compute::{ - scalar_at, slice, take, ComputeVTable, ScalarAtFn, SliceFn, TakeFn, TakeOptions, + scalar_at, slice, ComputeVTable, FilterFn, ScalarAtFn, SliceFn, TakeFn, }; use vortex_array::validity::ArrayValidity; use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant}; @@ -13,6 +16,10 @@ use vortex_scalar::Scalar; use crate::{DateTimePartsArray, DateTimePartsEncoding}; impl ComputeVTable for DateTimePartsEncoding { + fn filter_fn(&self) -> Option<&dyn FilterFn> { + Some(self) + } + fn scalar_at_fn(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -26,23 +33,6 @@ impl ComputeVTable for DateTimePartsEncoding { } } -impl TakeFn for DateTimePartsEncoding { - fn take( - &self, - array: &DateTimePartsArray, - indices: &ArrayData, - options: TakeOptions, - ) -> VortexResult { - Ok(DateTimePartsArray::try_new( - array.dtype().clone(), - take(array.days(), indices, options)?, - take(array.seconds(), indices, options)?, - take(array.subsecond(), indices, options)?, - )? - .into_array()) - } -} - impl SliceFn for DateTimePartsEncoding { fn slice( &self, diff --git a/encodings/datetime-parts/src/compute/take.rs b/encodings/datetime-parts/src/compute/take.rs new file mode 100644 index 0000000000..dcd697e326 --- /dev/null +++ b/encodings/datetime-parts/src/compute/take.rs @@ -0,0 +1,22 @@ +use vortex_array::compute::{take, TakeFn, TakeOptions}; +use vortex_array::{ArrayDType, ArrayData, IntoArrayData}; +use vortex_error::VortexResult; + +use crate::{DateTimePartsArray, DateTimePartsEncoding}; + +impl TakeFn for DateTimePartsEncoding { + fn take( + &self, + array: &DateTimePartsArray, + indices: &ArrayData, + options: TakeOptions, + ) -> VortexResult { + Ok(DateTimePartsArray::try_new( + array.dtype().clone(), + take(array.days(), indices, options)?, + take(array.seconds(), indices, options)?, + take(array.subsecond(), indices, options)?, + )? + .into_array()) + } +} diff --git a/vortex-array/src/array/chunked/compute/boolean.rs b/vortex-array/src/array/chunked/compute/boolean.rs new file mode 100644 index 0000000000..7011da02aa --- /dev/null +++ b/vortex-array/src/array/chunked/compute/boolean.rs @@ -0,0 +1,34 @@ +use vortex_dtype::{DType, Nullability}; +use vortex_error::VortexResult; + +use crate::array::{ChunkedArray, ChunkedEncoding}; +use crate::compute::{and, and_kleene, or, or_kleene, slice, BinaryBooleanFn, BinaryOperator}; +use crate::{ArrayData, IntoArrayData}; + +impl BinaryBooleanFn for ChunkedEncoding { + fn binary_boolean( + &self, + lhs: &ChunkedArray, + rhs: &ArrayData, + op: BinaryOperator, + ) -> VortexResult> { + let mut idx = 0; + let mut chunks = Vec::with_capacity(lhs.nchunks()); + + for chunk in lhs.chunks() { + let sliced = slice(rhs, idx, idx + chunk.len())?; + let result = match op { + BinaryOperator::And => and(&chunk, &sliced), + BinaryOperator::AndKleene => and_kleene(&chunk, &sliced), + BinaryOperator::Or => or(&chunk, &sliced), + BinaryOperator::OrKleene => or_kleene(&chunk, &sliced), + }; + chunks.push(result?); + idx += chunk.len(); + } + + Ok(Some( + ChunkedArray::try_new(chunks, DType::Bool(Nullability::Nullable))?.into_array(), + )) + } +} diff --git a/vortex-array/src/array/chunked/compute/compare.rs b/vortex-array/src/array/chunked/compute/compare.rs new file mode 100644 index 0000000000..5dcb31e7bb --- /dev/null +++ b/vortex-array/src/array/chunked/compute/compare.rs @@ -0,0 +1,30 @@ +use vortex_dtype::{DType, Nullability}; +use vortex_error::VortexResult; + +use crate::array::{ChunkedArray, ChunkedEncoding}; +use crate::compute::{compare, slice, CompareFn, Operator}; +use crate::{ArrayData, IntoArrayData}; + +impl CompareFn for ChunkedEncoding { + fn compare( + &self, + lhs: &ChunkedArray, + rhs: &ArrayData, + operator: Operator, + ) -> VortexResult> { + let mut idx = 0; + let mut compare_chunks = Vec::with_capacity(lhs.nchunks()); + + for chunk in lhs.chunks() { + let sliced = slice(rhs, idx, idx + chunk.len())?; + let cmp_result = compare(&chunk, &sliced, operator)?; + compare_chunks.push(cmp_result); + + idx += chunk.len(); + } + + Ok(Some( + ChunkedArray::try_new(compare_chunks, DType::Bool(Nullability::Nullable))?.into_array(), + )) + } +} diff --git a/vortex-array/src/array/chunked/compute/mod.rs b/vortex-array/src/array/chunked/compute/mod.rs index 1d1e5be899..befdb365aa 100644 --- a/vortex-array/src/array/chunked/compute/mod.rs +++ b/vortex-array/src/array/chunked/compute/mod.rs @@ -1,14 +1,16 @@ -use vortex_dtype::{DType, Nullability}; +use vortex_dtype::DType; use vortex_error::VortexResult; use crate::array::chunked::ChunkedArray; use crate::array::ChunkedEncoding; use crate::compute::{ - compare, slice, try_cast, CastFn, CompareFn, ComputeVTable, FilterFn, InvertFn, Operator, - ScalarAtFn, SliceFn, SubtractScalarFn, TakeFn, + try_cast, BinaryBooleanFn, CastFn, CompareFn, ComputeVTable, FilterFn, InvertFn, ScalarAtFn, + SliceFn, SubtractScalarFn, TakeFn, }; use crate::{ArrayData, IntoArrayData}; +mod boolean; +mod compare; mod filter; mod invert; mod scalar_at; @@ -16,6 +18,10 @@ mod slice; mod take; impl ComputeVTable for ChunkedEncoding { + fn binary_boolean_fn(&self) -> Option<&dyn BinaryBooleanFn> { + Some(self) + } + fn cast_fn(&self) -> Option<&dyn CastFn> { Some(self) } @@ -59,30 +65,6 @@ impl CastFn for ChunkedEncoding { } } -impl CompareFn for ChunkedEncoding { - fn compare( - &self, - lhs: &ChunkedArray, - rhs: &ArrayData, - operator: Operator, - ) -> VortexResult> { - let mut idx = 0; - let mut compare_chunks = Vec::with_capacity(lhs.nchunks()); - - for chunk in lhs.chunks() { - let sliced = slice(rhs, idx, idx + chunk.len())?; - let cmp_result = compare(&chunk, &sliced, operator)?; - compare_chunks.push(cmp_result); - - idx += chunk.len(); - } - - Ok(Some( - ChunkedArray::try_new(compare_chunks, DType::Bool(Nullability::Nullable))?.into_array(), - )) - } -} - #[cfg(test)] mod test { use vortex_dtype::{DType, Nullability, PType}; diff --git a/vortex-array/src/canonical.rs b/vortex-array/src/canonical.rs index 0fb725cf45..358c19b8c0 100644 --- a/vortex-array/src/canonical.rs +++ b/vortex-array/src/canonical.rs @@ -381,7 +381,10 @@ where /// the array's internal codec. impl IntoCanonical for ArrayData { fn into_canonical(self) -> VortexResult { - log::debug!("Canonicalizing array with encoding {:?}", self.encoding()); + // We only care to know when we canonicalize something non-trivial. + if !self.is_canonical() && self.len() > 1 { + log::debug!("Canonicalizing array with encoding {:?}", self.encoding()); + } self.encoding().into_canonical(self) } } diff --git a/vortex-array/src/compute/boolean.rs b/vortex-array/src/compute/boolean.rs index 9cb0347b6f..8004493c2b 100644 --- a/vortex-array/src/compute/boolean.rs +++ b/vortex-array/src/compute/boolean.rs @@ -93,7 +93,7 @@ fn binary_boolean(lhs: &ArrayData, rhs: &ArrayData, op: BinaryOperator) -> Vorte } // If the RHS is constant and the LHS is Arrow, we can't do any better than arrow_compare. - if lhs.is_arrow() && rhs.is_constant() { + if lhs.is_arrow() && (rhs.is_arrow() || rhs.is_constant()) { return arrow_boolean(lhs.clone(), rhs.clone(), op); } @@ -104,13 +104,6 @@ fn binary_boolean(lhs: &ArrayData, rhs: &ArrayData, op: BinaryOperator) -> Vorte .and_then(|f| f.binary_boolean(lhs, rhs, op).transpose()) { return result; - } else { - log::debug!( - "No boolean implementation found for LHS {}, RHS {}, and operator {:?}", - lhs.encoding().id(), - rhs.encoding().id(), - op, - ); } if let Some(result) = rhs @@ -119,15 +112,15 @@ fn binary_boolean(lhs: &ArrayData, rhs: &ArrayData, op: BinaryOperator) -> Vorte .and_then(|f| f.binary_boolean(rhs, lhs, op).transpose()) { return result; - } else { - log::debug!( - "No boolean implementation found for LHS {}, RHS {}, and operator {:?}", - rhs.encoding().id(), - lhs.encoding().id(), - op, - ); } + log::debug!( + "No boolean implementation found for LHS {}, RHS {}, and operator {:?} (or inverse)", + rhs.encoding().id(), + lhs.encoding().id(), + op, + ); + // If neither side implements the trait, then we delegate to Arrow compute. arrow_boolean(lhs.clone(), rhs.clone(), op) } diff --git a/vortex-array/src/compute/compare.rs b/vortex-array/src/compute/compare.rs index 084f9ec3f6..9b7370c884 100644 --- a/vortex-array/src/compute/compare.rs +++ b/vortex-array/src/compute/compare.rs @@ -123,7 +123,7 @@ pub fn compare( } // If the RHS is constant and the LHS is Arrow, we can't do any better than arrow_compare. - if left.is_arrow() && right.is_constant() { + if left.is_arrow() && (right.is_arrow() || right.is_constant()) { return arrow_compare(left, right, operator); } @@ -133,13 +133,6 @@ pub fn compare( .and_then(|f| f.compare(left, right, operator).transpose()) { return result; - } else { - log::debug!( - "No compare implementation found for LHS {}, RHS {}, and operator {}", - left.encoding().id(), - right.encoding().id(), - operator, - ); } if let Some(result) = right @@ -148,15 +141,15 @@ pub fn compare( .and_then(|f| f.compare(right, left, operator.swap()).transpose()) { return result; - } else { - log::debug!( - "No compare implementation found for LHS {}, RHS {}, and operator {}", - right.encoding().id(), - left.encoding().id(), - operator.swap(), - ); } + log::debug!( + "No compare implementation found for LHS {}, RHS {}, and operator {} (or inverse)", + right.encoding().id(), + left.encoding().id(), + operator.swap(), + ); + // Fallback to arrow on canonical types arrow_compare(left, right, operator) } diff --git a/vortex-array/src/data/mod.rs b/vortex-array/src/data/mod.rs index 4763c9c762..67dabbc571 100644 --- a/vortex-array/src/data/mod.rs +++ b/vortex-array/src/data/mod.rs @@ -393,8 +393,10 @@ impl> ArrayStatistics for T { } } + // FIXME(ngates): this is really slow... fn inherit_statistics(&self, parent: &dyn Statistics) { let stats = self.statistics(); + // The to_set call performs a slow clone of the stats for (stat, scalar) in parent.to_set() { stats.set(stat, scalar); } diff --git a/vortex-file/src/read/mask.rs b/vortex-file/src/read/mask.rs index 723a862208..0c5117a868 100644 --- a/vortex-file/src/read/mask.rs +++ b/vortex-file/src/read/mask.rs @@ -2,7 +2,7 @@ use std::cmp::{max, min}; use std::fmt::{Display, Formatter}; use vortex_array::array::{BoolArray, ConstantArray, PrimitiveArray, SparseArray}; -use vortex_array::compute::{and, filter, slice, take, try_cast, FilterMask, TakeOptions}; +use vortex_array::compute::{and, filter, slice, try_cast, FilterMask}; use vortex_array::stats::ArrayStatistics; use vortex_array::validity::{ArrayValidity, LogicalValidity}; use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant}; @@ -10,8 +10,6 @@ use vortex_dtype::Nullability::NonNullable; use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, VortexExpect, VortexResult, VortexUnwrap}; -const PREFER_TAKE_TO_FILTER_DENSITY: f64 = 1.0 / 1024.0; - /// Bitmap of selected rows within given [begin, end) row range #[derive(Debug, Clone)] pub struct RowMask { @@ -213,13 +211,8 @@ impl RowMask { return Ok(Some(sliced.clone())); } - if (true_count as f64 / sliced.len() as f64) < PREFER_TAKE_TO_FILTER_DENSITY { - let indices = self.to_indices_array()?; - take(sliced, indices, TakeOptions::default()).map(Some) - } else { - let mask = FilterMask::try_from(self.bitmask.clone())?; - filter(sliced, mask).map(Some) - } + let mask = FilterMask::try_from(self.bitmask.clone())?; + filter(sliced, mask).map(Some) } pub fn to_indices_array(&self) -> VortexResult {