From 9b8a2b41f26114741d47d677867c06828579d4c2 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 3 Sep 2024 13:23:50 +0100 Subject: [PATCH] Fix issues discovered by fuzzer (#707) --- encodings/alp/src/array.rs | 1 + encodings/alp/src/compute.rs | 6 +- encodings/byte-bool/src/compute/mod.rs | 8 +-- .../src/bitpacking/compute/scalar_at.rs | 2 +- .../fastlanes/src/bitpacking/compute/take.rs | 13 +--- encodings/fastlanes/src/for/compute.rs | 4 +- encodings/fsst/src/compute.rs | 13 +--- fuzz/fuzz_targets/fuzz_target_1.rs | 12 ++-- fuzz/src/lib.rs | 10 +-- vortex-array/src/array/arbitrary.rs | 8 +-- .../src/array/bool/compute/scalar_at.rs | 8 ++- vortex-array/src/array/bool/stats.rs | 61 +++++++++++-------- vortex-array/src/array/chunked/compute/mod.rs | 8 +-- vortex-array/src/array/constant/compute.rs | 8 +-- vortex-array/src/array/null/compute.rs | 2 +- vortex-array/src/array/primitive/stats.rs | 52 ++++++++-------- vortex-array/src/array/varbin/stats.rs | 20 ++++-- vortex-array/src/array/varbinview/accessor.rs | 4 +- vortex-array/src/array/varbinview/mod.rs | 4 +- vortex-array/src/compress.rs | 3 +- 20 files changed, 128 insertions(+), 119 deletions(-) diff --git a/encodings/alp/src/array.rs b/encodings/alp/src/array.rs index 5bc569148b..f087e73fbb 100644 --- a/encodings/alp/src/array.rs +++ b/encodings/alp/src/array.rs @@ -124,6 +124,7 @@ struct ALPAccessor { validity: Validity, exponents: Exponents, } + impl ALPAccessor { fn new( encoded: AccessorRef, diff --git a/encodings/alp/src/compute.rs b/encodings/alp/src/compute.rs index af93f5af8d..a695a346ac 100644 --- a/encodings/alp/src/compute.rs +++ b/encodings/alp/src/compute.rs @@ -1,6 +1,6 @@ use vortex::compute::unary::{scalar_at_unchecked, ScalarAtFn}; use vortex::compute::{slice, take, ArrayCompute, SliceFn, TakeFn}; -use vortex::{Array, IntoArray}; +use vortex::{Array, ArrayDType, IntoArray}; use vortex_error::VortexResult; use vortex_scalar::Scalar; @@ -40,10 +40,10 @@ impl ScalarAtFn for ALPArray { match_each_alp_float_ptype!(self.ptype(), |$T| { let encoded_val: <$T as ALPFloat>::ALPInt = encoded_val.as_ref().try_into().unwrap(); - Scalar::from(<$T as ALPFloat>::decode_single( + Scalar::primitive(<$T as ALPFloat>::decode_single( encoded_val, self.exponents(), - )) + ), self.dtype().nullability()) }) } } diff --git a/encodings/byte-bool/src/compute/mod.rs b/encodings/byte-bool/src/compute/mod.rs index eac1d3ed16..e3be395ab2 100644 --- a/encodings/byte-bool/src/compute/mod.rs +++ b/encodings/byte-bool/src/compute/mod.rs @@ -8,7 +8,7 @@ use vortex::validity::{ArrayValidity, Validity}; use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant}; use vortex_dtype::{match_each_integer_ptype, Nullability}; use vortex_error::{vortex_err, VortexResult}; -use vortex_scalar::{Scalar, ScalarValue}; +use vortex_scalar::Scalar; use super::ByteBoolArray; @@ -40,10 +40,7 @@ impl ScalarAtFn for ByteBoolArray { } fn scalar_at_unchecked(&self, index: usize) -> Scalar { - Scalar::new( - self.dtype().clone(), - ScalarValue::Bool(self.buffer()[index] == 1), - ) + Scalar::bool(self.buffer()[index] == 1, self.dtype().nullability()) } } @@ -179,6 +176,7 @@ mod tests { use vortex::compute::unary::{scalar_at, scalar_at_unchecked}; use vortex::compute::{compare, slice}; use vortex::AsArray as _; + use vortex_scalar::ScalarValue; use super::*; diff --git a/encodings/fastlanes/src/bitpacking/compute/scalar_at.rs b/encodings/fastlanes/src/bitpacking/compute/scalar_at.rs index 9a7d5c2fd7..286b66c8b8 100644 --- a/encodings/fastlanes/src/bitpacking/compute/scalar_at.rs +++ b/encodings/fastlanes/src/bitpacking/compute/scalar_at.rs @@ -9,7 +9,7 @@ impl ScalarAtFn for BitPackedArray { fn scalar_at(&self, index: usize) -> VortexResult { if let Some(patches) = self.patches() { // NB: All non-null values are considered patches - if self.bit_width() == 0 || patches.with_dyn(|a| a.is_valid(index)) { + if patches.with_dyn(|a| a.is_valid(index)) { return scalar_at_unchecked(&patches, index).cast(self.dtype()); } } diff --git a/encodings/fastlanes/src/bitpacking/compute/take.rs b/encodings/fastlanes/src/bitpacking/compute/take.rs index f011d558d2..e5b74afa76 100644 --- a/encodings/fastlanes/src/bitpacking/compute/take.rs +++ b/encodings/fastlanes/src/bitpacking/compute/take.rs @@ -2,14 +2,13 @@ use std::cmp::min; use fastlanes::BitPacking; use itertools::Itertools; -use vortex::array::{ConstantArray, PrimitiveArray, SparseArray}; +use vortex::array::{PrimitiveArray, SparseArray}; use vortex::compute::{slice, take, TakeFn}; use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant}; use vortex_dtype::{ match_each_integer_ptype, match_each_unsigned_integer_ptype, NativePType, PType, }; use vortex_error::VortexResult; -use vortex_scalar::Scalar; use crate::{unpack_single_primitive, BitPackedArray}; @@ -18,16 +17,6 @@ impl TakeFn for BitPackedArray { let ptype: PType = self.dtype().try_into()?; let validity = self.validity(); let taken_validity = validity.take(indices)?; - if self.bit_width() == 0 { - return if let Some(patches) = self.patches() { - take(&patches, indices) - } else { - Ok( - ConstantArray::new(Scalar::null(self.dtype().as_nullable()), indices.len()) - .into_array(), - ) - }; - } let indices = indices.clone().into_primitive()?; let taken = match_each_unsigned_integer_ptype!(ptype, |$T| { diff --git a/encodings/fastlanes/src/for/compute.rs b/encodings/fastlanes/src/for/compute.rs index 538333647f..5b1566b684 100644 --- a/encodings/fastlanes/src/for/compute.rs +++ b/encodings/fastlanes/src/for/compute.rs @@ -6,7 +6,7 @@ use vortex::compute::{ use vortex::{Array, ArrayDType, IntoArray}; use vortex_dtype::match_each_integer_ptype; use vortex_error::VortexResult; -use vortex_scalar::{PrimitiveScalar, Scalar, ScalarValue}; +use vortex_scalar::{PrimitiveScalar, Scalar}; use crate::FoRArray; @@ -54,7 +54,7 @@ impl ScalarAtFn for FoRArray { use num_traits::WrappingAdd; encoded.typed_value::<$P>().map(|v| (v << self.shift()).wrapping_add(reference.typed_value::<$P>().unwrap())) .map(|v| Scalar::primitive::<$P>(v, encoded.dtype().nullability())) - .unwrap_or_else(|| Scalar::new(encoded.dtype().clone(), ScalarValue::Null)) + .unwrap_or_else(|| Scalar::null(encoded.dtype().clone())) }) } } diff --git a/encodings/fsst/src/compute.rs b/encodings/fsst/src/compute.rs index c3bb4823ab..cc55ccd6c8 100644 --- a/encodings/fsst/src/compute.rs +++ b/encodings/fsst/src/compute.rs @@ -3,7 +3,7 @@ use vortex::compute::unary::{scalar_at_unchecked, ScalarAtFn}; use vortex::compute::{filter, slice, take, ArrayCompute, FilterFn, SliceFn, TakeFn}; use vortex::{Array, ArrayDType, IntoArray}; use vortex_buffer::Buffer; -use vortex_error::{vortex_bail, VortexResult}; +use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::FSSTArray; @@ -50,16 +50,7 @@ impl TakeFn for FSSTArray { impl ScalarAtFn for FSSTArray { fn scalar_at(&self, index: usize) -> VortexResult { - let compressed = scalar_at_unchecked(&self.codes(), index); - let binary_datum = match compressed.value().as_buffer()? { - Some(b) => b, - None => vortex_bail!("non-nullable scalar must unwrap"), - }; - - let decompressor = self.decompressor(); - let decoded_buffer: Buffer = decompressor.decompress(binary_datum.as_slice()).into(); - - Ok(varbin_scalar(decoded_buffer, self.dtype())) + Ok(self.scalar_at_unchecked(index)) } fn scalar_at_unchecked(&self, index: usize) -> Scalar { diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index 5579e40f4d..4ad490afb3 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -4,7 +4,7 @@ use std::collections::HashSet; use libfuzzer_sys::{fuzz_target, Corpus}; use vortex::compute::slice; -use vortex::compute::unary::scalar_at_unchecked; +use vortex::compute::unary::scalar_at; use vortex::encoding::EncodingId; use vortex::Array; use vortex_fuzz::{Action, FuzzArrayAction}; @@ -26,7 +26,7 @@ fuzz_target!(|fuzz_action: FuzzArrayAction| -> Corpus { assert_array_eq(&array, &compressed_array); Corpus::Keep } - None => return Corpus::Reject, + None => Corpus::Reject, }, Action::Slice(range) => { let slice = slice(&array, range.start, range.end).unwrap(); @@ -48,8 +48,8 @@ fn fuzz_compress(array: &Array, compressor_ref: CompressorRef<'_>) -> Option, } -impl std::fmt::Debug for FuzzArrayAction { +impl Debug for FuzzArrayAction { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("FuzzArrayAction") .field("action", &self.actions) @@ -27,13 +29,12 @@ impl std::fmt::Debug for FuzzArrayAction { } } -#[derive()] pub enum Action { Compress(&'static dyn EncodingCompressor), Slice(Range), } -impl std::fmt::Debug for Action { +impl Debug for Action { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Slice(arg0) => f.debug_tuple("Slice").field(arg0).finish(), @@ -45,7 +46,7 @@ impl std::fmt::Debug for Action { impl<'a> Arbitrary<'a> for FuzzArrayAction { fn arbitrary(u: &mut Unstructured<'a>) -> Result { let array = Array::arbitrary(u)?; - let action = match u.int_in_range(0..=9)? { + let action = match u.int_in_range(0..=10)? { 0 => { let start = u.choose_index(array.len())?; let stop = u.choose_index(array.len() - start)? + start; @@ -60,6 +61,7 @@ impl<'a> Arbitrary<'a> for FuzzArrayAction { 7 => Action::Compress(&DEFAULT_RUN_END_COMPRESSOR), 8 => Action::Compress(&SparseCompressor), 9 => Action::Compress(&ZigZagCompressor), + 10 => Action::Compress(&DateTimePartsCompressor), _ => unreachable!(), }; diff --git a/vortex-array/src/array/arbitrary.rs b/vortex-array/src/array/arbitrary.rs index d8f7451096..5642a7fe3f 100644 --- a/vortex-array/src/array/arbitrary.rs +++ b/vortex-array/src/array/arbitrary.rs @@ -13,7 +13,7 @@ impl<'a> Arbitrary<'a> for Array { } fn random_array(u: &mut Unstructured) -> Result { - match u.int_in_range(0..=12).unwrap() { + match u.int_in_range(0..=12)? { 0 => random_primitive::(u), 1 => random_primitive::(u), 2 => random_primitive::(u), @@ -33,7 +33,7 @@ fn random_array(u: &mut Unstructured) -> Result { fn random_string(u: &mut Unstructured) -> Result { let v = Vec::>::arbitrary(u)?; - let arr = match u.int_in_range(0..=1).unwrap() { + let arr = match u.int_in_range(0..=1)? { 0 => VarBinArray::from_iter(v, DType::Utf8(Nullability::Nullable)).into_array(), 1 => VarBinViewArray::from_iter_nullable_str(v).into_array(), _ => unreachable!(), @@ -44,8 +44,8 @@ fn random_string(u: &mut Unstructured) -> Result { fn random_bytes(u: &mut Unstructured) -> Result { let v = Vec::>>::arbitrary(u)?; - let arr = match u.int_in_range(0..=1).unwrap() { - 0 => VarBinArray::from_iter(v, DType::Utf8(Nullability::Nullable)).into_array(), + let arr = match u.int_in_range(0..=1)? { + 0 => VarBinArray::from_iter(v, DType::Binary(Nullability::Nullable)).into_array(), 1 => VarBinViewArray::from_iter_nullable_bin(v).into_array(), _ => unreachable!(), }; diff --git a/vortex-array/src/array/bool/compute/scalar_at.rs b/vortex-array/src/array/bool/compute/scalar_at.rs index b8ba6a2ed3..12f0227948 100644 --- a/vortex-array/src/array/bool/compute/scalar_at.rs +++ b/vortex-array/src/array/bool/compute/scalar_at.rs @@ -3,6 +3,7 @@ use vortex_scalar::Scalar; use crate::array::BoolArray; use crate::compute::unary::ScalarAtFn; +use crate::ArrayDType; impl ScalarAtFn for BoolArray { fn scalar_at(&self, index: usize) -> VortexResult { @@ -12,6 +13,11 @@ impl ScalarAtFn for BoolArray { fn scalar_at_unchecked(&self, index: usize) -> Scalar { // SAFETY: // `scalar_at_unchecked` is fine with undefined behavior, so it should be acceptable here - unsafe { self.boolean_buffer().value_unchecked(index).into() } + unsafe { + Scalar::bool( + self.boolean_buffer().value_unchecked(index), + self.dtype().nullability(), + ) + } } } diff --git a/vortex-array/src/array/bool/stats.rs b/vortex-array/src/array/bool/stats.rs index cc60ca4ced..4c49264999 100644 --- a/vortex-array/src/array/bool/stats.rs +++ b/vortex-array/src/array/bool/stats.rs @@ -1,7 +1,8 @@ use std::collections::HashMap; use arrow_buffer::BooleanBuffer; -use vortex_error::{vortex_err, VortexResult}; +use vortex_dtype::{DType, Nullability}; +use vortex_error::VortexResult; use crate::array::BoolArray; use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet}; @@ -36,21 +37,24 @@ impl ArrayStatisticsCompute for NullableBools<'_> { .enumerate() .skip_while(|(_, valid)| !*valid) .map(|(idx, _)| idx) - .next() - .ok_or_else(|| vortex_err!("Must be at least one non-null value"))?; - - let mut stats = BoolStatsAccumulator::new_with_leading_nulls( - self.0.value(first_non_null_idx), - first_non_null_idx, - ); - - self.0 - .iter() - .zip(self.1.iter()) - .skip(first_non_null_idx + 1) - .map(|(next, valid)| valid.then_some(next)) - .for_each(|next| stats.nullable_next(next)); - Ok(stats.into_map(self.0.len())) + .next(); + + if let Some(first_non_null) = first_non_null_idx { + let mut acc = BoolStatsAccumulator::new(self.0.value(first_non_null)); + acc.n_nulls(first_non_null); + self.0 + .iter() + .zip(self.1.iter()) + .skip(first_non_null + 1) + .map(|(next, valid)| valid.then_some(next)) + .for_each(|next| acc.nullable_next(next)); + Ok(acc.finish()) + } else { + Ok(StatsSet::nulls( + self.0.len(), + &DType::Bool(Nullability::Nullable), + )) + } } } @@ -58,7 +62,7 @@ impl ArrayStatisticsCompute for BooleanBuffer { fn compute_statistics(&self, _stat: Stat) -> VortexResult { let mut stats = BoolStatsAccumulator::new(self.value(0)); self.iter().skip(1).for_each(|next| stats.next(next)); - Ok(stats.into_map(self.len())) + Ok(stats.finish()) } } @@ -68,6 +72,7 @@ struct BoolStatsAccumulator { run_count: usize, null_count: usize, true_count: usize, + len: usize, } impl BoolStatsAccumulator { @@ -78,13 +83,13 @@ impl BoolStatsAccumulator { run_count: 1, null_count: 0, true_count: if first_value { 1 } else { 0 }, + len: 1, } } - fn new_with_leading_nulls(first_value: bool, leading_null_count: usize) -> Self { - let mut stats = Self::new(first_value); - stats.null_count += leading_null_count; - stats + fn n_nulls(&mut self, n_nulls: usize) { + self.null_count += n_nulls; + self.len += n_nulls; } pub fn nullable_next(&mut self, next: Option) { @@ -92,11 +97,14 @@ impl BoolStatsAccumulator { Some(n) => self.next(n), None => { self.null_count += 1; + self.len += 1; } } } pub fn next(&mut self, next: bool) { + self.len += 1; + if next { self.true_count += 1 } @@ -109,18 +117,21 @@ impl BoolStatsAccumulator { } } - pub fn into_map(self, len: usize) -> StatsSet { + pub fn finish(self) -> StatsSet { StatsSet::from(HashMap::from([ - (Stat::Min, (self.true_count == len).into()), + (Stat::Min, (self.true_count == self.len).into()), (Stat::Max, (self.true_count > 0).into()), ( Stat::IsConstant, - (self.true_count == len || self.true_count == 0).into(), + (self.null_count == 0 && (self.true_count == self.len || self.true_count == 0) + || self.null_count == self.len) + .into(), ), (Stat::IsSorted, self.is_sorted.into()), ( Stat::IsStrictSorted, - (self.is_sorted && (len < 2 || (len == 2 && self.true_count == 1))).into(), + (self.is_sorted && (self.len < 2 || (self.len == 2 && self.true_count == 1))) + .into(), ), (Stat::RunCount, self.run_count.into()), (Stat::TrueCount, self.true_count.into()), diff --git a/vortex-array/src/array/chunked/compute/mod.rs b/vortex-array/src/array/chunked/compute/mod.rs index a5d7dee690..e8be9d1e15 100644 --- a/vortex-array/src/array/chunked/compute/mod.rs +++ b/vortex-array/src/array/chunked/compute/mod.rs @@ -17,6 +17,10 @@ impl ArrayCompute for ChunkedArray { Some(self) } + fn compare(&self) -> Option<&dyn CompareFn> { + Some(self) + } + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -32,10 +36,6 @@ impl ArrayCompute for ChunkedArray { fn take(&self) -> Option<&dyn TakeFn> { Some(self) } - - fn compare(&self) -> Option<&dyn CompareFn> { - Some(self) - } } impl ScalarAtFn for ChunkedArray { diff --git a/vortex-array/src/array/constant/compute.rs b/vortex-array/src/array/constant/compute.rs index 2e84dc2685..e676912c21 100644 --- a/vortex-array/src/array/constant/compute.rs +++ b/vortex-array/src/array/constant/compute.rs @@ -18,11 +18,11 @@ use crate::stats::{ArrayStatistics, Stat}; use crate::{Array, ArrayDType, AsArray, IntoArray, IntoCanonical}; impl ArrayCompute for ConstantArray { - fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { + fn compare(&self) -> Option<&dyn CompareFn> { Some(self) } - fn slice(&self) -> Option<&dyn SliceFn> { + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } @@ -30,11 +30,11 @@ impl ArrayCompute for ConstantArray { Some(self) } - fn take(&self) -> Option<&dyn TakeFn> { + fn slice(&self) -> Option<&dyn SliceFn> { Some(self) } - fn compare(&self) -> Option<&dyn CompareFn> { + fn take(&self) -> Option<&dyn TakeFn> { Some(self) } diff --git a/vortex-array/src/array/null/compute.rs b/vortex-array/src/array/null/compute.rs index 9cf5588b1e..ccc74193b6 100644 --- a/vortex-array/src/array/null/compute.rs +++ b/vortex-array/src/array/null/compute.rs @@ -29,7 +29,7 @@ impl SliceFn for NullArray { impl ScalarAtFn for NullArray { fn scalar_at(&self, index: usize) -> VortexResult { - Ok(::scalar_at_unchecked(self, index)) + Ok(self.scalar_at_unchecked(index)) } fn scalar_at_unchecked(&self, _index: usize) -> Scalar { diff --git a/vortex-array/src/array/primitive/stats.rs b/vortex-array/src/array/primitive/stats.rs index 9971bc0a7b..e94ba7c9bf 100644 --- a/vortex-array/src/array/primitive/stats.rs +++ b/vortex-array/src/array/primitive/stats.rs @@ -4,8 +4,8 @@ use std::mem::size_of; use arrow_buffer::buffer::BooleanBuffer; use num_traits::PrimInt; use vortex_dtype::half::f16; -use vortex_dtype::{match_each_native_ptype, NativePType}; -use vortex_error::{vortex_err, VortexResult}; +use vortex_dtype::{match_each_native_ptype, DType, NativePType, Nullability}; +use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; @@ -40,7 +40,7 @@ impl ArrayStatisticsCompute for &[T] { } let mut stats = StatsAccumulator::new(self[0]); self.iter().skip(1).for_each(|next| stats.next(*next)); - Ok(stats.into_map()) + Ok(stats.finish()) } } @@ -59,20 +59,24 @@ impl<'a, T: PStatsType> ArrayStatisticsCompute for NullableValues<'a, T> { .enumerate() .skip_while(|(_, valid)| !*valid) .map(|(idx, _)| idx) - .next() - .ok_or_else(|| vortex_err!("Must be at least one non-null value"))?; - - let mut stats = StatsAccumulator::new_with_leading_nulls( - values[first_non_null_idx], - first_non_null_idx, - ); - values - .iter() - .zip(self.1.iter()) - .skip(first_non_null_idx + 1) - .map(|(next, valid)| valid.then_some(*next)) - .for_each(|next| stats.nullable_next(next)); - Ok(stats.into_map()) + .next(); + + if let Some(first_non_null) = first_non_null_idx { + let mut acc = StatsAccumulator::new(values[first_non_null]); + acc.n_nulls(first_non_null); + self.0 + .iter() + .zip(self.1.iter()) + .skip(first_non_null + 1) + .map(|(next, valid)| valid.then_some(*next)) + .for_each(|next| acc.nullable_next(next)); + Ok(acc.finish()) + } else { + Ok(StatsSet::nulls( + self.0.len(), + &DType::Primitive(T::PTYPE, Nullability::Nullable), + )) + } } } @@ -157,13 +161,11 @@ impl StatsAccumulator { stats } - fn new_with_leading_nulls(first_value: T, leading_null_count: usize) -> Self { - let mut stats = Self::new(first_value); - stats.null_count += leading_null_count; - stats.bit_widths[0] += leading_null_count; - stats.trailing_zeros[T::PTYPE.bit_width()] += leading_null_count; - stats.len += leading_null_count; - stats + fn n_nulls(&mut self, n_nulls: usize) { + self.null_count += n_nulls; + self.bit_widths[0] += n_nulls; + self.trailing_zeros[T::PTYPE.bit_width()] += n_nulls; + self.len += n_nulls; } pub fn nullable_next(&mut self, next: Option) { @@ -203,7 +205,7 @@ impl StatsAccumulator { self.prev = next; } - pub fn into_map(self) -> StatsSet { + pub fn finish(self) -> StatsSet { let is_constant = (self.min == self.max && self.null_count == 0 && self.nan_count == 0) || self.null_count == self.len || self.nan_count == self.len; diff --git a/vortex-array/src/array/varbin/stats.rs b/vortex-array/src/array/varbin/stats.rs index 157f1990ff..9cc892cf96 100644 --- a/vortex-array/src/array/varbin/stats.rs +++ b/vortex-array/src/array/varbin/stats.rs @@ -33,8 +33,8 @@ pub fn compute_stats(iter: &mut dyn Iterator>, dtype: &DTyp if let Some(first_non_null) = first_value { let mut acc = VarBinAccumulator::new(first_non_null); - iter.for_each(|n| acc.nullable_next(n)); acc.n_nulls(leading_nulls); + iter.for_each(|n| acc.nullable_next(n)); acc.finish(dtype) } else { StatsSet::nulls(leading_nulls, dtype) @@ -44,12 +44,12 @@ pub fn compute_stats(iter: &mut dyn Iterator>, dtype: &DTyp pub struct VarBinAccumulator<'a> { min: &'a [u8], max: &'a [u8], - is_constant: bool, is_sorted: bool, is_strict_sorted: bool, last_value: &'a [u8], null_count: usize, runs: usize, + len: usize, } impl<'a> VarBinAccumulator<'a> { @@ -57,27 +57,33 @@ impl<'a> VarBinAccumulator<'a> { Self { min: value, max: value, - is_constant: true, is_sorted: true, is_strict_sorted: true, last_value: value, runs: 1, null_count: 0, + len: 1, } } pub fn nullable_next(&mut self, val: Option<&'a [u8]>) { match val { - None => self.null_count += 1, + None => { + self.null_count += 1; + self.len += 1; + } Some(v) => self.next(v), } } pub fn n_nulls(&mut self, null_count: usize) { + self.len += null_count; self.null_count += null_count; } pub fn next(&mut self, val: &'a [u8]) { + self.len += 1; + if val < self.min { self.min.clone_from(&val); } else if val > self.max { @@ -95,19 +101,21 @@ impl<'a> VarBinAccumulator<'a> { } Ordering::Greater => {} } - self.is_constant = false; self.last_value = val; self.runs += 1; } pub fn finish(&self, dtype: &DType) -> StatsSet { + let is_constant = + (self.min == self.max && self.null_count == 0) || self.null_count == self.len; + StatsSet::from(HashMap::from([ (Stat::Min, varbin_scalar(Buffer::from(self.min), dtype)), (Stat::Max, varbin_scalar(Buffer::from(self.max), dtype)), (Stat::RunCount, self.runs.into()), (Stat::IsSorted, self.is_sorted.into()), (Stat::IsStrictSorted, self.is_strict_sorted.into()), - (Stat::IsConstant, self.is_constant.into()), + (Stat::IsConstant, is_constant.into()), (Stat::NullCount, self.null_count.into()), ])) } diff --git a/vortex-array/src/array/varbinview/accessor.rs b/vortex-array/src/array/varbinview/accessor.rs index 9b1477f001..a7e0792882 100644 --- a/vortex-array/src/array/varbinview/accessor.rs +++ b/vortex-array/src/array/varbinview/accessor.rs @@ -21,7 +21,7 @@ impl ArrayAccessor<[u8]> for VarBinViewArray { None => { let mut iter = views.iter().map(|view| { if view.is_inlined() { - Some(unsafe { &view.inlined.data as &[u8] }) + Some(unsafe { &view.inlined.data[..view.size()] }) } else { let offset = unsafe { view._ref.offset as usize }; let buffer_idx = unsafe { view._ref.buffer_index as usize }; @@ -37,7 +37,7 @@ impl ArrayAccessor<[u8]> for VarBinViewArray { let mut iter = views.iter().zip(validity.iter()).map(|(view, valid)| { if valid { if view.is_inlined() { - Some(unsafe { &view.inlined.data as &[u8] }) + Some(unsafe { &view.inlined.data[..view.size()] }) } else { let offset = unsafe { view._ref.offset as usize }; let buffer_idx = unsafe { view._ref.buffer_index as usize }; diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index aa15466fcb..d8a81b4f84 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -238,7 +238,7 @@ impl VarBinViewArray { pub fn bytes_at(&self, index: usize) -> VortexResult> { let view = self.view_at(index); unsafe { - if view.inlined.size > 12 { + if !view.is_inlined() { let data_buf = slice( &self.bytes(view._ref.buffer_index as usize), view._ref.offset as usize, @@ -247,7 +247,7 @@ impl VarBinViewArray { .into_primitive()?; Ok(data_buf.maybe_null_slice::().to_vec()) } else { - Ok(view.inlined.data[..view.inlined.size as usize].to_vec()) + Ok(view.inlined.data[..view.size()].to_vec()) } } } diff --git a/vortex-array/src/compress.rs b/vortex-array/src/compress.rs index 51c695caed..d089774166 100644 --- a/vortex-array/src/compress.rs +++ b/vortex-array/src/compress.rs @@ -38,9 +38,10 @@ pub fn check_dtype_unchanged(arr: &Array, compressed: &Array) { use crate::ArrayDType; debug_assert!( arr.dtype() == compressed.dtype(), - "Compression changed dtype: {:?} -> {:?} for {}", + "Compression changed dtype: {} -> {}\nFrom array: {}Into array {}", arr.dtype(), compressed.dtype(), + arr.tree_display(), compressed.tree_display(), ); }