diff --git a/encodings/datetime-parts/src/compress.rs b/encodings/datetime-parts/src/compress.rs
index 541754b0d6..d06b4c7b7e 100644
--- a/encodings/datetime-parts/src/compress.rs
+++ b/encodings/datetime-parts/src/compress.rs
@@ -10,7 +10,11 @@ use vortex_error::{vortex_bail, VortexResult};
 /// Splitting the components by granularity creates more small values, which enables better
 /// cascading compression.
 pub fn compress_temporal(array: TemporalArray) -> VortexResult<(Array, Array, Array)> {
-    let timestamps = try_cast(&array.temporal_values(), PType::I64.into())?.into_primitive()?;
+    // After this operation, timestamps will be PrimitiveArray
+    let timestamps = try_cast(
+        &array.temporal_values().into_primitive()?.into_array(),
+        PType::I64.into(),
+    )?;
     let divisor = match array.temporal_metadata().time_unit() {
         TimeUnit::Ns => 1_000_000_000,
         TimeUnit::Us => 1_000_000,
@@ -24,14 +28,14 @@ pub fn compress_temporal(array: TemporalArray) -> VortexResult<(Array, Array, Ar
     let mut seconds = Vec::with_capacity(length);
     let mut subsecond = Vec::with_capacity(length);
 
-    for &t in timestamps.maybe_null_slice::<i64>().iter() {
+    for &t in timestamps.as_primitive().maybe_null_slice::<i64>().iter() {
         days.push(t / (86_400 * divisor));
         seconds.push((t % (86_400 * divisor)) / divisor);
         subsecond.push((t % (86_400 * divisor)) % divisor);
     }
 
     Ok((
-        PrimitiveArray::from_vec(days, timestamps.validity()).into_array(),
+        PrimitiveArray::from_vec(days, timestamps.as_primitive().validity()).into_array(),
         PrimitiveArray::from(seconds).into_array(),
         PrimitiveArray::from(subsecond).into_array(),
     ))
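The hunk above splits each `i64` timestamp into days, seconds within the day, and a subsecond remainder, with the scale set by the `TimeUnit` divisor. Here is a minimal, self-contained sketch of that arithmetic; `split_timestamp` and the `main` harness are illustrative stand-ins, not part of the Vortex API:

```rust
// A sketch of the day/second/subsecond decomposition performed by
// `compress_temporal` above, outside the Vortex array types.
fn split_timestamp(t: i64, divisor: i64) -> (i64, i64, i64) {
    const SECONDS_PER_DAY: i64 = 86_400;
    let days = t / (SECONDS_PER_DAY * divisor);
    let seconds = (t % (SECONDS_PER_DAY * divisor)) / divisor;
    let subsecond = (t % (SECONDS_PER_DAY * divisor)) % divisor;
    (days, seconds, subsecond)
}

fn main() {
    // 2021-01-01T00:00:01.5Z in microseconds since the epoch
    // (TimeUnit::Us => divisor 1_000_000).
    let t: i64 = 1_609_459_201_500_000;
    let (days, seconds, subsecond) = split_timestamp(t, 1_000_000);
    assert_eq!((days, seconds, subsecond), (18_628, 1, 500_000));
    // The three parts reconstruct the original timestamp exactly.
    assert_eq!((days * 86_400 + seconds) * 1_000_000 + subsecond, t);
}
```

Note that with Rust's truncating `/` and `%`, pre-epoch (negative) timestamps yield negative components; the decomposition still round-trips exactly, which is what matters for lossless compression.
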
diff --git a/vortex-array/src/array/chunked/canonical.rs b/vortex-array/src/array/chunked/canonical.rs
index 778a33f61e..2532077e7d 100644
--- a/vortex-array/src/array/chunked/canonical.rs
+++ b/vortex-array/src/array/chunked/canonical.rs
@@ -28,6 +28,10 @@ pub(crate) fn try_canonicalize_chunks(
     chunks: Vec<Array>,
     dtype: &DType,
 ) -> VortexResult<Canonical> {
+    if chunks.is_empty() {
+        vortex_bail!(InvalidArgument: "chunks must be non-empty")
+    }
+
     let mismatched = chunks
         .iter()
         .filter(|chunk| !chunk.dtype().eq(dtype))
@@ -44,15 +48,47 @@ pub(crate) fn try_canonicalize_chunks(
             Ok(Canonical::Struct(struct_array))
         }
 
-        // Extension arrays wrap an internal storage array, which can hold a ChunkedArray until
-        // it is safe to unpack them.
+        // Extension arrays are containers that wrap an inner storage array with some metadata.
+        // We delegate to the canonical format of the internal storage array for every chunk, and
+        // push the chunking down into the inner storage array.
+        //
+        // Input:
+        // ------
+        //
+        //                     ChunkedArray
+        //                    /            \
+        //                   /              \
+        //         ExtensionArray     ExtensionArray
+        //               |                  |
+        //            storage            storage
+        //
+        //
+        // Output:
+        // ------
+        //
+        //         ExtensionArray
+        //               |
+        //          ChunkedArray
+        //          /          \
+        //      storage      storage
+        //
         DType::Extension(ext_dtype, _) => {
-            let ext_array = ExtensionArray::new(
+            // Recursively apply canonicalization and packing to the storage array backing
+            // each chunk of the extension array.
+            let storage_chunks: Vec<Array> = chunks
+                .iter()
+                // Extension-typed arrays can be compressed into something that is not an
+                // ExtensionArray, so we should canonicalize each chunk into ExtensionArray first.
+                .map(|chunk| chunk.clone().into_extension().unwrap().storage())
+                .collect();
+            let storage_dtype = storage_chunks.first().unwrap().dtype().clone();
+            let chunked_storage =
+                ChunkedArray::try_new(storage_chunks, storage_dtype)?.into_array();
+
+            Ok(Canonical::Extension(ExtensionArray::new(
                 ext_dtype.clone(),
-                ChunkedArray::try_new(chunks, dtype.clone())?.into_array(),
-            );
-
-            Ok(Canonical::Extension(ext_array))
+                chunked_storage,
+            )))
         }
 
         // TODO(aduffy): better list support
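To make the shape change in the diagram concrete, here is a toy model of the rewrite. The `Node` enum and `push_chunking_into_extension` are invented for illustration; they are not the Vortex array types or API:

```rust
// A toy model of the canonicalization rewrite above: Chunked-of-Extension
// becomes Extension-of-Chunked, with chunking pushed into the storage.
#[derive(Debug, PartialEq)]
enum Node {
    Storage(Vec<i64>),
    Extension(Box<Node>),
    Chunked(Vec<Node>),
}

// Rewrites Chunked[Extension(s1), Extension(s2), ...] into
// Extension(Chunked[s1, s2, ...]), mirroring the patched match arm.
fn push_chunking_into_extension(chunks: Vec<Node>) -> Option<Node> {
    if chunks.is_empty() {
        return None; // mirrors the new "chunks must be non-empty" bail
    }
    let storage_chunks = chunks
        .into_iter()
        .map(|chunk| match chunk {
            // Unwrap each chunk down to its storage array.
            Node::Extension(storage) => Some(*storage),
            _ => None,
        })
        .collect::<Option<Vec<Node>>>()?;
    Some(Node::Extension(Box::new(Node::Chunked(storage_chunks))))
}

fn main() {
    let input = vec![
        Node::Extension(Box::new(Node::Storage(vec![1, 2]))),
        Node::Extension(Box::new(Node::Storage(vec![3]))),
    ];
    let expected = Node::Extension(Box::new(Node::Chunked(vec![
        Node::Storage(vec![1, 2]),
        Node::Storage(vec![3]),
    ])));
    assert_eq!(push_chunking_into_extension(input), Some(expected));
}
```

In the real patch, the non-empty guard added at the top of `try_canonicalize_chunks` is what makes `storage_chunks.first().unwrap()` safe: with at least one chunk guaranteed, there is always a first storage chunk from which to borrow the storage dtype.
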