From 8e0e25c1441a52b5f0d5110e865ffdd26645e3f7 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 17 Dec 2024 11:57:03 -0500 Subject: [PATCH] feat: BitPackedCompressor allows signed arrays (#1699) Most of the work to support signed integers has been done in BitPackedArray. This PR removes some assertions and branches in the compressor to make it possible to bit-pack an array of signed ints. --------- Co-authored-by: Will Manning --- docs/quickstart.rst | 4 +-- .../fastlanes/src/bitpacking/compress.rs | 24 ++++++++++---- .../src/bitpacking/compute/filter.rs | 33 +++++++++++++++++-- encodings/fastlanes/src/bitpacking/mod.rs | 5 ++- pyvortex/src/compress.rs | 2 +- .../src/compressors/bitpacked.rs | 3 +- .../src/compressors/for.rs | 2 +- vortex-sampling-compressor/tests/smoketest.rs | 4 +-- 8 files changed, 58 insertions(+), 19 deletions(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 2f57d4b7f3..4eb7b94f47 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the >>> cvtx = vortex.compress(vtx) >>> cvtx.nbytes - 16756 + 16539 >>> cvtx.nbytes / vtx.nbytes - 0.118... + 0.117... Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in cache and RAM. diff --git a/encodings/fastlanes/src/bitpacking/compress.rs b/encodings/fastlanes/src/bitpacking/compress.rs index ad0a81bd26..9d1228c2d6 100644 --- a/encodings/fastlanes/src/bitpacking/compress.rs +++ b/encodings/fastlanes/src/bitpacking/compress.rs @@ -70,7 +70,6 @@ pub unsafe fn bitpack_encode_unchecked( /// /// On success, returns a [Buffer] containing the packed data. pub fn bitpack(parray: &PrimitiveArray, bit_width: u8) -> VortexResult { - // We know the min is > 0, so it's safe to re-interpret signed integers as unsigned. let parray = parray.reinterpret_cast(parray.ptype().to_unsigned()); let packed = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| { bitpack_primitive(parray.maybe_null_slice::<$P>(), bit_width) @@ -359,7 +358,7 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize { #[cfg(test)] #[allow(clippy::cast_possible_truncation)] mod test { - use vortex_array::{IntoArrayVariant, ToArrayData}; + use vortex_array::{IntoArrayVariant, IntoCanonical, ToArrayData}; use super::*; @@ -431,12 +430,25 @@ mod test { } #[test] - #[should_panic(expected = "expected type: uint but instead got i64")] - fn gh_issue_929() { + fn compress_signed_roundtrip() { let values: Vec = (-500..500).collect(); - let array = PrimitiveArray::from_vec(values, Validity::AllValid); + let array = PrimitiveArray::from_vec(values.clone(), Validity::AllValid); assert!(array.ptype().is_signed_int()); - BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap(); + let bitpacked_array = + BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap(); + let num_patches = bitpacked_array + .patches() + .as_ref() + .map(Patches::num_patches) + .unwrap_or_default(); + assert_eq!(num_patches, 500); + + let unpacked = bitpacked_array + .into_canonical() + .unwrap() + .into_primitive() + .unwrap(); + assert_eq!(unpacked.into_maybe_null_slice::(), values); } } diff --git a/encodings/fastlanes/src/bitpacking/compute/filter.rs b/encodings/fastlanes/src/bitpacking/compute/filter.rs index d9984dc622..2743f16dd0 100644 --- a/encodings/fastlanes/src/bitpacking/compute/filter.rs +++ b/encodings/fastlanes/src/bitpacking/compute/filter.rs @@ -13,13 +13,21 @@ use crate::{BitPackedArray, BitPackedEncoding}; impl FilterFn for BitPackedEncoding { fn filter(&self, array: &BitPackedArray, mask: FilterMask) -> VortexResult { - let primitive = match_each_unsigned_integer_ptype!(array.ptype(), |$I| { + let primitive = match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$I| { filter_primitive::<$I>(array, mask) }); Ok(primitive?.into_array()) } } +/// Specialized filter kernel for primitive bit-packed arrays. +/// +/// Because the FastLanes bit-packing kernels are only implemented for unsigned types, the provided +/// `T` should be promoted to the unsigned variant for any target bit width. +/// For example, if the array is bit-packed `i16`, this function called be called with `T = u16`. +/// +/// All bit-packing operations will use the unsigned kernels, but the logical type of `array` +/// dictates the final `PType` of the result. fn filter_primitive( array: &BitPackedArray, mask: FilterMask, @@ -49,7 +57,7 @@ fn filter_primitive( FilterIter::SlicesIter(iter) => filter_slices(array, mask.true_count(), iter), }; - let mut values = PrimitiveArray::from_vec(values, validity); + let mut values = PrimitiveArray::from_vec(values, validity).reinterpret_cast(array.ptype()); if let Some(patches) = patches { values = values.patch(patches)?; } @@ -120,6 +128,7 @@ fn filter_slices( #[cfg(test)] mod test { + use itertools::Itertools; use vortex_array::array::PrimitiveArray; use vortex_array::compute::{filter, slice, FilterMask}; use vortex_array::{ArrayLen, IntoArrayVariant}; @@ -166,4 +175,24 @@ mod test { (0..1024).map(|i| (i % 63) as u8).collect::>() ); } + + #[test] + fn filter_bitpacked_signed() { + // Elements 0..=499 are negative integers (patches) + // Element 500 = 0 (packed) + // Elements 501..999 are positive integers (packed) + let values: Vec = (-500..500).collect_vec(); + let unpacked = PrimitiveArray::from(values.clone()); + let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap(); + let filtered = filter( + bitpacked.as_ref(), + FilterMask::from_indices(values.len(), 250..750), + ) + .unwrap() + .into_primitive() + .unwrap() + .into_maybe_null_slice::(); + + assert_eq!(filtered.as_slice(), &values[250..750]); + } } diff --git a/encodings/fastlanes/src/bitpacking/mod.rs b/encodings/fastlanes/src/bitpacking/mod.rs index 04e7bd47a2..b869ff85d7 100644 --- a/encodings/fastlanes/src/bitpacking/mod.rs +++ b/encodings/fastlanes/src/bitpacking/mod.rs @@ -63,9 +63,8 @@ impl BitPackedArray { offset: u16, ) -> VortexResult { let dtype = DType::Primitive(ptype, validity.nullability()); - - if !dtype.is_unsigned_int() { - vortex_bail!(MismatchedTypes: "uint", &dtype); + if !dtype.is_int() { + vortex_bail!(MismatchedTypes: "integer", dtype); } if bit_width > u64::BITS as u8 { diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs index 6ceb2a58e4..a485a3b0a2 100644 --- a/pyvortex/src/compress.rs +++ b/pyvortex/src/compress.rs @@ -24,7 +24,7 @@ use crate::array::PyArray; /// /// >>> a = vortex.array(list(range(1000))) /// >>> str(vortex.compress(a)) -/// 'fastlanes.for(0x17)(i64, len=1000)' +/// 'fastlanes.bitpacked(0x15)(i64, len=1000)' /// /// Compress an array of increasing floating-point numbers and a few nulls: /// diff --git a/vortex-sampling-compressor/src/compressors/bitpacked.rs b/vortex-sampling-compressor/src/compressors/bitpacked.rs index e51b0572a9..2ecedab23a 100644 --- a/vortex-sampling-compressor/src/compressors/bitpacked.rs +++ b/vortex-sampling-compressor/src/compressors/bitpacked.rs @@ -57,8 +57,7 @@ impl EncodingCompressor for BitPackedCompressor { // Only support primitive arrays let parray = PrimitiveArray::maybe_from(array)?; - // Only supports unsigned ints - if !parray.ptype().is_unsigned_int() { + if !parray.ptype().is_int() { return None; } diff --git a/vortex-sampling-compressor/src/compressors/for.rs b/vortex-sampling-compressor/src/compressors/for.rs index f296be13fc..8db741effa 100644 --- a/vortex-sampling-compressor/src/compressors/for.rs +++ b/vortex-sampling-compressor/src/compressors/for.rs @@ -42,7 +42,7 @@ impl EncodingCompressor for FoRCompressor { let shift = trailing_zeros(array); match_each_integer_ptype!(parray.ptype(), |$P| { let min: $P = parray.statistics().compute_min()?; - if min == 0 && shift == 0 && parray.ptype().is_unsigned_int() { + if min == 0 && shift == 0 { return None; } }); diff --git a/vortex-sampling-compressor/tests/smoketest.rs b/vortex-sampling-compressor/tests/smoketest.rs index c441f2b203..996ba80d57 100644 --- a/vortex-sampling-compressor/tests/smoketest.rs +++ b/vortex-sampling-compressor/tests/smoketest.rs @@ -18,7 +18,7 @@ mod tests { use vortex_datetime_dtype::TimeUnit; use vortex_datetime_parts::DateTimePartsEncoding; use vortex_dict::DictEncoding; - use vortex_fastlanes::FoREncoding; + use vortex_fastlanes::BitPackedEncoding; use vortex_fsst::FSSTEncoding; use vortex_sampling_compressor::ALL_COMPRESSORS; use vortex_scalar::Scalar; @@ -122,7 +122,7 @@ mod tests { .unwrap(); println!("prim_col num chunks: {}", prim_col.nchunks()); for chunk in prim_col.chunks() { - assert_eq!(chunk.encoding().id(), FoREncoding::ID); + assert_eq!(chunk.encoding().id(), BitPackedEncoding::ID); assert_eq!( chunk.statistics().get(Stat::UncompressedSizeInBytes), Some(Scalar::from((chunk.len() * 8) as u64 + 1))