feat: BitPackedCompressor allows signed arrays (#1699)

Most of the work to support signed integers has been done in BitPackedArray. This PR removes some assertions and branches in the compressor to make it possible to bit-pack an array of signed ints. --------- Co-authored-by: Will Manning <[email protected]>
spiraldb · Dec 17, 2024 · 8e0e25c · 8e0e25c
1 parent 3555d87
commit 8e0e25c
Show file tree

Hide file tree

Showing 8 changed files with 58 additions and 19 deletions.
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the
 
    >>> cvtx = vortex.compress(vtx)
    >>> cvtx.nbytes
-   16756
+   16539
    >>> cvtx.nbytes / vtx.nbytes
-   0.118...
+   0.117...
 
 Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
 cache and RAM.

diff --git a/encodings/fastlanes/src/bitpacking/compress.rs b/encodings/fastlanes/src/bitpacking/compress.rs
@@ -70,7 +70,6 @@ pub unsafe fn bitpack_encode_unchecked(
 ///
 /// On success, returns a [Buffer] containing the packed data.
 pub fn bitpack(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
-    // We know the min is > 0, so it's safe to re-interpret signed integers as unsigned.
     let parray = parray.reinterpret_cast(parray.ptype().to_unsigned());
     let packed = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| {
         bitpack_primitive(parray.maybe_null_slice::<$P>(), bit_width)
@@ -359,7 +358,7 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize {
 #[cfg(test)]
 #[allow(clippy::cast_possible_truncation)]
 mod test {
-    use vortex_array::{IntoArrayVariant, ToArrayData};
+    use vortex_array::{IntoArrayVariant, IntoCanonical, ToArrayData};
 
     use super::*;
 
@@ -431,12 +430,25 @@ mod test {
     }
 
     #[test]
-    #[should_panic(expected = "expected type: uint but instead got i64")]
-    fn gh_issue_929() {
+    fn compress_signed_roundtrip() {
         let values: Vec<i64> = (-500..500).collect();
-        let array = PrimitiveArray::from_vec(values, Validity::AllValid);
+        let array = PrimitiveArray::from_vec(values.clone(), Validity::AllValid);
         assert!(array.ptype().is_signed_int());
 
-        BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
+        let bitpacked_array =
+            BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
+        let num_patches = bitpacked_array
+            .patches()
+            .as_ref()
+            .map(Patches::num_patches)
+            .unwrap_or_default();
+        assert_eq!(num_patches, 500);
+
+        let unpacked = bitpacked_array
+            .into_canonical()
+            .unwrap()
+            .into_primitive()
+            .unwrap();
+        assert_eq!(unpacked.into_maybe_null_slice::<i64>(), values);
     }
 }
diff --git a/encodings/fastlanes/src/bitpacking/compute/filter.rs b/encodings/fastlanes/src/bitpacking/compute/filter.rs
@@ -13,13 +13,21 @@ use crate::{BitPackedArray, BitPackedEncoding};
 
 impl FilterFn<BitPackedArray> for BitPackedEncoding {
     fn filter(&self, array: &BitPackedArray, mask: FilterMask) -> VortexResult<ArrayData> {
-        let primitive = match_each_unsigned_integer_ptype!(array.ptype(), |$I| {
+        let primitive = match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$I| {
             filter_primitive::<$I>(array, mask)
         });
         Ok(primitive?.into_array())
     }
 }
 
+/// Specialized filter kernel for primitive bit-packed arrays.
+///
+/// Because the FastLanes bit-packing kernels are only implemented for unsigned types, the provided
+/// `T` should be promoted to the unsigned variant for any target bit width.
+/// For example, if the array is bit-packed `i16`, this function called be called with `T = u16`.
+///
+/// All bit-packing operations will use the unsigned kernels, but the logical type of `array`
+/// dictates the final `PType` of the result.
 fn filter_primitive<T: NativePType + BitPacking + ArrowNativeType>(
     array: &BitPackedArray,
     mask: FilterMask,
@@ -49,7 +57,7 @@ fn filter_primitive<T: NativePType + BitPacking + ArrowNativeType>(
         FilterIter::SlicesIter(iter) => filter_slices(array, mask.true_count(), iter),
     };
 
-    let mut values = PrimitiveArray::from_vec(values, validity);
+    let mut values = PrimitiveArray::from_vec(values, validity).reinterpret_cast(array.ptype());
     if let Some(patches) = patches {
         values = values.patch(patches)?;
     }
@@ -120,6 +128,7 @@ fn filter_slices<T: NativePType + BitPacking + ArrowNativeType>(
 
 #[cfg(test)]
 mod test {
+    use itertools::Itertools;
     use vortex_array::array::PrimitiveArray;
     use vortex_array::compute::{filter, slice, FilterMask};
     use vortex_array::{ArrayLen, IntoArrayVariant};
@@ -166,4 +175,24 @@ mod test {
             (0..1024).map(|i| (i % 63) as u8).collect::<Vec<_>>()
         );
     }
+
+    #[test]
+    fn filter_bitpacked_signed() {
+        // Elements 0..=499 are negative integers (patches)
+        // Element 500 = 0 (packed)
+        // Elements 501..999 are positive integers (packed)
+        let values: Vec<i64> = (-500..500).collect_vec();
+        let unpacked = PrimitiveArray::from(values.clone());
+        let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap();
+        let filtered = filter(
+            bitpacked.as_ref(),
+            FilterMask::from_indices(values.len(), 250..750),
+        )
+        .unwrap()
+        .into_primitive()
+        .unwrap()
+        .into_maybe_null_slice::<i64>();
+
+        assert_eq!(filtered.as_slice(), &values[250..750]);
+    }
 }
diff --git a/encodings/fastlanes/src/bitpacking/mod.rs b/encodings/fastlanes/src/bitpacking/mod.rs
@@ -63,9 +63,8 @@ impl BitPackedArray {
         offset: u16,
     ) -> VortexResult<Self> {
         let dtype = DType::Primitive(ptype, validity.nullability());
-
-        if !dtype.is_unsigned_int() {
-            vortex_bail!(MismatchedTypes: "uint", &dtype);
+        if !dtype.is_int() {
+            vortex_bail!(MismatchedTypes: "integer", dtype);
         }
 
         if bit_width > u64::BITS as u8 {

diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs
@@ -24,7 +24,7 @@ use crate::array::PyArray;
 ///
 /// >>> a = vortex.array(list(range(1000)))
 /// >>> str(vortex.compress(a))
-/// 'fastlanes.for(0x17)(i64, len=1000)'
+/// 'fastlanes.bitpacked(0x15)(i64, len=1000)'
 ///
 /// Compress an array of increasing floating-point numbers and a few nulls:
 ///

diff --git a/vortex-sampling-compressor/src/compressors/bitpacked.rs b/vortex-sampling-compressor/src/compressors/bitpacked.rs
@@ -57,8 +57,7 @@ impl EncodingCompressor for BitPackedCompressor {
         // Only support primitive arrays
         let parray = PrimitiveArray::maybe_from(array)?;
 
-        // Only supports unsigned ints
-        if !parray.ptype().is_unsigned_int() {
+        if !parray.ptype().is_int() {
             return None;
         }
 

diff --git a/vortex-sampling-compressor/src/compressors/for.rs b/vortex-sampling-compressor/src/compressors/for.rs
@@ -42,7 +42,7 @@ impl EncodingCompressor for FoRCompressor {
         let shift = trailing_zeros(array);
         match_each_integer_ptype!(parray.ptype(), |$P| {
             let min: $P = parray.statistics().compute_min()?;
-            if min == 0 && shift == 0 && parray.ptype().is_unsigned_int() {
+            if min == 0 && shift == 0 {
                 return None;
             }
         });

diff --git a/vortex-sampling-compressor/tests/smoketest.rs b/vortex-sampling-compressor/tests/smoketest.rs
@@ -18,7 +18,7 @@ mod tests {
     use vortex_datetime_dtype::TimeUnit;
     use vortex_datetime_parts::DateTimePartsEncoding;
     use vortex_dict::DictEncoding;
-    use vortex_fastlanes::FoREncoding;
+    use vortex_fastlanes::BitPackedEncoding;
     use vortex_fsst::FSSTEncoding;
     use vortex_sampling_compressor::ALL_COMPRESSORS;
     use vortex_scalar::Scalar;
@@ -122,7 +122,7 @@ mod tests {
             .unwrap();
         println!("prim_col num chunks: {}", prim_col.nchunks());
         for chunk in prim_col.chunks() {
-            assert_eq!(chunk.encoding().id(), FoREncoding::ID);
+            assert_eq!(chunk.encoding().id(), BitPackedEncoding::ID);
             assert_eq!(
                 chunk.statistics().get(Stat::UncompressedSizeInBytes),
                 Some(Scalar::from((chunk.len() * 8) as u64 + 1))