Skip to content

Commit

Permalink
feat: BitPackedCompressor allows signed arrays (#1699)
Browse files Browse the repository at this point in the history
Most of the work to support signed integers has been done in
BitPackedArray.

This PR removes some assertions and branches in the compressor to make
it possible to bit-pack an array of signed ints.

---------

Co-authored-by: Will Manning <[email protected]>
  • Loading branch information
a10y and lwwmanning authored Dec 17, 2024
1 parent 3555d87 commit 8e0e25c
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 19 deletions.
4 changes: 2 additions & 2 deletions docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the

>>> cvtx = vortex.compress(vtx)
>>> cvtx.nbytes
16756
16539
>>> cvtx.nbytes / vtx.nbytes
0.118...
0.117...

Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
cache and RAM.
Expand Down
24 changes: 18 additions & 6 deletions encodings/fastlanes/src/bitpacking/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ pub unsafe fn bitpack_encode_unchecked(
///
/// On success, returns a [Buffer] containing the packed data.
pub fn bitpack(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
// We know the min is > 0, so it's safe to re-interpret signed integers as unsigned.
let parray = parray.reinterpret_cast(parray.ptype().to_unsigned());
let packed = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| {
bitpack_primitive(parray.maybe_null_slice::<$P>(), bit_width)
Expand Down Expand Up @@ -359,7 +358,7 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize {
#[cfg(test)]
#[allow(clippy::cast_possible_truncation)]
mod test {
use vortex_array::{IntoArrayVariant, ToArrayData};
use vortex_array::{IntoArrayVariant, IntoCanonical, ToArrayData};

use super::*;

Expand Down Expand Up @@ -431,12 +430,25 @@ mod test {
}

#[test]
#[should_panic(expected = "expected type: uint but instead got i64")]
fn gh_issue_929() {
fn compress_signed_roundtrip() {
let values: Vec<i64> = (-500..500).collect();
let array = PrimitiveArray::from_vec(values, Validity::AllValid);
let array = PrimitiveArray::from_vec(values.clone(), Validity::AllValid);
assert!(array.ptype().is_signed_int());

BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
let bitpacked_array =
BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
let num_patches = bitpacked_array
.patches()
.as_ref()
.map(Patches::num_patches)
.unwrap_or_default();
assert_eq!(num_patches, 500);

let unpacked = bitpacked_array
.into_canonical()
.unwrap()
.into_primitive()
.unwrap();
assert_eq!(unpacked.into_maybe_null_slice::<i64>(), values);
}
}
33 changes: 31 additions & 2 deletions encodings/fastlanes/src/bitpacking/compute/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,21 @@ use crate::{BitPackedArray, BitPackedEncoding};

impl FilterFn<BitPackedArray> for BitPackedEncoding {
fn filter(&self, array: &BitPackedArray, mask: FilterMask) -> VortexResult<ArrayData> {
let primitive = match_each_unsigned_integer_ptype!(array.ptype(), |$I| {
let primitive = match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$I| {
filter_primitive::<$I>(array, mask)
});
Ok(primitive?.into_array())
}
}

/// Specialized filter kernel for primitive bit-packed arrays.
///
/// Because the FastLanes bit-packing kernels are only implemented for unsigned types, the provided
/// `T` should be promoted to the unsigned variant for any target bit width.
/// For example, if the array is bit-packed `i16`, this function should be called with `T = u16`.
///
/// All bit-packing operations will use the unsigned kernels, but the logical type of `array`
/// dictates the final `PType` of the result.
fn filter_primitive<T: NativePType + BitPacking + ArrowNativeType>(
array: &BitPackedArray,
mask: FilterMask,
Expand Down Expand Up @@ -49,7 +57,7 @@ fn filter_primitive<T: NativePType + BitPacking + ArrowNativeType>(
FilterIter::SlicesIter(iter) => filter_slices(array, mask.true_count(), iter),
};

let mut values = PrimitiveArray::from_vec(values, validity);
let mut values = PrimitiveArray::from_vec(values, validity).reinterpret_cast(array.ptype());
if let Some(patches) = patches {
values = values.patch(patches)?;
}
Expand Down Expand Up @@ -120,6 +128,7 @@ fn filter_slices<T: NativePType + BitPacking + ArrowNativeType>(

#[cfg(test)]
mod test {
use itertools::Itertools;
use vortex_array::array::PrimitiveArray;
use vortex_array::compute::{filter, slice, FilterMask};
use vortex_array::{ArrayLen, IntoArrayVariant};
Expand Down Expand Up @@ -166,4 +175,24 @@ mod test {
(0..1024).map(|i| (i % 63) as u8).collect::<Vec<_>>()
);
}

// Filtering a bit-packed signed array must yield the same values as slicing the original input.
#[test]
fn filter_bitpacked_signed() {
// `values` holds the 1000 integers -500..=499, so by index:
// Elements 0..=499 are negative integers (patches)
// Element 500 = 0 (packed)
// Elements 501..=999 are positive integers (packed)
let values: Vec<i64> = (-500..500).collect_vec();
let unpacked = PrimitiveArray::from(values.clone());
let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap();
// Keep indices 250..750, a window that covers both negative and non-negative elements.
let filtered = filter(
bitpacked.as_ref(),
FilterMask::from_indices(values.len(), 250..750),
)
.unwrap()
.into_primitive()
.unwrap()
.into_maybe_null_slice::<i64>();

// The result must still be typed as `i64` and match the corresponding slice of the input.
assert_eq!(filtered.as_slice(), &values[250..750]);
}
}
5 changes: 2 additions & 3 deletions encodings/fastlanes/src/bitpacking/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,8 @@ impl BitPackedArray {
offset: u16,
) -> VortexResult<Self> {
let dtype = DType::Primitive(ptype, validity.nullability());

if !dtype.is_unsigned_int() {
vortex_bail!(MismatchedTypes: "uint", &dtype);
if !dtype.is_int() {
vortex_bail!(MismatchedTypes: "integer", dtype);
}

if bit_width > u64::BITS as u8 {
Expand Down
2 changes: 1 addition & 1 deletion pyvortex/src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::array::PyArray;
///
/// >>> a = vortex.array(list(range(1000)))
/// >>> str(vortex.compress(a))
/// 'fastlanes.for(0x17)(i64, len=1000)'
/// 'fastlanes.bitpacked(0x15)(i64, len=1000)'
///
/// Compress an array of increasing floating-point numbers and a few nulls:
///
Expand Down
3 changes: 1 addition & 2 deletions vortex-sampling-compressor/src/compressors/bitpacked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,7 @@ impl EncodingCompressor for BitPackedCompressor {
// Only support primitive arrays
let parray = PrimitiveArray::maybe_from(array)?;

// Only supports unsigned ints
if !parray.ptype().is_unsigned_int() {
if !parray.ptype().is_int() {
return None;
}

Expand Down
2 changes: 1 addition & 1 deletion vortex-sampling-compressor/src/compressors/for.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ impl EncodingCompressor for FoRCompressor {
let shift = trailing_zeros(array);
match_each_integer_ptype!(parray.ptype(), |$P| {
let min: $P = parray.statistics().compute_min()?;
if min == 0 && shift == 0 && parray.ptype().is_unsigned_int() {
if min == 0 && shift == 0 {
return None;
}
});
Expand Down
4 changes: 2 additions & 2 deletions vortex-sampling-compressor/tests/smoketest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ mod tests {
use vortex_datetime_dtype::TimeUnit;
use vortex_datetime_parts::DateTimePartsEncoding;
use vortex_dict::DictEncoding;
use vortex_fastlanes::FoREncoding;
use vortex_fastlanes::BitPackedEncoding;
use vortex_fsst::FSSTEncoding;
use vortex_sampling_compressor::ALL_COMPRESSORS;
use vortex_scalar::Scalar;
Expand Down Expand Up @@ -122,7 +122,7 @@ mod tests {
.unwrap();
println!("prim_col num chunks: {}", prim_col.nchunks());
for chunk in prim_col.chunks() {
assert_eq!(chunk.encoding().id(), FoREncoding::ID);
assert_eq!(chunk.encoding().id(), BitPackedEncoding::ID);
assert_eq!(
chunk.statistics().get(Stat::UncompressedSizeInBytes),
Some(Scalar::from((chunk.len() * 8) as u64 + 1))
Expand Down

0 comments on commit 8e0e25c

Please sign in to comment.