Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: BitPackedArray enforces can only be built over non-negative values #1705

Merged
merged 7 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 52 additions & 38 deletions encodings/fastlanes/src/bitpacking/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,41 @@ pub fn bitpack_encode(array: PrimitiveArray, bit_width: u8) -> VortexResult<BitP
.statistics()
.compute_bit_width_freq()
.ok_or_else(|| vortex_err!(ComputeError: "missing bit width frequency"))?;

// Check array contains no negative values.
if array.ptype().is_signed_int() {
let has_negative_values = match_each_integer_ptype!(array.ptype(), |$P| {
array.statistics().compute_min::<Option<$P>>().unwrap_or_default().unwrap_or_default() < 0
});
if has_negative_values {
vortex_bail!("cannot bitpack_encode array containing negative integers")
}
}

let num_exceptions = count_exceptions(bit_width, &bit_width_freq);

if bit_width >= array.ptype().bit_width() as u8 {
// Nothing we can do
vortex_bail!("Cannot pack -- specified bit width is greater than or equal to raw bit width")
}

let packed = bitpack(&array, bit_width)?;
// SAFETY: we check that array only contains non-negative values.
let packed = unsafe { bitpack_unchecked(&array, bit_width)? };
let patches = (num_exceptions > 0)
.then(|| gather_patches(&array, bit_width, num_exceptions))
.flatten();

BitPackedArray::try_new(
packed,
array.ptype(),
array.validity(),
patches,
bit_width,
array.len(),
)
// SAFETY: values already checked to be non-negative.
unsafe {
BitPackedArray::new_unchecked(
packed,
array.ptype(),
array.validity(),
patches,
bit_width,
array.len(),
)
}
}

/// Bitpack an array into the specified bit-width without checking statistics.
Expand All @@ -54,22 +69,33 @@ pub unsafe fn bitpack_encode_unchecked(
array: PrimitiveArray,
bit_width: u8,
a10y marked this conversation as resolved.
Show resolved Hide resolved
) -> VortexResult<BitPackedArray> {
let packed = bitpack(&array, bit_width)?;

BitPackedArray::try_new(
packed,
array.ptype(),
array.validity(),
None,
bit_width,
array.len(),
)
// SAFETY: non-negativity of input checked by caller.
unsafe {
let packed = bitpack_unchecked(&array, bit_width)?;

BitPackedArray::new_unchecked(
packed,
array.ptype(),
array.validity(),
None,
bit_width,
array.len(),
)
}
}

/// Bitpack a [PrimitiveArray] to the given width.
///
/// On success, returns a [Buffer] containing the packed data.
pub fn bitpack(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
///
/// # Safety
///
/// Internally this function will promote the provided array to its unsigned equivalent. This will
/// violate ordering guarantees if the array contains any negative values.
///
/// It is the caller's responsibility to ensure that `parray` is non-negative before calling
/// this function.
pub unsafe fn bitpack_unchecked(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
let parray = parray.reinterpret_cast(parray.ptype().to_unsigned());
let packed = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| {
bitpack_primitive(parray.maybe_null_slice::<$P>(), bit_width)
Expand Down Expand Up @@ -358,7 +384,8 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize {
#[cfg(test)]
#[allow(clippy::cast_possible_truncation)]
mod test {
use vortex_array::{IntoArrayVariant, IntoCanonical, ToArrayData};
use vortex_array::{IntoArrayVariant, ToArrayData};
use vortex_error::VortexError;

use super::*;

Expand Down Expand Up @@ -430,25 +457,12 @@ mod test {
}

#[test]
fn compress_signed_roundtrip() {
fn compress_signed_fails() {
let values: Vec<i64> = (-500..500).collect();
let array = PrimitiveArray::from_vec(values.clone(), Validity::AllValid);
let array = PrimitiveArray::from_vec(values, Validity::AllValid);
assert!(array.ptype().is_signed_int());

let bitpacked_array =
BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
let num_patches = bitpacked_array
.patches()
.as_ref()
.map(Patches::num_patches)
.unwrap_or_default();
assert_eq!(num_patches, 500);

let unpacked = bitpacked_array
.into_canonical()
.unwrap()
.into_primitive()
.unwrap();
assert_eq!(unpacked.into_maybe_null_slice::<i64>(), values);
let err = BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap_err();
assert!(matches!(err, VortexError::InvalidArgument(_, _)));
}
}
6 changes: 3 additions & 3 deletions encodings/fastlanes/src/bitpacking/compute/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,18 +181,18 @@ mod test {
// Elements 0..=499 are negative integers (patches)
// Element 500 = 0 (packed)
// Elements 501..999 are positive integers (packed)
let values: Vec<i64> = (-500..500).collect_vec();
let values: Vec<i64> = (0..500).collect_vec();
let unpacked = PrimitiveArray::from(values.clone());
let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap();
let filtered = filter(
bitpacked.as_ref(),
FilterMask::from_indices(values.len(), 250..750),
FilterMask::from_indices(values.len(), 0..500),
)
.unwrap()
.into_primitive()
.unwrap()
.into_maybe_null_slice::<i64>();

assert_eq!(filtered.as_slice(), &values[250..750]);
assert_eq!(filtered, values);
}
}
25 changes: 14 additions & 11 deletions encodings/fastlanes/src/bitpacking/compute/scalar_at.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,21 @@ mod test {

#[test]
fn invalid_patches() {
let packed_array = BitPackedArray::try_new(
Buffer::from(vec![0u8; 128]),
PType::U32,
Validity::AllInvalid,
Some(Patches::new(
// SAFETY: using unsigned PType
let packed_array = unsafe {
BitPackedArray::new_unchecked(
Buffer::from(vec![0u8; 128]),
PType::U32,
Validity::AllInvalid,
Some(Patches::new(
8,
PrimitiveArray::from_vec(vec![1u32], NonNullable).into_array(),
PrimitiveArray::from_vec(vec![999u32], Validity::AllValid).into_array(),
)),
1,
8,
PrimitiveArray::from_vec(vec![1u32], NonNullable).into_array(),
PrimitiveArray::from_vec(vec![999u32], Validity::AllValid).into_array(),
)),
1,
8,
)
)
}
.unwrap()
.into_array();
assert_eq!(
Expand Down
18 changes: 13 additions & 5 deletions encodings/fastlanes/src/bitpacking/compute/search_sorted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use vortex_array::stats::ArrayStatistics;
use vortex_array::validity::Validity;
use vortex_array::variants::PrimitiveArrayTrait;
use vortex_array::{ArrayDType, ArrayLen};
use vortex_dtype::{match_each_unsigned_integer_ptype, NativePType};
use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType};
use vortex_error::{VortexError, VortexExpect as _, VortexResult};
use vortex_scalar::Scalar;

Expand All @@ -26,7 +26,9 @@ impl SearchSortedFn<BitPackedArray> for BitPackedEncoding {
value: &Scalar,
side: SearchSortedSide,
) -> VortexResult<SearchResult> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
// NOTE: it is a precondition of BitPackedArray that all values must be >= 0, thus it is
// always safe to promote to unsigned type without loss of ordering of the values.
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
search_sorted_typed::<$P>(array, value, side)
})
}
Expand Down Expand Up @@ -59,7 +61,7 @@ impl SearchSortedUsizeFn<BitPackedArray> for BitPackedEncoding {
value: usize,
side: SearchSortedSide,
) -> VortexResult<SearchResult> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
// NOTE: conversion may truncate silently.
if let Some(pvalue) = num_traits::cast::<usize, $P>(value) {
search_sorted_native(array, pvalue, side)
Expand All @@ -77,7 +79,7 @@ impl SearchSortedUsizeFn<BitPackedArray> for BitPackedEncoding {
values: &[usize],
side: SearchSortedSide,
) -> VortexResult<Vec<SearchResult>> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
let searcher = BitPackedSearch::<'_, $P>::new(array);

values
Expand All @@ -104,7 +106,13 @@ where
+ AsPrimitive<usize>
+ AsPrimitive<u64>,
{
let native_value: T = value.cast(array.dtype())?.try_into()?;
// NOTE: we use the unsigned variant of the BitPackedArray DType so that we can use it
// in the BitPackedSearch. We need a type that impls fastlanes::BitPack, and it is a
// precondition for BitPackedArray that all values must be non-negative, so promotion
// is cheap and safe.
let native_value: T = value
.cast(&DType::from(array.ptype().to_unsigned()))?
.try_into()?;
search_sorted_native(array, native_value, side)
}

Expand Down
30 changes: 17 additions & 13 deletions encodings/fastlanes/src/bitpacking/compute/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,23 @@ impl SliceFn<BitPackedArray> for BitPackedEncoding {
let encoded_start = (block_start / 8) * array.bit_width() as usize;
let encoded_stop = (block_stop / 8) * array.bit_width() as usize;
// slice the buffer using the encoded start/stop values
BitPackedArray::try_new_from_offset(
array.packed().slice(encoded_start..encoded_stop),
array.ptype(),
array.validity().slice(start, stop)?,
array
.patches()
.map(|p| p.slice(start, stop))
.transpose()?
.flatten(),
array.bit_width(),
stop - start,
offset as u16,
)

// SAFETY: the invariants of the original BitPackedArray are preserved when slicing.
unsafe {
BitPackedArray::new_unchecked_with_offset(
array.packed().slice(encoded_start..encoded_stop),
array.ptype(),
array.validity().slice(start, stop)?,
array
.patches()
.map(|p| p.slice(start, stop))
.transpose()?
.flatten(),
array.bit_width(),
stop - start,
offset as u16,
)
}
.map(|a| a.into_array())
}
}
Expand Down
4 changes: 3 additions & 1 deletion encodings/fastlanes/src/bitpacking/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ impl TakeFn<BitPackedArray> for BitPackedEncoding {
return take(array.clone().into_canonical()?.into_primitive()?, indices);
}

let ptype: PType = array.dtype().try_into()?;
// NOTE: we use the unsigned PType because all values in the BitPackedArray must
// be non-negative (pre-condition of creating the BitPackedArray).
let ptype: PType = PType::try_from(array.dtype())?.to_unsigned();
let validity = array.validity();
let taken_validity = validity.take(indices)?;

Expand Down
45 changes: 41 additions & 4 deletions encodings/fastlanes/src/bitpacking/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,41 @@ impl BitPackedArray {
/// Create a new bitpacked array using a buffer of packed data.
///
/// The packed data should be interpreted as a sequence of values with size `bit_width`.
pub fn try_new(
///
/// # Errors
///
/// This method returns errors if any of the metadata is inconsistent, for example the packed
/// buffer provided does not have the right size according to the supplied length and target
/// PType.
///
/// # Safety
///
/// For signed arrays, it is the caller's responsibility to ensure that there are no values
/// that can be interpreted once unpacked to the provided PType.
///
/// This invariant is upheld by the compressor, but callers must ensure this if they wish to
/// construct a new `BitPackedArray` from parts.
///
/// See also the [`encode`][Self::encode] method on this type for a safe path to create a new
/// bit-packed array.
pub unsafe fn new_unchecked(
packed: Buffer,
ptype: PType,
validity: Validity,
patches: Option<Patches>,
bit_width: u8,
len: usize,
) -> VortexResult<Self> {
Self::try_new_from_offset(packed, ptype, validity, patches, bit_width, len, 0)
// SAFETY: checked by caller.
unsafe {
Self::new_unchecked_with_offset(packed, ptype, validity, patches, bit_width, len, 0)
}
}

pub(crate) fn try_new_from_offset(
/// An unsafe constructor for a `BitPackedArray` that also specifies a slicing offset.
///
/// See also [`new_unchecked`][Self::new_unchecked].
pub(crate) unsafe fn new_unchecked_with_offset(
packed: Buffer,
ptype: PType,
validity: Validity,
Expand Down Expand Up @@ -182,6 +205,17 @@ impl BitPackedArray {
})
}

/// Bit-pack an array of primitive integers down to the target bit-width using the FastLanes
/// SIMD-accelerated packing kernels.
///
/// # Errors
///
/// If the provided array is not an integer type, an error will be returned.
///
/// If the provided array contains negative values, an error will be returned.
///
/// If the requested bit-width for packing is larger than the array's native width, an
/// error will be returned.
pub fn encode(array: &ArrayData, bit_width: u8) -> VortexResult<Self> {
if let Ok(parray) = PrimitiveArray::try_from(array.clone()) {
bitpack_encode(parray, bit_width)
Expand All @@ -190,8 +224,11 @@ impl BitPackedArray {
}
}

/// Calculate the maximum value that **can** be contained by this array, given its bit-width.
///
/// Note that this value need not actually be present in the array.
#[inline]
pub fn max_packed_value(&self) -> usize {
fn max_packed_value(&self) -> usize {
(1 << self.bit_width()) - 1
}
}
Expand Down
Loading
Loading