fix: BitPackedArray enforces can only be built over non-negative values (#1705)

Following up from #1699.

In the previous PR we allowed signed arrays to be bit-packed directly.
However, we did not explicitly reject arrays with negative values. We
**need** to do this because it is critical for fast `search_sorted` over
BitPacked data with patches: only when the patches sort to the right-most
side of the array can we do an efficient binary search.
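For intuition, here is a minimal sketch (illustrative only, not code from this
commit, with hypothetical helper names) of the search strategy this enables:
with all values non-negative, every patched value exceeds every packable
value, so in a sorted array the packed region sits on the left and the patches
on the right, and each region can be binary-searched on its own.

```rust
// Illustrative model only: a sorted bit-packed array whose patches (values
// exceeding the packable maximum) sort to the right of all packed values.
// Names and layout here are simplified, not vortex APIs.
fn search_sorted_sketch(packed: &[u64], patches: &[u64], bit_width: u32, target: u64) -> usize {
    let max_packed = (1u64 << bit_width) - 1;
    if target <= max_packed {
        // The target can only live in the packed (left) region.
        packed.partition_point(|&v| v < target)
    } else {
        // The target can only live in the patch (right) region.
        packed.len() + patches.partition_point(|&v| v < target)
    }
}

fn main() {
    let packed = vec![0, 1, 3, 7]; // values representable in 3 bits
    let patches = vec![9, 12];     // values > 7 stored as patches
    assert_eq!(search_sorted_sketch(&packed, &patches, 3, 3), 2);
    assert_eq!(search_sorted_sketch(&packed, &patches, 3, 12), 5);
}
```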

I've added explicit preconditions that values are non-negative, and made
the BitPackedArray constructor unsafe so it is clear to callers that they
must check this themselves (the recommended safe way to create a BPA is
via the `BPA::encode()` method, which returns an error if any values are
negative).
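
As a rough usage sketch (not part of this diff; the `use` paths are my
assumption about the crates involved), the safe and rejected paths look like:

```rust
// Hypothetical usage sketch; import paths are assumed, not taken from the diff.
use vortex_array::array::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_fastlanes::BitPackedArray;

fn demo() {
    // Safe path: encode() validates its input and errors on negative values.
    let ok = PrimitiveArray::from_vec((0i64..500).collect::<Vec<_>>(), Validity::AllValid);
    assert!(BitPackedArray::encode(ok.as_ref(), 9).is_ok());

    // Signed arrays containing negative values are now rejected.
    let bad = PrimitiveArray::from_vec((-500i64..500).collect::<Vec<_>>(), Validity::AllValid);
    assert!(BitPackedArray::encode(bad.as_ref(), 9).is_err());
}
```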
a10y authored Dec 18, 2024
1 parent 071b126 commit 18babc2
Showing 10 changed files with 252 additions and 88 deletions.
90 changes: 52 additions & 38 deletions encodings/fastlanes/src/bitpacking/compress.rs
@@ -20,26 +20,41 @@ pub fn bitpack_encode(array: PrimitiveArray, bit_width: u8) -> VortexResult<BitP
.statistics()
.compute_bit_width_freq()
.ok_or_else(|| vortex_err!(ComputeError: "missing bit width frequency"))?;

// Check array contains no negative values.
if array.ptype().is_signed_int() {
let has_negative_values = match_each_integer_ptype!(array.ptype(), |$P| {
array.statistics().compute_min::<Option<$P>>().unwrap_or_default().unwrap_or_default() < 0
});
if has_negative_values {
vortex_bail!("cannot bitpack_encode array containing negative integers")
}
}

let num_exceptions = count_exceptions(bit_width, &bit_width_freq);

if bit_width >= array.ptype().bit_width() as u8 {
// Nothing we can do
vortex_bail!("Cannot pack -- specified bit width is greater than or equal to raw bit width")
}

let packed = bitpack(&array, bit_width)?;
// SAFETY: we check that array only contains non-negative values.
let packed = unsafe { bitpack_unchecked(&array, bit_width)? };
let patches = (num_exceptions > 0)
.then(|| gather_patches(&array, bit_width, num_exceptions))
.flatten();

BitPackedArray::try_new(
packed,
array.ptype(),
array.validity(),
patches,
bit_width,
array.len(),
)
// SAFETY: values already checked to be non-negative.
unsafe {
BitPackedArray::new_unchecked(
packed,
array.ptype(),
array.validity(),
patches,
bit_width,
array.len(),
)
}
}

/// Bitpack an array into the specified bit-width without checking statistics.
@@ -54,22 +69,33 @@ pub unsafe fn bitpack_encode_unchecked(
array: PrimitiveArray,
bit_width: u8,
) -> VortexResult<BitPackedArray> {
let packed = bitpack(&array, bit_width)?;

BitPackedArray::try_new(
packed,
array.ptype(),
array.validity(),
None,
bit_width,
array.len(),
)
// SAFETY: non-negativity of input checked by caller.
unsafe {
let packed = bitpack_unchecked(&array, bit_width)?;

BitPackedArray::new_unchecked(
packed,
array.ptype(),
array.validity(),
None,
bit_width,
array.len(),
)
}
}

/// Bitpack a [PrimitiveArray] to the given width.
///
/// On success, returns a [Buffer] containing the packed data.
pub fn bitpack(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
///
/// # Safety
///
/// Internally this function will promote the provided array to its unsigned equivalent. This will
/// violate ordering guarantees if the array contains any negative values.
///
/// It is the caller's responsibility to ensure that `parray` is non-negative before calling
/// this function.
pub unsafe fn bitpack_unchecked(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
let parray = parray.reinterpret_cast(parray.ptype().to_unsigned());
let packed = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| {
bitpack_primitive(parray.maybe_null_slice::<$P>(), bit_width)
@@ -358,7 +384,8 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize {
#[cfg(test)]
#[allow(clippy::cast_possible_truncation)]
mod test {
use vortex_array::{IntoArrayVariant, IntoCanonical, ToArrayData};
use vortex_array::{IntoArrayVariant, ToArrayData};
use vortex_error::VortexError;

use super::*;

@@ -430,25 +457,12 @@ mod test {
}

#[test]
fn compress_signed_roundtrip() {
fn compress_signed_fails() {
let values: Vec<i64> = (-500..500).collect();
let array = PrimitiveArray::from_vec(values.clone(), Validity::AllValid);
let array = PrimitiveArray::from_vec(values, Validity::AllValid);
assert!(array.ptype().is_signed_int());

let bitpacked_array =
BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
let num_patches = bitpacked_array
.patches()
.as_ref()
.map(Patches::num_patches)
.unwrap_or_default();
assert_eq!(num_patches, 500);

let unpacked = bitpacked_array
.into_canonical()
.unwrap()
.into_primitive()
.unwrap();
assert_eq!(unpacked.into_maybe_null_slice::<i64>(), values);
let err = BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap_err();
assert!(matches!(err, VortexError::InvalidArgument(_, _)));
}
}
6 changes: 3 additions & 3 deletions encodings/fastlanes/src/bitpacking/compute/filter.rs
@@ -181,18 +181,18 @@ mod test {
// Elements 0..=499 are negative integers (patches)
// Element 500 = 0 (packed)
// Elements 501..999 are positive integers (packed)
let values: Vec<i64> = (-500..500).collect_vec();
let values: Vec<i64> = (0..500).collect_vec();
let unpacked = PrimitiveArray::from(values.clone());
let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap();
let filtered = filter(
bitpacked.as_ref(),
FilterMask::from_indices(values.len(), 250..750),
FilterMask::from_indices(values.len(), 0..500),
)
.unwrap()
.into_primitive()
.unwrap()
.into_maybe_null_slice::<i64>();

assert_eq!(filtered.as_slice(), &values[250..750]);
assert_eq!(filtered, values);
}
}
25 changes: 14 additions & 11 deletions encodings/fastlanes/src/bitpacking/compute/scalar_at.rs
@@ -33,18 +33,21 @@ mod test {

#[test]
fn invalid_patches() {
let packed_array = BitPackedArray::try_new(
Buffer::from(vec![0u8; 128]),
PType::U32,
Validity::AllInvalid,
Some(Patches::new(
// SAFETY: using unsigned PType
let packed_array = unsafe {
BitPackedArray::new_unchecked(
Buffer::from(vec![0u8; 128]),
PType::U32,
Validity::AllInvalid,
Some(Patches::new(
8,
PrimitiveArray::from_vec(vec![1u32], NonNullable).into_array(),
PrimitiveArray::from_vec(vec![999u32], Validity::AllValid).into_array(),
)),
1,
8,
PrimitiveArray::from_vec(vec![1u32], NonNullable).into_array(),
PrimitiveArray::from_vec(vec![999u32], Validity::AllValid).into_array(),
)),
1,
8,
)
)
}
.unwrap()
.into_array();
assert_eq!(
18 changes: 13 additions & 5 deletions encodings/fastlanes/src/bitpacking/compute/search_sorted.rs
@@ -13,7 +13,7 @@ use vortex_array::stats::ArrayStatistics;
use vortex_array::validity::Validity;
use vortex_array::variants::PrimitiveArrayTrait;
use vortex_array::{ArrayDType, ArrayLen};
use vortex_dtype::{match_each_unsigned_integer_ptype, NativePType};
use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType};
use vortex_error::{VortexError, VortexExpect as _, VortexResult};
use vortex_scalar::Scalar;

@@ -26,7 +26,9 @@ impl SearchSortedFn<BitPackedArray> for BitPackedEncoding {
value: &Scalar,
side: SearchSortedSide,
) -> VortexResult<SearchResult> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
// NOTE: it is a precondition of BitPackedArray that all values must be >= 0, thus it is
// always safe to promote to unsigned type without loss of ordering of the values.
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
search_sorted_typed::<$P>(array, value, side)
})
}
@@ -59,7 +61,7 @@ impl SearchSortedUsizeFn<BitPackedArray> for BitPackedEncoding {
value: usize,
side: SearchSortedSide,
) -> VortexResult<SearchResult> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
// NOTE: conversion may truncate silently.
if let Some(pvalue) = num_traits::cast::<usize, $P>(value) {
search_sorted_native(array, pvalue, side)
@@ -77,7 +79,7 @@ impl SearchSortedUsizeFn<BitPackedArray> for BitPackedEncoding {
values: &[usize],
side: SearchSortedSide,
) -> VortexResult<Vec<SearchResult>> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
let searcher = BitPackedSearch::<'_, $P>::new(array);

values
@@ -104,7 +106,13 @@
+ AsPrimitive<usize>
+ AsPrimitive<u64>,
{
let native_value: T = value.cast(array.dtype())?.try_into()?;
// NOTE: we use the unsigned variant of the BitPackedArray DType so that we can use it
// in the BitPackedSearch. We need a type that impls fastlanes::BitPack, and it is a
// precondition for BitPackedArray that all values must be non-negative, so promotion
// is cheap and safe.
let native_value: T = value
.cast(&DType::from(array.ptype().to_unsigned()))?
.try_into()?;
search_sorted_native(array, native_value, side)
}

30 changes: 17 additions & 13 deletions encodings/fastlanes/src/bitpacking/compute/slice.rs
@@ -18,19 +18,23 @@ impl SliceFn<BitPackedArray> for BitPackedEncoding {
let encoded_start = (block_start / 8) * array.bit_width() as usize;
let encoded_stop = (block_stop / 8) * array.bit_width() as usize;
// slice the buffer using the encoded start/stop values
BitPackedArray::try_new_from_offset(
array.packed().slice(encoded_start..encoded_stop),
array.ptype(),
array.validity().slice(start, stop)?,
array
.patches()
.map(|p| p.slice(start, stop))
.transpose()?
.flatten(),
array.bit_width(),
stop - start,
offset as u16,
)

// SAFETY: the invariants of the original BitPackedArray are preserved when slicing.
unsafe {
BitPackedArray::new_unchecked_with_offset(
array.packed().slice(encoded_start..encoded_stop),
array.ptype(),
array.validity().slice(start, stop)?,
array
.patches()
.map(|p| p.slice(start, stop))
.transpose()?
.flatten(),
array.bit_width(),
stop - start,
offset as u16,
)
}
.map(|a| a.into_array())
}
}
4 changes: 3 additions & 1 deletion encodings/fastlanes/src/bitpacking/compute/take.rs
@@ -26,7 +26,9 @@ impl TakeFn<BitPackedArray> for BitPackedEncoding {
return take(array.clone().into_canonical()?.into_primitive()?, indices);
}

let ptype: PType = array.dtype().try_into()?;
// NOTE: we use the unsigned PType because all values in the BitPackedArray must
// be non-negative (pre-condition of creating the BitPackedArray).
let ptype: PType = PType::try_from(array.dtype())?.to_unsigned();
let validity = array.validity();
let taken_validity = validity.take(indices)?;

45 changes: 41 additions & 4 deletions encodings/fastlanes/src/bitpacking/mod.rs
@@ -42,18 +42,41 @@ impl BitPackedArray {
/// Create a new bitpacked array using a buffer of packed data.
///
/// The packed data should be interpreted as a sequence of values with size `bit_width`.
pub fn try_new(
///
/// # Errors
///
/// This method returns errors if any of the metadata is inconsistent, for example the packed
/// buffer provided does not have the right size according to the supplied length and target
/// PType.
///
/// # Safety
///
/// For signed arrays, it is the caller's responsibility to ensure that there are no values
/// that can be interpreted as negative once unpacked to the provided PType.
///
/// This invariant is upheld by the compressor, but callers must ensure this if they wish to
/// construct a new `BitPackedArray` from parts.
///
/// See also the [`encode`][Self::encode] method on this type for a safe path to create a new
/// bit-packed array.
pub unsafe fn new_unchecked(
packed: Buffer,
ptype: PType,
validity: Validity,
patches: Option<Patches>,
bit_width: u8,
len: usize,
) -> VortexResult<Self> {
Self::try_new_from_offset(packed, ptype, validity, patches, bit_width, len, 0)
// SAFETY: checked by caller.
unsafe {
Self::new_unchecked_with_offset(packed, ptype, validity, patches, bit_width, len, 0)
}
}

pub(crate) fn try_new_from_offset(
/// An unsafe constructor for a `BitPackedArray` that also specifies a slicing offset.
///
/// See also [`new_unchecked`][Self::new_unchecked].
pub(crate) unsafe fn new_unchecked_with_offset(
packed: Buffer,
ptype: PType,
validity: Validity,
@@ -182,6 +205,17 @@ impl BitPackedArray {
})
}

/// Bit-pack an array of primitive integers down to the target bit-width using the FastLanes
/// SIMD-accelerated packing kernels.
///
/// # Errors
///
/// If the provided array is not an integer type, an error will be returned.
///
/// If the provided array contains negative values, an error will be returned.
///
/// If the requested bit-width for packing is larger than the array's native width, an
/// error will be returned.
pub fn encode(array: &ArrayData, bit_width: u8) -> VortexResult<Self> {
if let Ok(parray) = PrimitiveArray::try_from(array.clone()) {
bitpack_encode(parray, bit_width)
@@ -190,8 +224,11 @@
}
}

/// Calculate the maximum value that **can** be contained by this array, given its bit-width.
///
/// Note that this value need not actually be present in the array.
#[inline]
pub fn max_packed_value(&self) -> usize {
fn max_packed_value(&self) -> usize {
(1 << self.bit_width()) - 1
}
}