Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: BitPackedArray enforces can only be built over non-negative values #1705

Merged
merged 7 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 52 additions & 38 deletions encodings/fastlanes/src/bitpacking/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,41 @@ pub fn bitpack_encode(array: PrimitiveArray, bit_width: u8) -> VortexResult<BitP
.statistics()
.compute_bit_width_freq()
.ok_or_else(|| vortex_err!(ComputeError: "missing bit width frequency"))?;

// Check array contains no negative values.
if array.ptype().is_signed_int() {
let has_negative_values = match_each_integer_ptype!(array.ptype(), |$P| {
array.statistics().compute_min::<Option<$P>>().unwrap_or_default().unwrap_or_default() < 0
});
if has_negative_values {
vortex_bail!("cannot bitpack_encode array containing negative integers")
}
}

let num_exceptions = count_exceptions(bit_width, &bit_width_freq);

if bit_width >= array.ptype().bit_width() as u8 {
// Nothing we can do
vortex_bail!("Cannot pack -- specified bit width is greater than or equal to raw bit width")
}

let packed = bitpack(&array, bit_width)?;
// SAFETY: we check that array only contains non-negative values.
let packed = unsafe { bitpack_unchecked(&array, bit_width)? };
let patches = (num_exceptions > 0)
.then(|| gather_patches(&array, bit_width, num_exceptions))
.flatten();

BitPackedArray::try_new(
packed,
array.ptype(),
array.validity(),
patches,
bit_width,
array.len(),
)
// SAFETY: values already checked to be non-negative.
unsafe {
BitPackedArray::try_new(
packed,
array.ptype(),
array.validity(),
patches,
bit_width,
array.len(),
)
}
}

/// Bitpack an array into the specified bit-width without checking statistics.
Expand All @@ -54,22 +69,33 @@ pub unsafe fn bitpack_encode_unchecked(
array: PrimitiveArray,
bit_width: u8,
a10y marked this conversation as resolved.
Show resolved Hide resolved
) -> VortexResult<BitPackedArray> {
let packed = bitpack(&array, bit_width)?;

BitPackedArray::try_new(
packed,
array.ptype(),
array.validity(),
None,
bit_width,
array.len(),
)
// SAFETY: non-negativity of input checked by caller.
unsafe {
let packed = bitpack_unchecked(&array, bit_width)?;

BitPackedArray::try_new(
packed,
array.ptype(),
array.validity(),
None,
bit_width,
array.len(),
)
}
}

/// Bitpack a [PrimitiveArray] to the given width.
///
/// On success, returns a [Buffer] containing the packed data.
pub fn bitpack(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
///
/// # Safety
///
/// Internally this function will promote the provided array to its unsigned equivalent. This will
/// violate ordering guarantees if the array contains any negative values.
///
/// It is the caller's responsibility to ensure that `parray` is non-negative before calling
/// this function.
pub unsafe fn bitpack_unchecked(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
let parray = parray.reinterpret_cast(parray.ptype().to_unsigned());
let packed = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| {
bitpack_primitive(parray.maybe_null_slice::<$P>(), bit_width)
Expand Down Expand Up @@ -358,7 +384,8 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize {
#[cfg(test)]
#[allow(clippy::cast_possible_truncation)]
mod test {
use vortex_array::{IntoArrayVariant, IntoCanonical, ToArrayData};
use vortex_array::{IntoArrayVariant, ToArrayData};
use vortex_error::VortexError;

use super::*;

Expand Down Expand Up @@ -430,25 +457,12 @@ mod test {
}

#[test]
fn compress_signed_roundtrip() {
fn compress_signed_fails() {
let values: Vec<i64> = (-500..500).collect();
let array = PrimitiveArray::from_vec(values.clone(), Validity::AllValid);
let array = PrimitiveArray::from_vec(values, Validity::AllValid);
assert!(array.ptype().is_signed_int());

let bitpacked_array =
BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
let num_patches = bitpacked_array
.patches()
.as_ref()
.map(Patches::num_patches)
.unwrap_or_default();
assert_eq!(num_patches, 500);

let unpacked = bitpacked_array
.into_canonical()
.unwrap()
.into_primitive()
.unwrap();
assert_eq!(unpacked.into_maybe_null_slice::<i64>(), values);
let err = BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap_err();
assert!(matches!(err, VortexError::InvalidArgument(_, _)));
}
}
6 changes: 3 additions & 3 deletions encodings/fastlanes/src/bitpacking/compute/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,18 +181,18 @@ mod test {
// Elements 0..=499 are negative integers (patches)
// Element 500 = 0 (packed)
// Elements 501..999 are positive integers (packed)
let values: Vec<i64> = (-500..500).collect_vec();
let values: Vec<i64> = (0..500).collect_vec();
let unpacked = PrimitiveArray::from(values.clone());
let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap();
let filtered = filter(
bitpacked.as_ref(),
FilterMask::from_indices(values.len(), 250..750),
FilterMask::from_indices(values.len(), 0..500),
)
.unwrap()
.into_primitive()
.unwrap()
.into_maybe_null_slice::<i64>();

assert_eq!(filtered.as_slice(), &values[250..750]);
assert_eq!(filtered, values);
}
}
25 changes: 14 additions & 11 deletions encodings/fastlanes/src/bitpacking/compute/scalar_at.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,21 @@ mod test {

#[test]
fn invalid_patches() {
let packed_array = BitPackedArray::try_new(
Buffer::from(vec![0u8; 128]),
PType::U32,
Validity::AllInvalid,
Some(Patches::new(
// SAFETY: using unsigned PType
let packed_array = unsafe {
BitPackedArray::try_new(
Buffer::from(vec![0u8; 128]),
PType::U32,
Validity::AllInvalid,
Some(Patches::new(
8,
PrimitiveArray::from_vec(vec![1u32], NonNullable).into_array(),
PrimitiveArray::from_vec(vec![999u32], Validity::AllValid).into_array(),
)),
1,
8,
PrimitiveArray::from_vec(vec![1u32], NonNullable).into_array(),
PrimitiveArray::from_vec(vec![999u32], Validity::AllValid).into_array(),
)),
1,
8,
)
)
}
.unwrap()
.into_array();
assert_eq!(
Expand Down
18 changes: 13 additions & 5 deletions encodings/fastlanes/src/bitpacking/compute/search_sorted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use vortex_array::stats::ArrayStatistics;
use vortex_array::validity::Validity;
use vortex_array::variants::PrimitiveArrayTrait;
use vortex_array::{ArrayDType, ArrayLen};
use vortex_dtype::{match_each_unsigned_integer_ptype, NativePType};
use vortex_dtype::{match_each_unsigned_integer_ptype, DType, NativePType};
use vortex_error::{VortexError, VortexExpect as _, VortexResult};
use vortex_scalar::Scalar;

Expand All @@ -26,7 +26,9 @@ impl SearchSortedFn<BitPackedArray> for BitPackedEncoding {
value: &Scalar,
side: SearchSortedSide,
) -> VortexResult<SearchResult> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
// NOTE: it is a precondition of BitPackedArray that all values must be >= 0, thus it is
// always safe to promote to unsigned type without loss of ordering of the values.
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
search_sorted_typed::<$P>(array, value, side)
})
}
Expand Down Expand Up @@ -59,7 +61,7 @@ impl SearchSortedUsizeFn<BitPackedArray> for BitPackedEncoding {
value: usize,
side: SearchSortedSide,
) -> VortexResult<SearchResult> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
// NOTE: conversion may truncate silently.
if let Some(pvalue) = num_traits::cast::<usize, $P>(value) {
search_sorted_native(array, pvalue, side)
Expand All @@ -77,7 +79,7 @@ impl SearchSortedUsizeFn<BitPackedArray> for BitPackedEncoding {
values: &[usize],
side: SearchSortedSide,
) -> VortexResult<Vec<SearchResult>> {
match_each_unsigned_integer_ptype!(array.ptype(), |$P| {
match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$P| {
let searcher = BitPackedSearch::<'_, $P>::new(array);

values
Expand All @@ -104,7 +106,13 @@ where
+ AsPrimitive<usize>
+ AsPrimitive<u64>,
{
let native_value: T = value.cast(array.dtype())?.try_into()?;
// NOTE: we use the unsigned variant of the BitPackedArray DType so that we can use it
// in the BitPackedSearch. We need a type that impls fastlanes::BitPack, and it is a
// precondition for BitPackedArray that all values must be non-negative, so promotion
// is cheap and safe.
let native_value: T = value
.cast(&DType::from(array.ptype().to_unsigned()))?
.try_into()?;
search_sorted_native(array, native_value, side)
}

Expand Down
4 changes: 3 additions & 1 deletion encodings/fastlanes/src/bitpacking/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ impl TakeFn<BitPackedArray> for BitPackedEncoding {
return take(array.clone().into_canonical()?.into_primitive()?, indices);
}

let ptype: PType = array.dtype().try_into()?;
// NOTE: we use the unsigned PType because all values in the BitPackedArray must
// be non-negative (pre-condition of creating the BitPackedArray).
let ptype: PType = PType::try_from(array.dtype())?.to_unsigned();
let validity = array.validity();
let taken_validity = validity.take(indices)?;

Expand Down
21 changes: 20 additions & 1 deletion encodings/fastlanes/src/bitpacking/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@ impl BitPackedArray {
/// Create a new bitpacked array using a buffer of packed data.
///
/// The packed data should be interpreted as a sequence of values with size `bit_width`.
pub fn try_new(
///
/// # Safety
///
/// If `ptype` is signed, `packed` may **not** contain any values that once unpacked would
/// be interpreted as negative.
pub unsafe fn try_new(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

API-wise, this feels iffy. There are probably other array types that can get some inputs that create weird states. WDYT about having a try_new_unchecked function + a checked function that does some similar validation to what we have here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thing that is annoying is that this function really shouldn't be pub but it has to be so that it's accessible from the sampling compressor.

Otherwise users of the public API should be going through BitPackedArray::encode.

My goal with the unsafe was to make it clear that there are invariants that users need to check themselves for the args they're passing to the function. I don't think we can check those invariants here without fully decompressing the packed buffer

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. WDYT about just changing it to unchecked to keep the "convention" of try_new as something that errors safely everywhere else?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea i can do that

packed: Buffer,
ptype: PType,
validity: Validity,
Expand Down Expand Up @@ -236,6 +241,20 @@ impl VariantsVTable<BitPackedArray> for BitPackedEncoding {
}

impl PrimitiveArrayTrait for BitPackedArray {}
// impl PrimitiveArrayTrait for BitPackedArray {
// fn ptype(&self) -> PType {
// // NOTE: we use the fastlanes::BitPack provided kernels for compute with BitPackedArray,
// // which is only implemented for unsigned integer types.
// // As a precondition of building a new BitPackedArray, we ensure that it may only
// // contain non-negative values. Thus, it is always safe to read the packed data
// // reinterpreted as the unsigned variant of any integer type.
// if let DType::Primitive(ptype, _) = self.dtype() {
// ptype.to_unsigned()
// } else {
// unreachable!()
// }
// }
// }
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the alternative to doing array.ptype().to_unsigned() above. It'd be a bit less code and potentially less error-prone, I'm just not sure if we have/will ever have codepaths that assume that this exactly matches the DType


#[cfg(test)]
mod test {
Expand Down
14 changes: 13 additions & 1 deletion vortex-array/src/stats/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,10 @@ pub trait Statistics {
/// Clear the value of the statistic
fn clear(&self, stat: Stat);

/// Computes the value of the stat if it's not present
/// Computes the value of the stat if it's not present.
///
/// Returns the scalar if compute succeeded, or `None` if the stat is not supported
/// for this array.
fn compute(&self, stat: Stat) -> Option<Scalar>;

/// Compute all the requested statistics (if not already present)
Expand Down Expand Up @@ -244,6 +247,9 @@ impl dyn Statistics + '_ {
})
}

/// Get or calculate the provided stat, converting the `Scalar` into a typed value.
///
/// This function will panic if the conversion fails.
pub fn compute_as<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(
&self,
stat: Stat,
Expand Down Expand Up @@ -275,10 +281,16 @@ impl dyn Statistics + '_ {
})
}

/// Get or calculate the minimum value in the array, returning as a typed value.
///
/// This function will panic if the conversion fails.
pub fn compute_min<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(&self) -> Option<U> {
self.compute_as(Stat::Min)
}

/// Get or calculate the maximum value in the array, returning as a typed value.
///
/// This function will panic if the conversion fails.
pub fn compute_max<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(&self) -> Option<U> {
self.compute_as(Stat::Max)
}
Expand Down
4 changes: 4 additions & 0 deletions vortex-array/src/variants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ pub trait NullArrayTrait: ArrayTrait {}
pub trait BoolArrayTrait: ArrayTrait {}

pub trait PrimitiveArrayTrait: ArrayTrait {
/// The logical primitive type of the array.
///
/// This is a type that can safely be converted into a `NativePType` for use in
/// `maybe_null_slice` or `into_maybe_null_slice`.
fn ptype(&self) -> PType {
if let DType::Primitive(ptype, ..) = self.dtype() {
*ptype
Expand Down
Loading
Loading