From 36c33bd41cd876b85c496335d5958dfcc257dc7c Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 5 Sep 2024 09:09:09 -0400 Subject: [PATCH] Add fuzzing for Take and SearchSorted functions (#724) --- Cargo.lock | 4 + fuzz/Cargo.toml | 5 +- fuzz/fuzz_targets/fuzz_target_1.rs | 78 ++++++++++++++--- fuzz/src/lib.rs | 95 +++++++++++---------- vortex-array/src/array/varbinview/mod.rs | 9 +- vortex-array/src/compute/search_sorted.rs | 10 +++ vortex-sampling-compressor/Cargo.toml | 5 ++ vortex-sampling-compressor/src/arbitrary.rs | 19 +++++ vortex-sampling-compressor/src/lib.rs | 37 ++++---- vortex-scalar/Cargo.toml | 2 + vortex-scalar/src/arbitrary.rs | 59 +++++++++++++ vortex-scalar/src/lib.rs | 2 + 12 files changed, 247 insertions(+), 78 deletions(-) create mode 100644 vortex-sampling-compressor/src/arbitrary.rs create mode 100644 vortex-scalar/src/arbitrary.rs diff --git a/Cargo.lock b/Cargo.lock index 90cd4b35c0..661079359b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4481,6 +4481,7 @@ dependencies = [ "libfuzzer-sys", "vortex-array", "vortex-dtype", + "vortex-error", "vortex-sampling-compressor", "vortex-scalar", ] @@ -4542,8 +4543,10 @@ dependencies = [ name = "vortex-sampling-compressor" version = "0.7.0" dependencies = [ + "arbitrary", "chrono", "fsst-rs", + "lazy_static", "log", "rand", "vortex-alp", @@ -4564,6 +4567,7 @@ dependencies = [ name = "vortex-scalar" version = "0.7.0" dependencies = [ + "arbitrary", "arrow-array", "datafusion-common", "flatbuffers", diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 8df80c65fa..71e7e9fd05 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -12,8 +12,9 @@ cargo-fuzz = true libfuzzer-sys = { workspace = true } vortex-array = { workspace = true, features = ["arbitrary"] } vortex-dtype = { workspace = true } -vortex-sampling-compressor = { workspace = true } -vortex-scalar = { workspace = true } +vortex-error = { workspace = true } +vortex-sampling-compressor = { workspace = true, features = ["arbitrary"] } +vortex-scalar = { workspace = true, features = ["arbitrary"] } [lib] name = "vortex_fuzz" diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index 4ad490afb3..53d0864251 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -1,27 +1,19 @@ #![no_main] -use std::collections::HashSet; - use libfuzzer_sys::{fuzz_target, Corpus}; -use vortex::compute::slice; use vortex::compute::unary::scalar_at; +use vortex::compute::{search_sorted, slice, take, SearchResult, SearchSorted, SearchSortedSide}; use vortex::encoding::EncodingId; use vortex::Array; +use vortex_error::VortexResult; use vortex_fuzz::{Action, FuzzArrayAction}; -use vortex_sampling_compressor::compressors::CompressorRef; use vortex_sampling_compressor::SamplingCompressor; use vortex_scalar::{PValue, Scalar, ScalarValue}; fuzz_target!(|fuzz_action: FuzzArrayAction| -> Corpus { let FuzzArrayAction { array, actions } = fuzz_action; - - // TODO(adamg): We actually might want to test empty things, but I'm punting this issue for now - if array.is_empty() { - return Corpus::Reject; - }; - match &actions[0] { - Action::Compress(c) => match fuzz_compress(&array, *c) { + Action::Compress(c) => match fuzz_compress(&array, c) { Some(compressed_array) => { assert_array_eq(&array, &compressed_array); Corpus::Keep @@ -33,12 +25,28 @@ fuzz_target!(|fuzz_action: FuzzArrayAction| -> Corpus { assert_slice(&array, &slice, range.start); Corpus::Keep } + Action::SearchSorted(s, side) => { + if !array_is_sorted(&array).unwrap() { + return Corpus::Reject; + } + + let search_result = search_sorted(&array, s.clone(), *side).unwrap(); + assert_search_sorted(&array, s, *side, search_result); + Corpus::Keep + } + Action::Take(indices) => { + if indices.is_empty() { + return Corpus::Reject; + } + let taken = take(&array, indices).unwrap(); + assert_take(&array, &taken, indices); + Corpus::Keep + } } }); -fn fuzz_compress(array: &Array, compressor_ref: CompressorRef<'_>) -> Option { - let ctx = SamplingCompressor::new(HashSet::from([compressor_ref])); - let compressed_array = ctx.compress(array, None).unwrap(); +fn fuzz_compress(array: &Array, compressor: &SamplingCompressor) -> Option { + let compressed_array = compressor.compress(array, None).unwrap(); compressed_array .path() @@ -46,6 +54,32 @@ fn fuzz_compress(array: &Array, compressor_ref: CompressorRef<'_>) -> Option VortexResult { + if array.is_empty() { + return Ok(true); + } + + let mut last_value = scalar_at(array, 0)?; + for i in 1..array.len() { + let next_value = scalar_at(array, i)?; + if next_value < last_value { + return Ok(false); + } + last_value = next_value; + } + Ok(true) +} diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs index c8b42c71bd..fe2473398d 100644 --- a/fuzz/src/lib.rs +++ b/fuzz/src/lib.rs @@ -1,67 +1,61 @@ use std::fmt::Debug; +use std::iter; use std::ops::Range; use libfuzzer_sys::arbitrary::{Arbitrary, Result, Unstructured}; -use vortex::Array; -use vortex_sampling_compressor::compressors::alp::ALPCompressor; -use vortex_sampling_compressor::compressors::bitpacked::BitPackedCompressor; -use vortex_sampling_compressor::compressors::date_time_parts::DateTimePartsCompressor; -use vortex_sampling_compressor::compressors::dict::DictCompressor; -use vortex_sampling_compressor::compressors::r#for::FoRCompressor; -use vortex_sampling_compressor::compressors::roaring_bool::RoaringBoolCompressor; -use vortex_sampling_compressor::compressors::roaring_int::RoaringIntCompressor; -use vortex_sampling_compressor::compressors::runend::DEFAULT_RUN_END_COMPRESSOR; -use vortex_sampling_compressor::compressors::sparse::SparseCompressor; -use vortex_sampling_compressor::compressors::zigzag::ZigZagCompressor; -use vortex_sampling_compressor::compressors::EncodingCompressor; +use vortex::array::PrimitiveArray; +use vortex::compute::unary::scalar_at; +use vortex::compute::SearchSortedSide; +use vortex::{Array, ArrayDType}; +use vortex_sampling_compressor::SamplingCompressor; +use vortex_scalar::arbitrary::random_scalar; +use vortex_scalar::Scalar; +#[derive(Debug)] pub struct FuzzArrayAction { pub array: Array, pub actions: Vec, } -impl Debug for FuzzArrayAction { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("FuzzArrayAction") - .field("action", &self.actions) - .field("array", &self.array) - .finish() - } -} - +#[derive(Debug)] pub enum Action { - Compress(&'static dyn EncodingCompressor), + Compress(SamplingCompressor<'static>), Slice(Range), -} - -impl Debug for Action { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Slice(arg0) => f.debug_tuple("Slice").field(arg0).finish(), - Self::Compress(c) => write!(f, "Compress({})", c.id()), - } - } + Take(Array), + SearchSorted(Scalar, SearchSortedSide), } impl<'a> Arbitrary<'a> for FuzzArrayAction { fn arbitrary(u: &mut Unstructured<'a>) -> Result { let array = Array::arbitrary(u)?; - let action = match u.int_in_range(0..=10)? { - 0 => { - let start = u.choose_index(array.len())?; - let stop = u.choose_index(array.len() - start)? + start; + let len = array.len(); + let action = match u.int_in_range(0..=3)? { + 0 => Action::Compress(u.arbitrary()?), + 1 => { + let start = u.choose_index(len)?; + let stop = u.int_in_range(start..=len)?; Action::Slice(start..stop) } - 1 => Action::Compress(&ALPCompressor), - 2 => Action::Compress(&BitPackedCompressor), - 3 => Action::Compress(&DictCompressor), - 4 => Action::Compress(&FoRCompressor), - 5 => Action::Compress(&RoaringBoolCompressor), - 6 => Action::Compress(&RoaringIntCompressor), - 7 => Action::Compress(&DEFAULT_RUN_END_COMPRESSOR), - 8 => Action::Compress(&SparseCompressor), - 9 => Action::Compress(&ZigZagCompressor), - 10 => Action::Compress(&DateTimePartsCompressor), + 2 => { + let indices = PrimitiveArray::from(random_vec_in_range(u, 0, len - 1)?).into(); + let compressed = SamplingCompressor::default() + .compress(&indices, None) + .unwrap(); + Action::Take(compressed.into_array()) + } + 3 => { + let side = if u.arbitrary()? { + SearchSortedSide::Left + } else { + SearchSortedSide::Right + }; + if u.arbitrary()? { + let random_value_in_array = scalar_at(&array, u.choose_index(len)?).unwrap(); + Action::SearchSorted(random_value_in_array, side) + } else { + Action::SearchSorted(random_scalar(u, array.dtype())?, side) + } + } _ => unreachable!(), }; @@ -71,3 +65,14 @@ impl<'a> Arbitrary<'a> for FuzzArrayAction { }) } } + +fn random_vec_in_range(u: &mut Unstructured<'_>, min: usize, max: usize) -> Result> { + iter::from_fn(|| { + if u.arbitrary().unwrap_or(false) { + Some(u.int_in_range(min..=max).map(|i| i as u64)) + } else { + None + } + }) + .collect::>>() +} diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index d8a81b4f84..48ed0a5e5e 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -257,10 +257,15 @@ impl ArrayTrait for VarBinViewArray {} impl IntoCanonical for VarBinViewArray { fn into_canonical(self) -> VortexResult { + let arrow_dtype = if matches!(self.dtype(), &DType::Utf8(_)) { + &DataType::Utf8 + } else { + &DataType::Binary + }; let nullable = self.dtype().is_nullable(); let arrow_self = as_arrow(self); - let arrow_varbin = arrow_cast::cast(arrow_self.deref(), &DataType::Utf8) - .map_err(VortexError::ArrowError)?; + let arrow_varbin = + arrow_cast::cast(arrow_self.deref(), arrow_dtype).map_err(VortexError::ArrowError)?; let vortex_array = Array::from_arrow(arrow_varbin, nullable); Ok(Canonical::VarBin(VarBinArray::try_from(&vortex_array)?)) diff --git a/vortex-array/src/compute/search_sorted.rs b/vortex-array/src/compute/search_sorted.rs index ff4f476a3e..05e5269415 100644 --- a/vortex-array/src/compute/search_sorted.rs +++ b/vortex-array/src/compute/search_sorted.rs @@ -1,5 +1,6 @@ use std::cmp::Ordering; use std::cmp::Ordering::{Equal, Greater, Less}; +use std::fmt::{Display, Formatter}; use vortex_error::{vortex_bail, VortexResult}; use vortex_scalar::Scalar; @@ -13,6 +14,15 @@ pub enum SearchSortedSide { Right, } +impl Display for SearchSortedSide { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + SearchSortedSide::Left => write!(f, "left"), + SearchSortedSide::Right => write!(f, "right"), + } + } +} + #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum SearchResult { Found(usize), diff --git a/vortex-sampling-compressor/Cargo.toml b/vortex-sampling-compressor/Cargo.toml index 4e345eab0f..56d2b92ddd 100644 --- a/vortex-sampling-compressor/Cargo.toml +++ b/vortex-sampling-compressor/Cargo.toml @@ -13,6 +13,8 @@ rust-version = { workspace = true } [dependencies] fsst-rs = { workspace = true } +arbitrary = { workspace = true, optional = true } +lazy_static = { workspace = true } log = { workspace = true } rand = { workspace = true } vortex-alp = { workspace = true } @@ -33,3 +35,6 @@ chrono = { workspace = true } [lints] workspace = true + +[features] +arbitrary = ["dep:arbitrary"] diff --git a/vortex-sampling-compressor/src/arbitrary.rs b/vortex-sampling-compressor/src/arbitrary.rs new file mode 100644 index 0000000000..d84a1f0917 --- /dev/null +++ b/vortex-sampling-compressor/src/arbitrary.rs @@ -0,0 +1,19 @@ +use std::collections::HashSet; + +use arbitrary::{Arbitrary, Result, Unstructured}; + +use crate::compressors::{CompressorRef, EncodingCompressor}; +use crate::{SamplingCompressor, ALL_COMPRESSORS}; + +impl<'a, 'b: 'a> Arbitrary<'a> for SamplingCompressor<'b> { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + let compressors: HashSet = u.arbitrary()?; + Ok(Self::new(compressors)) + } +} + +impl<'a, 'b: 'a> Arbitrary<'a> for &'b dyn EncodingCompressor { + fn arbitrary(u: &mut Unstructured<'a>) -> Result { + u.choose(&ALL_COMPRESSORS.clone()).cloned() + } +} diff --git a/vortex-sampling-compressor/src/lib.rs b/vortex-sampling-compressor/src/lib.rs index 35e129c400..ad5207672e 100644 --- a/vortex-sampling-compressor/src/lib.rs +++ b/vortex-sampling-compressor/src/lib.rs @@ -2,6 +2,7 @@ use std::collections::HashSet; use std::fmt::{Debug, Display, Formatter}; use compressors::fsst::FSSTCompressor; +use lazy_static::lazy_static; use log::{debug, info, warn}; use vortex::array::{Chunked, ChunkedArray, Constant, Struct, StructArray}; use vortex::compress::{check_dtype_unchanged, check_validity_unchanged, CompressionStrategy}; @@ -26,9 +27,29 @@ use crate::compressors::zigzag::ZigZagCompressor; use crate::compressors::{CompressedArray, CompressionTree, CompressorRef, EncodingCompressor}; use crate::sampling::stratified_slices; +#[cfg(feature = "arbitrary")] +pub mod arbitrary; pub mod compressors; mod sampling; +lazy_static! { + pub static ref ALL_COMPRESSORS: [CompressorRef<'static>; 11] = [ + &ALPCompressor as CompressorRef, + &BitPackedCompressor, + &DateTimePartsCompressor, + &DEFAULT_RUN_END_COMPRESSOR, + // TODO(robert): Implement minimal compute for DeltaArrays - scalar_at and slice + // &DeltaCompressor, + &DictCompressor, + &FoRCompressor, + &FSSTCompressor, + &RoaringBoolCompressor, + &RoaringIntCompressor, + &SparseCompressor, + &ZigZagCompressor, + ]; +} + #[derive(Debug, Clone)] pub struct CompressConfig { sample_size: u16, @@ -80,21 +101,7 @@ impl CompressionStrategy for SamplingCompressor<'_> { impl Default for SamplingCompressor<'_> { fn default() -> Self { - Self::new(HashSet::from([ - &ALPCompressor as CompressorRef, - &BitPackedCompressor, - // TODO(robert): Implement minimal compute for DeltaArrays - scalar_at and slice - // &DeltaCompressor, - &DictCompressor, - &FSSTCompressor, - &FoRCompressor, - &DateTimePartsCompressor, - &RoaringBoolCompressor, - &RoaringIntCompressor, - &DEFAULT_RUN_END_COMPRESSOR, - &SparseCompressor, - &ZigZagCompressor, - ])) + Self::new(HashSet::from(*ALL_COMPRESSORS)) } } diff --git a/vortex-scalar/Cargo.toml b/vortex-scalar/Cargo.toml index df5667c3b5..86bafd614d 100644 --- a/vortex-scalar/Cargo.toml +++ b/vortex-scalar/Cargo.toml @@ -12,6 +12,7 @@ edition = { workspace = true } rust-version = { workspace = true } [dependencies] +arbitrary = { workspace = true, optional = true } arrow-array = { workspace = true } datafusion-common = { workspace = true, optional = true } flatbuffers = { workspace = true, optional = true } @@ -36,6 +37,7 @@ workspace = true [features] # Uncomment for improved IntelliJ support #default = ["flatbuffers", "proto", "serde"] +arbitrary = ["dep:arbitrary"] datafusion = ["dep:datafusion-common"] flatbuffers = [ "dep:flatbuffers", diff --git a/vortex-scalar/src/arbitrary.rs b/vortex-scalar/src/arbitrary.rs new file mode 100644 index 0000000000..e8c687406c --- /dev/null +++ b/vortex-scalar/src/arbitrary.rs @@ -0,0 +1,59 @@ +use std::iter; + +use arbitrary::{Result, Unstructured}; +use vortex_buffer::{Buffer, BufferString}; +use vortex_dtype::half::f16; +use vortex_dtype::{DType, PType}; + +use crate::{PValue, Scalar, ScalarValue}; + +pub fn random_scalar(u: &mut Unstructured, dtype: &DType) -> Result { + Ok(Scalar::new(dtype.clone(), random_scalar_value(u, dtype)?)) +} + +fn random_scalar_value(u: &mut Unstructured, dtype: &DType) -> Result { + match dtype { + DType::Null => Ok(ScalarValue::Null), + DType::Bool(_) => Ok(ScalarValue::Bool(u.arbitrary()?)), + DType::Primitive(p, _) => Ok(ScalarValue::Primitive(random_pvalue(u, p)?)), + DType::Utf8(_) => Ok(ScalarValue::BufferString(BufferString::from( + u.arbitrary::()?, + ))), + DType::Binary(_) => Ok(ScalarValue::Buffer(Buffer::from(u.arbitrary::>()?))), + DType::Struct(d, _) => Ok(ScalarValue::List( + d.dtypes() + .iter() + .map(|d| random_scalar_value(u, d)) + .collect::>>()? + .into(), + )), + DType::List(d, _) => Ok(ScalarValue::List( + iter::from_fn(|| { + u.arbitrary() + .unwrap_or(false) + .then(|| random_scalar_value(u, d)) + }) + .collect::>>()? + .into(), + )), + DType::Extension(..) => { + unreachable!("Can't yet generate arbitrary scalars for ext dtype") + } + } +} + +fn random_pvalue(u: &mut Unstructured, ptype: &PType) -> Result { + Ok(match ptype { + PType::U8 => PValue::U8(u.arbitrary()?), + PType::U16 => PValue::U16(u.arbitrary()?), + PType::U32 => PValue::U32(u.arbitrary()?), + PType::U64 => PValue::U64(u.arbitrary()?), + PType::I8 => PValue::I8(u.arbitrary()?), + PType::I16 => PValue::I16(u.arbitrary()?), + PType::I32 => PValue::I32(u.arbitrary()?), + PType::I64 => PValue::I64(u.arbitrary()?), + PType::F16 => PValue::F16(f16::from_bits(u.arbitrary()?)), + PType::F32 => PValue::F32(u.arbitrary()?), + PType::F64 => PValue::F64(u.arbitrary()?), + }) +} diff --git a/vortex-scalar/src/lib.rs b/vortex-scalar/src/lib.rs index 6fb969c8fb..c67e297914 100644 --- a/vortex-scalar/src/lib.rs +++ b/vortex-scalar/src/lib.rs @@ -2,6 +2,8 @@ use std::cmp::Ordering; use vortex_dtype::DType; +#[cfg(feature = "arbitrary")] +pub mod arbitrary; mod arrow; mod binary; mod bool;