From f5d51ce1e15d4bdb3b56b67abb542ea8fb419c68 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 28 Mar 2024 13:38:01 +0000 Subject: [PATCH 1/2] Implement generic search sorted using scalar_at --- Cargo.lock | 1 + vortex-alp/benches/alp_compress.rs | 11 +- vortex-array/Cargo.toml | 3 + vortex-array/benches/search_sorted.rs | 24 +++ .../array/primitive/compute/search_sorted.rs | 3 +- vortex-array/src/compute/search_sorted.rs | 160 +++++++++++++----- 6 files changed, 153 insertions(+), 49 deletions(-) create mode 100644 vortex-array/benches/search_sorted.rs diff --git a/Cargo.lock b/Cargo.lock index 5cb5d695fd..81d0d570c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5215,6 +5215,7 @@ dependencies = [ "arrow-array 51.0.0", "arrow-buffer 51.0.0", "arrow-schema 51.0.0", + "criterion", "half", "humansize", "itertools 0.12.1", diff --git a/vortex-alp/benches/alp_compress.rs b/vortex-alp/benches/alp_compress.rs index 54d0998fc1..d26137a409 100644 --- a/vortex-alp/benches/alp_compress.rs +++ b/vortex-alp/benches/alp_compress.rs @@ -1,6 +1,4 @@ -use vortex::array::primitive::PrimitiveArray; -use vortex::array::ArrayRef; -use vortex_alp::{ALPArray, ALPFloat, Exponents}; +use vortex_alp::{ALPFloat, Exponents}; fn main() { divan::main(); @@ -11,10 +9,3 @@ fn alp_compress(n: usize) -> (Exponents, Vec, Vec, let values: Vec = vec![T::from(1.234).unwrap(); n]; T::encode(values.as_slice(), None) } - -// TODO(ngates): remove this -#[divan::bench(args = [100_000, 10_000_000])] -fn alp_compress_array(n: usize) -> ArrayRef { - let array = PrimitiveArray::from(vec![1.234f64; n]); - ALPArray::encode(&array).unwrap() -} diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 984f461bc7..f1ed9d4167 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -37,3 +37,6 @@ thiserror = { workspace = true } vortex-alloc = { path = "../vortex-alloc" } vortex-error = { path = "../vortex-error" } vortex-schema = { path = "../vortex-schema" } + +[dev-dependencies] +criterion = { workspace = true } diff --git a/vortex-array/benches/search_sorted.rs b/vortex-array/benches/search_sorted.rs new file mode 100644 index 0000000000..f1271da375 --- /dev/null +++ b/vortex-array/benches/search_sorted.rs @@ -0,0 +1,24 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::distributions::Uniform; +use rand::{thread_rng, Rng}; + +use vortex::compute::search_sorted::{SearchSorted, SearchSortedSide}; + +fn search_sorted(c: &mut Criterion) { + let mut group = c.benchmark_group("search_sorted"); + + let mut rng = thread_rng(); + let range = Uniform::new(0, 100_000_000); + let mut data: Vec = (0..10_000_000).map(|_| rng.sample(range)).collect(); + data.sort(); + let needle = rng.sample(range); + + group.bench_function("std", |b| b.iter(|| black_box(data.binary_search(&needle)))); + + group.bench_function("vortex", |b| { + b.iter(|| black_box(data.search_sorted(&needle, SearchSortedSide::Left))) + }); +} + +criterion_group!(benches, search_sorted); +criterion_main!(benches); diff --git a/vortex-array/src/array/primitive/compute/search_sorted.rs b/vortex-array/src/array/primitive/compute/search_sorted.rs index 0355c1c795..c3d3f27119 100644 --- a/vortex-array/src/array/primitive/compute/search_sorted.rs +++ b/vortex-array/src/array/primitive/compute/search_sorted.rs @@ -17,10 +17,11 @@ impl SearchSortedFn for PrimitiveArray { #[cfg(test)] mod test { - use super::*; use crate::array::IntoArray; use crate::compute::search_sorted::search_sorted; + use super::*; + #[test] fn test_searchsorted_primitive() { let values = vec![1u16, 2, 3].into_array(); diff --git a/vortex-array/src/compute/search_sorted.rs b/vortex-array/src/compute/search_sorted.rs index ed2c80fbb8..4f8ce32c9f 100644 --- a/vortex-array/src/compute/search_sorted.rs +++ b/vortex-array/src/compute/search_sorted.rs @@ -1,12 +1,13 @@ +use std::cmp::Ordering; +use std::cmp::Ordering::{Equal, Greater, Less}; + use vortex_error::{VortexError, VortexResult}; use crate::array::Array; -use crate::compute::flatten::flatten; -use crate::compute::ArrayCompute; +use crate::compute::scalar_at::scalar_at; use crate::scalar::Scalar; -use log::info; -use std::cmp::Ordering; +#[derive(Debug, Copy, Clone)] pub enum SearchSortedSide { Left, Right, @@ -26,45 +27,128 @@ pub fn search_sorted>( return search_sorted.search_sorted(&scalar, side); } - // Otherwise, flatten and try again. - info!("SearchSorted not implemented for {}, flattening", array); - flatten(array)? - .into_array() - .search_sorted() - .map(|f| f.search_sorted(&scalar, side)) - .unwrap_or_else(|| { - Err(VortexError::NotImplemented( - "search_sorted", - array.encoding().id().name(), - )) - }) + if array.scalar_at().is_some() { + return Ok(SearchSorted::search_sorted(&array, &scalar, side)); + } + + Err(VortexError::NotImplemented( + "search_sorted", + array.encoding().id().name(), + )) } -pub trait SearchSorted { - fn search_sorted(&self, value: &T, side: SearchSortedSide) -> usize; +pub trait IndexOrd { + fn index_cmp(&self, idx: usize, elem: &V) -> Option; + + fn index_lt(&self, idx: usize, elem: &V) -> bool { + matches!(self.index_cmp(idx, elem), Some(Less)) + } + + fn index_le(&self, idx: usize, elem: &V) -> bool { + matches!(self.index_cmp(idx, elem), Some(Less | Equal)) + } + + fn index_gt(&self, idx: usize, elem: &V) -> bool { + matches!(self.index_cmp(idx, elem), Some(Greater)) + } + + fn index_ge(&self, idx: usize, elem: &V) -> bool { + matches!(self.index_cmp(idx, elem), Some(Greater | Equal)) + } } -impl SearchSorted for &[T] { - fn search_sorted(&self, value: &T, side: SearchSortedSide) -> usize { +#[allow(clippy::len_without_is_empty)] +pub trait Len { + fn len(&self) -> usize; +} + +pub trait SearchSorted { + fn search_sorted(&self, value: &T, side: SearchSortedSide) -> usize + where + Self: IndexOrd, + { match side { - SearchSortedSide::Left => self - .binary_search_by(|x| { - if x < value { - Ordering::Less - } else { - Ordering::Greater - } - }) - .unwrap_or_else(|x| x), - SearchSortedSide::Right => self - .binary_search_by(|x| { - if x <= value { - Ordering::Less - } else { - Ordering::Greater - } - }) - .unwrap_or_else(|x| x), + SearchSortedSide::Left => self.search_sorted_by(|idx| { + if self.index_lt(idx, value) { + Less + } else { + Greater + } + }), + SearchSortedSide::Right => self.search_sorted_by(|idx| { + if self.index_le(idx, value) { + Less + } else { + Greater + } + }), } } + + fn search_sorted_by Ordering>(&self, f: F) -> usize; +} + +impl + Len + ?Sized, T> SearchSorted for S { + // Code adapted from Rust standard library slice::binary_search_by + fn search_sorted_by Ordering>(&self, mut f: F) -> usize { + // INVARIANTS: + // - 0 <= left <= left + size = right <= self.len() + // - f returns Less for everything in self[..left] + // - f returns Greater for everything in self[right..] + let mut size = self.len(); + let mut left = 0; + let mut right = size; + while left < right { + let mid = left + size / 2; + let cmp = f(mid); + + left = if cmp == Less { mid + 1 } else { left }; + right = if cmp == Greater { mid } else { right }; + if cmp == Equal { + return mid; + } + + size = right - left; + } + + left + } +} + +impl IndexOrd for T { + fn index_cmp(&self, idx: usize, elem: &Scalar) -> Option { + let scalar_a = scalar_at(self, idx).ok()?; + scalar_a.partial_cmp(elem) + } +} + +impl IndexOrd for &dyn Array { + fn index_cmp(&self, idx: usize, elem: &Scalar) -> Option { + let scalar_a = scalar_at(*self, idx).ok()?; + scalar_a.partial_cmp(elem) + } +} + +impl IndexOrd for [T] { + fn index_cmp(&self, idx: usize, elem: &T) -> Option { + self[idx].partial_cmp(elem) + } +} + +impl Len for T { + fn len(&self) -> usize { + T::len(self) + } +} + +impl Len for &dyn Array { + fn len(&self) -> usize { + Array::len(*self) + } +} + +impl Len for [T] { + fn len(&self) -> usize { + self.len() + } } From 39b0832594c16b9db1ca233eb84248dd4b6e2900 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 28 Mar 2024 13:54:31 +0000 Subject: [PATCH 2/2] less --- vortex-array/Cargo.toml | 4 ++++ vortex-array/src/compute/search_sorted.rs | 16 ++-------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index f1ed9d4167..3a967319e1 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -40,3 +40,7 @@ vortex-schema = { path = "../vortex-schema" } [dev-dependencies] criterion = { workspace = true } + +[[bench]] +name = "search_sorted" +harness = false \ No newline at end of file diff --git a/vortex-array/src/compute/search_sorted.rs b/vortex-array/src/compute/search_sorted.rs index 4f8ce32c9f..3022d93354 100644 --- a/vortex-array/src/compute/search_sorted.rs +++ b/vortex-array/src/compute/search_sorted.rs @@ -115,13 +115,6 @@ impl + Len + ?Sized, T> SearchSorted for S { } } -impl IndexOrd for T { - fn index_cmp(&self, idx: usize, elem: &Scalar) -> Option { - let scalar_a = scalar_at(self, idx).ok()?; - scalar_a.partial_cmp(elem) - } -} - impl IndexOrd for &dyn Array { fn index_cmp(&self, idx: usize, elem: &Scalar) -> Option { let scalar_a = scalar_at(*self, idx).ok()?; @@ -131,13 +124,8 @@ impl IndexOrd for &dyn Array { impl IndexOrd for [T] { fn index_cmp(&self, idx: usize, elem: &T) -> Option { - self[idx].partial_cmp(elem) - } -} - -impl Len for T { - fn len(&self) -> usize { - T::len(self) + // SAFETY: Used in search_sorted_by same as the standard library. The search_sorted ensures idx is in bounds + unsafe { self.get_unchecked(idx) }.partial_cmp(elem) } }