From 5ccf028adfd9acee3d62e75df397aa54b204789a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 4 Dec 2024 20:24:34 -0500 Subject: [PATCH] fix: use search_sorted_usize when searching for indices (#1566) Originally found in a CI failure: https://github.com/spiraldb/vortex/actions/runs/12170181872/job/33944625544?pr=1528 Brief description: Let's imagine you have the following SparseArray ``` SparseArray(len=1000, indices=[2], values=[2]) ``` If you `slice(array, 2, 1000)` then you end up where the end of the slice exceeds the `ptype` of the indices array (after the changes for `downcast_integer_array` went in earlier) and this would cause a failure. The fix is to use search_sorted_usize so that rather than casting the scalar to the ptype of the array, we cast the array values up to `usize` for comparison Added a minimal test and verified it fails before the change: ![image](https://github.com/user-attachments/assets/0131b0e1-9f07-4e88-8774-1d4fbf111c1a) --- encodings/runend-bool/src/array.rs | 4 ++-- encodings/runend/src/array.rs | 6 ++++-- vortex-array/src/array/chunked/compute/take.rs | 7 ++++--- vortex-array/src/array/chunked/mod.rs | 11 ++++++----- vortex-array/src/array/sparse/compute/slice.rs | 18 ++++++++++++++++++ vortex-array/src/array/sparse/mod.rs | 4 ++-- vortex-array/src/stream/take_rows.rs | 9 ++++++--- 7 files changed, 42 insertions(+), 17 deletions(-) diff --git a/encodings/runend-bool/src/array.rs b/encodings/runend-bool/src/array.rs index feb2b77645..d9303dd483 100644 --- a/encodings/runend-bool/src/array.rs +++ b/encodings/runend-bool/src/array.rs @@ -2,7 +2,7 @@ use std::fmt::{Debug, Display}; use serde::{Deserialize, Serialize}; use vortex_array::array::{BoolArray, PrimitiveArray}; -use vortex_array::compute::{scalar_at, search_sorted, SearchSortedSide}; +use vortex_array::compute::{scalar_at, search_sorted_usize, SearchSortedSide}; use vortex_array::encoding::ids; use vortex_array::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet}; use vortex_array::validity::{LogicalValidity, Validity, ValidityMetadata, ValidityVTable}; @@ -112,7 +112,7 @@ impl RunEndBoolArray { } pub(crate) fn find_physical_index(&self, index: usize) -> VortexResult { - search_sorted(&self.ends(), index + self.offset(), SearchSortedSide::Right) + search_sorted_usize(&self.ends(), index + self.offset(), SearchSortedSide::Right) .map(|s| s.to_ends_index(self.ends().len())) } diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index f8efb87b12..f270245352 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -2,7 +2,9 @@ use std::fmt::{Debug, Display}; use serde::{Deserialize, Serialize}; use vortex_array::array::PrimitiveArray; -use vortex_array::compute::{scalar_at, search_sorted, search_sorted_usize_many, SearchSortedSide}; +use vortex_array::compute::{ + scalar_at, search_sorted_usize, search_sorted_usize_many, SearchSortedSide, +}; use vortex_array::encoding::ids; use vortex_array::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet}; use vortex_array::validity::{ @@ -115,7 +117,7 @@ impl RunEndArray { /// Convert the given logical index to an index into the `values` array pub fn find_physical_index(&self, index: usize) -> VortexResult { - search_sorted(&self.ends(), index + self.offset(), SearchSortedSide::Right) + search_sorted_usize(&self.ends(), index + self.offset(), SearchSortedSide::Right) .map(|s| s.to_ends_index(self.ends().len())) } diff --git a/vortex-array/src/array/chunked/compute/take.rs b/vortex-array/src/array/chunked/compute/take.rs index 2fa36196e5..72c19996f7 100644 --- a/vortex-array/src/array/chunked/compute/take.rs +++ b/vortex-array/src/array/chunked/compute/take.rs @@ -6,8 +6,8 @@ use vortex_scalar::Scalar; use crate::array::chunked::ChunkedArray; use crate::array::ChunkedEncoding; use crate::compute::{ - scalar_at, search_sorted, slice, subtract_scalar, take, try_cast, SearchSortedSide, TakeFn, - TakeOptions, + scalar_at, search_sorted_usize, slice, subtract_scalar, take, try_cast, SearchSortedSide, + TakeFn, TakeOptions, }; use crate::stats::ArrayStatistics; use crate::{ArrayDType, ArrayData, ArrayLen, IntoArrayData, IntoArrayVariant, ToArrayData}; @@ -90,7 +90,8 @@ fn take_strict_sorted( // Find the end of this chunk, and locate that position in the indices array. let chunk_begin = usize::try_from(&scalar_at(chunked.chunk_offsets(), chunk_idx)?)?; let chunk_end = usize::try_from(&scalar_at(chunked.chunk_offsets(), chunk_idx + 1)?)?; - let chunk_end_pos = search_sorted(indices, chunk_end, SearchSortedSide::Left)?.to_index(); + let chunk_end_pos = + search_sorted_usize(indices, chunk_end, SearchSortedSide::Left)?.to_index(); // Now we can say the slice of indices belonging to this chunk is [pos, chunk_end_pos) let chunk_indices = slice(indices, pos, chunk_end_pos)?; diff --git a/vortex-array/src/array/chunked/mod.rs b/vortex-array/src/array/chunked/mod.rs index b5c157376f..9614d80917 100644 --- a/vortex-array/src/array/chunked/mod.rs +++ b/vortex-array/src/array/chunked/mod.rs @@ -13,7 +13,7 @@ use vortex_scalar::Scalar; use crate::array::primitive::PrimitiveArray; use crate::compute::{ - scalar_at, search_sorted, subtract_scalar, SearchSortedSide, SubtractScalarFn, + scalar_at, search_sorted_usize, subtract_scalar, SearchSortedSide, SubtractScalarFn, }; use crate::encoding::ids; use crate::iter::{ArrayIterator, ArrayIteratorAdapter}; @@ -120,10 +120,11 @@ impl ChunkedArray { // Since there might be duplicate values in offsets because of empty chunks we want to search from right // and take the last chunk (we subtract 1 since there's a leading 0) - let index_chunk = search_sorted(&self.chunk_offsets(), index, SearchSortedSide::Right) - .vortex_expect("Search sorted failed in find_chunk_idx") - .to_ends_index(self.nchunks() + 1) - .saturating_sub(1); + let index_chunk = + search_sorted_usize(&self.chunk_offsets(), index, SearchSortedSide::Right) + .vortex_expect("Search sorted failed in find_chunk_idx") + .to_ends_index(self.nchunks() + 1) + .saturating_sub(1); let chunk_start = scalar_at(self.chunk_offsets(), index_chunk) .and_then(|s| usize::try_from(&s)) .vortex_expect("Failed to find chunk start in find_chunk_idx"); diff --git a/vortex-array/src/array/sparse/compute/slice.rs b/vortex-array/src/array/sparse/compute/slice.rs index 5ca75884b5..1ba6dc5e67 100644 --- a/vortex-array/src/array/sparse/compute/slice.rs +++ b/vortex-array/src/array/sparse/compute/slice.rs @@ -75,4 +75,22 @@ mod tests { assert_eq!(primitive_doubly_sliced.maybe_null_slice::(), &[13531]); } + + #[test] + fn slice_partially_invalid() { + let values = vec![0u64].into_array(); + let indices = vec![0u8].into_array(); + + let sparse = SparseArray::try_new(indices, values, 1000, 999u64.into()).unwrap(); + let sliced = slice(&sparse, 0, 1000).unwrap(); + let mut expected = vec![999u64; 1000]; + expected[0] = 0; + + let actual = sliced + .into_primitive() + .unwrap() + .maybe_null_slice::() + .to_vec(); + assert_eq!(expected, actual); + } } diff --git a/vortex-array/src/array/sparse/mod.rs b/vortex-array/src/array/sparse/mod.rs index ba7d41e454..39a5092ff4 100644 --- a/vortex-array/src/array/sparse/mod.rs +++ b/vortex-array/src/array/sparse/mod.rs @@ -7,7 +7,7 @@ use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult}; use vortex_scalar::{Scalar, ScalarValue}; use crate::array::constant::ConstantArray; -use crate::compute::{scalar_at, search_sorted, SearchResult, SearchSortedSide}; +use crate::compute::{scalar_at, search_sorted_usize, SearchResult, SearchSortedSide}; use crate::encoding::ids; use crate::stats::{ArrayStatistics, Stat, StatisticsVTable, StatsSet}; use crate::validity::{ArrayValidity, LogicalValidity, ValidityVTable}; @@ -124,7 +124,7 @@ impl SparseArray { /// Returns the position or the insertion point of a given index in the indices array. fn search_index(&self, index: usize) -> VortexResult { - search_sorted( + search_sorted_usize( &self.indices(), self.indices_offset() + index, SearchSortedSide::Left, diff --git a/vortex-array/src/stream/take_rows.rs b/vortex-array/src/stream/take_rows.rs index babccaf23b..f252a252de 100644 --- a/vortex-array/src/stream/take_rows.rs +++ b/vortex-array/src/stream/take_rows.rs @@ -7,7 +7,9 @@ use vortex_dtype::match_each_integer_ptype; use vortex_error::{vortex_bail, VortexResult}; use vortex_scalar::Scalar; -use crate::compute::{search_sorted, slice, subtract_scalar, take, SearchSortedSide, TakeOptions}; +use crate::compute::{ + search_sorted_usize, slice, subtract_scalar, take, SearchSortedSide, TakeOptions, +}; use crate::stats::{ArrayStatistics, Stat}; use crate::stream::ArrayStream; use crate::variants::PrimitiveArrayTrait; @@ -72,8 +74,9 @@ impl Stream for TakeRows { while let Some(batch) = ready!(this.reader.as_mut().poll_next(cx)?) { let curr_offset = *this.row_offset; - let left = search_sorted(this.indices, curr_offset, SearchSortedSide::Left)?.to_index(); - let right = search_sorted( + let left = + search_sorted_usize(this.indices, curr_offset, SearchSortedSide::Left)?.to_index(); + let right = search_sorted_usize( this.indices, curr_offset + batch.len(), SearchSortedSide::Left,