From b9e745f0d44a7cd6bc27ea6a207261f3ab8297da Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 11 May 2024 09:19:38 +0100 Subject: [PATCH 1/5] Remove buffer -> dtype dependency --- Cargo.lock | 1 - vortex-array/src/array/primitive/mod.rs | 27 +++++++++++++++---- vortex-array/src/array/varbin/compute/mod.rs | 2 +- vortex-array/src/lib.rs | 2 ++ vortex-buffer/Cargo.toml | 1 - vortex-buffer/src/lib.rs | 28 +++----------------- vortex-dict/src/compress.rs | 16 ++++------- vortex-fastlanes/src/bitpacking/compress.rs | 4 +-- vortex-ree/src/compress.rs | 2 +- vortex-zigzag/src/compress.rs | 14 +++------- 10 files changed, 40 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8d8f187969..ab7ec6131f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4959,7 +4959,6 @@ dependencies = [ "arrow-buffer", "bytes", "flexbuffers", - "vortex-dtype", ] [[package]] diff --git a/vortex-array/src/array/primitive/mod.rs b/vortex-array/src/array/primitive/mod.rs index bc895cf6f5..7d1f2759da 100644 --- a/vortex-array/src/array/primitive/mod.rs +++ b/vortex-array/src/array/primitive/mod.rs @@ -90,7 +90,27 @@ impl PrimitiveArray<'_> { T::PTYPE, self.ptype(), ); - self.buffer().typed_data::() + + let (prefix, offsets, suffix) = unsafe { self.buffer().as_ref().align_to::() }; + assert!(prefix.is_empty() && suffix.is_empty()); + offsets + } + + /// Convert the array into a mutable vec of the given type. + /// If possible, this will be zero-copy. + pub fn into_typed_data(self) -> Vec { + assert_eq!( + T::PTYPE, + self.ptype(), + "Attempted to get typed_data of type {} from array of type {}", + T::PTYPE, + self.ptype(), + ); + let bytes = self + .into_buffer() + .into_vec() + .unwrap_or_else(|b| Vec::from(b.as_ref())); + unsafe { std::mem::transmute::<_, Vec>(bytes) } } pub fn reinterpret_cast(&self, ptype: PType) -> Self { @@ -124,10 +144,7 @@ impl PrimitiveArray<'_> { let validity = self.validity().to_static(); - let mut own_values = self - .into_buffer() - .into_vec::() - .unwrap_or_else(|b| Vec::from(b.typed_data::())); + let mut own_values = self.into_typed_data(); // TODO(robert): Also patch validity for (idx, value) in positions.iter().zip_eq(values.iter()) { own_values[(*idx).as_()] = *value; diff --git a/vortex-array/src/array/varbin/compute/mod.rs b/vortex-array/src/array/varbin/compute/mod.rs index 9823652238..c24971a3a2 100644 --- a/vortex-array/src/array/varbin/compute/mod.rs +++ b/vortex-array/src/array/varbin/compute/mod.rs @@ -139,7 +139,7 @@ impl ScalarAtFn for VarBinArray<'_> { Ok(varbin_scalar( self.bytes_at(index)? // TODO(ngates): update to use buffer when we refactor scalars. - .into_vec::() + .into_vec() .unwrap_or_else(|b| b.as_ref().to_vec()), self.dtype(), )) diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 19933b628e..cf20228e70 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -1,3 +1,5 @@ +extern crate core; + pub mod accessor; pub mod array; pub mod arrow; diff --git a/vortex-buffer/Cargo.toml b/vortex-buffer/Cargo.toml index db1d71e79f..ce9f00ea9d 100644 --- a/vortex-buffer/Cargo.toml +++ b/vortex-buffer/Cargo.toml @@ -14,7 +14,6 @@ rust-version.workspace = true arrow-buffer = { workspace = true } bytes = { workspace = true } flexbuffers = { workspace = true, optional = true } -vortex-dtype = { path = "../vortex-dtype" } [lints] workspace = true diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs index 4c9f112d78..57fe4536fb 100644 --- a/vortex-buffer/src/lib.rs +++ b/vortex-buffer/src/lib.rs @@ -6,7 +6,6 @@ use std::ops::{Deref, Range}; use arrow_buffer::Buffer as ArrowBuffer; pub use string::*; -use vortex_dtype::{match_each_native_ptype, NativePType}; #[derive(Debug, Clone)] pub enum Buffer { @@ -42,31 +41,10 @@ impl Buffer { } } - pub fn typed_data(&self) -> &[T] { + pub fn into_vec(self) -> Result, Buffer> { match self { - Buffer::Arrow(buffer) => unsafe { - match_each_native_ptype!(T::PTYPE, |$T| { - std::mem::transmute(buffer.typed_data::<$T>()) - }) - }, - Buffer::Bytes(bytes) => { - // From ArrowBuffer::typed_data - let (prefix, offsets, suffix) = unsafe { bytes.align_to::() }; - assert!(prefix.is_empty() && suffix.is_empty()); - offsets - } - } - } - - pub fn into_vec(self) -> Result, Buffer> { - match self { - Buffer::Arrow(buffer) => match_each_native_ptype!(T::PTYPE, |$T| { - buffer - .into_vec() - .map(|vec| unsafe { std::mem::transmute::, Vec>(vec) }) - .map_err(Buffer::Arrow) - }), - // Cannot always convert bytes into a mutable vec + Buffer::Arrow(buffer) => buffer.into_vec::().map_err(Buffer::Arrow), + // Cannot convert bytes into a mutable vec Buffer::Bytes(_) => Err(self), } } diff --git a/vortex-dict/src/compress.rs b/vortex-dict/src/compress.rs index ae516537f2..76ef9e6baf 100644 --- a/vortex-dict/src/compress.rs +++ b/vortex-dict/src/compress.rs @@ -264,8 +264,8 @@ mod test { fn encode_primitive() { let arr = PrimitiveArray::from(vec![1, 1, 3, 3, 3]); let (codes, values) = dict_encode_typed_primitive::(&arr); - assert_eq!(codes.buffer().typed_data::(), &[0, 0, 1, 1, 1]); - assert_eq!(values.buffer().typed_data::(), &[1, 3]); + assert_eq!(codes.typed_data::(), &[0, 0, 1, 1, 1]); + assert_eq!(values.typed_data::(), &[1, 3]); } #[test] @@ -281,10 +281,7 @@ mod test { None, ]); let (codes, values) = dict_encode_typed_primitive::(&arr); - assert_eq!( - codes.buffer().typed_data::(), - &[1, 1, 0, 2, 2, 0, 2, 0] - ); + assert_eq!(codes.typed_data::(), &[1, 1, 0, 2, 2, 0, 2, 0]); assert_eq!( scalar_at(&values.to_array(), 0).unwrap(), Scalar::null(DType::Primitive(PType::I32, Nullable)) @@ -303,7 +300,7 @@ mod test { fn encode_varbin() { let arr = VarBinArray::from(vec!["hello", "world", "hello", "again", "world"]); let (codes, values) = dict_encode_varbin(&arr); - assert_eq!(codes.buffer().typed_data::(), &[0, 1, 0, 2, 1]); + assert_eq!(codes.typed_data::(), &[0, 1, 0, 2, 1]); values .with_iterator(|iter| { assert_eq!( @@ -331,10 +328,7 @@ mod test { .into_iter() .collect(); let (codes, values) = dict_encode_varbin(&arr); - assert_eq!( - codes.buffer().typed_data::(), - &[1, 0, 2, 1, 0, 3, 2, 0] - ); + assert_eq!(codes.typed_data::(), &[1, 0, 2, 1, 0, 3, 2, 0]); assert_eq!(str::from_utf8(&values.bytes_at(0).unwrap()).unwrap(), ""); values .with_iterator(|iter| { diff --git a/vortex-fastlanes/src/bitpacking/compress.rs b/vortex-fastlanes/src/bitpacking/compress.rs index 483aec0024..bc0b794174 100644 --- a/vortex-fastlanes/src/bitpacking/compress.rs +++ b/vortex-fastlanes/src/bitpacking/compress.rs @@ -120,7 +120,7 @@ pub(crate) fn bitpack(parray: &PrimitiveArray, bit_width: usize) -> VortexResult // We know the min is > 0, so it's safe to re-interpret signed integers as unsigned. // TODO(ngates): we should implement this using a vortex cast to centralize this hack. let bytes = match_integers_by_width!(parray.ptype(), |$P| { - bitpack_primitive(parray.buffer().typed_data::<$P>(), bit_width) + bitpack_primitive(parray.typed_data::<$P>(), bit_width) }); Ok(PrimitiveArray::from(bytes).into_array()) } @@ -163,7 +163,7 @@ fn bitpack_patches( match_each_integer_ptype!(parray.ptype(), |$T| { let mut indices: Vec = Vec::with_capacity(num_exceptions_hint); let mut values: Vec<$T> = Vec::with_capacity(num_exceptions_hint); - for (i, v) in parray.buffer().typed_data::<$T>().iter().enumerate() { + for (i, v) in parray.typed_data::<$T>().iter().enumerate() { if (v.leading_zeros() as usize) < parray.ptype().bit_width() - bit_width { indices.push(i as u64); values.push(*v); diff --git a/vortex-ree/src/compress.rs b/vortex-ree/src/compress.rs index 116d07a461..0f6395baf8 100644 --- a/vortex-ree/src/compress.rs +++ b/vortex-ree/src/compress.rs @@ -216,7 +216,7 @@ mod test { .unwrap(); assert_eq!( - decoded.buffer().typed_data::(), + decoded.typed_data::(), vec![1i32, 1, 2, 2, 2, 3, 3, 3, 3, 3].as_slice() ); assert_eq!( diff --git a/vortex-zigzag/src/compress.rs b/vortex-zigzag/src/compress.rs index bc5b86a4d6..51b4e77fd5 100644 --- a/vortex-zigzag/src/compress.rs +++ b/vortex-zigzag/src/compress.rs @@ -76,16 +76,10 @@ where #[allow(dead_code)] pub fn zigzag_decode<'a>(parray: &'a PrimitiveArray<'a>) -> PrimitiveArray<'a> { match parray.ptype() { - PType::U8 => zigzag_decode_primitive::(parray.buffer().typed_data(), parray.validity()), - PType::U16 => { - zigzag_decode_primitive::(parray.buffer().typed_data(), parray.validity()) - } - PType::U32 => { - zigzag_decode_primitive::(parray.buffer().typed_data(), parray.validity()) - } - PType::U64 => { - zigzag_decode_primitive::(parray.buffer().typed_data(), parray.validity()) - } + PType::U8 => zigzag_decode_primitive::(parray.typed_data(), parray.validity()), + PType::U16 => zigzag_decode_primitive::(parray.typed_data(), parray.validity()), + PType::U32 => zigzag_decode_primitive::(parray.typed_data(), parray.validity()), + PType::U64 => zigzag_decode_primitive::(parray.typed_data(), parray.validity()), _ => panic!("Unsupported ptype {}", parray.ptype()), } } From b9bacb56b40ab8085ca18d5741187693af32a870 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 11 May 2024 09:20:53 +0100 Subject: [PATCH 2/5] Remove buffer -> dtype dependency --- vortex-array/src/lib.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index cf20228e70..19933b628e 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -1,5 +1,3 @@ -extern crate core; - pub mod accessor; pub mod array; pub mod arrow; From 7c470597d2e18d55bb88e6c8f9f747bb6dcb585e Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 11 May 2024 09:37:31 +0100 Subject: [PATCH 3/5] Remove buffer -> dtype dependency --- .github/workflows/bench-pr.yml | 49 ------------------------- vortex-array/src/array/primitive/mod.rs | 12 +++++- 2 files changed, 11 insertions(+), 50 deletions(-) delete mode 100644 .github/workflows/bench-pr.yml diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml deleted file mode 100644 index a61fa9fc75..0000000000 --- a/.github/workflows/bench-pr.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: PR Benchmarks - -on: - pull_request: - types: [ labeled, synchronize ] - branches: [ "develop" ] - workflow_dispatch: { } - -permissions: - actions: write - contents: read - pull-requests: write - -jobs: - bench: - runs-on: ubuntu-latest-large - if: ${{ contains(github.event.head_commit.message, '[benchmark]') || github.event.label.name == 'benchmark' && github.event_name == 'pull_request' }} - steps: - # We remove the benchmark label first so that the workflow can be re-triggered. - - uses: actions-ecosystem/action-remove-labels@v1 - with: - labels: benchmark - - - uses: actions/checkout@v4 - with: - submodules: recursive - - - uses: ./.github/actions/cleanup - - uses: ./.github/actions/setup-zig - - uses: ./.github/actions/setup-rust - - uses: ./.github/actions/setup-python - - - name: Install Protoc - uses: arduino/setup-protoc@v3 - - - name: Bench - Vortex - run: cargo bench | tee bench.txt - - - name: Store benchmark result - uses: benchmark-action/github-action-benchmark@v1.19.3 - with: - name: Vortex Benchmarks - tool: cargo - github-token: ${{ secrets.GITHUB_TOKEN }} - output-file-path: bench.txt - summary-always: true - auto-push: true - fail-on-alert: false - diff --git a/vortex-array/src/array/primitive/mod.rs b/vortex-array/src/array/primitive/mod.rs index 7d1f2759da..1cd6366739 100644 --- a/vortex-array/src/array/primitive/mod.rs +++ b/vortex-array/src/array/primitive/mod.rs @@ -1,3 +1,5 @@ +use std::mem::size_of; + use arrow_buffer::{ArrowNativeType, ScalarBuffer}; use itertools::Itertools; use num_traits::AsPrimitive; @@ -110,7 +112,15 @@ impl PrimitiveArray<'_> { .into_buffer() .into_vec() .unwrap_or_else(|b| Vec::from(b.as_ref())); - unsafe { std::mem::transmute::<_, Vec>(bytes) } + + unsafe { + let mut bytes = std::mem::ManuallyDrop::new(bytes); + Vec::from_raw_parts( + bytes.as_mut_ptr() as *mut T, + bytes.len() / size_of::(), + bytes.capacity() / size_of::(), + ) + } } pub fn reinterpret_cast(&self, ptype: PType) -> Self { From 6d1b362900ccf839739ed10227c79fb41fcf8ade Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 11 May 2024 09:48:16 +0100 Subject: [PATCH 4/5] Fix transmute hack --- vortex-dtype/src/ptype.rs | 15 +++++++++++++++ vortex-fastlanes/src/bitpacking/compress.rs | 14 ++++++++------ vortex-fastlanes/src/bitpacking/compute/mod.rs | 8 ++++---- vortex-fastlanes/src/bitpacking/mod.rs | 16 ---------------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/vortex-dtype/src/ptype.rs b/vortex-dtype/src/ptype.rs index 94cc99c34d..edb3235f7b 100644 --- a/vortex-dtype/src/ptype.rs +++ b/vortex-dtype/src/ptype.rs @@ -109,6 +109,21 @@ macro_rules! match_each_integer_ptype { }) } +#[macro_export] +macro_rules! match_each_unsigned_integer_ptype { + ($self:expr, | $_:tt $enc:ident | $($body:tt)*) => ({ + macro_rules! __with__ {( $_ $enc:ident ) => ( $($body)* )} + use $crate::PType; + match $self { + PType::U8 => __with__! { u8 }, + PType::U16 => __with__! { u16 }, + PType::U32 => __with__! { u32 }, + PType::U64 => __with__! { u64 }, + _ => panic!("Unsupported ptype {}", $self), + } + }) +} + #[macro_export] macro_rules! match_each_float_ptype { ($self:expr, | $_:tt $enc:ident | $($body:tt)*) => ({ diff --git a/vortex-fastlanes/src/bitpacking/compress.rs b/vortex-fastlanes/src/bitpacking/compress.rs index bc0b794174..c560b303f2 100644 --- a/vortex-fastlanes/src/bitpacking/compress.rs +++ b/vortex-fastlanes/src/bitpacking/compress.rs @@ -8,11 +8,13 @@ use vortex::stats::ArrayStatistics; use vortex::validity::Validity; use vortex::{Array, ArrayDType, ArrayDef, ArrayTrait, IntoArray, OwnedArray, ToStatic}; use vortex_dtype::PType::U8; -use vortex_dtype::{match_each_integer_ptype, NativePType, PType}; +use vortex_dtype::{ + match_each_integer_ptype, match_each_unsigned_integer_ptype, NativePType, PType, +}; use vortex_error::{vortex_bail, vortex_err, VortexResult}; use vortex_scalar::Scalar; -use crate::{match_integers_by_width, BitPackedArray, BitPackedEncoding, OwnedBitPackedArray}; +use crate::{BitPackedArray, BitPackedEncoding, OwnedBitPackedArray}; impl EncodingCompression for BitPackedEncoding { fn cost(&self) -> u8 { @@ -118,8 +120,8 @@ pub(crate) fn bitpack_encode( pub(crate) fn bitpack(parray: &PrimitiveArray, bit_width: usize) -> VortexResult { // We know the min is > 0, so it's safe to re-interpret signed integers as unsigned. - // TODO(ngates): we should implement this using a vortex cast to centralize this hack. - let bytes = match_integers_by_width!(parray.ptype(), |$P| { + let parray = parray.reinterpret_cast(parray.ptype().to_unsigned()); + let bytes = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| { bitpack_primitive(parray.typed_data::<$P>(), bit_width) }); Ok(PrimitiveArray::from(bytes).into_array()) @@ -185,7 +187,7 @@ pub fn unpack<'a>(array: BitPackedArray) -> VortexResult> { let encoded = cast(&array.packed(), U8.into())?.flatten_primitive()?; let ptype: PType = array.dtype().try_into()?; - let mut unpacked = match_integers_by_width!(ptype, |$P| { + let mut unpacked = match_each_unsigned_integer_ptype!(ptype, |$P| { PrimitiveArray::from_vec( unpack_primitive::<$P>(encoded.typed_data::(), bit_width, offset, length), array.validity(), @@ -287,7 +289,7 @@ pub(crate) fn unpack_single(array: &BitPackedArray, index: usize) -> VortexResul let ptype: PType = array.dtype().try_into()?; let index_in_encoded = index + array.offset(); - let scalar: Scalar = match_integers_by_width!(ptype, |$P| { + let scalar: Scalar = match_each_unsigned_integer_ptype!(ptype, |$P| { unsafe { unpack_single_primitive::<$P>(encoded.typed_data::(), bit_width, index_in_encoded).map(|v| v.into()) } diff --git a/vortex-fastlanes/src/bitpacking/compute/mod.rs b/vortex-fastlanes/src/bitpacking/compute/mod.rs index 0960fe9e29..62379bcbe8 100644 --- a/vortex-fastlanes/src/bitpacking/compute/mod.rs +++ b/vortex-fastlanes/src/bitpacking/compute/mod.rs @@ -10,12 +10,12 @@ use vortex::compute::slice::{slice, SliceFn}; use vortex::compute::take::{take, TakeFn}; use vortex::compute::ArrayCompute; use vortex::{Array, ArrayDType, ArrayTrait, IntoArray, OwnedArray}; -use vortex_dtype::{match_each_integer_ptype, NativePType}; -use vortex_error::{vortex_bail, vortex_err, VortexResult}; +use vortex_dtype::{match_each_integer_ptype, match_each_unsigned_integer_ptype, NativePType}; +use vortex_error::{vortex_err, VortexResult}; use vortex_scalar::Scalar; use crate::bitpacking::compress::unpack_single; -use crate::{match_integers_by_width, unpack_single_primitive, BitPackedArray}; +use crate::{unpack_single_primitive, BitPackedArray}; mod slice; @@ -66,7 +66,7 @@ impl TakeFn for BitPackedArray<'_> { } let indices = indices.clone().flatten_primitive()?; - let taken = match_integers_by_width!(ptype, |$T| { + let taken = match_each_unsigned_integer_ptype!(ptype, |$T| { PrimitiveArray::from_vec(take_primitive::<$T>(self, &indices)?, taken_validity) }); Ok(taken.reinterpret_cast(ptype).into_array()) diff --git a/vortex-fastlanes/src/bitpacking/mod.rs b/vortex-fastlanes/src/bitpacking/mod.rs index 2b521870b2..29111e92cb 100644 --- a/vortex-fastlanes/src/bitpacking/mod.rs +++ b/vortex-fastlanes/src/bitpacking/mod.rs @@ -174,22 +174,6 @@ impl ArrayTrait for BitPackedArray<'_> { } } -#[macro_export] -macro_rules! match_integers_by_width { - ($self:expr, | $_:tt $enc:ident | $($body:tt)*) => ({ - macro_rules! __with__ {( $_ $enc:ident ) => ( $($body)* )} - use vortex_dtype::PType; - use vortex_error::vortex_bail; - match $self { - PType::I8 | PType::U8 => __with__! { u8 }, - PType::I16 | PType::U16 => __with__! { u16 }, - PType::I32 | PType::U32 => __with__! { u32 }, - PType::I64 | PType::U64 => __with__! { u64 }, - _ => vortex_bail!("Unsupported ptype {}", $self), - } - }) -} - #[cfg(test)] mod test { use vortex::array::primitive::PrimitiveArray; From c983b7e48ae2c3c9bf1bd187d4c55087ab9e6125 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 11 May 2024 10:54:56 +0100 Subject: [PATCH 5/5] Fix transmute hack --- vortex-ipc/src/reader.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-ipc/src/reader.rs b/vortex-ipc/src/reader.rs index f34e44fcab..6418254cef 100644 --- a/vortex-ipc/src/reader.rs +++ b/vortex-ipc/src/reader.rs @@ -563,10 +563,10 @@ mod tests { #[test] fn test_write_read_bitpacked() { // NB: the order is reversed here to ensure we aren't grabbing indexes instead of values - let uncompressed = PrimitiveArray::from((0i64..3_000).rev().collect_vec()); + let uncompressed = PrimitiveArray::from((0u64..3_000).rev().collect_vec()); let packed = BitPackedArray::encode(uncompressed.array(), 5).unwrap(); - let expected = &[2989i64, 2988, 2987, 2986]; + let expected = &[2989u64, 2988, 2987, 2986]; test_base_case(&packed.into_array(), expected, PrimitiveEncoding.id()); }