From 025e5c7ca5a04caef2abb3da371d4e9757e86f4e Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 6 Apr 2024 08:16:26 +0100 Subject: [PATCH] Array Data + View (#210) Prototype of refactoring into ArrayData + ArrayView. --- Cargo.lock | 14 ++ Cargo.toml | 1 + vortex-array2/Cargo.toml | 24 +++ vortex-array2/src/compute.rs | 55 +++++++ vortex-array2/src/context.rs | 39 +++++ vortex-array2/src/data.rs | 148 +++++++++++++++++++ vortex-array2/src/encoding.rs | 69 +++++++++ vortex-array2/src/implementation.rs | 113 ++++++++++++++ vortex-array2/src/lib.rs | 93 ++++++++++++ vortex-array2/src/metadata.rs | 14 ++ vortex-array2/src/primitive/compute.rs | 25 ++++ vortex-array2/src/primitive/mod.rs | 138 +++++++++++++++++ vortex-array2/src/ree/compute.rs | 17 +++ vortex-array2/src/ree/mod.rs | 114 ++++++++++++++ vortex-array2/src/validity/mod.rs | 123 +++++++++++++++ vortex-array2/src/view.rs | 197 +++++++++++++++++++++++++ vortex-error/src/lib.rs | 6 + 17 files changed, 1190 insertions(+) create mode 100644 vortex-array2/Cargo.toml create mode 100644 vortex-array2/src/compute.rs create mode 100644 vortex-array2/src/context.rs create mode 100644 vortex-array2/src/data.rs create mode 100644 vortex-array2/src/encoding.rs create mode 100644 vortex-array2/src/implementation.rs create mode 100644 vortex-array2/src/lib.rs create mode 100644 vortex-array2/src/metadata.rs create mode 100644 vortex-array2/src/primitive/compute.rs create mode 100644 vortex-array2/src/primitive/mod.rs create mode 100644 vortex-array2/src/ree/compute.rs create mode 100644 vortex-array2/src/ree/mod.rs create mode 100644 vortex-array2/src/validity/mod.rs create mode 100644 vortex-array2/src/view.rs diff --git a/Cargo.lock b/Cargo.lock index 4d34423791..adffcc315e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5418,6 +5418,20 @@ dependencies = [ "walkdir", ] +[[package]] +name = "vortex-array2" +version = "0.1.0" +dependencies = [ + "arrow-buffer 51.0.0", + "flatbuffers", + "half", + "paste", + "vortex-array", + "vortex-error", + "vortex-flatbuffers", + "vortex-schema", +] + [[package]] name = "vortex-datetime" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 2d1f3f03f1..99e5ed64ff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "vortex-alloc", "vortex-alp", "vortex-array", + "vortex-array2", "vortex-datetime", "vortex-dict", "vortex-error", diff --git a/vortex-array2/Cargo.toml b/vortex-array2/Cargo.toml new file mode 100644 index 0000000000..28c3fdcb49 --- /dev/null +++ b/vortex-array2/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "vortex-array2" +version.workspace = true +homepage.workspace = true +repository.workspace = true +authors.workspace = true +license.workspace = true +keywords.workspace = true +include.workspace = true +edition.workspace = true +rust-version.workspace = true + +[dependencies] +arrow-buffer = { workspace = true } +flatbuffers = { workspace = true } +half = { workspace = true } +paste = { workspace = true } +vortex-array = { path = "../vortex-array" } +vortex-error = { path = "../vortex-error" } +vortex-flatbuffers = { path = "../vortex-flatbuffers" } +vortex-schema = { path = "../vortex-schema" } + +[lints] +workspace = true diff --git a/vortex-array2/src/compute.rs b/vortex-array2/src/compute.rs new file mode 100644 index 0000000000..79f4f539f2 --- /dev/null +++ b/vortex-array2/src/compute.rs @@ -0,0 +1,55 @@ +use vortex::scalar::Scalar; +use vortex_error::{vortex_err, VortexResult}; + +use crate::primitive::PrimitiveData; +use crate::{Array, WithArray}; + +pub trait ArrayCompute { + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { + None + } + fn flatten(&self) -> Option<&dyn FlattenFn> { + None + } +} + +pub trait ScalarAtFn { + fn scalar_at(&self, index: usize) -> VortexResult; +} + +pub fn scalar_at(array: &Array, index: usize) -> VortexResult { + array.with_array(|a| { + a.scalar_at() + .ok_or_else(|| vortex_err!("Not implemented: scalar_at"))? + .scalar_at(index) + }) +} + +pub trait FlattenFn { + fn flatten(&self) -> VortexResult; +} + +pub enum FlattenedArray { + Primitive(PrimitiveData), + // Just to introduce a second variant for now + Other(String), +} + +pub fn flatten(array: &Array) -> VortexResult { + array.with_array(|a| { + a.flatten() + .ok_or_else(|| vortex_err!("Not implemented: flatten"))? + .flatten() + }) +} + +pub fn flatten_primitive(array: &Array) -> VortexResult { + if let FlattenedArray::Primitive(p) = flatten(array)? { + Ok(p) + } else { + Err(vortex_err!( + "Cannot flatten array {:?} into primitive", + array + )) + } +} diff --git a/vortex-array2/src/context.rs b/vortex-array2/src/context.rs new file mode 100644 index 0000000000..ea76216480 --- /dev/null +++ b/vortex-array2/src/context.rs @@ -0,0 +1,39 @@ +use std::sync::Arc; + +use vortex::encoding::EncodingId; + +use crate::encoding::EncodingRef; + +#[derive(Debug)] +pub struct SerdeContext { + encodings: Arc<[EncodingRef]>, +} + +impl SerdeContext { + pub fn new(encodings: Arc<[EncodingRef]>) -> Self { + Self { encodings } + } + + pub fn encodings(&self) -> &[EncodingRef] { + self.encodings.as_ref() + } + + pub fn find_encoding(&self, encoding_id: u16) -> Option { + self.encodings.get(encoding_id as usize).cloned() + } + + pub fn encoding_idx(&self, encoding_id: EncodingId) -> Option { + self.encodings + .iter() + .position(|e| e.id() == encoding_id) + .map(|i| i as u16) + } +} + +impl Default for SerdeContext { + fn default() -> Self { + Self { + encodings: vec![].into(), // ENCODINGS.iter().cloned().collect_vec().into(), + } + } +} diff --git a/vortex-array2/src/data.rs b/vortex-array2/src/data.rs new file mode 100644 index 0000000000..4772a5f36c --- /dev/null +++ b/vortex-array2/src/data.rs @@ -0,0 +1,148 @@ +use std::marker::PhantomData; +use std::sync::Arc; + +use arrow_buffer::Buffer; +use vortex_error::{vortex_bail, VortexError, VortexResult}; +use vortex_schema::DType; + +use crate::encoding::EncodingRef; +use crate::{Array, ArrayDef, ArrayMetadata, IntoArray, ToArray}; + +#[allow(dead_code)] +#[derive(Clone, Debug)] +pub struct ArrayData { + encoding: EncodingRef, + dtype: DType, + metadata: Arc, + buffers: Arc<[Buffer]>, + children: Arc<[ArrayData]>, +} + +impl ArrayData { + pub fn try_new( + encoding: EncodingRef, + dtype: DType, + metadata: Arc, + buffers: Arc<[Buffer]>, + children: Arc<[ArrayData]>, + ) -> VortexResult { + let data = Self { + encoding, + dtype, + metadata, + buffers, + children, + }; + + // Validate here that the metadata correctly parses, so that an encoding can infallibly + // implement Encoding::with_data(). + encoding.with_data_mut(&data, &mut |_| Ok(()))?; + + Ok(data) + } +} + +impl ArrayData { + pub fn encoding(&self) -> EncodingRef { + self.encoding + } + + pub fn dtype(&self) -> &DType { + &self.dtype + } + + pub fn metadata(&self) -> &Arc { + &self.metadata + } + + pub fn buffers(&self) -> &[Buffer] { + &self.buffers + } + + pub fn children(&self) -> &[ArrayData] { + &self.children + } +} + +impl ToArray for ArrayData { + fn to_array(&self) -> Array { + Array::DataRef(self) + } +} + +impl IntoArray<'static> for ArrayData { + fn into_array(self) -> Array<'static> { + Array::Data(self) + } +} + +pub struct TypedArrayData { + data: ArrayData, + phantom: PhantomData, +} + +impl TypedArrayData +where + Self: for<'a> AsRef>, +{ + pub fn new_unchecked(data: ArrayData) -> Self { + Self { + data, + phantom: PhantomData, + } + } + + pub fn data(&self) -> &ArrayData { + &self.data + } + + pub fn into_data(self) -> ArrayData { + self.data + } + + pub fn metadata(&self) -> &D::Metadata { + self.data + .metadata() + .as_any() + .downcast_ref::() + .unwrap() + } + + pub fn into_metadata(self) -> Arc { + self.data + .metadata + .as_any_arc() + .downcast::() + .unwrap() + } + + pub fn as_array(&self) -> &D::Array<'_> { + self.as_ref() + } +} + +impl ToArray for TypedArrayData { + fn to_array(&self) -> Array { + Array::DataRef(&self.data) + } +} + +impl IntoArray<'static> for TypedArrayData { + fn into_array(self) -> Array<'static> { + Array::Data(self.data) + } +} + +impl TryFrom for TypedArrayData { + type Error = VortexError; + + fn try_from(data: ArrayData) -> Result { + if data.encoding().id() != D::ID { + vortex_bail!("Invalid encoding for array") + } + Ok(Self { + data, + phantom: PhantomData, + }) + } +} diff --git a/vortex-array2/src/encoding.rs b/vortex-array2/src/encoding.rs new file mode 100644 index 0000000000..57e4ced4f1 --- /dev/null +++ b/vortex-array2/src/encoding.rs @@ -0,0 +1,69 @@ +use std::fmt::{Debug, Formatter}; + +pub use vortex::encoding::EncodingId; +use vortex_error::VortexResult; + +use crate::ArrayView; +use crate::{ArrayData, ArrayTrait}; + +pub type EncodingRef = &'static dyn ArrayEncoding; + +/// Dynamic trait representing an array type. +#[allow(dead_code)] +pub trait ArrayEncoding { + fn id(&self) -> EncodingId; + + fn with_view_mut<'v>( + &self, + view: &'v ArrayView<'v>, + f: &mut dyn FnMut(&dyn ArrayTrait) -> VortexResult<()>, + ) -> VortexResult<()>; + + fn with_data_mut( + &self, + data: &ArrayData, + f: &mut dyn FnMut(&dyn ArrayTrait) -> VortexResult<()>, + ) -> VortexResult<()>; +} + +impl Debug for dyn ArrayEncoding + '_ { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Debug::fmt(&self.id(), f) + } +} + +impl dyn ArrayEncoding { + pub(crate) fn with_view<'v, R, F: Fn(&dyn ArrayTrait) -> R>( + &self, + view: &'v ArrayView<'v>, + f: F, + ) -> R { + let mut result = None; + + // Unwrap the result. This is safe since we validate that encoding against the + // ArrayData during ArrayData::try_new. + self.with_view_mut(view, &mut |array| { + result = Some(f(array)); + Ok(()) + }) + .unwrap(); + + // Now we unwrap the optional, which we know to be populated in the closure. + result.unwrap() + } + + pub(crate) fn with_data R>(&self, data: &ArrayData, f: F) -> R { + let mut result = None; + + // Unwrap the result. This is safe since we validate that encoding against the + // ArrayData during ArrayData::try_new. + self.with_data_mut(data, &mut |array| { + result = Some(f(array)); + Ok(()) + }) + .unwrap(); + + // Now we unwrap the optional, which we know to be populated in the closure. + result.unwrap() + } +} diff --git a/vortex-array2/src/implementation.rs b/vortex-array2/src/implementation.rs new file mode 100644 index 0000000000..c2385dd30b --- /dev/null +++ b/vortex-array2/src/implementation.rs @@ -0,0 +1,113 @@ +use vortex_error::VortexResult; + +use crate::encoding::ArrayEncoding; +use crate::encoding::EncodingId; +use crate::ArrayData; +use crate::ArrayMetadata; +use crate::ArrayView; + +/// Trait the defines the set of types relating to an array. +/// Because it has associated types it can't be used as a trait object. +pub trait ArrayDef { + const ID: EncodingId; + type Array<'a>: ?Sized + 'a; + type Metadata: ArrayMetadata; + type Encoding: ArrayEncoding; +} + +pub trait TryFromArrayMetadata: Sized { + fn try_from_metadata(metadata: Option<&[u8]>) -> VortexResult; +} + +pub trait TryFromArrayData: Sized { + fn try_from_data(data: &ArrayData) -> VortexResult; +} + +pub trait TryFromArrayView<'v>: Sized + 'v { + fn try_from_view(view: &'v ArrayView<'v>) -> VortexResult; +} + +#[macro_export] +macro_rules! impl_encoding { + ($id:literal, $Name:ident) => { + use paste::paste; + + paste! { + use $crate::{ArrayDef, TryFromArrayData, TryFromArrayView, ArrayTrait}; + use $crate::encoding::{ArrayEncoding, EncodingId}; + use std::any::Any; + use std::sync::Arc; + use std::marker::{Send, Sync}; + + /// The array definition trait + pub struct [<$Name Def>]; + impl ArrayDef for [<$Name Def>] { + const ID: EncodingId = EncodingId::new($id); + type Array<'a> = dyn [<$Name Array>] + 'a; + type Metadata = [<$Name Metadata>]; + type Encoding = [<$Name Encoding>]; + } + + pub type [<$Name Data>] = TypedArrayData<[<$Name Def>]>; + pub type [<$Name View>]<'v> = TypedArrayView<'v, [<$Name Def>]>; + + /// The array encoding + pub struct [<$Name Encoding>]; + impl ArrayEncoding for [<$Name Encoding>] { + fn id(&self) -> EncodingId { + [<$Name Def>]::ID + } + + fn with_view_mut<'v>( + &self, + view: &'v ArrayView<'v>, + f: &mut dyn FnMut(&dyn ArrayTrait) -> VortexResult<()>, + ) -> VortexResult<()> { + // Convert ArrayView -> PrimitiveArray, then call compute. + let typed_view = <[<$Name View>] as TryFromArrayView>::try_from_view(view)?; + f(&typed_view.as_array()) + } + + fn with_data_mut( + &self, + data: &ArrayData, + f: &mut dyn FnMut(&dyn ArrayTrait) -> VortexResult<()>, + ) -> VortexResult<()> { + let data = <[<$Name Data>] as TryFromArrayData>::try_from_data(data)?; + f(&data.as_array()) + } + } + + /// Implement ArrayMetadata + impl ArrayMetadata for [<$Name Metadata>] { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_arc(self: Arc) -> Arc { + self + } + + fn to_arc(&self) -> Arc { + Arc::new(self.clone()) + } + + fn into_arc(self) -> Arc { + Arc::new(self) + } + } + + /// Implement AsRef for both the data and view types + impl<'a> AsRef] + 'a> for [<$Name Data>] { + fn as_ref(&self) -> &(dyn [<$Name Array>] + 'a) { + self + } + } + impl<'a> AsRef] + 'a> for [<$Name View>]<'a> { + fn as_ref(&self) -> &(dyn [<$Name Array>] + 'a) { + self + } + } + } + }; +} diff --git a/vortex-array2/src/lib.rs b/vortex-array2/src/lib.rs new file mode 100644 index 0000000000..d3d9c7c993 --- /dev/null +++ b/vortex-array2/src/lib.rs @@ -0,0 +1,93 @@ +#![allow(dead_code)] + +pub mod compute; +mod context; +mod data; +pub mod encoding; +mod implementation; +mod metadata; +mod primitive; +mod ree; +mod validity; +mod view; + +use std::fmt::Debug; + +pub use context::*; +pub use data::*; +pub use implementation::*; +pub use metadata::*; +pub use validity::*; +pub use view::*; + +use crate::compute::ArrayCompute; + +#[derive(Debug, Clone)] +pub enum Array<'v> { + Data(ArrayData), + DataRef(&'v ArrayData), + View(ArrayView<'v>), +} + +pub trait ToArray { + fn to_array(&self) -> Array; +} + +pub trait IntoArray<'a> { + fn into_array(self) -> Array<'a>; +} + +pub trait ToArrayData { + fn to_array_data(&self) -> ArrayData; +} + +pub trait WithArray { + fn with_array R>(&self, f: F) -> R; +} + +/// Collects together the behaviour of an array. +pub trait ArrayTrait: ArrayCompute + ArrayValidity + ToArrayData { + fn len(&self) -> usize; + + fn is_empty(&self) -> bool { + // TODO(ngates): remove this default impl to encourage explicit implementation + self.len() == 0 + } +} + +impl ToArrayData for Array<'_> { + fn to_array_data(&self) -> ArrayData { + match self { + Array::Data(d) => d.encoding().with_data(d, |a| a.to_array_data()), + Array::DataRef(d) => d.encoding().with_data(d, |a| a.to_array_data()), + Array::View(v) => v.encoding().with_view(v, |a| a.to_array_data()), + } + } +} + +impl WithArray for Array<'_> { + fn with_array R>(&self, f: F) -> R { + match self { + Array::Data(d) => d.encoding().with_data(d, f), + Array::DataRef(d) => d.encoding().with_data(d, f), + Array::View(v) => v.encoding().with_view(v, f), + } + } +} + +#[cfg(test)] +mod test { + use vortex_error::VortexResult; + + use crate::compute::*; + use crate::primitive::PrimitiveData; + use crate::ToArray; + + #[test] + fn test_primitive() -> VortexResult<()> { + let array = PrimitiveData::from_vec(vec![1i32, 2, 3, 4, 5]); + let scalar: i32 = scalar_at(&array.to_array(), 3)?.try_into()?; + assert_eq!(scalar, 4); + Ok(()) + } +} diff --git a/vortex-array2/src/metadata.rs b/vortex-array2/src/metadata.rs new file mode 100644 index 0000000000..3c5f286b50 --- /dev/null +++ b/vortex-array2/src/metadata.rs @@ -0,0 +1,14 @@ +use std::any::Any; +use std::fmt::Debug; +use std::sync::Arc; + +/// Dynamic trait used to represent opaque owned Array metadata +/// Note that this allows us to restrict the ('static + Send + Sync) requirement to just the +/// metadata trait, and not the entire array trait. +#[allow(dead_code)] +pub trait ArrayMetadata: 'static + Send + Sync + Debug { + fn as_any(&self) -> &dyn Any; + fn as_any_arc(self: Arc) -> Arc; + fn to_arc(&self) -> Arc; + fn into_arc(self) -> Arc; +} diff --git a/vortex-array2/src/primitive/compute.rs b/vortex-array2/src/primitive/compute.rs new file mode 100644 index 0000000000..9344f150aa --- /dev/null +++ b/vortex-array2/src/primitive/compute.rs @@ -0,0 +1,25 @@ +use vortex::match_each_native_ptype; +use vortex::scalar::Scalar; +use vortex_error::VortexResult; + +use crate::compute::{ArrayCompute, ScalarAtFn}; +use crate::primitive::PrimitiveArray; +use crate::ArrayValidity; + +impl ArrayCompute for &dyn PrimitiveArray { + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { + Some(self) + } +} + +impl ScalarAtFn for &dyn PrimitiveArray { + fn scalar_at(&self, index: usize) -> VortexResult { + if self.is_valid(index) { + match_each_native_ptype!(self.ptype(), |$T| { + Scalar::from(self.buffer().typed_data::<$T>()[index]).cast(self.dtype()) + }) + } else { + Ok(Scalar::null(self.dtype())) + } + } +} diff --git a/vortex-array2/src/primitive/mod.rs b/vortex-array2/src/primitive/mod.rs new file mode 100644 index 0000000000..959966c90a --- /dev/null +++ b/vortex-array2/src/primitive/mod.rs @@ -0,0 +1,138 @@ +mod compute; + +use arrow_buffer::Buffer; +use vortex::ptype::{NativePType, PType}; +use vortex_error::VortexResult; +use vortex_schema::{DType, Nullability}; + +use crate::compute::scalar_at; +use crate::impl_encoding; +use crate::{Array, ArrayValidity, IntoArray, Validity}; +use crate::{ArrayData, TypedArrayData}; +use crate::{ArrayMetadata, TryFromArrayMetadata}; +use crate::{ArrayView, ToArrayData}; +use crate::{ToArray, TypedArrayView}; + +impl_encoding!("vortex.primitive", Primitive); + +#[derive(Clone, Debug)] +pub struct PrimitiveMetadata(PType); +impl PrimitiveMetadata { + pub fn ptype(&self) -> PType { + self.0 + } +} + +pub trait PrimitiveArray { + fn dtype(&self) -> &DType; + fn ptype(&self) -> PType; + fn buffer(&self) -> &Buffer; + fn validity(&self) -> Option; +} + +impl PrimitiveData { + pub fn from_vec(values: Vec) -> Self { + ArrayData::try_new( + &PrimitiveEncoding, + DType::from(T::PTYPE), + Arc::new(PrimitiveMetadata(T::PTYPE)), + vec![Buffer::from_vec(values)].into(), + vec![].into(), + ) + .unwrap() + .try_into() + .unwrap() + } +} + +impl PrimitiveArray for PrimitiveData { + fn dtype(&self) -> &DType { + self.data().dtype() + } + + fn ptype(&self) -> PType { + self.metadata().ptype() + } + + fn buffer(&self) -> &Buffer { + self.data().buffers().first().unwrap() + } + + fn validity(&self) -> Option { + match self.dtype().nullability() { + Nullability::NonNullable => None, + Nullability::Nullable => Some(self.data().children().first().unwrap().to_array()), + } + } +} + +impl PrimitiveArray for PrimitiveView<'_> { + fn dtype(&self) -> &DType { + self.view().dtype() + } + + fn ptype(&self) -> PType { + self.metadata().ptype() + } + + fn buffer(&self) -> &Buffer { + self.view() + .buffers() + .first() + .expect("PrimitiveView must have a single buffer") + } + + fn validity(&self) -> Option { + match self.dtype().nullability() { + Nullability::NonNullable => None, + Nullability::Nullable => { + Some(self.view().child(0, &Validity::DTYPE).unwrap().into_array()) + } + } + } +} + +impl TryFromArrayMetadata for PrimitiveMetadata { + fn try_from_metadata(_metadata: Option<&[u8]>) -> VortexResult { + todo!() + } +} + +impl<'v> TryFromArrayView<'v> for PrimitiveView<'v> { + fn try_from_view(view: &'v ArrayView<'v>) -> VortexResult { + // TODO(ngates): validate the view. + Ok(PrimitiveView::new_unchecked( + view.clone(), + PrimitiveMetadata::try_from_metadata(view.metadata())?, + )) + } +} + +impl TryFromArrayData for PrimitiveData { + fn try_from_data(data: &ArrayData) -> VortexResult { + // TODO(ngates): validate the array data. + Ok(Self::new_unchecked(data.clone())) + } +} + +impl ArrayTrait for &dyn PrimitiveArray { + fn len(&self) -> usize { + self.buffer().len() / self.ptype().byte_width() + } +} + +impl ArrayValidity for &dyn PrimitiveArray { + fn is_valid(&self, index: usize) -> bool { + if let Some(v) = self.validity() { + scalar_at(&v, index).unwrap().try_into().unwrap() + } else { + true + } + } +} + +impl ToArrayData for &dyn PrimitiveArray { + fn to_array_data(&self) -> ArrayData { + todo!() + } +} diff --git a/vortex-array2/src/ree/compute.rs b/vortex-array2/src/ree/compute.rs new file mode 100644 index 0000000000..8301db7c15 --- /dev/null +++ b/vortex-array2/src/ree/compute.rs @@ -0,0 +1,17 @@ +use vortex::scalar::Scalar; +use vortex_error::VortexResult; + +use crate::compute::{ArrayCompute, ScalarAtFn}; +use crate::ree::REEArray; + +impl ArrayCompute for &dyn REEArray { + fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { + Some(self) + } +} + +impl ScalarAtFn for &dyn REEArray { + fn scalar_at(&self, _index: usize) -> VortexResult { + todo!() + } +} diff --git a/vortex-array2/src/ree/mod.rs b/vortex-array2/src/ree/mod.rs new file mode 100644 index 0000000000..5ee98164b0 --- /dev/null +++ b/vortex-array2/src/ree/mod.rs @@ -0,0 +1,114 @@ +mod compute; + +use vortex_error::VortexResult; +use vortex_schema::DType; + +use crate::impl_encoding; +use crate::validity::ArrayValidity; +use crate::{Array, ArrayMetadata, TryFromArrayMetadata}; +use crate::{ArrayData, TypedArrayData}; +use crate::{ArrayView, ToArrayData}; +use crate::{IntoArray, TypedArrayView}; + +impl_encoding!("vortex.ree", REE); + +#[derive(Clone, Debug)] +pub struct REEMetadata { + length: usize, + ends_dtype: DType, +} + +impl REEMetadata { + pub fn len(&self) -> usize { + self.length + } + pub fn ends_dtype(&self) -> &DType { + &self.ends_dtype + } +} + +pub trait REEArray { + fn run_ends(&self) -> Array; + fn values(&self) -> Array; +} + +impl REEData { + pub fn new(ends: ArrayData, values: ArrayData, length: usize) -> Self { + ArrayData::try_new( + &REEEncoding, + values.dtype().clone(), + REEMetadata { + length, + ends_dtype: ends.dtype().clone(), + } + .into_arc(), + vec![].into(), + vec![ends, values].into(), + ) + .unwrap() + .try_into() + .unwrap() + } +} + +impl REEArray for REEData { + fn run_ends(&self) -> Array { + Array::DataRef(self.data().children().first().unwrap()) + } + + fn values(&self) -> Array { + Array::DataRef(self.data().children().get(1).unwrap()) + } +} + +impl REEArray for REEView<'_> { + fn run_ends(&self) -> Array { + self.view() + .child(0, self.metadata().ends_dtype()) + .unwrap() + .into_array() + } + + fn values(&self) -> Array { + self.view() + .child(1, self.view().dtype()) + .unwrap() + .into_array() + } +} + +impl TryFromArrayMetadata for REEMetadata { + fn try_from_metadata(_metadata: Option<&[u8]>) -> VortexResult { + todo!() + } +} + +impl<'v> TryFromArrayView<'v> for REEView<'v> { + fn try_from_view(_view: &'v ArrayView<'v>) -> VortexResult { + todo!() + } +} + +impl TryFromArrayData for REEData { + fn try_from_data(_data: &ArrayData) -> VortexResult { + todo!() + } +} + +impl ArrayTrait for &dyn REEArray { + fn len(&self) -> usize { + todo!() + } +} + +impl ArrayValidity for &dyn REEArray { + fn is_valid(&self, _index: usize) -> bool { + todo!() + } +} + +impl ToArrayData for &dyn REEArray { + fn to_array_data(&self) -> ArrayData { + todo!() + } +} diff --git a/vortex-array2/src/validity/mod.rs b/vortex-array2/src/validity/mod.rs new file mode 100644 index 0000000000..a609f26481 --- /dev/null +++ b/vortex-array2/src/validity/mod.rs @@ -0,0 +1,123 @@ +use vortex_error::{vortex_bail, VortexResult}; +use vortex_schema::{DType, Nullability}; + +use crate::compute::ArrayCompute; +use crate::impl_encoding; +use crate::TypedArrayView; +use crate::{Array, ArrayMetadata, TryFromArrayMetadata}; +use crate::{ArrayData, TypedArrayData}; +use crate::{ArrayView, ToArrayData}; +use crate::{IntoArray, ToArray}; + +impl_encoding!("vortex.ree", Validity); + +pub trait ArrayValidity { + fn is_valid(&self, index: usize) -> bool; +} + +#[derive(Clone, Debug)] +pub enum ValidityMetadata { + Valid(usize), + Invalid(usize), + Array, +} + +pub enum Validity<'v> { + Valid(usize), + Invalid(usize), + Array(Array<'v>), +} + +impl Validity<'_> { + pub const DTYPE: DType = DType::Bool(Nullability::NonNullable); +} + +pub trait ValidityArray { + fn validity(&self) -> Validity; +} + +impl ValidityData { + pub fn new(validity: Validity) -> Self { + let (meta, children) = match validity { + Validity::Valid(l) => (ValidityMetadata::Valid(l), vec![]), + Validity::Invalid(l) => (ValidityMetadata::Invalid(l), vec![]), + Validity::Array(a) => (ValidityMetadata::Array, vec![a.to_array_data()]), + }; + + ArrayData::try_new( + &ValidityEncoding, + Validity::DTYPE, + meta.into_arc(), + vec![].into(), + children.into(), + ) + .unwrap() + .try_into() + .unwrap() + } +} + +impl ValidityArray for ValidityData { + fn validity(&self) -> Validity { + match self.metadata() { + ValidityMetadata::Valid(l) => Validity::Valid(*l), + ValidityMetadata::Invalid(l) => Validity::Invalid(*l), + ValidityMetadata::Array => { + Validity::Array(self.data().children().first().unwrap().to_array()) + } + } + } +} + +impl ValidityArray for ValidityView<'_> { + fn validity(&self) -> Validity { + match self.metadata() { + ValidityMetadata::Valid(l) => Validity::Valid(*l), + ValidityMetadata::Invalid(l) => Validity::Invalid(*l), + ValidityMetadata::Array => { + Validity::Array(self.view().child(0, &Validity::DTYPE).unwrap().into_array()) + } + } + } +} + +impl TryFromArrayMetadata for ValidityMetadata { + fn try_from_metadata(metadata: Option<&[u8]>) -> VortexResult { + let Some(_bytes) = metadata else { + vortex_bail!("Validity metadata is missing") + }; + todo!() + } +} + +impl<'v> TryFromArrayView<'v> for ValidityView<'v> { + fn try_from_view(_view: &'v ArrayView<'v>) -> VortexResult { + todo!() + } +} + +impl TryFromArrayData for ValidityData { + fn try_from_data(_data: &ArrayData) -> VortexResult { + todo!() + } +} + +impl ArrayTrait for &dyn ValidityArray { + fn len(&self) -> usize { + todo!() + } +} + +impl ArrayValidity for &dyn ValidityArray { + fn is_valid(&self, _index: usize) -> bool { + todo!() + } +} + +impl ToArrayData for &dyn ValidityArray { + fn to_array_data(&self) -> ArrayData { + todo!() + } +} + +impl ArrayCompute for &dyn ValidityArray {} diff --git a/vortex-array2/src/view.rs b/vortex-array2/src/view.rs new file mode 100644 index 0000000000..f5fbe6c32f --- /dev/null +++ b/vortex-array2/src/view.rs @@ -0,0 +1,197 @@ +use std::fmt::{Debug, Formatter}; + +use arrow_buffer::Buffer; +use vortex::flatbuffers::array as fb; +use vortex_error::{vortex_bail, vortex_err, VortexError, VortexResult}; +use vortex_schema::DType; + +use crate::encoding::EncodingRef; +use crate::{Array, ArrayDef, IntoArray, ToArray}; +use crate::{SerdeContext, TryFromArrayMetadata}; + +#[derive(Clone)] +pub struct ArrayView<'v> { + encoding: EncodingRef, + dtype: &'v DType, + array: fb::Array<'v>, + buffers: &'v [Buffer], + ctx: &'v SerdeContext, +} + +impl<'a> Debug for ArrayView<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ArrayView") + .field("encoding", &self.encoding) + .field("dtype", &self.dtype) + // .field("array", &self.array) + .field("buffers", &self.buffers) + .field("ctx", &self.ctx) + .finish() + } +} + +impl<'v> ArrayView<'v> { + pub fn try_new( + ctx: &'v SerdeContext, + dtype: &'v DType, + array: fb::Array<'v>, + buffers: &'v [Buffer], + ) -> VortexResult { + let encoding = ctx + .find_encoding(array.encoding()) + .ok_or_else(|| vortex_err!(InvalidSerde: "Encoding ID out of bounds"))?; + + if buffers.len() != Self::cumulative_nbuffers(array) { + vortex_bail!(InvalidSerde: + "Incorrect number of buffers {}, expected {}", + buffers.len(), + Self::cumulative_nbuffers(array) + ) + } + + let view = Self { + encoding, + dtype, + array, + buffers, + ctx, + }; + + // Validate here that the metadata correctly parses, so that an encoding can infallibly + // implement Encoding::with_view(). + encoding.with_view_mut(&view, &mut |_| Ok(()))?; + + Ok(view) + } + + pub fn encoding(&self) -> EncodingRef { + self.encoding + } + + pub fn dtype(&self) -> &DType { + self.dtype + } + + pub fn metadata(&self) -> Option<&'v [u8]> { + self.array.metadata().map(|m| m.bytes()) + } + + pub fn nchildren(&self) -> usize { + self.array.children().map(|c| c.len()).unwrap_or_default() + } + + pub fn child(&self, idx: usize, dtype: &'v DType) -> Option> { + let child = self.array_child(idx)?; + + // Figure out how many buffers to skip... + // We store them depth-first. + let buffer_offset = self + .array + .children()? + .iter() + .take(idx) + .map(|child| Self::cumulative_nbuffers(child)) + .sum(); + let buffer_count = Self::cumulative_nbuffers(child); + + Some( + Self::try_new( + self.ctx, + dtype, + child, + &self.buffers[buffer_offset..][0..buffer_count], + ) + .unwrap(), + ) + } + + fn array_child(&self, idx: usize) -> Option> { + let children = self.array.children()?; + if idx < children.len() { + Some(children.get(idx)) + } else { + None + } + } + + /// The number of buffers used by the current Array. + pub fn nbuffers(&self) -> usize { + self.array.nbuffers() as usize + } + + /// The number of buffers used by the current Array and all its children. + fn cumulative_nbuffers(array: fb::Array) -> usize { + let mut nbuffers = array.nbuffers() as usize; + for child in array.children().unwrap_or_default() { + nbuffers += Self::cumulative_nbuffers(child); + } + nbuffers + } + + pub fn buffers(&self) -> &'v [Buffer] { + // This is only true for the immediate current node? + &self.buffers[0..self.nbuffers()] + } +} + +impl ToArray for ArrayView<'_> { + fn to_array(&self) -> Array { + Array::View(self.clone()) + } +} + +impl<'v> IntoArray<'v> for ArrayView<'v> { + fn into_array(self) -> Array<'v> { + Array::View(self) + } +} + +pub struct TypedArrayView<'v, D: ArrayDef> { + view: ArrayView<'v>, + metadata: D::Metadata, +} + +impl<'v, D: ArrayDef> TypedArrayView<'v, D> { + pub fn new_unchecked(view: ArrayView<'v>, metadata: D::Metadata) -> Self { + Self { view, metadata } + } + + pub fn metadata(&self) -> &D::Metadata { + &self.metadata + } + + pub fn view(&'v self) -> &'v ArrayView<'v> { + &self.view + } + + pub fn as_array(&self) -> &D::Array<'v> + where + Self: AsRef>, + { + self.as_ref() + } +} + +impl ToArray for TypedArrayView<'_, D> { + fn to_array(&self) -> Array { + Array::View(self.view().clone()) + } +} + +/// Convert an ArrayView into a TypedArrayView. +impl<'v, D: ArrayDef> TryFrom> for TypedArrayView<'v, D> +where + D::Metadata: TryFromArrayMetadata, +{ + type Error = VortexError; + + fn try_from(view: ArrayView<'v>) -> Result { + if view.encoding().id() != D::ID { + vortex_bail!("Invalid encoding for array") + } + let metadata = <::Metadata as TryFromArrayMetadata>::try_from_metadata( + view.metadata(), + )?; + Ok(Self { view, metadata }) + } +} diff --git a/vortex-error/src/lib.rs b/vortex-error/src/lib.rs index d75ae5546c..3eb90aad78 100644 --- a/vortex-error/src/lib.rs +++ b/vortex-error/src/lib.rs @@ -81,6 +81,12 @@ pub enum VortexError { #[backtrace] parquet::errors::ParquetError, ), + #[error(transparent)] + TryFromSliceError( + #[from] + #[backtrace] + std::array::TryFromSliceError, + ), } pub type VortexResult = Result;