From 9040800bd8b6a06a04ce35c930d5018a1fe25826 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 20 Mar 2024 14:50:27 +0000 Subject: [PATCH] rename --- bench-vortex/src/lib.rs | 29 +++-- pyvortex/src/dtype.rs | 2 +- pyvortex/src/encode.rs | 8 +- vortex-array/src/array/composite/typed.rs | 3 +- vortex-array/src/arrow/dtypes.rs | 19 +-- vortex-array/src/arrow/mod.rs | 2 +- vortex-array/src/encode.rs | 150 +++++++++------------- 7 files changed, 91 insertions(+), 122 deletions(-) diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs index ae35cc0972..8898ede02e 100644 --- a/bench-vortex/src/lib.rs +++ b/bench-vortex/src/lib.rs @@ -1,17 +1,18 @@ +use std::collections::HashSet; +use std::fs::{create_dir_all, File}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + use arrow_array::RecordBatchReader; use itertools::Itertools; use log::info; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use parquet::arrow::ProjectionMask; -use std::collections::HashSet; -use std::fs::{create_dir_all, File}; -use std::path::{Path, PathBuf}; -use std::sync::Arc; + use vortex::array::bool::BoolEncoding; use vortex::array::chunked::{ChunkedArray, ChunkedEncoding}; -use vortex::array::constant::ConstantEncoding; - use vortex::array::composite::CompositeEncoding; +use vortex::array::constant::ConstantEncoding; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveEncoding; use vortex::array::sparse::SparseEncoding; @@ -117,7 +118,7 @@ pub fn compress_taxi_data() -> ArrayRef { }) .collect_vec(); - let dtype = DType::from_arrow_type(schema.clone()); + let dtype = DType::from_arrow(schema.clone()); let compressed = ChunkedArray::new(chunks.clone(), dtype).boxed(); info!("Compressed array {}", display_tree(compressed.as_ref())); @@ -143,13 +144,15 @@ pub fn compress_taxi_data() -> ArrayRef { #[cfg(test)] mod test { + use std::fs::File; + use std::ops::Deref; + use std::sync::Arc; + use arrow_array::{ArrayRef as ArrowArrayRef, StructArray as ArrowStructArray}; use log::LevelFilter; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use simplelog::{ColorChoice, Config, TermLogger, TerminalMode}; - use std::fs::File; - use std::ops::Deref; - use std::sync::Arc; + use vortex::array::ArrayRef; use vortex::compute::as_arrow::as_arrow; use vortex::encode::FromArrowArray; @@ -185,7 +188,7 @@ mod test { for record_batch in reader.map(|batch_result| batch_result.unwrap()) { let struct_arrow: ArrowStructArray = record_batch.into(); let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); - let vortex_array = ArrayRef::from_arrow_array(arrow_array.clone(), false); + let vortex_array = ArrayRef::from_arrow(arrow_array.clone(), false); let mut buf = Vec::::new(); let mut write_ctx = WriteCtx::new(&mut buf); @@ -207,7 +210,7 @@ mod test { for record_batch in reader.map(|batch_result| batch_result.unwrap()) { let struct_arrow: ArrowStructArray = record_batch.into(); let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); - let vortex_array = ArrayRef::from_arrow_array(arrow_array.clone(), false); + let vortex_array = ArrayRef::from_arrow(arrow_array.clone(), false); let vortex_as_arrow = as_arrow(vortex_array.as_ref()).unwrap(); assert_eq!(vortex_as_arrow.deref(), arrow_array.deref()); } @@ -226,7 +229,7 @@ mod test { for record_batch in reader.map(|batch_result| batch_result.unwrap()) { let struct_arrow: ArrowStructArray = record_batch.into(); let arrow_array: ArrowArrayRef = Arc::new(struct_arrow); - let vortex_array = ArrayRef::from_arrow_array(arrow_array.clone(), false); + let vortex_array = ArrayRef::from_arrow(arrow_array.clone(), false); let compressed = ctx.clone().compress(vortex_array.as_ref(), None).unwrap(); let compressed_as_arrow = as_arrow(compressed.as_ref()).unwrap(); diff --git a/pyvortex/src/dtype.rs b/pyvortex/src/dtype.rs index 271baac352..3aa0f35100 100644 --- a/pyvortex/src/dtype.rs +++ b/pyvortex/src/dtype.rs @@ -35,7 +35,7 @@ impl PyDType { ) -> PyResult> { PyDType::wrap( cls.py(), - DType::from_arrow_type(&Field::new("_", arrow_dtype, nullable)), + DType::from_arrow(&Field::new("_", arrow_dtype, nullable)), ) } } diff --git a/pyvortex/src/encode.rs b/pyvortex/src/encode.rs index 1c272d59c5..c98a8a0448 100644 --- a/pyvortex/src/encode.rs +++ b/pyvortex/src/encode.rs @@ -25,7 +25,7 @@ pub fn encode(obj: &PyAny) -> PyResult> { if obj.is_instance(pa_array)? { let arrow_array = ArrayData::from_pyarrow(obj).map(make_array)?; - let enc_array = ArrayRef::from_arrow_array(arrow_array, false); + let enc_array = ArrayRef::from_arrow(arrow_array, false); PyArray::wrap(obj.py(), enc_array) } else if obj.is_instance(chunked_array)? { let chunks: Vec<&PyAny> = obj.getattr("chunks")?.extract()?; @@ -34,17 +34,17 @@ pub fn encode(obj: &PyAny) -> PyResult> { .map(|a| { ArrayData::from_pyarrow(a) .map(make_array) - .map(|a| ArrayRef::from_arrow_array(a, false)) + .map(|a| ArrayRef::from_arrow(a, false)) }) .collect::>>()?; let dtype: DType = obj .getattr("type") .and_then(DataType::from_pyarrow) - .map(|dt| DType::from_arrow_type(&Field::new("_", dt, false)))?; + .map(|dt| DType::from_arrow(&Field::new("_", dt, false)))?; PyArray::wrap(obj.py(), ChunkedArray::new(encoded_chunks, dtype).boxed()) } else if obj.is_instance(table)? { let array_stream = ArrowArrayStreamReader::from_pyarrow(obj)?; - let dtype = DType::from_arrow_type(array_stream.schema()); + let dtype = DType::from_arrow(array_stream.schema()); let chunks = array_stream .into_iter() .map(|b| b.map(ArrayRef::from).map_err(map_arrow_err)) diff --git a/vortex-array/src/array/composite/typed.rs b/vortex-array/src/array/composite/typed.rs index a35ad58d1f..8ed79ee44b 100644 --- a/vortex-array/src/array/composite/typed.rs +++ b/vortex-array/src/array/composite/typed.rs @@ -1,6 +1,8 @@ use std::fmt::Debug; use std::sync::Arc; +use vortex_schema::CompositeID; + use crate::array::composite::array::CompositeArray; use crate::array::composite::CompositeMetadata; use crate::array::{Array, ArrayRef}; @@ -95,4 +97,3 @@ macro_rules! composite_impl { } pub(crate) use composite_impl; -use vortex_schema::CompositeID; diff --git a/vortex-array/src/arrow/dtypes.rs b/vortex-array/src/arrow/dtypes.rs index 770ca4d6b1..e0d4f00789 100644 --- a/vortex-array/src/arrow/dtypes.rs +++ b/vortex-array/src/arrow/dtypes.rs @@ -33,13 +33,8 @@ impl From for ArrayRef { .map(|(array, field)| { // The dtype of the child arrays infer their nullability from the array itself. // In case the schema says something different, we cast into the schema's dtype. - let vortex_array = - ArrayRef::from_arrow_array(array.clone(), field.is_nullable()); - cast( - vortex_array.as_ref(), - &DType::from_arrow_type(field.as_ref()), - ) - .unwrap() + let vortex_array = ArrayRef::from_arrow(array.clone(), field.is_nullable()); + cast(vortex_array.as_ref(), &DType::from_arrow(field.as_ref())).unwrap() }) .collect(), ) @@ -75,7 +70,7 @@ impl TryFrom<&DataType> for PType { } impl FromArrowType for DType { - fn from_arrow_type(value: SchemaRef) -> Self { + fn from_arrow(value: SchemaRef) -> Self { DType::Struct( value .fields() @@ -85,14 +80,14 @@ impl FromArrowType for DType { value .fields() .iter() - .map(|f| DType::from_arrow_type(f.as_ref())) + .map(|f| DType::from_arrow(f.as_ref())) .collect_vec(), ) } } impl FromArrowType<&Field> for DType { - fn from_arrow_type(field: &Field) -> Self { + fn from_arrow(field: &Field) -> Self { use vortex_schema::DType::*; use vortex_schema::Signedness::*; @@ -123,12 +118,12 @@ impl FromArrowType<&Field> for DType { // DataType::Time32(u) => localtime(u.into(), IntWidth::_32, nullability), // DataType::Time64(u) => localtime(u.into(), IntWidth::_64, nullability), DataType::List(e) | DataType::LargeList(e) => { - List(Box::new(DType::from_arrow_type(e.as_ref())), nullability) + List(Box::new(DType::from_arrow(e.as_ref())), nullability) } DataType::Struct(f) => Struct( f.iter().map(|f| Arc::new(f.name().clone())).collect(), f.iter() - .map(|f| DType::from_arrow_type(f.as_ref())) + .map(|f| DType::from_arrow(f.as_ref())) .collect_vec(), ), DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(*p, *s, nullability), diff --git a/vortex-array/src/arrow/mod.rs b/vortex-array/src/arrow/mod.rs index 2f99f3fcb5..bf9a621991 100644 --- a/vortex-array/src/arrow/mod.rs +++ b/vortex-array/src/arrow/mod.rs @@ -2,5 +2,5 @@ pub mod dtypes; pub mod wrappers; pub trait FromArrowType: Sized { - fn from_arrow_type(value: T) -> Self; + fn from_arrow(value: T) -> Self; } diff --git a/vortex-array/src/encode.rs b/vortex-array/src/encode.rs index 205c7125d6..b71807a830 100644 --- a/vortex-array/src/encode.rs +++ b/vortex-array/src/encode.rs @@ -34,7 +34,7 @@ use crate::ptype::PType; use crate::scalar::NullScalar; pub trait FromArrowArray { - fn from_arrow_array(array: A, nullable: bool) -> Self; + fn from_arrow(array: A, nullable: bool) -> Self; } impl From<&Buffer> for ArrayRef { @@ -57,7 +57,7 @@ impl From<&OffsetBuffer> for ArrayRef { } impl FromArrowArray<&ArrowPrimitiveArray> for ArrayRef { - fn from_arrow_array(value: &ArrowPrimitiveArray, nullable: bool) -> Self { + fn from_arrow(value: &ArrowPrimitiveArray, nullable: bool) -> Self { let ptype: PType = (&T::DATA_TYPE).try_into().unwrap(); let arr = PrimitiveArray::new( ptype, @@ -91,7 +91,7 @@ impl FromArrowArray<&ArrowPrimitiveArray> for ArrayRef } impl FromArrowArray<&GenericByteArray> for ArrayRef { - fn from_arrow_array(value: &GenericByteArray, nullable: bool) -> Self { + fn from_arrow(value: &GenericByteArray, nullable: bool) -> Self { let dtype = match T::DATA_TYPE { DataType::Binary | DataType::LargeBinary => DType::Binary(nullable.into()), DataType::Utf8 | DataType::LargeUtf8 => DType::Utf8(nullable.into()), @@ -108,7 +108,7 @@ impl FromArrowArray<&GenericByteArray> for ArrayRef { } impl FromArrowArray<&ArrowBooleanArray> for ArrayRef { - fn from_arrow_array(value: &ArrowBooleanArray, nullable: bool) -> Self { + fn from_arrow(value: &ArrowBooleanArray, nullable: bool) -> Self { BoolArray::new( value.values().to_owned(), nulls(value.nulls(), nullable, value.len()), @@ -118,7 +118,7 @@ impl FromArrowArray<&ArrowBooleanArray> for ArrayRef { } impl FromArrowArray<&ArrowStructArray> for ArrayRef { - fn from_arrow_array(value: &ArrowStructArray, nullable: bool) -> Self { + fn from_arrow(value: &ArrowStructArray, nullable: bool) -> Self { // TODO(ngates): how should we deal with Arrow "logical nulls"? assert!(!nullable); StructArray::new( @@ -132,7 +132,7 @@ impl FromArrowArray<&ArrowStructArray> for ArrayRef { .columns() .iter() .zip(value.fields()) - .map(|(c, field)| ArrayRef::from_arrow_array(c.clone(), field.is_nullable())) + .map(|(c, field)| ArrayRef::from_arrow(c.clone(), field.is_nullable())) .collect(), ) .boxed() @@ -140,7 +140,7 @@ impl FromArrowArray<&ArrowStructArray> for ArrayRef { } impl FromArrowArray<&ArrowNullArray> for ArrayRef { - fn from_arrow_array(value: &ArrowNullArray, nullable: bool) -> Self { + fn from_arrow(value: &ArrowNullArray, nullable: bool) -> Self { assert!(nullable); ConstantArray::new(NullScalar::new().into(), value.len()).boxed() } @@ -160,109 +160,79 @@ fn nulls(nulls: Option<&NullBuffer>, nullable: bool, len: usize) -> Option for ArrayRef { - fn from_arrow_array(array: ArrowArrayRef, nullable: bool) -> Self { + fn from_arrow(array: ArrowArrayRef, nullable: bool) -> Self { match array.data_type() { - DataType::Boolean => ArrayRef::from_arrow_array(array.as_boolean(), nullable), - DataType::UInt8 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } - DataType::UInt16 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } - DataType::UInt32 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } - DataType::UInt64 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } - DataType::Int8 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } - DataType::Int16 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } - DataType::Int32 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } - DataType::Int64 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } + DataType::Boolean => ArrayRef::from_arrow(array.as_boolean(), nullable), + DataType::UInt8 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::UInt16 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::UInt32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::UInt64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Int8 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Int16 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Int32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Int64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), DataType::Float16 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) + ArrayRef::from_arrow(array.as_primitive::(), nullable) } DataType::Float32 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) + ArrayRef::from_arrow(array.as_primitive::(), nullable) } DataType::Float64 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) + ArrayRef::from_arrow(array.as_primitive::(), nullable) } - DataType::Utf8 => ArrayRef::from_arrow_array(array.as_string::(), nullable), - DataType::LargeUtf8 => ArrayRef::from_arrow_array(array.as_string::(), nullable), - DataType::Binary => ArrayRef::from_arrow_array(array.as_binary::(), nullable), - DataType::LargeBinary => ArrayRef::from_arrow_array(array.as_binary::(), nullable), - DataType::Struct(_) => ArrayRef::from_arrow_array(array.as_struct(), nullable), - DataType::Null => ArrayRef::from_arrow_array(as_null_array(array.as_ref()), nullable), + DataType::Utf8 => ArrayRef::from_arrow(array.as_string::(), nullable), + DataType::LargeUtf8 => ArrayRef::from_arrow(array.as_string::(), nullable), + DataType::Binary => ArrayRef::from_arrow(array.as_binary::(), nullable), + DataType::LargeBinary => ArrayRef::from_arrow(array.as_binary::(), nullable), + DataType::Struct(_) => ArrayRef::from_arrow(array.as_struct(), nullable), + DataType::Null => ArrayRef::from_arrow(as_null_array(array.as_ref()), nullable), DataType::Timestamp(u, _) => match u { - TimeUnit::Second => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), - TimeUnit::Millisecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), - TimeUnit::Microsecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), - TimeUnit::Nanosecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), + TimeUnit::Second => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Millisecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Microsecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Nanosecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } }, - DataType::Date32 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } - DataType::Date64 => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) - } + DataType::Date32 => ArrayRef::from_arrow(array.as_primitive::(), nullable), + DataType::Date64 => ArrayRef::from_arrow(array.as_primitive::(), nullable), DataType::Time32(u) => match u { TimeUnit::Second => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Millisecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) } - TimeUnit::Millisecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), _ => unreachable!(), }, DataType::Time64(u) => match u { - TimeUnit::Microsecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), - TimeUnit::Nanosecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), + TimeUnit::Microsecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Nanosecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } _ => unreachable!(), }, DataType::Duration(u) => match u { TimeUnit::Second => { - ArrayRef::from_arrow_array(array.as_primitive::(), nullable) + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Millisecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Microsecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) + } + TimeUnit::Nanosecond => { + ArrayRef::from_arrow(array.as_primitive::(), nullable) } - TimeUnit::Millisecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), - TimeUnit::Microsecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), - TimeUnit::Nanosecond => ArrayRef::from_arrow_array( - array.as_primitive::(), - nullable, - ), }, _ => panic!( "TODO(robert): Missing array encoding for dtype {}",