From 6624f31ede6e5db5ee494021fa2737f3eeccd53b Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 4 Dec 2024 12:12:58 -0500 Subject: [PATCH] ArrayNBytes includes size of arrays metadata (#1549) --- docs/quickstart.rst | 6 +++--- pyvortex/src/array.rs | 4 ++-- uv.lock | 2 +- vortex-array/src/metadata.rs | 1 - vortex-array/src/nbytes.rs | 2 +- vortex-sampling-compressor/src/compressors/mod.rs | 7 ------- vortex-sampling-compressor/src/lib.rs | 15 ++------------- vortex-sampling-compressor/tests/smoketest.rs | 10 +++++----- 8 files changed, 14 insertions(+), 33 deletions(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 5f6868d8cb..07d471ed8c 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -35,7 +35,7 @@ Vortex array: >>> parquet = pq.read_table("_static/example.parquet") >>> vtx = vortex.array(parquet) >>> vtx.nbytes - 141024 + 141070 Compress ^^^^^^^^ @@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the >>> cvtx = vortex.compress(vtx) >>> cvtx.nbytes - 15243 + 17780 >>> cvtx.nbytes / vtx.nbytes - 0.108... + 0.126... Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in cache and RAM. diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 7c9fd8c6ff..012b3bcf30 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -520,10 +520,10 @@ impl PyArray { /// /// >>> arr = vortex.array([1, 2, None, 3]) /// >>> print(arr.tree_display()) - /// root: vortex.primitive(0x03)(i64?, len=4) nbytes=33 B (100.00%) + /// root: vortex.primitive(0x03)(i64?, len=4) nbytes=36 B (100.00%) /// metadata: PrimitiveMetadata { validity: Array } /// buffer: 32 B - /// validity: vortex.bool(0x02)(bool, len=4) nbytes=1 B (3.03%) + /// validity: vortex.bool(0x02)(bool, len=4) nbytes=3 B (8.33%) /// metadata: BoolMetadata { validity: NonNullable, first_byte_bit_offset: 0 } /// buffer: 1 B /// diff --git a/uv.lock b/uv.lock index 92847e54cc..e7f4ac4f72 100644 --- a/uv.lock +++ b/uv.lock @@ -1003,7 +1003,7 @@ wheels = [ [[package]] name = "vortex-array" -version = "0.13.1" +version = "0.21.0" source = { editable = "pyvortex" } dependencies = [ { name = "pyarrow" }, diff --git a/vortex-array/src/metadata.rs b/vortex-array/src/metadata.rs index d9b7ace135..4645526bcc 100644 --- a/vortex-array/src/metadata.rs +++ b/vortex-array/src/metadata.rs @@ -13,7 +13,6 @@ use crate::encoding::Encoding; /// Note that this allows us to restrict the ('static + Send + Sync) requirement to just the /// metadata trait, and not the entire array trait. We require 'static so that we can downcast /// use the Any trait. -/// TODO(ngates): add Display pub trait ArrayMetadata: 'static + Send + Sync + Debug + TrySerializeArrayMetadata + Display { diff --git a/vortex-array/src/nbytes.rs b/vortex-array/src/nbytes.rs index fd4f861061..da639c5294 100644 --- a/vortex-array/src/nbytes.rs +++ b/vortex-array/src/nbytes.rs @@ -11,7 +11,7 @@ impl ArrayData { self.encoding() .accept(self.as_ref(), &mut visitor) .vortex_expect("Failed to get nbytes from Array"); - visitor.0 + visitor.0 + size_of_val(self.array_metadata()) } } diff --git a/vortex-sampling-compressor/src/compressors/mod.rs b/vortex-sampling-compressor/src/compressors/mod.rs index 4ed0e0fad3..6e349be801 100644 --- a/vortex-sampling-compressor/src/compressors/mod.rs +++ b/vortex-sampling-compressor/src/compressors/mod.rs @@ -183,13 +183,6 @@ impl<'a> CompressionTree<'a> { std::mem::take(&mut self.metadata) } - pub fn num_descendants(&self) -> usize { - self.children - .iter() - .filter_map(|child| child.as_ref().map(|c| c.num_descendants() + 1)) - .sum::() - } - #[allow(clippy::type_complexity)] pub fn into_parts( self, diff --git a/vortex-sampling-compressor/src/lib.rs b/vortex-sampling-compressor/src/lib.rs index f6a595e187..058739239d 100644 --- a/vortex-sampling-compressor/src/lib.rs +++ b/vortex-sampling-compressor/src/lib.rs @@ -9,7 +9,7 @@ use compressors::roaring_bool::RoaringBoolCompressor; use compressors::roaring_int::RoaringIntCompressor; use compressors::struct_::StructCompressor; use compressors::varbin::VarBinCompressor; -use compressors::{CompressedArray, CompressionTree, CompressorRef}; +use compressors::{CompressedArray, CompressorRef}; use vortex_alp::{ALPEncoding, ALPRDEncoding}; use vortex_array::array::{ PrimitiveEncoding, SparseEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding, @@ -126,16 +126,8 @@ impl Objective { base_size_bytes: usize, config: &CompressConfig, ) -> f64 { - let num_descendants = array - .path() - .as_ref() - .map(CompressionTree::num_descendants) - .unwrap_or(0) as u64; - let overhead_bytes = num_descendants * config.overhead_bytes_per_array; - let size_in_bytes = array.nbytes() as u64 + overhead_bytes; - match &config.objective { - Objective::MinSize => (size_in_bytes as f64) / (base_size_bytes as f64), + Objective::MinSize => (array.nbytes() as f64) / (base_size_bytes as f64), } } } @@ -153,8 +145,6 @@ pub struct CompressConfig { max_cost: u8, // Are we minimizing size or maximizing performance? objective: Objective, - /// Penalty in bytes per compression level - overhead_bytes_per_array: u64, // Target chunk size in bytes target_block_bytesize: usize, @@ -172,7 +162,6 @@ impl Default for CompressConfig { sample_count: 16, max_cost: constants::DEFAULT_MAX_COST, objective: Objective::MinSize, - overhead_bytes_per_array: 64, target_block_bytesize: 16 * mib, target_block_size: 64 * kib, rng_seed: 0, diff --git a/vortex-sampling-compressor/tests/smoketest.rs b/vortex-sampling-compressor/tests/smoketest.rs index db626ed5c6..c441f2b203 100644 --- a/vortex-sampling-compressor/tests/smoketest.rs +++ b/vortex-sampling-compressor/tests/smoketest.rs @@ -125,7 +125,7 @@ mod tests { assert_eq!(chunk.encoding().id(), FoREncoding::ID); assert_eq!( chunk.statistics().get(Stat::UncompressedSizeInBytes), - Some(Scalar::from((chunk.len() * 8) as u64)) + Some(Scalar::from((chunk.len() * 8) as u64 + 1)) ); } @@ -138,7 +138,7 @@ mod tests { assert_eq!(chunk.encoding().id(), BoolEncoding::ID); assert_eq!( chunk.statistics().get(Stat::UncompressedSizeInBytes), - Some(Scalar::from(chunk.len().div_ceil(8) as u64)) + Some(Scalar::from(chunk.len().div_ceil(8) as u64 + 2)) ); } @@ -154,7 +154,7 @@ mod tests { ); assert_eq!( chunk.statistics().get(Stat::UncompressedSizeInBytes), - Some(Scalar::from(1392640u64)) + Some(Scalar::from(1392677u64)) ); } @@ -167,7 +167,7 @@ mod tests { assert_eq!(chunk.encoding().id(), VarBinEncoding::ID); assert_eq!( chunk.statistics().get(Stat::UncompressedSizeInBytes), - Some(Scalar::from(134357000u64)) + Some(Scalar::from(134357018u64)) ); } @@ -180,7 +180,7 @@ mod tests { assert_eq!(chunk.encoding().id(), DateTimePartsEncoding::ID); assert_eq!( chunk.statistics().get(Stat::UncompressedSizeInBytes), - Some((chunk.len() * 8).into()) + Some((chunk.len() * 8 + 4).into()) ) } }