Skip to content

Commit

Permalink
ArrayNBytes includes size of array's metadata (#1549)
Browse files Browse the repository at this point in the history
  • Loading branch information
robert3005 authored Dec 4, 2024
1 parent 9146a45 commit 6624f31
Show file tree
Hide file tree
Showing 8 changed files with 14 additions and 33 deletions.
6 changes: 3 additions & 3 deletions docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Vortex array:
>>> parquet = pq.read_table("_static/example.parquet")
>>> vtx = vortex.array(parquet)
>>> vtx.nbytes
141024
141070

Compress
^^^^^^^^
Expand All @@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the

>>> cvtx = vortex.compress(vtx)
>>> cvtx.nbytes
15243
17780
>>> cvtx.nbytes / vtx.nbytes
0.108...
0.126...

Vortex uses roughly eight times fewer bytes than Arrow. Fewer bytes means more of your data fits in
cache and RAM.
Expand Down
4 changes: 2 additions & 2 deletions pyvortex/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -520,10 +520,10 @@ impl PyArray {
///
/// >>> arr = vortex.array([1, 2, None, 3])
/// >>> print(arr.tree_display())
/// root: vortex.primitive(0x03)(i64?, len=4) nbytes=33 B (100.00%)
/// root: vortex.primitive(0x03)(i64?, len=4) nbytes=36 B (100.00%)
/// metadata: PrimitiveMetadata { validity: Array }
/// buffer: 32 B
/// validity: vortex.bool(0x02)(bool, len=4) nbytes=1 B (3.03%)
/// validity: vortex.bool(0x02)(bool, len=4) nbytes=3 B (8.33%)
/// metadata: BoolMetadata { validity: NonNullable, first_byte_bit_offset: 0 }
/// buffer: 1 B
/// <BLANKLINE>
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion vortex-array/src/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ use crate::encoding::Encoding;
/// Note that this allows us to restrict the ('static + Send + Sync) requirement to just the
/// metadata trait, and not the entire array trait. We require 'static so that we can downcast
/// use the Any trait.
/// TODO(ngates): add Display
pub trait ArrayMetadata:
'static + Send + Sync + Debug + TrySerializeArrayMetadata + Display
{
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/nbytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ impl ArrayData {
self.encoding()
.accept(self.as_ref(), &mut visitor)
.vortex_expect("Failed to get nbytes from Array");
visitor.0
visitor.0 + size_of_val(self.array_metadata())
}
}

Expand Down
7 changes: 0 additions & 7 deletions vortex-sampling-compressor/src/compressors/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,6 @@ impl<'a> CompressionTree<'a> {
std::mem::take(&mut self.metadata)
}

pub fn num_descendants(&self) -> usize {
self.children
.iter()
.filter_map(|child| child.as_ref().map(|c| c.num_descendants() + 1))
.sum::<usize>()
}

#[allow(clippy::type_complexity)]
pub fn into_parts(
self,
Expand Down
15 changes: 2 additions & 13 deletions vortex-sampling-compressor/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use compressors::roaring_bool::RoaringBoolCompressor;
use compressors::roaring_int::RoaringIntCompressor;
use compressors::struct_::StructCompressor;
use compressors::varbin::VarBinCompressor;
use compressors::{CompressedArray, CompressionTree, CompressorRef};
use compressors::{CompressedArray, CompressorRef};
use vortex_alp::{ALPEncoding, ALPRDEncoding};
use vortex_array::array::{
PrimitiveEncoding, SparseEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
Expand Down Expand Up @@ -126,16 +126,8 @@ impl Objective {
base_size_bytes: usize,
config: &CompressConfig,
) -> f64 {
let num_descendants = array
.path()
.as_ref()
.map(CompressionTree::num_descendants)
.unwrap_or(0) as u64;
let overhead_bytes = num_descendants * config.overhead_bytes_per_array;
let size_in_bytes = array.nbytes() as u64 + overhead_bytes;

match &config.objective {
Objective::MinSize => (size_in_bytes as f64) / (base_size_bytes as f64),
Objective::MinSize => (array.nbytes() as f64) / (base_size_bytes as f64),
}
}
}
Expand All @@ -153,8 +145,6 @@ pub struct CompressConfig {
max_cost: u8,
// Are we minimizing size or maximizing performance?
objective: Objective,
/// Penalty in bytes per compression level
overhead_bytes_per_array: u64,

// Target chunk size in bytes
target_block_bytesize: usize,
Expand All @@ -172,7 +162,6 @@ impl Default for CompressConfig {
sample_count: 16,
max_cost: constants::DEFAULT_MAX_COST,
objective: Objective::MinSize,
overhead_bytes_per_array: 64,
target_block_bytesize: 16 * mib,
target_block_size: 64 * kib,
rng_seed: 0,
Expand Down
10 changes: 5 additions & 5 deletions vortex-sampling-compressor/tests/smoketest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ mod tests {
assert_eq!(chunk.encoding().id(), FoREncoding::ID);
assert_eq!(
chunk.statistics().get(Stat::UncompressedSizeInBytes),
Some(Scalar::from((chunk.len() * 8) as u64))
Some(Scalar::from((chunk.len() * 8) as u64 + 1))
);
}

Expand All @@ -138,7 +138,7 @@ mod tests {
assert_eq!(chunk.encoding().id(), BoolEncoding::ID);
assert_eq!(
chunk.statistics().get(Stat::UncompressedSizeInBytes),
Some(Scalar::from(chunk.len().div_ceil(8) as u64))
Some(Scalar::from(chunk.len().div_ceil(8) as u64 + 2))
);
}

Expand All @@ -154,7 +154,7 @@ mod tests {
);
assert_eq!(
chunk.statistics().get(Stat::UncompressedSizeInBytes),
Some(Scalar::from(1392640u64))
Some(Scalar::from(1392677u64))
);
}

Expand All @@ -167,7 +167,7 @@ mod tests {
assert_eq!(chunk.encoding().id(), VarBinEncoding::ID);
assert_eq!(
chunk.statistics().get(Stat::UncompressedSizeInBytes),
Some(Scalar::from(134357000u64))
Some(Scalar::from(134357018u64))
);
}

Expand All @@ -180,7 +180,7 @@ mod tests {
assert_eq!(chunk.encoding().id(), DateTimePartsEncoding::ID);
assert_eq!(
chunk.statistics().get(Stat::UncompressedSizeInBytes),
Some((chunk.len() * 8).into())
Some((chunk.len() * 8 + 4).into())
)
}
}
Expand Down

0 comments on commit 6624f31

Please sign in to comment.