From d029c431f70deee306043224d40cac5d9701a932 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 5 Mar 2024 13:50:50 +0000 Subject: [PATCH 01/10] ALP --- Cargo.lock | 1 + vortex-alp/Cargo.toml | 1 + vortex-alp/src/alp.rs | 20 ++- vortex-alp/src/compress.rs | 274 ++++++++++++++++++++++++++----------- vortex-alp/src/compute.rs | 66 +++++---- vortex-alp/src/serde.rs | 5 +- 6 files changed, 242 insertions(+), 125 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 13a0456f84..aa8dfa5aea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2833,6 +2833,7 @@ dependencies = [ "itertools 0.12.1", "linkme", "log", + "num-traits", "vortex", ] diff --git a/vortex-alp/Cargo.toml b/vortex-alp/Cargo.toml index cb1a1cdd1d..8f37114d5f 100644 --- a/vortex-alp/Cargo.toml +++ b/vortex-alp/Cargo.toml @@ -16,6 +16,7 @@ arrow = { version = "50.0.0" } vortex = { "path" = "../vortex" } linkme = "0.3.22" itertools = "0.12.1" +num-traits = "0.2.18" codecz = { version = "0.1.0", path = "../codecz" } log = { version = "0.4.20", features = [] } diff --git a/vortex-alp/src/alp.rs b/vortex-alp/src/alp.rs index 65d37418aa..ad150ddfe4 100644 --- a/vortex-alp/src/alp.rs +++ b/vortex-alp/src/alp.rs @@ -12,23 +12,29 @@ use vortex::stats::{Stats, StatsSet}; use crate::compress::alp_encode; +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Exponents { + pub e: u8, + pub f: u8, +} + #[derive(Debug, Clone)] pub struct ALPArray { encoded: ArrayRef, - exponents: ALPExponents, + exponents: Exponents, patches: Option, dtype: DType, stats: Arc>, } impl ALPArray { - pub fn new(encoded: ArrayRef, exponents: ALPExponents, patches: Option) -> Self { + pub fn new(encoded: ArrayRef, exponents: Exponents, patches: Option) -> Self { Self::try_new(encoded, exponents, patches).unwrap() } pub fn try_new( encoded: ArrayRef, - exponents: ALPExponents, + exponents: Exponents, patches: Option, ) -> VortexResult { let dtype = match encoded.dtype() { @@ -59,8 +65,8 @@ impl ALPArray { self.encoded.as_ref() } - pub fn exponents(&self) -> ALPExponents { - self.exponents + pub fn exponents(&self) -> &Exponents { + &self.exponents } pub fn patches(&self) -> Option<&dyn Array> { @@ -111,7 +117,7 @@ impl Array for ALPArray { fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(Self::try_new( self.encoded().slice(start, stop)?, - self.exponents(), + self.exponents().clone(), self.patches().map(|p| p.slice(start, stop)).transpose()?, )? .boxed()) @@ -140,7 +146,7 @@ impl<'arr> AsRef<(dyn Array + 'arr)> for ALPArray { impl ArrayDisplay for ALPArray { fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.writeln(format!("exponents: {}", self.exponents()))?; + f.writeln(format!("exponents: {:?}", self.exponents()))?; if let Some(p) = self.patches() { f.writeln("patches:")?; f.indent(|indent| indent.array(p.as_ref()))?; diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index a742b0924a..a6986ce74a 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -1,16 +1,17 @@ +use itertools::Itertools; use log::debug; +use num_traits::{cast, Float, PrimInt}; -use codecz::alp; -use codecz::alp::{ALPEncoded, ALPExponents, SupportsALP}; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; use vortex::array::sparse::SparseArray; -use vortex::array::{Array, ArrayRef, CloneOptionalArray}; +use vortex::array::{Array, ArrayRef}; use vortex::compress::{CompressConfig, CompressCtx, Compressor, EncodingCompression}; use vortex::ptype::{NativePType, PType}; use crate::alp::{ALPArray, ALPEncoding}; use crate::downcast::DowncastALP; +use crate::Exponents; impl EncodingCompression for ALPEncoding { fn compressor( @@ -38,9 +39,15 @@ fn alp_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: CompressCtx) let like_alp = like.map(|like_array| like_array.as_alp()); let parray = array.as_primitive(); - let (encoded, exponents, patches) = like_alp - .map(|alp_like| alp_encode_like_parts(parray, alp_like)) - .unwrap_or_else(|| alp_encode_parts(parray)); + let (exponents, encoded, patches) = match parray.ptype() { + PType::F32 => { + ALPFloat::encode_to_array(parray.typed_data::(), like_alp.map(|a| a.exponents())) + } + PType::F64 => { + ALPFloat::encode_to_array(parray.typed_data::(), like_alp.map(|a| a.exponents())) + } + _ => panic!("Unsupported ptype"), + }; ALPArray::new( ctx.next_level() @@ -55,91 +62,196 @@ fn alp_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: CompressCtx) } pub fn alp_encode(parray: &PrimitiveArray) -> ALPArray { - let (encoded, exponents, patches) = alp_encode_parts(parray); + let (exponents, encoded, patches) = match parray.ptype() { + PType::F32 => ALPFloat::encode_to_array(parray.typed_data::(), None), + PType::F64 => ALPFloat::encode_to_array(parray.typed_data::(), None), + _ => panic!("Unsupported ptype"), + }; ALPArray::new(encoded, exponents, patches) } -fn alp_encode_parts(parray: &PrimitiveArray) -> (ArrayRef, ALPExponents, Option) { - match parray.ptype() { - PType::F32 => { - alp_encode_primitive(parray.buffer().typed_data::(), parray.validity(), None) - } - PType::F64 => { - alp_encode_primitive(parray.buffer().typed_data::(), parray.validity(), None) - } - _ => panic!("Unsupported ptype"), +trait ALPFloat: NativePType + Float { + type ALPInt: NativePType + PrimInt; + const FRACTIONAL_BITS: u8; + const SWEET: Self; + const F10: &'static [Self]; // TODO(ngates): const exprs for these to be arrays. + const IF10: &'static [Self]; + + /// Round to the nearest floating integer by shifting in and out of the low precision range. + fn fast_round(self) -> Self { + (self + Self::SWEET) - Self::SWEET } -} -fn alp_encode_like_parts( - parray: &PrimitiveArray, - sample: &ALPArray, -) -> (ArrayRef, ALPExponents, Option) { - match parray.ptype() { - PType::F32 => alp_encode_primitive( - parray.buffer().typed_data::(), - parray.validity(), - Some(sample.exponents()), - ), - PType::F64 => alp_encode_primitive( - parray.buffer().typed_data::(), - parray.validity(), - Some(sample.exponents()), - ), - _ => panic!("Unsupported ptype"), + fn find_best_exponents(_values: &[Self]) -> Exponents { + Exponents { e: 16, f: 13 } } -} -fn alp_encode_primitive( - values: &[T], - validity: Option<&dyn Array>, - exponents: Option, -) -> (ArrayRef, ALPExponents, Option) -where - T::EncInt: NativePType, -{ - // TODO: actually handle CodecErrors instead of blindly unwrapping - let ALPEncoded { - values, - exponents, - exceptions_idx, - num_exceptions, - } = exponents - .map(|exp| alp::encode_with(values, exp)) - .unwrap_or_else(|| alp::encode(values)) - .unwrap(); - let values = PrimitiveArray::from_nullable_in(values, validity.clone_optional()); // move and re-alias - - let patches = if num_exceptions == 0 { - None - } else { - let patch_indices = codecz::utils::into_u64_vec(&exceptions_idx, num_exceptions); - let patch_values = codecz::utils::gather_patches( - values.buffer().typed_data::(), - patch_indices.as_slice(), - ); - Some( - SparseArray::new( - PrimitiveArray::from_vec_in(patch_indices).boxed(), - PrimitiveArray::from_vec_in(patch_values).boxed(), - values.len(), - ) - .boxed(), + fn encode_to_array( + values: &[Self], + exponents: Option<&Exponents>, + ) -> (Exponents, ArrayRef, Option) { + let best_exponents = + exponents.map_or_else(|| Self::find_best_exponents(values), Exponents::clone); + let (values, exc_pos, exc) = Self::encode(values, &best_exponents); + let len = values.len(); + ( + best_exponents, + PrimitiveArray::from_vec(values).boxed(), + (exc.len() > 0).then(|| { + SparseArray::new( + PrimitiveArray::from_vec(exc_pos).boxed(), + PrimitiveArray::from_vec(exc).boxed(), + len, + ) + .boxed() + }), ) - }; + } - (values.boxed(), exponents, patches) + fn encode(values: &[Self], exponents: &Exponents) -> (Vec, Vec, Vec) { + let mut exc_pos = Vec::new(); + let mut exc_value = Vec::new(); + let encoded = values + .iter() + .enumerate() + .map(|(i, v)| { + let encoded = + (*v * Self::F10[exponents.e as usize] * Self::IF10[exponents.f as usize]) + .fast_round(); + let decoded = + encoded * Self::F10[exponents.f as usize] * Self::IF10[exponents.e as usize]; + + if encoded != decoded { + exc_pos.push(i as u64); + exc_value.push(*v); + // TODO(ngates): we could find previous? + Self::default() + } else { + *v + } + }) + .map(|v| cast(v).unwrap()) + .collect_vec(); + + (encoded, exc_pos, exc_value) + } } -#[allow(dead_code)] -pub fn alp_decode(parray: &PrimitiveArray, exp: ALPExponents) -> PrimitiveArray { - match parray.ptype() { - PType::I32 => PrimitiveArray::from_vec_in( - alp::decode::(parray.buffer().typed_data::(), exp).unwrap(), - ), - PType::I64 => PrimitiveArray::from_vec_in( - alp::decode::(parray.buffer().typed_data::(), exp).unwrap(), - ), - _ => panic!("Unsupported ptype"), +impl ALPFloat for f32 { + type ALPInt = i32; + const FRACTIONAL_BITS: u8 = 23; + const SWEET: Self = + (1 << Self::FRACTIONAL_BITS) as Self + (1 << Self::FRACTIONAL_BITS - 1) as Self; + + const F10: &'static [Self] = &[ + 1.0, + 10.0, + 100.0, + 1000.0, + 10000.0, + 100000.0, + 1000000.0, + 10000000.0, + 100000000.0, + 1000000000.0, + 10000000000.0, + ]; + const IF10: &'static [Self] = &[ + 1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + ]; +} + +impl ALPFloat for f64 { + type ALPInt = i64; + const FRACTIONAL_BITS: u8 = 52; + const SWEET: Self = + (1u64 << Self::FRACTIONAL_BITS) as Self + (1u64 << Self::FRACTIONAL_BITS - 1) as Self; + const F10: &'static [Self] = &[ + 1.0, + 10.0, + 100.0, + 1000.0, + 10000.0, + 100000.0, + 1000000.0, + 10000000.0, + 100000000.0, + 1000000000.0, + 10000000000.0, + 100000000000.0, + 1000000000000.0, + 10000000000000.0, + 100000000000000.0, + 1000000000000000.0, + 10000000000000000.0, + 100000000000000000.0, + 1000000000000000000.0, + 10000000000000000000.0, + 100000000000000000000.0, + 1000000000000000000000.0, + 10000000000000000000000.0, + 100000000000000000000000.0, + ]; + + const IF10: &'static [Self] = &[ + 1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + 0.00000000001, + 0.000000000001, + 0.0000000000001, + 0.00000000000001, + 0.000000000000001, + 0.0000000000000001, + 0.00000000000000001, + 0.000000000000000001, + 0.0000000000000000001, + 0.00000000000000000001, + ]; +} + +// +// #[allow(dead_code)] +// pub fn alp_decode(parray: &PrimitiveArray, exp: ALPExponents) -> PrimitiveArray { +// match parray.ptype() { +// PType::I32 => PrimitiveArray::from_vec_in( +// alp::decode::(parray.buffer().typed_data::(), exp).unwrap(), +// ), +// PType::I64 => PrimitiveArray::from_vec_in( +// alp::decode::(parray.buffer().typed_data::(), exp).unwrap(), +// ), +// _ => panic!("Unsupported ptype"), +// } +// } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + // Create a range offset by a million + let array = PrimitiveArray::from_vec(vec![1.234; 1024]); + let encoded = alp_encode(&array); + println!("Encoded {:?}", encoded); + assert_eq!(encoded.exponents(), &Exponents { e: 0, f: 0 }); } } diff --git a/vortex-alp/src/compute.rs b/vortex-alp/src/compute.rs index d85d6e1072..41b01512bc 100644 --- a/vortex-alp/src/compute.rs +++ b/vortex-alp/src/compute.rs @@ -1,11 +1,8 @@ use crate::ALPArray; -use codecz::alp; -use vortex::array::Array; -use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; +use vortex::compute::scalar_at::ScalarAtFn; use vortex::compute::ArrayCompute; -use vortex::dtype::{DType, FloatWidth}; use vortex::error::VortexResult; -use vortex::scalar::{NullableScalar, Scalar, ScalarRef}; +use vortex::scalar::ScalarRef; impl ArrayCompute for ALPArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -14,34 +11,35 @@ impl ArrayCompute for ALPArray { } impl ScalarAtFn for ALPArray { - fn scalar_at(&self, index: usize) -> VortexResult { - if let Some(patch) = self - .patches() - .and_then(|p| scalar_at(p, index).ok()) - .and_then(|p| p.into_nonnull()) - { - return Ok(patch); - } - - let Some(encoded_val) = scalar_at(self.encoded(), index)?.into_nonnull() else { - return Ok(NullableScalar::none(self.dtype().clone()).boxed()); - }; - match self.dtype() { - DType::Float(FloatWidth::_32, _) => { - let encoded_val: i32 = encoded_val.try_into().unwrap(); - Ok(alp::decode_single::(encoded_val, self.exponents()) - .unwrap() - .into()) - } - - DType::Float(FloatWidth::_64, _) => { - let encoded_val: i64 = encoded_val.try_into().unwrap(); - Ok(alp::decode_single::(encoded_val, self.exponents()) - .unwrap() - .into()) - } - - _ => unreachable!(), - } + fn scalar_at(&self, _index: usize) -> VortexResult { + todo!() + // if let Some(patch) = self + // .patches() + // .and_then(|p| scalar_at(p, index).ok()) + // .and_then(|p| p.into_nonnull()) + // { + // return Ok(patch); + // } + // + // let Some(encoded_val) = scalar_at(self.encoded(), index)?.into_nonnull() else { + // return Ok(NullableScalar::none(self.dtype().clone()).boxed()); + // }; + // match self.dtype() { + // DType::Float(FloatWidth::_32, _) => { + // let encoded_val: i32 = encoded_val.try_into().unwrap(); + // Ok(alp::decode_single::(encoded_val, self.exponents()) + // .unwrap() + // .into()) + // } + // + // DType::Float(FloatWidth::_64, _) => { + // let encoded_val: i64 = encoded_val.try_into().unwrap(); + // Ok(alp::decode_single::(encoded_val, self.exponents()) + // .unwrap() + // .into()) + // } + // + // _ => unreachable!(), + // } } } diff --git a/vortex-alp/src/serde.rs b/vortex-alp/src/serde.rs index 58ef3e090e..3d3e4215a2 100644 --- a/vortex-alp/src/serde.rs +++ b/vortex-alp/src/serde.rs @@ -1,12 +1,11 @@ use std::io; use std::io::ErrorKind; -use codecz::alp::ALPExponents; use vortex::array::{Array, ArrayRef}; use vortex::dtype::{DType, FloatWidth, Signedness}; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use crate::{ALPArray, ALPEncoding}; +use crate::{ALPArray, ALPEncoding, Exponents}; impl ArraySerde for ALPArray { fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { @@ -39,7 +38,7 @@ impl EncodingSerde for ALPEncoding { let encoded = ctx.with_schema(&encoded_dtype).read()?; Ok(ALPArray::new( encoded, - ALPExponents { + Exponents { e: exponents[0], f: exponents[1], }, From 465ecad8488234c9d314a4718c4de7ebf2c666bc Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 5 Mar 2024 14:04:13 +0000 Subject: [PATCH 02/10] ALP --- vortex-alp/src/compress.rs | 40 +++++++++++++++++++++++++--- vortex/src/array/chunked/compress.rs | 7 +++-- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index a6986ce74a..5c6961d36d 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -82,8 +82,38 @@ trait ALPFloat: NativePType + Float { (self + Self::SWEET) - Self::SWEET } - fn find_best_exponents(_values: &[Self]) -> Exponents { - Exponents { e: 16, f: 13 } + fn find_best_exponents(values: &[Self]) -> Exponents { + let mut best_e: usize = 0; + let mut best_f: usize = 0; + let mut best_nbytes: usize = usize::MAX; + + // TODO(wmanning): idea, start with highest e, then find the best f + // after that, try e's in descending order, with a gap no larger than the original e - f + for e in 0..Self::F10.len() - 1 { + for f in 0..e { + let (_, encoded, patches) = Self::encode_to_array( + values, + Some(&Exponents { + e: e as u8, + f: f as u8, + }), + ); + let size = encoded.nbytes() + patches.map_or(0, |p| p.nbytes()); + if size < best_nbytes { + best_nbytes = size; + best_e = e; + best_f = f; + } else if size == best_nbytes && e - f < best_e - best_f { + best_e = e; + best_f = f; + } + } + } + + Exponents { + e: best_e as u8, + f: best_f as u8, + } } fn encode_to_array( @@ -121,7 +151,7 @@ trait ALPFloat: NativePType + Float { let decoded = encoded * Self::F10[exponents.f as usize] * Self::IF10[exponents.e as usize]; - if encoded != decoded { + if decoded != *v { exc_pos.push(i as u64); exc_value.push(*v); // TODO(ngates): we could find previous? @@ -225,6 +255,9 @@ impl ALPFloat for f64 { 0.000000000000000001, 0.0000000000000000001, 0.00000000000000000001, + 0.000000000000000000001, + 0.0000000000000000000001, + 0.00000000000000000000001, ]; } @@ -252,6 +285,7 @@ mod test { let array = PrimitiveArray::from_vec(vec![1.234; 1024]); let encoded = alp_encode(&array); println!("Encoded {:?}", encoded); + assert_eq!(encoded.patches(), None); assert_eq!(encoded.exponents(), &Exponents { e: 0, f: 0 }); } } diff --git a/vortex/src/array/chunked/compress.rs b/vortex/src/array/chunked/compress.rs index 1268ba6dfd..a5bf11aa54 100644 --- a/vortex/src/array/chunked/compress.rs +++ b/vortex/src/array/chunked/compress.rs @@ -1,9 +1,8 @@ -use rayon::prelude::*; - use crate::array::chunked::{ChunkedArray, ChunkedEncoding}; use crate::array::downcast::DowncastArrayBuiltin; use crate::array::{Array, ArrayRef}; use crate::compress::{CompressConfig, CompressCtx, Compressor, EncodingCompression}; +use itertools::Itertools; impl EncodingCompression for ChunkedEncoding { fn compressor( @@ -27,7 +26,7 @@ fn chunked_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: Compress .map(|c_like| { chunked_array .chunks() - .par_iter() + .iter() .zip_eq(c_like.chunks()) .map(|(chunk, chunk_like)| ctx.compress(chunk.as_ref(), Some(chunk_like.as_ref()))) .collect() @@ -35,7 +34,7 @@ fn chunked_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: Compress .unwrap_or_else(|| { chunked_array .chunks() - .par_iter() + .iter() .map(|chunk| ctx.compress(chunk.as_ref(), None)) .collect() }); From 55d407b488855867e7c48a034771fbd741fa51a4 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 5 Mar 2024 14:14:05 +0000 Subject: [PATCH 03/10] ALP --- vortex-alp/src/compress.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index 5c6961d36d..14d05ec188 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -51,11 +51,13 @@ fn alp_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: CompressCtx) ALPArray::new( ctx.next_level() - .compress(encoded.as_ref(), like_alp.map(|a| a.encoded())), + //.compress(encoded.as_ref(), like_alp.map(|a| a.encoded())), + .compress(encoded.as_ref(), None), exponents, patches.map(|p| { ctx.next_level() - .compress(p.as_ref(), like_alp.and_then(|a| a.patches())) + //.compress(p.as_ref(), like_alp.and_then(|a| a.patches())) + .compress(p.as_ref(), None) }), ) .boxed() From cecded6bb53d69aa1d2bca12c22cefcccb81601c Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 5 Mar 2024 14:40:01 +0000 Subject: [PATCH 04/10] ALP --- vortex-alp/src/compress.rs | 71 +++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index 14d05ec188..935906afd4 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -1,6 +1,6 @@ use itertools::Itertools; use log::debug; -use num_traits::{cast, Float, PrimInt}; +use num_traits::{Float, NumCast, PrimInt}; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; @@ -51,13 +51,13 @@ fn alp_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: CompressCtx) ALPArray::new( ctx.next_level() - //.compress(encoded.as_ref(), like_alp.map(|a| a.encoded())), - .compress(encoded.as_ref(), None), + .compress(encoded.as_ref(), like_alp.map(|a| a.encoded())), + // .compress(encoded.as_ref(), None), exponents, patches.map(|p| { ctx.next_level() - //.compress(p.as_ref(), like_alp.and_then(|a| a.patches())) - .compress(p.as_ref(), None) + .compress(p.as_ref(), like_alp.and_then(|a| a.patches())) + // .compress(p.as_ref(), None) }), ) .boxed() @@ -75,6 +75,7 @@ pub fn alp_encode(parray: &PrimitiveArray) -> ALPArray { trait ALPFloat: NativePType + Float { type ALPInt: NativePType + PrimInt; const FRACTIONAL_BITS: u8; + const MAX_EXPONENT: u8; const SWEET: Self; const F10: &'static [Self]; // TODO(ngates): const exprs for these to be arrays. const IF10: &'static [Self]; @@ -85,21 +86,16 @@ trait ALPFloat: NativePType + Float { } fn find_best_exponents(values: &[Self]) -> Exponents { - let mut best_e: usize = 0; - let mut best_f: usize = 0; + let mut best_e: u8 = 0; + let mut best_f: u8 = 0; let mut best_nbytes: usize = usize::MAX; // TODO(wmanning): idea, start with highest e, then find the best f // after that, try e's in descending order, with a gap no larger than the original e - f - for e in 0..Self::F10.len() - 1 { + for e in 0..Self::MAX_EXPONENT { for f in 0..e { - let (_, encoded, patches) = Self::encode_to_array( - values, - Some(&Exponents { - e: e as u8, - f: f as u8, - }), - ); + let (_, encoded, patches) = + Self::encode_to_array(values, Some(&Exponents { e, f })); let size = encoded.nbytes() + patches.map_or(0, |p| p.nbytes()); if size < best_nbytes { best_nbytes = size; @@ -143,6 +139,7 @@ trait ALPFloat: NativePType + Float { fn encode(values: &[Self], exponents: &Exponents) -> (Vec, Vec, Vec) { let mut exc_pos = Vec::new(); let mut exc_value = Vec::new(); + let mut prev = Self::ALPInt::default(); let encoded = values .iter() .enumerate() @@ -153,16 +150,18 @@ trait ALPFloat: NativePType + Float { let decoded = encoded * Self::F10[exponents.f as usize] * Self::IF10[exponents.e as usize]; - if decoded != *v { - exc_pos.push(i as u64); - exc_value.push(*v); - // TODO(ngates): we could find previous? - Self::default() - } else { - *v + if decoded == *v { + if let Some(e) = <::ALPInt as NumCast>::from(encoded) { + prev = e; + return e; + } } + + exc_pos.push(i as u64); + exc_value.push(*v); + // Emit the last known good value. This helps with run-end encoding. + prev }) - .map(|v| cast(v).unwrap()) .collect_vec(); (encoded, exc_pos, exc_value) @@ -172,6 +171,7 @@ trait ALPFloat: NativePType + Float { impl ALPFloat for f32 { type ALPInt = i32; const FRACTIONAL_BITS: u8 = 23; + const MAX_EXPONENT: u8 = 10; const SWEET: Self = (1 << Self::FRACTIONAL_BITS) as Self + (1 << Self::FRACTIONAL_BITS - 1) as Self; @@ -205,6 +205,7 @@ impl ALPFloat for f32 { impl ALPFloat for f64 { type ALPInt = i64; + const MAX_EXPONENT: u8 = 18; // 10^18 is the maximum i64 const FRACTIONAL_BITS: u8 = 52; const SWEET: Self = (1u64 << Self::FRACTIONAL_BITS) as Self + (1u64 << Self::FRACTIONAL_BITS - 1) as Self; @@ -263,20 +264,6 @@ impl ALPFloat for f64 { ]; } -// -// #[allow(dead_code)] -// pub fn alp_decode(parray: &PrimitiveArray, exp: ALPExponents) -> PrimitiveArray { -// match parray.ptype() { -// PType::I32 => PrimitiveArray::from_vec_in( -// alp::decode::(parray.buffer().typed_data::(), exp).unwrap(), -// ), -// PType::I64 => PrimitiveArray::from_vec_in( -// alp::decode::(parray.buffer().typed_data::(), exp).unwrap(), -// ), -// _ => panic!("Unsupported ptype"), -// } -// } - #[cfg(test)] mod test { use super::*; @@ -284,10 +271,14 @@ mod test { #[test] fn test_compress() { // Create a range offset by a million - let array = PrimitiveArray::from_vec(vec![1.234; 1024]); + let array = PrimitiveArray::from_vec(vec![1.234f32; 10]); let encoded = alp_encode(&array); println!("Encoded {:?}", encoded); - assert_eq!(encoded.patches(), None); - assert_eq!(encoded.exponents(), &Exponents { e: 0, f: 0 }); + assert!(encoded.patches().is_none()); + assert_eq!( + encoded.encoded().as_primitive().typed_data::(), + vec![1234; 10] + ); + assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); } } From 5ae0e9b81cbb7ca4b17e5d026a7489df95363cb7 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 5 Mar 2024 16:08:03 +0000 Subject: [PATCH 05/10] ALP --- vortex-alp/src/compress.rs | 71 +++++++++++++++++++++++++------------- vortex-alp/src/compute.rs | 66 ++++++++++++++++++----------------- vortex-array/src/lib.rs | 2 -- 3 files changed, 81 insertions(+), 58 deletions(-) diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index 935906afd4..2ce182f9f1 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -7,12 +7,15 @@ use vortex::array::primitive::PrimitiveArray; use vortex::array::sparse::SparseArray; use vortex::array::{Array, ArrayRef}; use vortex::compress::{CompressConfig, CompressCtx, Compressor, EncodingCompression}; +use vortex::error::{VortexError, VortexResult}; use vortex::ptype::{NativePType, PType}; use crate::alp::{ALPArray, ALPEncoding}; use crate::downcast::DowncastALP; use crate::Exponents; +const SAMPLE_SIZE: usize = 32; + impl EncodingCompression for ALPEncoding { fn compressor( &self, @@ -49,27 +52,25 @@ fn alp_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: CompressCtx) _ => panic!("Unsupported ptype"), }; - ALPArray::new( + let compressed_encoded = ctx + .next_level() + .compress(encoded.as_ref(), like_alp.map(|a| a.encoded())); + + let compressed_patches = patches.map(|p| { ctx.next_level() - .compress(encoded.as_ref(), like_alp.map(|a| a.encoded())), - // .compress(encoded.as_ref(), None), - exponents, - patches.map(|p| { - ctx.next_level() - .compress(p.as_ref(), like_alp.and_then(|a| a.patches())) - // .compress(p.as_ref(), None) - }), - ) - .boxed() + .compress(p.as_ref(), like_alp.and_then(|a| a.patches())) + }); + + ALPArray::new(compressed_encoded, exponents, compressed_patches).boxed() } -pub fn alp_encode(parray: &PrimitiveArray) -> ALPArray { +pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult { let (exponents, encoded, patches) = match parray.ptype() { PType::F32 => ALPFloat::encode_to_array(parray.typed_data::(), None), PType::F64 => ALPFloat::encode_to_array(parray.typed_data::(), None), - _ => panic!("Unsupported ptype"), + _ => return Err(VortexError::InvalidPType(parray.ptype().clone())), }; - ALPArray::new(encoded, exponents, patches) + Ok(ALPArray::new(encoded, exponents, patches)) } trait ALPFloat: NativePType + Float { @@ -77,7 +78,7 @@ trait ALPFloat: NativePType + Float { const FRACTIONAL_BITS: u8; const MAX_EXPONENT: u8; const SWEET: Self; - const F10: &'static [Self]; // TODO(ngates): const exprs for these to be arrays. + const F10: &'static [Self]; const IF10: &'static [Self]; /// Round to the nearest floating integer by shifting in and out of the low precision range. @@ -90,12 +91,22 @@ trait ALPFloat: NativePType + Float { let mut best_f: u8 = 0; let mut best_nbytes: usize = usize::MAX; + let sample = (values.len() > SAMPLE_SIZE).then(|| { + values + .iter() + .step_by(values.len() / SAMPLE_SIZE) + .cloned() + .collect_vec() + }); + // TODO(wmanning): idea, start with highest e, then find the best f // after that, try e's in descending order, with a gap no larger than the original e - f for e in 0..Self::MAX_EXPONENT { for f in 0..e { - let (_, encoded, patches) = - Self::encode_to_array(values, Some(&Exponents { e, f })); + let (_, encoded, patches) = Self::encode_to_array( + sample.as_deref().unwrap_or(values), + Some(&Exponents { e, f }), + ); let size = encoded.nbytes() + patches.map_or(0, |p| p.nbytes()); if size < best_nbytes { best_nbytes = size; @@ -109,8 +120,8 @@ trait ALPFloat: NativePType + Float { } Exponents { - e: best_e as u8, - f: best_f as u8, + e: best_e, + f: best_f, } } @@ -205,8 +216,8 @@ impl ALPFloat for f32 { impl ALPFloat for f64 { type ALPInt = i64; - const MAX_EXPONENT: u8 = 18; // 10^18 is the maximum i64 const FRACTIONAL_BITS: u8 = 52; + const MAX_EXPONENT: u8 = 18; // 10^18 is the maximum i64 const SWEET: Self = (1u64 << Self::FRACTIONAL_BITS) as Self + (1u64 << Self::FRACTIONAL_BITS - 1) as Self; const F10: &'static [Self] = &[ @@ -270,14 +281,26 @@ mod test { #[test] fn test_compress() { - // Create a range offset by a million - let array = PrimitiveArray::from_vec(vec![1.234f32; 10]); - let encoded = alp_encode(&array); + let array = PrimitiveArray::from_vec(vec![1.234f32; 1025]); + let encoded = alp_encode(&array).unwrap(); + println!("Encoded {:?}", encoded); + assert!(encoded.patches().is_none()); + assert_eq!( + encoded.encoded().as_primitive().typed_data::(), + vec![1234; 1025] + ); + assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); + } + + #[test] + fn test_nullable_compress() { + let array = PrimitiveArray::from_iter(vec![1.234f32; 1025]); + let encoded = alp_encode(&array).unwrap(); println!("Encoded {:?}", encoded); assert!(encoded.patches().is_none()); assert_eq!( encoded.encoded().as_primitive().typed_data::(), - vec![1234; 10] + vec![1234; 1025] ); assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); } diff --git a/vortex-alp/src/compute.rs b/vortex-alp/src/compute.rs index 41b01512bc..12d888260e 100644 --- a/vortex-alp/src/compute.rs +++ b/vortex-alp/src/compute.rs @@ -1,8 +1,10 @@ use crate::ALPArray; -use vortex::compute::scalar_at::ScalarAtFn; +use vortex::array::Array; +use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; +use vortex::dtype::{DType, FloatWidth}; use vortex::error::VortexResult; -use vortex::scalar::ScalarRef; +use vortex::scalar::{NullableScalar, ScalarRef}; impl ArrayCompute for ALPArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -11,35 +13,35 @@ impl ArrayCompute for ALPArray { } impl ScalarAtFn for ALPArray { - fn scalar_at(&self, _index: usize) -> VortexResult { - todo!() - // if let Some(patch) = self - // .patches() - // .and_then(|p| scalar_at(p, index).ok()) - // .and_then(|p| p.into_nonnull()) - // { - // return Ok(patch); - // } - // - // let Some(encoded_val) = scalar_at(self.encoded(), index)?.into_nonnull() else { - // return Ok(NullableScalar::none(self.dtype().clone()).boxed()); - // }; - // match self.dtype() { - // DType::Float(FloatWidth::_32, _) => { - // let encoded_val: i32 = encoded_val.try_into().unwrap(); - // Ok(alp::decode_single::(encoded_val, self.exponents()) - // .unwrap() - // .into()) - // } - // - // DType::Float(FloatWidth::_64, _) => { - // let encoded_val: i64 = encoded_val.try_into().unwrap(); - // Ok(alp::decode_single::(encoded_val, self.exponents()) - // .unwrap() - // .into()) - // } - // - // _ => unreachable!(), - // } + fn scalar_at(&self, index: usize) -> VortexResult { + if let Some(patch) = self + .patches() + .and_then(|p| scalar_at(p, index).ok()) + .and_then(|p| p.into_nonnull()) + { + return Ok(patch); + } + + let Some(encoded_val) = scalar_at(self.encoded(), index)?.into_nonnull() else { + return Ok(NullableScalar::none(self.dtype().clone()).boxed()); + }; + + match self.dtype() { + DType::Float(FloatWidth::_32, _) => { + let encoded_val: i32 = encoded_val.try_into().unwrap(); + Ok(alp::decode_single::(encoded_val, self.exponents()) + .unwrap() + .into()) + } + + DType::Float(FloatWidth::_64, _) => { + let encoded_val: i64 = encoded_val.try_into().unwrap(); + Ok(alp::decode_single::(encoded_val, self.exponents()) + .unwrap() + .into()) + } + + _ => unreachable!(), + } } } diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 0fbd8789b5..fad5ab605f 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -1,5 +1,3 @@ -#![feature(iterator_try_collect)] - pub mod array; pub mod arrow; pub mod scalar; From b7ef3a4440079b7a195403d44c29c31230e35199 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 5 Mar 2024 16:31:29 +0000 Subject: [PATCH 06/10] ALP --- vortex-alp/src/alp.rs | 365 +++++++++++++++++++++---------------- vortex-alp/src/array.rs | 172 +++++++++++++++++ vortex-alp/src/compress.rs | 225 ++--------------------- vortex-alp/src/compute.rs | 24 +-- vortex-alp/src/lib.rs | 3 +- vortex-alp/src/serde.rs | 7 +- 6 files changed, 411 insertions(+), 385 deletions(-) create mode 100644 vortex-alp/src/array.rs diff --git a/vortex-alp/src/alp.rs b/vortex-alp/src/alp.rs index ad150ddfe4..ff1a8e9165 100644 --- a/vortex-alp/src/alp.rs +++ b/vortex-alp/src/alp.rs @@ -1,16 +1,12 @@ -use std::any::Any; -use std::sync::{Arc, RwLock}; +use itertools::Itertools; +use num_traits::{Float, NumCast, PrimInt}; +use vortex::array::primitive::PrimitiveArray; +use vortex::array::sparse::SparseArray; +use vortex::ptype::NativePType; -pub use codecz::alp::ALPExponents; -use vortex::array::{Array, ArrayKind, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef}; -use vortex::compress::EncodingCompression; -use vortex::dtype::{DType, IntWidth, Signedness}; -use vortex::error::{VortexError, VortexResult}; -use vortex::formatter::{ArrayDisplay, ArrayFormatter}; -use vortex::serde::{ArraySerde, EncodingSerde}; -use vortex::stats::{Stats, StatsSet}; +use vortex::array::{Array, ArrayRef}; -use crate::compress::alp_encode; +const SAMPLE_SIZE: usize = 32; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Exponents { @@ -18,160 +14,213 @@ pub struct Exponents { pub f: u8, } -#[derive(Debug, Clone)] -pub struct ALPArray { - encoded: ArrayRef, - exponents: Exponents, - patches: Option, - dtype: DType, - stats: Arc>, -} - -impl ALPArray { - pub fn new(encoded: ArrayRef, exponents: Exponents, patches: Option) -> Self { - Self::try_new(encoded, exponents, patches).unwrap() - } - - pub fn try_new( - encoded: ArrayRef, - exponents: Exponents, - patches: Option, - ) -> VortexResult { - let dtype = match encoded.dtype() { - d @ DType::Int(width, Signedness::Signed, nullability) => match width { - IntWidth::_32 => DType::Float(32.into(), *nullability), - IntWidth::_64 => DType::Float(64.into(), *nullability), - _ => return Err(VortexError::InvalidDType(d.clone())), - }, - d => return Err(VortexError::InvalidDType(d.clone())), - }; - Ok(Self { - encoded, - exponents, - patches, - dtype, - stats: Arc::new(RwLock::new(StatsSet::new())), - }) - } - - pub fn encode(array: &dyn Array) -> VortexResult { - match ArrayKind::from(array) { - ArrayKind::Primitive(p) => Ok(alp_encode(p).boxed()), - _ => Err(VortexError::InvalidEncoding(array.encoding().id().clone())), +pub trait ALPFloat: NativePType + Float { + type ALPInt: NativePType + PrimInt; + const FRACTIONAL_BITS: u8; + const MAX_EXPONENT: u8; + const SWEET: Self; + const F10: &'static [Self]; + const IF10: &'static [Self]; + + /// Round to the nearest floating integer by shifting in and out of the low precision range. + fn fast_round(self) -> Self { + (self + Self::SWEET) - Self::SWEET + } + + fn as_int(self) -> Option { + ::from(self) + } + + fn find_best_exponents(values: &[Self]) -> Exponents { + let mut best_e: u8 = 0; + let mut best_f: u8 = 0; + let mut best_nbytes: usize = usize::MAX; + + let sample = (values.len() > SAMPLE_SIZE).then(|| { + values + .iter() + .step_by(values.len() / SAMPLE_SIZE) + .cloned() + .collect_vec() + }); + + // TODO(wmanning): idea, start with highest e, then find the best f + // after that, try e's in descending order, with a gap no larger than the original e - f + for e in 0..Self::MAX_EXPONENT { + for f in 0..e { + let (_, encoded, patches) = Self::encode_to_array( + sample.as_deref().unwrap_or(values), + Some(&Exponents { e, f }), + ); + let size = encoded.nbytes() + patches.map_or(0, |p| p.nbytes()); + if size < best_nbytes { + best_nbytes = size; + best_e = e; + best_f = f; + } else if size == best_nbytes && e - f < best_e - best_f { + best_e = e; + best_f = f; + } + } } - } - - pub fn encoded(&self) -> &dyn Array { - self.encoded.as_ref() - } - - pub fn exponents(&self) -> &Exponents { - &self.exponents - } - - pub fn patches(&self) -> Option<&dyn Array> { - self.patches.as_deref() - } -} - -impl Array for ALPArray { - #[inline] - fn as_any(&self) -> &dyn Any { - self - } - - #[inline] - fn boxed(self) -> ArrayRef { - Box::new(self) - } - - #[inline] - fn into_any(self: Box) -> Box { - self - } - - #[inline] - fn len(&self) -> usize { - self.encoded.len() - } - - #[inline] - fn is_empty(&self) -> bool { - self.encoded.is_empty() - } - #[inline] - fn dtype(&self) -> &DType { - &self.dtype - } - - #[inline] - fn stats(&self) -> Stats { - Stats::new(&self.stats, self) - } - - fn iter_arrow(&self) -> Box { - todo!() - } - - fn slice(&self, start: usize, stop: usize) -> VortexResult { - Ok(Self::try_new( - self.encoded().slice(start, stop)?, - self.exponents().clone(), - self.patches().map(|p| p.slice(start, stop)).transpose()?, - )? - .boxed()) - } - - #[inline] - fn encoding(&self) -> EncodingRef { - &ALPEncoding - } - - #[inline] - fn nbytes(&self) -> usize { - self.encoded().nbytes() + self.patches().map(|p| p.nbytes()).unwrap_or(0) - } - - fn serde(&self) -> &dyn ArraySerde { - self - } -} - -impl<'arr> AsRef<(dyn Array + 'arr)> for ALPArray { - fn as_ref(&self) -> &(dyn Array + 'arr) { - self + Exponents { + e: best_e, + f: best_f, + } } -} -impl ArrayDisplay for ALPArray { - fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { - f.writeln(format!("exponents: {:?}", self.exponents()))?; - if let Some(p) = self.patches() { - f.writeln("patches:")?; - f.indent(|indent| indent.array(p.as_ref()))?; - } - f.indent(|indent| indent.array(self.encoded())) + fn encode_to_array( + values: &[Self], + exponents: Option<&Exponents>, + ) -> (Exponents, ArrayRef, Option) { + let best_exponents = + exponents.map_or_else(|| Self::find_best_exponents(values), Exponents::clone); + let (values, exc_pos, exc) = Self::encode(values, &best_exponents); + let len = values.len(); + ( + best_exponents, + PrimitiveArray::from_vec(values).boxed(), + (exc.len() > 0).then(|| { + SparseArray::new( + PrimitiveArray::from_vec(exc_pos).boxed(), + PrimitiveArray::from_vec(exc).boxed(), + len, + ) + .boxed() + }), + ) + } + + fn encode(values: &[Self], exponents: &Exponents) -> (Vec, Vec, Vec) { + let mut exc_pos = Vec::new(); + let mut exc_value = Vec::new(); + let mut prev = Self::ALPInt::default(); + let encoded = values + .iter() + .enumerate() + .map(|(i, v)| { + let encoded = + (*v * Self::F10[exponents.e as usize] * Self::IF10[exponents.f as usize]) + .fast_round(); + let decoded = + encoded * Self::F10[exponents.f as usize] * Self::IF10[exponents.e as usize]; + + if decoded == *v { + if let Some(e) = encoded.as_int() { + prev = e; + return e; + } + } + + exc_pos.push(i as u64); + exc_value.push(*v); + // Emit the last known good value. This helps with run-end encoding. + prev + }) + .collect_vec(); + + (encoded, exc_pos, exc_value) + } + + fn decode_single(encoded: Self::ALPInt, exponents: &Exponents) -> Self { + let encoded_float: Self = Self::from(encoded).unwrap(); + encoded_float * Self::F10[exponents.f as usize] * Self::IF10[exponents.e as usize] } } -#[derive(Debug)] -pub struct ALPEncoding; - -impl ALPEncoding { - pub const ID: EncodingId = EncodingId::new("vortex.alp"); +impl ALPFloat for f32 { + type ALPInt = i32; + const FRACTIONAL_BITS: u8 = 23; + const MAX_EXPONENT: u8 = 10; + const SWEET: Self = + (1 << Self::FRACTIONAL_BITS) as Self + (1 << Self::FRACTIONAL_BITS - 1) as Self; + + const F10: &'static [Self] = &[ + 1.0, + 10.0, + 100.0, + 1000.0, + 10000.0, + 100000.0, + 1000000.0, + 10000000.0, + 100000000.0, + 1000000000.0, + 10000000000.0, + ]; + const IF10: &'static [Self] = &[ + 1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + ]; } -impl Encoding for ALPEncoding { - fn id(&self) -> &EncodingId { - &Self::ID - } - - fn compression(&self) -> Option<&dyn EncodingCompression> { - Some(self) - } - - fn serde(&self) -> Option<&dyn EncodingSerde> { - Some(self) - } +impl ALPFloat for f64 { + type ALPInt = i64; + const FRACTIONAL_BITS: u8 = 52; + const MAX_EXPONENT: u8 = 18; // 10^18 is the maximum i64 + const SWEET: Self = + (1u64 << Self::FRACTIONAL_BITS) as Self + (1u64 << Self::FRACTIONAL_BITS - 1) as Self; + const F10: &'static [Self] = &[ + 1.0, + 10.0, + 100.0, + 1000.0, + 10000.0, + 100000.0, + 1000000.0, + 10000000.0, + 100000000.0, + 1000000000.0, + 10000000000.0, + 100000000000.0, + 1000000000000.0, + 10000000000000.0, + 100000000000000.0, + 1000000000000000.0, + 10000000000000000.0, + 100000000000000000.0, + 1000000000000000000.0, + 10000000000000000000.0, + 100000000000000000000.0, + 1000000000000000000000.0, + 10000000000000000000000.0, + 100000000000000000000000.0, + ]; + + const IF10: &'static [Self] = &[ + 1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + 0.00000000001, + 0.000000000001, + 0.0000000000001, + 0.00000000000001, + 0.000000000000001, + 0.0000000000000001, + 0.00000000000000001, + 0.000000000000000001, + 0.0000000000000000001, + 0.00000000000000000001, + 0.000000000000000000001, + 0.0000000000000000000001, + 0.00000000000000000000001, + ]; } diff --git a/vortex-alp/src/array.rs b/vortex-alp/src/array.rs new file mode 100644 index 0000000000..472a70dafe --- /dev/null +++ b/vortex-alp/src/array.rs @@ -0,0 +1,172 @@ +use std::any::Any; +use std::sync::{Arc, RwLock}; + +use crate::alp::Exponents; +pub use codecz::alp::ALPExponents; +use vortex::array::{Array, ArrayKind, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef}; +use vortex::compress::EncodingCompression; +use vortex::dtype::{DType, IntWidth, Signedness}; +use vortex::error::{VortexError, VortexResult}; +use vortex::formatter::{ArrayDisplay, ArrayFormatter}; +use vortex::serde::{ArraySerde, EncodingSerde}; +use vortex::stats::{Stats, StatsSet}; + +use crate::compress::alp_encode; + +#[derive(Debug, Clone)] +pub struct ALPArray { + encoded: ArrayRef, + exponents: Exponents, + patches: Option, + dtype: DType, + stats: Arc>, +} + +impl ALPArray { + pub fn new(encoded: ArrayRef, exponents: Exponents, patches: Option) -> Self { + Self::try_new(encoded, exponents, patches).unwrap() + } + + pub fn try_new( + encoded: ArrayRef, + exponents: Exponents, + patches: Option, + ) -> VortexResult { + let dtype = match encoded.dtype() { + d @ DType::Int(width, Signedness::Signed, nullability) => match width { + IntWidth::_32 => DType::Float(32.into(), *nullability), + IntWidth::_64 => DType::Float(64.into(), *nullability), + _ => return Err(VortexError::InvalidDType(d.clone())), + }, + d => return Err(VortexError::InvalidDType(d.clone())), + }; + Ok(Self { + encoded, + exponents, + patches, + dtype, + stats: Arc::new(RwLock::new(StatsSet::new())), + }) + } + + pub fn encode(array: &dyn Array) -> VortexResult { + match ArrayKind::from(array) { + ArrayKind::Primitive(p) => Ok(alp_encode(p)?.boxed()), + _ => Err(VortexError::InvalidEncoding(array.encoding().id().clone())), + } + } + + pub fn encoded(&self) -> &dyn Array { + self.encoded.as_ref() + } + + pub fn exponents(&self) -> &Exponents { + &self.exponents + } + + pub fn patches(&self) -> Option<&dyn Array> { + self.patches.as_deref() + } +} + +impl Array for ALPArray { + #[inline] + fn as_any(&self) -> &dyn Any { + self + } + + #[inline] + fn boxed(self) -> ArrayRef { + Box::new(self) + } + + #[inline] + fn into_any(self: Box) -> Box { + self + } + + #[inline] + fn len(&self) -> usize { + self.encoded.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.encoded.is_empty() + } + + #[inline] + fn dtype(&self) -> &DType { + &self.dtype + } + + #[inline] + fn stats(&self) -> Stats { + Stats::new(&self.stats, self) + } + + fn iter_arrow(&self) -> Box { + todo!() + } + + fn slice(&self, start: usize, stop: usize) -> VortexResult { + Ok(Self::try_new( + self.encoded().slice(start, stop)?, + self.exponents().clone(), + self.patches().map(|p| p.slice(start, stop)).transpose()?, + )? + .boxed()) + } + + #[inline] + fn encoding(&self) -> EncodingRef { + &ALPEncoding + } + + #[inline] + fn nbytes(&self) -> usize { + self.encoded().nbytes() + self.patches().map(|p| p.nbytes()).unwrap_or(0) + } + + fn serde(&self) -> &dyn ArraySerde { + self + } +} + +impl<'arr> AsRef<(dyn Array + 'arr)> for ALPArray { + fn as_ref(&self) -> &(dyn Array + 'arr) { + self + } +} + +impl ArrayDisplay for ALPArray { + fn fmt(&self, f: &mut ArrayFormatter) -> std::fmt::Result { + f.writeln(format!("exponents: {:?}", self.exponents()))?; + if let Some(p) = self.patches() { + f.writeln("patches:")?; + f.indent(|indent| indent.array(p.as_ref()))?; + } + f.indent(|indent| indent.array(self.encoded())) + } +} + +#[derive(Debug)] +pub struct ALPEncoding; + +impl ALPEncoding { + pub const ID: EncodingId = EncodingId::new("vortex.alp"); +} + +impl Encoding for ALPEncoding { + fn id(&self) -> &EncodingId { + &Self::ID + } + + fn compression(&self) -> Option<&dyn EncodingCompression> { + Some(self) + } + + fn serde(&self) -> Option<&dyn EncodingSerde> { + Some(self) + } +} diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index 2ce182f9f1..6115637a9f 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -1,20 +1,15 @@ -use itertools::Itertools; use log::debug; -use num_traits::{Float, NumCast, PrimInt}; +use crate::alp::ALPFloat; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; -use vortex::array::sparse::SparseArray; use vortex::array::{Array, ArrayRef}; use vortex::compress::{CompressConfig, CompressCtx, Compressor, EncodingCompression}; use vortex::error::{VortexError, VortexResult}; -use vortex::ptype::{NativePType, PType}; +use vortex::ptype::PType; -use crate::alp::{ALPArray, ALPEncoding}; +use crate::array::{ALPArray, ALPEncoding}; use crate::downcast::DowncastALP; -use crate::Exponents; - -const SAMPLE_SIZE: usize = 32; impl EncodingCompression for ALPEncoding { fn compressor( @@ -41,7 +36,12 @@ impl EncodingCompression for ALPEncoding { fn alp_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: CompressCtx) -> ArrayRef { let like_alp = like.map(|like_array| like_array.as_alp()); - let parray = array.as_primitive(); + let mut parray = array.as_primitive().clone(); + if parray.validity().is_some() { + parray = compute:: + + } + let (exponents, encoded, patches) = match parray.ptype() { PType::F32 => { ALPFloat::encode_to_array(parray.typed_data::(), like_alp.map(|a| a.exponents())) @@ -73,211 +73,10 @@ pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult { Ok(ALPArray::new(encoded, exponents, patches)) } -trait ALPFloat: NativePType + Float { - type ALPInt: NativePType + PrimInt; - const FRACTIONAL_BITS: u8; - const MAX_EXPONENT: u8; - const SWEET: Self; - const F10: &'static [Self]; - const IF10: &'static [Self]; - - /// Round to the nearest floating integer by shifting in and out of the low precision range. - fn fast_round(self) -> Self { - (self + Self::SWEET) - Self::SWEET - } - - fn find_best_exponents(values: &[Self]) -> Exponents { - let mut best_e: u8 = 0; - let mut best_f: u8 = 0; - let mut best_nbytes: usize = usize::MAX; - - let sample = (values.len() > SAMPLE_SIZE).then(|| { - values - .iter() - .step_by(values.len() / SAMPLE_SIZE) - .cloned() - .collect_vec() - }); - - // TODO(wmanning): idea, start with highest e, then find the best f - // after that, try e's in descending order, with a gap no larger than the original e - f - for e in 0..Self::MAX_EXPONENT { - for f in 0..e { - let (_, encoded, patches) = Self::encode_to_array( - sample.as_deref().unwrap_or(values), - Some(&Exponents { e, f }), - ); - let size = encoded.nbytes() + patches.map_or(0, |p| p.nbytes()); - if size < best_nbytes { - best_nbytes = size; - best_e = e; - best_f = f; - } else if size == best_nbytes && e - f < best_e - best_f { - best_e = e; - best_f = f; - } - } - } - - Exponents { - e: best_e, - f: best_f, - } - } - - fn encode_to_array( - values: &[Self], - exponents: Option<&Exponents>, - ) -> (Exponents, ArrayRef, Option) { - let best_exponents = - exponents.map_or_else(|| Self::find_best_exponents(values), Exponents::clone); - let (values, exc_pos, exc) = Self::encode(values, &best_exponents); - let len = values.len(); - ( - best_exponents, - PrimitiveArray::from_vec(values).boxed(), - (exc.len() > 0).then(|| { - SparseArray::new( - PrimitiveArray::from_vec(exc_pos).boxed(), - PrimitiveArray::from_vec(exc).boxed(), - len, - ) - .boxed() - }), - ) - } - - fn encode(values: &[Self], exponents: &Exponents) -> (Vec, Vec, Vec) { - let mut exc_pos = Vec::new(); - let mut exc_value = Vec::new(); - let mut prev = Self::ALPInt::default(); - let encoded = values - .iter() - .enumerate() - .map(|(i, v)| { - let encoded = - (*v * Self::F10[exponents.e as usize] * Self::IF10[exponents.f as usize]) - .fast_round(); - let decoded = - encoded * Self::F10[exponents.f as usize] * Self::IF10[exponents.e as usize]; - - if decoded == *v { - if let Some(e) = <::ALPInt as NumCast>::from(encoded) { - prev = e; - return e; - } - } - - exc_pos.push(i as u64); - exc_value.push(*v); - // Emit the last known good value. This helps with run-end encoding. - prev - }) - .collect_vec(); - - (encoded, exc_pos, exc_value) - } -} - -impl ALPFloat for f32 { - type ALPInt = i32; - const FRACTIONAL_BITS: u8 = 23; - const MAX_EXPONENT: u8 = 10; - const SWEET: Self = - (1 << Self::FRACTIONAL_BITS) as Self + (1 << Self::FRACTIONAL_BITS - 1) as Self; - - const F10: &'static [Self] = &[ - 1.0, - 10.0, - 100.0, - 1000.0, - 10000.0, - 100000.0, - 1000000.0, - 10000000.0, - 100000000.0, - 1000000000.0, - 10000000000.0, - ]; - const IF10: &'static [Self] = &[ - 1.0, - 0.1, - 0.01, - 0.001, - 0.0001, - 0.00001, - 0.000001, - 0.0000001, - 0.00000001, - 0.000000001, - 0.0000000001, - ]; -} - -impl ALPFloat for f64 { - type ALPInt = i64; - const FRACTIONAL_BITS: u8 = 52; - const MAX_EXPONENT: u8 = 18; // 10^18 is the maximum i64 - const SWEET: Self = - (1u64 << Self::FRACTIONAL_BITS) as Self + (1u64 << Self::FRACTIONAL_BITS - 1) as Self; - const F10: &'static [Self] = &[ - 1.0, - 10.0, - 100.0, - 1000.0, - 10000.0, - 100000.0, - 1000000.0, - 10000000.0, - 100000000.0, - 1000000000.0, - 10000000000.0, - 100000000000.0, - 1000000000000.0, - 10000000000000.0, - 100000000000000.0, - 1000000000000000.0, - 10000000000000000.0, - 100000000000000000.0, - 1000000000000000000.0, - 10000000000000000000.0, - 100000000000000000000.0, - 1000000000000000000000.0, - 10000000000000000000000.0, - 100000000000000000000000.0, - ]; - - const IF10: &'static [Self] = &[ - 1.0, - 0.1, - 0.01, - 0.001, - 0.0001, - 0.00001, - 0.000001, - 0.0000001, - 0.00000001, - 0.000000001, - 0.0000000001, - 0.00000000001, - 0.000000000001, - 0.0000000000001, - 0.00000000000001, - 0.000000000000001, - 0.0000000000000001, - 0.00000000000000001, - 0.000000000000000001, - 0.0000000000000000001, - 0.00000000000000000001, - 0.000000000000000000001, - 0.0000000000000000000001, - 0.00000000000000000000001, - ]; -} - #[cfg(test)] mod test { use super::*; + use crate::alp::Exponents; #[test] fn test_compress() { @@ -294,13 +93,13 @@ mod test { #[test] fn test_nullable_compress() { - let array = PrimitiveArray::from_iter(vec![1.234f32; 1025]); + let array = PrimitiveArray::from_iter(vec![None, Some(1.234f32), None]); let encoded = alp_encode(&array).unwrap(); println!("Encoded {:?}", encoded); assert!(encoded.patches().is_none()); assert_eq!( encoded.encoded().as_primitive().typed_data::(), - vec![1234; 1025] + vec![0, 1234, 1234] ); assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); } diff --git a/vortex-alp/src/compute.rs b/vortex-alp/src/compute.rs index 12d888260e..86426ab162 100644 --- a/vortex-alp/src/compute.rs +++ b/vortex-alp/src/compute.rs @@ -1,10 +1,12 @@ +use crate::alp::ALPFloat; use crate::ALPArray; +use std::f32; use vortex::array::Array; use vortex::compute::scalar_at::{scalar_at, ScalarAtFn}; use vortex::compute::ArrayCompute; use vortex::dtype::{DType, FloatWidth}; -use vortex::error::VortexResult; -use vortex::scalar::{NullableScalar, ScalarRef}; +use vortex::error::{VortexError, VortexResult}; +use vortex::scalar::{NullableScalar, Scalar, ScalarRef}; impl ArrayCompute for ALPArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -29,19 +31,19 @@ impl ScalarAtFn for ALPArray { match self.dtype() { DType::Float(FloatWidth::_32, _) => { let encoded_val: i32 = encoded_val.try_into().unwrap(); - Ok(alp::decode_single::(encoded_val, self.exponents()) - .unwrap() - .into()) + Ok(ScalarRef::from(::decode_single( + encoded_val, + self.exponents(), + ))) } - DType::Float(FloatWidth::_64, _) => { let encoded_val: i64 = encoded_val.try_into().unwrap(); - Ok(alp::decode_single::(encoded_val, self.exponents()) - .unwrap() - .into()) + Ok(ScalarRef::from(::decode_single( + encoded_val, + self.exponents(), + ))) } - - _ => unreachable!(), + _ => Err(VortexError::InvalidDType(self.dtype().clone())), } } } diff --git a/vortex-alp/src/lib.rs b/vortex-alp/src/lib.rs index 8ff1d432cc..6f6ec0efd8 100644 --- a/vortex-alp/src/lib.rs +++ b/vortex-alp/src/lib.rs @@ -1,8 +1,9 @@ -pub use alp::*; +pub use array::*; use linkme::distributed_slice; use vortex::array::{EncodingRef, ENCODINGS}; mod alp; +mod array; mod compress; mod compute; mod downcast; diff --git a/vortex-alp/src/serde.rs b/vortex-alp/src/serde.rs index 3d3e4215a2..9419fc13cc 100644 --- a/vortex-alp/src/serde.rs +++ b/vortex-alp/src/serde.rs @@ -1,11 +1,13 @@ use std::io; use std::io::ErrorKind; +use crate::alp::Exponents; use vortex::array::{Array, ArrayRef}; use vortex::dtype::{DType, FloatWidth, Signedness}; use vortex::serde::{ArraySerde, EncodingSerde, ReadCtx, WriteCtx}; -use crate::{ALPArray, ALPEncoding, Exponents}; +use crate::ALPArray; +use crate::ALPEncoding; impl ArraySerde for ALPArray { fn write(&self, ctx: &mut WriteCtx) -> io::Result<()> { @@ -76,7 +78,8 @@ mod test { 0.0004f64, 1000000.0f64, 0.33f64, - ])); + ])) + .unwrap(); let read_arr = roundtrip_array(arr.as_ref()).unwrap(); let read_alp = read_arr.as_alp(); From 6fdf93c229b68d99bc6932c0ecc4872620ea03f7 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 5 Mar 2024 17:11:39 +0000 Subject: [PATCH 07/10] ALP --- Cargo.lock | 51 +++++++++++++++++++++- vortex-alp/Cargo.toml | 13 ++++-- vortex-alp/benches/alp_compress.rs | 11 +++++ vortex-alp/src/alp.rs | 69 +++++++++--------------------- vortex-alp/src/array.rs | 1 - vortex-alp/src/compress.rs | 51 +++++++++++++++------- vortex-alp/src/lib.rs | 2 + 7 files changed, 127 insertions(+), 71 deletions(-) create mode 100644 vortex-alp/benches/alp_compress.rs diff --git a/Cargo.lock b/Cargo.lock index e7b474ef82..f481cfd052 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -550,6 +550,7 @@ checksum = "9f3e7391dad68afb0c2ede1bf619f579a3dc9c2ec67f089baa397123a2f3d1eb" dependencies = [ "anstyle", "clap_lex", + "terminal_size", ] [[package]] @@ -587,6 +588,12 @@ dependencies = [ "walkdir", ] +[[package]] +name = "condtype" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf0a07a401f374238ab8e2f11a104d2851bf9ce711ec69804834de8af45c7af" + [[package]] name = "const-random" version = "0.1.18" @@ -757,6 +764,31 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "divan" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0d567df2c9c2870a43f3f2bd65aaeb18dbce1c18f217c3e564b4fbaeb3ee56c" +dependencies = [ + "cfg-if", + "clap", + "condtype", + "divan-macros", + "libc", + "regex-lite", +] + +[[package]] +name = "divan-macros" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27540baf49be0d484d8f0130d7d8da3011c32a44d4fc873368154f1510e574a2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "dyn-clone" version = "1.0.17" @@ -2164,6 +2196,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "regex-lite" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" + [[package]] name = "regex-syntax" version = "0.8.2" @@ -2543,6 +2581,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "terminal_size" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +dependencies = [ + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "thiserror" version = "1.0.57" @@ -2812,8 +2860,7 @@ dependencies = [ name = "vortex-alp" version = "0.1.0" dependencies = [ - "arrow", - "codecz", + "divan", "itertools 0.12.1", "linkme", "log", diff --git a/vortex-alp/Cargo.toml b/vortex-alp/Cargo.toml index 1e1fc4cf51..64911557b3 100644 --- a/vortex-alp/Cargo.toml +++ b/vortex-alp/Cargo.toml @@ -11,14 +11,19 @@ include = { workspace = true } edition = { workspace = true } rust-version = { workspace = true } +[lints] +workspace = true + [dependencies] -arrow = { version = "50.0.0" } vortex-array = { path = "../vortex-array" } linkme = "0.3.22" itertools = "0.12.1" num-traits = "0.2.18" -codecz = { path = "../codecz" } log = { version = "0.4.20", features = [] } -[lints] -workspace = true +[dev-dependencies] +divan = "0.1.14" + +[[bench]] +name = "alp_compress" +harness = false \ No newline at end of file diff --git a/vortex-alp/benches/alp_compress.rs b/vortex-alp/benches/alp_compress.rs new file mode 100644 index 0000000000..d26137a409 --- /dev/null +++ b/vortex-alp/benches/alp_compress.rs @@ -0,0 +1,11 @@ +use vortex_alp::{ALPFloat, Exponents}; + +fn main() { + divan::main(); +} + +#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])] +fn alp_compress(n: usize) -> (Exponents, Vec, Vec, Vec) { + let values: Vec = vec![T::from(1.234).unwrap(); n]; + T::encode(values.as_slice(), None) +} diff --git a/vortex-alp/src/alp.rs b/vortex-alp/src/alp.rs index ff1a8e9165..f2113489cc 100644 --- a/vortex-alp/src/alp.rs +++ b/vortex-alp/src/alp.rs @@ -1,10 +1,6 @@ use itertools::Itertools; -use num_traits::{Float, NumCast, PrimInt}; -use vortex::array::primitive::PrimitiveArray; -use vortex::array::sparse::SparseArray; -use vortex::ptype::NativePType; - -use vortex::array::{Array, ArrayRef}; +use num_traits::{Float, NumCast, PrimInt, Zero}; +use std::mem::size_of; const SAMPLE_SIZE: usize = 32; @@ -14,8 +10,9 @@ pub struct Exponents { pub f: u8, } -pub trait ALPFloat: NativePType + Float { - type ALPInt: NativePType + PrimInt; +pub trait ALPFloat: Float + 'static { + type ALPInt: PrimInt; + const FRACTIONAL_BITS: u8; const MAX_EXPONENT: u8; const SWEET: Self; @@ -32,8 +29,7 @@ pub trait ALPFloat: NativePType + Float { } fn find_best_exponents(values: &[Self]) -> Exponents { - let mut best_e: u8 = 0; - let mut best_f: u8 = 0; + let mut best_exp = Exponents { e: 0, f: 0 }; let mut best_nbytes: usize = usize::MAX; let sample = (values.len() > SAMPLE_SIZE).then(|| { @@ -45,66 +41,43 @@ pub trait ALPFloat: NativePType + Float { }); // TODO(wmanning): idea, start with highest e, then find the best f - // after that, try e's in descending order, with a gap no larger than the original e - f + // after that, try e's in descending order, with a gap no larger than the original e - f for e in 0..Self::MAX_EXPONENT { for f in 0..e { - let (_, encoded, patches) = Self::encode_to_array( + let (_, encoded, exc_pos, exc_patches) = Self::encode( sample.as_deref().unwrap_or(values), Some(&Exponents { e, f }), ); - let size = encoded.nbytes() + patches.map_or(0, |p| p.nbytes()); + let size = + (encoded.len() + exc_patches.len()) * size_of::() + (exc_pos.len() * 4); if size < best_nbytes { best_nbytes = size; - best_e = e; - best_f = f; - } else if size == best_nbytes && e - f < best_e - best_f { - best_e = e; - best_f = f; + best_exp = Exponents { e, f }; + } else if size == best_nbytes && e - f < best_exp.e - best_exp.f { + best_exp = Exponents { e, f }; } } } - Exponents { - e: best_e, - f: best_f, - } + best_exp } - fn encode_to_array( + fn encode( values: &[Self], exponents: Option<&Exponents>, - ) -> (Exponents, ArrayRef, Option) { - let best_exponents = - exponents.map_or_else(|| Self::find_best_exponents(values), Exponents::clone); - let (values, exc_pos, exc) = Self::encode(values, &best_exponents); - let len = values.len(); - ( - best_exponents, - PrimitiveArray::from_vec(values).boxed(), - (exc.len() > 0).then(|| { - SparseArray::new( - PrimitiveArray::from_vec(exc_pos).boxed(), - PrimitiveArray::from_vec(exc).boxed(), - len, - ) - .boxed() - }), - ) - } + ) -> (Exponents, Vec, Vec, Vec) { + let exp = exponents.map_or_else(|| Self::find_best_exponents(values), Exponents::clone); - fn encode(values: &[Self], exponents: &Exponents) -> (Vec, Vec, Vec) { let mut exc_pos = Vec::new(); let mut exc_value = Vec::new(); - let mut prev = Self::ALPInt::default(); + let mut prev = Self::ALPInt::zero(); let encoded = values .iter() .enumerate() .map(|(i, v)| { let encoded = - (*v * Self::F10[exponents.e as usize] * Self::IF10[exponents.f as usize]) - .fast_round(); - let decoded = - encoded * Self::F10[exponents.f as usize] * Self::IF10[exponents.e as usize]; + (*v * Self::F10[exp.e as usize] * Self::IF10[exp.f as usize]).fast_round(); + let decoded = encoded * Self::F10[exp.f as usize] * Self::IF10[exp.e as usize]; if decoded == *v { if let Some(e) = encoded.as_int() { @@ -120,7 +93,7 @@ pub trait ALPFloat: NativePType + Float { }) .collect_vec(); - (encoded, exc_pos, exc_value) + (exp, encoded, exc_pos, exc_value) } fn decode_single(encoded: Self::ALPInt, exponents: &Exponents) -> Self { diff --git a/vortex-alp/src/array.rs b/vortex-alp/src/array.rs index 472a70dafe..978af89680 100644 --- a/vortex-alp/src/array.rs +++ b/vortex-alp/src/array.rs @@ -2,7 +2,6 @@ use std::any::Any; use std::sync::{Arc, RwLock}; use crate::alp::Exponents; -pub use codecz::alp::ALPExponents; use vortex::array::{Array, ArrayKind, ArrayRef, ArrowIterator, Encoding, EncodingId, EncodingRef}; use vortex::compress::EncodingCompression; use vortex::dtype::{DType, IntWidth, Signedness}; diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index 6115637a9f..c59bc93c2c 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -3,13 +3,15 @@ use log::debug; use crate::alp::ALPFloat; use vortex::array::downcast::DowncastArrayBuiltin; use vortex::array::primitive::PrimitiveArray; +use vortex::array::sparse::SparseArray; use vortex::array::{Array, ArrayRef}; use vortex::compress::{CompressConfig, CompressCtx, Compressor, EncodingCompression}; use vortex::error::{VortexError, VortexResult}; -use vortex::ptype::PType; +use vortex::ptype::{NativePType, PType}; use crate::array::{ALPArray, ALPEncoding}; use crate::downcast::DowncastALP; +use crate::Exponents; impl EncodingCompression for ALPEncoding { fn compressor( @@ -36,19 +38,12 @@ impl EncodingCompression for ALPEncoding { fn alp_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: CompressCtx) -> ArrayRef { let like_alp = like.map(|like_array| like_array.as_alp()); - let mut parray = array.as_primitive().clone(); - if parray.validity().is_some() { - parray = compute:: - - } + // TODO(ngates): fill forward nulls + let parray = array.as_primitive(); let (exponents, encoded, patches) = match parray.ptype() { - PType::F32 => { - ALPFloat::encode_to_array(parray.typed_data::(), like_alp.map(|a| a.exponents())) - } - PType::F64 => { - ALPFloat::encode_to_array(parray.typed_data::(), like_alp.map(|a| a.exponents())) - } + PType::F32 => encode_to_array(parray.typed_data::(), like_alp.map(|a| a.exponents())), + PType::F64 => encode_to_array(parray.typed_data::(), like_alp.map(|a| a.exponents())), _ => panic!("Unsupported ptype"), }; @@ -64,17 +59,41 @@ fn alp_compressor(array: &dyn Array, like: Option<&dyn Array>, ctx: CompressCtx) ALPArray::new(compressed_encoded, exponents, compressed_patches).boxed() } +fn encode_to_array( + values: &[T], + exponents: Option<&Exponents>, +) -> (Exponents, ArrayRef, Option) +where + T: ALPFloat + NativePType, + T::ALPInt: NativePType, +{ + let (exponents, values, exc_pos, exc) = T::encode(values, exponents); + let len = values.len(); + ( + exponents, + PrimitiveArray::from_vec(values).boxed(), + (exc.len() > 0).then(|| { + SparseArray::new( + PrimitiveArray::from_vec(exc_pos).boxed(), + PrimitiveArray::from_vec(exc).boxed(), + len, + ) + .boxed() + }), + ) +} + pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult { let (exponents, encoded, patches) = match parray.ptype() { - PType::F32 => ALPFloat::encode_to_array(parray.typed_data::(), None), - PType::F64 => ALPFloat::encode_to_array(parray.typed_data::(), None), + PType::F32 => encode_to_array(parray.typed_data::(), None), + PType::F64 => encode_to_array(parray.typed_data::(), None), _ => return Err(VortexError::InvalidPType(parray.ptype().clone())), }; Ok(ALPArray::new(encoded, exponents, patches)) } #[cfg(test)] -mod test { +mod tests { use super::*; use crate::alp::Exponents; @@ -99,7 +118,7 @@ mod test { assert!(encoded.patches().is_none()); assert_eq!( encoded.encoded().as_primitive().typed_data::(), - vec![0, 1234, 1234] + vec![0, 1234, 0] ); assert_eq!(encoded.exponents(), &Exponents { e: 4, f: 1 }); } diff --git a/vortex-alp/src/lib.rs b/vortex-alp/src/lib.rs index 6f6ec0efd8..f104396cd5 100644 --- a/vortex-alp/src/lib.rs +++ b/vortex-alp/src/lib.rs @@ -1,4 +1,6 @@ +pub use alp::*; pub use array::*; + use linkme::distributed_slice; use vortex::array::{EncodingRef, ENCODINGS}; From 2824169fe0a99a1d896043b824d90ff6ededd8a1 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 5 Mar 2024 17:28:42 +0000 Subject: [PATCH 08/10] Bench --- vortex-alp/benches/alp_compress.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vortex-alp/benches/alp_compress.rs b/vortex-alp/benches/alp_compress.rs index d26137a409..d0cb986d91 100644 --- a/vortex-alp/benches/alp_compress.rs +++ b/vortex-alp/benches/alp_compress.rs @@ -1,4 +1,6 @@ -use vortex_alp::{ALPFloat, Exponents}; +use vortex::array::primitive::PrimitiveArray; +use vortex::array::ArrayRef; +use vortex_alp::{ALPArray, ALPFloat, Exponents}; fn main() { divan::main(); @@ -9,3 +11,10 @@ fn alp_compress(n: usize) -> (Exponents, Vec, Vec, let values: Vec = vec![T::from(1.234).unwrap(); n]; T::encode(values.as_slice(), None) } + +// TODO(ngates): remove this +#[divan::bench(args = [100_000, 10_000_000])] +fn alp_compress_array(n: usize) -> ArrayRef { + let array = PrimitiveArray::from_vec(vec![1.234f64; n]); + ALPArray::encode(&array).unwrap() +} From b11a043c0fc765475046b1aa31f43041c0fc1d0e Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 6 Mar 2024 09:07:35 +0000 Subject: [PATCH 09/10] Compressors --- vortex-alp/src/alp.rs | 4 ++-- vortex-alp/src/compress.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vortex-alp/src/alp.rs b/vortex-alp/src/alp.rs index f2113489cc..75e410fb71 100644 --- a/vortex-alp/src/alp.rs +++ b/vortex-alp/src/alp.rs @@ -107,7 +107,7 @@ impl ALPFloat for f32 { const FRACTIONAL_BITS: u8 = 23; const MAX_EXPONENT: u8 = 10; const SWEET: Self = - (1 << Self::FRACTIONAL_BITS) as Self + (1 << Self::FRACTIONAL_BITS - 1) as Self; + (1 << Self::FRACTIONAL_BITS) as Self + (1 << (Self::FRACTIONAL_BITS - 1)) as Self; const F10: &'static [Self] = &[ 1.0, @@ -142,7 +142,7 @@ impl ALPFloat for f64 { const FRACTIONAL_BITS: u8 = 52; const MAX_EXPONENT: u8 = 18; // 10^18 is the maximum i64 const SWEET: Self = - (1u64 << Self::FRACTIONAL_BITS) as Self + (1u64 << Self::FRACTIONAL_BITS - 1) as Self; + (1u64 << Self::FRACTIONAL_BITS) as Self + (1u64 << (Self::FRACTIONAL_BITS - 1)) as Self; const F10: &'static [Self] = &[ 1.0, 10.0, diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index c59bc93c2c..5ccbc6d3df 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -72,7 +72,7 @@ where ( exponents, PrimitiveArray::from_vec(values).boxed(), - (exc.len() > 0).then(|| { + (!exc.is_empty()).then(|| { SparseArray::new( PrimitiveArray::from_vec(exc_pos).boxed(), PrimitiveArray::from_vec(exc).boxed(), @@ -87,7 +87,7 @@ pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult { let (exponents, encoded, patches) = match parray.ptype() { PType::F32 => encode_to_array(parray.typed_data::(), None), PType::F64 => encode_to_array(parray.typed_data::(), None), - _ => return Err(VortexError::InvalidPType(parray.ptype().clone())), + _ => return Err(VortexError::InvalidPType(*parray.ptype())), }; Ok(ALPArray::new(encoded, exponents, patches)) } From 43a47fea0b4d320e1e0817bc1946f554fe93457e Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 6 Mar 2024 09:24:46 +0000 Subject: [PATCH 10/10] REE --- vortex-alp/src/compress.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/vortex-alp/src/compress.rs b/vortex-alp/src/compress.rs index 5ccbc6d3df..83018a53c6 100644 --- a/vortex-alp/src/compress.rs +++ b/vortex-alp/src/compress.rs @@ -95,7 +95,6 @@ pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult { #[cfg(test)] mod tests { use super::*; - use crate::alp::Exponents; #[test] fn test_compress() {