Skip to content

Commit

Permalink
Cleanup Dict encoding (#82)
Browse files Browse the repository at this point in the history
  • Loading branch information
robert3005 authored Mar 7, 2024
1 parent 05938cb commit 2976d3d
Show file tree
Hide file tree
Showing 9 changed files with 245 additions and 197 deletions.
33 changes: 17 additions & 16 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion bench-vortex/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ log = "0.4.20"
[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
simplelog = { version = "0.12.1", features = ["paris"] }
rand = "0.8.5"

[[bench]]
name = "compress_benchmark"
Expand Down
4 changes: 2 additions & 2 deletions vortex-array/src/array/primitive/compute/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ fn cast<T: NativePType>(array: &PrimitiveArray) -> VortexResult<Vec<T>> {
.typed_data::<$E>()
.iter()
// TODO(ngates): allow configurable checked/unchecked casting
.map(|v| {
T::from(*v).ok_or_else(|| {
.map(|&v| {
T::from(v).ok_or_else(|| {
VortexError::ComputeError(format!("Failed to cast {} to {:?}", v, T::PTYPE).into())
})
})
Expand Down
9 changes: 1 addition & 8 deletions vortex-array/src/array/primitive/compute/scalar_at.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,7 @@ use crate::scalar::{NullableScalar, Scalar, ScalarRef};
impl ScalarAtFn for PrimitiveArray {
fn scalar_at(&self, index: usize) -> VortexResult<ScalarRef> {
if self.is_valid(index) {
Ok(
match_each_native_ptype!(self.ptype, |$T| self.buffer.typed_data::<$T>()
.get(index)
.unwrap()
.clone()
.into()
),
)
Ok(match_each_native_ptype!(self.ptype, |$T| self.typed_data::<$T>()[index].into()))
} else {
Ok(NullableScalar::none(self.dtype().clone()).boxed())
}
Expand Down
36 changes: 20 additions & 16 deletions vortex-array/src/array/varbin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ use arrow::array::{make_array, Array as ArrowArray, ArrayData, AsArray};
use arrow::buffer::NullBuffer;
use arrow::datatypes::UInt8Type;
use linkme::distributed_slice;
use num_traits::{AsPrimitive, FromPrimitive, Unsigned};
use num_traits::{FromPrimitive, Unsigned};

use crate::array::bool::BoolArray;
use crate::array::downcast::DowncastArrayBuiltin;
use crate::array::primitive::PrimitiveArray;
use crate::array::varbin::values_iter::{VarBinIter, VarBinPrimitiveIter};
use crate::array::{
check_slice_bounds, check_validity_buffer, Array, ArrayRef, ArrowIterator, Encoding,
EncodingId, EncodingRef, ENCODINGS,
Expand All @@ -20,14 +21,14 @@ use crate::compute::scalar_at::scalar_at;
use crate::dtype::{DType, IntWidth, Nullability, Signedness};
use crate::error::{VortexError, VortexResult};
use crate::formatter::{ArrayDisplay, ArrayFormatter};
use crate::match_each_native_ptype;
use crate::ptype::NativePType;
use crate::serde::{ArraySerde, EncodingSerde};
use crate::stats::{Stats, StatsSet};

mod compute;
mod serde;
mod stats;
mod values_iter;

#[derive(Debug, Clone)]
pub struct VarBinArray {
Expand Down Expand Up @@ -189,20 +190,23 @@ impl VarBinArray {
}
}

pub fn bytes_at(&self, index: usize) -> VortexResult<Vec<u8>> {
// check_index_bounds(self, index)?;

let (start, end): (usize, usize) = if let Some(p) = self.offsets.maybe_primitive() {
match_each_native_ptype!(p.ptype(), |$P| {
let buf = p.buffer().typed_data::<$P>();
(buf[index].as_(), buf[index + 1].as_())
pub fn iter_primitive(&self) -> VortexResult<VarBinPrimitiveIter> {
self.bytes()
.maybe_primitive()
.zip(self.offsets().maybe_primitive())
.ok_or_else(|| {
VortexError::ComputeError("Bytes array was not a primitive array".into())
})
} else {
(
scalar_at(self.offsets(), index)?.try_into()?,
scalar_at(self.offsets(), index + 1)?.try_into()?,
)
};
.map(|(b, o)| VarBinPrimitiveIter::new(b.typed_data::<u8>(), o))
}

pub fn iter(&self) -> VarBinIter {
VarBinIter::new(self.bytes(), self.offsets())
}

pub fn bytes_at(&self, index: usize) -> VortexResult<Vec<u8>> {
let start = scalar_at(self.offsets(), index)?.try_into()?;
let end = scalar_at(self.offsets(), index + 1)?.try_into()?;
let sliced = self.bytes().slice(start, end)?;
let arr_ref = sliced.iter_arrow().combine_chunks();
Ok(arr_ref.as_primitive::<UInt8Type>().values().to_vec())
Expand Down Expand Up @@ -382,11 +386,11 @@ impl<'a> FromIterator<Option<&'a str>> for VarBinArray {

#[cfg(test)]
mod test {
use crate::array::Array;
use arrow::array::{AsArray, GenericStringArray as ArrowStringArray};

use crate::array::primitive::PrimitiveArray;
use crate::array::varbin::VarBinArray;
use crate::array::Array;
use crate::arrow::CombineChunks;
use crate::compute::scalar_at::scalar_at;
use crate::dtype::{DType, Nullability};
Expand Down
100 changes: 100 additions & 0 deletions vortex-array/src/array/varbin/values_iter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
use arrow::array::AsArray;
use arrow::datatypes::UInt8Type;

use crate::array::primitive::PrimitiveArray;
use crate::array::Array;
use crate::arrow::CombineChunks;
use crate::compute::scalar_at::scalar_at;
use crate::match_each_native_ptype;
use num_traits::AsPrimitive;

#[derive(Debug)]
pub struct VarBinPrimitiveIter<'a> {
bytes: &'a [u8],
offsets: &'a PrimitiveArray,
last_offset: usize,
idx: usize,
}

impl<'a> VarBinPrimitiveIter<'a> {
pub fn new(bytes: &'a [u8], offsets: &'a PrimitiveArray) -> Self {
assert!(offsets.len() > 1);
let last_offset = Self::offset_at(offsets, 0);
Self {
bytes,
offsets,
last_offset,
idx: 1,
}
}

pub(self) fn offset_at(array: &'a PrimitiveArray, index: usize) -> usize {
match_each_native_ptype!(array.ptype(), |$P| {
array.typed_data::<$P>()[index].as_()
})
}
}

impl<'a> Iterator for VarBinPrimitiveIter<'a> {
type Item = &'a [u8];

fn next(&mut self) -> Option<Self::Item> {
if self.idx == self.offsets.len() {
return None;
}

let next_offset = Self::offset_at(self.offsets, self.idx);
let slice_bytes = &self.bytes[self.last_offset..next_offset];
self.last_offset = next_offset;
self.idx += 1;
Some(slice_bytes)
}
}

#[derive(Debug)]
pub struct VarBinIter<'a> {
bytes: &'a dyn Array,
offsets: &'a dyn Array,
last_offset: usize,
idx: usize,
}

impl<'a> VarBinIter<'a> {
pub fn new(bytes: &'a dyn Array, offsets: &'a dyn Array) -> Self {
assert!(offsets.len() > 1);
let last_offset = scalar_at(offsets, 0).unwrap().try_into().unwrap();
Self {
bytes,
offsets,
last_offset,
idx: 1,
}
}
}

impl<'a> Iterator for VarBinIter<'a> {
type Item = Vec<u8>;

fn next(&mut self) -> Option<Self::Item> {
if self.idx == self.offsets.len() {
return None;
}

let next_offset: usize = scalar_at(self.offsets, self.idx)
.unwrap()
.try_into()
.unwrap();
let slice_bytes = self.bytes.slice(self.last_offset, next_offset).unwrap();
self.last_offset = next_offset;
self.idx += 1;
// TODO(robert): iter as primitive vs arrow
Some(
slice_bytes
.iter_arrow()
.combine_chunks()
.as_primitive::<UInt8Type>()
.values()
.to_vec(),
)
}
}
5 changes: 4 additions & 1 deletion vortex-dict/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ num-traits = "0.2.17"
workspace = true

[dev-dependencies]
divan = "0.1.14"
criterion = { version = "0.5.1", features = ["html_reports"] }
log = "0.4.20"
rand = "0.8.5"
simplelog = { version = "0.12.1", features = ["paris"] }

[[bench]]
name = "dict_compress"
Expand Down
Loading

0 comments on commit 2976d3d

Please sign in to comment.