Skip to content

Commit

Permalink
Fix dictionary encoding (#81)
Browse files (browse the repository at this point in the history)
  • Loading branch information
robert3005 authored Mar 6, 2024
1 parent 36ce638 commit 9c3e82c
Showing 1 changed file with 23 additions and 3 deletions.
26 changes: 23 additions & 3 deletions vortex-dict/src/compress.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -225,7 +225,7 @@ fn dict_encode_typed_varbin<O, K, V, U>(
validity: Option<&dyn Array>,
) -> (PrimitiveArray, VarBinArray)
where
O: NativePType + Unsigned + FromPrimitive,
O: NativePType + Unsigned + FromPrimitive + AsPrimitive<usize>,
K: NativePType + Unsigned + FromPrimitive + AsPrimitive<usize>,
V: Fn(usize) -> U,
U: AsRef<[u8]>,
Expand All @@ -242,7 +242,7 @@ where
let byte_ref = byte_val.as_ref();
let value_hash = hasher.hash_one(byte_ref);
let raw_entry = lookup_dict.raw_entry_mut().from_hash(value_hash, |idx| {
byte_ref == value_lookup(idx.as_()).as_ref()
byte_ref == bytes_at_primitive(offsets.as_slice(), bytes.as_slice(), idx.as_())
});

let code: K = match raw_entry {
Expand All @@ -252,7 +252,11 @@ where
bytes.extend_from_slice(byte_ref);
offsets.push(<O as FromPrimitive>::from_usize(bytes.len()).unwrap());
vac.insert_with_hasher(value_hash, next_code, (), |idx| {
hasher.hash_one(value_lookup(idx.as_()).as_ref())
hasher.hash_one(bytes_at_primitive(
offsets.as_slice(),
bytes.as_slice(),
idx.as_(),
))
});
next_code
}
Expand All @@ -272,6 +276,7 @@ where

#[cfg(test)]
mod test {
use vortex::array::downcast::DowncastArrayBuiltin;
use vortex::array::primitive::PrimitiveArray;
use vortex::array::varbin::VarBinArray;
use vortex::compute::scalar_at::scalar_at;
Expand Down Expand Up @@ -359,4 +364,19 @@ mod test {
"again"
);
}

#[test]
fn repeated_values() {
    // Only two distinct strings occur in the input, each repeated and interleaved.
    let input = VarBinArray::from(vec!["a", "a", "b", "b", "a", "b", "a", "b"]);
    let (codes, dictionary) = dict_encode_varbin(&input);

    // The dictionary byte buffer is expected to hold each distinct value once.
    let expected_bytes: &[u8] = "ab".as_bytes();
    assert_eq!(
        dictionary.bytes().as_primitive().typed_data::<u8>(),
        expected_bytes
    );

    // One start offset per dictionary entry, plus the trailing end offset.
    let expected_offsets: &[u32] = &[0, 1, 2];
    assert_eq!(
        dictionary.offsets().as_primitive().typed_data::<u32>(),
        expected_offsets
    );

    // Every input position is expected to map to the code of its value.
    let expected_codes: &[u8] = &[0u8, 0, 1, 1, 0, 1, 0, 1];
    assert_eq!(codes.typed_data::<u8>(), expected_codes);
}
}

0 comments on commit 9c3e82c

Please sign in to comment.