-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
168 additions
and
1 deletion.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
// Compress a set of string values into an FSST-encoded Array. | ||
|
||
use fsst::{Compressor, Symbol}; | ||
use vortex::accessor::ArrayAccessor; | ||
use vortex::array::builder::VarBinBuilder; | ||
use vortex::array::{PrimitiveArray, VarBinArray, VarBinViewArray}; | ||
use vortex::validity::Validity; | ||
use vortex::{Array, ArrayDType, IntoArray}; | ||
use vortex_dtype::DType; | ||
|
||
use crate::FSSTArray; | ||
|
||
/// Compress an array using FSST. If a compressor is provided, use the existing compressor, else | ||
/// it will train a new compressor directly from the `strings`. | ||
/// | ||
/// # Panics | ||
/// | ||
/// If the `strings` array is not encoded as either [`VarBinArray`] or [`VarBinViewArray`]. | ||
pub fn fsst_compress(strings: Array, compressor: Option<Compressor>) -> FSSTArray { | ||
let len = strings.len(); | ||
let dtype = strings.dtype().clone(); | ||
|
||
// Compress VarBinArray | ||
if let Ok(varbin) = VarBinArray::try_from(&strings) { | ||
let compressor = compressor.unwrap_or_else(|| { | ||
varbin | ||
.with_iterator(|iter| fsst_train_compressor(iter)) | ||
.unwrap() | ||
}); | ||
return varbin | ||
.with_iterator(|iter| fsst_compress_iter(iter, len, dtype, &compressor)) | ||
.unwrap(); | ||
} | ||
|
||
// Compress VarBinViewArray | ||
if let Ok(varbin_view) = VarBinViewArray::try_from(&strings) { | ||
let compressor = compressor.unwrap_or_else(|| { | ||
varbin_view | ||
.with_iterator(|iter| fsst_train_compressor(iter)) | ||
.unwrap() | ||
}); | ||
return varbin_view | ||
.with_iterator(|iter| fsst_compress_iter(iter, len, dtype, &compressor)) | ||
.unwrap(); | ||
} | ||
|
||
panic!( | ||
"cannot fsst_compress array with unsupported encoding {:?}", | ||
strings.encoding().id() | ||
) | ||
} | ||
|
||
fn fsst_train_compressor<'a, I>(iter: I) -> Compressor | ||
where | ||
I: Iterator<Item = Option<&'a [u8]>>, | ||
{ | ||
// TODO(aduffy): eliminate the copying. | ||
let mut sample = Vec::with_capacity(1_024 * 1_024); | ||
for string in iter { | ||
match string { | ||
None => {} | ||
Some(b) => sample.extend_from_slice(b), | ||
} | ||
} | ||
|
||
Compressor::train(&sample) | ||
} | ||
|
||
pub fn fsst_compress_iter<'a, I>( | ||
iter: I, | ||
len: usize, | ||
dtype: DType, | ||
compressor: &Compressor, | ||
) -> FSSTArray | ||
where | ||
I: Iterator<Item = Option<&'a [u8]>>, | ||
{ | ||
let mut builder = VarBinBuilder::<i32>::with_capacity(len); | ||
for string in iter { | ||
match string { | ||
None => builder.push_null(), | ||
Some(s) => builder.push_value(&compressor.compress(s)), | ||
} | ||
} | ||
|
||
let codes = builder.finish(dtype.clone()); | ||
let symbols_vec: Vec<Symbol> = compressor.symbol_table().to_vec(); | ||
// SAFETY: Symbol and u64 are same size | ||
let symbols_u64: Vec<u64> = unsafe { std::mem::transmute(symbols_vec) }; | ||
let symbols = PrimitiveArray::from_vec(symbols_u64, Validity::NonNullable); | ||
|
||
FSSTArray::try_new(dtype, symbols.into_array(), codes.into_array()) | ||
.expect("building FSSTArray from parts") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,8 @@ | |
mod array; | ||
mod canonical; | ||
mod compress; | ||
mod compute; | ||
|
||
pub use array::*; | ||
pub use compress::*; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
use std::collections::HashSet; | ||
|
||
use vortex::array::{VarBinArray, VarBinViewArray}; | ||
use vortex::encoding::EncodingRef; | ||
use vortex::{ArrayDType, ArrayDef, IntoArray}; | ||
use vortex_dict::DictArray; | ||
use vortex_dtype::DType; | ||
use vortex_error::{vortex_bail, VortexResult}; | ||
use vortex_fsst::{fsst_compress, FSSTEncoding, FSST}; | ||
|
||
use super::{CompressedArray, CompressionTree, EncodingCompressor}; | ||
use crate::SamplingCompressor; | ||
|
||
#[derive(Debug)] | ||
pub struct FSSTCompressor; | ||
|
||
impl EncodingCompressor for FSSTCompressor { | ||
fn id(&self) -> &str { | ||
FSST::ID.as_ref() | ||
} | ||
|
||
fn can_compress(&self, array: &vortex::Array) -> Option<&dyn EncodingCompressor> { | ||
// FSST arrays must have DType::Utf8. | ||
// | ||
// Note that while it can accept binary data, it is unlikely to perform well. | ||
if !matches!(array.dtype(), &DType::Utf8(_)) { | ||
return None; | ||
} | ||
|
||
// FSST cannot be applied recursively. | ||
if array.encoding().id() == FSST::ID { | ||
return None; | ||
} | ||
|
||
Some(self) | ||
} | ||
|
||
fn compress<'a>( | ||
&'a self, | ||
array: &vortex::Array, | ||
_like: Option<CompressionTree<'a>>, | ||
_ctx: SamplingCompressor<'a>, | ||
) -> VortexResult<super::CompressedArray<'a>> { | ||
// TODO(aduffy): use like array to clone the existing symbol table | ||
let fsst_array = | ||
if VarBinArray::try_from(array).is_ok() || VarBinViewArray::try_from(array).is_ok() { | ||
// For a VarBinArray or VarBinViewArray, compress directly. | ||
fsst_compress(array.clone(), None) | ||
} else if let Ok(dict) = DictArray::try_from(array) { | ||
// For a dict array, just compress the values | ||
fsst_compress(dict.values(), None) | ||
} else { | ||
vortex_bail!( | ||
InvalidArgument: "unsupported encoding for FSSTCompressor {:?}", | ||
array.encoding().id() | ||
) | ||
}; | ||
|
||
Ok(CompressedArray::new(fsst_array.into_array(), None)) | ||
} | ||
|
||
fn used_encodings(&self) -> HashSet<EncodingRef> { | ||
HashSet::from([&FSSTEncoding as EncodingRef]) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters