Skip to content

Commit

Permalink
fix: Consistent metadata table column names for DataFusion stats (#1577)
Browse files Browse the repository at this point in the history
Fixes #1576.
  • Loading branch information
AdamGS authored Dec 5, 2024
1 parent 4ad1851 commit ae58bb9
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 17 deletions.
30 changes: 17 additions & 13 deletions vortex-array/src/stats/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,22 @@ impl Stat {
Stat::UncompressedSizeInBytes => DType::Primitive(PType::U64, NonNullable),
}
}

/// Returns the snake_case name of this statistic.
///
/// This is the same text that the `Display` implementation writes, and it is
/// the string used to look up a statistic's field by name in a struct array.
///
/// Every arm yields a string literal, so the result is returned with a
/// `'static` lifetime instead of one tied to the borrow of `self`; callers
/// may keep the name without keeping the `Stat` borrowed.
///
/// The `match` has no catch-all arm on purpose: a newly added variant must be
/// given a name here before the code compiles again.
pub fn name(&self) -> &'static str {
    match self {
        Self::BitWidthFreq => "bit_width_frequency",
        Self::TrailingZeroFreq => "trailing_zero_frequency",
        Self::IsConstant => "is_constant",
        Self::IsSorted => "is_sorted",
        Self::IsStrictSorted => "is_strict_sorted",
        Self::Max => "max",
        Self::Min => "min",
        Self::RunCount => "run_count",
        Self::TrueCount => "true_count",
        Self::NullCount => "null_count",
        Self::UncompressedSizeInBytes => "uncompressed_size_in_bytes",
    }
}
}

pub fn as_stat_bitset_bytes(stats: &[Stat]) -> Vec<u8> {
Expand Down Expand Up @@ -134,19 +150,7 @@ pub fn stats_from_bitset_bytes(bytes: &[u8]) -> Vec<Stat> {

impl Display for Stat {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Self::BitWidthFreq => write!(f, "bit_width_frequency"),
Self::TrailingZeroFreq => write!(f, "trailing_zero_frequency"),
Self::IsConstant => write!(f, "is_constant"),
Self::IsSorted => write!(f, "is_sorted"),
Self::IsStrictSorted => write!(f, "is_strict_sorted"),
Self::Max => write!(f, "max"),
Self::Min => write!(f, "min"),
Self::RunCount => write!(f, "run_count"),
Self::TrueCount => write!(f, "true_count"),
Self::NullCount => write!(f, "null_count"),
Self::UncompressedSizeInBytes => write!(f, "uncompressed_size_in_bytes"),
}
write!(f, "{}", self.name())
}
}

Expand Down
9 changes: 5 additions & 4 deletions vortex-datafusion/src/persistent/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,23 @@ use datafusion_common::stats::Precision;
use datafusion_common::ColumnStatistics;
use datafusion_expr::Accumulator;
use vortex_array::array::StructArray;
use vortex_array::stats::Stat;
use vortex_array::variants::StructArrayTrait as _;
use vortex_array::IntoCanonical;
use vortex_error::VortexResult;

pub fn array_to_col_statistics(array: &StructArray) -> VortexResult<ColumnStatistics> {
let mut stats = ColumnStatistics::new_unknown();

if let Some(null_count_array) = array.field_by_name("null_count") {
if let Some(null_count_array) = array.field_by_name(Stat::NullCount.name()) {
let array = null_count_array.into_canonical()?.into_arrow()?;
let array = array.as_primitive::<UInt64Type>();

let null_count = array.iter().map(|v| v.unwrap_or_default()).sum::<u64>();
stats.null_count = Precision::Exact(null_count as usize);
}

if let Some(max_value_array) = array.field_by_name("max") {
if let Some(max_value_array) = array.field_by_name(Stat::Max.name()) {
let array = max_value_array.into_canonical()?.into_arrow()?;
let mut acc = MaxAccumulator::try_new(array.data_type())?;
acc.update_batch(&[array])?;
Expand All @@ -29,7 +30,7 @@ pub fn array_to_col_statistics(array: &StructArray) -> VortexResult<ColumnStatis
stats.max_value = Precision::Exact(max_val)
}

if let Some(min_value_array) = array.field_by_name("min") {
if let Some(min_value_array) = array.field_by_name(Stat::Min.name()) {
let array = min_value_array.into_canonical()?.into_arrow()?;
let mut acc = MinAccumulator::try_new(array.data_type())?;
acc.update_batch(&[array])?;
Expand All @@ -42,7 +43,7 @@ pub fn array_to_col_statistics(array: &StructArray) -> VortexResult<ColumnStatis
}

pub fn uncompressed_col_size(array: &StructArray) -> VortexResult<Option<u64>> {
match array.field_by_name("uncompressed_size") {
match array.field_by_name(Stat::UncompressedSizeInBytes.name()) {
None => Ok(None),
Some(array) => {
let array = array.into_canonical()?.into_arrow()?;
Expand Down

0 comments on commit ae58bb9

Please sign in to comment.