Skip to content

Commit

Permalink
Optimize Statistics::projection (#13225)
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb authored Nov 5, 2024
1 parent 9005585 commit 003813a
Showing 1 changed file with 73 additions and 5 deletions.
78 changes: 73 additions & 5 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,13 +268,35 @@ impl Statistics {
return self;
};

// todo: it would be nice to avoid cloning column statistics if
// possible (e.g. if the projection did not contain duplicates)
self.column_statistics = projection
.iter()
.map(|&i| self.column_statistics[i].clone())
enum Slot {
/// The column is taken and put into the specified statistics location
Taken(usize),
/// The original column is still present (it has not been moved out yet)
Present(ColumnStatistics),
}

// Convert to Vec<Slot> so we can avoid copying the statistics
let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics)
.into_iter()
.map(Slot::Present)
.collect();

for idx in projection {
let next_idx = self.column_statistics.len();
let slot = std::mem::replace(
columns.get_mut(*idx).expect("projection out of bounds"),
Slot::Taken(next_idx),
);
match slot {
// The column was there, so just move it
Slot::Present(col) => self.column_statistics.push(col),
// The column was taken, so copy from the previous location
Slot::Taken(prev_idx) => self
.column_statistics
.push(self.column_statistics[prev_idx].clone()),
}
}

self
}

Expand Down Expand Up @@ -581,4 +603,50 @@ mod tests {
let p2 = precision.clone();
assert_eq!(precision, p2);
}

/// Projecting with `None` must hand back the statistics unchanged.
#[test]
fn test_project_none() {
    let no_projection = None;
    let input = make_stats(vec![10, 20, 30]);

    let projected = input.project(no_projection.as_ref());

    let expected = make_stats(vec![10, 20, 30]);
    assert_eq!(projected, expected);
}

/// Projecting onto an empty index list must leave no column statistics.
#[test]
fn test_project_empty() {
    let no_columns = Some(vec![]);
    let input = make_stats(vec![10, 20, 30]);

    let projected = input.project(no_columns.as_ref());

    let expected = make_stats(vec![]);
    assert_eq!(projected, expected);
}

/// Projecting a subset in a different order must reorder the column
/// statistics to follow the projection.
#[test]
fn test_project_swap() {
    let reordered = Some(vec![2, 1]);
    let input = make_stats(vec![10, 20, 30]);

    let projected = input.project(reordered.as_ref());

    let expected = make_stats(vec![30, 20]);
    assert_eq!(projected, expected);
}

/// A projection may name the same column several times; each occurrence
/// must yield that column's statistics at the corresponding output position.
#[test]
fn test_project_repeated() {
    let with_duplicates = Some(vec![1, 2, 1, 1, 0, 2]);
    let input = make_stats(vec![10, 20, 30]);

    let projected = input.project(with_duplicates.as_ref());

    let expected = make_stats(vec![20, 30, 20, 20, 10, 30]);
    assert_eq!(projected, expected);
}

/// Builds a `Statistics` fixture with one column per entry in `counts`.
///
/// Each entry becomes that column's exact null count (via `col_stats_i64`);
/// the row count and total byte size are fixed constants.
fn make_stats(counts: impl IntoIterator<Item = usize>) -> Statistics {
    // One ColumnStatistics per requested null count, in input order
    let column_statistics = counts.into_iter().map(col_stats_i64).collect();

    Statistics {
        num_rows: Precision::Exact(42),
        total_byte_size: Precision::Exact(500),
        column_statistics,
    }
}

/// Builds `Int64` column statistics with the given exact `null_count`.
///
/// Every other field is a fixed constant, so columns produced by this helper
/// differ only in their null count, which is what the projection tests use to
/// tell columns apart.
fn col_stats_i64(null_count: usize) -> ColumnStatistics {
    ColumnStatistics {
        null_count: Precision::Exact(null_count),
        // Keep the bounds ordered (min <= max) so the fixture describes a
        // column that could actually exist.
        max_value: Precision::Exact(ScalarValue::Int64(Some(64))),
        min_value: Precision::Exact(ScalarValue::Int64(Some(42))),
        distinct_count: Precision::Exact(100),
    }
}
}

0 comments on commit 003813a

Please sign in to comment.