diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 1aa42705e7f8..d2ce965c5c49 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -268,13 +268,35 @@ impl Statistics { return self; }; - // todo: it would be nice to avoid cloning column statistics if - // possible (e.g. if the projection did not contain duplicates) - self.column_statistics = projection - .iter() - .map(|&i| self.column_statistics[i].clone()) + enum Slot { + /// The column is taken and put into the specified statistics location + Taken(usize), + /// The original columns is present + Present(ColumnStatistics), + } + + // Convert to Vec so we can avoid copying the statistics + let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics) + .into_iter() + .map(Slot::Present) .collect(); + for idx in projection { + let next_idx = self.column_statistics.len(); + let slot = std::mem::replace( + columns.get_mut(*idx).expect("projection out of bounds"), + Slot::Taken(next_idx), + ); + match slot { + // The column was there, so just move it + Slot::Present(col) => self.column_statistics.push(col), + // The column was taken, so copy from the previous location + Slot::Taken(prev_idx) => self + .column_statistics + .push(self.column_statistics[prev_idx].clone()), + } + } + self } @@ -581,4 +603,50 @@ mod tests { let p2 = precision.clone(); assert_eq!(precision, p2); } + + #[test] + fn test_project_none() { + let projection = None; + let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref()); + assert_eq!(stats, make_stats(vec![10, 20, 30])); + } + + #[test] + fn test_project_empty() { + let projection = Some(vec![]); + let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref()); + assert_eq!(stats, make_stats(vec![])); + } + + #[test] + fn test_project_swap() { + let projection = Some(vec![2, 1]); + let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref()); + assert_eq!(stats, make_stats(vec![30, 20])); + } + + #[test] + fn test_project_repeated() { + let projection = Some(vec![1, 2, 1, 1, 0, 2]); + let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref()); + assert_eq!(stats, make_stats(vec![20, 30, 20, 20, 10, 30])); + } + + // Make a Statistics structure with the specified null counts for each column + fn make_stats(counts: impl IntoIterator) -> Statistics { + Statistics { + num_rows: Precision::Exact(42), + total_byte_size: Precision::Exact(500), + column_statistics: counts.into_iter().map(col_stats_i64).collect(), + } + } + + fn col_stats_i64(null_count: usize) -> ColumnStatistics { + ColumnStatistics { + null_count: Precision::Exact(null_count), + max_value: Precision::Exact(ScalarValue::Int64(Some(42))), + min_value: Precision::Exact(ScalarValue::Int64(Some(64))), + distinct_count: Precision::Exact(100), + } + } }