From 4ab4b918c725f31f10661ea73ab4806d2db495f7 Mon Sep 17 00:00:00 2001
From: Prashant Singh <35593236+singhpk234@users.noreply.github.com>
Date: Mon, 4 Apr 2022 03:03:49 +0530
Subject: [PATCH] Spark 3.2: Update task stats for split files (#4446)

Co-authored-by: Prashant Singh
---
 .../main/java/org/apache/iceberg/spark/source/SparkScan.java | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java
index ae29e05d4..7d93ad66e 100644
--- a/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java
+++ b/spark/v3.2/spark/src/main/java/org/apache/iceberg/spark/source/SparkScan.java
@@ -148,7 +148,9 @@ protected Statistics estimateStatistics(Snapshot snapshot) {
 
     for (CombinedScanTask task : tasks()) {
       for (FileScanTask file : task.files()) {
-        numRows += file.file().recordCount();
+        // TODO: if possible, take deletes also into consideration.
+        double fractionOfFileScanned = ((double) file.length()) / file.file().fileSizeInBytes();
+        numRows += (fractionOfFileScanned * file.file().recordCount());
       }
     }
 