Skip to content

Commit

Permalink
push down read-padding on char type
Browse files Browse the repository at this point in the history
Signed-off-by: Peng Huo <[email protected]>
  • Loading branch information
penghuo committed May 20, 2024
1 parent 306d372 commit 878ef4e
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKi

import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GetStructField}
import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
import org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

/**
* Skipping index strategy that defines skipping data structure building and reading logic.
Expand Down Expand Up @@ -115,6 +118,17 @@ object FlintSparkSkippingStrategy {
Seq(attr.name)
case GetStructField(child, _, Some(name)) =>
extractColumnName(child) :+ name
/**
 * Since Spark 3.4 added read-side padding, char_col = "sample char" becomes
 * (staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils,
 * StringType, readSidePadding, char_col#47, 20, true, false, true) = sample char )
 *
 * When the skipping index was created, Spark applied write-side padding, so the
 * read-side padding can be ignored during pushdown. Further reading:
 * https://issues.apache.org/jira/browse/SPARK-40697
 */
case StaticInvoke(staticObject, StringType, "readSidePadding", arguments, _, _, _, _)
if classOf[CharVarcharCodegenUtils].isAssignableFrom(staticObject) =>
extractColumnName(arguments.head)
case _ => Seq.empty
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -797,13 +797,10 @@ class FlintSparkSkippingIndexITSuite extends FlintSparkSuite {
// CharType column is padded to a fixed length with whitespace
val paddedChar = "sample char".padTo(20, ' ')
checkAnswer(query, Row("sample varchar", paddedChar))
/*
* todo Spark 3.4 add read-side padding, SkippingIndex rule can not push down char_col plan now.
* https://issues.apache.org/jira/browse/SPARK-40697
*/
query.queryExecution.executedPlan should
useFlintSparkSkippingFileIndex(
hasIndexFilter(isnull(col("varchar_col")) || col("varchar_col") === "sample varchar"))
hasIndexFilter((isnull(col("varchar_col")) || col("varchar_col") === "sample varchar") &&
(isnull(col("char_col")) || col("char_col") === paddedChar)))

deleteTestIndex(testIndex)
}
Expand Down

0 comments on commit 878ef4e

Please sign in to comment.