From f66994a1c630f74c3ef0548bfb0546f031f34d95 Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Tue, 12 Mar 2024 13:59:30 -0700 Subject: [PATCH 01/11] dummy result test Signed-off-by: Rupal Mahajan --- .../main/antlr4/FlintSparkSqlExtensions.g4 | 5 +++ .../src/main/antlr4/SparkSqlBase.g4 | 1 + .../FlintSparkSkippingIndexAstBuilder.scala | 40 ++++++++++++++++++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 b/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 index 219bbe782..c12ac8b5a 100644 --- a/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 +++ b/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 @@ -28,6 +28,11 @@ skippingIndexStatement | describeSkippingIndexStatement | dropSkippingIndexStatement | vacuumSkippingIndexStatement + | analyzeSkippingIndexStatement + ; + +analyzeSkippingIndexStatement + : ANALYZE SKIPPING INDEX ON tableName ; createSkippingIndexStatement diff --git a/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 b/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 index 01f45016d..5c649d657 100644 --- a/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 +++ b/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 @@ -155,6 +155,7 @@ DOT: '.'; AS: 'AS'; +ANALYZE: 'ANALYZE'; CREATE: 'CREATE'; DESC: 'DESC'; DESCRIBE: 'DESCRIBE'; diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala index 9b638f36f..68af6664e 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala @@ -20,7 +20,7 @@ import org.opensearch.flint.spark.sql.FlintSparkSqlExtensionsParser._ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.Command -import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.types.{ArrayType, MapType, StringType} /** * Flint Spark AST builder that builds Spark command for Flint skipping index statement. 
@@ -96,6 +96,44 @@ trait FlintSparkSkippingIndexAstBuilder extends FlintSparkSqlExtensionsVisitor[A } } + override def visitAnalyzeSkippingIndexStatement( + ctx: AnalyzeSkippingIndexStatementContext): Command = { + + val outputSchema = Seq( + AttributeReference("rule", StringType, nullable = false)(), + AttributeReference("recommendation", ArrayType(MapType(StringType, StringType)), nullable = false)(), + AttributeReference("unsupported columns", ArrayType(StringType), nullable = false)()) + + FlintSparkSqlCommand(outputSchema) { flint => + Seq( + Row("all top-level columns", + List( + Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), + Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), + Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), + Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") + ), + List("binary_code", "map_column")), + Row("all top-level columns and nested columns", + List( + Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), + Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), + Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), + Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") + ), + List("array column", "decimal value")), + Row("first 32 columns", + List( + Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), + Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), + Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), + Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") + ), + List()) + ) + } + } + override def visitDropSkippingIndexStatement(ctx: DropSkippingIndexStatementContext): Command = FlintSparkSqlCommand() { flint => val indexName = getSkippingIndexName(flint, ctx.tableName) From 5ce393d67a3b54e47c1a6a1c542827fa70b53ce0 Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Thu, 14 Mar 2024 10:14:42 -0700 Subject: [PATCH 02/11] Add grammar for analyze skipping index Signed-off-by: Rupal Mahajan --- .../src/main/antlr4/FlintSparkSqlExtensions.g4 | 5 +++++ flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 | 1 + 2 files changed, 6 insertions(+) diff --git a/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 b/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 index 4ecef6a69..a971dc560 100644 --- a/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 +++ b/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 @@ -29,6 +29,7 @@ skippingIndexStatement | alterSkippingIndexStatement | dropSkippingIndexStatement | vacuumSkippingIndexStatement + | analyzeSkippingIndexStatement ; createSkippingIndexStatement @@ -105,6 +106,10 @@ vacuumCoveringIndexStatement : VACUUM INDEX indexName ON tableName ; +analyzeSkippingIndexStatement + : 
ANALYZE SKIPPING INDEX ON tableName
+    ;
+
 materializedViewStatement
     : createMaterializedViewStatement
     | refreshMaterializedViewStatement
diff --git a/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 b/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4
index cb58e97e7..8e316084b 100644
--- a/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4
+++ b/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4
@@ -156,6 +156,7 @@ DOT: '.';
 
 AS: 'AS';
 ALTER: 'ALTER';
+ANALYZE: 'ANALYZE';
 CREATE: 'CREATE';
 DESC: 'DESC';
 DESCRIBE: 'DESCRIBE';

From bba6823333eb0e16136f39c4c397fe79714f609f Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Thu, 14 Mar 2024 22:52:42 -0700
Subject: [PATCH 03/11] Add analyze skipping index function

Signed-off-by: Rupal Mahajan
---
 .../opensearch/flint/spark/FlintSpark.scala   | 41 ++++++++++++++++++-
 .../FlintSparkSkippingIndexAstBuilder.scala   | 34 +++------------
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
index fba818a0f..0bb4b6769 100644
--- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
+++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
@@ -21,7 +21,8 @@ import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex
 import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKindSerializer
 
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.spark.sql.flint.{loadTable, parseTableName}
 import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE
 import org.apache.spark.sql.flint.config.FlintSparkConf
 import org.apache.spark.sql.flint.config.FlintSparkConf.{DOC_ID_COLUMN_NAME, IGNORE_DOC_ID_COLUMN}
@@ -332,6 +333,44 @@ class FlintSpark(val spark: SparkSession) extends Logging {
     spark.read.format(FLINT_DATASOURCE).load(indexName)
   }
 
+  /**
+   * Recommend skipping index columns based on set of rules.
+ * + * @param tableName + * table name + * @return + * skipping index recommendation dataframe + */ + def analyzeSkippingIndex(tableName: String): Seq[Row] = { + require(tableName.nonEmpty, "Source table name is not provided") + + val (catalog, ident) = parseTableName(spark, tableName) + val table = loadTable(catalog, ident).getOrElse( + throw new IllegalStateException(s"Table $tableName is not found")) + + val partitionFields = table.partitioning().flatMap { + transform => transform.references().collect ({ + case reference => reference.fieldNames() + }).flatten.toSet + } + + table.schema().fields.map { + field => + if (partitionFields.contains(field.name)) { + Row(field.name, field.dataType.toString, "PARTITION", "PARTITION data structure is recommended for partition columns") + } else { + field.dataType.toString match { + case "BooleanType" => + Row(field.name, field.dataType.typeName, "VALUE_SET", "We suggest MIN_MAX for numerical data types") + case "IntegerType" | "LongType" | "ShortType" => + Row(field.name, field.dataType.typeName, "MIN_MAX", "We suggest MIN_MAX for numerical data types") + case "DateType" | "TimestampType" | "StringType" | "VarcharType" | "CharType" | "StructType" => + Row(field.name, field.dataType.typeName, "BLOOM_FILTER", "We suggest MIN_MAX for numerical data types") + } + } + }.toSeq + } + private def stopRefreshingJob(indexName: String): Unit = { logInfo(s"Terminating refreshing job $indexName") val job = spark.streams.active.find(_.name == indexName) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala index 68af6664e..795e2cbb9 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala @@ -100,37 +100,13 @@ trait FlintSparkSkippingIndexAstBuilder extends FlintSparkSqlExtensionsVisitor[A ctx: AnalyzeSkippingIndexStatementContext): Command = { val outputSchema = Seq( - AttributeReference("rule", StringType, nullable = false)(), - AttributeReference("recommendation", ArrayType(MapType(StringType, StringType)), nullable = false)(), - AttributeReference("unsupported columns", ArrayType(StringType), nullable = false)()) + AttributeReference("column_name", StringType, nullable = false)(), + AttributeReference("column_type", StringType, nullable = false)(), + AttributeReference("reason", StringType, nullable = false)(), + AttributeReference("skipping_type", StringType, nullable = false)()) FlintSparkSqlCommand(outputSchema) { flint => - Seq( - Row("all top-level columns", - List( - Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), - Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), - Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), - Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") - ), - List("binary_code", "map_column")), - Row("all top-level columns and nested columns", - List( - Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", 
"reason"->"top level partition column"), - Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), - Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), - Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") - ), - List("array column", "decimal value")), - Row("first 32 columns", - List( - Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), - Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), - Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), - Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") - ), - List()) - ) + flint.analyzeSkippingIndex(ctx.tableName().getText) } } From cc46bbd5de8458c725c6ecd808fc86cad3311e84 Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Fri, 15 Mar 2024 10:31:36 -0700 Subject: [PATCH 04/11] update analyze strategy Signed-off-by: Rupal Mahajan --- .../opensearch/flint/spark/FlintSpark.scala | 33 ++-------- .../AnalyzeSkippingStrategy.scala | 24 +++++++ .../DataTypeSkippingStrategy.scala | 63 +++++++++++++++++++ 3 files changed, 91 insertions(+), 29 deletions(-) create mode 100644 flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala create mode 100644 flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala index 0bb4b6769..b80e8c564 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala @@ -19,10 +19,10 @@ import org.opensearch.flint.spark.refresh.FlintSparkIndexRefresh import org.opensearch.flint.spark.refresh.FlintSparkIndexRefresh.RefreshMode.AUTO import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKindSerializer +import org.opensearch.flint.spark.skipping.recommendations.DataTypeSkippingStrategy import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Row, SparkSession} -import org.apache.spark.sql.flint.{loadTable, parseTableName} import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE import org.apache.spark.sql.flint.config.FlintSparkConf import org.apache.spark.sql.flint.config.FlintSparkConf.{DOC_ID_COLUMN_NAME, IGNORE_DOC_ID_COLUMN} @@ -334,7 +334,7 @@ class FlintSpark(val spark: SparkSession) extends Logging { } /** - * Recommend skipping index columns based on set of rules. + * Recommend skipping index columns and algorithm. 
* * @param tableName * table name @@ -342,33 +342,8 @@ class FlintSpark(val spark: SparkSession) extends Logging { * skipping index recommendation dataframe */ def analyzeSkippingIndex(tableName: String): Seq[Row] = { - require(tableName.nonEmpty, "Source table name is not provided") - - val (catalog, ident) = parseTableName(spark, tableName) - val table = loadTable(catalog, ident).getOrElse( - throw new IllegalStateException(s"Table $tableName is not found")) - - val partitionFields = table.partitioning().flatMap { - transform => transform.references().collect ({ - case reference => reference.fieldNames() - }).flatten.toSet - } - - table.schema().fields.map { - field => - if (partitionFields.contains(field.name)) { - Row(field.name, field.dataType.toString, "PARTITION", "PARTITION data structure is recommended for partition columns") - } else { - field.dataType.toString match { - case "BooleanType" => - Row(field.name, field.dataType.typeName, "VALUE_SET", "We suggest MIN_MAX for numerical data types") - case "IntegerType" | "LongType" | "ShortType" => - Row(field.name, field.dataType.typeName, "MIN_MAX", "We suggest MIN_MAX for numerical data types") - case "DateType" | "TimestampType" | "StringType" | "VarcharType" | "CharType" | "StructType" => - Row(field.name, field.dataType.typeName, "BLOOM_FILTER", "We suggest MIN_MAX for numerical data types") - } - } - }.toSeq + val recommendation = new DataTypeSkippingStrategy() + recommendation.analyzeSkippingIndexColumns(tableName, spark) } private def stopRefreshingJob(indexName: String): Unit = { diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala new file mode 100644 index 000000000..1f15f9b07 --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala @@ -0,0 +1,24 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.flint.spark.skipping.recommendations + +import org.apache.spark.sql.{Row, SparkSession} + +/** + * Automate skipping index column and algorithm selection. + */ +trait AnalyzeSkippingStrategy { + + /** + * Recommend skipping index columns and algorithm. 
+ * + * @param tableName + * table name + * @return + * skipping index recommendation dataframe + */ + def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession) : Seq[Row] + +} diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala new file mode 100644 index 000000000..111c09e46 --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala @@ -0,0 +1,63 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.skipping.recommendations + +import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKind.{BLOOM_FILTER, MIN_MAX, PARTITION, VALUE_SET} + +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.flint.{loadTable, parseTableName} + +class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { + + override def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession): Seq[Row] = { + require(tableName.nonEmpty, "Source table name is not provided") + + val (catalog, ident) = parseTableName(spark, tableName) + val table = loadTable(catalog, ident).getOrElse( + throw new IllegalStateException(s"Table $tableName is not found")) + + val partitionFields = table.partitioning().flatMap { + transform => + transform.references().collect({ + case reference => reference.fieldNames() + }).flatten.toSet + } + + table.schema().fields.map { + field => + if (partitionFields.contains(field.name)) { + Row(field.name, field.dataType.toString, PARTITION.toString, getRule(PARTITION.toString)) + } else { + val reason = getRule(field.dataType.toString) + field.dataType.toString match { + case "BooleanType" => + Row(field.name, field.dataType.typeName, VALUE_SET.toString, reason) + case "IntegerType" | "LongType" | "ShortType" => + Row(field.name, field.dataType.typeName, MIN_MAX.toString, reason) + case "DateType" | "TimestampType" | "StringType" | "VarcharType" | "CharType" | "StructType" => + Row(field.name, field.dataType.typeName, BLOOM_FILTER.toString, reason) + } + } + }.toSeq + } + + private def getRule(dataTypeName: String): String = { + dataTypeName match { + case "PARTITION" => "PARTITION data structure is recommended for partition columns" + case "BooleanType" => "VALUE_SET data structure is recommended for BooleanType columns" + case "IntegerType" => "MIN_MAX data structure is recommended for IntegerType columns" + case "LongType" => "MIN_MAX data structure is recommended for LongType columns" + case "ShortType" => "MIN_MAX data structure is recommended for ShortType columns" + case "DateType" => "MIN_MAX data structure is recommended for DateType columns" + case "TimestampType" => "MIN_MAX data structure is recommended for TimestampType columns" + case "StringType" => "MIN_MAX data structure is recommended for StringType columns" + case "VarcharType" => "MIN_MAX data structure is recommended for VarcharType columns" + case "CharType" => "MIN_MAX data structure is recommended for CharType columns" + case "StructType" => "MIN_MAX data structure is recommended for StructType columns" + } + } + +} From b04b0b879cbf1de1ed39a042a1f5554dd7db18fc Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Fri, 15 Mar 2024 15:16:12 -0700 Subject: [PATCH 05/11] Update recommendations Signed-off-by: 
Rupal Mahajan --- .../opensearch/flint/spark/FlintSpark.scala | 3 +- .../DataTypeSkippingStrategy.scala | 37 +++++++------------ 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala index b80e8c564..5322953d0 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala @@ -342,8 +342,7 @@ class FlintSpark(val spark: SparkSession) extends Logging { * skipping index recommendation dataframe */ def analyzeSkippingIndex(tableName: String): Seq[Row] = { - val recommendation = new DataTypeSkippingStrategy() - recommendation.analyzeSkippingIndexColumns(tableName, spark) + new DataTypeSkippingStrategy().analyzeSkippingIndexColumns(tableName, spark) } private def stopRefreshingJob(indexName: String): Unit = { diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala index 111c09e46..e672882ed 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala @@ -29,35 +29,26 @@ class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { table.schema().fields.map { field => if (partitionFields.contains(field.name)) { - Row(field.name, field.dataType.toString, PARTITION.toString, getRule(PARTITION.toString)) + Row(field.name, field.dataType.toString, getRecommendation("PARTITION")._1, getRecommendation("PARTITION")._2) } else { - val reason = getRule(field.dataType.toString) - field.dataType.toString match { - case "BooleanType" => - Row(field.name, field.dataType.typeName, VALUE_SET.toString, reason) - case "IntegerType" | "LongType" | "ShortType" => - Row(field.name, field.dataType.typeName, MIN_MAX.toString, reason) - case "DateType" | "TimestampType" | "StringType" | "VarcharType" | "CharType" | "StructType" => - Row(field.name, field.dataType.typeName, BLOOM_FILTER.toString, reason) - } + Row(field.name, field.dataType.toString, getRecommendation(field.dataType.typeName)._1, getRecommendation(field.dataType.toString)._2) } }.toSeq } - private def getRule(dataTypeName: String): String = { + private def getRecommendation(dataTypeName: String): (String, String) = { dataTypeName match { - case "PARTITION" => "PARTITION data structure is recommended for partition columns" - case "BooleanType" => "VALUE_SET data structure is recommended for BooleanType columns" - case "IntegerType" => "MIN_MAX data structure is recommended for IntegerType columns" - case "LongType" => "MIN_MAX data structure is recommended for LongType columns" - case "ShortType" => "MIN_MAX data structure is recommended for ShortType columns" - case "DateType" => "MIN_MAX data structure is recommended for DateType columns" - case "TimestampType" => "MIN_MAX data structure is recommended for TimestampType columns" - case "StringType" => "MIN_MAX data structure is recommended for StringType columns" - case "VarcharType" => "MIN_MAX data structure is recommended for VarcharType columns" - case "CharType" => "MIN_MAX data structure is recommended 
for CharType columns" - case "StructType" => "MIN_MAX data structure is recommended for StructType columns" + case "PARTITION" => (PARTITION.toString, "PARTITION data structure is recommended for partition columns") + case "BooleanType" => (VALUE_SET.toString, "VALUE_SET data structure is recommended for BooleanType columns") + case "IntegerType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for IntegerType columns") + case "LongType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for LongType columns") + case "ShortType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for ShortType columns") + case "DateType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for DateType columns") + case "TimestampType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for TimestampType columns") + case "StringType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StringType columns") + case "VarcharType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for VarcharType columns") + case "CharType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for CharType columns") + case "StructType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns") } } - } From 472a9628f716f615d9ed4f2dc4d1b325ebd0e148 Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Sun, 17 Mar 2024 18:00:08 -0700 Subject: [PATCH 06/11] Add test Signed-off-by: Rupal Mahajan --- .../DataTypeSkippingStrategy.scala | 41 +++++++++---------- .../FlintSparkSkippingIndexSqlITSuite.scala | 26 ++++++++++-- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala index e672882ed..7e2904726 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala @@ -6,15 +6,28 @@ package org.opensearch.flint.spark.skipping.recommendations import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKind.{BLOOM_FILTER, MIN_MAX, PARTITION, VALUE_SET} +import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.flint.{loadTable, parseTableName} class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { - override def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession): Seq[Row] = { - require(tableName.nonEmpty, "Source table name is not provided") + val rules = Map ( + "PARTITION" -> (PARTITION.toString, "PARTITION data structure is recommended for partition columns"), + "BooleanType" -> (VALUE_SET.toString, "VALUE_SET data structure is recommended for BooleanType columns"), + "IntegerType" -> (MIN_MAX.toString, "MIN_MAX data structure is recommended for IntegerType columns"), + "LongType" -> (MIN_MAX.toString, "MIN_MAX data structure is recommended for LongType columns"), + "ShortType" -> (MIN_MAX.toString, "MIN_MAX data structure is recommended for ShortType columns"), + "DateType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for DateType columns"), + "TimestampType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER 
data structure is recommended for TimestampType columns"), + "StringType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StringType columns"), + "VarcharType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for VarcharType columns"), + "CharType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for CharType columns"), + "StructType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns") + ) + override def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession): Seq[Row] = { val (catalog, ident) = parseTableName(spark, tableName) val table = loadTable(catalog, ident).getOrElse( throw new IllegalStateException(s"Table $tableName is not found")) @@ -26,29 +39,15 @@ class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { }).flatten.toSet } + val result = ArrayBuffer[Row]() table.schema().fields.map { field => if (partitionFields.contains(field.name)) { - Row(field.name, field.dataType.toString, getRecommendation("PARTITION")._1, getRecommendation("PARTITION")._2) - } else { - Row(field.name, field.dataType.toString, getRecommendation(field.dataType.typeName)._1, getRecommendation(field.dataType.toString)._2) + result += Row(field.name, field.dataType.typeName, rules("PARTITION")._1, rules("PARTITION")._2) + } else if (rules.contains(field.dataType.toString)) { + result += Row(field.name, field.dataType.typeName, rules(field.dataType.toString)._1, rules(field.dataType.toString)._2) } - }.toSeq - } - - private def getRecommendation(dataTypeName: String): (String, String) = { - dataTypeName match { - case "PARTITION" => (PARTITION.toString, "PARTITION data structure is recommended for partition columns") - case "BooleanType" => (VALUE_SET.toString, "VALUE_SET data structure is recommended for BooleanType columns") - case "IntegerType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for IntegerType columns") - case "LongType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for LongType columns") - case "ShortType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for ShortType columns") - case "DateType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for DateType columns") - case "TimestampType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for TimestampType columns") - case "StringType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StringType columns") - case "VarcharType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for VarcharType columns") - case "CharType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for CharType columns") - case "StructType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns") } + result } } diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala index ca14a555c..efdeea13d 100644 --- a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala +++ b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala @@ -7,19 +7,18 @@ package org.opensearch.flint.spark import scala.Option.empty import scala.collection.JavaConverters.{mapAsJavaMapConverter, mapAsScalaMapConverter} - import org.json4s.{Formats, NoTypeHints} import 
org.json4s.native.JsonMethods.parse
 import org.json4s.native.Serialization
 import org.opensearch.flint.core.FlintOptions
 import org.opensearch.flint.core.storage.FlintOpenSearchClient
 import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.getSkippingIndexName
-import org.scalatest.matchers.must.Matchers.defined
-import org.scalatest.matchers.should.Matchers.{convertToAnyShouldWrapper, the}
-
+import org.scalatest.matchers.must.Matchers.{be, defined}
+import org.scalatest.matchers.should.Matchers.{an, convertToAnyShouldWrapper, the}
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE
 import org.apache.spark.sql.flint.config.FlintSparkConf.CHECKPOINT_MANDATORY
+import org.mockito.IdiomaticMockito.thrown
 
 class FlintSparkSkippingIndexSqlITSuite extends FlintSparkSuite {
@@ -318,4 +317,23 @@ class FlintSparkSkippingIndexSqlITSuite extends FlintSparkSuite {
     sql(s"VACUUM SKIPPING INDEX ON $testTable")
     flint.describeIndex(testIndex) shouldBe empty
   }
+
+  test("analyze skipping index for supported data types") {
+    val result = sql(s"ANALYZE SKIPPING INDEX ON $testTable")
+
+    checkAnswer(
+      result,
+      Seq(
+        Row("year", "integer", "PARTITION", "PARTITION data structure is recommended for partition columns"),
+        Row("month", "integer", "PARTITION", "PARTITION data structure is recommended for partition columns"),
+        Row("name", "string", "BLOOM_FILTER", "BLOOM_FILTER data structure is recommended for StringType columns"),
+        Row("age", "integer", "MIN_MAX", "MIN_MAX data structure is recommended for IntegerType columns"),
+        Row("address", "string", "BLOOM_FILTER", "BLOOM_FILTER data structure is recommended for StringType columns")))
+  }
+
+  test("analyze skipping index on invalid table") {
+    the[IllegalStateException] thrownBy {
+      sql(s"ANALYZE SKIPPING INDEX ON testTable")
+    }
+  }
 }

From 37d3df33faf224d8b3f8572cc9d24fa043e16ad8 Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Sun, 17 Mar 2024 20:23:01 -0700
Subject: [PATCH 07/11] Format code

Signed-off-by: Rupal Mahajan
---
 .../opensearch/flint/spark/FlintSpark.scala   |  6 +--
 .../AnalyzeSkippingStrategy.scala             |  6 +--
 .../DataTypeSkippingStrategy.scala            | 42 ++++++++++++-------
 .../FlintSparkSkippingIndexSqlITSuite.scala   | 34 ++++++++++++---
 4 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
index 5322953d0..89e8dc423 100644
--- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
+++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
@@ -337,12 +337,12 @@ class FlintSpark(val spark: SparkSession) extends Logging {
    * Recommend skipping index columns and algorithm.
* * @param tableName - * table name + * table name * @return - * skipping index recommendation dataframe + * skipping index recommendation dataframe */ def analyzeSkippingIndex(tableName: String): Seq[Row] = { - new DataTypeSkippingStrategy().analyzeSkippingIndexColumns(tableName, spark) + new DataTypeSkippingStrategy().analyzeSkippingIndexColumns(tableName, spark) } private def stopRefreshingJob(indexName: String): Unit = { diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala index 1f15f9b07..cd73b3854 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala @@ -15,10 +15,10 @@ trait AnalyzeSkippingStrategy { * Recommend skipping index columns and algorithm. * * @param tableName - * table name + * table name * @return - * skipping index recommendation dataframe + * skipping index recommendation dataframe */ - def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession) : Seq[Row] + def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession): Seq[Row] } diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala index 7e2904726..1f9cc0b68 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala @@ -5,15 +5,16 @@ package org.opensearch.flint.spark.skipping.recommendations -import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKind.{BLOOM_FILTER, MIN_MAX, PARTITION, VALUE_SET} import scala.collection.mutable.ArrayBuffer +import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKind.{BLOOM_FILTER, MIN_MAX, PARTITION, VALUE_SET} + import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.flint.{loadTable, parseTableName} class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { - val rules = Map ( + val rules = Map( "PARTITION" -> (PARTITION.toString, "PARTITION data structure is recommended for partition columns"), "BooleanType" -> (VALUE_SET.toString, "VALUE_SET data structure is recommended for BooleanType columns"), "IntegerType" -> (MIN_MAX.toString, "MIN_MAX data structure is recommended for IntegerType columns"), @@ -24,29 +25,38 @@ class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { "StringType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StringType columns"), "VarcharType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for VarcharType columns"), "CharType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for CharType columns"), - "StructType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns") - ) + "StructType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns")) override def analyzeSkippingIndexColumns(tableName: 
String, spark: SparkSession): Seq[Row] = { val (catalog, ident) = parseTableName(spark, tableName) val table = loadTable(catalog, ident).getOrElse( throw new IllegalStateException(s"Table $tableName is not found")) - val partitionFields = table.partitioning().flatMap { - transform => - transform.references().collect({ - case reference => reference.fieldNames() - }).flatten.toSet + val partitionFields = table.partitioning().flatMap { transform => + transform + .references() + .collect({ case reference => + reference.fieldNames() + }) + .flatten + .toSet } val result = ArrayBuffer[Row]() - table.schema().fields.map { - field => - if (partitionFields.contains(field.name)) { - result += Row(field.name, field.dataType.typeName, rules("PARTITION")._1, rules("PARTITION")._2) - } else if (rules.contains(field.dataType.toString)) { - result += Row(field.name, field.dataType.typeName, rules(field.dataType.toString)._1, rules(field.dataType.toString)._2) - } + table.schema().fields.map { field => + if (partitionFields.contains(field.name)) { + result += Row( + field.name, + field.dataType.typeName, + rules("PARTITION")._1, + rules("PARTITION")._2) + } else if (rules.contains(field.dataType.toString)) { + result += Row( + field.name, + field.dataType.typeName, + rules(field.dataType.toString)._1, + rules(field.dataType.toString)._2) + } } result } diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala index efdeea13d..f079cf02d 100644 --- a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala +++ b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala @@ -7,18 +7,20 @@ package org.opensearch.flint.spark import scala.Option.empty import scala.collection.JavaConverters.{mapAsJavaMapConverter, mapAsScalaMapConverter} + import org.json4s.{Formats, NoTypeHints} import org.json4s.native.JsonMethods.parse import org.json4s.native.Serialization +import org.mockito.IdiomaticMockito.thrown import org.opensearch.flint.core.FlintOptions import org.opensearch.flint.core.storage.FlintOpenSearchClient import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.getSkippingIndexName import org.scalatest.matchers.must.Matchers.{be, defined} import org.scalatest.matchers.should.Matchers.{an, convertToAnyShouldWrapper, the} + import org.apache.spark.sql.Row import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE import org.apache.spark.sql.flint.config.FlintSparkConf.CHECKPOINT_MANDATORY -import org.mockito.IdiomaticMockito.thrown class FlintSparkSkippingIndexSqlITSuite extends FlintSparkSuite { @@ -324,11 +326,31 @@ class FlintSparkSkippingIndexSqlITSuite extends FlintSparkSuite { checkAnswer( result, Seq( - Row("year", "integer", "PARTITION", "PARTITION data structure is recommended for partition columns"), - Row("month", "integer", "PARTITION", "PARTITION data structure is recommended for partition columns"), - Row("name", "string", "BLOOM_FILTER", "BLOOM_FILTER data structure is recommended for StringType columns"), - Row("age", "integer", "MIN_MAX", "MIN_MAX data structure is recommended for IntegerType columns"), - Row("address", "string", "BLOOM_FILTER", "BLOOM_FILTER data structure is recommended for StringType columns"))) + Row( + "year", + "integer", + "PARTITION", + "PARTITION data structure is recommended for partition columns"), + Row( + "month", + "integer", + 
"PARTITION", + "PARTITION data structure is recommended for partition columns"), + Row( + "name", + "string", + "BLOOM_FILTER", + "BLOOM_FILTER data structure is recommended for StringType columns"), + Row( + "age", + "integer", + "MIN_MAX", + "MIN_MAX data structure is recommended for IntegerType columns"), + Row( + "address", + "string", + "BLOOM_FILTER", + "BLOOM_FILTER data structure is recommended for StringType columns"))) } test("analyze skipping index on invalid table") { From 2fdf421626bedb8df5f25537032a1c223536c1cb Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Sun, 17 Mar 2024 21:00:15 -0700 Subject: [PATCH 08/11] Remove unused import Signed-off-by: Rupal Mahajan --- .../sql/skipping/FlintSparkSkippingIndexAstBuilder.scala | 2 +- .../flint/spark/FlintSparkSkippingIndexSqlITSuite.scala | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala index 795e2cbb9..b0c449fea 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala @@ -20,7 +20,7 @@ import org.opensearch.flint.spark.sql.FlintSparkSqlExtensionsParser._ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.Command -import org.apache.spark.sql.types.{ArrayType, MapType, StringType} +import org.apache.spark.sql.types.StringType /** * Flint Spark AST builder that builds Spark command for Flint skipping index statement. 
diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala
index f079cf02d..eeb467bf6 100644
--- a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala
+++ b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala
@@ -11,12 +11,11 @@ import scala.collection.JavaConverters.{mapAsJavaMapConverter, mapAsScalaMapConv
 import org.json4s.{Formats, NoTypeHints}
 import org.json4s.native.JsonMethods.parse
 import org.json4s.native.Serialization
-import org.mockito.IdiomaticMockito.thrown
 import org.opensearch.flint.core.FlintOptions
 import org.opensearch.flint.core.storage.FlintOpenSearchClient
 import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.getSkippingIndexName
-import org.scalatest.matchers.must.Matchers.{be, defined}
+import org.scalatest.matchers.must.Matchers.defined
-import org.scalatest.matchers.should.Matchers.{an, convertToAnyShouldWrapper, the}
+import org.scalatest.matchers.should.Matchers.{convertToAnyShouldWrapper, the}

From 54d7ee6c1d9d9e88376878549959313456c6d77b Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Mon, 18 Mar 2024 08:46:36 -0700
Subject: [PATCH 09/11] Update doc

Signed-off-by: Rupal Mahajan
---
 docs/index.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/index.md b/docs/index.md
index 31147aed4..f5674c537 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -170,6 +170,8 @@ DROP SKIPPING INDEX ON <object>
 
 VACUUM SKIPPING INDEX ON <object>
 
+ANALYZE SKIPPING INDEX ON <object>
+
 <object> ::= [db_name].[schema_name].table_name
 ```

From e289a9c18d1a5aed76afd8cdf583b4c2de171233 Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Mon, 18 Mar 2024 11:47:33 -0700
Subject: [PATCH 10/11] Update doc

Signed-off-by: Rupal Mahajan
---
 docs/index.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/docs/index.md b/docs/index.md
index 6fc2f7b0c..e8067ce3d 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -321,6 +321,31 @@ fetched rows / total rows = 3/3
 +-------------------------------------------------------------+----------+----------+-----------+-----------------+--------------+------------+
 ```
 
+- **Analyze Skipping Indexes**: Provides recommendation for creating skipping index.
It outputs the following columns:
+  - column_name: recommended column's name
+  - column_type: recommended column's type
+  - skipping_type: recommended skipping type for column
+  - reason: why this skipping type is recommended
+
+```sql
+ANALYZE SKIPPING INDEX ON [catalog.database.]table
+```
+
+Example:
+```
+sql> ANALYZE SKIPPING INDEX ON alb_logs;
+fetched rows / total rows = 5/5
++-------------------------+-------------+---------------+-------------------------------------------------------------------+
+| column_name             | column_type | skipping_type | reason                                                            |
+|-------------------------+-------------+---------------+-------------------------------------------------------------------+
+| year                    | integer     | PARTITION     | PARTITION data structure is recommended for partition columns     |
+| month                   | integer     | PARTITION     | PARTITION data structure is recommended for partition columns     |
+| day                     | integer     | PARTITION     | PARTITION data structure is recommended for partition columns     |
+| request_processing_time | integer     | MIN_MAX       | MIN_MAX data structure is recommended for IntegerType columns     |
+| client_ip               | string      | BLOOM_FILTER  | BLOOM_FILTER data structure is recommended for StringType columns |
++-------------------------+-------------+---------------+-------------------------------------------------------------------+
+```
+
 #### Create Index Options
 
 User can provide the following options in `WITH` clause of create statement:

From a720116b5b26243bf891c7b504c28d0565387ea7 Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Mon, 18 Mar 2024 11:49:01 -0700
Subject: [PATCH 11/11] nit

Signed-off-by: Rupal Mahajan
---
 docs/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/index.md b/docs/index.md
index e8067ce3d..3b1e2efe5 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -321,7 +321,7 @@ fetched rows / total rows = 3/3
 +-------------------------------------------------------------+----------+----------+-----------+-----------------+--------------+------------+
 ```
 
-- **Analyze Skipping Indexes**: Provides recommendation for creating skipping index. It outputs the following columns:
+- **Analyze Skipping Index**: Provides recommendations for creating a skipping index. It outputs the following columns:
   - column_name: recommended column's name
   - column_type: recommended column's type
   - skipping_type: recommended skipping type for column