From f66994a1c630f74c3ef0548bfb0546f031f34d95 Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Tue, 12 Mar 2024 13:59:30 -0700 Subject: [PATCH 01/11] dummy result test Signed-off-by: Rupal Mahajan --- .../main/antlr4/FlintSparkSqlExtensions.g4 | 5 +++ .../src/main/antlr4/SparkSqlBase.g4 | 1 + .../FlintSparkSkippingIndexAstBuilder.scala | 40 ++++++++++++++++++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 b/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 index 219bbe782..c12ac8b5a 100644 --- a/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 +++ b/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 @@ -28,6 +28,11 @@ skippingIndexStatement | describeSkippingIndexStatement | dropSkippingIndexStatement | vacuumSkippingIndexStatement + | analyzeSkippingIndexStatement + ; + +analyzeSkippingIndexStatement + : ANALYZE SKIPPING INDEX ON tableName ; createSkippingIndexStatement diff --git a/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 b/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 index 01f45016d..5c649d657 100644 --- a/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 +++ b/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 @@ -155,6 +155,7 @@ DOT: '.'; AS: 'AS'; +ANALYZE: 'ANALYZE'; CREATE: 'CREATE'; DESC: 'DESC'; DESCRIBE: 'DESCRIBE'; diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala index 9b638f36f..68af6664e 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala @@ -20,7 +20,7 @@ import org.opensearch.flint.spark.sql.FlintSparkSqlExtensionsParser._ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.Command -import org.apache.spark.sql.types.StringType +import org.apache.spark.sql.types.{ArrayType, MapType, StringType} /** * Flint Spark AST builder that builds Spark command for Flint skipping index statement. 
@@ -96,6 +96,44 @@ trait FlintSparkSkippingIndexAstBuilder extends FlintSparkSqlExtensionsVisitor[A } } + override def visitAnalyzeSkippingIndexStatement( + ctx: AnalyzeSkippingIndexStatementContext): Command = { + + val outputSchema = Seq( + AttributeReference("rule", StringType, nullable = false)(), + AttributeReference("recommendation", ArrayType(MapType(StringType, StringType)), nullable = false)(), + AttributeReference("unsupported columns", ArrayType(StringType), nullable = false)()) + + FlintSparkSqlCommand(outputSchema) { flint => + Seq( + Row("all top-level columns", + List( + Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), + Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), + Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), + Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") + ), + List("binary_code", "map_column")), + Row("all top-level columns and nested columns", + List( + Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), + Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), + Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), + Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") + ), + List("array column", "decimal value")), + Row("first 32 columns", + List( + Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), + Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), + Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), + Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") + ), + List()) + ) + } + } + override def visitDropSkippingIndexStatement(ctx: DropSkippingIndexStatementContext): Command = FlintSparkSqlCommand() { flint => val indexName = getSkippingIndexName(flint, ctx.tableName) From 5ce393d67a3b54e47c1a6a1c542827fa70b53ce0 Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Thu, 14 Mar 2024 10:14:42 -0700 Subject: [PATCH 02/11] Add grammar for analyze skipping index Signed-off-by: Rupal Mahajan --- .../src/main/antlr4/FlintSparkSqlExtensions.g4 | 5 +++++ flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 | 1 + 2 files changed, 6 insertions(+) diff --git a/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 b/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 index 4ecef6a69..a971dc560 100644 --- a/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 +++ b/flint-spark-integration/src/main/antlr4/FlintSparkSqlExtensions.g4 @@ -29,6 +29,7 @@ skippingIndexStatement | alterSkippingIndexStatement | dropSkippingIndexStatement | vacuumSkippingIndexStatement + | analyzeSkippingIndexStatement ; createSkippingIndexStatement @@ -105,6 +106,10 @@ vacuumCoveringIndexStatement : VACUUM INDEX indexName ON tableName ; +analyzeSkippingIndexStatement + : 
ANALYZE SKIPPING INDEX ON tableName
+    ;
+
 materializedViewStatement
     : createMaterializedViewStatement
     | refreshMaterializedViewStatement
diff --git a/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4 b/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4
index cb58e97e7..8e316084b 100644
--- a/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4
+++ b/flint-spark-integration/src/main/antlr4/SparkSqlBase.g4
@@ -156,6 +156,7 @@ DOT: '.';
 
 AS: 'AS';
 ALTER: 'ALTER';
+ANALYZE: 'ANALYZE';
 CREATE: 'CREATE';
 DESC: 'DESC';
 DESCRIBE: 'DESCRIBE';

From bba6823333eb0e16136f39c4c397fe79714f609f Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Thu, 14 Mar 2024 22:52:42 -0700
Subject: [PATCH 03/11] Add analyze skipping index function

Signed-off-by: Rupal Mahajan
---
 .../opensearch/flint/spark/FlintSpark.scala   | 41 ++++++++++++++++++-
 .../FlintSparkSkippingIndexAstBuilder.scala   | 34 +++------------
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
index fba818a0f..0bb4b6769 100644
--- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
+++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
@@ -21,7 +21,8 @@ import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex
 import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKindSerializer
 
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.{DataFrame, SparkSession}
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.spark.sql.flint.{loadTable, parseTableName}
 import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE
 import org.apache.spark.sql.flint.config.FlintSparkConf
 import org.apache.spark.sql.flint.config.FlintSparkConf.{DOC_ID_COLUMN_NAME, IGNORE_DOC_ID_COLUMN}
@@ -332,6 +333,44 @@ class FlintSpark(val spark: SparkSession) extends Logging {
     spark.read.format(FLINT_DATASOURCE).load(indexName)
   }
 
+  /**
+   * Recommend skipping index columns based on set of rules.
+ * + * @param tableName + * table name + * @return + * skipping index recommendation dataframe + */ + def analyzeSkippingIndex(tableName: String): Seq[Row] = { + require(tableName.nonEmpty, "Source table name is not provided") + + val (catalog, ident) = parseTableName(spark, tableName) + val table = loadTable(catalog, ident).getOrElse( + throw new IllegalStateException(s"Table $tableName is not found")) + + val partitionFields = table.partitioning().flatMap { + transform => transform.references().collect ({ + case reference => reference.fieldNames() + }).flatten.toSet + } + + table.schema().fields.map { + field => + if (partitionFields.contains(field.name)) { + Row(field.name, field.dataType.toString, "PARTITION", "PARTITION data structure is recommended for partition columns") + } else { + field.dataType.toString match { + case "BooleanType" => + Row(field.name, field.dataType.typeName, "VALUE_SET", "We suggest MIN_MAX for numerical data types") + case "IntegerType" | "LongType" | "ShortType" => + Row(field.name, field.dataType.typeName, "MIN_MAX", "We suggest MIN_MAX for numerical data types") + case "DateType" | "TimestampType" | "StringType" | "VarcharType" | "CharType" | "StructType" => + Row(field.name, field.dataType.typeName, "BLOOM_FILTER", "We suggest MIN_MAX for numerical data types") + } + } + }.toSeq + } + private def stopRefreshingJob(indexName: String): Unit = { logInfo(s"Terminating refreshing job $indexName") val job = spark.streams.active.find(_.name == indexName) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala index 68af6664e..795e2cbb9 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala @@ -100,37 +100,13 @@ trait FlintSparkSkippingIndexAstBuilder extends FlintSparkSqlExtensionsVisitor[A ctx: AnalyzeSkippingIndexStatementContext): Command = { val outputSchema = Seq( - AttributeReference("rule", StringType, nullable = false)(), - AttributeReference("recommendation", ArrayType(MapType(StringType, StringType)), nullable = false)(), - AttributeReference("unsupported columns", ArrayType(StringType), nullable = false)()) + AttributeReference("column_name", StringType, nullable = false)(), + AttributeReference("column_type", StringType, nullable = false)(), + AttributeReference("reason", StringType, nullable = false)(), + AttributeReference("skipping_type", StringType, nullable = false)()) FlintSparkSqlCommand(outputSchema) { flint => - Seq( - Row("all top-level columns", - List( - Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), - Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), - Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), - Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") - ), - List("binary_code", "map_column")), - Row("all top-level columns and nested columns", - List( - Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", 
"reason"->"top level partition column"), - Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), - Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), - Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") - ), - List("array column", "decimal value")), - Row("first 32 columns", - List( - Map("column_name"->"year", "column_type"->"DateType", "skipping_type"->"PARTITION", "reason"->"top level partition column"), - Map("column_name"->"month", "column_type"->"StringType", "skipping_type"->"BLOOMFILTER", "reason"->"top level string column"), - Map("column_name"->"day", "column_type"->"IntegerType", "skipping_type"->"MIN_MAX", "reason"->"top level integer column"), - Map("column_name"->"hour", "column_type"->"TimestampType", "skipping_type"->"PARTITION", "reason"->"top level partition column") - ), - List()) - ) + flint.analyzeSkippingIndex(ctx.tableName().getText) } } From cc46bbd5de8458c725c6ecd808fc86cad3311e84 Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Fri, 15 Mar 2024 10:31:36 -0700 Subject: [PATCH 04/11] update analyze strategy Signed-off-by: Rupal Mahajan --- .../opensearch/flint/spark/FlintSpark.scala | 33 ++-------- .../AnalyzeSkippingStrategy.scala | 24 +++++++ .../DataTypeSkippingStrategy.scala | 63 +++++++++++++++++++ 3 files changed, 91 insertions(+), 29 deletions(-) create mode 100644 flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala create mode 100644 flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala index 0bb4b6769..b80e8c564 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala @@ -19,10 +19,10 @@ import org.opensearch.flint.spark.refresh.FlintSparkIndexRefresh import org.opensearch.flint.spark.refresh.FlintSparkIndexRefresh.RefreshMode.AUTO import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKindSerializer +import org.opensearch.flint.spark.skipping.recommendations.DataTypeSkippingStrategy import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Row, SparkSession} -import org.apache.spark.sql.flint.{loadTable, parseTableName} import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE import org.apache.spark.sql.flint.config.FlintSparkConf import org.apache.spark.sql.flint.config.FlintSparkConf.{DOC_ID_COLUMN_NAME, IGNORE_DOC_ID_COLUMN} @@ -334,7 +334,7 @@ class FlintSpark(val spark: SparkSession) extends Logging { } /** - * Recommend skipping index columns based on set of rules. + * Recommend skipping index columns and algorithm. 
* * @param tableName * table name @@ -342,33 +342,8 @@ class FlintSpark(val spark: SparkSession) extends Logging { * skipping index recommendation dataframe */ def analyzeSkippingIndex(tableName: String): Seq[Row] = { - require(tableName.nonEmpty, "Source table name is not provided") - - val (catalog, ident) = parseTableName(spark, tableName) - val table = loadTable(catalog, ident).getOrElse( - throw new IllegalStateException(s"Table $tableName is not found")) - - val partitionFields = table.partitioning().flatMap { - transform => transform.references().collect ({ - case reference => reference.fieldNames() - }).flatten.toSet - } - - table.schema().fields.map { - field => - if (partitionFields.contains(field.name)) { - Row(field.name, field.dataType.toString, "PARTITION", "PARTITION data structure is recommended for partition columns") - } else { - field.dataType.toString match { - case "BooleanType" => - Row(field.name, field.dataType.typeName, "VALUE_SET", "We suggest MIN_MAX for numerical data types") - case "IntegerType" | "LongType" | "ShortType" => - Row(field.name, field.dataType.typeName, "MIN_MAX", "We suggest MIN_MAX for numerical data types") - case "DateType" | "TimestampType" | "StringType" | "VarcharType" | "CharType" | "StructType" => - Row(field.name, field.dataType.typeName, "BLOOM_FILTER", "We suggest MIN_MAX for numerical data types") - } - } - }.toSeq + val recommendation = new DataTypeSkippingStrategy() + recommendation.analyzeSkippingIndexColumns(tableName, spark) } private def stopRefreshingJob(indexName: String): Unit = { diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala new file mode 100644 index 000000000..1f15f9b07 --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala @@ -0,0 +1,24 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ +package org.opensearch.flint.spark.skipping.recommendations + +import org.apache.spark.sql.{Row, SparkSession} + +/** + * Automate skipping index column and algorithm selection. + */ +trait AnalyzeSkippingStrategy { + + /** + * Recommend skipping index columns and algorithm. 
+ * + * @param tableName + * table name + * @return + * skipping index recommendation dataframe + */ + def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession) : Seq[Row] + +} diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala new file mode 100644 index 000000000..111c09e46 --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala @@ -0,0 +1,63 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.skipping.recommendations + +import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKind.{BLOOM_FILTER, MIN_MAX, PARTITION, VALUE_SET} + +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.flint.{loadTable, parseTableName} + +class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { + + override def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession): Seq[Row] = { + require(tableName.nonEmpty, "Source table name is not provided") + + val (catalog, ident) = parseTableName(spark, tableName) + val table = loadTable(catalog, ident).getOrElse( + throw new IllegalStateException(s"Table $tableName is not found")) + + val partitionFields = table.partitioning().flatMap { + transform => + transform.references().collect({ + case reference => reference.fieldNames() + }).flatten.toSet + } + + table.schema().fields.map { + field => + if (partitionFields.contains(field.name)) { + Row(field.name, field.dataType.toString, PARTITION.toString, getRule(PARTITION.toString)) + } else { + val reason = getRule(field.dataType.toString) + field.dataType.toString match { + case "BooleanType" => + Row(field.name, field.dataType.typeName, VALUE_SET.toString, reason) + case "IntegerType" | "LongType" | "ShortType" => + Row(field.name, field.dataType.typeName, MIN_MAX.toString, reason) + case "DateType" | "TimestampType" | "StringType" | "VarcharType" | "CharType" | "StructType" => + Row(field.name, field.dataType.typeName, BLOOM_FILTER.toString, reason) + } + } + }.toSeq + } + + private def getRule(dataTypeName: String): String = { + dataTypeName match { + case "PARTITION" => "PARTITION data structure is recommended for partition columns" + case "BooleanType" => "VALUE_SET data structure is recommended for BooleanType columns" + case "IntegerType" => "MIN_MAX data structure is recommended for IntegerType columns" + case "LongType" => "MIN_MAX data structure is recommended for LongType columns" + case "ShortType" => "MIN_MAX data structure is recommended for ShortType columns" + case "DateType" => "MIN_MAX data structure is recommended for DateType columns" + case "TimestampType" => "MIN_MAX data structure is recommended for TimestampType columns" + case "StringType" => "MIN_MAX data structure is recommended for StringType columns" + case "VarcharType" => "MIN_MAX data structure is recommended for VarcharType columns" + case "CharType" => "MIN_MAX data structure is recommended for CharType columns" + case "StructType" => "MIN_MAX data structure is recommended for StructType columns" + } + } + +} From b04b0b879cbf1de1ed39a042a1f5554dd7db18fc Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Fri, 15 Mar 2024 15:16:12 -0700 Subject: [PATCH 05/11] Update recommendations Signed-off-by: 
Rupal Mahajan --- .../opensearch/flint/spark/FlintSpark.scala | 3 +- .../DataTypeSkippingStrategy.scala | 37 +++++++------------ 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala index b80e8c564..5322953d0 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala @@ -342,8 +342,7 @@ class FlintSpark(val spark: SparkSession) extends Logging { * skipping index recommendation dataframe */ def analyzeSkippingIndex(tableName: String): Seq[Row] = { - val recommendation = new DataTypeSkippingStrategy() - recommendation.analyzeSkippingIndexColumns(tableName, spark) + new DataTypeSkippingStrategy().analyzeSkippingIndexColumns(tableName, spark) } private def stopRefreshingJob(indexName: String): Unit = { diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala index 111c09e46..e672882ed 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala @@ -29,35 +29,26 @@ class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { table.schema().fields.map { field => if (partitionFields.contains(field.name)) { - Row(field.name, field.dataType.toString, PARTITION.toString, getRule(PARTITION.toString)) + Row(field.name, field.dataType.toString, getRecommendation("PARTITION")._1, getRecommendation("PARTITION")._2) } else { - val reason = getRule(field.dataType.toString) - field.dataType.toString match { - case "BooleanType" => - Row(field.name, field.dataType.typeName, VALUE_SET.toString, reason) - case "IntegerType" | "LongType" | "ShortType" => - Row(field.name, field.dataType.typeName, MIN_MAX.toString, reason) - case "DateType" | "TimestampType" | "StringType" | "VarcharType" | "CharType" | "StructType" => - Row(field.name, field.dataType.typeName, BLOOM_FILTER.toString, reason) - } + Row(field.name, field.dataType.toString, getRecommendation(field.dataType.typeName)._1, getRecommendation(field.dataType.toString)._2) } }.toSeq } - private def getRule(dataTypeName: String): String = { + private def getRecommendation(dataTypeName: String): (String, String) = { dataTypeName match { - case "PARTITION" => "PARTITION data structure is recommended for partition columns" - case "BooleanType" => "VALUE_SET data structure is recommended for BooleanType columns" - case "IntegerType" => "MIN_MAX data structure is recommended for IntegerType columns" - case "LongType" => "MIN_MAX data structure is recommended for LongType columns" - case "ShortType" => "MIN_MAX data structure is recommended for ShortType columns" - case "DateType" => "MIN_MAX data structure is recommended for DateType columns" - case "TimestampType" => "MIN_MAX data structure is recommended for TimestampType columns" - case "StringType" => "MIN_MAX data structure is recommended for StringType columns" - case "VarcharType" => "MIN_MAX data structure is recommended for VarcharType columns" - case "CharType" => "MIN_MAX data structure is recommended 
for CharType columns" - case "StructType" => "MIN_MAX data structure is recommended for StructType columns" + case "PARTITION" => (PARTITION.toString, "PARTITION data structure is recommended for partition columns") + case "BooleanType" => (VALUE_SET.toString, "VALUE_SET data structure is recommended for BooleanType columns") + case "IntegerType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for IntegerType columns") + case "LongType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for LongType columns") + case "ShortType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for ShortType columns") + case "DateType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for DateType columns") + case "TimestampType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for TimestampType columns") + case "StringType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StringType columns") + case "VarcharType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for VarcharType columns") + case "CharType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for CharType columns") + case "StructType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns") } } - } From 472a9628f716f615d9ed4f2dc4d1b325ebd0e148 Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Sun, 17 Mar 2024 18:00:08 -0700 Subject: [PATCH 06/11] Add test Signed-off-by: Rupal Mahajan --- .../DataTypeSkippingStrategy.scala | 41 +++++++++---------- .../FlintSparkSkippingIndexSqlITSuite.scala | 26 ++++++++++-- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala index e672882ed..7e2904726 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala @@ -6,15 +6,28 @@ package org.opensearch.flint.spark.skipping.recommendations import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKind.{BLOOM_FILTER, MIN_MAX, PARTITION, VALUE_SET} +import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.flint.{loadTable, parseTableName} class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { - override def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession): Seq[Row] = { - require(tableName.nonEmpty, "Source table name is not provided") + val rules = Map ( + "PARTITION" -> (PARTITION.toString, "PARTITION data structure is recommended for partition columns"), + "BooleanType" -> (VALUE_SET.toString, "VALUE_SET data structure is recommended for BooleanType columns"), + "IntegerType" -> (MIN_MAX.toString, "MIN_MAX data structure is recommended for IntegerType columns"), + "LongType" -> (MIN_MAX.toString, "MIN_MAX data structure is recommended for LongType columns"), + "ShortType" -> (MIN_MAX.toString, "MIN_MAX data structure is recommended for ShortType columns"), + "DateType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for DateType columns"), + "TimestampType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER 
data structure is recommended for TimestampType columns"), + "StringType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StringType columns"), + "VarcharType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for VarcharType columns"), + "CharType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for CharType columns"), + "StructType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns") + ) + override def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession): Seq[Row] = { val (catalog, ident) = parseTableName(spark, tableName) val table = loadTable(catalog, ident).getOrElse( throw new IllegalStateException(s"Table $tableName is not found")) @@ -26,29 +39,15 @@ class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { }).flatten.toSet } + val result = ArrayBuffer[Row]() table.schema().fields.map { field => if (partitionFields.contains(field.name)) { - Row(field.name, field.dataType.toString, getRecommendation("PARTITION")._1, getRecommendation("PARTITION")._2) - } else { - Row(field.name, field.dataType.toString, getRecommendation(field.dataType.typeName)._1, getRecommendation(field.dataType.toString)._2) + result += Row(field.name, field.dataType.typeName, rules("PARTITION")._1, rules("PARTITION")._2) + } else if (rules.contains(field.dataType.toString)) { + result += Row(field.name, field.dataType.typeName, rules(field.dataType.toString)._1, rules(field.dataType.toString)._2) } - }.toSeq - } - - private def getRecommendation(dataTypeName: String): (String, String) = { - dataTypeName match { - case "PARTITION" => (PARTITION.toString, "PARTITION data structure is recommended for partition columns") - case "BooleanType" => (VALUE_SET.toString, "VALUE_SET data structure is recommended for BooleanType columns") - case "IntegerType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for IntegerType columns") - case "LongType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for LongType columns") - case "ShortType" => (MIN_MAX.toString, "MIN_MAX data structure is recommended for ShortType columns") - case "DateType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for DateType columns") - case "TimestampType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for TimestampType columns") - case "StringType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StringType columns") - case "VarcharType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for VarcharType columns") - case "CharType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for CharType columns") - case "StructType" => (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns") } + result } } diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala index ca14a555c..efdeea13d 100644 --- a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala +++ b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala @@ -7,19 +7,18 @@ package org.opensearch.flint.spark import scala.Option.empty import scala.collection.JavaConverters.{mapAsJavaMapConverter, mapAsScalaMapConverter} - import org.json4s.{Formats, NoTypeHints} import 
org.json4s.native.JsonMethods.parse
 import org.json4s.native.Serialization
 import org.opensearch.flint.core.FlintOptions
 import org.opensearch.flint.core.storage.FlintOpenSearchClient
 import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.getSkippingIndexName
-import org.scalatest.matchers.must.Matchers.defined
-import org.scalatest.matchers.should.Matchers.{convertToAnyShouldWrapper, the}
-
+import org.scalatest.matchers.must.Matchers.{be, defined}
+import org.scalatest.matchers.should.Matchers.{an, convertToAnyShouldWrapper, the}
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE
 import org.apache.spark.sql.flint.config.FlintSparkConf.CHECKPOINT_MANDATORY
+import org.mockito.IdiomaticMockito.thrown
 
 class FlintSparkSkippingIndexSqlITSuite extends FlintSparkSuite {
@@ -318,4 +317,23 @@ class FlintSparkSkippingIndexSqlITSuite extends FlintSparkSuite {
     sql(s"VACUUM SKIPPING INDEX ON $testTable")
     flint.describeIndex(testIndex) shouldBe empty
   }
+
+  test("analyze skipping index for supported data types") {
+    val result = sql(s"ANALYZE SKIPPING INDEX ON $testTable")
+
+    checkAnswer(
+      result,
+      Seq(
+        Row("year", "integer", "PARTITION", "PARTITION data structure is recommended for partition columns"),
+        Row("month", "integer", "PARTITION", "PARTITION data structure is recommended for partition columns"),
+        Row("name", "string", "BLOOM_FILTER", "BLOOM_FILTER data structure is recommended for StringType columns"),
+        Row("age", "integer", "MIN_MAX", "MIN_MAX data structure is recommended for IntegerType columns"),
+        Row("address", "string", "BLOOM_FILTER", "BLOOM_FILTER data structure is recommended for StringType columns")))
+  }
+
+  test("analyze skipping index on invalid table") {
+    the[IllegalStateException] thrownBy {
+      sql(s"ANALYZE SKIPPING INDEX ON testTable")
+    }
+  }
 }

From 37d3df33faf224d8b3f8572cc9d24fa043e16ad8 Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Sun, 17 Mar 2024 20:23:01 -0700
Subject: [PATCH 07/11] Format code

Signed-off-by: Rupal Mahajan
---
 .../opensearch/flint/spark/FlintSpark.scala   |  6 +--
 .../AnalyzeSkippingStrategy.scala             |  6 +--
 .../DataTypeSkippingStrategy.scala            | 42 ++++++++++++-------
 .../FlintSparkSkippingIndexSqlITSuite.scala   | 34 ++++++++++++---
 4 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
index 5322953d0..89e8dc423 100644
--- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
+++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
@@ -337,12 +337,12 @@ class FlintSpark(val spark: SparkSession) extends Logging {
    * Recommend skipping index columns and algorithm.
* * @param tableName - * table name + * table name * @return - * skipping index recommendation dataframe + * skipping index recommendation dataframe */ def analyzeSkippingIndex(tableName: String): Seq[Row] = { - new DataTypeSkippingStrategy().analyzeSkippingIndexColumns(tableName, spark) + new DataTypeSkippingStrategy().analyzeSkippingIndexColumns(tableName, spark) } private def stopRefreshingJob(indexName: String): Unit = { diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala index 1f15f9b07..cd73b3854 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/AnalyzeSkippingStrategy.scala @@ -15,10 +15,10 @@ trait AnalyzeSkippingStrategy { * Recommend skipping index columns and algorithm. * * @param tableName - * table name + * table name * @return - * skipping index recommendation dataframe + * skipping index recommendation dataframe */ - def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession) : Seq[Row] + def analyzeSkippingIndexColumns(tableName: String, spark: SparkSession): Seq[Row] } diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala index 7e2904726..1f9cc0b68 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/skipping/recommendations/DataTypeSkippingStrategy.scala @@ -5,15 +5,16 @@ package org.opensearch.flint.spark.skipping.recommendations -import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKind.{BLOOM_FILTER, MIN_MAX, PARTITION, VALUE_SET} import scala.collection.mutable.ArrayBuffer +import org.opensearch.flint.spark.skipping.FlintSparkSkippingStrategy.SkippingKind.{BLOOM_FILTER, MIN_MAX, PARTITION, VALUE_SET} + import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.flint.{loadTable, parseTableName} class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { - val rules = Map ( + val rules = Map( "PARTITION" -> (PARTITION.toString, "PARTITION data structure is recommended for partition columns"), "BooleanType" -> (VALUE_SET.toString, "VALUE_SET data structure is recommended for BooleanType columns"), "IntegerType" -> (MIN_MAX.toString, "MIN_MAX data structure is recommended for IntegerType columns"), @@ -24,29 +25,38 @@ class DataTypeSkippingStrategy extends AnalyzeSkippingStrategy { "StringType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StringType columns"), "VarcharType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for VarcharType columns"), "CharType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for CharType columns"), - "StructType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns") - ) + "StructType" -> (BLOOM_FILTER.toString, "BLOOM_FILTER data structure is recommended for StructType columns")) override def analyzeSkippingIndexColumns(tableName: 
String, spark: SparkSession): Seq[Row] = { val (catalog, ident) = parseTableName(spark, tableName) val table = loadTable(catalog, ident).getOrElse( throw new IllegalStateException(s"Table $tableName is not found")) - val partitionFields = table.partitioning().flatMap { - transform => - transform.references().collect({ - case reference => reference.fieldNames() - }).flatten.toSet + val partitionFields = table.partitioning().flatMap { transform => + transform + .references() + .collect({ case reference => + reference.fieldNames() + }) + .flatten + .toSet } val result = ArrayBuffer[Row]() - table.schema().fields.map { - field => - if (partitionFields.contains(field.name)) { - result += Row(field.name, field.dataType.typeName, rules("PARTITION")._1, rules("PARTITION")._2) - } else if (rules.contains(field.dataType.toString)) { - result += Row(field.name, field.dataType.typeName, rules(field.dataType.toString)._1, rules(field.dataType.toString)._2) - } + table.schema().fields.map { field => + if (partitionFields.contains(field.name)) { + result += Row( + field.name, + field.dataType.typeName, + rules("PARTITION")._1, + rules("PARTITION")._2) + } else if (rules.contains(field.dataType.toString)) { + result += Row( + field.name, + field.dataType.typeName, + rules(field.dataType.toString)._1, + rules(field.dataType.toString)._2) + } } result } diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala index efdeea13d..f079cf02d 100644 --- a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala +++ b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala @@ -7,18 +7,20 @@ package org.opensearch.flint.spark import scala.Option.empty import scala.collection.JavaConverters.{mapAsJavaMapConverter, mapAsScalaMapConverter} + import org.json4s.{Formats, NoTypeHints} import org.json4s.native.JsonMethods.parse import org.json4s.native.Serialization +import org.mockito.IdiomaticMockito.thrown import org.opensearch.flint.core.FlintOptions import org.opensearch.flint.core.storage.FlintOpenSearchClient import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.getSkippingIndexName import org.scalatest.matchers.must.Matchers.{be, defined} import org.scalatest.matchers.should.Matchers.{an, convertToAnyShouldWrapper, the} + import org.apache.spark.sql.Row import org.apache.spark.sql.flint.FlintDataSourceV2.FLINT_DATASOURCE import org.apache.spark.sql.flint.config.FlintSparkConf.CHECKPOINT_MANDATORY -import org.mockito.IdiomaticMockito.thrown class FlintSparkSkippingIndexSqlITSuite extends FlintSparkSuite { @@ -324,11 +326,31 @@ class FlintSparkSkippingIndexSqlITSuite extends FlintSparkSuite { checkAnswer( result, Seq( - Row("year", "integer", "PARTITION", "PARTITION data structure is recommended for partition columns"), - Row("month", "integer", "PARTITION", "PARTITION data structure is recommended for partition columns"), - Row("name", "string", "BLOOM_FILTER", "BLOOM_FILTER data structure is recommended for StringType columns"), - Row("age", "integer", "MIN_MAX", "MIN_MAX data structure is recommended for IntegerType columns"), - Row("address", "string", "BLOOM_FILTER", "BLOOM_FILTER data structure is recommended for StringType columns"))) + Row( + "year", + "integer", + "PARTITION", + "PARTITION data structure is recommended for partition columns"), + Row( + "month", + "integer", + 
"PARTITION", + "PARTITION data structure is recommended for partition columns"), + Row( + "name", + "string", + "BLOOM_FILTER", + "BLOOM_FILTER data structure is recommended for StringType columns"), + Row( + "age", + "integer", + "MIN_MAX", + "MIN_MAX data structure is recommended for IntegerType columns"), + Row( + "address", + "string", + "BLOOM_FILTER", + "BLOOM_FILTER data structure is recommended for StringType columns"))) } test("analyze skipping index on invalid table") { From 2fdf421626bedb8df5f25537032a1c223536c1cb Mon Sep 17 00:00:00 2001 From: Rupal Mahajan Date: Sun, 17 Mar 2024 21:00:15 -0700 Subject: [PATCH 08/11] Remove unused import Signed-off-by: Rupal Mahajan --- .../sql/skipping/FlintSparkSkippingIndexAstBuilder.scala | 2 +- .../flint/spark/FlintSparkSkippingIndexSqlITSuite.scala | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala index 795e2cbb9..b0c449fea 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/sql/skipping/FlintSparkSkippingIndexAstBuilder.scala @@ -20,7 +20,7 @@ import org.opensearch.flint.spark.sql.FlintSparkSqlExtensionsParser._ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.Command -import org.apache.spark.sql.types.{ArrayType, MapType, StringType} +import org.apache.spark.sql.types.StringType /** * Flint Spark AST builder that builds Spark command for Flint skipping index statement. 
diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala
index f079cf02d..eeb467bf6 100644
--- a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala
+++ b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkSkippingIndexSqlITSuite.scala
@@ -11,12 +11,11 @@ import scala.collection.JavaConverters.{mapAsJavaMapConverter, mapAsScalaMapConv
 import org.json4s.{Formats, NoTypeHints}
 import org.json4s.native.JsonMethods.parse
 import org.json4s.native.Serialization
-import org.mockito.IdiomaticMockito.thrown
 import org.opensearch.flint.core.FlintOptions
 import org.opensearch.flint.core.storage.FlintOpenSearchClient
 import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.getSkippingIndexName
-import org.scalatest.matchers.must.Matchers.{be, defined}
+import org.scalatest.matchers.must.Matchers.defined
-import org.scalatest.matchers.should.Matchers.{an, convertToAnyShouldWrapper, the}
+import org.scalatest.matchers.should.Matchers.{convertToAnyShouldWrapper, the}

From 54d7ee6c1d9d9e88376878549959313456c6d77b Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Mon, 18 Mar 2024 08:46:36 -0700
Subject: [PATCH 09/11] Update doc

Signed-off-by: Rupal Mahajan
---
 docs/index.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/index.md b/docs/index.md
index 31147aed4..f5674c537 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -170,6 +170,8 @@ DROP SKIPPING INDEX ON <object>
 
 VACUUM SKIPPING INDEX ON <object>
 
+ANALYZE SKIPPING INDEX ON <object>
+
 <object> ::= [db_name].[schema_name].table_name
 ```

From e289a9c18d1a5aed76afd8cdf583b4c2de171233 Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Mon, 18 Mar 2024 11:47:33 -0700
Subject: [PATCH 10/11] Update doc

Signed-off-by: Rupal Mahajan
---
 docs/index.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/docs/index.md b/docs/index.md
index 6fc2f7b0c..e8067ce3d 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -321,6 +321,31 @@ fetched rows / total rows = 3/3
 +-------------------------------------------------------------+----------+----------+-----------+-----------------+--------------+------------+
 ```
 
+- **Analyze Skipping Indexes**: Provides recommendation for creating skipping index.
It outputs the following columns:
+  - column_name: recommended column's name
+  - column_type: recommended column's type
+  - skipping_type: recommended skipping type for column
+  - reason: why this skipping type is recommended
+
+```sql
+ANALYZE SKIPPING INDEX ON [catalog.database.]table
+```
+
+Example:
+```
+sql> ANALYZE SKIPPING INDEX ON alb_logs;
+fetched rows / total rows = 5/5
++-------------------------+-------------+---------------+-------------------------------------------------------------------+
+| column_name             | column_type | skipping_type | reason                                                            |
+|-------------------------+-------------+---------------+-------------------------------------------------------------------+
+| year                    | integer     | PARTITION     | PARTITION data structure is recommended for partition columns     |
+| month                   | integer     | PARTITION     | PARTITION data structure is recommended for partition columns     |
+| day                     | integer     | PARTITION     | PARTITION data structure is recommended for partition columns     |
+| request_processing_time | integer     | MIN_MAX       | MIN_MAX data structure is recommended for IntegerType columns     |
+| client_ip               | string      | BLOOM_FILTER  | BLOOM_FILTER data structure is recommended for StringType columns |
++-------------------------+-------------+---------------+-------------------------------------------------------------------+
+```
+
 #### Create Index Options
 
 User can provide the following options in `WITH` clause of create statement:

From a720116b5b26243bf891c7b504c28d0565387ea7 Mon Sep 17 00:00:00 2001
From: Rupal Mahajan
Date: Mon, 18 Mar 2024 11:49:01 -0700
Subject: [PATCH 11/11] nit

Signed-off-by: Rupal Mahajan
---
 docs/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/index.md b/docs/index.md
index e8067ce3d..3b1e2efe5 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -321,7 +321,7 @@ fetched rows / total rows = 3/3
 +-------------------------------------------------------------+----------+----------+-----------+-----------------+--------------+------------+
 ```
 
-- **Analyze Skipping Indexes**: Provides recommendation for creating skipping index. It outputs the following columns:
+- **Analyze Skipping Index**: Provides recommendations for creating a skipping index. It outputs the following columns:
   - column_name: recommended column's name
   - column_type: recommended column's type
   - skipping_type: recommended skipping type for column