From 8374cb6050a7941d086c9dd355ef268388358f73 Mon Sep 17 00:00:00 2001
From: Chen Dai
Date: Mon, 18 Mar 2024 20:16:54 -0700
Subject: [PATCH] Implement vacuum index operation (#2557)

* Add vacuum operation and IT

Signed-off-by: Chen Dai

* Add index state doc delete and more IT

Signed-off-by: Chen Dai

* Refactor IT

Signed-off-by: Chen Dai

* Fix bytebuddy version conflict

Signed-off-by: Chen Dai

* Fix broken IT

Signed-off-by: Chen Dai

* Fix broken IT

Signed-off-by: Chen Dai

* Fix jacoco failure with new IT

Signed-off-by: Chen Dai

* Fix code format

Signed-off-by: Chen Dai

* Fix jacoco test coverage

Signed-off-by: Chen Dai

---------

Signed-off-by: Chen Dai
---
 .../sql/spark/dispatcher/IndexDMLHandler.java |  21 ++
 .../dispatcher/SparkQueryDispatcher.java      |   7 +-
 .../model/IndexQueryActionType.java           |   1 +
 .../execution/statestore/StateStore.java      |  39 +++
 .../sql/spark/flint/FlintIndexState.java      |   4 +-
 .../spark/flint/operation/FlintIndexOp.java   |   8 +-
 .../flint/operation/FlintIndexOpVacuum.java   |  55 ++++
 .../sql/spark/utils/SQLQueryUtils.java        |  25 ++
 .../AsyncQueryGetResultSpecTest.java          |   2 +-
 .../spark/asyncquery/IndexQuerySpecTest.java  |  20 +-
 .../asyncquery/IndexQuerySpecVacuumTest.java  | 240 ++++++++++++++++++
 .../asyncquery/model/MockFlintSparkJob.java   |   5 +
 .../spark/dispatcher/IndexDMLHandlerTest.java |  17 +-
 .../0.1.1/flint_covering_index.json           |   4 +-
 .../flint-index-mappings/0.1.1/flint_mv.json  |   4 +-
 .../0.1.1/flint_skipping_index.json           |   2 +-
 16 files changed, 435 insertions(+), 19 deletions(-)
 create mode 100644 spark/src/main/java/org/opensearch/sql/spark/flint/operation/FlintIndexOpVacuum.java
 create mode 100644 spark/src/test/java/org/opensearch/sql/spark/asyncquery/IndexQuerySpecVacuumTest.java

diff --git a/spark/src/main/java/org/opensearch/sql/spark/dispatcher/IndexDMLHandler.java b/spark/src/main/java/org/opensearch/sql/spark/dispatcher/IndexDMLHandler.java
index c2351bcd0b..d1ebf21e24 100644
--- a/spark/src/main/java/org/opensearch/sql/spark/dispatcher/IndexDMLHandler.java
+++ b/spark/src/main/java/org/opensearch/sql/spark/dispatcher/IndexDMLHandler.java
@@ -16,6 +16,7 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.json.JSONObject;
+import org.opensearch.client.Client;
 import org.opensearch.sql.datasource.model.DataSourceMetadata;
 import org.opensearch.sql.spark.asyncquery.model.AsyncQueryId;
 import org.opensearch.sql.spark.asyncquery.model.AsyncQueryJobMetadata;
@@ -32,6 +33,7 @@
 import org.opensearch.sql.spark.flint.operation.FlintIndexOp;
 import org.opensearch.sql.spark.flint.operation.FlintIndexOpAlter;
 import org.opensearch.sql.spark.flint.operation.FlintIndexOpDrop;
+import org.opensearch.sql.spark.flint.operation.FlintIndexOpVacuum;
 import org.opensearch.sql.spark.response.JobExecutionResponseReader;
 
 /** Handle Index DML query. Includes DROP, ALTER, and VACUUM operations. */
@@ -51,6 +53,8 @@ public class IndexDMLHandler extends AsyncQueryHandler {
 
   private final StateStore stateStore;
 
+  private final Client client;
+
   public static boolean isIndexDMLQuery(String jobId) {
     return DROP_INDEX_JOB_ID.equalsIgnoreCase(jobId) || DML_QUERY_JOB_ID.equalsIgnoreCase(jobId);
   }
@@ -127,6 +131,23 @@ private void executeIndexOp(
             flintIndexMetadataService);
         flintIndexOpAlter.apply(indexMetadata);
         break;
+      case VACUUM:
+        // Try to perform drop operation first
+        FlintIndexOp tryDropOp =
+            new FlintIndexOpDrop(
+                stateStore, dispatchQueryRequest.getDatasource(), emrServerlessClient);
+        try {
+          tryDropOp.apply(indexMetadata);
+        } catch (IllegalStateException e) {
+          // Drop failed, possibly due to an invalid initial state
+        }
+
+        // Continue to physically delete the index data if the state is DELETED,
+        // which means the previous transaction succeeded
+        FlintIndexOp indexVacuumOp =
+            new FlintIndexOpVacuum(stateStore, dispatchQueryRequest.getDatasource(), client);
+        indexVacuumOp.apply(indexMetadata);
+        break;
       default:
         throw new IllegalStateException(
             String.format(
diff --git a/spark/src/main/java/org/opensearch/sql/spark/dispatcher/SparkQueryDispatcher.java b/spark/src/main/java/org/opensearch/sql/spark/dispatcher/SparkQueryDispatcher.java
index f32c3433e8..2760b30123 100644
--- a/spark/src/main/java/org/opensearch/sql/spark/dispatcher/SparkQueryDispatcher.java
+++ b/spark/src/main/java/org/opensearch/sql/spark/dispatcher/SparkQueryDispatcher.java
@@ -112,6 +112,7 @@ private boolean isEligibleForStreamingQuery(IndexQueryDetails indexQueryDetails)
 
   private boolean isEligibleForIndexDMLHandling(IndexQueryDetails indexQueryDetails) {
     return IndexQueryActionType.DROP.equals(indexQueryDetails.getIndexQueryActionType())
+        || IndexQueryActionType.VACUUM.equals(indexQueryDetails.getIndexQueryActionType())
         || (IndexQueryActionType.ALTER.equals(indexQueryDetails.getIndexQueryActionType())
             && (indexQueryDetails
                 .getFlintIndexOptions()
@@ -161,7 +162,11 @@ public String cancelJob(AsyncQueryJobMetadata asyncQueryJobMetadata) {
 
   private IndexDMLHandler createIndexDMLHandler(EMRServerlessClient emrServerlessClient) {
     return new IndexDMLHandler(
-        emrServerlessClient, jobExecutionResponseReader, flintIndexMetadataService, stateStore);
+        emrServerlessClient,
+        jobExecutionResponseReader,
+        flintIndexMetadataService,
+        stateStore,
+        client);
   }
 
   // TODO: Revisit this logic.
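Note: with the routing change above, the dispatcher now sends all three vacuum statement
forms to IndexDMLHandler instead of submitting an EMR-S job. The statements exercised by
the new IT later in this patch are:

    VACUUM SKIPPING INDEX ON mys3.default.http_logs
    VACUUM INDEX covering ON mys3.default.http_logs
    VACUUM MATERIALIZED VIEW mys3.default.http_logs_metrics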
diff --git a/spark/src/main/java/org/opensearch/sql/spark/dispatcher/model/IndexQueryActionType.java b/spark/src/main/java/org/opensearch/sql/spark/dispatcher/model/IndexQueryActionType.java
index 93e44f00ea..96e7d159af 100644
--- a/spark/src/main/java/org/opensearch/sql/spark/dispatcher/model/IndexQueryActionType.java
+++ b/spark/src/main/java/org/opensearch/sql/spark/dispatcher/model/IndexQueryActionType.java
@@ -12,5 +12,6 @@ public enum IndexQueryActionType {
   DESCRIBE,
   SHOW,
   DROP,
+  VACUUM,
   ALTER
 }
diff --git a/spark/src/main/java/org/opensearch/sql/spark/execution/statestore/StateStore.java b/spark/src/main/java/org/opensearch/sql/spark/execution/statestore/StateStore.java
index e99087b24d..e50a2837d9 100644
--- a/spark/src/main/java/org/opensearch/sql/spark/execution/statestore/StateStore.java
+++ b/spark/src/main/java/org/opensearch/sql/spark/execution/statestore/StateStore.java
@@ -24,6 +24,8 @@
 import org.opensearch.action.DocWriteResponse;
 import org.opensearch.action.admin.indices.create.CreateIndexRequest;
 import org.opensearch.action.admin.indices.create.CreateIndexResponse;
+import org.opensearch.action.delete.DeleteRequest;
+import org.opensearch.action.delete.DeleteResponse;
 import org.opensearch.action.get.GetRequest;
 import org.opensearch.action.get.GetResponse;
 import org.opensearch.action.index.IndexRequest;
@@ -167,6 +169,33 @@ public <T extends StateModel> T updateState(
     }
   }
 
+  /**
+   * Delete the index state document with the given ID.
+   *
+   * @param sid index state doc ID
+   * @param indexName index store index name
+   * @return true if deleted, otherwise false
+   */
+  @VisibleForTesting
+  public boolean delete(String sid, String indexName) {
+    try {
+      // No action if the index doesn't exist
+      if (!this.clusterService.state().routingTable().hasIndex(indexName)) {
+        return true;
+      }
+
+      try (ThreadContext.StoredContext ignored =
+          client.threadPool().getThreadContext().stashContext()) {
+        DeleteRequest deleteRequest = new DeleteRequest(indexName, sid);
+        DeleteResponse deleteResponse = client.delete(deleteRequest).actionGet();
+        return deleteResponse.getResult() == DocWriteResponse.Result.DELETED;
+      }
+    } catch (Exception e) {
+      throw new RuntimeException(
+          String.format("Failed to delete index state doc %s in index %s", sid, indexName), e);
+    }
+  }
+
   private void createIndex(String indexName) {
     try {
       CreateIndexRequest createIndexRequest = new CreateIndexRequest(indexName);
@@ -328,6 +357,16 @@ public static Function<FlintIndexStateModel, FlintIndexStateModel> createFlintIn
         st, FlintIndexStateModel::copy, DATASOURCE_TO_REQUEST_INDEX.apply(datasourceName));
   }
 
+  /**
+   * @param stateStore index state store
+   * @param datasourceName data source name
+   * @return function that accepts an index state doc ID and performs the deletion
+   */
+  public static Function<String, Boolean> deleteFlintIndexState(
+      StateStore stateStore, String datasourceName) {
+    return (docId) -> stateStore.delete(docId, DATASOURCE_TO_REQUEST_INDEX.apply(datasourceName));
+  }
+
   public static Function<IndexDMLResult, IndexDMLResult> createIndexDMLResult(
       StateStore stateStore, String indexName) {
     return (result) -> stateStore.create(result, IndexDMLResult::copy, indexName);
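Note: a minimal usage sketch of the new deletion path through the deleteFlintIndexState()
factory, assuming a StateStore wired to the "mys3" test data source (the doc ID below is
one of the Base64 IDs used in the tests):

    // Resolve the request index for the data source, then delete the state doc by ID.
    boolean deleted =
        StateStore.deleteFlintIndexState(stateStore, "mys3")
            .apply("ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19za2lwcGluZ19pbmRleA==");
    // true if the doc was deleted, or if the state index does not exist at all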
diff --git a/spark/src/main/java/org/opensearch/sql/spark/flint/FlintIndexState.java b/spark/src/main/java/org/opensearch/sql/spark/flint/FlintIndexState.java
index 36ac8fe715..3d6532b8ea 100644
--- a/spark/src/main/java/org/opensearch/sql/spark/flint/FlintIndexState.java
+++ b/spark/src/main/java/org/opensearch/sql/spark/flint/FlintIndexState.java
@@ -37,7 +37,9 @@ public enum FlintIndexState {
   // stable state
   FAILED("failed"),
   // unknown state, if some state update on the Spark side is not reflected here
-  UNKNOWN("unknown");
+  UNKNOWN("unknown"),
+  // special state that instructs StateStore to purge the index state doc
+  NONE("none");
 
   private final String state;
 
diff --git a/spark/src/main/java/org/opensearch/sql/spark/flint/operation/FlintIndexOp.java b/spark/src/main/java/org/opensearch/sql/spark/flint/operation/FlintIndexOp.java
index 37d36a49db..0e99c18eef 100644
--- a/spark/src/main/java/org/opensearch/sql/spark/flint/operation/FlintIndexOp.java
+++ b/spark/src/main/java/org/opensearch/sql/spark/flint/operation/FlintIndexOp.java
@@ -5,6 +5,7 @@
 
 package org.opensearch.sql.spark.flint.operation;
 
+import static org.opensearch.sql.spark.execution.statestore.StateStore.deleteFlintIndexState;
 import static org.opensearch.sql.spark.execution.statestore.StateStore.getFlintIndexState;
 import static org.opensearch.sql.spark.execution.statestore.StateStore.updateFlintIndexState;
 
@@ -120,7 +121,12 @@ private void commit(FlintIndexStateModel flintIndex) {
     LOG.debug("Committing the transaction and moving to stable state.");
     FlintIndexState stableState = stableState();
     try {
-      updateFlintIndexState(stateStore, datasourceName).apply(flintIndex, stableState);
+      if (stableState == FlintIndexState.NONE) {
+        LOG.info("Deleting index state with docId: " + flintIndex.getLatestId());
+        deleteFlintIndexState(stateStore, datasourceName).apply(flintIndex.getLatestId());
+      } else {
+        updateFlintIndexState(stateStore, datasourceName).apply(flintIndex, stableState);
+      }
     } catch (Exception e) {
       String errorMsg =
           String.format(Locale.ROOT, "commit failed. target stable state: [%s]", stableState);
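Note: together with FlintIndexOpVacuum below, the commit() change gives vacuum the
following lifecycle, reusing the existing FlintIndexOp transaction (a summary of the
patch, not code from it):

    // validate(): vacuum is only legal once a prior drop left the index in DELETED
    // transitioningState(): DELETED -> VACUUMING while the OpenSearch index is removed
    // stableState(): NONE, which commit() now treats as "purge the state doc"
    DELETED -> VACUUMING -> (OpenSearch index deleted, state doc removed)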
diff --git a/spark/src/main/java/org/opensearch/sql/spark/flint/operation/FlintIndexOpVacuum.java b/spark/src/main/java/org/opensearch/sql/spark/flint/operation/FlintIndexOpVacuum.java
new file mode 100644
index 0000000000..cf204450e7
--- /dev/null
+++ b/spark/src/main/java/org/opensearch/sql/spark/flint/operation/FlintIndexOpVacuum.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.spark.flint.operation;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.opensearch.action.admin.indices.delete.DeleteIndexRequest;
+import org.opensearch.action.support.master.AcknowledgedResponse;
+import org.opensearch.client.Client;
+import org.opensearch.sql.spark.execution.statestore.StateStore;
+import org.opensearch.sql.spark.flint.FlintIndexMetadata;
+import org.opensearch.sql.spark.flint.FlintIndexState;
+import org.opensearch.sql.spark.flint.FlintIndexStateModel;
+
+/** Flint index vacuum operation. */
+public class FlintIndexOpVacuum extends FlintIndexOp {
+
+  private static final Logger LOG = LogManager.getLogger();
+
+  /** OpenSearch client. */
+  private final Client client;
+
+  public FlintIndexOpVacuum(StateStore stateStore, String datasourceName, Client client) {
+    super(stateStore, datasourceName);
+    this.client = client;
+  }
+
+  @Override
+  boolean validate(FlintIndexState state) {
+    return state == FlintIndexState.DELETED;
+  }
+
+  @Override
+  FlintIndexState transitioningState() {
+    return FlintIndexState.VACUUMING;
+  }
+
+  @Override
+  public void runOp(FlintIndexMetadata flintIndexMetadata, FlintIndexStateModel flintIndex) {
+    LOG.info("Vacuuming Flint index {}", flintIndexMetadata.getOpensearchIndexName());
+    DeleteIndexRequest request =
+        new DeleteIndexRequest().indices(flintIndexMetadata.getOpensearchIndexName());
+    AcknowledgedResponse response = client.admin().indices().delete(request).actionGet();
+    LOG.info("OpenSearch index delete result: {}", response.isAcknowledged());
+  }
+
+  @Override
+  FlintIndexState stableState() {
+    // Instruct StateStore to purge the index state doc
+    return FlintIndexState.NONE;
+  }
+}
diff --git a/spark/src/main/java/org/opensearch/sql/spark/utils/SQLQueryUtils.java b/spark/src/main/java/org/opensearch/sql/spark/utils/SQLQueryUtils.java
index 1ac177771c..78978dcb71 100644
--- a/spark/src/main/java/org/opensearch/sql/spark/utils/SQLQueryUtils.java
+++ b/spark/src/main/java/org/opensearch/sql/spark/utils/SQLQueryUtils.java
@@ -190,6 +190,31 @@ public Void visitDropMaterializedViewStatement(
     return super.visitDropMaterializedViewStatement(ctx);
   }
 
+  @Override
+  public Void visitVacuumSkippingIndexStatement(
+      FlintSparkSqlExtensionsParser.VacuumSkippingIndexStatementContext ctx) {
+    indexQueryDetailsBuilder.indexQueryActionType(IndexQueryActionType.VACUUM);
+    indexQueryDetailsBuilder.indexType(FlintIndexType.SKIPPING);
+    return super.visitVacuumSkippingIndexStatement(ctx);
+  }
+
+  @Override
+  public Void visitVacuumCoveringIndexStatement(
+      FlintSparkSqlExtensionsParser.VacuumCoveringIndexStatementContext ctx) {
+    indexQueryDetailsBuilder.indexQueryActionType(IndexQueryActionType.VACUUM);
+    indexQueryDetailsBuilder.indexType(FlintIndexType.COVERING);
+    return super.visitVacuumCoveringIndexStatement(ctx);
+  }
+
+  @Override
+  public Void visitVacuumMaterializedViewStatement(
+      FlintSparkSqlExtensionsParser.VacuumMaterializedViewStatementContext ctx) {
+    indexQueryDetailsBuilder.indexQueryActionType(IndexQueryActionType.VACUUM);
+    indexQueryDetailsBuilder.indexType(FlintIndexType.MATERIALIZED_VIEW);
+    indexQueryDetailsBuilder.mvName(ctx.mvName.getText());
+    return super.visitVacuumMaterializedViewStatement(ctx);
+  }
+
   @Override
   public Void visitDescribeCoveringIndexStatement(
       FlintSparkSqlExtensionsParser.DescribeCoveringIndexStatementContext ctx) {
diff --git a/spark/src/test/java/org/opensearch/sql/spark/asyncquery/AsyncQueryGetResultSpecTest.java b/spark/src/test/java/org/opensearch/sql/spark/asyncquery/AsyncQueryGetResultSpecTest.java
index 3acbfc439c..3a9b6e12a9 100644
--- a/spark/src/test/java/org/opensearch/sql/spark/asyncquery/AsyncQueryGetResultSpecTest.java
+++ b/spark/src/test/java/org/opensearch/sql/spark/asyncquery/AsyncQueryGetResultSpecTest.java
@@ -47,7 +47,7 @@ public class AsyncQueryGetResultSpecTest extends AsyncQueryExecutorServiceSpec {
           "REFRESH SKIPPING INDEX ON mys3.default.http_logs",
           FlintIndexType.SKIPPING,
           "flint_mys3_default_http_logs_skipping_index")
-          .latestId("skippingindexid");
+          .latestId("ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19za2lwcGluZ19pbmRleA==");
 
   private MockFlintSparkJob mockIndexState;
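Note: the latestId fixtures are no longer arbitrary strings such as "skippingindexid";
they are the Base64 encoding of the Flint index name, matching how the new
IndexQuerySpecVacuumTest.mockDataset() derives doc IDs:

    String latestId =
        Base64.getEncoder()
            .encodeToString("flint_mys3_default_http_logs_skipping_index".getBytes());
    // => "ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19za2lwcGluZ19pbmRleA=="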
diff --git a/spark/src/test/java/org/opensearch/sql/spark/asyncquery/IndexQuerySpecTest.java b/spark/src/test/java/org/opensearch/sql/spark/asyncquery/IndexQuerySpecTest.java
index 25b94f2d11..132074de63 100644
--- a/spark/src/test/java/org/opensearch/sql/spark/asyncquery/IndexQuerySpecTest.java
+++ b/spark/src/test/java/org/opensearch/sql/spark/asyncquery/IndexQuerySpecTest.java
@@ -29,7 +29,7 @@ public class IndexQuerySpecTest extends AsyncQueryExecutorServiceSpec {
 
   public final String REFRESH_SI = "REFRESH SKIPPING INDEX on mys3.default.http_logs";
   public final String REFRESH_CI = "REFRESH INDEX covering ON mys3.default.http_logs";
-  public final String REFRESH_MV = "REFRESH MATERIALIZED VIEW mv";
+  public final String REFRESH_MV = "REFRESH MATERIALIZED VIEW mys3.default.http_logs_metrics";
 
   public final FlintDatasetMock LEGACY_SKIPPING =
       new FlintDatasetMock(
@@ -47,7 +47,10 @@ public class IndexQuerySpecTest extends AsyncQueryExecutorServiceSpec {
           .isLegacy(true);
   public final FlintDatasetMock LEGACY_MV =
       new FlintDatasetMock(
-          "DROP MATERIALIZED VIEW mv", REFRESH_MV, FlintIndexType.MATERIALIZED_VIEW, "flint_mv")
+              "DROP MATERIALIZED VIEW mys3.default.http_logs_metrics",
+              REFRESH_MV,
+              FlintIndexType.MATERIALIZED_VIEW,
+              "flint_mys3_default_http_logs_metrics")
           .isLegacy(true);
 
   public final FlintDatasetMock SKIPPING =
@@ -56,18 +59,21 @@ public class IndexQuerySpecTest extends AsyncQueryExecutorServiceSpec {
           REFRESH_SI,
           FlintIndexType.SKIPPING,
           "flint_mys3_default_http_logs_skipping_index")
-          .latestId("skippingindexid");
+          .latestId("ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19za2lwcGluZ19pbmRleA==");
   public final FlintDatasetMock COVERING =
       new FlintDatasetMock(
           "DROP INDEX covering ON mys3.default.http_logs",
           REFRESH_CI,
           FlintIndexType.COVERING,
           "flint_mys3_default_http_logs_covering_index")
-          .latestId("coveringid");
+          .latestId("ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19jb3ZlcmluZ19pbmRleA==");
   public final FlintDatasetMock MV =
       new FlintDatasetMock(
-          "DROP MATERIALIZED VIEW mv", REFRESH_MV, FlintIndexType.MATERIALIZED_VIEW, "flint_mv")
-          .latestId("mvid");
+              "DROP MATERIALIZED VIEW mys3.default.http_logs_metrics",
+              REFRESH_MV,
+              FlintIndexType.MATERIALIZED_VIEW,
+              "flint_mys3_default_http_logs_metrics")
+          .latestId("ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19tZXRyaWNz");
 
   public final String CREATE_SI_AUTO =
       "CREATE SKIPPING INDEX ON mys3.default.http_logs"
@@ -77,7 +83,7 @@ public class IndexQuerySpecTest extends AsyncQueryExecutorServiceSpec {
           + "(l_orderkey, l_quantity) WITH (auto_refresh = true)";
 
   public final String CREATE_MV_AUTO =
-      "CREATE MATERIALIZED VIEW mv AS select * "
+      "CREATE MATERIALIZED VIEW mys3.default.http_logs_metrics AS select * "
           + "from mys3.default.https WITH (auto_refresh = true)";
 
   /**
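Note: a sketch of how the new SQLQueryUtils visitors surface to callers such as
SparkQueryDispatcher (extractIndexDetails() is the existing entry point in this module;
treat the exact accessor names as illustrative):

    IndexQueryDetails details =
        SQLQueryUtils.extractIndexDetails("VACUUM SKIPPING INDEX ON mys3.default.http_logs");
    // details.getIndexQueryActionType() == IndexQueryActionType.VACUUM
    // details.getIndexType() == FlintIndexType.SKIPPING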
diff --git a/spark/src/test/java/org/opensearch/sql/spark/asyncquery/IndexQuerySpecVacuumTest.java b/spark/src/test/java/org/opensearch/sql/spark/asyncquery/IndexQuerySpecVacuumTest.java
new file mode 100644
index 0000000000..67c89c791c
--- /dev/null
+++ b/spark/src/test/java/org/opensearch/sql/spark/asyncquery/IndexQuerySpecVacuumTest.java
@@ -0,0 +1,240 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.spark.asyncquery;
+
+import static org.opensearch.sql.spark.execution.statestore.StateStore.DATASOURCE_TO_REQUEST_INDEX;
+import static org.opensearch.sql.spark.flint.FlintIndexState.ACTIVE;
+import static org.opensearch.sql.spark.flint.FlintIndexState.CREATING;
+import static org.opensearch.sql.spark.flint.FlintIndexState.DELETED;
+import static org.opensearch.sql.spark.flint.FlintIndexState.EMPTY;
+import static org.opensearch.sql.spark.flint.FlintIndexState.REFRESHING;
+import static org.opensearch.sql.spark.flint.FlintIndexState.VACUUMING;
+import static org.opensearch.sql.spark.flint.FlintIndexType.COVERING;
+import static org.opensearch.sql.spark.flint.FlintIndexType.MATERIALIZED_VIEW;
+import static org.opensearch.sql.spark.flint.FlintIndexType.SKIPPING;
+
+import com.amazonaws.services.emrserverless.model.CancelJobRunResult;
+import com.amazonaws.services.emrserverless.model.GetJobRunResult;
+import com.amazonaws.services.emrserverless.model.JobRun;
+import com.google.common.collect.Lists;
+import java.util.Base64;
+import java.util.List;
+import java.util.function.BiConsumer;
+import org.apache.commons.lang3.tuple.Pair;
+import org.junit.Test;
+import org.opensearch.action.admin.indices.exists.indices.IndicesExistsRequest;
+import org.opensearch.action.get.GetRequest;
+import org.opensearch.sql.spark.asyncquery.model.AsyncQueryExecutionResponse;
+import org.opensearch.sql.spark.asyncquery.model.MockFlintSparkJob;
+import org.opensearch.sql.spark.client.EMRServerlessClientFactory;
+import org.opensearch.sql.spark.flint.FlintIndexState;
+import org.opensearch.sql.spark.flint.FlintIndexType;
+import org.opensearch.sql.spark.rest.model.CreateAsyncQueryRequest;
+import org.opensearch.sql.spark.rest.model.CreateAsyncQueryResponse;
+import org.opensearch.sql.spark.rest.model.LangType;
+
+@SuppressWarnings({"unchecked", "rawtypes"})
+public class IndexQuerySpecVacuumTest extends AsyncQueryExecutorServiceSpec {
+
+  private static final EMRApiCall DEFAULT_OP = () -> null;
+
+  private final List<FlintDatasetMock> FLINT_TEST_DATASETS =
+      List.of(
+          mockDataset(
+              "VACUUM SKIPPING INDEX ON mys3.default.http_logs",
+              SKIPPING,
+              "flint_mys3_default_http_logs_skipping_index"),
+          mockDataset(
+              "VACUUM INDEX covering ON mys3.default.http_logs",
+              COVERING,
+              "flint_mys3_default_http_logs_covering_index"),
+          mockDataset(
+              "VACUUM MATERIALIZED VIEW mys3.default.http_logs_metrics",
+              MATERIALIZED_VIEW,
+              "flint_mys3_default_http_logs_metrics"));
+
+  @Test
+  public void shouldVacuumIndexInRefreshingState() {
+    List<List<Object>> testCases =
+        Lists.cartesianProduct(
+            FLINT_TEST_DATASETS,
+            List.of(REFRESHING),
+            List.of(
+                // Happy case that there is a job running
+                Pair.of(
+                    DEFAULT_OP,
+                    () -> new GetJobRunResult().withJobRun(new JobRun().withState("Cancelled"))),
+                // Cancel EMR-S job, but no job running
+                Pair.of(
+                    () -> {
+                      throw new IllegalArgumentException("Job run is not in a cancellable state");
+                    },
+                    DEFAULT_OP)));
+
+    runVacuumTestSuite(
+        testCases,
+        (mockDS, response) -> {
+          assertEquals("SUCCESS", response.getStatus());
+          assertFalse(flintIndexExists(mockDS.indexName));
+          assertFalse(indexDocExists(mockDS.latestId));
+        });
+  }
+
+  @Test
+  public void shouldNotVacuumIndexInRefreshingStateIfCancelTimeout() {
+    List<List<Object>> testCases =
+        Lists.cartesianProduct(
+            FLINT_TEST_DATASETS,
+            List.of(REFRESHING),
+            List.of(
+                Pair.of(
+                    DEFAULT_OP,
+                    () -> new GetJobRunResult().withJobRun(new JobRun().withState("Running")))));
+
+    runVacuumTestSuite(
+        testCases,
+        (mockDS, response) -> {
+          assertEquals("FAILED", response.getStatus());
+          assertEquals("Cancel job operation timed out.", response.getError());
+          assertTrue(flintIndexExists(mockDS.indexName));
+          assertTrue(indexDocExists(mockDS.latestId));
+        });
+  }
+
+  @Test
+  public void shouldNotVacuumIndexInVacuumingState() {
+    List<List<Object>> testCases =
+        Lists.cartesianProduct(
+            FLINT_TEST_DATASETS,
+            List.of(VACUUMING),
+            List.of(
+                Pair.of(
+                    () -> {
+                      throw new AssertionError("should not call cancelJobRun");
+                    },
+                    () -> {
+                      throw new AssertionError("should not call getJobRunResult");
+                    })));
+
+    runVacuumTestSuite(
+        testCases,
+        (mockDS, response) -> {
+          assertEquals("FAILED", response.getStatus());
+          assertTrue(flintIndexExists(mockDS.indexName));
+          assertTrue(indexDocExists(mockDS.latestId));
+        });
+  }
+
+  @Test
+  public void shouldVacuumIndexWithoutJobRunning() {
+    List<List<Object>> testCases =
+        Lists.cartesianProduct(
+            FLINT_TEST_DATASETS,
+            List.of(EMPTY, CREATING, ACTIVE, DELETED),
+            List.of(
+                Pair.of(
+                    DEFAULT_OP,
+                    () -> new GetJobRunResult().withJobRun(new JobRun().withState("Cancelled")))));
+
+    runVacuumTestSuite(
+        testCases,
+        (mockDS, response) -> {
+          assertEquals("SUCCESS", response.getStatus());
+          assertFalse(flintIndexExists(mockDS.indexName));
+          assertFalse(indexDocExists(mockDS.latestId));
+        });
+  }
+
+  private void runVacuumTestSuite(
+      List<List<Object>> testCases,
+      BiConsumer<FlintDatasetMock, AsyncQueryExecutionResponse> assertion) {
+    testCases.forEach(
+        params -> {
+          FlintDatasetMock mockDS = (FlintDatasetMock) params.get(0);
+          FlintIndexState state = (FlintIndexState) params.get(1);
+          EMRApiCall<CancelJobRunResult> cancelJobRun =
+              ((Pair<EMRApiCall, EMRApiCall>) params.get(2)).getLeft();
+          EMRApiCall<GetJobRunResult> getJobRunResult =
+              ((Pair<EMRApiCall, EMRApiCall>) params.get(2)).getRight();
+
+          AsyncQueryExecutionResponse response =
+              runVacuumTest(mockDS, state, cancelJobRun, getJobRunResult);
+          assertion.accept(mockDS, response);
+        });
+  }
+
+  private AsyncQueryExecutionResponse runVacuumTest(
+      FlintDatasetMock mockDS,
+      FlintIndexState state,
+      EMRApiCall<CancelJobRunResult> cancelJobRun,
+      EMRApiCall<GetJobRunResult> getJobRunResult) {
+    LocalEMRSClient emrsClient =
+        new LocalEMRSClient() {
+          @Override
+          public CancelJobRunResult cancelJobRun(String applicationId, String jobId) {
+            if (cancelJobRun == DEFAULT_OP) {
+              return super.cancelJobRun(applicationId, jobId);
+            }
+            return cancelJobRun.call();
+          }
+
+          @Override
+          public GetJobRunResult getJobRunResult(String applicationId, String jobId) {
+            if (getJobRunResult == DEFAULT_OP) {
+              return super.getJobRunResult(applicationId, jobId);
+            }
+            return getJobRunResult.call();
+          }
+        };
+    EMRServerlessClientFactory emrServerlessClientFactory = () -> emrsClient;
+    AsyncQueryExecutorService asyncQueryExecutorService =
+        createAsyncQueryExecutorService(emrServerlessClientFactory);
+
+    // Mock Flint index
+    mockDS.createIndex();
+
+    // Mock index state doc
+    MockFlintSparkJob flintIndexJob = new MockFlintSparkJob(stateStore, mockDS.latestId, "mys3");
+    flintIndexJob.transition(state);
+
+    // Vacuum index
+    CreateAsyncQueryResponse response =
+        asyncQueryExecutorService.createAsyncQuery(
+            new CreateAsyncQueryRequest(mockDS.query, DATASOURCE, LangType.SQL, null));
+
+    return asyncQueryExecutorService.getAsyncQueryResults(response.getQueryId());
+  }
+
+  private boolean flintIndexExists(String flintIndexName) {
+    return client
+        .admin()
+        .indices()
+        .exists(new IndicesExistsRequest(flintIndexName))
+        .actionGet()
+        .isExists();
+  }
+
+  private boolean indexDocExists(String docId) {
+    return client
+        .get(new GetRequest(DATASOURCE_TO_REQUEST_INDEX.apply("mys3"), docId))
+        .actionGet()
+        .isExists();
+  }
+
+  private FlintDatasetMock mockDataset(String query, FlintIndexType indexType, String indexName) {
+    FlintDatasetMock dataset = new FlintDatasetMock(query, "", indexType, indexName);
+    dataset.latestId(Base64.getEncoder().encodeToString(indexName.getBytes()));
+    return dataset;
+  }
+
+  /**
+   * EMR API call mock interface.
+   *
+   * @param <V> API call response type
+   */
+  @FunctionalInterface
+  public interface EMRApiCall<V> {
+    V call();
+  }
+}
diff --git a/spark/src/test/java/org/opensearch/sql/spark/asyncquery/model/MockFlintSparkJob.java b/spark/src/test/java/org/opensearch/sql/spark/asyncquery/model/MockFlintSparkJob.java
index 0840ce975c..4cfdb6a9a9 100644
--- a/spark/src/test/java/org/opensearch/sql/spark/asyncquery/model/MockFlintSparkJob.java
+++ b/spark/src/test/java/org/opensearch/sql/spark/asyncquery/model/MockFlintSparkJob.java
@@ -38,6 +38,11 @@ public MockFlintSparkJob(StateStore stateStore, String latestId, String datasour
     stateModel = StateStore.createFlintIndexState(stateStore, datasource).apply(stateModel);
   }
 
+  public void transition(FlintIndexState newState) {
+    stateModel =
+        StateStore.updateFlintIndexState(stateStore, datasource).apply(stateModel, newState);
+  }
+
   public void refreshing() {
     stateModel =
         StateStore.updateFlintIndexState(stateStore, datasource)
diff --git a/spark/src/test/java/org/opensearch/sql/spark/dispatcher/IndexDMLHandlerTest.java b/spark/src/test/java/org/opensearch/sql/spark/dispatcher/IndexDMLHandlerTest.java
index ac03e817dd..045de66d0a 100644
--- a/spark/src/test/java/org/opensearch/sql/spark/dispatcher/IndexDMLHandlerTest.java
+++ b/spark/src/test/java/org/opensearch/sql/spark/dispatcher/IndexDMLHandlerTest.java
@@ -24,6 +24,7 @@
 import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;
+import org.opensearch.client.Client;
 import org.opensearch.sql.datasource.model.DataSourceMetadata;
 import org.opensearch.sql.datasource.model.DataSourceType;
 import org.opensearch.sql.spark.client.EMRServerlessClient;
@@ -46,10 +47,12 @@ class IndexDMLHandlerTest {
   @Mock private JobExecutionResponseReader jobExecutionResponseReader;
   @Mock private FlintIndexMetadataService flintIndexMetadataService;
   @Mock private StateStore stateStore;
+  @Mock private Client client;
 
   @Test
   public void getResponseFromExecutor() {
-    JSONObject result = new IndexDMLHandler(null, null, null, null).getResponseFromExecutor(null);
+    JSONObject result =
+        new IndexDMLHandler(null, null, null, null, null).getResponseFromExecutor(null);
 
     assertEquals("running", result.getString(STATUS_FIELD));
     assertEquals("", result.getString(ERROR_FIELD));
@@ -59,7 +62,11 @@ public void getResponseFromExecutor() {
   public void testWhenIndexDetailsAreNotFound() {
     IndexDMLHandler indexDMLHandler =
         new IndexDMLHandler(
-            emrServerlessClient, jobExecutionResponseReader, flintIndexMetadataService, stateStore);
+            emrServerlessClient,
+            jobExecutionResponseReader,
+            flintIndexMetadataService,
+            stateStore,
+            client);
     DispatchQueryRequest dispatchQueryRequest =
         new DispatchQueryRequest(
             EMRS_APPLICATION_ID,
@@ -97,7 +104,11 @@ public void testWhenIndexDetailsWithInvalidQueryActionType() {
     FlintIndexMetadata flintIndexMetadata = mock(FlintIndexMetadata.class);
     IndexDMLHandler indexDMLHandler =
         new IndexDMLHandler(
-            emrServerlessClient, jobExecutionResponseReader, flintIndexMetadataService, stateStore);
+            emrServerlessClient,
+            jobExecutionResponseReader,
+            flintIndexMetadataService,
+            stateStore,
+            client);
     DispatchQueryRequest dispatchQueryRequest =
         new DispatchQueryRequest(
             EMRS_APPLICATION_ID,
diff --git a/spark/src/test/resources/flint-index-mappings/0.1.1/flint_covering_index.json b/spark/src/test/resources/flint-index-mappings/0.1.1/flint_covering_index.json
index 54ed5e05e1..811204847c 100644
--- a/spark/src/test/resources/flint-index-mappings/0.1.1/flint_covering_index.json
+++ b/spark/src/test/resources/flint-index-mappings/0.1.1/flint_covering_index.json
@@ -19,7 +19,7 @@
       "columnName": "request_url"
     }
   ],
-  "name": "test",
+  "name": "covering",
   "options": {
     "auto_refresh": "true",
     "index_settings": "{\"number_of_shards\":1,\"number_of_replicas\":1}"
@@ -32,6 +32,6 @@
       "SERVERLESS_EMR_JOB_ID": "00fe3gu2tgad000q"
     }
   },
-  "latestId": "coveringid"
+  "latestId": "ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19jb3ZlcmluZ19pbmRleA=="
 }
}
diff --git a/spark/src/test/resources/flint-index-mappings/0.1.1/flint_mv.json b/spark/src/test/resources/flint-index-mappings/0.1.1/flint_mv.json
index 1a9c74806a..1369f9c721 100644
--- a/spark/src/test/resources/flint-index-mappings/0.1.1/flint_mv.json
+++ b/spark/src/test/resources/flint-index-mappings/0.1.1/flint_mv.json
@@ -11,7 +11,7 @@
       "columnName": "count"
     }
   ],
-  "name": "spark_catalog.default.http_logs_metrics_chen",
+  "name": "mys3.default.http_logs_metrics",
   "options": {
     "auto_refresh": "true",
     "checkpoint_location": "s3://flint-data-dp-eu-west-1-beta/data/checkpoint/chen-job-1",
@@ -25,6 +25,6 @@
       "SERVERLESS_EMR_JOB_ID": "00fe86mkk5q3u00q"
     }
   },
-  "latestId": "mvid"
+  "latestId": "ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19tZXRyaWNz"
 }
}
diff --git a/spark/src/test/resources/flint-index-mappings/0.1.1/flint_skipping_index.json b/spark/src/test/resources/flint-index-mappings/0.1.1/flint_skipping_index.json
index 5e7c9175fd..2f65b1d8ee 100644
--- a/spark/src/test/resources/flint-index-mappings/0.1.1/flint_skipping_index.json
+++ b/spark/src/test/resources/flint-index-mappings/0.1.1/flint_skipping_index.json
@@ -18,6 +18,6 @@
       "SERVERLESS_EMR_JOB_ID": "00fdmvv9hp8u0o0q"
     }
   },
-  "latestId": "skippingindexid"
+  "latestId": "ZmxpbnRfbXlzM19kZWZhdWx0X2h0dHBfbG9nc19za2lwcGluZ19pbmRleA=="
 }
}
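Note: end to end, a vacuum request flows through the async query API like any other index
DML statement; a sketch mirroring runVacuumTest() in the new IT:

    CreateAsyncQueryResponse response =
        asyncQueryExecutorService.createAsyncQuery(
            new CreateAsyncQueryRequest(
                "VACUUM SKIPPING INDEX ON mys3.default.http_logs",
                DATASOURCE,
                LangType.SQL,
                null));
    AsyncQueryExecutionResponse result =
        asyncQueryExecutorService.getAsyncQueryResults(response.getQueryId());
    // On success, both the Flint index and its state doc are gone and status is "SUCCESS".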