From 4fe86d9cd98edfe2b6f33f0d4e79f4c2c1be48af Mon Sep 17 00:00:00 2001
From: Chen Dai
Date: Wed, 7 Feb 2024 09:25:28 -0800
Subject: [PATCH] Fix recover index bug when Flint data index is deleted
 accidentally (#241)

* Clean up metadata log in recover index API

Signed-off-by: Chen Dai

* Await termination only if there is a streaming job running

Signed-off-by: Chen Dai

* Update user manual

Signed-off-by: Chen Dai

---------

Signed-off-by: Chen Dai
---
 docs/index.md                                 |  2 ++
 .../opensearch/flint/spark/FlintSpark.scala   | 14 ++++++++++++++
 .../spark/FlintSparkTransactionITSuite.scala  | 19 +++++++++++++++++++
 .../org/apache/spark/sql/JobOperator.scala    |  4 ++--
 4 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index ea6778f39..3ceac9088 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -266,6 +266,8 @@ WITH (
 
 Currently Flint index job ID is same as internal Flint index name in [OpenSearch](./index.md#OpenSearch) section below.
 
+- **Recover Job**: Initiates a restart of the index refresh job and transitions the Flint index to the 'refreshing' state. Additionally, it cleans up the metadata log entry in the event that the Flint data index is no longer present in OpenSearch.
+
 ```sql
 RECOVER INDEX JOB
 ```
diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
index 175436fbf..d8c168d5b 100644
--- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
+++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
@@ -270,6 +270,20 @@ class FlintSpark(val spark: SparkSession) extends Logging {
       }
     } else {
       logInfo("Index to be recovered either doesn't exist or not auto refreshed")
+      if (index.isEmpty) {
+        /*
+         * If execution reaches this point, it indicates that the Flint index is corrupted.
+         * In such cases, clean up the metadata log, as the index data no longer exists.
+         * There is a very small possibility that users may recreate the index in the
+         * interim, but the metadata log would still be deleted by this cleanup process.
+         */
+        logWarning("Cleaning up metadata log as index data has been deleted")
+        flintClient
+          .startTransaction(indexName, dataSourceName)
+          .initialLog(_ => true)
+          .finalLog(_ => NO_LOG_ENTRY)
+          .commit(_ => {})
+      }
       false
     }
   }
diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkTransactionITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkTransactionITSuite.scala
index 56227533a..8fa60f8ad 100644
--- a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkTransactionITSuite.scala
+++ b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkTransactionITSuite.scala
@@ -159,6 +159,25 @@ class FlintSparkTransactionITSuite extends OpenSearchTransactionSuite with Matchers {
     } should have message s"Flint index $testFlintIndex already exists"
   }
 
+  test("should clean up metadata log entry if index data has been deleted") {
+    flint
+      .skippingIndex()
+      .onTable(testTable)
+      .addPartitions("year", "month")
+      .options(FlintSparkIndexOptions(Map("auto_refresh" -> "true")))
+      .create()
+    flint.refreshIndex(testFlintIndex, INCREMENTAL)
+
+    // Simulate the situation where the user deletes the index data directly and the refresh job then exits
+    spark.streams.active.find(_.name == testFlintIndex).get.stop()
+    deleteIndex(testFlintIndex)
+
+    // Index state is still refreshing; expect the recover API to clean it up
+    latestLogEntry(testLatestId) should contain("state" -> "refreshing")
+    flint.recoverIndex(testFlintIndex)
+    latestLogEntry(testLatestId) shouldBe empty
+  }
+
   private def deleteLogically(latestId: String): Unit = {
     val response = openSearchClient
       .get(new GetRequest(testMetaLogIndex, latestId), RequestOptions.DEFAULT)
diff --git a/spark-sql-application/src/main/scala/org/apache/spark/sql/JobOperator.scala b/spark-sql-application/src/main/scala/org/apache/spark/sql/JobOperator.scala
index c60d250ea..a702d2c64 100644
--- a/spark-sql-application/src/main/scala/org/apache/spark/sql/JobOperator.scala
+++ b/spark-sql-application/src/main/scala/org/apache/spark/sql/JobOperator.scala
@@ -84,8 +84,8 @@ case class JobOperator(
     }
 
     try {
-      // Stop SparkSession if streaming job succeeds
-      if (!exceptionThrown && streaming) {
+      // Wait for the streaming job to complete if there was no error and a streaming job is running
+      if (!exceptionThrown && streaming && spark.streams.active.nonEmpty) {
         // wait if any child thread to finish before the main thread terminates
         spark.streams.awaitAnyTermination()
       }
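
For reference, `FlintSpark.recoverIndex` returns a Boolean, and the new branch above returns `false` after cleaning up the orphaned log entry. A minimal caller-side sketch, assuming only the behavior visible in this diff (the index name is a made-up example):

```scala
// Sketch: recovering an index whose OpenSearch data was deleted out-of-band.
// recoverIndex returns false on the new cleanup path in FlintSpark.scala above.
val recovered: Boolean = flint.recoverIndex("flint_mydb_mytable_skipping_index") // hypothetical name
if (!recovered) {
  // The stale metadata log entry has been removed; recreate the index if it is still needed.
}
```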
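The cleanup itself is a log-only transaction: `initialLog(_ => true)` accepts the entry in whatever state it currently is, `finalLog(_ => NO_LOG_ENTRY)` deletes the entry instead of writing a final state, and the commit body is a no-op because there is no index data left to operate on. Distilled into a standalone sketch (the helper name is illustrative, and the `NO_LOG_ENTRY` import location is assumed from `FlintSpark.scala`'s usage):

```scala
import org.opensearch.flint.core.FlintClient
import org.opensearch.flint.core.metadata.log.OptimisticTransaction.NO_LOG_ENTRY // assumed import path

// Illustrative helper wrapping the log-only transaction used by recoverIndex.
def cleanUpOrphanMetadataLog(
    flintClient: FlintClient,
    indexName: String,
    dataSourceName: String): Unit = {
  flintClient
    .startTransaction(indexName, dataSourceName)
    .initialLog(_ => true) // proceed regardless of the entry's current state
    .finalLog(_ => NO_LOG_ENTRY) // delete the entry rather than transitioning it
    .commit(_ => {}) // no-op commit: the index data is already gone
}
```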
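Finally, the `JobOperator` change matters because `StreamingQueryManager.awaitAnyTermination()` only returns once some query has terminated; on the cleanup path no streaming query is ever started, so the unguarded call would block the driver indefinitely. A condensed sketch of the guard, using the `exceptionThrown` and `streaming` values visible in the hunk:

```scala
// spark is the active SparkSession; exceptionThrown and streaming come from JobOperator.
if (!exceptionThrown && streaming && spark.streams.active.nonEmpty) {
  // Block only when at least one child streaming query is actually running.
  spark.streams.awaitAnyTermination()
}
```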