From b66aac0b26dbe4a7e97644d2e63ac1d304e761ad Mon Sep 17 00:00:00 2001
From: Louis Chu
Date: Tue, 12 Nov 2024 22:00:01 -0800
Subject: [PATCH] Support UnrecoverableException

Signed-off-by: Louis Chu
---
 .../exception/UnrecoverableException.scala    | 25 +++++++
 .../apache/spark/sql/FlintJobExecutor.scala   | 41 +++++++----
 .../org/apache/spark/sql/FlintREPL.scala      | 69 +++++++++++--------
 .../org/apache/spark/sql/JobOperator.scala    | 57 ++++++++-------
 .../spark/sql/util/ThrowableHandler.scala     | 41 +++++++++++
 5 files changed, 164 insertions(+), 69 deletions(-)
 create mode 100644 flint-commons/src/main/scala/org/apache/spark/sql/exception/UnrecoverableException.scala
 create mode 100644 spark-sql-application/src/main/scala/org/apache/spark/sql/util/ThrowableHandler.scala

diff --git a/flint-commons/src/main/scala/org/apache/spark/sql/exception/UnrecoverableException.scala b/flint-commons/src/main/scala/org/apache/spark/sql/exception/UnrecoverableException.scala
new file mode 100644
index 000000000..c23178f00
--- /dev/null
+++ b/flint-commons/src/main/scala/org/apache/spark/sql/exception/UnrecoverableException.scala
@@ -0,0 +1,25 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.apache.spark.sql.exception
+
+/**
+ * Represents an unrecoverable exception in session management and statement execution. This
+ * exception is used for errors that cannot be handled or recovered from.
+ */
+class UnrecoverableException private (message: String, cause: Throwable)
+    extends RuntimeException(message, cause) {
+
+  def this(cause: Throwable) =
+    this(cause.getMessage, cause)
+}
+
+object UnrecoverableException {
+  def apply(cause: Throwable): UnrecoverableException =
+    new UnrecoverableException(cause)
+
+  def apply(message: String, cause: Throwable): UnrecoverableException =
+    new UnrecoverableException(message, cause)
+}
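
For review purposes, a minimal sketch of how the two companion `apply` overloads are meant to be used. The wrapped `IllegalStateException` and its messages are invented for illustration; only the `UnrecoverableException` API above is from the patch:

```scala
import org.apache.spark.sql.exception.UnrecoverableException

object UnrecoverableExceptionSketch extends App {
  val cause = new IllegalStateException("session lease lost")

  // Message defaults to the cause's message:
  val e1 = UnrecoverableException(cause)
  // Or an explicit message can be supplied:
  val e2 = UnrecoverableException("cannot renew session lease", cause)

  assert(e1.getMessage == "session lease lost")
  assert(e2.getCause eq cause)
  // Still a RuntimeException, so scala.util.control.NonFatal matches it.
  assert(e1.isInstanceOf[RuntimeException])
}
```
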
diff --git a/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala b/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala
index 8e037a53e..63c120a2c 100644
--- a/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala
+++ b/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala
@@ -22,6 +22,7 @@ import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkConfConstants.{DEFAULT_SQL_EXTENSIONS, SQL_EXTENSIONS_KEY}
 import org.apache.spark.sql.catalyst.parser.ParseException
+import org.apache.spark.sql.exception.UnrecoverableException
 import org.apache.spark.sql.flint.config.FlintSparkConf
 import org.apache.spark.sql.flint.config.FlintSparkConf.REFRESH_POLICY
 import org.apache.spark.sql.types._
@@ -44,12 +45,13 @@ trait FlintJobExecutor {
   this: Logging =>
 
   val mapper = new ObjectMapper()
+  val throwableHandler = new ThrowableHandler()
 
   var currentTimeProvider: TimeProvider = new RealTimeProvider()
   var threadPoolFactory: ThreadPoolFactory = new DefaultThreadPoolFactory()
   var environmentProvider: EnvironmentProvider = new RealEnvironment()
   var enableHiveSupport: Boolean = true
-  // termiante JVM in the presence non-deamon thread before exiting
+  // terminate JVM in the presence of non-daemon threads before exiting
   var terminateJVM = true
 
   // The enabled setting, which can be applied only to the top-level mapping definition and to object fields,
@@ -435,11 +437,13 @@ trait FlintJobExecutor {
   }
 
   private def handleQueryException(
-      e: Exception,
+      t: Throwable,
       messagePrefix: String,
       errorSource: Option[String] = None,
       statusCode: Option[Int] = None): String = {
-    val errorMessage = s"$messagePrefix: ${e.getMessage}"
+    throwableHandler.setThrowable(t)
+
+    val errorMessage = s"$messagePrefix: ${t.getMessage}"
     val errorDetails = new java.util.LinkedHashMap[String, String]()
     errorDetails.put("Message", errorMessage)
     errorSource.foreach(es => errorDetails.put("ErrorSource", es))
@@ -450,25 +454,25 @@ trait FlintJobExecutor {
     // CustomLogging will call log4j logger.error() underneath
     statusCode match {
       case Some(code) =>
-        CustomLogging.logError(new OperationMessage(errorMessage, code), e)
+        CustomLogging.logError(new OperationMessage(errorMessage, code), t)
       case None =>
-        CustomLogging.logError(errorMessage, e)
+        CustomLogging.logError(errorMessage, t)
     }
     errorJson
   }
 
-  def getRootCause(e: Throwable): Throwable = {
-    if (e.getCause == null) e
-    else getRootCause(e.getCause)
+  def getRootCause(t: Throwable): Throwable = {
+    if (t.getCause == null) t
+    else getRootCause(t.getCause)
   }
 
   /**
    * This method converts query exception into error string, which then persist to query result
    * metadata
    */
-  def processQueryException(ex: Exception): String = {
-    getRootCause(ex) match {
+  def processQueryException(throwable: Throwable): String = {
+    getRootCause(throwable) match {
       case r: ParseException =>
         handleQueryException(r, ExceptionMessages.SyntaxErrorPrefix)
       case r: AmazonS3Exception =>
@@ -495,15 +499,15 @@ trait FlintJobExecutor {
         handleQueryException(r, ExceptionMessages.QueryAnalysisErrorPrefix)
       case r: SparkException =>
         handleQueryException(r, ExceptionMessages.SparkExceptionErrorPrefix)
-      case r: Exception =>
-        val rootCauseClassName = r.getClass.getName
-        val errMsg = r.getMessage
+      case t: Throwable =>
+        val rootCauseClassName = t.getClass.getName
+        val errMsg = t.getMessage
         if (rootCauseClassName == "org.apache.hadoop.hive.metastore.api.MetaException" &&
           errMsg.contains("com.amazonaws.services.glue.model.AccessDeniedException")) {
           val e = new SecurityException(ExceptionMessages.GlueAccessDeniedMessage)
           handleQueryException(e, ExceptionMessages.QueryRunErrorPrefix)
         } else {
-          handleQueryException(r, ExceptionMessages.QueryRunErrorPrefix)
+          handleQueryException(t, ExceptionMessages.QueryRunErrorPrefix)
         }
     }
   }
@@ -532,6 +536,14 @@ trait FlintJobExecutor {
     throw t
   }
 
+  def checkAndThrowUnrecoverableExceptions(): Unit = {
+    throwableHandler.exceptionThrown.foreach {
+      case e: UnrecoverableException =>
+        throw e
+      case _ => // Do nothing for other types of exceptions
+    }
+  }
+
   def instantiate[T](defaultConstructor: => T, className: String, args: Any*): T = {
     if (Strings.isNullOrEmpty(className)) {
       defaultConstructor
@@ -551,5 +563,4 @@ trait FlintJobExecutor {
       }
     }
   }
-
 }
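
The dispatch in `processQueryException` happens on the innermost cause, so wrapper layers added by Spark or the statement runner do not hide the real failure. A standalone re-statement of that walk, runnable without Spark; the exception chain is invented for illustration:

```scala
import scala.annotation.tailrec

object RootCauseSketch extends App {
  // Same recursion as FlintJobExecutor.getRootCause; it is tail-recursive,
  // so deep cause chains cannot overflow the stack.
  @tailrec
  def getRootCause(t: Throwable): Throwable =
    if (t.getCause == null) t else getRootCause(t.getCause)

  val root = new java.net.ConnectException("connection refused")
  val wrapped =
    new RuntimeException("query failed", new RuntimeException("stage failed", root))

  // processQueryException matches on the innermost cause, not on the
  // outer RuntimeException wrappers.
  assert(getRootCause(wrapped) eq root)
}
```
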
diff --git a/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintREPL.scala b/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintREPL.scala
index 9b6ff4ff6..caafa9c88 100644
--- a/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintREPL.scala
+++ b/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintREPL.scala
@@ -59,6 +59,10 @@ object FlintREPL extends Logging with FlintJobExecutor {
   private val statementRunningCount = new AtomicInteger(0)
   private var queryCountMetric = 0
 
+  // After handling any exceptions from stopping the Spark session,
+  // check if there's a stored exception and throw it if it's an UnrecoverableException
+  sys.addShutdownHook(checkAndThrowUnrecoverableExceptions())
+
   def main(args: Array[String]) {
     val (queryOption, resultIndexOption) =
       parseArgs(args)
@@ -187,9 +191,9 @@ object FlintREPL extends Logging with FlintJobExecutor {
       }
       recordSessionSuccess(sessionTimerContext)
     } catch {
-      case e: Exception =>
+      case t: Throwable =>
         handleSessionError(
-          e,
+          t,
           applicationId,
           jobId,
           sessionId,
@@ -420,24 +424,29 @@ object FlintREPL extends Logging with FlintJobExecutor {
   }
 
   def handleSessionError(
-      e: Exception,
+      t: Throwable,
       applicationId: String,
       jobId: String,
       sessionId: String,
       sessionManager: SessionManager,
       jobStartTime: Long,
       sessionTimerContext: Timer.Context): Unit = {
-    val error = s"Session error: ${e.getMessage}"
-    CustomLogging.logError(error, e)
+    throwableHandler.recordThrowable(s"Session error: ${t.getMessage}", t)
+
+    try {
+      refreshSessionState(
+        applicationId,
+        jobId,
+        sessionId,
+        sessionManager,
+        jobStartTime,
+        SessionStates.FAIL,
+        Some(t.getMessage))
+    } catch {
+      case t: Throwable =>
+        throwableHandler.recordThrowable(s"Failed to update session state.", t)
+    }
 
-    refreshSessionState(
-      applicationId,
-      jobId,
-      sessionId,
-      sessionManager,
-      jobStartTime,
-      SessionStates.FAIL,
-      Some(e.getMessage))
     recordSessionFailed(sessionTimerContext)
   }
 
@@ -485,8 +494,8 @@ object FlintREPL extends Logging with FlintJobExecutor {
       startTime)
   }
 
-  def processQueryException(ex: Exception, flintStatement: FlintStatement): String = {
-    val error = super.processQueryException(ex)
+  def processQueryException(t: Throwable, flintStatement: FlintStatement): String = {
+    val error = super.processQueryException(t)
     flintStatement.fail()
     flintStatement.error = Some(error)
     error
@@ -581,11 +590,13 @@ object FlintREPL extends Logging with FlintJobExecutor {
     } catch {
       // e.g., maybe due to authentication service connection issue
       // or invalid catalog (e.g., we are operating on data not defined in provided data source)
-      case e: Exception =>
-        val error = s"""Fail to write result of ${flintStatement}, cause: ${e.getMessage}"""
-        CustomLogging.logError(error, e)
+      case e: Throwable =>
+        throwableHandler.recordThrowable(
+          s"""Fail to write result of ${flintStatement}, cause: ${e.getMessage}""",
+          e)
         flintStatement.fail()
     } finally {
+      logInfo(s"command complete: $flintStatement")
       statementExecutionManager.updateStatement(flintStatement)
       recordStatementStateChange(flintStatement, statementTimerContext)
     }
@@ -671,8 +682,8 @@ object FlintREPL extends Logging with FlintJobExecutor {
             flintStatement,
             sessionId,
             startTime))
-      case e: Exception =>
-        val error = processQueryException(e, flintStatement)
+      case t: Throwable =>
+        val error = processQueryException(t, flintStatement)
         Some(
           handleCommandFailureAndGetFailedData(
             applicationId,
@@ -747,7 +758,7 @@ object FlintREPL extends Logging with FlintJobExecutor {
             startTime))
         case NonFatal(e) =>
           val error = s"An unexpected error occurred: ${e.getMessage}"
-          CustomLogging.logError(error, e)
+          throwableHandler.recordThrowable(error, e)
           dataToWrite = Some(
             handleCommandFailureAndGetFailedData(
               applicationId,
@@ -786,7 +797,6 @@ object FlintREPL extends Logging with FlintJobExecutor {
         queryWaitTimeMillis)
     }
 
-    logInfo(s"command complete: $flintStatement")
     (dataToWrite, verificationResult)
   }
 
@@ -858,7 +868,8 @@ object FlintREPL extends Logging with FlintJobExecutor {
         }
       }
     } catch {
-      case e: Exception => logError(s"Failed to update session state for $sessionId", e)
+      case t: Throwable =>
+        throwableHandler.recordThrowable(s"Failed to update session state for $sessionId", t)
     }
   }
 }
@@ -897,10 +908,10 @@ object FlintREPL extends Logging with FlintJobExecutor {
             MetricConstants.REQUEST_METADATA_HEARTBEAT_FAILED_METRIC
           ) // Record heartbeat failure metric
         // maybe due to invalid sequence number or primary term
-        case e: Exception =>
-          CustomLogging.logWarning(
+        case t: Throwable =>
+          throwableHandler.recordThrowable(
             s"""Fail to update the last update time of the flint instance ${sessionId}""",
-            e)
+            t)
           incrementCounter(
             MetricConstants.REQUEST_METADATA_HEARTBEAT_FAILED_METRIC
           ) // Record heartbeat failure metric
@@ -948,8 +959,10 @@ object FlintREPL extends Logging with FlintJobExecutor {
       }
     } catch {
       // still proceed since we are not sure what happened (e.g., OpenSearch cluster may be unresponsive)
-      case e: Exception =>
-        CustomLogging.logError(s"""Fail to find id ${sessionId} from session index.""", e)
+      case t: Throwable =>
+        throwableHandler.recordThrowable(
+          s"""Fail to find id ${sessionId} from session index.""",
+          t)
         true
     }
   }
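
One behavioral detail worth calling out: `handleSessionError` now guards the `refreshSessionState` call, so a failure while persisting the FAIL state can no longer escape the handler and skip `recordSessionFailed`. A stubbed illustration of that control flow; both collaborators are fakes standing in for the session manager and CustomLogging, and only the try/catch shape mirrors the patch:

```scala
object SessionErrorFlowSketch extends App {
  // Fake collaborators (hypothetical): the real code talks to OpenSearch.
  def refreshSessionState(): Unit = throw new RuntimeException("OpenSearch write rejected")
  def recordThrowable(err: String): Unit = println(s"ERROR: $err")
  def recordSessionFailed(): Unit = println("session failure metric recorded")

  val original = new RuntimeException("session crashed")
  recordThrowable(s"Session error: ${original.getMessage}")

  try refreshSessionState()
  catch {
    // Without this guard, the second failure would propagate and the
    // failure metric below would never be recorded.
    case t: Throwable => recordThrowable(s"Failed to update session state: ${t.getMessage}")
  }

  recordSessionFailed()
}
```
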
diff --git a/spark-sql-application/src/main/scala/org/apache/spark/sql/JobOperator.scala b/spark-sql-application/src/main/scala/org/apache/spark/sql/JobOperator.scala
index 8582d3037..27b0be84f 100644
--- a/spark-sql-application/src/main/scala/org/apache/spark/sql/JobOperator.scala
+++ b/spark-sql-application/src/main/scala/org/apache/spark/sql/JobOperator.scala
@@ -82,9 +82,6 @@ case class JobOperator(
       LangType.SQL,
       currentTimeProvider.currentEpochMillis())
 
-    var exceptionThrown = true
-    var error: String = null
-
     try {
       val futurePrepareQueryExecution = Future {
         statementExecutionManager.prepareStatementExecution()
       }
@@ -94,7 +91,7 @@ case class JobOperator(
           ThreadUtils.awaitResult(futurePrepareQueryExecution, Duration(1, MINUTES)) match {
             case Right(_) => data
             case Left(err) =>
-              error = err
+              throwableHandler.setError(err)
               constructErrorDF(
                 applicationId,
                 jobId,
@@ -107,11 +104,9 @@ case class JobOperator(
                 "",
                 startTime)
         })
-      exceptionThrown = false
     } catch {
       case e: TimeoutException =>
-        error = s"Preparation for query execution timed out"
-        logError(error, e)
+        throwableHandler.recordThrowable(s"Preparation for query execution timed out", e)
         dataToWrite = Some(
           constructErrorDF(
             applicationId,
@@ -119,13 +114,13 @@ case class JobOperator(
             sparkSession,
             dataSource,
             "TIMEOUT",
-            error,
+            throwableHandler.error,
             queryId,
             query,
             "",
             startTime))
-      case e: Exception =>
-        val error = processQueryException(e)
+      case t: Throwable =>
+        val error = processQueryException(t)
         dataToWrite = Some(
           constructErrorDF(
             applicationId,
@@ -146,27 +141,32 @@ case class JobOperator(
       try {
         dataToWrite.foreach(df => writeDataFrameToOpensearch(df, resultIndex, osClient))
       } catch {
-        case e: Exception =>
-          exceptionThrown = true
-          error = s"Failed to write to result index. originalError='${error}'"
-          logError(error, e)
+        case t: Throwable =>
+          throwableHandler.recordThrowable(
+            s"Failed to write to result index. originalError='${throwableHandler.error}'",
+            t)
       }
-      if (exceptionThrown) statement.fail() else statement.complete()
-      statement.error = Some(error)
-      statementExecutionManager.updateStatement(statement)
+      if (throwableHandler.hasException) statement.fail() else statement.complete()
+      statement.error = Some(throwableHandler.error)
 
-      cleanUpResources(exceptionThrown, threadPool, startTime)
+      try {
+        statementExecutionManager.updateStatement(statement)
+      } catch {
+        case t: Throwable =>
+          throwableHandler.recordThrowable(
+            s"Failed to update statement. originalError='${throwableHandler.error}'",
+            t)
+      }
+
+      cleanUpResources(threadPool)
     }
   }
 
-  def cleanUpResources(
-      exceptionThrown: Boolean,
-      threadPool: ThreadPoolExecutor,
-      startTime: Long): Unit = {
+  def cleanUpResources(threadPool: ThreadPoolExecutor): Unit = {
     val isStreaming = jobType.equalsIgnoreCase(FlintJobType.STREAMING)
     try {
       // Wait for streaming job complete if no error
-      if (!exceptionThrown && isStreaming) {
+      if (!throwableHandler.hasException && isStreaming) {
         // Clean Spark shuffle data after each microBatch.
         sparkSession.streams.addListener(new ShuffleCleaner(sparkSession))
         // Await index monitor before the main thread terminates
       } else {
         logInfo(s"""
            | Skip streaming job await due to conditions not met:
-           |  - exceptionThrown: $exceptionThrown
+           |  - exceptionThrown: ${throwableHandler.hasException}
            |  - streaming: $isStreaming
            |  - activeStreams: ${sparkSession.streams.active.mkString(",")}
            |""".stripMargin)
       }
@@ -190,7 +190,7 @@ case class JobOperator(
     } catch {
       case e: Exception => logError("Fail to close threadpool", e)
     }
-    recordStreamingCompletionStatus(exceptionThrown)
+    recordStreamingCompletionStatus(throwableHandler.hasException)
 
     // Check for non-daemon threads that may prevent the driver from shutting down.
     // Non-daemon threads other than the main thread indicate that the driver is still processing tasks,
@@ -219,8 +219,13 @@ case class JobOperator(
       logInfo("Stopped Spark session")
     } match {
       case Success(_) =>
-      case Failure(e) => logError("unexpected error while stopping spark session", e)
+      case Failure(e) =>
+        throwableHandler.recordThrowable("unexpected error while stopping spark session", e)
     }
+
+    // After handling any exceptions from stopping the Spark session,
+    // check if there's a stored exception and throw it if it's an UnrecoverableException
+    checkAndThrowUnrecoverableExceptions()
   }
 
   /**
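
The `originalError='...'` interpolation above is what keeps earlier context when a later step fails: each `recordThrowable` overwrites the handler's slots, but the new message embeds the previous one. A simplified handler (logging omitted, names illustrative) showing the resulting message and the fail/complete decision:

```scala
object ErrorChainingSketch extends App {
  // Reduced stand-in for ThrowableHandler: just the error/throwable slots.
  class Handler {
    private var _error: String = _
    var exceptionThrown: Option[Throwable] = None
    def error: String = _error
    def hasException: Boolean = exceptionThrown.isDefined
    def recordThrowable(err: String, t: Throwable): Unit = {
      _error = err
      exceptionThrown = Some(t)
    }
  }

  val h = new Handler
  h.recordThrowable("Preparation for query execution timed out", new RuntimeException("timeout"))
  // The message argument is built before the slot is overwritten,
  // so the first error survives inside the second:
  h.recordThrowable(
    s"Failed to write to result index. originalError='${h.error}'",
    new RuntimeException("write refused"))

  println(h.error)
  // Failed to write to result index. originalError='Preparation for query execution timed out'

  // statement.fail() vs statement.complete() is then driven by hasException:
  println(if (h.hasException) "statement.fail()" else "statement.complete()")
}
```
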
originalError='${throwableHandler.error}'", + t) + } + + cleanUpResources(threadPool) } } - def cleanUpResources( - exceptionThrown: Boolean, - threadPool: ThreadPoolExecutor, - startTime: Long): Unit = { + def cleanUpResources(threadPool: ThreadPoolExecutor): Unit = { val isStreaming = jobType.equalsIgnoreCase(FlintJobType.STREAMING) try { // Wait for streaming job complete if no error - if (!exceptionThrown && isStreaming) { + if (!throwableHandler.hasException && isStreaming) { // Clean Spark shuffle data after each microBatch. sparkSession.streams.addListener(new ShuffleCleaner(sparkSession)) // Await index monitor before the main thread terminates @@ -174,7 +174,7 @@ case class JobOperator( } else { logInfo(s""" | Skip streaming job await due to conditions not met: - | - exceptionThrown: $exceptionThrown + | - exceptionThrown: ${throwableHandler.hasException} | - streaming: $isStreaming | - activeStreams: ${sparkSession.streams.active.mkString(",")} |""".stripMargin) @@ -190,7 +190,7 @@ case class JobOperator( } catch { case e: Exception => logError("Fail to close threadpool", e) } - recordStreamingCompletionStatus(exceptionThrown) + recordStreamingCompletionStatus(throwableHandler.hasException) // Check for non-daemon threads that may prevent the driver from shutting down. // Non-daemon threads other than the main thread indicate that the driver is still processing tasks, @@ -219,8 +219,13 @@ case class JobOperator( logInfo("Stopped Spark session") } match { case Success(_) => - case Failure(e) => logError("unexpected error while stopping spark session", e) + case Failure(e) => + throwableHandler.recordThrowable("unexpected error while stopping spark session", e) } + + // After handling any exceptions from stopping the Spark session, + // check if there's a stored exception and throw it if it's an UnrecoverableException + checkAndThrowUnrecoverableExceptions() } /** diff --git a/spark-sql-application/src/main/scala/org/apache/spark/sql/util/ThrowableHandler.scala b/spark-sql-application/src/main/scala/org/apache/spark/sql/util/ThrowableHandler.scala new file mode 100644 index 000000000..01c90bdd4 --- /dev/null +++ b/spark-sql-application/src/main/scala/org/apache/spark/sql/util/ThrowableHandler.scala @@ -0,0 +1,41 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.apache.spark.sql.util + +import org.opensearch.flint.core.logging.CustomLogging + +/** + * Handles and manages exceptions and error messages during each emr job run. Provides methods to + * set, retrieve, and reset exception information. + */ +class ThrowableHandler { + private var _throwableOption: Option[Throwable] = None + private var _error: String = _ + + def exceptionThrown: Option[Throwable] = _throwableOption + def error: String = _error + + def recordThrowable(err: String, t: Throwable): Unit = { + _error = err + _throwableOption = Some(t) + CustomLogging.logError(err, t) + } + + def setError(err: String): Unit = { + _error = err + } + + def setThrowable(t: Throwable): Unit = { + _throwableOption = Some(t) + } + + def reset(): Unit = { + _throwableOption = None + _error = null + } + + def hasException: Boolean = _throwableOption.isDefined +}