opensearch-project · noCharger · Aug 28, 2024 · Aug 9, 2024 · Aug 15, 2024 · Aug 15, 2024
@@ -0,0 +1,20 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.apache.spark.sql
+
+import org.opensearch.flint.common.model.FlintStatement
+
+/**
+ * Trait for writing the result of a query execution to an external data storage.
+ *
+ * Declares a single abstract method; the storage target and the write semantics are left to the
+ * implementing class.
+ */
+trait QueryResultWriter {
+
+  /**
+   * Writes the given DataFrame, which represents the result of a query execution, to an external
+   * data storage based on the provided FlintStatement metadata.
+   *
+   * @param dataFrame
+   *   the query result to write
+   * @param flintStatement
+   *   the statement whose metadata is used for the write
+   */
+  def writeDataFrame(dataFrame: DataFrame, flintStatement: FlintStatement): Unit
+}
@@ -0,0 +1,43 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.apache.spark.sql
+
+import org.opensearch.flint.common.model.{FlintStatement, InteractiveSession}
+
+import org.apache.spark.sql.SessionUpdateMode.SessionUpdateMode
+
+/**
+ * Trait defining the interface for managing interactive sessions.
+ *
+ * All members are abstract; how and where session data is stored is left to the implementation.
+ */
+trait SessionManager {
+
+  /**
+   * Retrieves metadata about the session manager.
+   *
+   * @return
+   *   a map from string keys to untyped values. NOTE(review): the expected keys and value types
+   *   are not specified by this trait.
+   */
+  def getSessionContext: Map[String, Any]
+
+  /**
+   * Fetches the details of a specific session.
+   *
+   * @param sessionId
+   *   identifier of the session to look up
+   * @return
+   *   the session details wrapped in an Option. NOTE(review): None presumably means the session
+   *   was not found; confirm with the implementations.
+   */
+  def getSessionDetails(sessionId: String): Option[InteractiveSession]
+
+  /**
+   * Updates the details of a specific session.
+   *
+   * @param sessionDetails
+   *   the session details to store
+   * @param updateMode
+   *   one of the SessionUpdateMode values (UPDATE, UPSERT, UPDATE_IF) selecting how the update
+   *   is applied
+   */
+  def updateSessionDetails(
+      sessionDetails: InteractiveSession,
+      updateMode: SessionUpdateMode): Unit
+
+  /**
+   * Records a heartbeat for a specific session to indicate it is still active.
+   *
+   * @param sessionId
+   *   identifier of the session the heartbeat belongs to
+   */
+  def recordHeartbeat(sessionId: String): Unit
+}
+
+object SessionUpdateMode extends Enumeration { // modes passed to SessionManager.updateSessionDetails
+  type SessionUpdateMode = Value // type alias so the enumeration can be used as a parameter type
+  val UPDATE, UPSERT, UPDATE_IF = Value // NOTE(review): per-mode semantics are not defined here
+}
@@ -0,0 +1,42 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.apache.spark.sql
+
+import org.opensearch.flint.common.model.FlintStatement
+
+/**
+ * Trait defining the interface for managing FlintStatement execution. For example, in FlintREPL,
+ * multiple FlintStatements are running in a micro-batch within the same session.
+ *
+ * This interface can also apply to other Spark entry points such as FlintJob.
+ */
+trait StatementExecutionManager {
+
+  /**
+   * Prepares execution of each individual statement.
+   *
+   * @return
+   *   an Either[String, Unit]. NOTE(review): presumably Left carries an error message and Right
+   *   signals success; confirm with the implementations.
+   */
+  def prepareStatementExecution(): Either[String, Unit]
+
+  /**
+   * Executes a specific statement and returns the Spark DataFrame.
+   *
+   * @param statement
+   *   the statement to execute
+   * @return
+   *   the DataFrame produced for the statement
+   */
+  def executeStatement(statement: FlintStatement): DataFrame
+
+  /**
+   * Retrieves the next statement to be executed.
+   *
+   * @return
+   *   the next statement wrapped in an Option. NOTE(review): None presumably means nothing is
+   *   waiting to be executed; confirm with the implementations.
+   */
+  def getNextStatement(): Option[FlintStatement]
+
+  /**
+   * Updates a specific statement.
+   *
+   * @param statement
+   *   the statement to update
+   */
+  def updateStatement(statement: FlintStatement): Unit
+
+  /**
+   * Terminates the statement lifecycle.
+   */
+  def terminateStatementsExecution(): Unit
+}
@@ -65,7 +65,7 @@ class FlintStatement(
 
   // Does not include context, which could contain sensitive information.
   // The query text is omitted as well (presumably for the same reason).
   override def toString: String =
-    s"FlintStatement(state=$state, query=$query, statementId=$statementId, queryId=$queryId, submitTime=$submitTime, error=$error)"
+    s"FlintStatement(state=$state, statementId=$statementId, queryId=$queryId, submitTime=$submitTime, error=$error)"
 }
 
 object FlintStatement {

@@ -22,7 +22,6 @@
  * Abstract OpenSearch Reader.
  */
 public abstract class OpenSearchReader implements FlintReader {
-
   @VisibleForTesting
   /** Search request source builder. */
   public final SearchRequest searchRequest;

@@ -231,6 +231,15 @@ object FlintSparkConf {
     FlintConfig("spark.metadata.accessAWSCredentialsProvider")
       .doc("AWS credentials provider for metadata access permission")
       .createOptional()
+  val CUSTOM_SESSION_MANAGER = // NOTE(review): no .doc(...) here, unlike the entry above
+    FlintConfig("spark.flint.job.customSessionManager")
+      .createOptional()
+  val CUSTOM_STATEMENT_MANAGER = // NOTE(review): .doc(...) missing here as well
+    FlintConfig("spark.flint.job.customStatementManager")
+      .createOptional()
+  val CUSTOM_QUERY_RESULT_WRITER = // NOTE(review): .doc(...) missing here as well
+    FlintConfig("spark.flint.job.customQueryResultWriter")
+      .createOptional()
 }
 
 /**

@@ -145,7 +145,6 @@ trait OpenSearchSuite extends BeforeAndAfterAll {
 
       val response =
         openSearchClient.bulk(request, RequestOptions.DEFAULT)
-
       assume(
         !response.hasFailures,
         s"bulk index docs to $index failed: ${response.buildFailureMessage()}")

@@ -11,15 +11,14 @@ import scala.concurrent.duration.Duration
 import org.opensearch.flint.core.storage.{FlintReader, OpenSearchUpdater}
 
 case class CommandContext(
-    spark: SparkSession,
-    dataSource: String,
-    resultIndex: String,
-    sessionId: String,
-    flintSessionIndexUpdater: OpenSearchUpdater,
-    osClient: OSClient,
-    sessionIndex: String,
-    jobId: String,
-    queryExecutionTimeout: Duration,
-    inactivityLimitMillis: Long,
-    queryWaitTimeMillis: Long,
-    queryLoopExecutionFrequency: Long)
+    val spark: SparkSession,
+    val dataSource: String,
+    val sessionId: String,
+    val sessionManager: SessionManager,
+    val jobId: String,
+    var statementsExecutionManager: StatementExecutionManager,
+    val queryResultWriter: QueryResultWriter,
+    val queryExecutionTimeout: Duration,
+    val inactivityLimitMillis: Long,
+    val queryWaitTimeMillis: Long,
+    val queryLoopExecutionFrequency: Long)
@@ -12,7 +12,6 @@ import org.opensearch.flint.core.storage.FlintReader
 // Immutable record holding two timestamps, a verification result, a pending Future and an
 // execution context.
 case class CommandState(
     recordedLastActivityTime: Long,
     recordedVerificationResult: VerificationResult,
-    flintReader: FlintReader,
-    futureMappingCheck: Future[Either[String, Unit]],
+    futurePrepareQueryExecution: Future[Either[String, Unit]], // NOTE(review): presumably the pending prepareStatementExecution() result; confirm
     executionContext: ExecutionContextExecutor,
     recordedLastCanPickCheckTime: Long)
@@ -29,7 +29,7 @@ import org.apache.spark.sql.types._
  */
 object FlintJob extends Logging with FlintJobExecutor {
   def main(args: Array[String]): Unit = {
-    val (queryOption, resultIndex) = parseArgs(args)
+    val (queryOption, resultIndexOption) = parseArgs(args)
 
     val conf = createSparkConf()
     val jobType = conf.get("spark.flint.job.type", "batch")
@@ -41,6 +41,9 @@ object FlintJob extends Logging with FlintJobExecutor {
     if (query.isEmpty) {
       logAndThrow(s"Query undefined for the ${jobType} job.")
     }
+    if (resultIndexOption.isEmpty) {
+      logAndThrow("resultIndex is not set")
+    }
     // https://github.com/opensearch-project/opensearch-spark/issues/138
     /*
      * To execute queries such as `CREATE SKIPPING INDEX ON my_glue1.default.http_logs_plain (`@timestamp` VALUE_SET) WITH (auto_refresh = true)`,
@@ -58,7 +61,7 @@ object FlintJob extends Logging with FlintJobExecutor {
         createSparkSession(conf),
         query,
         dataSource,
-        resultIndex,
+        resultIndexOption.get,
         jobType.equalsIgnoreCase("streaming"),
         streamingRunningCount)
     registerGauge(MetricConstants.STREAMING_RUNNING_METRIC, streamingRunningCount)

@@ -493,16 +493,21 @@ trait FlintJobExecutor {
     }
   }
 
-  def parseArgs(args: Array[String]): (Option[String], String) = {
+  /**
+   * Parses the entry-point arguments into a (query, resultIndex) pair.
+   *
+   * Before OS 2.13, there are two arguments from the entry point: query and result index.
+   * Starting from OS 2.13, query is optional for FlintREPL. Since Flint 0.5, result index is
+   * also optional for non-OpenSearch result persistence.
+   *
+   * Mapping: no arguments yields (None, None); a single argument is taken as the result index;
+   * two arguments are the query followed by the result index; any other count is passed to
+   * logAndThrow.
+   */
+  def parseArgs(args: Array[String]): (Option[String], Option[String]) = {
     args match {
+      case Array() =>
+        (None, None)
       case Array(resultIndex) =>
-        (None, resultIndex) // Starting from OS 2.13, resultIndex is the only argument
+        (None, Some(resultIndex))
       case Array(query, resultIndex) =>
-        (
-          Some(query),
-          resultIndex
-        ) // Before OS 2.13, there are two arguments, the second one is resultIndex
-      case _ => logAndThrow("Unsupported number of arguments. Expected 1 or 2 arguments.")
+        (Some(query), Some(resultIndex))
+      case _ =>
+        logAndThrow("Unsupported number of arguments. Expected no more than two arguments.")
     }
   }