From a9b894e97d82efaa547b6aae1c4e3e5ecc19db27 Mon Sep 17 00:00:00 2001
From: Chen Dai <daichen@amazon.com>
Date: Wed, 18 Oct 2023 09:01:08 -0700
Subject: [PATCH] Change UT and IT with watermark delay option

Signed-off-by: Chen Dai <daichen@amazon.com>
---
 .../spark/mv/FlintSparkMaterializedView.scala | 16 ++++++++-------
 .../mv/FlintSparkMaterializedViewSuite.scala  | 20 ++++++++++++++-----
 .../FlintSparkMaterializedViewITSuite.scala   | 18 ++++++++++++-----
 3 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/mv/FlintSparkMaterializedView.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/mv/FlintSparkMaterializedView.scala
index ee58ec7f5..112de680f 100644
--- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/mv/FlintSparkMaterializedView.scala
+++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/mv/FlintSparkMaterializedView.scala
@@ -44,9 +44,6 @@ case class FlintSparkMaterializedView(
     extends FlintSparkIndex
     with StreamingRefresh {
 
-  /** TODO: add it to index option */
-  private val watermarkDelay = "0 Minute"
-
   override val kind: String = MV_INDEX_TYPE
 
   override def name(): String = getFlintIndexName(mvName)
@@ -81,8 +78,8 @@ case class FlintSparkMaterializedView(
      *  2.Set isStreaming flag to true in Relation operator
      */
     val streamingPlan = batchPlan transform {
-      case WindowingAggregate(agg, timeCol) =>
-        agg.copy(child = watermark(timeCol, watermarkDelay, agg.child))
+      case WindowingAggregate(aggregate, timeCol) =>
+        aggregate.copy(child = watermark(timeCol, aggregate.child))
 
       case relation: UnresolvedRelation if !relation.isStreaming =>
         relation.copy(isStreaming = true)
@@ -90,7 +87,12 @@ case class FlintSparkMaterializedView(
     logicalPlanToDataFrame(spark, streamingPlan)
   }
 
-  private def watermark(timeCol: Attribute, delay: String, child: LogicalPlan) = {
+  private def watermark(timeCol: Attribute, child: LogicalPlan) = {
+    // Fail fast with the same error as require() when no watermark delay is configured
+    val delay = options.watermarkDelay().getOrElse {
+      throw new IllegalArgumentException(
+        "requirement failed: watermark delay is required for incremental refresh with aggregation")
+    }
     EventTimeWatermark(timeCol, IntervalUtils.fromIntervalString(delay), child)
   }
 
@@ -107,7 +109,7 @@ case class FlintSparkMaterializedView(
 
       if (winFuncs.size != 1) {
         throw new IllegalStateException(
-          "A windowing function is required for streaming aggregation")
+          "A windowing function is required for incremental refresh with aggregation")
       }
 
       // Assume first aggregate item must be time column
diff --git a/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/mv/FlintSparkMaterializedViewSuite.scala b/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/mv/FlintSparkMaterializedViewSuite.scala
index c28495c69..1f9b52963 100644
--- a/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/mv/FlintSparkMaterializedViewSuite.scala
+++ b/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/mv/FlintSparkMaterializedViewSuite.scala
@@ -100,19 +100,24 @@ class FlintSparkMaterializedViewSuite extends FlintSuite {
           | GROUP BY TUMBLE(time, '1 Minute')
           |""".stripMargin
 
-      val mv = FlintSparkMaterializedView(testMvName, testQuery, Map.empty)
+      val mv = FlintSparkMaterializedView(
+        testMvName,
+        testQuery,
+        Map.empty,
+        FlintSparkIndexOptions(Map("watermark_delay" -> "30 Seconds")))
+
       val actualPlan = mv.buildStream(spark).queryExecution.logical
       assert(
         actualPlan.sameSemantics(
           streamingRelation(testTable)
-            .watermark($"time", "0 Minute")
+            .watermark($"time", "30 Seconds")
             .groupBy($"TUMBLE".function($"time", "1 Minute"))(
               $"window.start" as "startTime",
               count(1) as "count")))
     }
   }
 
-  test("build stream with filtering query") {
+  test("build stream with filtering aggregate query") {
     val testTable = "mv_build_test"
     withTable(testTable) {
       sql(s"CREATE TABLE $testTable (time TIMESTAMP, name STRING, age INT) USING CSV")
@@ -127,13 +132,18 @@ class FlintSparkMaterializedViewSuite extends FlintSuite {
            | GROUP BY TUMBLE(time, '1 Minute')
            |""".stripMargin
 
-      val mv = FlintSparkMaterializedView(testMvName, testQuery, Map.empty)
+      val mv = FlintSparkMaterializedView(
+        testMvName,
+        testQuery,
+        Map.empty,
+        FlintSparkIndexOptions(Map("watermark_delay" -> "30 Seconds")))
+
       val actualPlan = mv.buildStream(spark).queryExecution.logical
       assert(
         actualPlan.sameSemantics(
           streamingRelation(testTable)
             .where($"age" > 30)
-            .watermark($"time", "0 Minute")
+            .watermark($"time", "30 Seconds")
             .groupBy($"TUMBLE".function($"time", "1 Minute"))(
               $"window.start" as "startTime",
               count(1) as "count")))
diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewITSuite.scala
index 29ab433c6..29ce4e248 100644
--- a/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewITSuite.scala
+++ b/integ-test/src/test/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewITSuite.scala
@@ -43,7 +43,11 @@ class FlintSparkMaterializedViewITSuite extends FlintSparkSuite {
 
   test("create materialized view with metadata successfully") {
     val indexOptions =
-      FlintSparkIndexOptions(Map("auto_refresh" -> "true", "checkpoint_location" -> "s3://test/"))
+      FlintSparkIndexOptions(
+        Map(
+          "auto_refresh" -> "true",
+          "checkpoint_location" -> "s3://test/",
+          "watermark_delay" -> "30 Seconds"))
     flint
       .materializedView()
       .name(testMvName)
@@ -70,7 +74,8 @@ class FlintSparkMaterializedViewITSuite extends FlintSparkSuite {
          |    }],
          |    "options": {
          |      "auto_refresh": "true",
-         |      "checkpoint_location": "s3://test/"
+         |      "checkpoint_location": "s3://test/",
+         |      "watermark_delay": "30 Seconds"
          |    },
          |    "properties": {}
          |  },
@@ -147,7 +152,7 @@ class FlintSparkMaterializedViewITSuite extends FlintSparkSuite {
     }
   }
 
-  test("incremental refresh materialized view with filtering query") {
+  test("incremental refresh materialized view with filtering aggregate query") {
     val filterQuery =
       s"""
          | SELECT
@@ -155,7 +160,7 @@ class FlintSparkMaterializedViewITSuite extends FlintSparkSuite {
          |   COUNT(*) AS count
          | FROM $testTable
          | WHERE address = 'Seattle'
-         | GROUP BY TUMBLE(time, '10 Minutes')
+         | GROUP BY TUMBLE(time, '5 Minutes')
          |""".stripMargin
 
     withIncrementalMaterializedView(filterQuery) { indexData =>
@@ -190,7 +195,10 @@ class FlintSparkMaterializedViewITSuite extends FlintSparkSuite {
       codeBlock: DataFrame => Unit): Unit = {
     withTempDir { checkpointDir =>
       val indexOptions = FlintSparkIndexOptions(
-        Map("auto_refresh" -> "true", "checkpoint_location" -> checkpointDir.getAbsolutePath))
+        Map(
+          "auto_refresh" -> "true",
+          "checkpoint_location" -> checkpointDir.getAbsolutePath,
+          "watermark_delay" -> "1 Minute")) // Must be small so that the window closes soon
 
       flint
         .materializedView()