From 63ece19a69243ee817f08d0228cc433a6718c59f Mon Sep 17 00:00:00 2001 From: YANGDB Date: Tue, 10 Oct 2023 23:21:02 -0700 Subject: [PATCH] add correlation tests Signed-off-by: YANGDB --- .../ppl/FlintSparkPPLCorrelationITSuite.scala | 76 +++++++++++++++++-- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/integ-test/src/test/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCorrelationITSuite.scala b/integ-test/src/test/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCorrelationITSuite.scala index cfcefe7cb..c345fd5bc 100644 --- a/integ-test/src/test/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCorrelationITSuite.scala +++ b/integ-test/src/test/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCorrelationITSuite.scala @@ -51,6 +51,7 @@ class FlintSparkPPLCorrelationITSuite | ( | name STRING, | occupation STRING, + | country STRING, | salary INT | ) | USING CSV @@ -71,16 +72,22 @@ class FlintSparkPPLCorrelationITSuite | VALUES ('Jake', 70, 'California', 'USA'), | ('Hello', 30, 'New York', 'USA'), | ('John', 25, 'Ontario', 'Canada'), + | ('Jim', 27, 'B.C', 'Canada'), + | ('Peter', 57, 'B.C', 'Canada'), + | ('Rick', 70, 'B.C', 'Canada'), + | ('David', 40, 'Washington', 'USA'), | ('Jane', 20, 'Quebec', 'Canada') | """.stripMargin) // Insert data into the new table sql(s""" | INSERT INTO $testTable2 | PARTITION (year=2023, month=4) - | VALUES ('Jake', 'Engineer', 100000), - | ('Hello', 'Artist', 70000), - | ('John', 'Doctor', 120000), - | ('Jane', 'Scientist', 90000) + | VALUES ('Jake', 'Engineer', 'England' , 100000), + | ('Hello', 'Artist', 'USA', 70000), + | ('John', 'Doctor', 'Canada', 120000), + | ('David', 'Doctor', 'USA', 120000), + | ('David', 'Unemployed', 'Canada', 0), + | ('Jane', 'Scientist', 'Canada', 90000) | """.stripMargin) } @@ -113,10 +120,12 @@ class FlintSparkPPLCorrelationITSuite val results: Array[Row] = frame.collect() // Define the expected results val expectedResults: Array[Row] = Array( - Row("Jake", 70, "California", "USA", 2023, 4, "Jake", "Engineer", 100000, 2023, 4), - Row("Hello", 30, "New York", "USA", 2023, 4, "Hello", "Artist", 70000, 2023, 4), - Row("John", 25, "Ontario", "Canada", 2023, 4, "John", "Doctor", 120000, 2023, 4), - Row("Jane", 20, "Quebec", "Canada", 2023, 4, "Jane", "Scientist", 90000, 2023, 4)) + Row("Jake", 70, "California", "USA", 2023, 4, "Jake", "Engineer", "England", 100000, 2023, 4), + Row("Hello", 30, "New York", "USA", 2023, 4, "Hello", "Artist", "USA", 70000, 2023, 4), + Row("John", 25, "Ontario", "Canada", 2023, 4, "John", "Doctor", "Canada", 120000, 2023, 4), + Row("David", 40, "Washington", "USA", 2023, 4, "David", "Unemployed", "Canada", 0, 2023, 4), + Row("David", 40, "Washington", "USA", 2023, 4, "David", "Doctor", "USA", 120000, 2023, 4), + Row("Jane", 20, "Quebec", "Canada", 2023, 4, "Jane", "Scientist", "Canada", 90000, 2023, 4)) implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) // Compare the results @@ -153,4 +162,55 @@ class FlintSparkPPLCorrelationITSuite assert(compareByString(expectedPlan) === compareByString(logicalPlan)) } + test("create ppl correlation query with two tables correlating on a two fields test") { + val frame = sql(s""" + | source = $testTable1, $testTable2| where year = 2023 AND month = 4 | correlate exact fields(name, country) scope(month, 1W) + | mapping($testTable1.name = $testTable2.name, $testTable1.country = $testTable2.country) + | """.stripMargin) + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define the expected results + val expectedResults: Array[Row] = Array( + Row("David", 40, "Washington", "USA", 2023, 4, "David", "Doctor", "USA", 120000, 2023, 4), + Row("Hello", 30, "New York", "USA", 2023, 4, "Hello", "Artist", "USA", 70000, 2023, 4), + Row("John", 25, "Ontario", "Canada", 2023, 4, "John", "Doctor", "Canada", 120000, 2023, 4), + Row("Jane", 20, "Quebec", "Canada", 2023, 4, "Jane", "Scientist", "Canada", 90000, 2023, 4)) + + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + // Compare the results + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Define unresolved relations + val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) + val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) + + // Define filter expressions + val filter1Expr = And( + EqualTo(UnresolvedAttribute("year"), Literal(2023)), + EqualTo(UnresolvedAttribute("month"), Literal(4))) + val filter2Expr = And( + EqualTo(UnresolvedAttribute("year"), Literal(2023)), + EqualTo(UnresolvedAttribute("month"), Literal(4))) + // Define subquery aliases + val plan1 = Filter(filter1Expr, table1) + val plan2 = Filter(filter2Expr, table2) + + // Define join condition + val joinCondition = + And( + EqualTo(UnresolvedAttribute(s"$testTable1.name"), UnresolvedAttribute(s"$testTable2.name")), + EqualTo(UnresolvedAttribute(s"$testTable1.country"), UnresolvedAttribute(s"$testTable2.country")) + ) + + // Create Join plan + val joinPlan = Join(plan1, plan2, Inner, Some(joinCondition), JoinHint.NONE) + + // Add the projection + val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) + + // Retrieve the logical plan + val logicalPlan: LogicalPlan = frame.queryExecution.logical + // Compare the two plans + assert(compareByString(expectedPlan) === compareByString(logicalPlan)) + } }