-
Notifications
You must be signed in to change notification settings - Fork 33
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Support flatten with alias #927
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ import java.nio.file.Files | |
import org.opensearch.flint.spark.FlattenGenerator | ||
import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq | ||
|
||
import org.apache.spark.sql.{QueryTest, Row} | ||
import org.apache.spark.sql.{AnalysisException, QueryTest, Row} | ||
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} | ||
import org.apache.spark.sql.catalyst.expressions.{Alias, EqualTo, GeneratorOuter, Literal, Or} | ||
import org.apache.spark.sql.catalyst.plans.logical._ | ||
|
@@ -347,4 +347,85 @@ class FlintSparkPPLFlattenITSuite | |
val expectedPlan = Project(Seq(UnresolvedStar(None)), flattenMultiValue) | ||
comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) | ||
} | ||
|
||
test("flatten struct nested table using alias") { | ||
val frame = sql(s""" | ||
| source = $structNestedTable | ||
| | flatten struct_col | ||
| | flatten field1 as subfield_1 | ||
| | flatten struct_col2 as (field1, field2_2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a test case: it should fail
|
||
| | flatten field1 as subfield_2 | ||
| """.stripMargin) | ||
|
||
assert( | ||
frame.columns.sameElements( | ||
Array("int_col", "field2", "subfield_1", "field2_2", "subfield_2"))) | ||
val results: Array[Row] = frame.collect() | ||
implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, Int](_.getAs[Int](0)) | ||
val expectedResults: Array[Row] = | ||
Array( | ||
Row(30, 123, "value1", 23, "valueA"), | ||
Row(40, 123, "value5", 33, "valueB"), | ||
Row(30, 823, "value4", 83, "valueC"), | ||
Row(40, 456, "value2", 46, "valueD"), | ||
Row(50, 789, "value3", 89, "valueE")).sorted | ||
// Compare the results | ||
assert(results.sorted.sameElements(expectedResults)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. try using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, it seems that function will do sorting internally |
||
|
||
// duplicate alias names | ||
val frame2 = sql(s""" | ||
| source = $structNestedTable | ||
| | flatten struct_col as (field1, field2_2) | ||
| | flatten field1 as subfield_1 | ||
| | flatten struct_col2 as (field1, field2_2) | ||
| | flatten field1 as subfield_2 | ||
Comment on lines
+375
to
+381
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. duplicated alias case I mentioned is
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any concerns about your case? I think the purpose is to test duplicate alias names between 2 flatten commands. I only changed the field name in the original test cases to satisfy our purpose, so we can reuse the constructed expected results. |
||
| """.stripMargin) | ||
|
||
// alias names duplicate with existing fields | ||
assert( | ||
frame2.columns.sameElements( | ||
Array("int_col", "field2_2", "subfield_1", "field2_2", "subfield_2"))) | ||
assert(frame2.collect().sorted.sameElements(expectedResults)) | ||
|
||
val frame3 = sql(s""" | ||
| source = $structNestedTable | ||
| | flatten struct_col as (field1, field2_2) | ||
| | flatten field1 as int_col | ||
| | flatten struct_col2 as (field1, field2_2) | ||
| | flatten field1 as int_col | ||
| """.stripMargin) | ||
|
||
assert( | ||
frame3.columns.sameElements(Array("int_col", "field2_2", "int_col", "field2_2", "int_col"))) | ||
assert(frame3.collect().sorted.sameElements(expectedResults)) | ||
|
||
// Throw AnalysisException if The number of aliases supplied in the AS clause does not match the | ||
// number of columns output | ||
val except = intercept[AnalysisException] { | ||
sql(s""" | ||
| source = $structNestedTable | ||
| | flatten struct_col as (field1) | ||
| | flatten field1 as int_col | ||
| | flatten struct_col2 as (field1, field2_2) | ||
| | flatten field1 as int_col | ||
| """.stripMargin) | ||
} | ||
assert(except.message.contains( | ||
"The number of aliases supplied in the AS clause does not match the number of columns output by the UDTF")) | ||
|
||
// Throw AnalysisException because of ambiguous | ||
val except2 = intercept[AnalysisException] { | ||
sql(s""" | ||
| source = $structNestedTable | ||
| | flatten struct_col as (field1, field2_2) | ||
| | flatten field1 as int_col | ||
| | flatten struct_col2 as (field1, field2_2) | ||
| | flatten field1 as int_col | ||
| | fields field2_2 | ||
| """.stripMargin) | ||
} | ||
assert(except2.message.contains( | ||
"[AMBIGUOUS_REFERENCE] Reference `field2_2` is ambiguous, could be: [`field2_2`, `field2_2`].")) | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -259,7 +259,7 @@ expandCommand | |
; | ||
|
||
flattenCommand | ||
: FLATTEN fieldExpression | ||
: FLATTEN fieldExpression (AS alias = identifierSeq)? | ||
; | ||
|
||
trendlineCommand | ||
|
@@ -1032,6 +1032,11 @@ qualifiedName | |
: ident (DOT ident)* # identsAsQualifiedName | ||
; | ||
|
||
identifierSeq | ||
: qualifiedName (COMMA qualifiedName)* # identsAsQualifiedNameSeq | ||
| LT_PRTHS qualifiedName (COMMA qualifiedName)* RT_PRTHS # identsAsQualifiedNameSeq | ||
Comment on lines
+1036
to
+1037
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This grammar allows four patterns:
Is
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not restrict to two, as you said, both 4 patterns are all allowed. For most of the case, both is ok with or without (), but having () is safer to avoid ambiguous. For example, in SQL, when having sql like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes. That's my point. How about only keeping
or
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @YANG-DB any thoughts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @LantaoJin I personally don't mind the extra verbosity - so I'm good with @qianheng-aws's suggestion... |
||
; | ||
|
||
tableQualifiedName | ||
: tableIdent (DOT ident)* # identsAsTableQualifiedName | ||
; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,9 +13,9 @@ import org.scalatest.matchers.should.Matchers | |
|
||
import org.apache.spark.SparkFunSuite | ||
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} | ||
import org.apache.spark.sql.catalyst.expressions.{Alias, Descending, GeneratorOuter, Literal, NullsLast, RegExpExtract, SortOrder} | ||
import org.apache.spark.sql.catalyst.expressions.{Alias, GeneratorOuter, Literal, RegExpExtract} | ||
import org.apache.spark.sql.catalyst.plans.PlanTest | ||
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, DataFrameDropColumns, Generate, GlobalLimit, LocalLimit, Project, Sort} | ||
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, DataFrameDropColumns, Generate, Project} | ||
import org.apache.spark.sql.types.IntegerType | ||
|
||
class PPLLogicalPlanFlattenCommandTranslatorTestSuite | ||
|
@@ -153,4 +153,45 @@ class PPLLogicalPlanFlattenCommandTranslatorTestSuite | |
comparePlans(expectedPlan, logPlan, checkAnalysis = false) | ||
} | ||
|
||
test("test flatten with one alias") { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @qianheng-aws please add additional flatten-with-aliases query unit tests There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @YANG-DB we already have one, named "test flatten with alias list" |
||
val context = new CatalystPlanContext | ||
val logPlan = | ||
planTransformer.visit( | ||
plan(pplParser, "source=relation | flatten field_with_array as col1"), | ||
context) | ||
|
||
val relation = UnresolvedRelation(Seq("relation")) | ||
val flattenGenerator = new FlattenGenerator(UnresolvedAttribute("field_with_array")) | ||
val outerGenerator = GeneratorOuter(flattenGenerator) | ||
val generate = | ||
Generate(outerGenerator, seq(), true, None, Seq(UnresolvedAttribute("col1")), relation) | ||
val dropSourceColumn = | ||
DataFrameDropColumns(Seq(UnresolvedAttribute("field_with_array")), generate) | ||
val expectedPlan = Project(seq(UnresolvedStar(None)), dropSourceColumn) | ||
comparePlans(expectedPlan, logPlan, checkAnalysis = false) | ||
} | ||
|
||
test("test flatten with alias list") { | ||
val context = new CatalystPlanContext | ||
val logPlan = | ||
planTransformer.visit( | ||
plan(pplParser, "source=relation | flatten field_with_array as (col1, col2)"), | ||
context) | ||
|
||
val relation = UnresolvedRelation(Seq("relation")) | ||
val flattenGenerator = new FlattenGenerator(UnresolvedAttribute("field_with_array")) | ||
val outerGenerator = GeneratorOuter(flattenGenerator) | ||
val generate = Generate( | ||
outerGenerator, | ||
seq(), | ||
true, | ||
None, | ||
Seq(UnresolvedAttribute("col1"), UnresolvedAttribute("col2")), | ||
relation) | ||
val dropSourceColumn = | ||
DataFrameDropColumns(Seq(UnresolvedAttribute("field_with_array")), generate) | ||
val expectedPlan = Project(seq(UnresolvedStar(None)), dropSourceColumn) | ||
comparePlans(expectedPlan, logPlan, checkAnalysis = false) | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add a test for duplicated aliases:
And
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Case 1 will be similar to this test except that it aliased
field2
to field2_2
.opensearch-spark/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala
Line 298 in b050da3
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean, will the duplicated
field2_2
in the query fail?