From c063d5e00a65490cde8dfc76ff8f9a2a601d26d5 Mon Sep 17 00:00:00 2001 From: Lantao Jin Date: Tue, 9 Jul 2024 23:16:22 +0800 Subject: [PATCH] Span in PPL statsByClause could be specified after fields (#2720) Signed-off-by: Lantao Jin --- docs/user/ppl/cmd/stats.rst | 16 +++++- .../opensearch/sql/ppl/StatsCommandIT.java | 50 +++++++++++++++++++ .../org/opensearch/sql/util/MatcherUtils.java | 10 ++++ ppl/src/main/antlr/OpenSearchPPLParser.g4 | 1 + .../sql/ppl/parser/AstBuilderTest.java | 23 +++++++-- 5 files changed, 95 insertions(+), 5 deletions(-) diff --git a/docs/user/ppl/cmd/stats.rst b/docs/user/ppl/cmd/stats.rst index 096d3eacfc..19f5069bba 100644 --- a/docs/user/ppl/cmd/stats.rst +++ b/docs/user/ppl/cmd/stats.rst @@ -43,7 +43,7 @@ stats ... [by-clause] * Description: The by clause could be the fields and expressions like scalar functions and aggregation functions. Besides, the span clause can be used to split specific field into buckets in the same interval, the stats then does the aggregation by these span buckets. * Default: If no is specified, the stats command returns only one row, which is the aggregation over the entire result set. -* span-expression: optional. +* span-expression: optional, at most one. * Syntax: span(field_expr, interval_expr) * Description: The unit of the interval expression is the natural unit by default. If the field is a date and time type field, and the interval is in date/time units, you will need to specify the unit in the interval expression. For example, to split the field ``age`` into buckets by 10 years, it looks like ``span(age, 10)``. And here is another example of time span, the span to split a ``timestamp`` field into hourly intervals, it looks like ``span(timestamp, 1h)``. @@ -424,6 +424,20 @@ PPL query:: | 1 | 35 | M | +-------+------------+----------+ +Span will always be the first grouping key whatever order you specify. + +PPL query:: + + os> source=accounts | stats count() as cnt by gender, span(age, 5) as age_span + fetched rows / total rows = 3/3 + +-------+------------+----------+ + | cnt | age_span | gender | + |-------+------------+----------| + | 1 | 25 | F | + | 2 | 30 | M | + | 1 | 35 | M | + +-------+------------+----------+ + Example 10: Calculate the count and get email list by a gender and span ======================================================================= diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java index 40acd2f093..2d1cd709e1 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java @@ -11,7 +11,9 @@ import static org.opensearch.sql.util.MatcherUtils.rows; import static org.opensearch.sql.util.MatcherUtils.schema; import static org.opensearch.sql.util.MatcherUtils.verifyDataRows; +import static org.opensearch.sql.util.MatcherUtils.verifyDataRowsInOrder; import static org.opensearch.sql.util.MatcherUtils.verifySchema; +import static org.opensearch.sql.util.MatcherUtils.verifySchemaInOrder; import java.io.IOException; import org.json.JSONObject; @@ -190,6 +192,54 @@ public void testStatsAliasedSpan() throws IOException { verifyDataRows(response, rows(1, 20), rows(6, 30)); } + @Test + public void testStatsBySpanAndMultipleFields() throws IOException { + JSONObject response = + executeQuery( + String.format( + "source=%s | stats count() by span(age,10), gender, state", TEST_INDEX_BANK)); + verifySchemaInOrder( + response, + schema("count()", null, "integer"), + schema("span(age,10)", null, "integer"), + schema("gender", null, "string"), + schema("state", null, "string")); + verifyDataRowsInOrder( + response, + rows(1, 20, "f", "VA"), + rows(1, 30, "f", "IN"), + rows(1, 30, "f", "PA"), + rows(1, 30, "m", "IL"), + rows(1, 30, "m", "MD"), + rows(1, 30, "m", "TN"), + rows(1, 30, "m", "WA")); + } + + @Test + public void testStatsByMultipleFieldsAndSpan() throws IOException { + // Use verifySchemaInOrder() and verifyDataRowsInOrder() to check that the span column is always + // the first column in result whatever the order of span in query is first or last one + JSONObject response = + executeQuery( + String.format( + "source=%s | stats count() by gender, state, span(age,10)", TEST_INDEX_BANK)); + verifySchemaInOrder( + response, + schema("count()", null, "integer"), + schema("span(age,10)", null, "integer"), + schema("gender", null, "string"), + schema("state", null, "string")); + verifyDataRowsInOrder( + response, + rows(1, 20, "f", "VA"), + rows(1, 30, "f", "IN"), + rows(1, 30, "f", "PA"), + rows(1, 30, "m", "IL"), + rows(1, 30, "m", "MD"), + rows(1, 30, "m", "TN"), + rows(1, 30, "m", "WA")); + } + @Test public void testStatsPercentile() throws IOException { JSONObject response = diff --git a/integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java b/integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java index d444218c66..26a60cb4e5 100644 --- a/integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java +++ b/integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java @@ -144,6 +144,16 @@ public static void verifySchema(JSONObject response, Matcher... matc } } + @SafeVarargs + public static void verifySchemaInOrder(JSONObject response, Matcher... matchers) { + try { + verifyInOrder(response.getJSONArray("schema"), matchers); + } catch (Exception e) { + LOG.error(String.format("verify schema failed, response: %s", response.toString()), e); + throw e; + } + } + @SafeVarargs public static void verifyDataRows(JSONObject response, Matcher... matchers) { verify(response.getJSONArray("datarows"), matchers); diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 5a9c179d1a..39fb7f53a6 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -188,6 +188,7 @@ statsByClause : BY fieldList | BY bySpanClause | BY bySpanClause COMMA fieldList + | BY fieldList COMMA bySpanClause ; bySpanClause diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java index c9989a49c4..ced266ed78 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java @@ -319,11 +319,26 @@ public void testStatsCommandWithSpan() { exprList(alias("f1", field("f1")), alias("f2", field("f2"))), alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)), defaultStatsArgs())); - } - @Test(expected = org.opensearch.sql.common.antlr.SyntaxCheckException.class) - public void throwExceptionIfSpanInGroupByList() { - plan("source=t | stats avg(price) by f1, f2, span(timestamp, 1h)"); + assertEqual( + "source=t | stats avg(price) by b, span(timestamp, 1h)", + agg( + relation("t"), + exprList(alias("avg(price)", aggregate("avg", field("price")))), + emptyList(), + exprList(alias("b", field("b"))), + alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)), + defaultStatsArgs())); + + assertEqual( + "source=t | stats avg(price) by f1, f2, span(timestamp, 1h)", + agg( + relation("t"), + exprList(alias("avg(price)", aggregate("avg", field("price")))), + emptyList(), + exprList(alias("f1", field("f1")), alias("f2", field("f2"))), + alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)), + defaultStatsArgs())); } @Test(expected = org.opensearch.sql.common.antlr.SyntaxCheckException.class)