diff --git a/docs/ppl-lang/ppl-rare-command.md b/docs/ppl-lang/ppl-rare-command.md index f39990cc7..e3ad21f4e 100644 --- a/docs/ppl-lang/ppl-rare-command.md +++ b/docs/ppl-lang/ppl-rare-command.md @@ -12,7 +12,7 @@ Using ``rare`` command to find the least common tuple of values of all fields in * N: number of results to return. **Default**: 10 * field-list: mandatory. comma-delimited list of field names. * by-clause: optional. one or more fields to group the results by. -* top_approx: approximate the count by using estimated [cardinality by HyperLogLog++ algorithm](https://spark.apache.org/docs/3.5.2/sql-ref-functions-builtin.html). +* rare_approx: approximates the count of the rare (n) fields by using the estimated [cardinality by HyperLogLog++ algorithm](https://spark.apache.org/docs/3.5.2/sql-ref-functions-builtin.html). ### Example 1: Find the least common values in a field @@ -22,6 +22,7 @@ The example finds least common gender of all the accounts. PPL query: os> source=accounts | rare gender; + os> source=accounts | rare_approx 10 gender; os> source=accounts | rare_approx gender; fetched rows / total rows = 2/2 +----------+ diff --git a/docs/ppl-lang/ppl-top-command.md b/docs/ppl-lang/ppl-top-command.md index a2bd93564..93d3a7148 100644 --- a/docs/ppl-lang/ppl-top-command.md +++ b/docs/ppl-lang/ppl-top-command.md @@ -11,7 +11,7 @@ Using ``top`` command to find the most common tuple of values of all fields in t * N: number of results to return. **Default**: 10 * field-list: mandatory. comma-delimited list of field names. * by-clause: optional. one or more fields to group the results by. -* top_approx: approximate the count by using estimated [cardinality by HyperLogLog++ algorithm](https://spark.apache.org/docs/3.5.2/sql-ref-functions-builtin.html). +* top_approx: approximates the count of the top (n) fields by using the estimated [cardinality by HyperLogLog++ algorithm](https://spark.apache.org/docs/3.5.2/sql-ref-functions-builtin.html). 
### Example 1: Find the most common values in a field @@ -20,7 +20,7 @@ The example finds most common gender of all the accounts. PPL query: os> source=accounts | top gender; - os> source=accounts_approx | top gender; + os> source=accounts | top_approx gender; fetched rows / total rows = 2/2 +----------+ | gender | @@ -35,7 +35,7 @@ The example finds most common gender of all the accounts. PPL query: - os> source=accounts_approx | top 1 gender; + os> source=accounts | top_approx 1 gender; fetched rows / total rows = 1/1 +----------+ | gender | diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 index 40ebab28a..10b2e01b8 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 @@ -25,7 +25,7 @@ EVAL: 'EVAL'; HEAD: 'HEAD'; TOP_APPROX: 'TOP_APPROX'; TOP: 'TOP'; -RARE_APPROX: 'RARE_APPROX'; +RARE_APPROX: 'RARE_APPROX'; RARE: 'RARE'; PARSE: 'PARSE'; METHOD: 'METHOD'; diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 index 9f5e08ce3..63efd8c6c 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 @@ -1124,6 +1124,7 @@ keywordsCanBeId // AGGREGATIONS | statsFunctionName | DISTINCT_COUNT + | DISTINCT_COUNT_APPROX | PERCENTILE | PERCENTILE_APPROX | ESTDC diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystPlanContext.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystPlanContext.java index 5a717dd48..1621e65d5 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystPlanContext.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystPlanContext.java @@ -188,13 +188,6 @@ public LogicalPlan reduce(BiFunction tran return result; 
}).orElse(getPlan())); } - - /** - * update context using the given action and node - */ - public CatalystPlanContext update(UnaryOperator action) { - return action.apply(this); - } /** * apply for each plan with the given function diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java index 9f46bcb82..00a7905f0 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java @@ -370,21 +370,19 @@ public LogicalPlan visitAlias(Alias node, CatalystPlanContext context) { @Override public LogicalPlan visitProject(Project node, CatalystPlanContext context) { - context.update((ctx) -> { - if (node.isExcluded()) { - List intersect = ctx.getProjectedFields().stream() - .filter(node.getProjectList()::contains) - .collect(Collectors.toList()); - if (!intersect.isEmpty()) { - // Fields in parent projection, but they have be excluded in child. For example, - // source=t | fields - A, B | fields A, B, C will throw "[Field A, Field B] can't be resolved" - throw new SyntaxCheckException(intersect + " can't be resolved"); - } - } else { - ctx.withProjectedFields(node.getProjectList()); + //update plan's context prior to visiting node children + if (node.isExcluded()) { + List intersect = context.getProjectedFields().stream() + .filter(node.getProjectList()::contains) + .collect(Collectors.toList()); + if (!intersect.isEmpty()) { + // Fields in parent projection, but they have been excluded in child. 
For example, + // source=t | fields - A, B | fields A, B, C will throw "[Field A, Field B] can't be resolved" + throw new SyntaxCheckException(intersect + " can't be resolved"); } - return ctx; - }); + } else { + context.withProjectedFields(node.getProjectList()); + } LogicalPlan child = visitFirstChild(node, context); visitExpressionList(node.getProjectList(), context); diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTransformer.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTransformer.java index 0b0fb8314..0a4f19b53 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTransformer.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/BuiltinFunctionTransformer.java @@ -26,8 +26,10 @@ import java.util.Map; import java.util.function.Function; +import static org.opensearch.flint.spark.ppl.OpenSearchPPLLexer.DISTINCT_COUNT_APPROX; import static org.opensearch.sql.expression.function.BuiltinFunctionName.ADD; import static org.opensearch.sql.expression.function.BuiltinFunctionName.ADDDATE; +import static org.opensearch.sql.expression.function.BuiltinFunctionName.APPROX_COUNT_DISTINCT; import static org.opensearch.sql.expression.function.BuiltinFunctionName.ARRAY_LENGTH; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DATEDIFF; import static org.opensearch.sql.expression.function.BuiltinFunctionName.DATE_ADD; @@ -109,6 +111,7 @@ public interface BuiltinFunctionTransformer { .put(TO_JSON_STRING, "to_json") .put(JSON_KEYS, "json_object_keys") .put(JSON_EXTRACT, "get_json_object") + .put(APPROX_COUNT_DISTINCT, "approx_count_distinct") .build(); /**