update documentation with tablesample(50 percent) option

Signed-off-by: YANGDB <[email protected]>
opensearch-project · Oct 21, 2024 · 076ae34 · 076ae34
1 parent 9cf7fee
commit 076ae34
Show file tree

Hide file tree

Showing 15 changed files with 813 additions and 32 deletions.
diff --git a/docs/ppl-lang/PPL-Example-Commands.md b/docs/ppl-lang/PPL-Example-Commands.md
@@ -136,6 +136,7 @@ source = table |  where ispresent(a) |
 [See additional command details](ppl-stats-command.md)
 
 - `source = table | stats avg(a) `
+- `source = table tablesample(50 percent) | stats avg(a) `
 - `source = table | where a < 50 | stats avg(c) `
 - `source = table | stats max(c) by b`
 - `source = table | stats count(c) by b | head 5`
@@ -148,6 +149,7 @@ source = table |  where ispresent(a) |
 **Aggregations With Span**
 - `source = table  | stats count(a) by span(a, 10) as a_span`
 - `source = table  | stats sum(age) by span(age, 5) as age_span | head 2`
+- `source = table  tablesample(50 percent) | stats sum(age) by span(age, 5) as age_span | head 2`
 - `source = table  | stats avg(age) by span(age, 20) as age_span, country  | sort - age_span |  head 2`
 
 **Aggregations With TimeWindow Span (tumble windowing function)**
@@ -181,13 +183,15 @@ source = table |  where ispresent(a) |
 
 - `source=accounts | rare gender`
 - `source=accounts | rare age by gender`
+- `source=accounts  tablesample(50 percent) | rare age by gender`
 
 #### **Top**
 [See additional command details](ppl-top-command.md)
 
 - `source=accounts | top gender`
 - `source=accounts | top 1 gender`
 - `source=accounts | top 1 age by gender`
+- `source=accounts  tablesample(50 percent) | top 1 age by gender`
 
 #### **Parse**
 [See additional command details](ppl-parse-command.md)
@@ -234,6 +238,9 @@ source = table |  where ispresent(a) |
 [See additional command details](ppl-join-command.md)
 
 - `source = table1 | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
+- `source = table1  tablesample(50 percent) | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
+- `source = table1 | inner join left = l right = r on l.a = r.a table2  tablesample(50 percent) | fields l.a, r.a, b, c`
+- `source = table1 tablesample(50 percent)  | inner join left = l right = r on l.a = r.a table2  tablesample(50 percent) | fields l.a, r.a, b, c`
 - `source = table1 | left join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
 - `source = table1 | right join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
 - `source = table1 | full left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
@@ -262,11 +269,14 @@ _- **Limitation: "REPLACE" or "APPEND" clause must contain "AS"**_
 [See additional command details](ppl-subquery-command.md)
 
 - `source = outer | where a in [ source = inner | fields b ]`
+- `source = outer tablesample(50 percent) | where a in [ source = inner | fields b ]`
 - `source = outer | where (a) in [ source = inner | fields b ]`
+- `source = outer | where (a) in [ source = inner tablesample(50 percent) | fields b ]`
 - `source = outer | where (a,b,c) in [ source = inner | fields d,e,f ]`
 - `source = outer | where a not in [ source = inner | fields b ]`
 - `source = outer | where (a) not in [ source = inner | fields b ]`
 - `source = outer | where (a,b,c) not in [ source = inner | fields d,e,f ]`
+- `source = outer tablesample(50 percent) | where (a,b,c) not in [ source = inner tablesample(50 percent) | fields d,e,f ]`
 - `source = outer a in [ source = inner | fields b ]` (search filtering with subquery)
 - `source = outer a not in [ source = inner | fields b ]` (search filtering with subquery)
 - `source = outer | where a in [ source = inner1 | where b not in [ source = inner2 | fields c ] | fields b ]` (nested)
@@ -368,10 +378,22 @@ Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table in
 `InSubquery`, `ExistsSubquery` and `ScalarSubquery` are all subquery expressions. But `RelationSubquery` is not a subquery expression, it is a subquery plan which is commonly used in Join or Search clause.
 
 - `source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ]` (subquery in join right side)
+- `source = table1 tablesample(50 percent) | join left = l right = r [ source = table2 | where d > 10 | head 5 ]` (subquery in join right side)
 - `source = [ source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] | stats count(a) by b ] as outer | head 1`
 
 _- **Limitation: another command usage of (relation) subquery is in `appendcols` commands which is unsupported**_
 
+#### **fillnull**
+[See additional command details](ppl-fillnull-command.md)
+
+```sql
+   -  `source=accounts | fillnull fields status_code=101`
+   -  `source=accounts | fillnull fields request_path='/not_found', timestamp='*'`
+    - `source=accounts | fillnull using field1=101`
+    - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5`
+    - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5, field6 = 'N/A'`
+```
+
 ---
 #### Experimental Commands:
 [See additional command details](ppl-correlation-command.md)
@@ -385,15 +407,4 @@ _- **Limitation: another command usage of (relation) subquery is in `appendcols`
 > ppl-correlation-command is an experimental command - it may be removed in future versions
 
 ---
-### Planned Commands:
-
-#### **fillnull**
-
-```sql
-   -  `source=accounts | fillnull fields status_code=101`
-   -  `source=accounts | fillnull fields request_path='/not_found', timestamp='*'`
-    - `source=accounts | fillnull using field1=101`
-    - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5`
-    - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5, field6 = 'N/A'`
-```
 [See additional command details](ppl-fillnull-command.md)
diff --git a/...ppl-lang/planning/ppl-fillnull-command.md → docs/ppl-lang/ppl-fillnull-command.md b/...ppl-lang/planning/ppl-fillnull-command.md → docs/ppl-lang/ppl-fillnull-command.md
diff --git a/docs/ppl-lang/ppl-rare-command.md b/docs/ppl-lang/ppl-rare-command.md
@@ -44,3 +44,19 @@ PPL query:
     | M        | 33    |
     | M        | 36    |
     +----------+-------+
+
+### Example 3: Find the rare address using only 50% of the actual data (sampling)
+
+PPL query:
+
+    os> source = accounts TABLESAMPLE(50 percent) | rare address
+
+The resulting logical plan for the rare query:
+
+```sql
+'Sort ['COUNT('address) AS count_address#91 ASC NULLS FIRST], true
++- 'Aggregate ['address], ['COUNT('address) AS count_address#90, 'address]
+   +- 'Sample 0.0, 0.5, false, 0
+      +- 'UnresolvedRelation [accounts], [], false
+
+```
diff --git a/docs/ppl-lang/ppl-search-command.md b/docs/ppl-lang/ppl-search-command.md
@@ -40,3 +40,40 @@ PPL query:
     | 13               | Nanette     | 789 Madison Street | 32838     | F        | Nogal  | Quility    | VA      | 28    | null                 | Bates      |
     +------------------+-------------+--------------------+-----------+----------+--------+------------+---------+-------+----------------------+------------+
 
+### Example 3: Fetch data with a sampling percentage (including an aggregation)
+The following example demonstrates how to sample 75% of the data from the table and then perform an aggregation (finding the top 3 countries by occupation).
+
+PPL query:
+
+    os> source = account  TABLESAMPLE(75 percent) | top 3 country by occupation
+
+This query samples 75% of the records from the `account` table, then retrieves the top 3 countries grouped by occupation.
+
+```sql
+SELECT *
+FROM (
+         SELECT country, occupation, COUNT(country) AS count_country
+         FROM account
+                  TABLESAMPLE(75 PERCENT)
+         GROUP BY country, occupation
+         ORDER BY COUNT(country) DESC NULLS LAST
+             LIMIT 3
+     ) AS subquery
+    LIMIT 3;
+```
+Logical Plan Equivalent:
+
+```sql
+'Project [*]
++- 'GlobalLimit 3
+   +- 'LocalLimit 3
+      +- 'Sort ['COUNT('country) AS count_country#68 DESC NULLS LAST], true
+         +- 'Aggregate ['country, 'occupation AS occupation#67], ['COUNT('country) AS count_country#66, 'country, 'occupation AS occupation#67]
+            +- 'Sample 0.0, 0.75, false, 0
+               +- 'UnresolvedRelation [account], [], false
+
+```
+
+By introducing the `TABLESAMPLE` instruction into the source command, one can now sample data as part of a query, reducing the amount of data being scanned and thereby trading precision for performance.
+
+The `percent` parameter controls the fraction of data that is sampled, and therefore how closely the result approximates the true value; choose it according to the required trade-off between accuracy and performance.
diff --git a/docs/ppl-lang/ppl-top-command.md b/docs/ppl-lang/ppl-top-command.md
@@ -56,3 +56,22 @@ PPL query:
     | M        | 32    |
     +----------+-------+
 
+
+### Example 3: Find the top country by occupation using only 75% of the actual data (sampling)
+
+PPL query:
+
+    os> source = account  TABLESAMPLE(75 percent) | top 3 country by occupation
+
+The resulting logical plan for the top query:
+
+```sql
+'Project [*]
++- 'GlobalLimit 3
+   +- 'LocalLimit 3
+      +- 'Sort ['COUNT('country) AS count_country#68 DESC NULLS LAST], true
+         +- 'Aggregate ['country, 'occupation AS occupation#67], ['COUNT('country) AS count_country#66, 'country, 'occupation AS occupation#67]
+            +- 'Sample 0.0, 0.75, false, 0
+               +- 'UnresolvedRelation [account], [], false
+
+```
diff --git a/...src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLTopAndRareITSuite.scala b/...src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLTopAndRareITSuite.scala
@@ -114,7 +114,7 @@ class FlintSparkPPLTopAndRareITSuite
         Seq(addressField),
         aggregateExpressions,
         Sample(
-          0.5,
+          0,
           0.5,
           withReplacement = false,
           0,
@@ -274,18 +274,18 @@ class FlintSparkPPLTopAndRareITSuite
     val expectedPlan = Project(Seq(UnresolvedStar(None)), planWithLimit)
     comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
   }
-  
-  test("create ppl top 3 countries query test with tablesample 50%") {
+
+  test("create ppl top 2 countries query test with tablesample 50%") {
     val frame = sql(s"""
-         | source = $newTestTable  TABLESAMPLE(50 percent) | top 3 country
+         | source = $newTestTable TABLESAMPLE(50 percent) | top 2 country
          | """.stripMargin)
 
     // Retrieve the results
     val results: Array[Row] = frame.collect()
-    assert(results.length == 3)
+    assert(results.length == 1)
 
-    val expectedRows = Set(Row(6, "Canada"), Row(3, "USA"), Row(1, "England"))
-    val actualRows = results.take(3).toSet
+    val expectedRows = Set(Row(4, "Canada"))
+    val actualRows = results.take(1).toSet
 
     // Compare the sets
     assert(
@@ -303,7 +303,12 @@ class FlintSparkPPLTopAndRareITSuite
       Aggregate(
         Seq(countryField),
         aggregateExpressions,
-        UnresolvedRelation(Seq("spark_catalog", "default", "new_flint_ppl_test")))
+        Sample(
+          0,
+          0.5,
+          withReplacement = false,
+          0,
+          UnresolvedRelation(Seq("spark_catalog", "default", "new_flint_ppl_test"))))
 
     val sortedPlan: LogicalPlan =
       Sort(
@@ -317,12 +322,12 @@ class FlintSparkPPLTopAndRareITSuite
         aggregatePlan)
 
     val planWithLimit =
-      GlobalLimit(Literal(3), LocalLimit(Literal(3), sortedPlan))
+      GlobalLimit(Literal(2), LocalLimit(Literal(2), sortedPlan))
     val expectedPlan = Project(Seq(UnresolvedStar(None)), planWithLimit)
     comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
   }
 
-  test("create ppl top 2 countries by occupation field query test") {
+  test("create ppl top 3 countries by occupation field query test") {
     val frame = sql(s"""
          | source = $newTestTable| top 3 country by occupation
          | """.stripMargin)
@@ -373,17 +378,18 @@ class FlintSparkPPLTopAndRareITSuite
     comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
 
   }
-  test("create ppl top 2 countries by occupation field query test  with tablesample 50%") {
+
+  test("create ppl top 3 countries by occupation field query test with tablesample 75%") {
     val frame = sql(s"""
-         | source = $newTestTable  TABLESAMPLE(50 percent) | top 3 country by occupation
+         | source = $newTestTable  TABLESAMPLE(75 percent) | top 3 country by occupation
          | """.stripMargin)
 
     // Retrieve the results
     val results: Array[Row] = frame.collect()
     assert(results.length == 3)
 
     val expectedRows =
-      Set(Row(3, "Canada", "Doctor"), Row(2, "Canada", "Scientist"), Row(2, "USA", "Engineer"))
+      Set(Row(2, "Canada", "Doctor"), Row(2, "Canada", "Scientist"), Row(1, "USA", "Engineer"))
     val actualRows = results.take(3).toSet
 
     // Compare the sets
@@ -405,7 +411,12 @@ class FlintSparkPPLTopAndRareITSuite
       Aggregate(
         Seq(countryField, occupationFieldAlias),
         aggregateExpressions,
-        UnresolvedRelation(Seq("spark_catalog", "default", "new_flint_ppl_test")))
+        Sample(
+          0,
+          0.75,
+          withReplacement = false,
+          0,
+          UnresolvedRelation(Seq("spark_catalog", "default", "new_flint_ppl_test"))))
 
     val sortedPlan: LogicalPlan =
       Sort(

diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/RelationUtils.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/RelationUtils.java
@@ -40,7 +40,7 @@ static Optional<QualifiedName> resolveField(List<UnresolvedRelation> relations,
     }
 
     static Optional<TablesampleContext> tablesampleBuilder(OpenSearchPPLParser.TablesampleClauseContext context) {
-        if(context.percentage != null)
+        if(context != null && context.percentage != null)
             return Optional.of(new TablesampleContext(Integer.parseInt(context.percentage.getText())));
         return Optional.empty();
     }