opensearch-project · YANG-DB · Oct 18, 2024 · Oct 21, 2024 · Oct 21, 2024 · Oct 21, 2024
diff --git a/docs/ppl-lang/PPL-Example-Commands.md b/docs/ppl-lang/PPL-Example-Commands.md
@@ -136,6 +136,7 @@ source = table |  where ispresent(a) |
 [See additional command details](ppl-stats-command.md)
 
 - `source = table | stats avg(a) `
+- `source = table tablesample(50 percent) | stats avg(a) `
 - `source = table | where a < 50 | stats avg(c) `
 - `source = table | stats max(c) by b`
 - `source = table | stats count(c) by b | head 5`
@@ -148,6 +149,7 @@ source = table |  where ispresent(a) |
 **Aggregations With Span**
 - `source = table  | stats count(a) by span(a, 10) as a_span`
 - `source = table  | stats sum(age) by span(age, 5) as age_span | head 2`
+- `source = table  tablesample(50 percent) | stats sum(age) by span(age, 5) as age_span | head 2`
 - `source = table  | stats avg(age) by span(age, 20) as age_span, country  | sort - age_span |  head 2`
 
 **Aggregations With TimeWindow Span (tumble windowing function)**
@@ -181,13 +183,15 @@ source = table |  where ispresent(a) |
 
 - `source=accounts | rare gender`
 - `source=accounts | rare age by gender`
+- `source=accounts  tablesample(50 percent) | rare age by gender`
 
 #### **Top**
 [See additional command details](ppl-top-command.md)
 
 - `source=accounts | top gender`
 - `source=accounts | top 1 gender`
 - `source=accounts | top 1 age by gender`
+- `source=accounts  tablesample(50 percent) | top 1 age by gender`
 
 #### **Parse**
 [See additional command details](ppl-parse-command.md)
@@ -234,6 +238,9 @@ source = table |  where ispresent(a) |
 [See additional command details](ppl-join-command.md)
 
 - `source = table1 | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
+- `source = table1  tablesample(50 percent) | inner join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
+- `source = table1 | inner join left = l right = r on l.a = r.a table2  tablesample(50 percent) | fields l.a, r.a, b, c`
+- `source = table1 tablesample(50 percent)  | inner join left = l right = r on l.a = r.a table2  tablesample(50 percent) | fields l.a, r.a, b, c`
 - `source = table1 | left join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
 - `source = table1 | right join left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
 - `source = table1 | full left = l right = r on l.a = r.a table2 | fields l.a, r.a, b, c`
@@ -262,11 +269,14 @@ _- **Limitation: "REPLACE" or "APPEND" clause must contain "AS"**_
 [See additional command details](ppl-subquery-command.md)
 
 - `source = outer | where a in [ source = inner | fields b ]`
+- `source = outer tablesample(50 percent) | where a in [ source = inner | fields b ]`
 - `source = outer | where (a) in [ source = inner | fields b ]`
+- `source = outer | where (a) in [ source = inner tablesample(50 percent) | fields b ]`
 - `source = outer | where (a,b,c) in [ source = inner | fields d,e,f ]`
 - `source = outer | where a not in [ source = inner | fields b ]`
 - `source = outer | where (a) not in [ source = inner | fields b ]`
 - `source = outer | where (a,b,c) not in [ source = inner | fields d,e,f ]`
+- `source = outer tablesample(50 percent) | where (a,b,c) not in [ source = inner tablesample(50 percent) | fields d,e,f ]`
 - `source = outer a in [ source = inner | fields b ]` (search filtering with subquery)
 - `source = outer a not in [ source = inner | fields b ]` (search filtering with subquery)
 - `source = outer | where a in [ source = inner1 | where b not in [ source = inner2 | fields c ] | fields b ]` (nested)
@@ -368,10 +378,22 @@ Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table in
 `InSubquery`, `ExistsSubquery` and `ScalarSubquery` are all subquery expressions. But `RelationSubquery` is not a subquery expression, it is a subquery plan which is common used in Join or Search clause.
 
 - `source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ]` (subquery in join right side)
+- `source = table1 tablesample(50 percent) | join left = l right = r [ source = table2 | where d > 10 | head 5 ]` (subquery in join right side)
 - `source = [ source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] | stats count(a) by b ] as outer | head 1`
 
 _- **Limitation: another command usage of (relation) subquery is in `appendcols` commands which is unsupported**_
 
+#### **fillnull**
+[See additional command details](ppl-fillnull-command.md)
+
+```sql
+   -  `source=accounts | fillnull fields status_code=101`
+   -  `source=accounts | fillnull fields request_path='/not_found', timestamp='*'`
+    - `source=accounts | fillnull using field1=101`
+    - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5`
+    - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5, field6 = 'N/A'`
+```
+
 ---
 #### Experimental Commands:
 [See additional command details](ppl-correlation-command.md)
@@ -385,15 +407,4 @@ _- **Limitation: another command usage of (relation) subquery is in `appendcols`
 > ppl-correlation-command is an experimental command - it may be removed in future versions
 
 ---
-### Planned Commands:
-
-#### **fillnull**
-
-```sql
-   -  `source=accounts | fillnull fields status_code=101`
-   -  `source=accounts | fillnull fields request_path='/not_found', timestamp='*'`
-    - `source=accounts | fillnull using field1=101`
-    - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5`
-    - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5, field6 = 'N/A'`
-```
 [See additional command details](planning/ppl-fillnull-command.md)
@@ -21,7 +21,7 @@ For additional examples see the next [documentation](PPL-Example-Commands.md).
 
 
 * **Commands**
-
+  
     - [`explain command `](PPL-Example-Commands.md/#explain)
 
     - [`dedup command `](ppl-dedup-command.md)
@@ -44,6 +44,8 @@ For additional examples see the next [documentation](PPL-Example-Commands.md).
 
     - [`search command`](ppl-search-command.md)
 
+    - [`head` VS `tablesample`](ppl-limit-vs-sample-command.md)
+
     - [`sort command`](ppl-sort-command.md)
 
     - [`stats command`](ppl-stats-command.md)

diff --git a/...ppl-lang/planning/ppl-fillnull-command.md → docs/ppl-lang/ppl-fillnull-command.md b/...ppl-lang/planning/ppl-fillnull-command.md → docs/ppl-lang/ppl-fillnull-command.md
@@ -0,0 +1,50 @@
+### `head` Vs `TABLESAMPLE`
+The primary difference between `head` and `TABLESAMPLE` in PPL lies in how they operate on the data:
+
+1. `head`:
+
+   **Purpose**: head is used to return a specified number of rows from the result set, after all other operations (like filtering, aggregations, etc.) have been applied.
+
+   **Behavior**: It retrieves the first N rows from the query result in a deterministic manner, based on the ordering of the data (or lack thereof). If there is no explicit `sort` clause, the result may be somewhat arbitrary but repeatable.
+
+   **Execution**: head is applied after any `where`, `stats`, or `sort` clauses. It restricts the number of rows after processing the entire dataset.
+
+   **Example**:
+    ```sql
+       source = t | head 10
+    ```
+   This will return exactly 10 rows from the account table.
+
+2. `TABLESAMPLE`:
+
+   **Purpose**: `TABLESAMPLE` is used to retrieve a random subset of rows from a table. It selects a percentage of the rows from the table in a non-deterministic, probabilistic manner.
+
+   **Behavior**: The rows returned by `TABLESAMPLE` are randomly selected based on a given percentage or fraction of the table. It is not guaranteed that the same number of rows will be returned each time the query is run.
+
+   **Execution**: `TABLESAMPLE` is applied directly to the underlying table before any other operations like filtering or aggregation. _**It reduces the dataset size early in the query execution._**
+
+   **Example**:
+    ```sql
+        source = t TABLESAMPLE(50 PERCENT)
+    ```
+   This will randomly select approximately 50% of the rows from the account table.
+
+3. Key Differences:
+   Practical Implications:
+
+| Feature                   | `head`                                                      | `TABLESAMPLE`                                      |
+|---------------------------|-------------------------------------------------------------|----------------------------------------------------|
+| **Behavior**              | Returns a fixed number of rows                              | Returns a random subset of rows                    |
+| **Determinism**           | Deterministic (returns same rows if no ordering is changed) | Non-deterministic (random rows each time)          |
+| **Scope**                 | Applied after all operations                                | Applied directly to the table                      |
+| **Use case**              | Retrieve a specific number of rows                          | Work with a random sample of data                  |
+| **Impact on performance** | Still processes the full dataset and then heads             | Can reduce the amount of data processed earlier    |
+| **Percentage/Fraction**   | Not supported, only absolute numbers                        | Supports percentage-based sampling                 |
+
+- Use `head` when you need an exact number of rows, especially in a final result set .
+- Use `TABLESAMPLE` when you need a rough approximation of the data or want to randomly sample data without processing the entire table, which can help improve query performance when working with large datasets.
+
+
+
+
+
@@ -44,3 +44,19 @@ PPL query:
     | M        | 33    |
     | M        | 36    |
     +----------+-------+
+
+### Example 3: Find the rare address using only 50% of the actual data (sampling)
+
+PPL query:
+
+    os> source = accounts TABLESAMPLE(50 percent) | rare address
+
+The logical plan outcome of the rare queries:
+
+```sql
+'Sort ['COUNT('address) AS count_address#91 ASC NULLS FIRST], true
++- 'Aggregate ['address], ['COUNT('address) AS count_address#90, 'address]
+   +- 'Sample 0.0, 0.5, false, 0
+      +- 'UnresolvedRelation [accounts], [], false
+
+```
@@ -40,3 +40,40 @@ PPL query:
     | 13               | Nanette     | 789 Madison Street | 32838     | F        | Nogal  | Quility    | VA      | 28    | null                 | Bates      |
     +------------------+-------------+--------------------+-----------+----------+--------+------------+---------+-------+----------------------+------------+
 
+### Example 3: Fetch data with a sampling percentage ( including an aggregation)
+The following example demonstrates how to sample 50% of the data from the table and then perform aggregation (finding rare occurrences of address).
+
+PPL query:
+
+    os> source = account  TABLESAMPLE(75 percent) | top 3 country by occupation
+
+This query samples 75% of the records from account table, then retrieves the top 3 countries grouped by occupation
+
+```sql
+SELECT *
+FROM (
+         SELECT country, occupation, COUNT(country) AS count_country
+         FROM account
+                  TABLESAMPLE(75 PERCENT)
+         GROUP BY country, occupation
+         ORDER BY COUNT(country) DESC NULLS LAST
+             LIMIT 3
+     ) AS subquery
+    LIMIT 3;
+```
+Logical Plan Equivalent:
+
+```sql
+'Project [*]
++- 'GlobalLimit 3
+   +- 'LocalLimit 3
+      +- 'Sort ['COUNT('country) AS count_country#68 DESC NULLS LAST], true
+         +- 'Aggregate ['country, 'occupation AS occupation#67], ['COUNT('country) AS count_country#66, 'country, 'occupation AS occupation#67]
+            +- 'Sample 0.0, 0.75, false, 0
+               +- 'UnresolvedRelation [account], [], false
+
+```
+
+By introducing the `TABLESAMPLE` instruction into the source command, one can now sample data as part of your queries and reducing the amount of data being scanned thereby converting precision with performance.
+
+The `percent` parameter will give the actual approximation of the true value with the needed trade of between accuracy and performance.
@@ -56,3 +56,22 @@ PPL query:
     | M        | 32    |
     +----------+-------+
 
+
+### Example 3: Find the top country by occupation using only 75% of the actual data (sampling)
+
+PPL query:
+
+    os> source = account  TABLESAMPLE(75 percent) | top 3 country by occupation
+
+The logical plan outcome of the top queries:
+
+```sql
+'Project [*]
++- 'GlobalLimit 3
+   +- 'LocalLimit 3
+      +- 'Sort ['COUNT('country) AS count_country#68 DESC NULLS LAST], true
+         +- 'Aggregate ['country, 'occupation AS occupation#67], ['COUNT('country) AS count_country#66, 'country, 'occupation AS occupation#67]
+            +- 'Sample 0.0, 0.75, false, 0
+               +- 'UnresolvedRelation [account], [], false
+
+```
@@ -227,6 +227,46 @@ class FlintSparkPPLAggregationWithSpanITSuite
     assert(compareByString(expectedPlan) === compareByString(logicalPlan))
   }
 
+  test(
+    "create ppl average age by span of interval of 10 years group by country head (limit) 2 query test with tablesample(100 percent)") {
+    val frame = sql(s"""
+         | source = $testTable  tablesample(100 percent)| stats avg(age) by span(age, 10) as age_span, country | head 3
+         | """.stripMargin)
+    // Retrieve the results
+    val results: Array[Row] = frame.collect()
+    // Define the expected results
+    val expectedResults: Array[Row] =
+      Array(Row(70.0d, "USA", 70L), Row(30.0d, "USA", 30L), Row(22.5d, "Canada", 20L))
+
+    // Compare the results
+    implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](1))
+    assert(results.sorted.sameElements(expectedResults.sorted))
+    // Retrieve the logical plan
+    val logicalPlan: LogicalPlan = frame.queryExecution.logical
+    // Define the expected logical plan
+    val star = Seq(UnresolvedStar(None))
+    val ageField = UnresolvedAttribute("age")
+    val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test"))
+    val countryField = UnresolvedAttribute("country")
+    val countryAlias = Alias(countryField, "country")()
+
+    val aggregateExpressions =
+      Alias(UnresolvedFunction(Seq("AVG"), Seq(ageField), isDistinct = false), "avg(age)")()
+    val span = Alias(
+      Multiply(Floor(Divide(UnresolvedAttribute("age"), Literal(10))), Literal(10)),
+      "age_span")()
+    val aggregatePlan =
+      Aggregate(
+        Seq(countryAlias, span),
+        Seq(aggregateExpressions, countryAlias, span),
+        Sample(0, 1, withReplacement = false, 0, table))
+    val limitPlan = Limit(Literal(3), aggregatePlan)
+    val expectedPlan = Project(star, limitPlan)
+
+    // Compare the two plans
+    assert(compareByString(expectedPlan) === compareByString(logicalPlan))
+  }
+
   test(
     "create ppl average age by span of interval of 10 years group by country head (limit) 2 query and sort by test ") {
     val frame = sql(s"""