Merge branch 'main' into validate-tumble-function-argument
dai-chen committed Nov 7, 2024
2 parents 5f14bc8 + 4303057 commit ca1f9c5
Showing 130 changed files with 8,270 additions and 799 deletions.
4 changes: 3 additions & 1 deletion build.sbt
@@ -154,6 +154,7 @@ lazy val pplSparkIntegration = (project in file("ppl-spark-integration"))
"com.stephenn" %% "scalatest-json-jsonassert" % "0.2.5" % "test",
"com.github.sbt" % "junit-interface" % "0.13.3" % "test",
"org.projectlombok" % "lombok" % "1.18.30",
"com.github.seancfoley" % "ipaddress" % "5.5.1",
),
libraryDependencies ++= deps(sparkVersion),
// ANTLR settings
@@ -237,7 +238,8 @@ lazy val integtest = (project in file("integ-test"))
inConfig(IntegrationTest)(Defaults.testSettings ++ Seq(
IntegrationTest / javaSource := baseDirectory.value / "src/integration/java",
IntegrationTest / scalaSource := baseDirectory.value / "src/integration/scala",
IntegrationTest / parallelExecution := false,
IntegrationTest / resourceDirectory := baseDirectory.value / "src/integration/resources",
IntegrationTest / parallelExecution := false,
IntegrationTest / fork := true,
)),
inConfig(AwsIntegrationTest)(Defaults.testSettings ++ Seq(
1 change: 1 addition & 0 deletions docs/index.md
@@ -549,6 +549,7 @@ In the index mapping, the `_meta` and `properties` field stores meta and schema i
- `spark.flint.monitor.initialDelaySeconds`: Initial delay in seconds before starting the monitoring task. Default value is 15.
- `spark.flint.monitor.intervalSeconds`: Interval in seconds for scheduling the monitoring task. Default value is 60.
- `spark.flint.monitor.maxErrorCount`: Maximum number of consecutive errors allowed before stopping the monitoring task. Default value is 5.
- `spark.flint.metadataCacheWrite.enabled`: Default is false. Enables writing metadata to the index mapping's `_meta` field as a read cache for frontend users to access. Do not use in production; this setting will be removed in a later version. See the configuration sketch below.
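
A minimal sketch of enabling this option through standard Spark configuration (assuming a typical `SparkSession` setup; how Flint settings are supplied can vary by deployment):

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: the option name comes from the list above; the app name and
// builder usage are illustrative assumptions, not Flint requirements.
val spark = SparkSession.builder()
  .appName("flint-metadata-cache-example")
  .config("spark.flint.metadataCacheWrite.enabled", "true") // not for production use
  .getOrCreate()
```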

#### Data Type Mapping

62 changes: 48 additions & 14 deletions docs/ppl-lang/PPL-Example-Commands.md
@@ -58,6 +58,16 @@ _- **Limitation: new field added by eval command with a function cannot be dropp
- `source = table | where a not in (1, 2, 3) | fields a,b,c`
- `source = table | where a between 1 and 4` - Note: This returns a >= 1 and a <= 4, i.e. [1, 4]
- `source = table | where b not between '2024-09-10' and '2025-09-10'` - Note: This returns b < '2024-09-10' or b > '2025-09-10'
- `source = table | where cidrmatch(ip, '192.169.1.0/24')`
- `source = table | where cidrmatch(ipv6, '2003:db8::/32')`
- `source = table | trendline sma(2, temperature) as temp_trend`

#### **IP related queries**
[See additional command details](functions/ppl-ip.md)

- `source = table | where cidrmatch(ip, '192.169.1.0/24')`
- `source = table | where isV6 = false and isValid = true and cidrmatch(ipAddress, '192.168.1.0/24')`
- `source = table | where isV6 = true | eval inRange = case(cidrmatch(ipAddress, '2003:db8::/32'), 'in' else 'out') | fields ip, inRange`

```sql
source = table | eval status_category =
```

@@ -120,6 +130,15 @@ Assumptions: `a`, `b`, `c`, `d`, `e` are existing fields in `table`
- `source = table | fillnull using a = 101, b = 102`
- `source = table | fillnull using a = concat(b, c), d = 2 * pi() * e`

### Flatten
[See additional command details](ppl-flatten-command.md)
Assumptions: `bridges`, `coor` are existing fields in `table`, and their types are `struct<?,?>` or `array<struct<?,?>>`. A rough Spark-side sketch follows the examples below.
- `source = table | flatten bridges`
- `source = table | flatten coor`
- `source = table | flatten bridges | flatten coor`
- `source = table | fields bridges | flatten bridges`
- `source = table | fields country, bridges | flatten bridges | fields country, length | stats avg(length) as avg by country`
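
As a rough illustration (not the plugin's actual query plan), `flatten` on an `array<struct<...>>` field behaves like Spark's `explode` followed by promoting the struct's fields to top-level columns. The schema and sample data below are hypothetical:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, col, explode}

object FlattenSketch {
  // Hypothetical schema: `bridges` is an array<struct<name, length>>.
  case class Bridge(name: String, length: Double)
  case class City(country: String, bridges: Seq[Bridge])

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("flatten-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(
      City("France", Seq(Bridge("Pont Neuf", 232), Bridge("Pont Alexandre III", 160))),
      City("England", Seq(Bridge("Tower Bridge", 244)))
    ).toDF()

    // Roughly: source = table | flatten bridges | fields country, length
    //          | stats avg(length) as avg by country
    df.withColumn("bridge", explode(col("bridges")))
      .select(col("country"), col("bridge.length").as("length"))
      .groupBy("country").agg(avg("length").as("avg"))
      .show()

    spark.stop()
  }
}
```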

```sql
source = table | eval status_category =
case(a >= 200 AND a < 300, 'Success',
```

@@ -287,7 +306,11 @@ source = table | where ispresent(a) |
- `source = table1 | left semi join left = l right = r on l.a = r.a table2`
- `source = table1 | left anti join left = l right = r on l.a = r.a table2`
- `source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ]`

- `source = table1 | inner join on table1.a = table2.a table2 | fields table1.a, table2.a, table1.b, table1.c` (refer to the table name directly)
- `source = table1 | inner join on a = c table2 | fields a, b, c, d` (side aliases may be omitted as long as field names are unambiguous)
- `source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields l.a, r.a` (side alias overrides table alias)
- `source = table1 as t1 | join left = l right = r on l.a = r.a table2 as t2 | fields t1.a, t2.a` (error, side alias overrides table alias)
- `source = table1 | join left = l right = r on l.a = r.a [ source = table2 ] as s | fields l.a, s.a` (error, side alias overrides subquery alias)

#### **Lookup**
[See additional command details](ppl-lookup-command.md)
@@ -418,8 +441,30 @@ Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table in

_- **Limitation: another command that uses a (relation) subquery is the `appendcols` command, which is unsupported**_

---
#### Experimental Commands:

#### **fillnull**
[See additional command details](ppl-fillnull-command.md)
- `source=accounts | fillnull fields status_code=101`
- `source=accounts | fillnull fields request_path='/not_found', timestamp='*'`
- `source=accounts | fillnull using field1=101`
- `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5`
- `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5, field6 = 'N/A'`

#### **expand**
[See additional command details](ppl-expand-command.md)
- `source = table | expand field_with_array as array_list`
- `source = table | expand employee | stats max(salary) as max by state, company`
- `source = table | expand employee as worker | stats max(salary) as max by state, company`
- `source = table | expand employee as worker | eval bonus = salary * 3 | fields worker, bonus`
- `source = table | expand employee | parse description '(?<email>.+@.+)' | fields employee, email`
- `source = table | eval array=json_array(1, 2, 3) | expand array as uid | fields name, occupation, uid`
- `source = table | expand multi_valueA as multiA | expand multi_valueB as multiB`

#### Correlation Commands:
[See additional command details](ppl-correlation-command.md)

@@ -431,14 +476,3 @@
> ppl-correlation-command is an experimental command - it may be removed in future versions
---
### Planned Commands:

#### **fillnull**
[See additional command details](ppl-fillnull-command.md)
- `source=accounts | fillnull fields status_code=101`
- `source=accounts | fillnull fields request_path='/not_found', timestamp='*'`
- `source=accounts | fillnull using field1=101`
- `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5`
- `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5, field6 = 'N/A'`
16 changes: 14 additions & 2 deletions docs/ppl-lang/README.md
@@ -31,6 +31,8 @@ For additional examples see the next [documentation](PPL-Example-Commands.md).
- [`describe command`](PPL-Example-Commands.md/#describe)

- [`fillnull command`](ppl-fillnull-command.md)

- [`flatten command`](ppl-flatten-command.md)

- [`eval command`](ppl-eval-command.md)

@@ -67,7 +69,10 @@ For additional examples see the next [documentation](PPL-Example-Commands.md).
- [`subquery commands`](ppl-subquery-command.md)

- [`correlation commands`](ppl-correlation-command.md)


- [`trendline commands`](ppl-trendline-command.md)

- [`expand commands`](ppl-expand-command.md)

* **Functions**

Expand All @@ -87,6 +92,9 @@ For additional examples see the next [documentation](PPL-Example-Commands.md).

- [`Cryptographic Functions`](functions/ppl-cryptographic.md)

- [`IP Address Functions`](functions/ppl-ip.md)

- [`Lambda Functions`](functions/ppl-lambda.md)

---
### PPL On Spark
@@ -98,11 +106,15 @@ For additional examples see the next [documentation](PPL-Example-Commands.md).
### Example PPL Queries
See samples of [PPL queries](PPL-Example-Commands.md)

---
### TPC-H PPL Query Rewriting
See samples of [TPC-H PPL query rewriting](ppl-tpch.md)

---
### Planned PPL Commands

- [`FillNull`](planning/ppl-fillnull-command.md)

---
### PPL Project Roadmap
[PPL Github Project Roadmap](https://github.com/orgs/opensearch-project/projects/214)
135 changes: 133 additions & 2 deletions docs/ppl-lang/functions/ppl-datetime.md
@@ -14,7 +14,7 @@ Argument type: DATE, LONG

(DATE, LONG) -> DATE

Antonyms: `SUBDATE`_
Antonyms: `SUBDATE`

Example:

@@ -795,7 +795,7 @@ Argument type: DATE/TIMESTAMP, LONG

(DATE, LONG) -> DATE

Antonyms: `ADDDATE`_
Antonyms: `ADDDATE`

Example:

@@ -982,3 +982,134 @@ Example:
+----------------------------+


### `DATE_ADD`

**Description:**

Usage: date_add(date, INTERVAL expr unit) adds the interval expr to date.

Argument type: DATE, INTERVAL

Return type: DATE

Antonyms: `DATE_SUB`

Example::

os> source=people | eval `'2020-08-26' + 1d` = DATE_ADD(DATE('2020-08-26'), INTERVAL 1 DAY) | fields `'2020-08-26' + 1d`
fetched rows / total rows = 1/1
+---------------------+
| '2020-08-26' + 1d |
|---------------------|
| 2020-08-27 |
+---------------------+


### `DATE_SUB`

**Description:**

Usage: date_sub(date, INTERVAL expr unit) subtracts the interval expr from date.

Argument type: DATE, INTERVAL

Return type: DATE

Antonyms: `DATE_ADD`

Example::

os> source=people | eval `'2008-01-02' - 31d` = DATE_SUB(DATE('2008-01-02'), INTERVAL 31 DAY) | fields `'2008-01-02' - 31d`
fetched rows / total rows = 1/1
+---------------------+
| '2008-01-02' - 31d |
|---------------------|
| 2007-12-02 |
+---------------------+


### `TIMESTAMPADD`

**Description:**

Usage: Returns a TIMESTAMP value based on a passed-in DATE/TIMESTAMP/STRING argument, with an INTERVAL unit and an INTEGER amount determining the time to be added.
If the third argument is a STRING, it must be formatted as a valid TIMESTAMP.
If the third argument is a DATE, it will be automatically converted to a TIMESTAMP.

Argument type: INTERVAL, INTEGER, DATE/TIMESTAMP/STRING

INTERVAL must be one of the following tokens: [SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, YEAR]

Examples::

os> source=people | eval `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')`
fetched rows / total rows = 1/1
+----------------------------------------------+--------------------------------------------------+
| TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') |
|----------------------------------------------+--------------------------------------------------|
| 2000-01-18 00:00:00 | 1999-10-01 00:00:00 |
+----------------------------------------------+--------------------------------------------------+


### `TIMESTAMPDIFF`

**Description:**

Usage: TIMESTAMPDIFF(interval, start, end) returns the difference between the start and end date/times in interval units.
Arguments will be automatically converted to a TIMESTAMP when appropriate.
Any argument that is a STRING must be formatted as a valid TIMESTAMP.

Argument type: INTERVAL, DATE/TIMESTAMP/STRING, DATE/TIMESTAMP/STRING

INTERVAL must be one of the following tokens: [SECOND, MINUTE, HOUR, DAY, WEEK, MONTH, QUARTER, YEAR]

Examples::

os> source=people | eval `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | eval `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))` = TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00')) | fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))`
fetched rows / total rows = 1/1
+-------------------------------------------------------------------+-------------------------------------------------------------------------------------------+
| TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00')) |
|-------------------------------------------------------------------+-------------------------------------------------------------------------------------------|
| 4 | -23 |
+-------------------------------------------------------------------+-------------------------------------------------------------------------------------------+


### `UTC_TIMESTAMP`

**Description:**

Returns the current UTC timestamp as a value in 'YYYY-MM-DD hh:mm:ss'.

Return type: TIMESTAMP

Specification: UTC_TIMESTAMP() -> TIMESTAMP

Example::

> source=people | eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP() | fields `UTC_TIMESTAMP()`
fetched rows / total rows = 1/1
+---------------------+
| UTC_TIMESTAMP() |
|---------------------|
| 2022-10-03 17:54:28 |
+---------------------+


### `CURRENT_TIMEZONE`

**Description:**

Returns the current local timezone.

Return type: STRING

Example::

> source=people | eval `CURRENT_TIMEZONE()` = CURRENT_TIMEZONE() | fields `CURRENT_TIMEZONE()`
fetched rows / total rows = 1/1
+------------------------+
| CURRENT_TIMEZONE() |
|------------------------|
| America/Chicago |
+------------------------+

35 changes: 35 additions & 0 deletions docs/ppl-lang/functions/ppl-ip.md
@@ -0,0 +1,35 @@
## PPL IP Address Functions

### `CIDRMATCH`

**Description**

`CIDRMATCH(ip, cidr)` checks whether `ip` is within the specified `cidr` range.

**Argument type:**
- STRING, STRING
- Return type: **BOOLEAN**

Example:

os> source=ips | where cidrmatch(ip, '192.169.1.0/24') | fields ip
fetched rows / total rows = 1/1
+--------------+
| ip |
|--------------|
| 192.169.1.5 |
+--------------+

os> source=ipsv6 | where cidrmatch(ip, '2003:db8::/32') | fields ip
fetched rows / total rows = 1/1
+-----------------------------------------+
| ip |
|-----------------------------------------|
| 2003:0db8:0000:0000:0000:0000:0000:0000 |
+-----------------------------------------+

Note:
- `ip` can be an IPv4 or an IPv6 address
- `cidr` can be an IPv4 or an IPv6 block
- `ip` and `cidr` must be either both IPv4 or both IPv6
- `ip` and `cidr` must both be valid and non-empty/non-null
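
For reference, a minimal sketch of the same containment check using the `com.github.seancfoley` `ipaddress` library that this commit adds to `build.sbt` (the helper below is illustrative, not the plugin's actual implementation):

```scala
import inet.ipaddr.IPAddressString

object CidrMatchSketch {
  // True when `ip` falls inside the `cidr` block; mirrors the notes above:
  // both sides must parse, and they must share the same IP version.
  def cidrMatch(ip: String, cidr: String): Boolean = {
    val addr = new IPAddressString(ip).getAddress
    val net  = new IPAddressString(cidr).getAddress
    if (addr == null || net == null) false
    else addr.getIPVersion == net.getIPVersion && net.toPrefixBlock.contains(addr)
  }

  def main(args: Array[String]): Unit = {
    println(cidrMatch("192.169.1.5", "192.169.1.0/24")) // true
    println(cidrMatch("2003:db8::", "2003:db8::/32"))   // true
    println(cidrMatch("10.0.0.1", "2003:db8::/32"))     // false: version mismatch
  }
}
```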