Skip to content

Commit

Permalink
Support PPL JSON functions: construction and extraction (#780)
Browse files Browse the repository at this point in the history
* first commit

Signed-off-by: Lantao Jin <[email protected]>

* add docs and fix IT

Signed-off-by: Lantao Jin <[email protected]>

* add examples for json_extract()

Signed-off-by: Lantao Jin <[email protected]>

* fix missing import and doc link

Signed-off-by: Lantao Jin <[email protected]>

* minor

Signed-off-by: Lantao Jin <[email protected]>

* add UT and optimize the doc

Signed-off-by: Lantao Jin <[email protected]>

* typo

Signed-off-by: Lantao Jin <[email protected]>

* fix the issue when merge conflicts

Signed-off-by: Lantao Jin <[email protected]>

---------

Signed-off-by: Lantao Jin <[email protected]>
  • Loading branch information
LantaoJin authored Oct 25, 2024
1 parent ed1b24c commit ee75048
Show file tree
Hide file tree
Showing 10 changed files with 1,059 additions and 17 deletions.
2 changes: 2 additions & 0 deletions docs/ppl-lang/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ For additional examples see the next [documentation](PPL-Example-Commands.md).

- [`String Functions`](functions/ppl-string.md)

- [`JSON Functions`](functions/ppl-json.md)

- [`Condition Functions`](functions/ppl-condition.md)

- [`Type Conversion Functions`](functions/ppl-conversion.md)
Expand Down
237 changes: 237 additions & 0 deletions docs/ppl-lang/functions/ppl-json.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
## PPL JSON Functions

### `JSON`

**Description**

`json(value)` Evaluates whether a value can be parsed as JSON. Returns the json string if valid, null otherwise.

**Argument type:** STRING/JSON_ARRAY/JSON_OBJECT

**Return type:** STRING

A STRING expression of a valid JSON object format.

Example:

os> source=people | eval `valid_json()` = json('[1,2,3,{"f1":1,"f2":[5,6]},4]') | fields valid_json
fetched rows / total rows = 1/1
+---------------------------------+
| valid_json |
+---------------------------------+
| [1,2,3,{"f1":1,"f2":[5,6]},4] |
+---------------------------------+

os> source=people | eval `invalid_json()` = json('{"invalid": "json"') | fields invalid_json
fetched rows / total rows = 1/1
+----------------+
| invalid_json |
+----------------+
| null |
+----------------+


### `JSON_OBJECT`

**Description**

`json_object(<key>, <value>[, <key>, <value>]...)` returns a JSON object from members of key-value pairs.

**Argument type:**
- A \<key\> must be STRING.
- A \<value\> can be any data types.

**Return type:** JSON_OBJECT (Spark StructType)

A StructType expression of a valid JSON object.

Example:

os> source=people | eval result = json(json_object('key', 123.45)) | fields result
fetched rows / total rows = 1/1
+------------------+
| result |
+------------------+
| {"key":123.45} |
+------------------+

os> source=people | eval result = json(json_object('outer', json_object('inner', 123.45))) | fields result
fetched rows / total rows = 1/1
+------------------------------+
| result |
+------------------------------+
| {"outer":{"inner":123.45}} |
+------------------------------+


### `JSON_ARRAY`

**Description**

`json_array(<value>...)` Creates a JSON ARRAY using a list of values.

**Argument type:**
- A \<value\> can be any kind of value such as string, number, or boolean.

**Return type:** ARRAY (Spark ArrayType)

An array of any supported data type for a valid JSON array.

Example:

os> source=people | eval `json_array` = json_array(1, 2, 0, -1, 1.1, -0.11)
fetched rows / total rows = 1/1
+----------------------------+
| json_array |
+----------------------------+
| 1.0,2.0,0.0,-1.0,1.1,-0.11 |
+----------------------------+

os> source=people | eval `json_array_object` = json(json_object("array", json_array(1, 2, 0, -1, 1.1, -0.11)))
fetched rows / total rows = 1/1
+----------------------------------------+
| json_array_object |
+----------------------------------------+
| {"array":[1.0,2.0,0.0,-1.0,1.1,-0.11]} |
+----------------------------------------+

### `JSON_ARRAY_LENGTH`

**Description**

`json_array_length(jsonArray)` Returns the number of elements in the outermost JSON array.

**Argument type:** STRING/JSON_ARRAY

A STRING expression of a valid JSON array format, or JSON_ARRAY object.

**Return type:** INTEGER

`NULL` is returned in case of any other valid JSON string, `NULL` or an invalid JSON.

Example:

os> source=people | eval `lenght1` = json_array_length('[1,2,3,4]'), `lenght2` = json_array_length('[1,2,3,{"f1":1,"f2":[5,6]},4]'), `not_array` = json_array_length('{"key": 1}')
fetched rows / total rows = 1/1
+-----------+-----------+-------------+
| lenght1 | lenght2 | not_array |
+-----------+-----------+-------------+
| 4 | 5 | null |
+-----------+-----------+-------------+

os> source=people | eval `json_array` = json_array_length(json_array(1,2,3,4)), `empty_array` = json_array_length(json_array())
fetched rows / total rows = 1/1
+--------------+---------------+
| json_array | empty_array |
+--------------+---------------+
| 4 | 0 |
+--------------+---------------+

### `JSON_EXTRACT`

**Description**

`json_extract(jsonStr, path)` Extracts json object from a json string based on json path specified. Return null if the input json string is invalid.

**Argument type:** STRING, STRING

**Return type:** STRING

A STRING expression of a valid JSON object format.

`NULL` is returned in case of an invalid JSON.

Example:

os> source=people | eval `json_extract('{"a":"b"}', '$.a')` = json_extract('{"a":"b"}', '$a')
fetched rows / total rows = 1/1
+----------------------------------+
| json_extract('{"a":"b"}', 'a') |
+----------------------------------+
| b |
+----------------------------------+

os> source=people | eval `json_extract('{"a":[{"b":1},{"b":2}]}', '$.a[1].b')` = json_extract('{"a":[{"b":1},{"b":2}]}', '$.a[1].b')
fetched rows / total rows = 1/1
+-----------------------------------------------------------+
| json_extract('{"a":[{"b":1.0},{"b":2.0}]}', '$.a[1].b') |
+-----------------------------------------------------------+
| 2.0 |
+-----------------------------------------------------------+

os> source=people | eval `json_extract('{"a":[{"b":1},{"b":2}]}', '$.a[*].b')` = json_extract('{"a":[{"b":1},{"b":2}]}', '$.a[*].b')
fetched rows / total rows = 1/1
+-----------------------------------------------------------+
| json_extract('{"a":[{"b":1.0},{"b":2.0}]}', '$.a[*].b') |
+-----------------------------------------------------------+
| [1.0,2.0] |
+-----------------------------------------------------------+

os> source=people | eval `invalid_json` = json_extract('{"invalid": "json"')
fetched rows / total rows = 1/1
+----------------+
| invalid_json |
+----------------+
| null |
+----------------+


### `JSON_KEYS`

**Description**

`json_keys(jsonStr)` Returns all the keys of the outermost JSON object as an array.

**Argument type:** STRING

A STRING expression of a valid JSON object format.

**Return type:** ARRAY[STRING]

`NULL` is returned in case of any other valid JSON string, or an empty string, or an invalid JSON.

Example:

os> source=people | eval `keys` = json_keys('{"f1":"abc","f2":{"f3":"a","f4":"b"}}')
fetched rows / total rows = 1/1
+------------+
| keus |
+------------+
| [f1, f2] |
+------------+

os> source=people | eval `keys` = json_keys('[1,2,3,{"f1":1,"f2":[5,6]},4]')
fetched rows / total rows = 1/1
+--------+
| keys |
+--------+
| null |
+--------+

### `JSON_VALID`

**Description**

`json_valid(jsonStr)` Evaluates whether a JSON string uses valid JSON syntax and returns TRUE or FALSE.

**Argument type:** STRING

**Return type:** BOOLEAN

Example:

os> source=people | eval `valid_json` = json_valid('[1,2,3,4]'), `invalid_json` = json_valid('{"invalid": "json"') | feilds `valid_json`, `invalid_json`
fetched rows / total rows = 1/1
+--------------+----------------+
| valid_json | invalid_json |
+--------------+----------------+
| True | False |
+--------------+----------------+

os> source=accounts | where json_valid('[1,2,3,4]') and isnull(email) | fields account_number, email
fetched rows / total rows = 1/1
+------------------+---------+
| account_number | email |
|------------------+---------|
| 13 | null |
+------------------+---------+
Original file line number Diff line number Diff line change
Expand Up @@ -642,4 +642,31 @@ trait FlintSparkSuite extends QueryTest with FlintSuite with OpenSearchSuite wit
| (6, 403, '/home', null)
| """.stripMargin)
}

protected def createNullableJsonContentTable(testTable: String): Unit = {
sql(s"""
| CREATE TABLE $testTable
| (
| id INT,
| jString STRING,
| isValid BOOLEAN
| )
| USING $tableType $tableOptions
|""".stripMargin)

sql(s"""
| INSERT INTO $testTable
| VALUES (1, '{"account_number":1,"balance":39225,"age":32,"gender":"M"}', true),
| (2, '{"f1":"abc","f2":{"f3":"a","f4":"b"}}', true),
| (3, '[1,2,3,{"f1":1,"f2":[5,6]},4]', true),
| (4, '[]', true),
| (5, '{"teacher":"Alice","student":[{"name":"Bob","rank":1},{"name":"Charlie","rank":2}]}', true),
| (6, '[1,2,3]', true),
| (7, '[1,2', false),
| (8, '[invalid json]', false),
| (9, '{"invalid": "json"', false),
| (10, 'invalid json', false),
| (11, null, false)
| """.stripMargin)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,7 @@ import org.opensearch.flint.spark.{FlintPPLSparkExtensions, FlintSparkExtensions

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, QueryTest, Row}
import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode
import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation
import org.apache.spark.sql.flint.config.FlintSparkConf.OPTIMIZER_RULE_ENABLED
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

trait FlintPPLSuite extends FlintSparkSuite {
override protected def sparkConf: SparkConf = {
Expand All @@ -29,11 +25,11 @@ trait FlintPPLSuite extends FlintSparkSuite {
def assertSameRows(expected: Seq[Row], df: DataFrame): Unit = {
QueryTest.sameRows(expected, df.collect().toSeq).foreach { results =>
fail(s"""
|Results do not match for query:
|${df.queryExecution}
|== Results ==
|$results
""".stripMargin)
|Results do not match for query:
|${df.queryExecution}
|== Results ==
|$results
""".stripMargin)
}
}
}
Loading

0 comments on commit ee75048

Please sign in to comment.