Skip to content

Commit

Permalink
Update functions.scala
Browse files Browse the repository at this point in the history
Added functions.json_tuple
functions.cbrt
functions.from_json
functions.date_sub
  • Loading branch information
sfc-gh-sjayabalan committed Jul 31, 2024
1 parent e29ad3e commit 6b37cc5
Showing 1 changed file with 126 additions and 127 deletions.
253 changes: 126 additions & 127 deletions src/main/scala/com/snowflake/snowpark/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3139,7 +3139,132 @@ object functions {
* @group agg_func
*/
def listagg(col: Column): Column = {
  // Delegates to the three-argument overload, passing an empty string as the
  // second argument and isDistinct = false.
  listagg(col, "", isDistinct = false)
}
/**
* This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns
* in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column.
*
* NOTE:
* <ul>
* <li> Timestamp type: there is no interpretation of date values as UTC</li>
* <li> Identifiers with spaces: Snowflake returns an error when an invalid expression is sent. </li>
* </ul>
*
* Usage:
* <pre>
* df = session.createDataFrame(Seq(("CR", "{\"id\": 5, \"name\": \"Jose\", \"age\": 29}"))).toDF(Seq("nationality", "json_string"))
* </pre>
* When the result of this function is the only part of the select statement, no changes are needed:
* <pre>
* df.select(json_tuple(col("json_string"), "id", "name", "age")).show()
* </pre>
*
* <pre>
* ----------------------
* |"C0" |"C1" |"C2" |
* ----------------------
* |5 |Jose |29 |
* ----------------------
* </pre>
* However, when specifying multiple columns, an expression like this is required:
* <pre>
* df.select(
* col("nationality")
* , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax.
* ).show()
* </pre>
*
* <pre>
* ---------------------------------------
* |"NATIONALITY" |"C0" |"C1" |"C2" |
* ---------------------------------------
* |CR |5 |Jose |29 |
* ---------------------------------------
* </pre>
* @since 1.10.0
* @param json Column containing the JSON string text.
* @param fields Fields to pull from the JSON file.
* @return Column sequence with the specified strings.
*/
def json_tuple(json: Column, fields: String*): Seq[Column] = {
  // Pair every field with its position instead of mutating a counter captured by
  // the closure, so the alias index (c0, c1, ...) is tied to the field order
  // rather than to the order in which the mapping function happens to run.
  fields.zipWithIndex.map { case (field, index) =>
    // One JSON_EXTRACT_PATH_TEXT call per requested field, aliased by position.
    builtin("JSON_EXTRACT_PATH_TEXT")(json, field).as(s"c$index")
  }
}

/**
 * Computes the cubic root of a column by invoking the Snowflake built-in CBRT.
 *
 * @since 1.10.0
 * @param e Column to calculate the cubic root of.
 * @return Column object produced by the CBRT call.
 */
def cbrt(e: Column): Column = builtin("CBRT")(e)

/**
 * Computes the cubic root of the column with the given name.
 * The name is resolved with `col` and the work is delegated to the
 * Column overload of cbrt.
 *
 * @since 1.10.0
 * @param columnName Name of the column to calculate the cubic root of.
 * @return Column object.
 */
def cbrt(columnName: String): Column = {
  val target = col(columnName)
  cbrt(target)
}

/**
* This function converts a JSON string to a variant in Snowflake.
*
* In Snowflake the values are converted automatically; however, they are converted as variants, meaning that the printSchema function would return different datatypes than expected.
* To obtain the expected datatype, cast each value explicitly in the selectExpr function, for example "json['relative']['age']::integer".
* <pre>
* val data_for_json = Seq(
* (1, "{\"id\": 172319, \"age\": 41, \"relative\": {\"id\": 885471, \"age\": 29}}"),
* (2, "{\"id\": 532161, \"age\": 17, \"relative\":{\"id\": 873513, \"age\": 47}}")
* )
* val data_for_json_column = Seq("col1", "col2")
* val df_for_json = session.createDataFrame(data_for_json).toDF(data_for_json_column)
*
* val json_df = df_for_json.select(
* from_json(col("col2")).as("json")
* )
*
* json_df.selectExpr(
* "json['id']::integer as id"
* , "json['age']::integer as age"
* , "json['relative']['id']::integer as rel_id"
* , "json['relative']['age']::integer as rel_age"
* ).show(10, 10000)
* </pre>
*
* <pre>
* -----------------------------------------
* |"ID" |"AGE" |"REL_ID" |"REL_AGE" |
* -----------------------------------------
* |172319 |41 |885471 |29 |
* |532161 |17 |873513 |47 |
* -----------------------------------------
* </pre>
* @since 1.10.0
* @param e String column to convert to variant.
* @return Column object.
*/
def from_json(e: Column): Column = {
  // Invokes the Snowflake built-in TRY_PARSE_JSON with `e` as its only argument.
  val parsed: Column = builtin("TRY_PARSE_JSON")(e)
  parsed
}

/**
 * Subtracts the specified number of days from a date, a timestamp or a properly
 * formatted string column.
 *
 * The column is referenced by name inside a SQL expression that casts it to STRING and
 * then applies try_cast(... as DATE): a string that cannot be cast to a date yields null,
 * and a timestamp is cast to a date (removing its time).
 *
 * Because the column is referenced by name, `start` must be a named column.
 *
 * @since 1.10.0
 * @param start Date, Timestamp or String column to subtract days from. Must have a name.
 * @param days Days to subtract.
 * @return Column object.
 * @throws NoSuchElementException if `start` has no name.
 */
def date_sub(start: Column, days: Int): Column = {
  // Avoid a bare Option.get: keep the same exception type, but explain the failure.
  val columnName = start.getName.getOrElse(
    throw new NoSuchElementException(
      "date_sub requires a named column: the column is referenced by name in the generated SQL"))
  dateadd("DAY", lit(days * -1), sqlExpr(s"try_cast($columnName :: STRING as DATE)"))
}
/**
* Invokes a built-in snowflake function with the specified name and arguments.
* Arguments can be of two types
Expand Down Expand Up @@ -3867,130 +3992,4 @@ object functions {
"")(func)
}

}
/**
* This leverages JSON_EXTRACT_PATH_TEXT and improves functionality by allowing multiple columns
* in a single call, whereas JSON_EXTRACT_PATH_TEXT must be called once for every column.
*
* NOTE:
* <ul>
* <li> Timestamp type: there is no interpretation of date values as UTC</li>
* <li> Identifiers with spaces: Snowflake returns an error when an invalid expression is sent. </li>
* </ul>
*
* Usage:
* <pre>
* df = session.createDataFrame(Seq(("CR", "{\"id\": 5, \"name\": \"Jose\", \"age\": 29}"))).toDF(Seq("nationality", "json_string"))
* </pre>
* When the result of this function is the only part of the select statement, no changes are needed:
* <pre>
* df.select(json_tuple(col("json_string"), "id", "name", "age")).show()
* </pre>
*
* <pre>
* ----------------------
* |"C0" |"C1" |"C2" |
* ----------------------
* |5 |Jose |29 |
* ----------------------
* </pre>
* However, when specifying multiple columns, an expression like this is required:
* <pre>
* df.select(
* col("nationality")
* , json_tuple(col("json_string"), "id", "name", "age"):_* // Notice the :_* syntax.
* ).show()
* </pre>
*
* <pre>
* ---------------------------------------
* |"NATIONALITY" |"C0" |"C1" |"C2" |
* ---------------------------------------
* |CR |5 |Jose |29 |
* ---------------------------------------
* </pre>
* @since 1.10.0
* @param json Column containing the JSON string text.
* @param fields Fields to pull from the JSON file.
* @return Column sequence with the specified strings.
*/
def json_tuple(json: Column, fields: String*): Seq[Column] = {
  // Pair every field with its position instead of mutating a counter captured by
  // the closure, so the alias index (c0, c1, ...) is tied to the field order
  // rather than to the order in which the mapping function happens to run.
  fields.zipWithIndex.map { case (field, index) =>
    // One JSON_EXTRACT_PATH_TEXT call per requested field, aliased by position.
    builtin("JSON_EXTRACT_PATH_TEXT")(json, field).as(s"c$index")
  }
}

/**
 * Computes the cubic root of a column by invoking the Snowflake built-in CBRT.
 *
 * @since 1.10.0
 * @param e Column to calculate the cubic root of.
 * @return Column object produced by the CBRT call.
 */
def cbrt(e: Column): Column = builtin("CBRT")(e)

/**
 * Computes the cubic root of the column with the given name.
 * The name is resolved with `col` and the work is delegated to the
 * Column overload of cbrt.
 *
 * @since 1.10.0
 * @param columnName Name of the column to calculate the cubic root of.
 * @return Column object.
 */
def cbrt(columnName: String): Column = {
  val target = col(columnName)
  cbrt(target)
}

/**
* This function converts a JSON string to a variant in Snowflake.
*
* In Snowflake the values are converted automatically; however, they are converted as variants, meaning that the printSchema function would return different datatypes than expected.
* To obtain the expected datatype, cast each value explicitly in the selectExpr function, for example "json['relative']['age']::integer".
* <pre>
* val data_for_json = Seq(
* (1, "{\"id\": 172319, \"age\": 41, \"relative\": {\"id\": 885471, \"age\": 29}}"),
* (2, "{\"id\": 532161, \"age\": 17, \"relative\":{\"id\": 873513, \"age\": 47}}")
* )
* val data_for_json_column = Seq("col1", "col2")
* val df_for_json = session.createDataFrame(data_for_json).toDF(data_for_json_column)
*
* val json_df = df_for_json.select(
* from_json(col("col2")).as("json")
* )
*
* json_df.selectExpr(
* "json['id']::integer as id"
* , "json['age']::integer as age"
* , "json['relative']['id']::integer as rel_id"
* , "json['relative']['age']::integer as rel_age"
* ).show(10, 10000)
* </pre>
*
* <pre>
* -----------------------------------------
* |"ID" |"AGE" |"REL_ID" |"REL_AGE" |
* -----------------------------------------
* |172319 |41 |885471 |29 |
* |532161 |17 |873513 |47 |
* -----------------------------------------
* </pre>
* @since 1.10.0
* @param e String column to convert to variant.
* @return Column object.
*/
def from_json(e: Column): Column = {
  // Invokes the Snowflake built-in TRY_PARSE_JSON with `e` as its only argument.
  val parsed: Column = builtin("TRY_PARSE_JSON")(e)
  parsed
}

/**
 * Subtracts the specified number of days from a date, a timestamp or a properly
 * formatted string column.
 *
 * The column is referenced by name inside a SQL expression that casts it to STRING and
 * then applies try_cast(... as DATE): a string that cannot be cast to a date yields null,
 * and a timestamp is cast to a date (removing its time).
 *
 * Because the column is referenced by name, `start` must be a named column.
 *
 * @since 1.10.0
 * @param start Date, Timestamp or String column to subtract days from. Must have a name.
 * @param days Days to subtract.
 * @return Column object.
 * @throws NoSuchElementException if `start` has no name.
 */
def date_sub(start: Column, days: Int): Column = {
  // Avoid a bare Option.get: keep the same exception type, but explain the failure.
  val columnName = start.getName.getOrElse(
    throw new NoSuchElementException(
      "date_sub requires a named column: the column is referenced by name in the generated SQL"))
  dateadd("DAY", lit(days * -1), sqlExpr(s"try_cast($columnName :: STRING as DATE)"))
}
}

0 comments on commit 6b37cc5

Please sign in to comment.