Skip to content

Commit

Permalink
[SPARK-50231][PYTHON] Make function instr accept Column substring
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Make function `instr` accept Column `substring`

### Why are the changes needed?
In Spark Connect, `instr` already accepts a Column `substring` at runtime, so the type hints and the classic PySpark implementation should accept one as well.

### Does this PR introduce _any_ user-facing change?
Yes, new feature: `instr` now accepts a Column as `substring`.

### How was this patch tested?
added doctests

### Was this patch authored or co-authored using generative AI tooling?
no

Closes #48761 from zhengruifeng/py_instr.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
  • Loading branch information
zhengruifeng committed Nov 5, 2024
1 parent b7e70fe commit 066a30e
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 11 deletions.
2 changes: 1 addition & 1 deletion python/pyspark/sql/connect/functions/builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2482,7 +2482,7 @@ def format_string(format: str, *cols: "ColumnOrName") -> Column:
format_string.__doc__ = pysparkfuncs.format_string.__doc__


def instr(str: "ColumnOrName", substr: str) -> Column:
def instr(str: "ColumnOrName", substr: Union[Column, str]) -> Column:
return _invoke_function("instr", _to_col(str), lit(substr))


Expand Down
39 changes: 30 additions & 9 deletions python/pyspark/sql/functions/builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -12821,7 +12821,7 @@ def format_string(format: str, *cols: "ColumnOrName") -> Column:


@_try_remote_functions
def instr(str: "ColumnOrName", substr: str) -> Column:
def instr(str: "ColumnOrName", substr: Union[Column, str]) -> Column:
"""
Locate the position of the first occurrence of substr column in the given string.
Returns null if either of the arguments are null.
Expand All @@ -12838,25 +12838,46 @@ def instr(str: "ColumnOrName", substr: str) -> Column:

Parameters
----------
str : :class:`~pyspark.sql.Column` or str
str : :class:`~pyspark.sql.Column` or column name
target column to work on.
substr : str
substr : :class:`~pyspark.sql.Column` or literal string
substring to look for.

.. versionchanged:: 4.0.0
`substr` now accepts column.

Returns
-------
:class:`~pyspark.sql.Column`
location of the first occurrence of the substring as integer.

Examples
--------
>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(instr(df.s, 'b').alias('s')).collect()
[Row(s=2)]
"""
from pyspark.sql.classic.column import _to_java_column
Example 1: Using a literal string as the 'substring'

return _invoke_function("instr", _to_java_column(str), _enum_to_value(substr))
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("abcd",), ("xyz",)], ["s",])
>>> df.select("*", sf.instr(df.s, "b")).show()
+----+-----------+
|   s|instr(s, b)|
+----+-----------+
|abcd|          2|
| xyz|          0|
+----+-----------+

Example 2: Using a Column 'substring'

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("abcd",), ("xyz",)], ["s",])
>>> df.select("*", sf.instr("s", sf.lit("abc").substr(0, 2))).show()
+----+---------------------------+
|   s|instr(s, substr(abc, 0, 2))|
+----+---------------------------+
|abcd|                          1|
| xyz|                          0|
+----+---------------------------+
"""
return _invoke_function_over_columns("instr", str, lit(substr))


@_try_remote_functions
Expand Down
15 changes: 14 additions & 1 deletion sql/api/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3995,7 +3995,20 @@ object functions {
* @group string_funcs
* @since 1.5.0
*/
def instr(str: Column, substring: String): Column = Column.fn("instr", str, lit(substring))
def instr(str: Column, substring: String): Column = instr(str, lit(substring))

/**
* Locate the position of the first occurrence of substr column in the given string. Returns
* null if either of the arguments are null.
*
* @note
* The position is not zero based, but 1 based index. Returns 0 if substr could not be found
* in str.
*
* @group string_funcs
* @since 4.0.0
*/
def instr(str: Column, substring: Column): Column = Column.fn("instr", str, substring)

/**
* Computes the character length of a given string or number of bytes of a binary string. The
Expand Down

0 comments on commit 066a30e

Please sign in to comment.