diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py
index 6f3ce942eb17c..0c1fd63de5c94 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -2482,7 +2482,7 @@ def format_string(format: str, *cols: "ColumnOrName") -> Column:
 format_string.__doc__ = pysparkfuncs.format_string.__doc__
 
 
-def instr(str: "ColumnOrName", substr: str) -> Column:
+def instr(str: "ColumnOrName", substr: Union[Column, str]) -> Column:
     return _invoke_function("instr", _to_col(str), lit(substr))
 
 
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 1e5349fb1649c..1ee5c357bd6ef 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -12821,7 +12821,7 @@ def format_string(format: str, *cols: "ColumnOrName") -> Column:
 
 
 @_try_remote_functions
-def instr(str: "ColumnOrName", substr: str) -> Column:
+def instr(str: "ColumnOrName", substr: Union[Column, str]) -> Column:
     """
     Locate the position of the first occurrence of substr column in the given string.
     Returns null if either of the arguments are null.
@@ -12838,11 +12838,14 @@ def instr(str: "ColumnOrName", substr: str) -> Column:
 
     Parameters
     ----------
-    str : :class:`~pyspark.sql.Column` or str
+    str : :class:`~pyspark.sql.Column` or column name
         target column to work on.
-    substr : str
+    substr : :class:`~pyspark.sql.Column` or literal string
         substring to look for.
 
+        .. versionchanged:: 4.0.0
+            `substr` now accepts column.
+
     Returns
     -------
     :class:`~pyspark.sql.Column`
@@ -12850,13 +12853,31 @@ def instr(str: "ColumnOrName", substr: str) -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([('abcd',)], ['s',])
-    >>> df.select(instr(df.s, 'b').alias('s')).collect()
-    [Row(s=2)]
-    """
-    from pyspark.sql.classic.column import _to_java_column
+    Example 1: Using a literal string as the 'substring'
 
-    return _invoke_function("instr", _to_java_column(str), _enum_to_value(substr))
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([("abcd",), ("xyz",)], ["s",])
+    >>> df.select("*", sf.instr(df.s, "b")).show()
+    +----+-----------+
+    |   s|instr(s, b)|
+    +----+-----------+
+    |abcd|          2|
+    | xyz|          0|
+    +----+-----------+
+
+    Example 2: Using a Column 'substring'
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([("abcd",), ("xyz",)], ["s",])
+    >>> df.select("*", sf.instr("s", sf.lit("abc").substr(0, 2))).show()
+    +----+---------------------------+
+    |   s|instr(s, substr(abc, 0, 2))|
+    +----+---------------------------+
+    |abcd|                          1|
+    | xyz|                          0|
+    +----+---------------------------+
+    """
+    return _invoke_function_over_columns("instr", str, lit(substr))
 
 
 @_try_remote_functions
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
index 8c49952bc31e3..35453603de5d3 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
@@ -3995,7 +3995,20 @@ object functions {
    * @group string_funcs
    * @since 1.5.0
    */
-  def instr(str: Column, substring: String): Column = Column.fn("instr", str, lit(substring))
+  def instr(str: Column, substring: String): Column = instr(str, lit(substring))
+
+  /**
+   * Locate the position of the first occurrence of substr column in the given string. Returns
+   * null if either of the arguments are null.
+   *
+   * @note
+   *   The position is not zero based, but 1 based index. Returns 0 if substr could not be found
+   *   in str.
+   *
+   * @group string_funcs
+   * @since 4.0.0
+   */
+  def instr(str: Column, substring: Column): Column = Column.fn("instr", str, substring)
 
   /**
    * Computes the character length of a given string or number of bytes of a binary string. The