Skip to content

Commit

Permalink
expr/str: add support for json_extract and split
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilgarg28 committed Nov 9, 2024
1 parent 149d39a commit 2beb5ba
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 52 deletions.
1 change: 1 addition & 0 deletions .wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ schemaless
schemas
sdk
secret
sep
signup
signups
sinked
Expand Down
2 changes: 2 additions & 0 deletions docs/api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,11 @@ sidebar:
- "api-reference/expressions/str/concat"
- "api-reference/expressions/str/contains"
- "api-reference/expressions/str/endswith"
- "api-reference/expressions/str/json_extract"
- "api-reference/expressions/str/len"
- "api-reference/expressions/str/lower"
- "api-reference/expressions/str/parse"
- "api-reference/expressions/str/split"
- "api-reference/expressions/str/startswith"
- "api-reference/expressions/str/strptime"
- "api-reference/expressions/str/upper"
Expand Down
46 changes: 44 additions & 2 deletions docs/examples/api-reference/expressions/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,6 @@ def test_strptime():
assert expr.typeof(schema={"x": str}) == datetime
assert expr.typeof(schema={"x": Optional[str]}) == Optional[datetime]

# TODO: replace NaT with pd.NA
# TODO: replace pd.Timestamp with datetime
df = pd.DataFrame({"x": ["2021-01-01", "2021-02-01", None]})
schema = {"x": Optional[str]}
assert expr.eval(df, schema).tolist() == [
Expand Down Expand Up @@ -353,3 +351,47 @@ def test_strptime():
with pytest.raises(Exception):
expr.eval(df, schema)
# /docsnip


def test_json_extract():
    """Doc example for the `json_extract` string expression."""
    # docsnip json_extract
    from fennel.expr import col

    # docsnip-highlight next-line
    expr = col("s").str.json_extract("$.x.y")

    # return type is always Optional[str], whether or not the input is nullable
    assert expr.typeof(schema={"s": str}) == Optional[str]
    assert expr.typeof(schema={"s": Optional[str]}) == Optional[str]

    # can be evaluated with a dataframe
    df = pd.DataFrame(
        {"s": ['{"x": {"y": "hello"}}', '{"x": {"y": 1}}', "{}", None]}
    )
    schema = {"s": Optional[str]}
    # NOTE: the integer value 1 is returned as the string "1", not as an int.
    # Rows where the path "$.x.y" is not found (the third row, "{}") and rows
    # where the input itself is null (the fourth row) both evaluate to pd.NA.
    assert expr.eval(df, schema).tolist() == ["hello", "1", pd.NA, pd.NA]
    # /docsnip


def test_split():
    """Doc example for the `split` string expression."""
    # docsnip split
    from fennel.expr import col

    # docsnip-highlight next-line
    expr = col("s").str.split(",")

    # result is List[str], wrapped in Optional only when the input is nullable
    assert expr.typeof(schema={"s": str}) == List[str]
    assert expr.typeof(schema={"s": Optional[str]}) == Optional[List[str]]

    # can be evaluated with a dataframe
    df = pd.DataFrame({"s": ["a,b,c", "d,e", "f", None]})
    schema = {"s": Optional[str]}
    # a string that does not contain the separator yields a single-element
    # list; a null input yields pd.NA
    assert expr.eval(df, schema).tolist() == [
        ["a", "b", "c"],
        ["d", "e"],
        ["f"],
        pd.NA,
    ]
    # /docsnip
36 changes: 36 additions & 0 deletions docs/pages/api-reference/expressions/str/json_extract.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
---
title: Json Extract
order: 0
status: published
---

### Json Extract

Function to extract a value from a JSON-encoded string using a JSON path.

#### Parameters
<Expandable title="path" type="str">
The JSON path to use when extracting the value from the JSON-encoded string.
See [this page](https://goessner.net/articles/JsonPath/) for more details on
JSON path syntax. The extracted value is always returned as a string, or as
None if the path is invalid or not found.
</Expandable>

<pre snippet="api-reference/expressions/str#json_extract"
status="success" message="Extracting a value from a json encoded string">
</pre>


#### Returns
<Expandable type="Expr">
Returns an expression object denoting the result of the `json_extract` expression.
The resulting expression is of type `Optional[str]`; it evaluates to None
when the base string is None or the path is not found in the JSON-encoded string.
</Expandable>


#### Errors
<Expandable title="Use of invalid types">
The `str` namespace must be invoked on an expression that evaluates to string
or optional of string.
</Expandable>
32 changes: 32 additions & 0 deletions docs/pages/api-reference/expressions/str/split.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
title: Split
order: 0
status: published
---

### Split

Function to split a string into a list of strings using a separator.

#### Parameters
<Expandable title="sep" type="str">
The separator string to use when splitting the string.
</Expandable>

<pre snippet="api-reference/expressions/str#split"
status="success" message="Splitting a string by comma">
</pre>

#### Returns
<Expandable type="Expr">
Returns an expression object denoting the result of the `split` function.
The resulting expression is of type `List[str]` or `Optional[List[str]]`, depending on
whether the input is nullable.
</Expandable>


#### Errors
<Expandable title="Use of invalid types">
The `str` namespace must be invoked on an expression that evaluates to string
or optional of string.
</Expandable>
5 changes: 4 additions & 1 deletion fennel/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## [1.5.49] - 2024-11-09
- Add json_extract and split string expressions

## [1.5.48] - 2024-11-06
- Use v2 by default in query_offline

Expand All @@ -22,7 +25,7 @@
- Increase client timeouts

## [1.5.41] - 2024-10-21
- Add window support dedup
- Add support for window based dedup operator

## [1.5.40] - 2024-10-05
- Add changelog operator
Expand Down
16 changes: 16 additions & 0 deletions fennel/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,16 @@ class Concat(StringOp):
other: Expr


@dataclass
class StringJsonExtract(StringOp):
    """String operation that extracts a value from a JSON-encoded string."""

    # JSON path identifying the value to extract, e.g. "$.x.y"
    path: str


@dataclass
class StringSplit(StringOp):
    """String operation that splits a string into a list of strings."""

    # separator string on which the input is split, e.g. ","
    sep: str


class _String(Expr):
def __init__(self, expr: Expr, op: StringOp):
self.op = op
Expand All @@ -577,6 +587,12 @@ def contains(self, item) -> _Bool:
item_expr = make_expr(item)
return _Bool(_String(self, StrContains(item_expr)))

def json_extract(self, path: str) -> _String:
    """Extract the value at JSON path `path` from this JSON-encoded string.

    The resulting expression is of type `Optional[str]`: it is null when
    the input string is null or the path is not found.
    """
    extract_op = StringJsonExtract(path)
    return _String(self, extract_op)

def split(self, sep: str) -> _List:
    """Split this string on the separator `sep` into a list of strings."""
    split_expr = _String(self, StringSplit(sep))
    return _List(split_expr, ListNoop())

def concat(self, other: Expr) -> _String:
    """Build a `Concat` string expression from this expression and `other`.

    `other` is passed through `make_expr` before being wrapped in the op.
    """
    return _String(self, Concat(make_expr(other)))
Expand Down
10 changes: 10 additions & 0 deletions fennel/expr/serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
StringStrpTime,
StringParse,
StrStartsWith,
StringSplit,
StringJsonExtract,
StrEndsWith,
Lower,
Upper,
Expand Down Expand Up @@ -294,6 +296,14 @@ def visitString(self, obj):
endswith=proto.EndsWith(key=self.visit(obj.op.item))
)
)
elif isinstance(obj.op, StringJsonExtract):
expr.string_fn.fn.CopyFrom(
proto.StringOp(json_extract=proto.JsonExtract(path=obj.op.path))
)
elif isinstance(obj.op, StringSplit):
expr.string_fn.fn.CopyFrom(
proto.StringOp(split=proto.Split(sep=obj.op.sep))
)
else:
raise InvalidExprException("invalid string operation: %s" % obj.op)
expr.string_fn.string.CopyFrom(self.visit(obj.operand))
Expand Down
78 changes: 39 additions & 39 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 2beb5ba

Please sign in to comment.