Skip to content

Commit

Permalink
expr/docs: document all list functions
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilgarg28 committed Sep 20, 2024
1 parent 3dd15f1 commit 3ad5732
Show file tree
Hide file tree
Showing 12 changed files with 364 additions and 76 deletions.
1 change: 1 addition & 0 deletions .wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ hackathon
hardcoded
hashable
hashmap
hasnull
hostname
html
hudi
Expand Down
13 changes: 7 additions & 6 deletions docs/api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,13 @@ sidebar:
# - "api-reference/expressions/dt.second"
# - "api-reference/expressions/dt.strftime"

# - slug: "api-reference/expressions/list"
# title: "List Expressions"
# pages:
# - "api-reference/expressions/list.len"
# - "api-reference/expressions/list.hasnull"
# - "api-reference/expressions/list.contains"
- slug: "api-reference/expressions/list"
title: "List Expressions"
pages:
- "api-reference/expressions/list/at"
- "api-reference/expressions/list/contains"
- "api-reference/expressions/list/hasnull"
- "api-reference/expressions/list/len"

- slug: "api-reference/expressions/num"
title: "Num Expressions"
Expand Down
138 changes: 138 additions & 0 deletions docs/examples/api-reference/expressions/list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import pytest
from typing import Optional, List
import pandas as pd


def test_len():
    """Docs example and test for `list.len` (snippet name: `len`).

    Checks the inferred type for list and optional-list columns, evaluates
    the expression on a small dataframe, and verifies that a non-list column
    is rejected by `typeof`.
    """
    # docsnip len
    from fennel.expr import col

    # docsnip-highlight next-line
    expr = col("x").list.len()

    # len works for any list type or optional list type
    assert expr.typeof(schema={"x": List[int]}) == int
    assert expr.typeof(schema={"x": Optional[List[float]]}) == Optional[int]

    # can be evaluated with a dataframe
    df = pd.DataFrame({"x": [[1, 2, 3], [4, 5], [], None]})
    schema = {"x": Optional[List[int]]}
    assert expr.eval(df, schema=schema).tolist() == [3, 2, 0, pd.NA]

    # schema of column must be list of something
    with pytest.raises(ValueError):
        expr.typeof(schema={"x": int})
    # /docsnip


def test_has_null():
    """Docs example and test for `list.hasnull` (snippet name: `has_null`).

    Checks the inferred type for list and optional-list columns, evaluates
    the expression on lists with and without `None` elements, and verifies
    that a non-list column is rejected by `typeof`.
    """
    # docsnip has_null
    from fennel.expr import col

    # docsnip-highlight next-line
    expr = col("x").list.hasnull()

    # hasnull works for any list type or optional list type
    assert expr.typeof(schema={"x": List[int]}) == bool
    assert expr.typeof(schema={"x": Optional[List[float]]}) == Optional[bool]

    # can be evaluated with a dataframe
    df = pd.DataFrame({"x": [[1, 2, 3], [4, 5, None], [], None]})
    schema = {"x": Optional[List[Optional[int]]]}
    assert expr.eval(df, schema=schema).tolist() == [False, True, False, pd.NA]

    # schema of column must be list of something
    with pytest.raises(ValueError):
        expr.typeof(schema={"x": int})
    # /docsnip


def test_contains():
    """Docs example and test for `list.contains` (snippet name: `contains`).

    Checks the inferred type, the rejection of an item whose type differs
    from the list's element type, the SQL-style null semantics on a small
    dataframe, and the rejection of a non-list base column.
    """
    # docsnip contains
    from fennel.expr import col

    # docsnip-highlight next-line
    expr = col("x").list.contains(col("y"))

    # contains works for only list types
    assert expr.typeof(schema={"x": List[int], "y": int}) == bool
    assert (
        expr.typeof(schema={"x": Optional[List[float]], "y": float})
        == Optional[bool]
    )

    # however doesn't work if item is not of the same type as the list elements
    with pytest.raises(ValueError):
        expr.typeof(schema={"x": List[int], "y": str})

    # can be evaluated with a dataframe
    df = pd.DataFrame(
        {
            "x": [[1, 2, 3], [4, 5, None], [4, 5, None], None, []],
            "y": [1, 5, 3, 4, None],
        }
    )
    schema = {"x": Optional[List[Optional[int]]], "y": Optional[int]}
    assert expr.eval(df, schema=schema).tolist() == [
        True,
        True,
        pd.NA,
        pd.NA,
        False,
    ]

    # schema of column must be list of something; `y` is supplied so that
    # the only thing wrong with the schema is the non-list type of `x`
    with pytest.raises(ValueError):
        expr.typeof(schema={"x": int, "y": int})
    # /docsnip


def test_at():
    """Docs example and test for `list.at` (snippet name: `at`).

    Checks that the result type is always optional of the element type,
    that a float index is rejected, that out-of-bounds indices and `None`
    lists evaluate to `pd.NA`, and that a non-list base column is rejected.
    """
    # docsnip at
    from fennel.expr import col

    # docsnip-highlight next-line
    expr = col("x").list.at(col("y"))

    # at works for only list types, index can be int/optional[int]
    assert expr.typeof(schema={"x": List[int], "y": int}) == Optional[int]
    assert expr.typeof(schema={"x": List[str], "y": int}) == Optional[str]

    # a float index is not allowed
    schema = {"x": Optional[List[float]], "y": float}
    with pytest.raises(Exception):
        expr.typeof(schema=schema)

    # can be evaluated with a dataframe
    df = pd.DataFrame(
        {
            "x": [[1, 2, 3], [4, 5, None], [4, 5, None], None],
            "y": [1, 5, 0, 4],
        }
    )
    schema = {"x": Optional[List[Optional[int]]], "y": int}
    assert expr.eval(df, schema=schema).tolist() == [2, pd.NA, 4, pd.NA]

    # schema of column must be list of something; `y` is supplied so that
    # the only thing wrong with the schema is the non-list type of `x`
    with pytest.raises(ValueError):
        expr.typeof(schema={"x": int, "y": int})
    # /docsnip


def test_at_negative():
    """Docs example and test for `list.at` with negative indices
    (snippet name: `at_negative`).

    Evaluates the expression on a small dataframe where every index is
    negative, covering in-range reverse indexing, an index below
    `-len(list)`, and a `None` list.
    """
    # docsnip at_negative
    from fennel.expr import col

    # docsnip-highlight next-line
    expr = col("x").list.at(col("y"))

    # negative indices until -len(list) are allowed and do reverse indexing
    # beyond that, start returning None like other out-of-bounds indices
    df = pd.DataFrame(
        {
            "x": [[1, 2, 3], [4, 5, None], [4, 5, None], None],
            "y": [-1, -5, -2, -4],
        }
    )
    schema = {"x": Optional[List[Optional[int]]], "y": int}
    assert expr.eval(df, schema=schema).tolist() == [3, pd.NA, 5, pd.NA]
    # /docsnip
43 changes: 43 additions & 0 deletions docs/pages/api-reference/expressions/list/at.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
---
title: At
order: 0
status: published
---

### At

Function to get the value of the element at a given index of the list.

#### Parameters
<Expandable title="index" type="Expr">
The index at which list's value needs to be evaluated. This expression is expected
to evaluate to an int. Fennel supports indexing by negative integers as well.
</Expandable>

<pre snippet="api-reference/expressions/list#at"
status="success" message="Getting the value of a list's element at given index">
</pre>

<pre snippet="api-reference/expressions/list#at_negative"
status="success" message="Also works with negative indices">
</pre>


#### Returns
<Expandable type="Expr">
Returns an expression object denoting the value of the list at the given index.
If the index is out of bounds of list's length, `None` is returned. Consequently,
for a list of elements of type `T`, `at` always returns `Optional[T]`.

Fennel also supports negative indices: -1 maps to the last element of the list,
-2 to the second last element of the list and so on. Negative indices smaller
than -len start returning `None` like other out-of-bound indices.
</Expandable>


#### Errors
<Expandable title="Use of invalid types">
The `list` namespace must be invoked on an expression that evaluates to list
or optional of list. Similarly, `index` must evaluate to an element of type `int`
or `Optional[int]`.
</Expandable>
53 changes: 53 additions & 0 deletions docs/pages/api-reference/expressions/list/contains.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
---
title: Contains
order: 0
status: published
---

### Contains

Function to check if the given list contains a given element.

#### Parameters
<Expandable title="item" type="Expr">
`contains` checks whether the base list contains the `item`.
</Expandable>

<pre snippet="api-reference/expressions/list#contains"
status="success" message="Checking if a list contains a given item">
</pre>


#### Returns
<Expandable type="Expr">
Returns an expression object denoting the result of the `contains` expression.
The resulting expression is of type `bool` or `Optional[bool]` depending on
whether either the input list or the item is nullable.

Note that Fennel expressions borrow semantics from SQL and treat `None` as
an unknown value. As a result, the following rules apply to `contains` in
presence of nulls:
- If the base list itself is `None`, the result is `None` regardless of the item.
- If the item is `None`, the result is `None` regardless of the list, unless the
list is empty, in which case the answer is `False` (after all, if the list is
empty, no matter the value of the item, it's not present in the list).
- If the item is not `None` and is present in the list, the answer is obviously
`True`
- However, if the item is not `None`, is not present in the list but the list
has some `None` element, the result is still `None` (because the `None` values
in the list may have been that element - we just can't say)

This is somewhat (but not exactly) similar to Spark's `array_contains` [function](https://docs.databricks.com/en/sql/language-manual/functions/array_contains.html).
</Expandable>
:::info
If you are interested in checking if a list has any `None` elements, a better
way of doing that is to use [hasnull](/api-reference/expressions/list/hasnull).
:::


#### Errors
<Expandable title="Use of invalid types">
The `list` namespace must be invoked on an expression that evaluates to list
or optional of list. Similarly, `item` must evaluate to an element of type `T`
or `Optional[T]` if the list itself was of type `List[T]` (or `Optional[List[T]]`)
</Expandable>
27 changes: 27 additions & 0 deletions docs/pages/api-reference/expressions/list/hasnull.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
title: Has Null
order: 0
status: published
---

### Has Null

Function to check if the given list has any `None` values.

<pre snippet="api-reference/expressions/list#has_null"
status="success" message="Checking if a list has any null values">
</pre>

#### Returns
<Expandable type="Expr">
Returns an expression object denoting the result of the `hasnull` function.
The resulting expression is of type `bool` or `Optional[bool]` depending on
the input being nullable.
</Expandable>


#### Errors
<Expandable title="Use of invalid types">
The `list` namespace must be invoked on an expression that evaluates to list
or optional of list.
</Expandable>
27 changes: 27 additions & 0 deletions docs/pages/api-reference/expressions/list/len.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
title: Len
order: 0
status: published
---

### Len

Function to get the length of a list.

<pre snippet="api-reference/expressions/list#len"
status="success" message="Getting the length of a list">
</pre>

#### Returns
<Expandable type="Expr">
Returns an expression object denoting the result of the `len` function.
The resulting expression is of type `int` or `Optional[int]` depending on
the input being nullable.
</Expandable>


#### Errors
<Expandable title="Use of invalid types">
The `list` namespace must be invoked on an expression that evaluates to list
or optional of list.
</Expandable>
4 changes: 2 additions & 2 deletions docs/pages/api-reference/expressions/struct/get.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ status: published

### Get

Function to get a given field from a struct.
Function to get the value of a given field from a struct.

#### Parameters
<Expandable title="field" type="str">
Expand All @@ -15,7 +15,7 @@ must be a literal string, not an expression.
</Expandable>

<pre snippet="api-reference/expressions/struct_snip#get"
status="success" message="Get a field from a sturct">
status="success" message="Get a field from a struct">
</pre>

#### Returns
Expand Down
5 changes: 4 additions & 1 deletion fennel/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,10 +823,13 @@ def contains(self, item: Expr) -> _Bool:
item_expr = make_expr(item)
return _Bool(_List(self, ListContains(item_expr)))

def get(self, index: Expr) -> Expr:
def at(self, index: Expr) -> Expr:
    """Return an expression for the list element at `index`.

    `index` is passed through `make_expr` before being wrapped in a
    `ListGet` operation on this list expression.
    """
    return _List(self, ListGet(make_expr(index)))

def hasnull(self) -> _Bool:
    """Return a boolean expression wrapping a `ListHasNull` operation on
    this list expression."""
    null_check = _List(self, ListHasNull())
    return _Bool(null_check)


#######################################################

Expand Down
6 changes: 3 additions & 3 deletions fennel/expr/test_expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ def test_parse():
def test_list():
test_cases = [
ExprTestCase(
expr=(col("a").list.get(0)),
expr=(col("a").list.at(0)),
df=pd.DataFrame({"a": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}),
schema={"a": List[int]},
display="col('a')[0]",
Expand All @@ -732,7 +732,7 @@ def test_list():
),
# Get index where index is an expression
ExprTestCase(
expr=(col("a").list.get(col("b") + col("c"))),
expr=(col("a").list.at(col("b") + col("c"))),
df=pd.DataFrame(
{
"a": [[1, 2, 3, 4], [4, 5, 6, 12], [7, 8, 9, 19]],
Expand All @@ -753,7 +753,7 @@ def test_list():
),
# Out of bounds index
ExprTestCase(
expr=(col("a").list.get(col("b"))),
expr=(col("a").list.at(col("b"))),
df=pd.DataFrame(
{
"a": [[1, 2, 3, 4], [4, 5, 6, 12], [7, 8, 9, 19]],
Expand Down
Loading

0 comments on commit 3ad5732

Please sign in to comment.