expr/docs: document all list functions

fennel-ai · Sep 20, 2024 · 3ad5732 · 3ad5732
1 parent 3dd15f1
commit 3ad5732
Show file tree

Hide file tree

Showing 12 changed files with 364 additions and 76 deletions.
diff --git a/.wordlist.txt b/.wordlist.txt
@@ -244,6 +244,7 @@ hackathon
 hardcoded
 hashable
 hashmap
+hasnull
 hostname
 html
 hudi

diff --git a/docs/api.yml b/docs/api.yml
@@ -111,12 +111,13 @@ sidebar:
   #     - "api-reference/expressions/dt.second"
   #     - "api-reference/expressions/dt.strftime"
 
-  # - slug: "api-reference/expressions/list"
-  #   title: "List Expressions"
-  #   pages:
-  #     - "api-reference/expressions/list.len"
-  #     - "api-reference/expressions/list.hasnull"
-  #     - "api-reference/expressions/list.contains"
+  - slug: "api-reference/expressions/list"
+    title: "List Expressions"
+    pages:
+      - "api-reference/expressions/list/at"
+      - "api-reference/expressions/list/contains"
+      - "api-reference/expressions/list/hasnull"
+      - "api-reference/expressions/list/len"
 
   - slug: "api-reference/expressions/num"
     title: "Num Expressions"

diff --git a/docs/examples/api-reference/expressions/list.py b/docs/examples/api-reference/expressions/list.py
@@ -0,0 +1,138 @@
+import pytest
+from typing import Optional, List
+import pandas as pd
+
+
+def test_len():
+    # docsnip len
+    from fennel.expr import col
+
+    # docsnip-highlight next-line
+    expr = col("x").list.len()
+
+    # len works for any list type or optional list type
+    assert expr.typeof(schema={"x": List[int]}) == int
+    assert expr.typeof(schema={"x": Optional[List[float]]}) == Optional[int]
+
+    # can be evaluated with a dataframe
+    df = pd.DataFrame({"x": [[1, 2, 3], [4, 5], [], None]})
+    schema = {"x": Optional[List[int]]}
+    assert expr.eval(df, schema=schema).tolist() == [3, 2, 0, pd.NA]
+
+    # schema of column must be list of something
+    with pytest.raises(ValueError):
+        expr.typeof(schema={"x": int})
+    # /docsnip
+
+
+def test_has_null():
+    # docsnip has_null
+    from fennel.expr import col
+
+    # docsnip-highlight next-line
+    expr = col("x").list.hasnull()
+
+    # hasnull works for any list type or optional list type
+    assert expr.typeof(schema={"x": List[int]}) == bool
+    assert expr.typeof(schema={"x": Optional[List[float]]}) == Optional[bool]
+
+    # can be evaluated with a dataframe
+    df = pd.DataFrame({"x": [[1, 2, 3], [4, 5, None], [], None]})
+    schema = {"x": Optional[List[Optional[int]]]}
+    assert expr.eval(df, schema=schema).tolist() == [False, True, False, pd.NA]
+
+    # schema of column must be list of something
+    with pytest.raises(ValueError):
+        expr.typeof(schema={"x": int})
+    # /docsnip
+
+
+def test_contains():
+    # docsnip contains
+    from fennel.expr import col
+
+    # docsnip-highlight next-line
+    expr = col("x").list.contains(col("y"))
+
+    # contains works only for list types
+    assert expr.typeof(schema={"x": List[int], "y": int}) == bool
+    assert (
+        expr.typeof(schema={"x": Optional[List[float]], "y": float})
+        == Optional[bool]
+    )
+
+    # however doesn't work if item is not of the same type as the list elements
+    with pytest.raises(ValueError):
+        expr.typeof(schema={"x": List[int], "y": str})
+
+    # can be evaluated with a dataframe
+    df = pd.DataFrame(
+        {
+            "x": [[1, 2, 3], [4, 5, None], [4, 5, None], None, []],
+            "y": [1, 5, 3, 4, None],
+        }
+    )
+    schema = {"x": Optional[List[Optional[int]]], "y": Optional[int]}
+    assert expr.eval(df, schema=schema).tolist() == [
+        True,
+        True,
+        pd.NA,
+        pd.NA,
+        False,
+    ]
+
+    # schema of column must be list of something
+    with pytest.raises(ValueError):
+        expr.typeof(schema={"x": int})
+    # /docsnip
+
+
+def test_at():
+    # docsnip at
+    from fennel.expr import col
+
+    # docsnip-highlight next-line
+    expr = col("x").list.at(col("y"))
+
+    # at works only for list types; index can be int/optional[int]
+    assert expr.typeof(schema={"x": List[int], "y": int}) == Optional[int]
+    assert expr.typeof(schema={"x": List[str], "y": int}) == Optional[str]
+
+    schema = {"x": Optional[List[float]], "y": float}
+    with pytest.raises(Exception):
+        expr.typeof(schema=schema)
+
+    # can be evaluated with a dataframe
+    df = pd.DataFrame(
+        {
+            "x": [[1, 2, 3], [4, 5, None], [4, 5, None], None],
+            "y": [1, 5, 0, 4],
+        }
+    )
+    schema = {"x": Optional[List[Optional[int]]], "y": int}
+    assert expr.eval(df, schema=schema).tolist() == [2, pd.NA, 4, pd.NA]
+
+    # schema of column must be list of something
+    with pytest.raises(ValueError):
+        expr.typeof(schema={"x": int})
+    # /docsnip
+
+
+def test_at_negative():
+    # docsnip at_negative
+    from fennel.expr import col
+
+    # docsnip-highlight next-line
+    expr = col("x").list.at(col("y"))
+
+    # negative indices down to -len(list) are allowed and index from the end;
+    # anything below that returns None, like other out-of-bounds indices
+    df = pd.DataFrame(
+        {
+            "x": [[1, 2, 3], [4, 5, None], [4, 5, None], None],
+            "y": [-1, -5, -2, -4],
+        }
+    )
+    schema = {"x": Optional[List[Optional[int]]], "y": int}
+    assert expr.eval(df, schema=schema).tolist() == [3, pd.NA, 5, pd.NA]
+    # /docsnip
diff --git a/docs/pages/api-reference/expressions/list/at.md b/docs/pages/api-reference/expressions/list/at.md
@@ -0,0 +1,43 @@
+---
+title: At
+order: 0
+status: published
+---
+
+### At
+
+Function to get the value of the element at a given index of the list.
+
+#### Parameters
+<Expandable title="index" type="Expr">
+The index of the list element to fetch. This expression is expected to evaluate
+to an `int` (or `Optional[int]`). Negative indices are supported as well.
+</Expandable>
+
+<pre snippet="api-reference/expressions/list#at"
+    status="success" message="Getting the value of a list's element at given index">
+</pre>
+
+<pre snippet="api-reference/expressions/list#at_negative"
+    status="success" message="Also works with negative indices">
+</pre>
+
+
+#### Returns
+<Expandable type="Expr">
+Returns an expression object denoting the value of the list at the given index.
+If the index is out of bounds for the list, `None` is returned. Consequently,
+for a list of elements of type `T`, `at` always returns `Optional[T]`.
+
+Fennel also supports negative indices: -1 maps to the last element of the list, -2
+to the second-to-last element, and so on. Negative indices smaller than
+`-len(list)` return `None`, like other out-of-bounds indices.
+</Expandable>
+
+
+#### Errors
+<Expandable title="Use of invalid types">
+The `list` namespace must be invoked on an expression that evaluates to list
+or optional of list. Similarly, `index` must evaluate to an element of type `int`
+or `Optional[int]`.
+</Expandable>
diff --git a/docs/pages/api-reference/expressions/list/contains.md b/docs/pages/api-reference/expressions/list/contains.md
@@ -0,0 +1,53 @@
+---
+title: Contains
+order: 0
+status: published
+---
+
+### Contains
+
+Function to check if the given list contains a given element.
+
+#### Parameters
+<Expandable title="item" type="Expr">
+`contains` checks whether the base list contains the given `item`.
+</Expandable>
+
+<pre snippet="api-reference/expressions/list#contains"
+    status="success" message="Checking if a list contains a given item">
+</pre>
+
+
+#### Returns
+<Expandable type="Expr">
+Returns an expression object denoting the result of the `contains` expression.
+The resulting expression is of type `bool` or `Optional[bool]`, depending on
+whether either the input list or the item is nullable.
+
+Note that Fennel expressions borrow semantics from SQL and treat `None` as
+an unknown value. As a result, the following rules apply to `contains` in the
+presence of nulls:
+- If the base list itself is `None`, the result is `None` regardless of the item.
+- If the item is `None`, the result is `None` regardless of the list, unless the
+  list is empty, in which case the answer is `False` (after all, if the list is
+  empty, no matter the value of the item, it's not present in the list).
+- If the item is not `None` and is present in the list, the answer is obviously
+  `True`.
+- However, if the item is not `None` and is not present in the list, but the
+  list has some `None` element, the result is still `None` (because a `None`
+  value in the list may have been that element - we just can't say).
+
+This is somewhat (but not exactly) similar to Spark's `array_contains` [function](https://docs.databricks.com/en/sql/language-manual/functions/array_contains.html).
+</Expandable>
+:::info
+If you are interested in checking if a list has any `None` elements, a better
+way of doing that is to use [hasnull](/api-reference/expressions/list/hasnull).
+:::
+
+
+#### Errors
+<Expandable title="Use of invalid types">
+The `list` namespace must be invoked on an expression that evaluates to list
+or optional of list. Similarly, `item` must evaluate to an element of type `T`
+or `Optional[T]` if the list itself is of type `List[T]` (or `Optional[List[T]]`).
+</Expandable>
diff --git a/docs/pages/api-reference/expressions/list/hasnull.md b/docs/pages/api-reference/expressions/list/hasnull.md
@@ -0,0 +1,27 @@
+---
+title: Has Null
+order: 0
+status: published
+---
+
+### Has Null
+
+Function to check if the given list has any `None` values.
+
+<pre snippet="api-reference/expressions/list#has_null"
+    status="success" message="Checking if a list has any null values">
+</pre>
+
+#### Returns
+<Expandable type="Expr">
+Returns an expression object denoting the result of the `hasnull` function.
+The resulting expression is of type `bool` or `Optional[bool]`, depending on
+whether the input is nullable.
+</Expandable>
+
+
+#### Errors
+<Expandable title="Use of invalid types">
+The `list` namespace must be invoked on an expression that evaluates to list
+or optional of list. 
+</Expandable>
diff --git a/docs/pages/api-reference/expressions/list/len.md b/docs/pages/api-reference/expressions/list/len.md
@@ -0,0 +1,27 @@
+---
+title: Len
+order: 0
+status: published
+---
+
+### Len
+
+Function to get the length of a list.
+
+<pre snippet="api-reference/expressions/list#len"
+    status="success" message="Getting the length of a list">
+</pre>
+
+#### Returns
+<Expandable type="Expr">
+Returns an expression object denoting the result of the `len` function.
+The resulting expression is of type `int` or `Optional[int]`, depending on
+whether the input is nullable.
+</Expandable>
+
+
+#### Errors
+<Expandable title="Use of invalid types">
+The `list` namespace must be invoked on an expression that evaluates to list
+or optional of list. 
+</Expandable>
diff --git a/docs/pages/api-reference/expressions/struct/get.md b/docs/pages/api-reference/expressions/struct/get.md
@@ -6,7 +6,7 @@ status: published
 
 ### Get
 
-Function to get a given field from a struct.
+Function to get the value of a given field from a struct.
 
 #### Parameters
 <Expandable title="field" type="str">
@@ -15,7 +15,7 @@ must be a literal string, not an expression.
 </Expandable>
 
 <pre snippet="api-reference/expressions/struct_snip#get"
-    status="success" message="Get a field from a sturct">
+    status="success" message="Get a field from a struct">
 </pre>
 
 #### Returns

diff --git a/fennel/expr/expr.py b/fennel/expr/expr.py
@@ -823,10 +823,13 @@ def contains(self, item: Expr) -> _Bool:
         item_expr = make_expr(item)
         return _Bool(_List(self, ListContains(item_expr)))
 
-    def get(self, index: Expr) -> Expr:
+    def at(self, index: Expr) -> Expr:
         index_expr = make_expr(index)
         return _List(self, ListGet(index_expr))
 
+    def hasnull(self) -> _Bool:
+        return _Bool(_List(self, ListHasNull()))
+
 
 #######################################################
 

diff --git a/fennel/expr/test_expr.py b/fennel/expr/test_expr.py
@@ -721,7 +721,7 @@ def test_parse():
 def test_list():
     test_cases = [
         ExprTestCase(
-            expr=(col("a").list.get(0)),
+            expr=(col("a").list.at(0)),
             df=pd.DataFrame({"a": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}),
             schema={"a": List[int]},
             display="col('a')[0]",
@@ -732,7 +732,7 @@ def test_list():
         ),
         # Get index where index is an expression
         ExprTestCase(
-            expr=(col("a").list.get(col("b") + col("c"))),
+            expr=(col("a").list.at(col("b") + col("c"))),
             df=pd.DataFrame(
                 {
                     "a": [[1, 2, 3, 4], [4, 5, 6, 12], [7, 8, 9, 19]],
@@ -753,7 +753,7 @@ def test_list():
         ),
         # Out of bounds index
         ExprTestCase(
-            expr=(col("a").list.get(col("b"))),
+            expr=(col("a").list.at(col("b"))),
             df=pd.DataFrame(
                 {
                     "a": [[1, 2, 3, 4], [4, 5, 6, 12], [7, 8, 9, 19]],
-Original file line number
+Diff line change
@@ Expand Up / @@ -244,6 +244,7 @@ hackathon @@
     hardcoded
     hashable
     hashmap
+    hasnull
     hostname
     html
     hudi
@@ Expand Down @@