From 0ed1e1f51270b406eb41d943fd45712866a68f75 Mon Sep 17 00:00:00 2001
From: Younes Strittmatter <younes_strittmatter@brown.edu>
Date: Sun, 8 Sep 2024 15:56:41 -0400
Subject: [PATCH 1/2] feat: add generalized transformation from pandas to numpy

---
 src/autora/utils/transform.py | 141 ++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 src/autora/utils/transform.py
diff --git a/src/autora/utils/transform.py b/src/autora/utils/transform.py
new file mode 100644
index 00000000..1303c579
--- /dev/null
+++ b/src/autora/utils/transform.py
@@ -0,0 +1,141 @@
+from typing import Union
+
+import numpy as np
+import pandas as pd
+
+
+def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
+    """
+    Transforms a pandas data frame to a numpy array
+    Args:
+        df: the pandas data frame
+
+    Returns:
+        a numpy array
+
+    Examples:
+        Same result as np.array(df) if rows of df are one dimensional:
+        >>> df_one = pd.DataFrame({
+        ...     'x_0': [1, 2, 3],
+        ...     'x_1': [4, 5, 6],
+        ...     'x_2': [7, 8, 9]})
+        >>> np.array_equal(np.array(df_one), df_to_array(df_one))
+        True
+
+        If the rows contain lists ...
+        >>> df_list = pd.DataFrame({
+        ...     'x_0': [[0, 0], [1, 0], [2, 0]],
+        ...     'x_1': [[0, 1], [1, 1], [2, 1]],
+        ...     'x_2': [[0, 2], [1, 2], [2, 2]]
+        ... })
+        >>> array_transformed = df_to_array(df_list)
+        >>> array_cast = np.array(df_list)
+
+        the results are not equal:
+        >>> np.array_equal(array_transformed, array_cast)
+        False
+
+        The cast array contains objects which are hard to work with:
+        >>> array_cast
+        array([[list([0, 0]), list([0, 1]), list([0, 2])],
+               [list([1, 0]), list([1, 1]), list([1, 2])],
+               [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object)
+
+        The transformed array contains vectors (numbers):
+        >>> array_transformed
+        array([[[0, 0],
+                [0, 1],
+                [0, 2]],
+        <BLANKLINE>
+               [[1, 0],
+                [1, 1],
+                [1, 2]],
+        <BLANKLINE>
+               [[2, 0],
+                [2, 1],
+                [2, 2]]])
+
+        ... the same is true for arrays:
+        >>> df_array = pd.DataFrame({
+        ...     'x_0': [np.array([0, 0]), np.array([1, 0]), np.array([2, 0])],
+        ...     'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])],
+        ...     'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])]
+        ... })
+        >>> array_transformed = df_to_array(df_array)
+        >>> array_cast = np.array(df_list)
+
+        the results are not equal:
+        >>> np.array_equal(array_transformed, array_cast)
+        False
+
+        The cast array contains objects which are hard to work with:
+        >>> array_cast
+        array([[list([0, 0]), list([0, 1]), list([0, 2])],
+               [list([1, 0]), list([1, 1]), list([1, 2])],
+               [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object)
+
+        The transformed array contains vectors (numbers):
+        >>> array_transformed
+        array([[[0, 0],
+                [0, 1],
+                [0, 2]],
+        <BLANKLINE>
+               [[1, 0],
+                [1, 1],
+                [1, 2]],
+        <BLANKLINE>
+               [[2, 0],
+                [2, 1],
+                [2, 2]]])
+
+        # This also works with more nesting:
+        >>> df_nested = pd.DataFrame({
+        ...     'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]],
+        ...     'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]]
+        ... })
+        >>> df_to_array(df_nested)
+        array([[[[0, 0],
+                 [1, 1]],
+        <BLANKLINE>
+                [[1, 1],
+                 [1, 1]]],
+        <BLANKLINE>
+        <BLANKLINE>
+               [[[0, 0],
+                 [2, 2]],
+        <BLANKLINE>
+                [[1, 1],
+                 [2, 2]]]])
+
+        When the inner lists don't have the same shape, an error is thrown and one can use
+        a flattening version of this (ATTENTION: when using the flattening version,
+        information about which entry belongs to which condition is lost):
+    """
+
+    _lst = [list(row) for _, row in df.iterrows()]
+    return np.array(_lst)
+
+
+def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array:
+    """
+    Flattens elements in a pandas DataFrame to resolve shape inconsistencies.
+
+    Args:
+        df: A pandas DataFrame or Series with inconsistent element shapes.
+
+    Returns:
+        A numpy array where all elements are flattened.
+
+    Example:
+        >>> df_inconsistent = pd.DataFrame({
+        ...     'x_0': [0, 2, 4],
+        ...     'x_1': [[1, 1], [3, 3], [5, 5]]
+        ... })
+        >>> df_to_array_flatten(df_inconsistent)
+        array([[0, 1, 1],
+               [2, 3, 3],
+               [4, 5, 5]])
+    """
+    return np.array(
+        [np.concatenate([np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row])
+         for _, row in df.iterrows()])

From 44f1b33f38cd3f7117df7424bddf1f8abe577843 Mon Sep 17 00:00:00 2001
From: Younes Strittmatter <younes_strittmatter@brown.edu>
Date: Sun, 8 Sep 2024 16:32:49 -0400
Subject: [PATCH 2/2] feat: add generalized metrics

---
 src/autora/utils/metrics.py   | 124 ++++++++++++++++++++++++++++++++++
 src/autora/utils/transform.py |  31 ++++++---
 2 files changed, 144 insertions(+), 11 deletions(-)
 create mode 100644 src/autora/utils/metrics.py

diff --git a/src/autora/utils/metrics.py b/src/autora/utils/metrics.py
new file mode 100644
index 00000000..d719d65a
--- /dev/null
+++ b/src/autora/utils/metrics.py
@@ -0,0 +1,124 @@
+import numpy as np
+
+
+def norms(arr: np.ndarray) -> np.ndarray:
+    """
+    Calculate the Euclidean norm of each (flattened) entry along the first axis
+    Examples:
+        >>> import pandas as pd
+        >>> from autora.utils.transform import to_array
+
+        Simple dataframe with one condition
+        >>> df = pd.DataFrame({'x_0': [.2, 2, 3]})
+
+        First transform:
+        >>> as_array = to_array(df)
+        >>> norms(as_array)
+        array([0.2, 2. , 3. ])
+
+        >>> df_two_dim = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
+        >>> as_array = to_array(df_two_dim)
+        >>> norms(as_array)
+        array([1., 1., 5.])
+
+        For nested dataframes
+        >>> df_nested = pd.DataFrame({
+        ...     'x_0': [[0, 0], [0, 1], [1, 0], [3, 4]]
+        ... })
+        >>> as_array = to_array(df_nested)
+        >>> norms(as_array)
+        array([0., 1., 1., 5.])
+
+        ... and deeply nested
+        >>> df_nested_deep = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 4]]]
+        ... })
+        >>> as_array = to_array(df_nested_deep)
+        >>> norms(as_array)
+        array([1., 5.])
+
+        ... no matter how many columns
+        >>> df_nested_deep_multi_column = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
+        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
+        ... })
+        >>> as_array = to_array(df_nested_deep_multi_column)
+        >>> norms(as_array)
+        array([5., 1.])
+    """
+    return np.array([np.linalg.norm(np.ravel(row)) for row in arr])
+
+
+def distances(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray:
+    """
+    Calculate the Euclidean distance between two arrays no matter their dimension along the
+    first axis
+    Examples:
+        >>> import pandas as pd
+        >>> from autora.utils.transform import to_array
+
+        Simple dataframe with one condition
+        >>> df_1 = pd.DataFrame({'x_0': [0, 1, 2]})
+        >>> df_2 = pd.DataFrame({'x_0': [1, 2, 3]})
+
+        First transform:
+        >>> as_array_1 = to_array(df_1)
+        >>> as_array_2 = to_array(df_2)
+        >>> distances(as_array_1, as_array_2)
+        array([1., 1., 1.])
+
+        >>> df_two_dim_1 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
+        >>> df_two_dim_2 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 1, 4]})
+        >>> as_array_1 = to_array(df_two_dim_1)
+        >>> as_array_2 = to_array(df_two_dim_2)
+        >>> distances(as_array_1, as_array_2)
+        array([0., 1., 0.])
+
+        For nested dataframes
+        >>> df_nested_1 = pd.DataFrame({
+        ...     'x_0': [[0, 0], [0, 2], [0, 2], [0, 10], [4, 0]]
+        ... })
+        >>> df_nested_2 = pd.DataFrame({
+        ...     'x_0': [[1, 0], [0, 0], [0, 5], [0, 6], [0, 3]]
+        ... })
+        >>> as_array_1 = to_array(df_nested_1)
+        >>> as_array_2 = to_array(df_nested_2)
+        >>> distances(as_array_1, as_array_2)
+        array([1., 2., 3., 4., 5.])
+
+        ... and deeply nested
+        >>> df_nested_deep_1 = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 1]], [[6, 0], [0, 10]]]
+        ... })
+        >>> df_nested_deep_2 = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 6]]]
+        ... })
+        >>> as_array_1 = to_array(df_nested_deep_1)
+        >>> as_array_2 = to_array(df_nested_deep_2)
+        >>> distances(as_array_1, as_array_2)
+        array([0., 5.])
+
+        ... no matter how many columns
+        >>> df_nested_deep_multi_column_1 = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
+        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
+        ... })
+        >>> df_nested_deep_multi_column_2 = pd.DataFrame({
+        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
+        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
+        ... })
+        >>> as_array_1 = to_array(df_nested_deep_multi_column_1)
+        >>> as_array_2 = to_array(df_nested_deep_multi_column_2)
+        >>> distances(as_array_1, as_array_2)
+        array([0., 0.])
+
+    """
+    # Check that the two arrays have the same shape
+    assert arr_1.shape == arr_2.shape, "Arrays must have the same shape"
+
+    # For each row, sum the squared differences; the square root is taken afterwards
+    return np.sqrt(
+        np.array(
+            [np.sum((np.ravel(a) - np.ravel(b)) ** 2) for a, b in zip(arr_1, arr_2)]
+        )
+    )
diff --git a/src/autora/utils/transform.py b/src/autora/utils/transform.py
index 1303c579..c383744e 100644
--- a/src/autora/utils/transform.py
+++ b/src/autora/utils/transform.py
@@ -4,11 +4,11 @@
 import pandas as pd
 
 
-def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
+def to_array(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
     """
     Transforms a pandas data frame to a numpy array
     Args:
-        df: the pandas data frame
+        arr: the pandas data frame (a numpy array is returned unchanged)
 
     Returns:
         a numpy array
@@ -19,7 +19,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_0': [1, 2, 3],
         ...     'x_1': [4, 5, 6],
         ...     'x_2': [7, 8, 9]})
-        >>> np.array_equal(np.array(df_one), df_to_array(df_one))
+        >>> np.array_equal(np.array(df_one), to_array(df_one))
         True
 
         If the rows contain lists ...
@@ -28,7 +28,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_1': [[0, 1], [1, 1], [2, 1]],
         ...     'x_2': [[0, 2], [1, 2], [2, 2]]
         ... })
-        >>> array_transformed = df_to_array(df_list)
+        >>> array_transformed = to_array(df_list)
         >>> array_cast = np.array(df_list)
 
         the results are not equal:
@@ -61,7 +61,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])],
         ...     'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])]
         ... })
-        >>> array_transformed = df_to_array(df_array)
+        >>> array_transformed = to_array(df_array)
         >>> array_cast = np.array(df_list)
 
         the results are not equal:
@@ -93,7 +93,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]],
         ...     'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]]
         ... })
-        >>> df_to_array(df_nested)
+        >>> to_array(df_nested)
         array([[[[0, 0],
                  [1, 1]],
         <BLANKLINE>
@@ -111,12 +111,14 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         a flattening version of this (ATTENTION: when using the flattening version,
         information about which entry belongs to which condition is lost):
     """
+    if isinstance(arr, np.ndarray):
+        return arr
 
-    _lst = [list(row) for _, row in df.iterrows()]
+    _lst = [list(row) for _, row in arr.iterrows()]
     return np.array(_lst)
 
 
-def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array:
+def to_array_flatten(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
     """
     Flattens elements in a pandas DataFrame to resolve shape inconsistencies.
 
@@ -131,11 +133,18 @@ def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array:
         ...     'x_0': [0, 2, 4],
         ...     'x_1': [[1, 1], [3, 3], [5, 5]]
         ... })
-        >>> df_to_array_flatten(df_inconsistent)
+        >>> to_array_flatten(df_inconsistent)
         array([[0, 1, 1],
                [2, 3, 3],
                [4, 5, 5]])
     """
+    if isinstance(arr, np.ndarray):
+        return arr
     return np.array(
-        [np.concatenate([np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row])
-         for _, row in df.iterrows()])
+        [
+            np.concatenate(
+                [np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row]
+            )
+            for _, row in arr.iterrows()
+        ]
+    )