From 0ed1e1f51270b406eb41d943fd45712866a68f75 Mon Sep 17 00:00:00 2001 From: Younes Strittmatter Date: Sun, 8 Sep 2024 15:56:41 -0400 Subject: [PATCH 1/2] feat: add generalized transformation from pandas to numpy --- src/autora/utils/transform.py | 141 ++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 src/autora/utils/transform.py diff --git a/src/autora/utils/transform.py b/src/autora/utils/transform.py new file mode 100644 index 00000000..1303c579 --- /dev/null +++ b/src/autora/utils/transform.py @@ -0,0 +1,141 @@ +from typing import Union + +import numpy as np +import pandas as pd + + +def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array: + """ + Transforms a pandas data frame to a numpy array + Args: + df: the pandas data frame + + Returns: + a numpy array + + Examples: + Same result as np.array(df) if rows of df are one dimensional: + >>> df_one = pd.DataFrame({ + ... 'x_0': [1, 2, 3], + ... 'x_1': [4, 5, 6], + ... 'x_2': [7, 8, 9]}) + >>> np.array_equal(np.array(df_one), df_to_array(df_one)) + True + + If the rows contain lists ... + >>> df_list = pd.DataFrame({ + ... 'x_0': [[0, 0], [1, 0], [2, 0]], + ... 'x_1': [[0, 1], [1, 1], [2, 1]], + ... 'x_2': [[0, 2], [1, 2], [2, 2]] + ... }) + >>> array_transformed = df_to_array(df_list) + >>> array_cast = np.array(df_list) + + the results are not equal: + >>> np.array_equal(array_transformed, array_cast) + False + + The cast array contains objects which are hard to work with: + >>> array_cast + array([[list([0, 0]), list([0, 1]), list([0, 2])], + [list([1, 0]), list([1, 1]), list([1, 2])], + [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object) + + The transformed array contains vectors (numbers): + >>> array_transformed + array([[[0, 0], + [0, 1], + [0, 2]], + + [[1, 0], + [1, 1], + [1, 2]], + + [[2, 0], + [2, 1], + [2, 2]]]) + + ... the same is true for arrays: + >>> df_array = pd.DataFrame({ + ... 
'x_0': [np.array([0, 0]), np.array([1, 0]), np.array([2, 0])], + ... 'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])], + ... 'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])] + ... }) + >>> array_transformed = df_to_array(df_array) + >>> array_cast = np.array(df_list) + + the results are not equal: + >>> np.array_equal(array_transformed, array_cast) + False + + The cast array contains objects which are hard to work with: + >>> array_cast + array([[list([0, 0]), list([0, 1]), list([0, 2])], + [list([1, 0]), list([1, 1]), list([1, 2])], + [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object) + + The transformed array contains vectors (numbers): + >>> array_transformed + array([[[0, 0], + [0, 1], + [0, 2]], + + [[1, 0], + [1, 1], + [1, 2]], + + [[2, 0], + [2, 1], + [2, 2]]]) + + # This also works with more nesting: + >>> df_nested = pd.DataFrame({ + ... 'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]], + ... 'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]] + ... }) + >>> df_to_array(df_nested) + array([[[[0, 0], + [1, 1]], + + [[1, 1], + [1, 1]]], + + + [[[0, 0], + [2, 2]], + + [[1, 1], + [2, 2]]]]) + + When the inner lists don't have the same shape, an error is thrown and one can use + a flattening version of this (ATTENTION: when using the flattening version, + information about which entry belongs to which condition is lost): + """ + + _lst = [list(row) for _, row in df.iterrows()] + return np.array(_lst) + + +def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array: + """ + Flattens elements in a pandas DataFrame to resolve shape inconsistencies. + + Args: + df: A pandas DataFrame or Series with inconsistent element shapes. + + Returns: + A numpy array where all elements are flattened. + + Example: + >>> df_inconsistent = pd.DataFrame({ + ... 'x_0': [0, 2, 4], + ... 'x_1': [[1, 1], [3, 3], [5, 5]] + ... 
}) + >>> df_to_array_flatten(df_inconsistent) + array([[0, 1, 1], + [2, 3, 3], + [4, 5, 5]]) + """ + return np.array( + [np.concatenate([np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row]) + for _, row in df.iterrows()]) From 44f1b33f38cd3f7117df7424bddf1f8abe577843 Mon Sep 17 00:00:00 2001 From: Younes Strittmatter Date: Sun, 8 Sep 2024 16:32:49 -0400 Subject: [PATCH 2/2] feat: add generalized metrics --- src/autora/utils/metrics.py | 124 ++++++++++++++++++++++++++++++++++ src/autora/utils/transform.py | 31 ++++++--- 2 files changed, 144 insertions(+), 11 deletions(-) create mode 100644 src/autora/utils/metrics.py diff --git a/src/autora/utils/metrics.py b/src/autora/utils/metrics.py new file mode 100644 index 00000000..d719d65a --- /dev/null +++ b/src/autora/utils/metrics.py @@ -0,0 +1,124 @@ +import numpy as np + + +def norms(arr: np.ndarray) -> np.ndarray: + """ + Calculate the norms along the first axis + Examples: + >>> import pandas as pd + >>> from autora.utils.transform import to_array + + Simple dataframe with one condition + >>> df = pd.DataFrame({'x_0': [.2, 2, 3]}) + + First transform: + >>> as_array = to_array(df) + >>> norms(as_array) + array([0.2, 2. , 3. ]) + + >>> df_two_dim = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]}) + >>> as_array = to_array(df_two_dim) + >>> norms(as_array) + array([1., 1., 5.]) + + For nested dataframes + >>> df_nested = pd.DataFrame({ + ... 'x_0': [[0, 0], [0, 1], [1, 0], [3, 4]] + ... }) + >>> as_array = to_array(df_nested) + >>> norms(as_array) + array([0., 1., 1., 5.]) + + ... and deeply nested + >>> df_nested_deep = pd.DataFrame({ + ... 'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 4]]] + ... }) + >>> as_array = to_array(df_nested_deep) + >>> norms(as_array) + array([1., 5.]) + + ... no matter how many columns + >>> df_nested_deep_multi_column = pd.DataFrame({ + ... 'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]], + ... 'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]] + ... 
}) + >>> as_array = to_array(df_nested_deep_multi_column) + >>> norms(as_array) + array([5., 1.]) + """ + return np.array([np.linalg.norm(np.ravel(row)) for row in arr]) + + +def distances(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray: + """ + Calculate the Euclidean distance between two arrays no matter their dimension along the + first axis + Examples: + >>> import pandas as pd + >>> from autora.utils.transform import to_array + + Simple dataframe with one condition + >>> df_1 = pd.DataFrame({'x_0': [0, 1, 2]}) + >>> df_2 = pd.DataFrame({'x_0': [1, 2, 3]}) + + First transform: + >>> as_array_1 = to_array(df_1) + >>> as_array_2 = to_array(df_2) + >>> distances(as_array_1, as_array_2) + array([1., 1., 1.]) + + >>> df_two_dim_1 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]}) + >>> df_two_dim_2 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 1, 4]}) + >>> as_array_1 = to_array(df_two_dim_1) + >>> as_array_2 = to_array(df_two_dim_2) + >>> distances(as_array_1, as_array_2) + array([0., 1., 0.]) + + For nested dataframes + >>> df_nested_1 = pd.DataFrame({ + ... 'x_0': [[0, 0], [0, 2], [0, 2], [0, 10], [4, 0]] + ... }) + >>> df_nested_2 = pd.DataFrame({ + ... 'x_0': [[1, 0], [0, 0], [0, 5], [0, 6], [0, 3]] + ... }) + >>> as_array_1 = to_array(df_nested_1) + >>> as_array_2 = to_array(df_nested_2) + >>> distances(as_array_1, as_array_2) + array([1., 2., 3., 4., 5.]) + + ... and deeply nested + >>> df_nested_deep_1 = pd.DataFrame({ + ... 'x_0': [[[0, 0], [0, 1]], [[6, 0], [0, 10]]] + ... }) + >>> df_nested_deep_2 = pd.DataFrame({ + ... 'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 6]]] + ... }) + >>> as_array_1 = to_array(df_nested_deep_1) + >>> as_array_2 = to_array(df_nested_deep_2) + >>> distances(as_array_1, as_array_2) + array([0., 5.]) + + ... no matter how many columns + >>> df_nested_deep_multi_column_1 = pd.DataFrame({ + ... 'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]], + ... 'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]] + ... 
}) + >>> df_nested_deep_multi_column_2 = pd.DataFrame({ + ... 'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]], + ... 'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]] + ... }) + >>> as_array_1 = to_array(df_nested_deep_multi_column_1) + >>> as_array_2 = to_array(df_nested_deep_multi_column_2) + >>> distances(as_array_1, as_array_2) + array([0., 0.]) + + """ + # Check that the two arrays have the same shape + assert arr_1.shape == arr_2.shape, "Arrays must have the same shape" + + # For each row, calculate the squared distance + return np.sqrt( + np.array( + [np.sum((np.ravel(a) - np.ravel(b)) ** 2) for a, b in zip(arr_1, arr_2)] + ) + ) diff --git a/src/autora/utils/transform.py b/src/autora/utils/transform.py index 1303c579..c383744e 100644 --- a/src/autora/utils/transform.py +++ b/src/autora/utils/transform.py @@ -4,11 +4,11 @@ import pandas as pd -def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array: +def to_array(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray: """ Transforms a pandas data frame to a numpy array Args: - df: the pandas data frame + arr: the pandas data frame Returns: a numpy array @@ -19,7 +19,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array: ... 'x_0': [1, 2, 3], ... 'x_1': [4, 5, 6], ... 'x_2': [7, 8, 9]}) - >>> np.array_equal(np.array(df_one), df_to_array(df_one)) + >>> np.array_equal(np.array(df_one), to_array(df_one)) True If the rows contain lists ... @@ -28,7 +28,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array: ... 'x_1': [[0, 1], [1, 1], [2, 1]], ... 'x_2': [[0, 2], [1, 2], [2, 2]] ... }) - >>> array_transformed = df_to_array(df_list) + >>> array_transformed = to_array(df_list) >>> array_cast = np.array(df_list) the results are not equal: @@ -61,7 +61,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array: ... 'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])], ... 'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])] ... 
}) - >>> array_transformed = df_to_array(df_array) + >>> array_transformed = to_array(df_array) >>> array_cast = np.array(df_list) the results are not equal: @@ -93,7 +93,7 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array: ... 'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]], ... 'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]] ... }) - >>> df_to_array(df_nested) + >>> to_array(df_nested) array([[[[0, 0], [1, 1]], @@ -111,12 +111,14 @@ def df_to_array(df: Union[pd.DataFrame, pd.Series]) -> np.array: a flattening version of this (ATTENTION: when using the flattening version, information about which entry belongs to which condition is lost): """ + if isinstance(arr, np.ndarray): + return arr - _lst = [list(row) for _, row in df.iterrows()] + _lst = [list(row) for _, row in arr.iterrows()] return np.array(_lst) -def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array: +def to_array_flatten(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray: """ Flattens elements in a pandas DataFrame to resolve shape inconsistencies. @@ -131,11 +133,18 @@ def df_to_array_flatten(df: Union[pd.DataFrame, pd.Series]) -> np.array: ... 'x_0': [0, 2, 4], ... 'x_1': [[1, 1], [3, 3], [5, 5]] ... }) - >>> df_to_array_flatten(df_inconsistent) + >>> to_array_flatten(df_inconsistent) array([[0, 1, 1], [2, 3, 3], [4, 5, 5]]) """ + if isinstance(arr, np.ndarray): + return arr return np.array( - [np.concatenate([np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row]) - for _, row in df.iterrows()]) + [ + np.concatenate( + [np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row] + ) + for _, row in arr.iterrows() + ] + )