# Reconstructed from a whitespace-collapsed patch that adds two new modules:
#   * src/autora/utils/metrics.py    -> norms, distances
#   * src/autora/utils/transform.py  -> to_array, to_array_flatten
from typing import Union

import numpy as np
import pandas as pd


# ---------------------------------------------------------------------------
# src/autora/utils/metrics.py
# ---------------------------------------------------------------------------


def norms(arr: np.ndarray) -> np.ndarray:
    """
    Calculate the Euclidean norm of every entry along the first axis.

    Each entry ``arr[i]`` is flattened before its norm is taken, so the
    entries may have any number of dimensions.

    Args:
        arr: the array whose first axis enumerates the entries

    Returns:
        a one dimensional array holding one norm per entry of ``arr``

    Examples:
        >>> import pandas as pd
        >>> from autora.utils.transform import to_array

        Simple dataframe with one condition
        >>> df = pd.DataFrame({'x_0': [.2, 2, 3]})

        First transform:
        >>> as_array = to_array(df)
        >>> norms(as_array)
        array([0.2, 2. , 3. ])

        >>> df_two_dim = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
        >>> as_array = to_array(df_two_dim)
        >>> norms(as_array)
        array([1., 1., 5.])

        For nested dataframes
        >>> df_nested = pd.DataFrame({
        ...     'x_0': [[0, 0], [0, 1], [1, 0], [3, 4]]
        ... })
        >>> as_array = to_array(df_nested)
        >>> norms(as_array)
        array([0., 1., 1., 5.])

        ... and deeply nested
        >>> df_nested_deep = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 4]]]
        ... })
        >>> as_array = to_array(df_nested_deep)
        >>> norms(as_array)
        array([1., 5.])

        ... no matter how many columns
        >>> df_nested_deep_multi_column = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> as_array = to_array(df_nested_deep_multi_column)
        >>> norms(as_array)
        array([5., 1.])
    """
    # Flatten each entry so the norm is the plain vector 2-norm regardless of
    # how deeply the entry is nested.
    return np.array([np.linalg.norm(np.ravel(row)) for row in arr])


def distances(arr_1: np.ndarray, arr_2: np.ndarray) -> np.ndarray:
    """
    Calculate the Euclidean distance between two arrays, no matter their dimension,
    along the first axis.

    Args:
        arr_1: the first array; its first axis enumerates the entries
        arr_2: the second array; must have the same shape as ``arr_1``

    Returns:
        a one dimensional array holding the distance between ``arr_1[i]`` and
        ``arr_2[i]`` for every index ``i`` of the first axis

    Raises:
        AssertionError: if the two arrays do not have the same shape

    Examples:
        >>> import pandas as pd
        >>> from autora.utils.transform import to_array

        Simple dataframe with one condition
        >>> df_1 = pd.DataFrame({'x_0': [0, 1, 2]})
        >>> df_2 = pd.DataFrame({'x_0': [1, 2, 3]})

        First transform:
        >>> as_array_1 = to_array(df_1)
        >>> as_array_2 = to_array(df_2)
        >>> distances(as_array_1, as_array_2)
        array([1., 1., 1.])

        >>> df_two_dim_1 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 0, 4]})
        >>> df_two_dim_2 = pd.DataFrame({'x_0': [0, 1, 3], 'x_1': [1, 1, 4]})
        >>> as_array_1 = to_array(df_two_dim_1)
        >>> as_array_2 = to_array(df_two_dim_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 1., 0.])

        For nested dataframes
        >>> df_nested_1 = pd.DataFrame({
        ...     'x_0': [[0, 0], [0, 2], [0, 2], [0, 10], [4, 0]]
        ... })
        >>> df_nested_2 = pd.DataFrame({
        ...     'x_0': [[1, 0], [0, 0], [0, 5], [0, 6], [0, 3]]
        ... })
        >>> as_array_1 = to_array(df_nested_1)
        >>> as_array_2 = to_array(df_nested_2)
        >>> distances(as_array_1, as_array_2)
        array([1., 2., 3., 4., 5.])

        ... and deeply nested
        >>> df_nested_deep_1 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[6, 0], [0, 10]]]
        ... })
        >>> df_nested_deep_2 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 1]], [[3, 0], [0, 6]]]
        ... })
        >>> as_array_1 = to_array(df_nested_deep_1)
        >>> as_array_2 = to_array(df_nested_deep_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 5.])

        ... no matter how many columns
        >>> df_nested_deep_multi_column_1 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> df_nested_deep_multi_column_2 = pd.DataFrame({
        ...     'x_0': [[[0, 0], [0, 4]], [[1, 0], [0, 0]]],
        ...     'x_1': [[[0, 3], [0, 0]], [[0, 0], [0, 0]]]
        ... })
        >>> as_array_1 = to_array(df_nested_deep_multi_column_1)
        >>> as_array_2 = to_array(df_nested_deep_multi_column_2)
        >>> distances(as_array_1, as_array_2)
        array([0., 0.])

    """
    # Check that the two arrays have the same shape. The exception is raised
    # explicitly (instead of via an ``assert`` statement) so that the check is
    # not stripped when Python runs with -O; the exception type and message
    # are the same as before.
    if arr_1.shape != arr_2.shape:
        raise AssertionError("Arrays must have the same shape")

    # For each pair of entries, sum the squared element-wise differences of
    # the flattened entries; the square root is applied once at the end.
    return np.sqrt(
        np.array(
            [np.sum((np.ravel(a) - np.ravel(b)) ** 2) for a, b in zip(arr_1, arr_2)]
        )
    )


# ---------------------------------------------------------------------------
# src/autora/utils/transform.py
# ---------------------------------------------------------------------------


def to_array(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
    """
    Transforms a pandas data frame to a numpy array
    Args:
        arr: the pandas data frame. A numpy array is returned unchanged and a
            pandas Series is treated as a data frame with a single column.

    Returns:
        a numpy array

    Examples:
        Same result as np.array(df) if rows of df are one dimensional:
        >>> df_one = pd.DataFrame({
        ...     'x_0': [1, 2, 3],
        ...     'x_1': [4, 5, 6],
        ...     'x_2': [7, 8, 9]})
        >>> np.array_equal(np.array(df_one), to_array(df_one))
        True

        If the rows contain lists ...
        >>> df_list = pd.DataFrame({
        ...     'x_0': [[0, 0], [1, 0], [2, 0]],
        ...     'x_1': [[0, 1], [1, 1], [2, 1]],
        ...     'x_2': [[0, 2], [1, 2], [2, 2]]
        ... })
        >>> array_transformed = to_array(df_list)
        >>> array_cast = np.array(df_list)

        the results are not equal:
        >>> np.array_equal(array_transformed, array_cast)
        False

        The cast array contains objects which are hard to work with:
        >>> array_cast
        array([[list([0, 0]), list([0, 1]), list([0, 2])],
               [list([1, 0]), list([1, 1]), list([1, 2])],
               [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object)

        The transformed array contains vectors (numbers):
        >>> array_transformed
        array([[[0, 0],
                [0, 1],
                [0, 2]],

               [[1, 0],
                [1, 1],
                [1, 2]],

               [[2, 0],
                [2, 1],
                [2, 2]]])

        ... the same is true for arrays:
        >>> df_array = pd.DataFrame({
        ...     'x_0': [np.array([0, 0]), np.array([1, 0]), np.array([2, 0])],
        ...     'x_1': [np.array([0, 1]), np.array([1, 1]), np.array([2, 1])],
        ...     'x_2': [np.array([0, 2]), np.array([1, 2]), np.array([2, 2])]
        ... })
        >>> array_transformed = to_array(df_array)
        >>> array_cast = np.array(df_list)

        the results are not equal:
        >>> np.array_equal(array_transformed, array_cast)
        False

        The cast array contains objects which are hard to work with:
        >>> array_cast
        array([[list([0, 0]), list([0, 1]), list([0, 2])],
               [list([1, 0]), list([1, 1]), list([1, 2])],
               [list([2, 0]), list([2, 1]), list([2, 2])]], dtype=object)

        The transformed array contains vectors (numbers):
        >>> array_transformed
        array([[[0, 0],
                [0, 1],
                [0, 2]],

               [[1, 0],
                [1, 1],
                [1, 2]],

               [[2, 0],
                [2, 1],
                [2, 2]]])

        # This also works with more nesting:
        >>> df_nested = pd.DataFrame({
        ...     'x_0': [[[0,0],[1,1]], [[0,0],[2,2]]],
        ...     'x_1': [[[1,1],[1,1]], [[1,1],[2,2]]]
        ... })
        >>> to_array(df_nested)
        array([[[[0, 0],
                 [1, 1]],

                [[1, 1],
                 [1, 1]]],


               [[[0, 0],
                 [2, 2]],

                [[1, 1],
                 [2, 2]]]])

        A Series is handled like a data frame with one column:
        >>> to_array(pd.Series([1, 2, 3]))
        array([[1],
               [2],
               [3]])

        When the inner lists don't have the same shape, an error is thrown and one can use
        the flattening version of this function, `to_array_flatten` (ATTENTION: when using
        the flattening version, information about which entry belongs to which condition
        is lost).
    """
    if isinstance(arr, np.ndarray):
        return arr

    # A Series has no ``iterrows``; promote it to a one-column frame so that
    # it is processed row by row exactly like a DataFrame.
    if isinstance(arr, pd.Series):
        arr = arr.to_frame()

    # Building the array from per-row lists lets numpy unpack nested lists and
    # arrays into proper numeric dimensions instead of an object array.
    _lst = [list(row) for _, row in arr.iterrows()]
    return np.array(_lst)


def to_array_flatten(arr: Union[pd.DataFrame, pd.Series, np.ndarray]) -> np.ndarray:
    """
    Flattens elements in a pandas DataFrame to resolve shape inconsistencies.

    Args:
        arr: A pandas DataFrame or Series with inconsistent element shapes.
            A numpy array is returned unchanged and a Series is treated as a
            data frame with a single column.

    Returns:
        A numpy array where all elements are flattened.

    Example:
        >>> df_inconsistent = pd.DataFrame({
        ...     'x_0': [0, 2, 4],
        ...     'x_1': [[1, 1], [3, 3], [5, 5]]
        ... })
        >>> to_array_flatten(df_inconsistent)
        array([[0, 1, 1],
               [2, 3, 3],
               [4, 5, 5]])
    """
    if isinstance(arr, np.ndarray):
        return arr

    # A Series has no ``iterrows``; promote it to a one-column frame so that
    # it is processed row by row exactly like a DataFrame.
    if isinstance(arr, pd.Series):
        arr = arr.to_frame()

    # Within each row, lists and arrays are flattened and every other value is
    # wrapped in a one-element list, so the pieces concatenate into one vector.
    return np.array(
        [
            np.concatenate(
                [np.ravel(x) if isinstance(x, (list, np.ndarray)) else [x] for x in row]
            )
            for _, row in arr.iterrows()
        ]
    )