diff --git a/src/autora/experimentalist/inequality/__init__.py b/src/autora/experimentalist/inequality/__init__.py index 835c54d..b37eccc 100644 --- a/src/autora/experimentalist/inequality/__init__.py +++ b/src/autora/experimentalist/inequality/__init__.py @@ -1,6 +1,7 @@ -from typing import Iterable, Literal +from typing import Literal, Union import numpy as np +import pandas as pd from sklearn.metrics import DistanceMetric from autora.utils.deprecation import deprecated_alias @@ -30,8 +31,8 @@ def sample( - condition_pool: np.ndarray, - reference_conditions: np.ndarray, + conditions: Union[pd.DataFrame, np.ndarray], + reference_conditions: Union[pd.DataFrame, np.ndarray], num_samples: int = 1, equality_distance: float = 0, metric: str = "euclidean", @@ -43,7 +44,7 @@ def sample( into reference_conditions and are included in the summed equality calculation. Args: - condition_pool: pool of IV conditions to evaluate inequality + conditions: pool of IV conditions to evaluate inequality reference_conditions: reference pool of IV conditions num_samples: number of samples to select equality_distance: the distance to decide if two data points are equal. @@ -58,71 +59,79 @@ def sample( Examples: The value 1 is not in the reference. Therefore it is choosen. - >>> summed_inequality_sampler([1, 2, 3], [2, 3, 4]) + >>> summed_inequality_sample([1, 2, 3], [2, 3, 4]) array([[1]]) The equality distance is set to 0.4. 1 and 1.3 are considered equal, so are 3 and 3.1. Therefore 2 is choosen. - >>> summed_inequality_sampler([1, 2, 3], [1.3, 2.7, 3.1], 1, .4) + >>> summed_inequality_sample([1, 2, 3], [1.3, 2.7, 3.1], 1, .4) array([[2]]) The value 3 appears least often in the reference. - >>> summed_inequality_sampler([1, 2, 3], [1, 1, 1, 2, 2, 2, 3, 3]) + >>> summed_inequality_sample([1, 2, 3], [1, 1, 1, 2, 2, 2, 3, 3]) array([[3]]) The experimentalist "fills up" the reference array so the values are contributed evenly - >>> summed_inequality_sampler([1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 1, 2, 2, 2, 2, 3, 3, 3], 3) + >>> summed_inequality_sample([1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 1, 2, 2, 2, 2, 3, 3, 3], 3) array([[1], [3], [1]]) The experimentalist samples without replacemnt! - >>> summed_inequality_sampler([1, 2, 3], [1, 1, 1], 3) + >>> summed_inequality_sample([1, 2, 3], [1, 1, 1], 3) array([[3], [2], [1]]) """ - if isinstance(condition_pool, Iterable): - condition_pool = np.array(list(condition_pool)) + X = np.array(conditions) - if isinstance(reference_conditions, Iterable): - reference_conditions = np.array(list(reference_conditions)) + _reference_conditions = reference_conditions.copy() + if isinstance(reference_conditions, pd.DataFrame): + if set(conditions.columns) != set(reference_conditions.columns): + raise Exception( + f"Variable names {set(conditions.columns)} in conditions" + f"and {set(reference_conditions.columns)} in allowed values don't match. " + ) - if condition_pool.ndim == 1: - condition_pool = condition_pool.reshape(-1, 1) + _reference_conditions = _reference_conditions[conditions.columns] - if reference_conditions.ndim == 1: - reference_conditions = reference_conditions.reshape(-1, 1) + X_reference_conditions = np.array(_reference_conditions) - if condition_pool.shape[1] != reference_conditions.shape[1]: + if X.ndim == 1: + X = X.reshape(-1, 1) + + if X_reference_conditions.ndim == 1: + X_reference_conditions = X_reference_conditions.reshape(-1, 1) + + if X.shape[1] != X_reference_conditions.shape[1]: raise ValueError( - f"condition_pool and reference_conditions must have the same number of columns.\n" - f"condition_pool has {condition_pool.shape[1]} columns, " - f"while reference_conditions has {reference_conditions.shape[1]} columns." + f"conditions and reference_conditions must have the same number of columns.\n" + f"conditions has {X.shape[1]} columns, " + f"while reference_conditions has {X_reference_conditions.shape[1]} columns." ) - if condition_pool.shape[0] < num_samples: + if X.shape[0] < num_samples: raise ValueError( - f"condition_pool must have at least {num_samples} rows matching the number " + f"conditions must have at least {num_samples} rows matching the number " f"of requested samples." ) dist = DistanceMetric.get_metric(metric) - # create a list to store the n condition_pool values with the highest inequality scores + # create a list to store the n conditions values with the highest inequality scores condition_pool_res = [] # choose the canditate with the highest inequality score n-times for _ in range(num_samples): summed_equalities = [] # loop over all IV values - for row in condition_pool: + for row in X: # calculate the distances between the current row in matrix1 # and all other rows in matrix2 summed_equality = 0 - for reference_conditions_row in reference_conditions: + for reference_conditions_row in X_reference_conditions: distance = dist.pairwise([row, reference_conditions_row])[0, 1] summed_equality += distance > equality_distance @@ -130,17 +139,18 @@ def sample( summed_equalities.append(summed_equality) # sort the rows in matrix1 by their summed distances - condition_pool = condition_pool[np.argsort(summed_equalities)[::-1]] + X = X[np.argsort(summed_equalities)[::-1]] # append the first value of the sorted list to the result - condition_pool_res.append(condition_pool[0]) + condition_pool_res.append(X[0]) # add the chosen value to reference_conditions - reference_conditions = np.append( - reference_conditions, [condition_pool[0]], axis=0 - ) - # remove the chosen value from condition_pool - condition_pool = condition_pool[1:] - - return np.array(condition_pool_res[:num_samples]) + X_reference_conditions = np.append(X_reference_conditions, [X[0]], axis=0) + # remove the chosen value from X + X = X[1:] + + new_conditions = np.array(condition_pool_res[:num_samples]) + if isinstance(conditions, pd.DataFrame): + new_conditions = pd.DataFrame(new_conditions, columns=conditions.columns) + return new_conditions summed_inequality_sample = sample