diff --git a/setup.cfg b/setup.cfg index 0dab5eb3..ec8b9978 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ install_requires = matplotlib>=2.1.0 seaborn>=0.11.1 dcor>=0.5.3 - pyemd==0.5.1 + synthesized-insight>=0.3 [options.packages.find] where = src diff --git a/src/fairlens/metrics/distance.py b/src/fairlens/metrics/distance.py index ccdef798..04759014 100644 --- a/src/fairlens/metrics/distance.py +++ b/src/fairlens/metrics/distance.py @@ -8,12 +8,13 @@ import numpy as np import pandas as pd -import pyemd -from scipy.spatial.distance import jensenshannon -from scipy.stats import entropy, kruskal, ks_2samp +from scipy.stats import wasserstein_distance +from synthesized_insight.metrics import HellingerDistance as HD +from synthesized_insight.metrics import JensenShannonDivergence as JSD +from synthesized_insight.metrics import KolmogorovSmirnovDistanceTest, KruskalWallisTest +from synthesized_insight.metrics import KullbackLeiblerDivergence as KLD from .. import utils -from ..metrics import significance as pv class DistanceMetric(ABC): @@ -47,9 +48,9 @@ def __call__(self, x: pd.Series, y: pd.Series) -> Optional[float]: Args: x (pd.Series): - The data in the column representing the first group. + The data in the first sample. y (pd.Series): - The data in the column representing the second group. + The data in the second sample. Returns: Optional[float]: @@ -67,9 +68,9 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool: Args: x (pd.Series): - The data in the column representing the first group. + The data in the first sample. y (pd.Series): - The data in the column representing the second group. + The data in the second sample. Returns: bool: @@ -79,13 +80,13 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool: @abstractmethod def distance(self, x: pd.Series, y: pd.Series) -> float: - """Distance between the distribution of numerical data in x and y. Derived classes must implement this. + """Distance between the distributions in x and y. Derived classes must implement this. Args: x (pd.Series): - Numerical data in a column. + The data in the first sample. y (pd.Series): - Numerical data in a column. + The data in the second sample. Returns: float: @@ -93,22 +94,6 @@ def distance(self, x: pd.Series, y: pd.Series) -> float: """ ... - def p_value(self, x: pd.Series, y: pd.Series) -> float: - """Returns a p-value for the test that x and y are sampled from the same distribution. - - Args: - x (pd.Series): - Numerical data in a column. - y (pd.Series): - Numerical data in a column. - - Returns: - float: - The computed p-value. - """ - - raise NotImplementedError() - @property @abstractmethod def id(self) -> str: @@ -130,103 +115,37 @@ class ContinuousDistanceMetric(DistanceMetric): Subclasses must implement a distance method. """ - def __init__(self, p_value_test="bootstrap"): - """Initialize continuous distance metric. - - Args: - p_value_test (str, optional): - Choose which method of resampling will be used to compute the p-value. Overidden by metrics - such as Kolmogrov Smirnov Distance. - Defaults to "permutation". - """ - - self.p_value_test = p_value_test - def check_input(self, x: pd.Series, y: pd.Series) -> bool: x_dtype = utils.infer_dtype(x).dtype y_dtype = utils.infer_dtype(y).dtype return x_dtype in ["int64", "float64"] and y_dtype in ["int64", "float64"] - def p_value(self, x: pd.Series, y: pd.Series) -> float: - if self.p_value_test == "permutation": - ts_distribution = pv.permutation_statistic(x, y, self.distance, n_perm=100) - elif self.p_value_test == "bootstrap": - ts_distribution = pv.bootstrap_statistic(x, y, self.distance, n_samples=1000) - else: - raise ValueError('p_value_test must be one of ["permutation", "bootstrap"]') - - return pv.resampling_p_value(self.distance(x, y), ts_distribution) - class CategoricalDistanceMetric(DistanceMetric): """ Base class for distance metrics on categorical data. - Continuous data is automatically binned to create histograms, bin edges can be provided as an argument - and will be used to bin continous data. If the data has been pre-binned and consists of pd.Intervals - for instance, the histograms will be computed using the counts of each bin, and the bin_edges, if given, - will be used in metrics such as EarthMoversDistanceCategorical to compute the distance space. - - Subclasses must implement a distance_pdf method. + Subclasses must implement a distance method. """ - def __init__(self, bin_edges: Optional[np.ndarray] = None): - """Initialize categorical distance metric. - - Args: - bin_edges (Optional[np.ndarray], optional): - A numpy array of bin edges used to bin continuous data or to indicate bins of pre-binned data - to metrics which take the distance space into account. - i.e. For bins [0-5, 5-10, 10-15, 15-20], bin_edges would be [0, 5, 10, 15, 20]. - See numpy.histogram_bin_edges() for more information. - """ - - self.bin_edges = bin_edges - def check_input(self, x: pd.Series, y: pd.Series) -> bool: x_dtype = utils.infer_dtype(x).dtype y_dtype = utils.infer_dtype(y).dtype return x_dtype == y_dtype - def distance(self, x: pd.Series, y: pd.Series) -> float: - (p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True) - - return self.distance_pdf(p, q, bin_edges) - @abstractmethod - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - """Distance between 2 aligned normalized histograms. Derived classes must implement this. - - Args: - p (pd.Series): - A normalized histogram. - q (pd.Series): - A normalized histogram. - bin_edges (Optional[np.ndarray]): - bin_edges for binned continuous data. Used by metrics such as Earth Mover's Distance to compute the - distance metric space. - - Returns: - float: - The computed distance. - """ - ... - - def p_value(self, x: pd.Series, y: pd.Series) -> float: - (h_x, h_y), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, normalize=False, ret_bins=True) - - def distance_call(h_x, h_y): - with np.errstate(divide="ignore", invalid="ignore"): - p = pd.Series(np.nan_to_num(h_x / h_x.sum())) - q = pd.Series(np.nan_to_num(h_y / h_y.sum())) - - return self.distance_pdf(p, q, bin_edges) +class BinaryDistanceMetric(DistanceMetric): + """ + Base class for distance metrics on binary data. - ts_distribution = pv.bootstrap_binned_statistic(h_x, h_y, distance_call, n_samples=100) + Subclasses must implement a distance method. + """ - return pv.resampling_p_value(distance_call(h_x, h_y), ts_distribution) + def check_input(self, x: pd.Series, y: pd.Series) -> bool: + joint = pd.concat((x, y)) + return utils.infer_distr_type(joint).is_binary() and (np.sort(joint.unique()) == [0, 1]).all() class MeanDistance(ContinuousDistanceMetric): @@ -242,11 +161,11 @@ def id(self) -> str: return "mean" -class BinomialDistance(ContinuousDistanceMetric): +class BinomialDistance(BinaryDistanceMetric): """ Difference distance between two binary data samples. i.e p_x - p_y, where p_x, p_y are the probabilities of success in x and y, respectively. - The p-value computed is for the null hypothesis is that the probability of success is p_y. + Data is assumed to be a series of 1, 0 (success, failure) Bernoulli random variates. """ @@ -256,13 +175,6 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool: def distance(self, x: pd.Series, y: pd.Series) -> float: return x.mean() - y.mean() - def p_value(self, x: pd.Series, y: pd.Series) -> float: - p_obs = x.mean() - p_null = y.mean() - n = len(x) - - return pv.binominal_proportion_p_value(p_obs, p_null, n) - @property def id(self) -> str: return "binomial" @@ -274,10 +186,7 @@ class KolmogorovSmirnovDistance(ContinuousDistanceMetric): """ def distance(self, x: pd.Series, y: pd.Series) -> float: - return ks_2samp(x, y)[0] - - def p_value(self, x: pd.Series, y: pd.Series) -> float: - return ks_2samp(x, y)[1] + return KolmogorovSmirnovDistanceTest()._compute_test(x, y)[0] @property def id(self) -> str: @@ -285,11 +194,12 @@ def id(self) -> str: class KruskalWallis(ContinuousDistanceMetric): - def distance(self, x: pd.Series, y: pd.Series) -> float: - return kruskal(x, y)[0] + """ + Kruskal Wallis H test between two data samples. + """ - def p_value(self, x: pd.Series, y: pd.Series) -> float: - return kruskal(x, y)[1] + def distance(self, x: pd.Series, y: pd.Series) -> float: + return KruskalWallisTest()._compute_test(x, y)[0] @property def id(self) -> str: @@ -299,24 +209,27 @@ def id(self) -> str: class EarthMoversDistance(CategoricalDistanceMetric): """ Earth movers distance (EMD), aka Wasserstein 1-distance, for categorical data. - - Using EarthMoversDistance on the raw data is faster and recommended. """ - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - distance_matrix = 1 - np.eye(len(p)) + def __init__(self, bin_edges: Optional[np.ndarray] = None): + """ + Args: + bin_edges (Optional[np.ndarray], optional): + A list of bin edges used to bin continuous data by or to indicate bins of pre-binned data. + Defaults to None. + """ + + self.bin_edges = bin_edges - if bin_edges is not None: - # Use pair-wise euclidean distances between bin centers for scale data - bin_centers = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0) - xx, yy = np.meshgrid(bin_centers, bin_centers) - distance_matrix = np.abs(xx - yy) + def distance(self, x: pd.Series, y: pd.Series) -> float: + (p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True) - p = np.array(p).astype(np.float64) - q = np.array(q).astype(np.float64) - distance_matrix = distance_matrix.astype(np.float64) + if bin_edges is None: + bin_centers = np.arange(len(p)) + else: + bin_centers = (np.array(bin_edges[:-1]) + np.array(bin_edges[1:])) / 2 - return pyemd.emd(p, q, distance_matrix) + return wasserstein_distance(bin_centers, bin_centers, u_weights=p, v_weights=q) @property def id(self) -> str: @@ -328,8 +241,8 @@ class KullbackLeiblerDivergence(CategoricalDistanceMetric): Kullback–Leibler Divergence or Relative Entropy between two probability distributions. """ - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - return entropy(np.array(p), np.array(q)) + def distance(self, x: pd.Series, y: pd.Series) -> float: + return KLD()._compute_metric(x, y) @property def id(self) -> str: @@ -341,8 +254,8 @@ class JensenShannonDivergence(CategoricalDistanceMetric): Jensen-Shannon Divergence between two probability distributions. """ - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - return jensenshannon(p, q) + def distance(self, x: pd.Series, y: pd.Series) -> float: + return JSD()._compute_metric(x, y) @property def id(self) -> str: @@ -351,24 +264,21 @@ def id(self) -> str: class Norm(CategoricalDistanceMetric): """ - LP Norm between two probability distributions. + L-P Norm between two probability distributions. """ - def __init__(self, bin_edges: Optional[np.ndarray] = None, ord: Union[str, int] = 2): + def __init__(self, ord: Union[str, int] = 2): """ Args: - bin_edges (Optional[np.ndarray], optional): - A list of bin edges used to bin continuous data by or to indicate bins of pre-binned data. - Defaults to None. ord (Union[str, int], optional): The order of the norm. Possible values include positive numbers, 'fro', 'nuc'. See numpy.linalg.norm for more details. Defaults to 2. """ - super().__init__(bin_edges=bin_edges) self.ord = ord - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: + def distance(self, x: pd.Series, y: pd.Series) -> float: + (p, q), _ = utils.zipped_hist((x, y), ret_bins=True) return np.linalg.norm(p - q, ord=self.ord) @property @@ -381,8 +291,8 @@ class HellingerDistance(CategoricalDistanceMetric): Hellinger distance between two probability distributions. """ - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - return np.linalg.norm(np.sqrt(p) - np.sqrt(q)) / np.sqrt(2) + def distance(self, x: pd.Series, y: pd.Series) -> float: + return HD()._compute_metric(x, y) @property def id(self) -> str: diff --git a/src/fairlens/metrics/unified.py b/src/fairlens/metrics/unified.py index ce49c459..ecfc37f0 100644 --- a/src/fairlens/metrics/unified.py +++ b/src/fairlens/metrics/unified.py @@ -3,34 +3,30 @@ """ import multiprocessing as mp -from typing import Any, Callable, List, Mapping, Optional, Tuple, Type, Union +from collections import namedtuple +from typing import Any, Callable, List, Mapping, Optional, Tuple, Union import pandas as pd +from synthesized_insight.check import Check, ColumnCheck +from synthesized_insight.metrics.base import TwoColumnMetric +from synthesized_insight.metrics.statistical_tests import BootstrapTest, KolmogorovSmirnovDistanceTest, PermutationTest from .. import utils from .correlation import cramers_v, kruskal_wallis, pearson from .distance import BinomialDistance, DistanceMetric, EarthMoversDistance, KolmogorovSmirnovDistance +DistanceResult = namedtuple("DistanceResult", ("distance", "p_value")) -def auto_distance(column: pd.Series) -> Type[DistanceMetric]: - """Return a suitable statistical distance metric based on the distribution of the data. - Args: - column (pd.Series): - The input data in a pd.Series. +class _OptimisticCheck(ColumnCheck): + def continuous(self, sr: pd.Series) -> bool: + return True - Returns: - Type[DistanceMetric]: - The class of the distance metric. - """ + def categorical(self, sr: pd.Series) -> bool: + return True - distr_type = utils.infer_distr_type(column) - if distr_type.is_continuous(): - return KolmogorovSmirnovDistance - elif distr_type.is_binary(): - return BinomialDistance - return EarthMoversDistance +check = _OptimisticCheck() def stat_distance( @@ -39,9 +35,10 @@ def stat_distance( group1: Union[Mapping[str, List[Any]], pd.Series], group2: Union[Mapping[str, List[Any]], pd.Series], mode: str = "auto", - p_value: bool = False, + test: str = "default", + alternative: str = "two-sided", **kwargs, -) -> Tuple[float, ...]: +) -> Tuple[float, Optional[float]]: """Computes the statistical distance between two probability distributions ie. group 1 and group 2, with respect to the target attribute. The distance metric can be chosen through the mode parameter. If mode is set to "auto", the most suitable metric depending on the target attributes' distribution is chosen. @@ -63,18 +60,23 @@ def stat_distance( a predicate itself, i.e. pandas series consisting of bools which can be used as a predicate to index a subgroup from the dataframe. Examples: {"Sex": ["Male"]}, df["Sex"] == "Female" - mode (str): + mode (str, optional): Which distance metric to use. Can be the names of classes from `fairlens.metrics`, or their id() strings. If set to "auto", the method automatically picks a suitable metric based on the distribution of the target attribute. Defaults to "auto". - p_value (bool): - Returns the a suitable p-value for the metric if it exists. Defaults to False. + test (str, optional): + The statistical test to use to compute the p-value. Can take values + ["default", "bootstrap", "permutation"]. If set to "default", returns any p-value + for the metric specified by `mode`. Defaults to "default" + alternative (str, optional): + The alternative hypothesis for the test. Can take values ["two-sided", "greater", "less"]. + Defaults to "two-sided". **kwargs: Keyword arguments for the distance metric. Passed to the __init__ function of distance metrics. Returns: - Tuple[float, ...]: - The distance as a float, and the p-value if p_value is set to True and can be computed. + Tuple[float, Optional[float]]: + The distance as a float, the p-value as a float. Examples: >>> df = pd.read_csv("datasets/compas.csv") @@ -87,30 +89,65 @@ def stat_distance( (0.0816143577815524, 0.02693435054772131) """ + if test not in ["default", "bootstrap", "permutation"]: + raise ValueError('Invalid value for test. Valid values: ["default", "bootstrap", "permutation"]') + + if alternative not in ["two-sided", "greater", "less"]: + raise ValueError('Invalid value for alternative. Valid values: ["two-sided", "greater", "less"]') + # Parse group arguments into pandas series' pred1, pred2 = tuple(utils.get_predicates_mult(df, [group1, group2])) group1 = df[pred1][target_attr] group2 = df[pred2][target_attr] - # Choose the distance metric + # Choose the metric if mode == "auto": - dist_class = auto_distance(df[target_attr]) - elif mode in DistanceMetric._class_dict: - dist_class = DistanceMetric._class_dict[mode] - else: - raise ValueError(f"Invalid mode. Valid modes include:\n{DistanceMetric._class_dict.keys()}") + mode = auto_distance(df[target_attr]) + + if mode not in DistanceMetric._class_dict: + raise ValueError(f"Invalid value for mode. Valid values:\n{DistanceMetric._class_dict.keys()}") + + dist_class = DistanceMetric._class_dict[mode] + distance_metric = dist_class(**kwargs) + metric = _MetricAdaptor(distance_metric) - metric = dist_class(**kwargs) - d = metric(group1, group2) + d, p = None, None + + if test == "bootstrap": + d, p = BootstrapTest(metric, alternative=alternative, check=check)(group1, group2) + elif test == "permutation": + d, p = PermutationTest(metric, alternative=alternative, check=check)(group1, group2) + elif test == "default" and mode == "ks_distance": + d, p = KolmogorovSmirnovDistanceTest(alternative=alternative, check=check)(group1, group2) + else: + d = distance_metric(group1, group2) + p = None if d is None: raise ValueError("Incompatible data inside both series") - if p_value: - p = metric.p_value(group1, group2) - return (d, p) + return DistanceResult(distance=d, p_value=p) - return (d,) + +def auto_distance(column: pd.Series) -> str: + """Return a suitable statistical distance metric based on the distribution of the data. + + Args: + column (pd.Series): + The input data in a pd.Series. + + Returns: + str: + The id of the distance metric. + """ + + distr_type = utils.infer_distr_type(column) + if distr_type.is_continuous(): + return KolmogorovSmirnovDistance._get_id() + elif distr_type.is_binary(): + return BinomialDistance._get_id() + + return EarthMoversDistance._get_id() def correlation_matrix( @@ -192,3 +229,17 @@ def _correlation_matrix_helper( else: return cat_cat_metric(sr_a, sr_b) + + +class _MetricAdaptor(TwoColumnMetric): + def __init__(self, metric: DistanceMetric): + self.metric = metric + + def __call__(self, sr_a: pd.Series, sr_b: pd.Series): + return self.metric(sr_a, sr_b) + + def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check): + return True + + def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series): + return self.metric.distance(sr_a, sr_b) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 35f4098f..f7db5593 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -from pyemd import emd_samples from fairlens.metrics.distance import BinomialDistance from fairlens.metrics.distance import EarthMoversDistance as EMD @@ -43,11 +42,6 @@ def test_stat_distance_auto(): assert stat_distance(df, target_attr, pred1, pred2, mode="ks_distance")[0] == res -def test_auto_binning(): - res = emd_samples(group1, group2) - assert stat_distance(df, target_attr, pred1, pred2, mode="emd")[0] == res - - def test_mean_distance(): assert Mean()(pd.Series(np.arange(100)), pd.Series(np.arange(10))) == 45 diff --git a/tests/test_significance.py b/tests/test_significance.py index a33ae88f..b554e926 100644 --- a/tests/test_significance.py +++ b/tests/test_significance.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from fairlens.metrics.distance import BinomialDistance, MeanDistance +from fairlens.metrics.distance import MeanDistance from fairlens.metrics.significance import binominal_proportion_p_value as bin_prop from fairlens.metrics.significance import bootstrap_binned_statistic as bootstrap_binned from fairlens.metrics.significance import bootstrap_statistic as bootstrap @@ -17,9 +17,9 @@ def test_binomial(): assert abs(bin_prop(0.2, 0.1, 10) - (1 - (0.9 ** 10 + 0.9 ** 9))) < epsilon - assert BinomialDistance().p_value(pd.Series([1, 1]), pd.Series([0, 0])) == 0 - assert BinomialDistance().p_value(pd.Series([1, 0]), pd.Series([1, 0])) == 1 - assert BinomialDistance().p_value(pd.Series([1, 0, 1, 1]), pd.Series([1, 0, 1, 0])) == 0.625 + assert bin_prop(1, 0, 2) == 0 + assert bin_prop(0.5, 0.5, 2) == 1 + assert bin_prop(0.75, 0.5, 4) == 0.625 def test_bootstrap():