From 818e42f8d535f49a573b08e38eb5858a507f776f Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Wed, 15 Sep 2021 08:30:33 +0100 Subject: [PATCH 1/3] integrate insight w distance metrics --- src/fairlens/metrics/distance.py | 202 ++++++++++--------------------- src/fairlens/metrics/unified.py | 75 +++++++----- tests/test_significance.py | 8 +- 3 files changed, 118 insertions(+), 167 deletions(-) diff --git a/src/fairlens/metrics/distance.py b/src/fairlens/metrics/distance.py index ccdef798..7e8c5ca3 100644 --- a/src/fairlens/metrics/distance.py +++ b/src/fairlens/metrics/distance.py @@ -3,20 +3,34 @@ """ import inspect -from abc import ABC, abstractmethod +from abc import abstractmethod from typing import Dict, Optional, Type, Union import numpy as np import pandas as pd import pyemd -from scipy.spatial.distance import jensenshannon -from scipy.stats import entropy, kruskal, ks_2samp +from synthesized_insight import ColumnCheck +from synthesized_insight.metrics import HellingerDistance as HD +from synthesized_insight.metrics import JensenShannonDivergence as JSD +from synthesized_insight.metrics import KolmogorovSmirnovDistanceTest, KruskalWallisTest +from synthesized_insight.metrics import KullbackLeiblerDivergence as KLD +from synthesized_insight.metrics import TwoColumnMetric from .. import utils -from ..metrics import significance as pv -class DistanceMetric(ABC): +class _OptimisticCheck(ColumnCheck): + def continuous(self, sr: pd.Series) -> bool: + return True + + def categorical(self, sr: pd.Series) -> bool: + return True + + +check = _OptimisticCheck() + + +class DistanceMetric(TwoColumnMetric): """ Base class for distance metrics that compare samples from two distributions. @@ -47,9 +61,9 @@ def __call__(self, x: pd.Series, y: pd.Series) -> Optional[float]: Args: x (pd.Series): - The data in the column representing the first group. + The data in the first sample. y (pd.Series): - The data in the column representing the second group. + The data in the second sample. Returns: Optional[float]: @@ -67,9 +81,9 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool: Args: x (pd.Series): - The data in the column representing the first group. + The data in the first sample. y (pd.Series): - The data in the column representing the second group. + The data in the second sample. Returns: bool: @@ -79,13 +93,13 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool: @abstractmethod def distance(self, x: pd.Series, y: pd.Series) -> float: - """Distance between the distribution of numerical data in x and y. Derived classes must implement this. + """Distance between the distributions in x and y. Derived classes must implement this. Args: x (pd.Series): - Numerical data in a column. + The data in the first sample. y (pd.Series): - Numerical data in a column. + The data in the second sample. Returns: float: @@ -93,22 +107,6 @@ def distance(self, x: pd.Series, y: pd.Series) -> float: """ ... - def p_value(self, x: pd.Series, y: pd.Series) -> float: - """Returns a p-value for the test that x and y are sampled from the same distribution. - - Args: - x (pd.Series): - Numerical data in a column. - y (pd.Series): - Numerical data in a column. - - Returns: - float: - The computed p-value. - """ - - raise NotImplementedError() - @property @abstractmethod def id(self) -> str: @@ -130,103 +128,37 @@ class ContinuousDistanceMetric(DistanceMetric): Subclasses must implement a distance method. """ - def __init__(self, p_value_test="bootstrap"): - """Initialize continuous distance metric. - - Args: - p_value_test (str, optional): - Choose which method of resampling will be used to compute the p-value. Overidden by metrics - such as Kolmogrov Smirnov Distance. - Defaults to "permutation". - """ - - self.p_value_test = p_value_test - def check_input(self, x: pd.Series, y: pd.Series) -> bool: x_dtype = utils.infer_dtype(x).dtype y_dtype = utils.infer_dtype(y).dtype return x_dtype in ["int64", "float64"] and y_dtype in ["int64", "float64"] - def p_value(self, x: pd.Series, y: pd.Series) -> float: - if self.p_value_test == "permutation": - ts_distribution = pv.permutation_statistic(x, y, self.distance, n_perm=100) - elif self.p_value_test == "bootstrap": - ts_distribution = pv.bootstrap_statistic(x, y, self.distance, n_samples=1000) - else: - raise ValueError('p_value_test must be one of ["permutation", "bootstrap"]') - - return pv.resampling_p_value(self.distance(x, y), ts_distribution) - class CategoricalDistanceMetric(DistanceMetric): """ Base class for distance metrics on categorical data. - Continuous data is automatically binned to create histograms, bin edges can be provided as an argument - and will be used to bin continous data. If the data has been pre-binned and consists of pd.Intervals - for instance, the histograms will be computed using the counts of each bin, and the bin_edges, if given, - will be used in metrics such as EarthMoversDistanceCategorical to compute the distance space. - - Subclasses must implement a distance_pdf method. + Subclasses must implement a distance method. """ - def __init__(self, bin_edges: Optional[np.ndarray] = None): - """Initialize categorical distance metric. - - Args: - bin_edges (Optional[np.ndarray], optional): - A numpy array of bin edges used to bin continuous data or to indicate bins of pre-binned data - to metrics which take the distance space into account. - i.e. For bins [0-5, 5-10, 10-15, 15-20], bin_edges would be [0, 5, 10, 15, 20]. - See numpy.histogram_bin_edges() for more information. - """ - - self.bin_edges = bin_edges - def check_input(self, x: pd.Series, y: pd.Series) -> bool: x_dtype = utils.infer_dtype(x).dtype y_dtype = utils.infer_dtype(y).dtype return x_dtype == y_dtype - def distance(self, x: pd.Series, y: pd.Series) -> float: - (p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True) - - return self.distance_pdf(p, q, bin_edges) - - @abstractmethod - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - """Distance between 2 aligned normalized histograms. Derived classes must implement this. - - Args: - p (pd.Series): - A normalized histogram. - q (pd.Series): - A normalized histogram. - bin_edges (Optional[np.ndarray]): - bin_edges for binned continuous data. Used by metrics such as Earth Mover's Distance to compute the - distance metric space. - - Returns: - float: - The computed distance. - """ - ... - def p_value(self, x: pd.Series, y: pd.Series) -> float: - (h_x, h_y), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, normalize=False, ret_bins=True) - - def distance_call(h_x, h_y): - with np.errstate(divide="ignore", invalid="ignore"): - p = pd.Series(np.nan_to_num(h_x / h_x.sum())) - q = pd.Series(np.nan_to_num(h_y / h_y.sum())) - - return self.distance_pdf(p, q, bin_edges) +class BinaryDistanceMetric(DistanceMetric): + """ + Base class for distance metrics on binary data. - ts_distribution = pv.bootstrap_binned_statistic(h_x, h_y, distance_call, n_samples=100) + Subclasses must implement a distance method. + """ - return pv.resampling_p_value(distance_call(h_x, h_y), ts_distribution) + def check_input(self, x: pd.Series, y: pd.Series) -> bool: + joint = pd.concat((x, y)) + return utils.infer_distr_type(joint).is_binary() and (np.sort(joint.unique()) == [0, 1]).all() class MeanDistance(ContinuousDistanceMetric): @@ -242,11 +174,11 @@ def id(self) -> str: return "mean" -class BinomialDistance(ContinuousDistanceMetric): +class BinomialDistance(BinaryDistanceMetric): """ Difference distance between two binary data samples. i.e p_x - p_y, where p_x, p_y are the probabilities of success in x and y, respectively. - The p-value computed is for the null hypothesis is that the probability of success is p_y. + Data is assumed to be a series of 1, 0 (success, failure) Bernoulli random variates. """ @@ -256,13 +188,6 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool: def distance(self, x: pd.Series, y: pd.Series) -> float: return x.mean() - y.mean() - def p_value(self, x: pd.Series, y: pd.Series) -> float: - p_obs = x.mean() - p_null = y.mean() - n = len(x) - - return pv.binominal_proportion_p_value(p_obs, p_null, n) - @property def id(self) -> str: return "binomial" @@ -274,10 +199,7 @@ class KolmogorovSmirnovDistance(ContinuousDistanceMetric): """ def distance(self, x: pd.Series, y: pd.Series) -> float: - return ks_2samp(x, y)[0] - - def p_value(self, x: pd.Series, y: pd.Series) -> float: - return ks_2samp(x, y)[1] + return KolmogorovSmirnovDistanceTest(check=check)(x, y)[0] @property def id(self) -> str: @@ -285,11 +207,12 @@ def id(self) -> str: class KruskalWallis(ContinuousDistanceMetric): - def distance(self, x: pd.Series, y: pd.Series) -> float: - return kruskal(x, y)[0] + """ + Kruskal Wallis H test between two data samples. + """ - def p_value(self, x: pd.Series, y: pd.Series) -> float: - return kruskal(x, y)[1] + def distance(self, x: pd.Series, y: pd.Series) -> float: + return KruskalWallisTest(check=check)(x, y)[0] @property def id(self) -> str: @@ -299,11 +222,21 @@ def id(self) -> str: class EarthMoversDistance(CategoricalDistanceMetric): """ Earth movers distance (EMD), aka Wasserstein 1-distance, for categorical data. - - Using EarthMoversDistance on the raw data is faster and recommended. """ - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: + def __init__(self, bin_edges: Optional[np.ndarray] = None): + """ + Args: + bin_edges (Optional[np.ndarray], optional): + A list of bin edges used to bin continuous data by or to indicate bins of pre-binned data. + Defaults to None. + """ + + self.bin_edges = bin_edges + + def distance(self, x: pd.Series, y: pd.Series) -> float: + (p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True) + distance_matrix = 1 - np.eye(len(p)) if bin_edges is not None: @@ -328,8 +261,8 @@ class KullbackLeiblerDivergence(CategoricalDistanceMetric): Kullback–Leibler Divergence or Relative Entropy between two probability distributions. """ - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - return entropy(np.array(p), np.array(q)) + def distance(self, x: pd.Series, y: pd.Series) -> float: + return KLD(check=check)(x, y) @property def id(self) -> str: @@ -341,8 +274,8 @@ class JensenShannonDivergence(CategoricalDistanceMetric): Jensen-Shannon Divergence between two probability distributions. """ - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - return jensenshannon(p, q) + def distance(self, x: pd.Series, y: pd.Series) -> float: + return JSD(check=check)(x, y) @property def id(self) -> str: @@ -351,24 +284,21 @@ def id(self) -> str: class Norm(CategoricalDistanceMetric): """ - LP Norm between two probability distributions. + L-P Norm between two probability distributions. """ - def __init__(self, bin_edges: Optional[np.ndarray] = None, ord: Union[str, int] = 2): + def __init__(self, ord: Union[str, int] = 2): """ Args: - bin_edges (Optional[np.ndarray], optional): - A list of bin edges used to bin continuous data by or to indicate bins of pre-binned data. - Defaults to None. ord (Union[str, int], optional): The order of the norm. Possible values include positive numbers, 'fro', 'nuc'. See numpy.linalg.norm for more details. Defaults to 2. """ - super().__init__(bin_edges=bin_edges) self.ord = ord - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: + def distance(self, x: pd.Series, y: pd.Series) -> float: + (p, q), _ = utils.zipped_hist((x, y), ret_bins=True) return np.linalg.norm(p - q, ord=self.ord) @property @@ -381,8 +311,8 @@ class HellingerDistance(CategoricalDistanceMetric): Hellinger distance between two probability distributions. """ - def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float: - return np.linalg.norm(np.sqrt(p) - np.sqrt(q)) / np.sqrt(2) + def distance(self, x: pd.Series, y: pd.Series) -> float: + return HD(check=check)(x, y) @property def id(self) -> str: diff --git a/src/fairlens/metrics/unified.py b/src/fairlens/metrics/unified.py index ce49c459..e4dce2e0 100644 --- a/src/fairlens/metrics/unified.py +++ b/src/fairlens/metrics/unified.py @@ -3,16 +3,20 @@ """ import multiprocessing as mp -from typing import Any, Callable, List, Mapping, Optional, Tuple, Type, Union +from collections import namedtuple +from typing import Any, Callable, List, Mapping, Optional, Tuple, Union import pandas as pd +from synthesized_insight.metrics.statistical_tests import BootstrapTest, KolmogorovSmirnovDistanceTest, PermutationTest from .. import utils from .correlation import cramers_v, kruskal_wallis, pearson from .distance import BinomialDistance, DistanceMetric, EarthMoversDistance, KolmogorovSmirnovDistance +DistanceResult = namedtuple("DistanceResult", ("distance", "p_value")) -def auto_distance(column: pd.Series) -> Type[DistanceMetric]: + +def auto_distance(column: pd.Series) -> str: """Return a suitable statistical distance metric based on the distribution of the data. Args: @@ -20,17 +24,17 @@ def auto_distance(column: pd.Series) -> Type[DistanceMetric]: The input data in a pd.Series. Returns: - Type[DistanceMetric]: - The class of the distance metric. + str: + The id of the distance metric. """ distr_type = utils.infer_distr_type(column) if distr_type.is_continuous(): - return KolmogorovSmirnovDistance + return KolmogorovSmirnovDistance._get_id() elif distr_type.is_binary(): - return BinomialDistance + return BinomialDistance._get_id() - return EarthMoversDistance + return EarthMoversDistance._get_id() def stat_distance( @@ -39,9 +43,11 @@ def stat_distance( group1: Union[Mapping[str, List[Any]], pd.Series], group2: Union[Mapping[str, List[Any]], pd.Series], mode: str = "auto", - p_value: bool = False, + test: str = "default", + alternative: str = "two-sided", + cl: float = 0.95, **kwargs, -) -> Tuple[float, ...]: +) -> Tuple[float, Optional[float]]: """Computes the statistical distance between two probability distributions ie. group 1 and group 2, with respect to the target attribute. The distance metric can be chosen through the mode parameter. If mode is set to "auto", the most suitable metric depending on the target attributes' distribution is chosen. @@ -63,18 +69,23 @@ def stat_distance( a predicate itself, i.e. pandas series consisting of bools which can be used as a predicate to index a subgroup from the dataframe. Examples: {"Sex": ["Male"]}, df["Sex"] == "Female" - mode (str): + mode (str, optional): Which distance metric to use. Can be the names of classes from `fairlens.metrics`, or their id() strings. If set to "auto", the method automatically picks a suitable metric based on the distribution of the target attribute. Defaults to "auto". - p_value (bool): - Returns the a suitable p-value for the metric if it exists. Defaults to False. + test (str, optional): + The statistical test to use to compute the p-value. Can take values + ["default", "bootstrap", "permutation"]. If set to "default", returns any p-value + for the metric specified by `mode`. Defaults to "default" + alternative (str, optional): + The alternative hypothesis for the test. Can take values ["two-sided", "greater", "less"]. + Defaults to "two-sided". **kwargs: Keyword arguments for the distance metric. Passed to the __init__ function of distance metrics. Returns: - Tuple[float, ...]: - The distance as a float, and the p-value if p_value is set to True and can be computed. + Tuple[float, Optional[float]]: + The distance as a float, the p-value as a float. Examples: >>> df = pd.read_csv("datasets/compas.csv") @@ -87,30 +98,40 @@ def stat_distance( (0.0816143577815524, 0.02693435054772131) """ + if test not in ["default", "bootstrap", "permutation"]: + raise ValueError('Invalid value for test. Valid values: ["default", "bootstrap", "permutation"]') + + if alternative not in ["two-sided", "greater", "less"]: + raise ValueError('Invalid value for alternative. Valid values: ["two-sided", "greater", "less"]') + # Parse group arguments into pandas series' pred1, pred2 = tuple(utils.get_predicates_mult(df, [group1, group2])) group1 = df[pred1][target_attr] group2 = df[pred2][target_attr] - # Choose the distance metric + # Choose the metric if mode == "auto": - dist_class = auto_distance(df[target_attr]) - elif mode in DistanceMetric._class_dict: - dist_class = DistanceMetric._class_dict[mode] - else: - raise ValueError(f"Invalid mode. Valid modes include:\n{DistanceMetric._class_dict.keys()}") + mode = auto_distance(df[target_attr]) + if mode not in DistanceMetric._class_dict: + raise ValueError(f"Invalid value for mode. Valid values:\n{DistanceMetric._class_dict.keys()}") + + dist_class = DistanceMetric._class_dict[mode] metric = dist_class(**kwargs) - d = metric(group1, group2) - if d is None: - raise ValueError("Incompatible data inside both series") + d, p = None, None - if p_value: - p = metric.p_value(group1, group2) - return (d, p) + if test == "bootstrap": + d, p = BootstrapTest(metric, alternative=alternative) + elif test == "permutation": + d, p = PermutationTest(metric, alternative=alternative) + elif test == "default" and mode == "ks_distance": + d, p = KolmogorovSmirnovDistanceTest(metric, alternative=alternative) + + if d is None or p is None: + raise ValueError("Incompatible data inside both series") - return (d,) + return DistanceResult(distance=d, p_value=p) def correlation_matrix( diff --git a/tests/test_significance.py b/tests/test_significance.py index a33ae88f..b554e926 100644 --- a/tests/test_significance.py +++ b/tests/test_significance.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from fairlens.metrics.distance import BinomialDistance, MeanDistance +from fairlens.metrics.distance import MeanDistance from fairlens.metrics.significance import binominal_proportion_p_value as bin_prop from fairlens.metrics.significance import bootstrap_binned_statistic as bootstrap_binned from fairlens.metrics.significance import bootstrap_statistic as bootstrap @@ -17,9 +17,9 @@ def test_binomial(): assert abs(bin_prop(0.2, 0.1, 10) - (1 - (0.9 ** 10 + 0.9 ** 9))) < epsilon - assert BinomialDistance().p_value(pd.Series([1, 1]), pd.Series([0, 0])) == 0 - assert BinomialDistance().p_value(pd.Series([1, 0]), pd.Series([1, 0])) == 1 - assert BinomialDistance().p_value(pd.Series([1, 0, 1, 1]), pd.Series([1, 0, 1, 0])) == 0.625 + assert bin_prop(1, 0, 2) == 0 + assert bin_prop(0.5, 0.5, 2) == 1 + assert bin_prop(0.75, 0.5, 4) == 0.625 def test_bootstrap(): From 0b977d3d647fd0e428455c9483122e796e8caa83 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Thu, 16 Sep 2021 12:13:20 +0100 Subject: [PATCH 2/3] create metric adapter --- src/fairlens/metrics/distance.py | 27 +++--------- src/fairlens/metrics/unified.py | 74 ++++++++++++++++++++++---------- 2 files changed, 59 insertions(+), 42 deletions(-) diff --git a/src/fairlens/metrics/distance.py b/src/fairlens/metrics/distance.py index 7e8c5ca3..4ebbce0e 100644 --- a/src/fairlens/metrics/distance.py +++ b/src/fairlens/metrics/distance.py @@ -3,34 +3,21 @@ """ import inspect -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Dict, Optional, Type, Union import numpy as np import pandas as pd import pyemd -from synthesized_insight import ColumnCheck from synthesized_insight.metrics import HellingerDistance as HD from synthesized_insight.metrics import JensenShannonDivergence as JSD from synthesized_insight.metrics import KolmogorovSmirnovDistanceTest, KruskalWallisTest from synthesized_insight.metrics import KullbackLeiblerDivergence as KLD -from synthesized_insight.metrics import TwoColumnMetric from .. import utils -class _OptimisticCheck(ColumnCheck): - def continuous(self, sr: pd.Series) -> bool: - return True - - def categorical(self, sr: pd.Series) -> bool: - return True - - -check = _OptimisticCheck() - - -class DistanceMetric(TwoColumnMetric): +class DistanceMetric(ABC): """ Base class for distance metrics that compare samples from two distributions. @@ -199,7 +186,7 @@ class KolmogorovSmirnovDistance(ContinuousDistanceMetric): """ def distance(self, x: pd.Series, y: pd.Series) -> float: - return KolmogorovSmirnovDistanceTest(check=check)(x, y)[0] + return KolmogorovSmirnovDistanceTest()._compute_test(x, y)[0] @property def id(self) -> str: @@ -212,7 +199,7 @@ class KruskalWallis(ContinuousDistanceMetric): """ def distance(self, x: pd.Series, y: pd.Series) -> float: - return KruskalWallisTest(check=check)(x, y)[0] + return KruskalWallisTest()._compute_test(x, y)[0] @property def id(self) -> str: @@ -262,7 +249,7 @@ class KullbackLeiblerDivergence(CategoricalDistanceMetric): """ def distance(self, x: pd.Series, y: pd.Series) -> float: - return KLD(check=check)(x, y) + return KLD()._compute_metric(x, y) @property def id(self) -> str: @@ -275,7 +262,7 @@ class JensenShannonDivergence(CategoricalDistanceMetric): """ def distance(self, x: pd.Series, y: pd.Series) -> float: - return JSD(check=check)(x, y) + return JSD()._compute_metric(x, y) @property def id(self) -> str: @@ -312,7 +299,7 @@ class HellingerDistance(CategoricalDistanceMetric): """ def distance(self, x: pd.Series, y: pd.Series) -> float: - return HD(check=check)(x, y) + return HD()._compute_metric(x, y) @property def id(self) -> str: diff --git a/src/fairlens/metrics/unified.py b/src/fairlens/metrics/unified.py index e4dce2e0..ecfc37f0 100644 --- a/src/fairlens/metrics/unified.py +++ b/src/fairlens/metrics/unified.py @@ -7,6 +7,8 @@ from typing import Any, Callable, List, Mapping, Optional, Tuple, Union import pandas as pd +from synthesized_insight.check import Check, ColumnCheck +from synthesized_insight.metrics.base import TwoColumnMetric from synthesized_insight.metrics.statistical_tests import BootstrapTest, KolmogorovSmirnovDistanceTest, PermutationTest from .. import utils @@ -16,25 +18,15 @@ DistanceResult = namedtuple("DistanceResult", ("distance", "p_value")) -def auto_distance(column: pd.Series) -> str: - """Return a suitable statistical distance metric based on the distribution of the data. +class _OptimisticCheck(ColumnCheck): + def continuous(self, sr: pd.Series) -> bool: + return True - Args: - column (pd.Series): - The input data in a pd.Series. + def categorical(self, sr: pd.Series) -> bool: + return True - Returns: - str: - The id of the distance metric. - """ - - distr_type = utils.infer_distr_type(column) - if distr_type.is_continuous(): - return KolmogorovSmirnovDistance._get_id() - elif distr_type.is_binary(): - return BinomialDistance._get_id() - return EarthMoversDistance._get_id() +check = _OptimisticCheck() def stat_distance( @@ -45,7 +37,6 @@ def stat_distance( mode: str = "auto", test: str = "default", alternative: str = "two-sided", - cl: float = 0.95, **kwargs, ) -> Tuple[float, Optional[float]]: """Computes the statistical distance between two probability distributions ie. group 1 and group 2, with respect @@ -117,23 +108,48 @@ def stat_distance( raise ValueError(f"Invalid value for mode. Valid values:\n{DistanceMetric._class_dict.keys()}") dist_class = DistanceMetric._class_dict[mode] - metric = dist_class(**kwargs) + distance_metric = dist_class(**kwargs) + metric = _MetricAdaptor(distance_metric) d, p = None, None if test == "bootstrap": - d, p = BootstrapTest(metric, alternative=alternative) + d, p = BootstrapTest(metric, alternative=alternative, check=check)(group1, group2) elif test == "permutation": - d, p = PermutationTest(metric, alternative=alternative) + d, p = PermutationTest(metric, alternative=alternative, check=check)(group1, group2) elif test == "default" and mode == "ks_distance": - d, p = KolmogorovSmirnovDistanceTest(metric, alternative=alternative) + d, p = KolmogorovSmirnovDistanceTest(alternative=alternative, check=check)(group1, group2) + else: + d = distance_metric(group1, group2) + p = None - if d is None or p is None: + if d is None: raise ValueError("Incompatible data inside both series") return DistanceResult(distance=d, p_value=p) +def auto_distance(column: pd.Series) -> str: + """Return a suitable statistical distance metric based on the distribution of the data. + + Args: + column (pd.Series): + The input data in a pd.Series. + + Returns: + str: + The id of the distance metric. + """ + + distr_type = utils.infer_distr_type(column) + if distr_type.is_continuous(): + return KolmogorovSmirnovDistance._get_id() + elif distr_type.is_binary(): + return BinomialDistance._get_id() + + return EarthMoversDistance._get_id() + + def correlation_matrix( df: pd.DataFrame, num_num_metric: Callable[[pd.Series, pd.Series], float] = pearson, @@ -213,3 +229,17 @@ def _correlation_matrix_helper( else: return cat_cat_metric(sr_a, sr_b) + + +class _MetricAdaptor(TwoColumnMetric): + def __init__(self, metric: DistanceMetric): + self.metric = metric + + def __call__(self, sr_a: pd.Series, sr_b: pd.Series): + return self.metric(sr_a, sr_b) + + def check_column_types(cls, sr_a: pd.Series, sr_b: pd.Series, check: Check): + return True + + def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series): + return self.metric.distance(sr_a, sr_b) From 9aba15959026ffc769bf06c70b54e879753c4088 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Thu, 16 Sep 2021 14:31:55 +0100 Subject: [PATCH 3/3] replace pyemd with scipy.wasserstein_distance --- setup.cfg | 2 +- src/fairlens/metrics/distance.py | 19 ++++++------------- tests/test_metrics.py | 6 ------ 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/setup.cfg b/setup.cfg index 0dab5eb3..ec8b9978 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ install_requires = matplotlib>=2.1.0 seaborn>=0.11.1 dcor>=0.5.3 - pyemd==0.5.1 + synthesized-insight>=0.3 [options.packages.find] where = src diff --git a/src/fairlens/metrics/distance.py b/src/fairlens/metrics/distance.py index 4ebbce0e..04759014 100644 --- a/src/fairlens/metrics/distance.py +++ b/src/fairlens/metrics/distance.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -import pyemd +from scipy.stats import wasserstein_distance from synthesized_insight.metrics import HellingerDistance as HD from synthesized_insight.metrics import JensenShannonDivergence as JSD from synthesized_insight.metrics import KolmogorovSmirnovDistanceTest, KruskalWallisTest @@ -224,19 +224,12 @@ def __init__(self, bin_edges: Optional[np.ndarray] = None): def distance(self, x: pd.Series, y: pd.Series) -> float: (p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True) - distance_matrix = 1 - np.eye(len(p)) + if bin_edges is None: + bin_centers = np.arange(len(p)) + else: + bin_centers = (np.array(bin_edges[:-1]) + np.array(bin_edges[1:])) / 2 - if bin_edges is not None: - # Use pair-wise euclidean distances between bin centers for scale data - bin_centers = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0) - xx, yy = np.meshgrid(bin_centers, bin_centers) - distance_matrix = np.abs(xx - yy) - - p = np.array(p).astype(np.float64) - q = np.array(q).astype(np.float64) - distance_matrix = distance_matrix.astype(np.float64) - - return pyemd.emd(p, q, distance_matrix) + return wasserstein_distance(bin_centers, bin_centers, u_weights=p, v_weights=q) @property def id(self) -> str: diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 35f4098f..f7db5593 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -from pyemd import emd_samples from fairlens.metrics.distance import BinomialDistance from fairlens.metrics.distance import EarthMoversDistance as EMD @@ -43,11 +42,6 @@ def test_stat_distance_auto(): assert stat_distance(df, target_attr, pred1, pred2, mode="ks_distance")[0] == res -def test_auto_binning(): - res = emd_samples(group1, group2) - assert stat_distance(df, target_attr, pred1, pred2, mode="emd")[0] == res - - def test_mean_distance(): assert Mean()(pd.Series(np.arange(100)), pd.Series(np.arange(10))) == 45