From acc86774bacd8399703234e03bd3fbe74806f78b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Czy=C5=BC?= Date: Wed, 6 Mar 2024 18:07:38 +0100 Subject: [PATCH] Experiments (#17) --- .gitignore | 17 ++ labelshift/algorithms/api.py | 2 + labelshift/algorithms/bayesian_discrete.py | 43 ++- .../algorithms/expectation_maximization.py | 17 +- .../algorithms/gaussian_mixture_model.py | 11 +- labelshift/algorithms/ratio_estimator.py | 43 ++- labelshift/api.py | 7 + labelshift/datasets/api.py | 20 ++ labelshift/datasets/discrete_categorical.py | 172 ++++++++---- labelshift/datasets/split.py | 142 ++++++++++ labelshift/experiments/__init__.py | 5 + labelshift/experiments/api.py | 22 ++ labelshift/experiments/names.py | 12 + labelshift/{ => experiments}/timer.py | 0 labelshift/partition.py | 129 +++++++++ labelshift/scoring.py | 6 + requirements.txt | 2 +- scripts/design_categorical.py | 156 +++++++++++ scripts/experiment1/1-1.py | 32 +++ scripts/experiment1/1-2.py | 33 +++ scripts/experiment1/1-3.py | 33 +++ scripts/experiment1/plot_figure.py | 97 +++++++ scripts/experiment_external_dataset.py | 134 +++++++++ scripts/experiment_external_dataset2.py | 179 ++++++++++++ scripts/experiment_gaussian.py | 144 ++++++++++ scripts/run_categorical.py | 256 ++++++++++++++++++ setup.cfg | 1 + tests/algorithms/test_bayesian_discrete.py | 6 +- tests/algorithms/test_bbse.py | 6 +- tests/algorithms/test_ratio_estimator.py | 6 +- tests/datasets/test_discrete_categorical.py | 32 ++- tests/datasets/test_split.py | 37 +++ tests/test_partition.py | 81 ++++++ tests/test_timer.py | 2 +- 34 files changed, 1774 insertions(+), 111 deletions(-) create mode 100644 labelshift/api.py create mode 100644 labelshift/datasets/api.py create mode 100644 labelshift/datasets/split.py create mode 100644 labelshift/experiments/__init__.py create mode 100644 labelshift/experiments/api.py create mode 100644 labelshift/experiments/names.py rename labelshift/{ => experiments}/timer.py (100%) create mode 100644 labelshift/partition.py create mode 100644 scripts/design_categorical.py create mode 100644 scripts/experiment1/1-1.py create mode 100644 scripts/experiment1/1-2.py create mode 100644 scripts/experiment1/1-3.py create mode 100644 scripts/experiment1/plot_figure.py create mode 100644 scripts/experiment_external_dataset.py create mode 100644 scripts/experiment_external_dataset2.py create mode 100644 scripts/experiment_gaussian.py create mode 100644 scripts/run_categorical.py create mode 100644 tests/datasets/test_split.py create mode 100644 tests/test_partition.py diff --git a/.gitignore b/.gitignore index 250f8f9..ccf9ec1 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,23 @@ local/ private/ +# Data +*.json +*.yml +*.yaml +*.csv +*.npy +*.npz + +# Plots +*.eps +*.gif +*.jpg +*.jpeg +*.pdf +*.png +*.svg + # Editors .idea/ .vscode/ diff --git a/labelshift/algorithms/api.py b/labelshift/algorithms/api.py index a6a18f5..db1281d 100644 --- a/labelshift/algorithms/api.py +++ b/labelshift/algorithms/api.py @@ -8,10 +8,12 @@ from labelshift.algorithms.bbse import BlackBoxShiftEstimator from labelshift.algorithms.classify_and_count import ClassifyAndCount from labelshift.algorithms.ratio_estimator import InvariantRatioEstimator +from labelshift.interfaces.point_estimators import SummaryStatistic __all__ = [ "BlackBoxShiftEstimator", "ClassifyAndCount", "DiscreteCategoricalMAPEstimator", "InvariantRatioEstimator", + "SummaryStatistic", ] diff --git a/labelshift/algorithms/bayesian_discrete.py b/labelshift/algorithms/bayesian_discrete.py 
index 418f89b..ef6ff25 100644 --- a/labelshift/algorithms/bayesian_discrete.py +++ b/labelshift/algorithms/bayesian_discrete.py @@ -3,7 +3,7 @@ Proposed in TODO(Pawel): Add citation to pre-print after AISTATS reviews. """ -from typing import cast, NewType, Optional +from typing import cast, NewType, Optional, Union import arviz as az import numpy as np @@ -35,11 +35,31 @@ class SamplingParams(pydantic.BaseModel): ) +def dirichlet_alphas(L: int, alpha: Union[float, ArrayLike]) -> np.ndarray: + """Convenient initialization of alpha (pseudocounts) + parameters of the Dirichlet prior. + + Args: + alpha: either an array of shape (L,) or a float. + If a float, vector (alpha, alpha, ..., alpha) + is created + + Returns: + alphas, shape (L,) + """ + if isinstance(alpha, float): + return np.ones(L) * alpha + else: + alpha = np.asarray(alpha) + assert alpha.shape == (L,) + return alpha + + def build_model( n_y_and_c_labeled: ArrayLike, n_c_unlabeled: ArrayLike, - alpha_p_y_labeled: Optional[ArrayLike] = None, - alpha_p_y_unlabeled: Optional[ArrayLike] = None, + alpha_p_y_labeled: Union[float, ArrayLike] = 1.0, + alpha_p_y_unlabeled: Union[float, ArrayLike] = 1.0, ) -> DiscreteBayesianQuantificationModel: """Builds the discrete Bayesian quantification model, basing on the sufficient statistic of the data. @@ -59,15 +79,8 @@ def build_model( assert n_y_labeled.shape == (L,) assert n_c_unlabeled.shape == (K,) - alpha_p_y_labeled = ( - np.ones(L) if alpha_p_y_labeled is None else np.asarray(alpha_p_y_labeled) - ) - alpha_p_y_unlabeled = ( - np.ones(L) if alpha_p_y_unlabeled is None else np.asarray(alpha_p_y_unlabeled) - ) - - assert alpha_p_y_labeled.shape == (L,) - assert alpha_p_y_unlabeled.shape == (L,) + alpha_p_y_labeled = dirichlet_alphas(L, alpha_p_y_labeled) + alpha_p_y_unlabeled = dirichlet_alphas(L, alpha_p_y_unlabeled) model = pm.Model() with model: @@ -140,13 +153,16 @@ class DiscreteCategoricalMAPEstimator(pe.SummaryStatisticPrevalenceEstimator): """A version of Bayesian quantification which finds the Maximum a Posteriori solution.""" - def __init__(self, max_eval: int = 10_000) -> None: + def __init__( + self, max_eval: int = 10_000, alpha_unlabeled: Union[float, ArrayLike] = 1.0 + ) -> None: """ Args: max_eval: maximal number of evaluations of the posterior during the optimization to find the MAP """ self._max_eval = max_eval + self._alpha_unlabeled = alpha_unlabeled def estimate_from_summary_statistic( self, /, statistic: pe.SummaryStatistic @@ -155,6 +171,7 @@ def estimate_from_summary_statistic( model = build_model( n_c_unlabeled=statistic.n_c_unlabeled, n_y_and_c_labeled=statistic.n_y_and_c_labeled, + alpha_p_y_unlabeled=self._alpha_unlabeled, ) with model: optimal = pymc.find_MAP(maxeval=self._max_eval) diff --git a/labelshift/algorithms/expectation_maximization.py b/labelshift/algorithms/expectation_maximization.py index 2ff51c3..6286b05 100644 --- a/labelshift/algorithms/expectation_maximization.py +++ b/labelshift/algorithms/expectation_maximization.py @@ -14,7 +14,7 @@ def expectation_maximization( *, initial_prevalences: Optional[ArrayLike] = None, max_steps: int = 10000, - atol: float = 0.01, + tolerance: float = 0.01, ) -> np.ndarray: """Expectation maximization algorithm, as described in @@ -24,13 +24,13 @@ def expectation_maximization( Args: predictions: test set probability predictions. Shape (n_samples, n_classes). - prevalences: prevalences in the training data set. + training_prevalences: prevalences in the training data set. 
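Usage note (not part of the patch): a minimal sketch of how the new `dirichlet_alphas` helper in `labelshift/algorithms/bayesian_discrete.py` behaves for the two supported prior specifications, a scalar pseudocount and a full pseudocount vector.

    import numpy as np
    from labelshift.algorithms.bayesian_discrete import dirichlet_alphas

    dirichlet_alphas(3, 2.0)              # array([2., 2., 2.])
    dirichlet_alphas(3, [1.0, 2.0, 3.0])  # array([1., 2., 3.])
    # Note: an integer such as 2 is not a float, so it falls through to the
    # array branch and trips the shape assertion; pass 2.0 instead.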
Shape (n_classes,), (n_classes, 1) or (1, n_classes). Will be normalized. initial_prevalences: starting prevalences for optimization. If not provided, the training prevalences are used. Shape (n_classes,), (n_classes, 1) or (1, n_classes). Will be normalized. max_steps: maximal number of iteration steps - atol: desired accuracy (for early stopping) + tolerance: desired accuracy (for early stopping) Returns: test set prevalences, shape (n_classes,). @@ -48,6 +48,7 @@ def expectation_maximization( test_prevalences = training_prevalences.copy() # Iteratively improve the estimate of the test set prevalences + converged: bool = False for _ in range(max_steps): old_prevalences = test_prevalences.copy() @@ -59,10 +60,12 @@ def expectation_maximization( ) / len(new_predictions) # Check if converged - if np.allclose(old_prevalences, test_prevalences, atol=atol, rtol=0): + if np.max(np.abs(old_prevalences - test_prevalences)) < tolerance: + converged = True break - warnings.warn( - RuntimeWarning(f"Required accuracy not reached in {max_steps} steps.") - ) + if not converged: + warnings.warn( + RuntimeWarning(f"Required accuracy not reached in {max_steps} steps.") + ) return test_prevalences.ravel() diff --git a/labelshift/algorithms/gaussian_mixture_model.py b/labelshift/algorithms/gaussian_mixture_model.py index 03a5d9c..b00797f 100644 --- a/labelshift/algorithms/gaussian_mixture_model.py +++ b/labelshift/algorithms/gaussian_mixture_model.py @@ -5,7 +5,7 @@ Note: This algorithm models the data in 1D. """ -from typing import Optional, Sequence, Tuple +from typing import Sequence, Tuple, Union import numpy as np import pymc as pm @@ -21,7 +21,7 @@ def build_model( unlabeled_data: ArrayLike, mean_params: Tuple[float, float] = (0.0, 1.0), sigma_param: float = 1.0, - alpha: Optional[ArrayLike] = None, + alpha: Union[float, ArrayLike] = None, ) -> pm.Model: """Builds a PyMC model for Bayesian quantification for 1D data assumed to be sampled from a mixture of normals. @@ -38,7 +38,8 @@ def build_model( mean_params: used to initialize the prior on the component means sigma_param: used to initialize the prior on the component sigmas alpha: used to initialize the Dirichlet prior on P_unlabeled(Y). - Shape (n_components,) + Can be an array of shape (n_components,) + or a float, so that a uniform vector (alpha, alpha, ...) is used. 
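Usage note (not part of the patch): a hedged sketch of calling the expectation-maximization routine with the renamed `tolerance` argument; the keyword names follow the docstring above and the experiment scripts added later in this patch.

    import numpy as np
    from labelshift.algorithms.expectation_maximization import expectation_maximization

    # Toy test-set predictions P(C | X), shape (n_samples, n_classes).
    predictions = np.array([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3], [0.1, 0.9]])
    estimate = expectation_maximization(
        predictions=predictions,
        training_prevalences=np.array([0.5, 0.5]),
        tolerance=1e-3,
    )
    # `estimate` has shape (n_classes,); iteration stops early once the maximal
    # absolute change of the prevalence vector drops below `tolerance`.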
Returns: a PyMC model with the following variables: @@ -51,8 +52,8 @@ def build_model( assert unlabeled_data.shape == (len(unlabeled_data),) n_y = len(labeled_data) - if alpha is None: - alpha = np.ones(n_y) + if isinstance(alpha, float): + alpha = alpha * np.ones(n_y) else: alpha = np.asarray(alpha) diff --git a/labelshift/algorithms/ratio_estimator.py b/labelshift/algorithms/ratio_estimator.py index 346eb53..07099a7 100644 --- a/labelshift/algorithms/ratio_estimator.py +++ b/labelshift/algorithms/ratio_estimator.py @@ -41,7 +41,7 @@ ``H_hat[l, k] = G_hat[l, k] = E_labeled[ g(X)[k] | Y = l] \\in R^{L x (K-1)}.`` """ -from typing import Tuple +from typing import Optional, Tuple import numpy as np from numpy.typing import ArrayLike @@ -103,9 +103,42 @@ def prevalence_from_vector_and_matrix( def calculate_vector_and_matrix_from_predictions( unlabeled_predictions: ArrayLike, labeled_predictions: ArrayLike, -) -> None: - """This method has not been implemented yet.""" - raise NotImplementedError + labeled_ground_truth: ArrayLike, + L: Optional[int] = None, + enforce_square: bool = True, + restricted: bool = True, + rcond: float = 1e-4, +) -> np.ndarray: + """TODO(Pawel): Fix this docstring. + + Args: + unlabeled_predictions: shape (N', K) + labeled_predictions: shape (N, K) + labeled_ground_truth: shape (N,). Each entry is in {0, ..., L-1}. + """ + unlabeled_predictions = np.asarray(unlabeled_predictions) + labeled_predictions = np.asarray(labeled_predictions) + labeled_ground_truth = np.asarray(labeled_ground_truth, dtype=int) + + K = unlabeled_predictions.shape[1] + L: int = K if L is None else L + + assert labeled_predictions.shape == (len(labeled_ground_truth), K) + + unlabeled_vector = unlabeled_predictions.mean(axis=0)[: K - 1] # Shape (K - 1,) + labeled_matrix = np.zeros((L, K - 1)) + + for l in range(L): # noqa: E741 ambiguous name variable + index = labeled_ground_truth == l + labeled_matrix[l, :] = labeled_predictions[index, : K - 1].mean(axis=0) + + return prevalence_from_vector_and_matrix( + vector=unlabeled_vector, + matrix=labeled_matrix, + restricted=restricted, + enforce_square=enforce_square, + rcond=rcond, + ) def calculate_vector_and_matrix_from_summary_statistics( @@ -200,4 +233,6 @@ def estimate_from_summary_statistic( return prevalence_from_summary_statistics( n_c_unlabeled=statistic.n_c_unlabeled, n_y_and_c_labeled=statistic.n_y_and_c_labeled, + enforce_square=self._enforce_square, + rcond=self._rcond, ) diff --git a/labelshift/api.py b/labelshift/api.py new file mode 100644 index 0000000..51ff24c --- /dev/null +++ b/labelshift/api.py @@ -0,0 +1,7 @@ +import labelshift.experiments.api as experiments +import labelshift.datasets.api as datasets + +__all__ = [ + "experiments", + "datasets", +] diff --git a/labelshift/datasets/api.py b/labelshift/datasets/api.py new file mode 100644 index 0000000..59624a9 --- /dev/null +++ b/labelshift/datasets/api.py @@ -0,0 +1,20 @@ +from labelshift.datasets.split import ( + IDataset, + n_classes, + SplitDataset, + SplitSpecification, + split_dataset, +) +from labelshift.datasets.discrete_categorical import DiscreteSampler, almost_eye + +__all__ = [ + # `split` submodule + "IDataset", + "n_classes", + "SplitDataset", + "SplitSpecification", + "split_dataset", + # `discrete_categorical` submodule + "DiscreteSampler", + "almost_eye", +] diff --git a/labelshift/datasets/discrete_categorical.py b/labelshift/datasets/discrete_categorical.py index e2a3962..971e464 100644 --- a/labelshift/datasets/discrete_categorical.py +++ 
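Usage note (not part of the patch): a sketch of the newly implemented probability-based ("soft") invariant-ratio estimator. Despite its name, `calculate_vector_and_matrix_from_predictions` returns the estimated unlabeled prevalence vector directly, which is how the experiment scripts later in this patch use it.

    import numpy as np
    import labelshift.algorithms.ratio_estimator as re

    labeled_predictions = np.array([[0.8, 0.2], [0.3, 0.7], [0.9, 0.1], [0.1, 0.9]])
    labeled_ground_truth = np.array([0, 1, 0, 1])
    unlabeled_predictions = np.array([[0.85, 0.15], [0.2, 0.8], [0.25, 0.75]])

    p_y_unlabeled = re.calculate_vector_and_matrix_from_predictions(
        unlabeled_predictions=unlabeled_predictions,
        labeled_predictions=labeled_predictions,
        labeled_ground_truth=labeled_ground_truth,
    )
    # Estimated prevalence vector for the unlabeled data, shape (L,),
    # computed with the restricted estimator by default.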
b/labelshift/datasets/discrete_categorical.py @@ -1,104 +1,156 @@ """Discrete categorical sampler.""" +import dataclasses import math +from typing import Tuple, Any, Union, Optional + import numpy as np from numpy.typing import ArrayLike from labelshift.interfaces.point_estimators import SummaryStatistic +RNG = Any -class DiscreteSampler: - """Samples from the discrete model P(C|Y).""" - def __init__( - self, p_y_labeled: ArrayLike, p_y_unlabeled: ArrayLike, p_c_cond_y: ArrayLike - ) -> None: +@dataclasses.dataclass +class SummaryMultinomialStatistic: + n_y: np.ndarray + n_c: np.ndarray + n_y_and_c: np.ndarray + + +class SimpleMultinomialSampler: + def __init__(self, p_y: ArrayLike, p_c_cond_y: ArrayLike) -> None: """ Args: - p_y_labeled: P_train(Y) vector, shape (L,) - p_y_unlabeled: P_test(Y) vector, shape (L,) - p_c_cond_y: P(C|Y), shape (L, K) + p_y: P(Y) vector, shape (L,) + p_c_cond_y: P(C|Y) matrix, shape (L, K) """ - self._p_y_labeled = np.asarray(p_y_labeled) - self._p_y_unlabeled = np.asarray(p_y_unlabeled) - self._c_cond_y = np.asarray(p_c_cond_y) + self._p_y = np.asarray(p_y) + self._p_c_cond_y = np.asarray(p_c_cond_y) - assert len(self._c_cond_y.shape) == 2 - self._L = self._c_cond_y.shape[0] - self._K = self._c_cond_y.shape[1] + assert len(self._p_c_cond_y.shape) == 2 + self._L = self._p_c_cond_y.shape[0] + self._K = self._p_c_cond_y.shape[1] - assert self._p_y_labeled.shape == (self._L,) - assert self._p_y_unlabeled.shape == (self._L,) + assert self._p_y.shape == (self._L,) - assert np.min(self._p_y_labeled) >= 0 - assert np.min(self._p_y_unlabeled) >= 0 - assert np.min(self._c_cond_y) >= 0 + assert np.min(self._p_y) >= 0 + assert np.min(self._p_c_cond_y) >= 0 - assert math.isclose(np.sum(self._p_y_labeled), 1.0) - assert math.isclose(np.sum(self._p_y_unlabeled), 1.0) + assert math.isclose(np.sum(self._p_y), 1.0) for label in range(self._L): - s = self._c_cond_y[label, :].sum() + s = self._p_c_cond_y[label, :].sum() assert math.isclose(s, 1.0) @property - def p_y_labeled(self) -> np.ndarray: - """P_labeled(Y) vector. Shape (size_Y,)""" - return self._p_y_labeled - - @property - def p_y_unlabeled(self) -> np.ndarray: - """P_unlabeled(Y) vector. Shape (size_Y,)""" - return self._p_y_unlabeled + def p_y(self) -> np.ndarray: + return self._p_y @property def p_c_cond_y(self) -> np.ndarray: - """P(C | Y) matrix, shape (size_Y, size_C)""" - return self._c_cond_y + return self._p_c_cond_y @property - def p_c_unlabeled(self) -> np.ndarray: - """P_unlabeled(C) vector, shape (size_C,)""" - return self._c_cond_y.T @ self._p_y_unlabeled + def p_c(self) -> np.ndarray: + return np.einsum("lk,l->k", self._p_c_cond_y, self._p_y) @property - def p_c_labeled(self) -> np.ndarray: - """P_labeled(C) vector, shape (size_C,)""" - return self._c_cond_y.T @ self._p_y_labeled - - @property - def size_Y(self) -> int: - """How many Y there are.""" + def n_y(self) -> int: return self._L - def size_C(self) -> int: - """How many C there are.""" + @property + def n_c(self) -> int: return self._K def sample_summary_statistic( - self, n_labeled: int = 1000, n_unlabeled: int = 1000, seed: int = 42 - ) -> SummaryStatistic: - """Samples the summary statistic from the model. 
- - Args: - n_labeled: number of examples in the labeled data set - n_unlabeled: number of examples in the unlabeled data set - seed: random seed - """ - rng = np.random.default_rng(seed) + self, n: int, rng: Union[int, RNG] + ) -> SummaryMultinomialStatistic: + rng = np.random.default_rng(rng) - n_y = rng.multinomial(n_labeled, self._p_y_labeled) + n_y = rng.multinomial(n, self._p_y) n_y_and_c = np.vstack( - [rng.multinomial(n, p) for n, p in zip(n_y, self._c_cond_y)] + [rng.multinomial(n, p) for n, p in zip(n_y, self._p_c_cond_y)] + ) + n_c = n_y_and_c.sum(axis=0) + + assert n_c.shape == (self._K,) + + return SummaryMultinomialStatistic( + n_c=n_c, + n_y_and_c=n_y_and_c, + n_y=n_y, ) - n_c = rng.multinomial(n_unlabeled, self._c_cond_y.T @ self._p_y_unlabeled) + +class DiscreteSampler: + def __init__( + self, + sampler_labeled: SimpleMultinomialSampler, + sampler_unlabeled: SimpleMultinomialSampler, + ) -> None: + assert sampler_labeled.n_c == sampler_unlabeled.n_c + assert sampler_labeled.n_y == sampler_unlabeled.n_y + + self.labeled = sampler_labeled + self.unlabeled = sampler_unlabeled + + @property + def n_y(self) -> int: + return self.labeled.n_y + + @property + def n_c(self) -> int: + return self.labeled.n_c + + def sample_from_both( + self, n_labeled: int, n_unlabeled: int, seed: int + ) -> Tuple[SummaryMultinomialStatistic, SummaryMultinomialStatistic]: + rng1, rng2 = [ + np.random.default_rng(s) for s in np.random.SeedSequence(seed).spawn(2) + ] + + return ( + self.labeled.sample_summary_statistic(n=n_labeled, rng=rng1), + self.unlabeled.sample_summary_statistic(n=n_unlabeled, rng=rng2), + ) + + def sample_summary_statistic( + self, n_labeled: int, n_unlabeled: int, seed: int + ) -> SummaryStatistic: + labeled, unlabeled = self.sample_from_both( + n_labeled=n_labeled, n_unlabeled=n_unlabeled, seed=seed + ) return SummaryStatistic( - n_y_labeled=n_y, - n_c_unlabeled=n_c, - n_y_and_c_labeled=n_y_and_c, + n_y_labeled=labeled.n_y, + n_y_and_c_labeled=labeled.n_y_and_c, + n_c_unlabeled=unlabeled.n_c, ) +def discrete_sampler_factory( + p_y_labeled: ArrayLike, + p_y_unlabeled: ArrayLike, + p_c_cond_y_labeled: ArrayLike, + p_c_cond_y_unlabeled: Optional[ArrayLike] = None, +) -> DiscreteSampler: + p_c_cond_y_labeled = np.asarray(p_c_cond_y_labeled) + + if p_c_cond_y_unlabeled is None: + p_c_cond_y_unlabeled = p_c_cond_y_labeled.copy() + + return DiscreteSampler( + sampler_labeled=SimpleMultinomialSampler( + p_y=p_y_labeled, + p_c_cond_y=p_c_cond_y_labeled, + ), + sampler_unlabeled=SimpleMultinomialSampler( + p_y=p_y_unlabeled, + p_c_cond_y=p_c_cond_y_unlabeled, + ), + ) + + def almost_eye(y: int, c: int, diagonal: float = 1.0) -> np.ndarray: """Matrix P(C | Y) with fixed "diagonal" terms. diff --git a/labelshift/datasets/split.py b/labelshift/datasets/split.py new file mode 100644 index 0000000..ed61519 --- /dev/null +++ b/labelshift/datasets/split.py @@ -0,0 +1,142 @@ +"""Utilities for working with NumPy datasets.""" +import dataclasses +from typing import List, Protocol + +import numpy as np +import pydantic + + +class IDataset(Protocol): + """Interface for the dataset used. Note that it's designed + to be compatible with the scikit-learn's datasets.""" + + @property + def data(self) -> np.ndarray: + """The covariates X. Shape (n_samples, ...).""" + raise NotImplementedError + + @property + def target(self) -> np.ndarray: + """The target labels Y. 
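Usage note (not part of the patch): the refactored sampler is now built through `discrete_sampler_factory`; the sketch below mirrors the updated tests at the end of this patch.

    import labelshift.datasets.discrete_categorical as dc

    sampler = dc.discrete_sampler_factory(
        p_y_labeled=[0.5, 0.5],
        p_y_unlabeled=[0.3, 0.7],
        p_c_cond_y_labeled=dc.almost_eye(2, 2, diagonal=0.85),
    )
    statistic = sampler.sample_summary_statistic(n_labeled=1000, n_unlabeled=500, seed=42)
    # statistic.n_y_and_c_labeled has shape (L, K), statistic.n_c_unlabeled shape (K,),
    # and sampler.unlabeled.p_y recovers the ground-truth unlabeled prevalence.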
Shape (n_samples,).""" + raise NotImplementedError + + +def n_classes(dataset: IDataset) -> int: + """Calculates the number of classes in the dataset.""" + return len(np.unique(dataset.target)) + + +@dataclasses.dataclass +class SplitDataset: + """This class represents training, validation, and test datasets.""" + + train_x: np.ndarray + train_y: np.ndarray + + valid_x: np.ndarray + valid_y: np.ndarray + + test_x: np.ndarray + test_y: np.ndarray + + +class SplitSpecification(pydantic.BaseModel): + """Contains the specification about the class prevalences + in each of training, validation, and test datasets. + + Each of the lists should be of length `n_classes` + and at `y`th position the required number of instances + in the given dataset should be given. + """ + + train: List[int] + valid: List[int] + test: List[int] + + +def _calculate_number_required(specification: SplitSpecification, label: int) -> int: + """Calculates the required number of instances of label `label` + according to `specification`.""" + return ( + specification.train[label] + + specification.valid[label] + + specification.test[label] + ) + + +def split_dataset( + dataset: IDataset, specification: SplitSpecification, random_seed: int +) -> SplitDataset: + """Splits `dataset` according to `specification`.""" + n_labels = n_classes(dataset) + + if set(np.unique(dataset.target)) != set(range(n_labels)): + raise ValueError( + f"Labels must be 0-indexed integers: {dataset.target_names} != " + f"{set(range(n_labels))}." + ) + if { + len(specification.train), + len(specification.valid), + len(specification.train), + } != {n_labels}: + raise ValueError("Wrong length of the specification.") + + rng = np.random.default_rng(random_seed) + + train_indices: List[np.ndarray] = [] + valid_indices: List[np.ndarray] = [] + test_indices: List[np.ndarray] = [] + + for label in range(n_labels): + # Take the index of the points in `dataset` corresponding to the `label` + # Shape (n_points_with_this_label,). + index: np.ndarray = np.asarray(dataset.target == label).nonzero()[0] + + n_required = _calculate_number_required( + specification=specification, label=label + ) + + if n_required > index.shape[0]: + raise ValueError( + f"According to specification one needs {n_required} data points " + f"of label {label}," + f"but only {index.shape[0]} are available." 
+ ) + + # Shuffle index + shuffled_index = rng.permutation(index) + + # Get the required data points from this data set + # according to the permuted index + n_train = specification.train[label] + n_valid = specification.valid[label] + n_test = specification.test[label] + + # Silence errors related to spacings around : + index_train = shuffled_index[:n_train] + index_valid = shuffled_index[n_train : n_train + n_valid] # noqa: E203 + index_test = shuffled_index[ + n_train + n_valid : n_train + n_valid + n_test # noqa: E203 + ] + + assert len(index_train) == n_train + assert len(index_valid) == n_valid + assert len(index_test) == n_test + + train_indices.append(index_train) + valid_indices.append(index_valid) + test_indices.append(index_test) + + train_index = rng.permutation(np.hstack(train_indices)) + valid_index = rng.permutation(np.hstack(valid_indices)) + test_index = rng.permutation(np.hstack(test_indices)) + + return SplitDataset( + train_x=dataset.data[train_index, ...], + train_y=dataset.target[train_index], + valid_x=dataset.data[valid_index, ...], + valid_y=dataset.target[valid_index], + test_x=dataset.data[test_index, ...], + test_y=dataset.target[test_index], + ) diff --git a/labelshift/experiments/__init__.py b/labelshift/experiments/__init__.py new file mode 100644 index 0000000..73a5018 --- /dev/null +++ b/labelshift/experiments/__init__.py @@ -0,0 +1,5 @@ +"""Subpackage with experimental utilities and designs. + +Note: + This file should be empty. For the API see the `api.py` submodule. +""" diff --git a/labelshift/experiments/api.py b/labelshift/experiments/api.py new file mode 100644 index 0000000..4dab1ad --- /dev/null +++ b/labelshift/experiments/api.py @@ -0,0 +1,22 @@ +"""The experimental utilities.""" +from typing import TypeVar, Optional + +from labelshift.experiments.timer import Timer +from labelshift.experiments.names import generate_name + +_T = TypeVar("_T") + + +def calculate_value(*, overwrite: Optional[_T], default: _T) -> _T: + if overwrite is None: + return default + else: + return overwrite + + +__all__ = [ + "Timer", + "calculate_value", + "generate_name", + "calculate_value", +] diff --git a/labelshift/experiments/names.py b/labelshift/experiments/names.py new file mode 100644 index 0000000..2d24ae6 --- /dev/null +++ b/labelshift/experiments/names.py @@ -0,0 +1,12 @@ +"""Utilities for dealing with filesystem IO.""" +import petname +from datetime import datetime + + +def generate_name() -> str: + """Generates a name with timestamp and a random part.""" + + now = datetime.now() # current date and time + date_time = now.strftime("%Y%m%d-%H%M%S") + suffix = petname.generate(separator="-", words=3) + return f"{date_time}-{suffix}" diff --git a/labelshift/timer.py b/labelshift/experiments/timer.py similarity index 100% rename from labelshift/timer.py rename to labelshift/experiments/timer.py diff --git a/labelshift/partition.py b/labelshift/partition.py new file mode 100644 index 0000000..38cb61a --- /dev/null +++ b/labelshift/partition.py @@ -0,0 +1,129 @@ +"""Partition of the real line into intervals.""" +from typing import List, Sequence, Tuple + +import numpy as np +from numpy.typing import ArrayLike +from scipy import stats + +_INFINITY = np.inf + + +class RealLinePartition: + """Partitions the real line into K (disjoint) intervals.""" + + def __init__(self, breakpoints: Sequence[float]) -> None: + """ + + Args: + breakpoints: points a_1, ..., a_{K-1} determine + the partition into K intervals + (-oo, a_1), [a_1, a_2), [a_2, a_3), ..., [a_{K-1}, +oo) + 
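Usage note (not part of the patch): a sketch of the new dataset-splitting utility; it mirrors `tests/datasets/test_split.py` below and works with any scikit-learn-style dataset exposing `.data` and `.target`.

    from sklearn import datasets
    import labelshift.datasets.split as split

    dataset = datasets.load_digits(n_class=2)
    spec = split.SplitSpecification(train=[50, 50], valid=[30, 30], test=[20, 60])
    parts = split.split_dataset(dataset=dataset, specification=spec, random_seed=0)
    # parts.train_x holds 100 examples; parts.test_y realises the shifted
    # label prevalence (20 vs 60) requested for the "unlabeled" test split.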
""" + breakpoints = list(breakpoints) + # Numer of intervals we want + K = len(breakpoints) + 1 + + # Make sure that the points are in ascending order + # and that no two values are equal. + # Note that we want k = 0, ... + # and k + 1 = 1, ..., K-2, + # as we have K-1 breakpoints + # Hence, we want to have k = 0, ..., K-3 + for k in range(K - 2): + assert breakpoints[k] < breakpoints[k + 1] + + self._breakpoints = np.asarray(breakpoints) + + self._intervals: List[Tuple[float, float]] = ( + [(-_INFINITY, breakpoints[0])] + + [(breakpoints[k], breakpoints[k + 1]) for k in range(K - 2)] + + [(breakpoints[-1], _INFINITY)] + ) + + assert ( + len(self._intervals) == K + ), f"Expected {K} intervals, but got the following: {self._intervals}." + + def __len__(self) -> int: + """Number of the intervals, K.""" + return len(self._intervals) + + def predict(self, X: np.ndarray) -> np.ndarray: + """Predicts the labels of points in `X`. + + Args: + X: points on the real line, shape (n, 1) or (n,) + + Returns: + array of integers valued in {0, 1, ..., K-1} with the + interval number, for each point in `X` + """ + X = np.asarray(X).ravel() # Shape (n,) + return np.asarray([self._predict_point(x) for x in X], dtype=int) + + def _predict_point(self, x: float) -> int: + """Predict the label (interval) for point `x`.""" + for k, (a, b) in enumerate(self._intervals): + if a <= x < b: + return k + raise ValueError(f"No interval containing {x}.") + + def interval(self, k: int) -> Tuple[float, float]: + """Returns the ends of the `k`th interval. + + Args: + k: should be valued between 0 and K-1. + + Raises: + IndexError, if k >= K (is out of range) + """ + return self._intervals[k] + + @property + def breakpoints(self, include_inf: bool = False) -> np.ndarray: + """Returns the breakpoints. + + Args: + include_inf: whether the infinities should be returned or not + + Returns: + breakpoints. Shape (K-1,) if `include_inf` is False + and (K+1,) if `include_inf` is True + """ + if not include_inf: + return self._breakpoints.copy() + else: + return np.asarray([-_INFINITY] + self._breakpoints.tolist() + [_INFINITY]) + + +def gaussian_probability_masses( + means: ArrayLike, sigmas: ArrayLike, partition: RealLinePartition +) -> np.ndarray: + """ + Args: + means: shape (L,) + sigmas: shape (L,) + partition: partition into K intervals + + Returns: + matrix P(C|Y), shape (L, K). 
The (l,k)th entry is the probability mass + of the `l`th Gaussian contained in the `k`th interval + """ + means = np.asarray(means) + sigmas = np.asarray(sigmas) + L = len(means) + assert means.shape == sigmas.shape == (L,) + + K = len(partition) + p_c_cond_y = np.zeros((L, K)) + + for l in range(L): # noqa: E741 + for k in range(K): + mu, sigma = means[l], sigmas[l] + + a, b = partition.interval(k) + p_c_cond_y[l, k] = stats.norm.cdf(b, loc=mu, scale=sigma) - stats.norm.cdf( + a, loc=mu, scale=sigma + ) + + return p_c_cond_y diff --git a/labelshift/scoring.py b/labelshift/scoring.py index c4fa163..6129120 100644 --- a/labelshift/scoring.py +++ b/labelshift/scoring.py @@ -206,3 +206,9 @@ def __init__(self) -> None: def _calculate_error(self, true: np.ndarray, estimated: np.ndarray) -> float: return self._kl.error(true, estimated) + self._kl.error(estimated, true) + + +class FisherRaoDistance(MulticlassQuantificationError): + def error(self, true: ArrayLike, estimated: ArrayLike) -> float: + bhattacharyya_coefficient = np.dot(np.sqrt(true), np.sqrt(estimated)) + return 2 * np.arccos(bhattacharyya_coefficient) diff --git a/requirements.txt b/requirements.txt index df3a1bc..0aa9708 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ arviz numpy +petname pydantic scikit-learn scipy - # Code quality tools black flake8 diff --git a/scripts/design_categorical.py b/scripts/design_categorical.py new file mode 100644 index 0000000..6e0ce90 --- /dev/null +++ b/scripts/design_categorical.py @@ -0,0 +1,156 @@ +"""Experimental design for the categorical experiment. + +Use it to generate a list of commands to be run.""" +from pathlib import Path +from typing import Optional + +DIRECTORY = Path("data/generated/categorical_experiment") + +ESTIMATOR_CONFIGURATIONS = { + "MAP-1": "--algorithm MAP --bayesian-alpha 1", + "MAP-2": "--algorithm MAP --bayesian-alpha 2", + "CC": "--algorithm CC", + "IR": "--algorithm IR --restricted true", + "BBSE": "--algorithm BBSE", +} + +N_SEEDS: int = 2 + +N_LABELED: int = 1_000 +N_UNLABELED: int = 500 +QUALITY_LABELED: float = 0.85 +PI_UNLABELED: float = 0.7 +L: int = 5 +K: int = 5 + + +def command( + estimator_key: str, + seed: int, + output_dir: Path, + n_y: int = L, + n_c: int = K, + n_labeled: int = N_LABELED, + n_unlabeled: int = N_UNLABELED, + quality_labeled: float = QUALITY_LABELED, + quality_unlabeled: Optional[float] = None, + pi_unlabeled: float = PI_UNLABELED, +) -> str: + estimator_args = ESTIMATOR_CONFIGURATIONS[estimator_key] + + quality_unlabeled_str = ( + "" if quality_unlabeled is None else f"--quality-unlabeled {quality_unlabeled}" + ) + + print( + f"python scripts/run_categorical.py " + f"--n-labeled {n_labeled} --n-unlabeled {n_unlabeled} " + f"--quality {quality_labeled} {quality_unlabeled_str} " + f"--prevalence-unlabeled {pi_unlabeled} " + f"--seed {seed} " + f"--output-dir {output_dir} " + f"--K {n_y} --L {n_c} " + f"--tag {estimator_key} {estimator_args}" + ) + + +def experiment_change_prevalence() -> None: + """Fix L = K = 5 and change pi'_1.""" + for seed in range(N_SEEDS): + for pi_unlabeled in [0.5, 0.6, 0.7, 0.8, 0.9]: + for algorithm in ESTIMATOR_CONFIGURATIONS.keys(): + output_dir = ( + DIRECTORY / "change_prevalence" / f"{algorithm}-{pi_unlabeled}" + ) + command( + output_dir=output_dir, + pi_unlabeled=pi_unlabeled, + seed=seed, + estimator_key=algorithm, + ) + + +def experiment_change_n_unlabeled() -> None: + """Change N'.""" + for seed in range(N_SEEDS): + for n_unlabeled in [10, 50, 100, 500, 1000, 10000]: + for algorithm 
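Sanity-check sketch (not part of the patch) for the Fisher-Rao distance added to `labelshift/scoring.py`: on the probability simplex it equals twice the arccosine of the Bhattacharyya coefficient, so identical distributions give 0 and disjoint ones give pi.

    import numpy as np

    def fisher_rao(true, estimated):
        # Same formula as FisherRaoDistance.error above.
        return 2 * np.arccos(np.dot(np.sqrt(true), np.sqrt(estimated)))

    fisher_rao([1.0, 0.0], [1.0, 0.0])   # 0.0 for identical distributions
    fisher_rao([1.0, 0.0], [0.0, 1.0])   # pi (~3.1416) for disjoint distributions
    # Note: floating-point error can push the dot product slightly above 1 for
    # near-identical inputs, so clipping it to [0, 1] before arccos may be worth considering.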
in ESTIMATOR_CONFIGURATIONS.keys(): + output_dir = ( + DIRECTORY / "change_n_unlabeled" / f"{algorithm}-{n_unlabeled}" + ) + command( + n_unlabeled=n_unlabeled, + seed=seed, + estimator_key=algorithm, + output_dir=output_dir, + ) + + +def experiment_change_k() -> None: + """Change K, keeping L fixed.""" + for seed in range(N_SEEDS): + for n_c in [2, 3, 5, 7, 9]: + for algorithm in ESTIMATOR_CONFIGURATIONS.keys(): + output_dir = DIRECTORY / "change_k" / f"{algorithm}-{n_c}" + command( + seed=seed, + output_dir=output_dir, + estimator_key=algorithm, + n_c=n_c, + ) + + +def experiment_change_jointly_l_and_k() -> None: + """Jointly change L = K.""" + for seed in range(N_SEEDS): + for lk in [2, 3, 5, 7, 9, 10]: + for algorithm in ESTIMATOR_CONFIGURATIONS.keys(): + output_dir = DIRECTORY / "change_jointly_lk" / f"{algorithm}-{lk}" + command( + seed=seed, + estimator_key=algorithm, + output_dir=output_dir, + n_c=lk, + n_y=lk, + ) + + +def experiment_change_quality() -> None: + """Change quality.""" + for seed in range(N_SEEDS): + for quality in [0.55, 0.65, 0.75, 0.85, 0.95]: + for algorithm in ESTIMATOR_CONFIGURATIONS.keys(): + output_dir = DIRECTORY / "change_quality" / f"{algorithm}-{quality}" + command( + quality_labeled=quality, + seed=seed, + estimator_key=algorithm, + output_dir=output_dir, + ) + + +def experiment_misspecified() -> None: + """Change quality in the unlabeled population, so that the model is misspecified.""" + for seed in range(N_SEEDS): + for quality_prime in [0.45, 0.55, 0.65, 0.75, 0.80, 0.85, 0.90, 0.95]: + for algorithm in ESTIMATOR_CONFIGURATIONS.keys(): + output_dir = DIRECTORY / "misspecified" / f"{algorithm}-{quality_prime}" + command( + quality_unlabeled=quality_prime, + seed=seed, + output_dir=output_dir, + estimator_key=algorithm, + ) + + +def main() -> None: + experiment_change_prevalence() + experiment_change_n_unlabeled() + experiment_change_quality() + experiment_change_jointly_l_and_k() + experiment_change_k() + experiment_misspecified() + + +if __name__ == "__main__": + main() diff --git a/scripts/experiment1/1-1.py b/scripts/experiment1/1-1.py new file mode 100644 index 0000000..faf0d71 --- /dev/null +++ b/scripts/experiment1/1-1.py @@ -0,0 +1,32 @@ +""" +Fixed q = 0.85, N' = 500 and changed the prevalence π'1 in range {0.5, 0.6, 0.7, 0.8, 0.9}. 
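For orientation (not part of the patch): the first command printed by `experiment_change_prevalence()` in `scripts/design_categorical.py` (seed 0, unlabeled prevalence 0.5, the "MAP-1" configuration) should look roughly like:

    python scripts/run_categorical.py --n-labeled 1000 --n-unlabeled 500 \
        --quality 0.85 --prevalence-unlabeled 0.5 --seed 0 \
        --output-dir data/generated/categorical_experiment/change_prevalence/MAP-1-0.5 \
        --K 5 --L 5 --tag MAP-1 --algorithm MAP --bayesian-alpha 1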
+""" +algorithms = [ + "ClassifyAndCount", + "RatioEstimator", + "BlackBoxShiftEstimator", + "BayesianMAP", +] + + +def main() -> None: + n_labeled = 1000 + n_unlabeled = 500 + quality = 0.85 + n_seeds = 30 + + pi_labeled = 0.5 + + for pi_unlabeled in [0.5, 0.6, 0.7, 0.8, 0.9]: + for seed in range(n_seeds): + for algorithm in algorithms: + try: + output_dir = f"experiment1-1/{algorithm}" + command = f"python scripts/experiment_categorical.py --n-labeled {n_labeled} --n-unlabeled {n_unlabeled} --quality {quality} --pi-labeled {pi_labeled} --pi-unlabeled {pi_unlabeled} --seed {seed} --algorithm {algorithm} --output-dir {output_dir}" + print(command) + except Exception as e: + print(e) + + +if __name__ == "__main__": + main() diff --git a/scripts/experiment1/1-2.py b/scripts/experiment1/1-2.py new file mode 100644 index 0000000..9a924d0 --- /dev/null +++ b/scripts/experiment1/1-2.py @@ -0,0 +1,33 @@ +""" +Fixed q = 0.85, π′1 = 0.7 and changed N' in range +{10, 50, 100, 500, 1000, 10000} +""" +algorithms = [ + "ClassifyAndCount", + "RatioEstimator", + "BlackBoxShiftEstimator", + "BayesianMAP", +] + + +def main() -> None: + n_labeled = 1000 + quality = 0.85 + n_seeds = 30 + + pi_unlabeled = 0.7 + pi_labeled = 0.5 + + for n_unlabeled in [10, 50, 100, 500, 1000, 10000]: + for seed in range(n_seeds): + for algorithm in algorithms: + try: + output_dir = f"experiment1-2/{algorithm}" + command = f"python scripts/experiment_categorical.py --n-labeled {n_labeled} --n-unlabeled {n_unlabeled} --quality {quality} --pi-labeled {pi_labeled} --pi-unlabeled {pi_unlabeled} --seed {seed} --algorithm {algorithm} --output-dir {output_dir}" + print(command) + except Exception as e: + print(e) + + +if __name__ == "__main__": + main() diff --git a/scripts/experiment1/1-3.py b/scripts/experiment1/1-3.py new file mode 100644 index 0000000..3389f8b --- /dev/null +++ b/scripts/experiment1/1-3.py @@ -0,0 +1,33 @@ +""" +Fixed π′1 = 0.7, N'= 500 and changed q in range +{0.55, 0.65, 0.75, 0.85, 0.95} +""" +algorithms = [ + "ClassifyAndCount", + "RatioEstimator", + "BlackBoxShiftEstimator", + "BayesianMAP", +] + + +def main() -> None: + n_labeled = 1000 + n_unlabeled = 500 + n_seeds = 30 + + pi_unlabeled = 0.7 + pi_labeled = 0.5 + + for quality in [0.55, 0.65, 0.75, 0.85, 0.95]: + for seed in range(n_seeds): + for algorithm in algorithms: + try: + output_dir = f"experiment1-3/{algorithm}" + command = f"python scripts/experiment_categorical.py --n-labeled {n_labeled} --n-unlabeled {n_unlabeled} --quality {quality} --pi-labeled {pi_labeled} --pi-unlabeled {pi_unlabeled} --seed {seed} --algorithm {algorithm} --output-dir {output_dir}" + print(command) + except Exception as e: + print(e) + + +if __name__ == "__main__": + main() diff --git a/scripts/experiment1/plot_figure.py b/scripts/experiment1/plot_figure.py new file mode 100644 index 0000000..d0aa63a --- /dev/null +++ b/scripts/experiment1/plot_figure.py @@ -0,0 +1,97 @@ +import json +import string +from pathlib import Path + +import seaborn as sns +import matplotlib.pyplot as plt +import pandas as pd + + +rename_dict = { + "ClassifyAndCount": "CC", + "RatioEstimator": "IR", + "BlackBoxShiftEstimator": "BBSE", + "BayesianMAP": "MAP", +} + +hue_order = [ + "CC", + "IR", + "BBSE", + "MAP", +] + + +def file_to_row(file): + with open(file) as f: + x = json.load(f) + return { + "Algorithm": rename_dict[x["algorithm"]], + "true": x["true"][0], + "estimated": x["estimated"][0], + "quality": x["sampler"]["p_c_cond_y"][0][0], + "n_labeled": x["sampler"]["n_labeled"], + "n_unlabeled": 
x["sampler"]["n_unlabeled"], + } + + +def experiment_directory_to_dataframe(experiment_directory) -> pd.DataFrame: + files = list( + Path(experiment_directory).rglob( + "*.json", + ) + ) + df = pd.DataFrame([file_to_row(f) for f in files]) + df["error"] = df["estimated"] - df["true"] + return df + + +def main() -> None: + fig, axs = plt.subplots(3, 1, figsize=(4, 12), sharey=False) + + experiment1 = "experiment1-1" + df1 = experiment_directory_to_dataframe(experiment1) + sns.boxplot( + df1, x="true", y="error", hue="Algorithm", ax=axs[0], hue_order=hue_order + ) + axs[0].set_xlabel(r"Prevalence $\pi'_1$") + axs[0].set_ylabel(r"Signed difference $\hat \pi'_1 - \pi'_1$") + + experiment2 = "experiment1-2" + df2 = experiment_directory_to_dataframe(experiment2) + sns.boxplot( + df2, x="n_unlabeled", y="error", hue="Algorithm", ax=axs[1], hue_order=hue_order + ) + + axs[1].set_xlabel(r"Unlabeled data set size $N'$") + axs[1].set_ylabel(r"Signed difference $\hat \pi'_1 - \pi'_1$") + axs[1].legend([], [], frameon=False) + + experiment3 = "experiment1-3" + df3 = experiment_directory_to_dataframe(experiment3) + sns.boxplot( + df3, x="quality", y="error", hue="Algorithm", ax=axs[2], hue_order=hue_order + ) + + axs[2].set_xlabel(r"Classifier quality $q$") + axs[2].set_ylabel(r"Signed difference $\hat \pi'_1 - \pi'_1$") + axs[2].legend([], [], frameon=False) + + for n, ax in enumerate(axs): + ax.text( + -0.1, + 1.1, + string.ascii_uppercase[n], + transform=ax.transAxes, + size=20, + weight="bold", + ) + + sns.move_legend(axs[0], "lower left") # , bbox_to_anchor=(1, 1)) + + fig.tight_layout() + fig.savefig("experiment1.pdf") + + +if __name__ == "__main__": + main() diff --git a/scripts/experiment_external_dataset.py b/scripts/experiment_external_dataset.py new file mode 100644 index 0000000..a5a892c --- /dev/null +++ b/scripts/experiment_external_dataset.py @@ -0,0 +1,134 @@ +import enum + +import numpy as np +import sklearn.datasets +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression + +import labelshift.datasets.split as split +import labelshift.summary_statistic as summ + +import labelshift.algorithms.api as algos +import labelshift.algorithms.ratio_estimator as re +from labelshift.algorithms.expectation_maximization import expectation_maximization + + +class Algorithm(enum.Enum): + EM = "ExpectationMaximization" + CC = "ClassifyAndCount" + BBSE_HARD = "BBSE-Hard" + RATIO_HARD = "InvariantRatio-Hard" + BAYESIAN = "Bayesian-MAP" + RATIO_SOFT = "InvariantRatio-Soft" + + +def get_estimate( + algorithm: Algorithm, + n_y_c_labeled: np.ndarray, + n_c_unlabeled: np.ndarray, + y_labeled: np.ndarray, + prob_c_labeled: np.ndarray, + prob_c_unlabeled: np.ndarray, + labeled_prevalence: np.ndarray, +) -> np.ndarray: + """Function running the (point) prevalence estimator. 
+ + Args: + algorithm: estimator + n_y_c_labeled: matrix with counts of predictions and true values, shape (L, K) + n_c_unlabeled: vector with prediction counts on unlabeled data set, shape (K,) + y_labeled: true labels in the labeled data set, shape (N,) + prob_c_labeled: predictions of the classifier on the labeled data set, shape (N, K) + prob_c_unlabeled: predictions of the classifier on the unlabeled data set, shape (N', K) + labeled_prevalence: prevalence vector on the labeled distribution, shape (L,) + """ + summary_statistic = algos.SummaryStatistic( + n_y_labeled=None, n_y_and_c_labeled=n_y_c_labeled, n_c_unlabeled=n_c_unlabeled + ) + + if algorithm == Algorithm.EM: + return expectation_maximization( + predictions=prob_c_unlabeled, training_prevalences=labeled_prevalence + ) + elif algorithm == Algorithm.CC: + return algos.ClassifyAndCount().estimate_from_summary_statistic( + summary_statistic + ) + elif algorithm == Algorithm.BBSE_HARD: + return algos.BlackBoxShiftEstimator( + p_y_labeled=labeled_prevalence + ).estimate_from_summary_statistic(summary_statistic) + elif algorithm == Algorithm.RATIO_HARD: + return algos.InvariantRatioEstimator( + restricted=True + ).estimate_from_summary_statistic(summary_statistic) + elif algorithm == Algorithm.BAYESIAN: + return algos.DiscreteCategoricalMAPEstimator().estimate_from_summary_statistic( + summary_statistic + ) + elif algorithm == Algorithm.RATIO_SOFT: + return re.calculate_vector_and_matrix_from_predictions( + unlabeled_predictions=prob_c_unlabeled, + labeled_predictions=prob_c_labeled, + labeled_ground_truth=y_labeled, + ) + else: + raise ValueError(f"Algorithm {algorithm} not recognized.") + + +def main() -> None: + L = 2 + K = L + dataset = sklearn.datasets.load_breast_cancer() + print(len(dataset.target)) + + random_seed: int = 22 + n_training_examples: int = 200 + n_labeled_examples: int = 100 + n_unlabeled_examples: int = 150 + prevalence_labeled: np.ndarray = np.ones(2) / 2 + prevalence_unlabeled: np.ndarray = np.asarray([0.3, 0.7]) + + specification = split.SplitSpecification( + train=np.asarray(prevalence_labeled * n_training_examples, dtype=int).tolist(), + valid=np.asarray(prevalence_labeled * n_labeled_examples, dtype=int).tolist(), + test=np.asarray( + prevalence_unlabeled * n_unlabeled_examples, dtype=int + ).tolist(), + ) + + datasets = split.split_dataset( + dataset=dataset, specification=specification, random_seed=random_seed + ) + + classifier = DecisionTreeClassifier(random_state=random_seed + 1) + classifier = RandomForestClassifier(random_state=random_seed + 1) + classifier = LogisticRegression(random_state=random_seed + 1) + classifier.fit(datasets.train_x, datasets.train_y) + + # The count values + n_y_c_labeled = summ.count_values_joint( + L, K, datasets.valid_y, classifier.predict(datasets.valid_x) + ) + n_c_unlabeled = summ.count_values(K, classifier.predict(datasets.test_x)) + + labeled_probabilities = classifier.predict_proba(datasets.valid_x) + unlabeled_probabilities = classifier.predict_proba(datasets.test_x) + + for alg in Algorithm: + print(alg) + estimate = get_estimate( + algorithm=alg, + n_y_c_labeled=n_y_c_labeled, + n_c_unlabeled=n_c_unlabeled, + y_labeled=datasets.valid_y, + prob_c_labeled=labeled_probabilities, + prob_c_unlabeled=unlabeled_probabilities, + labeled_prevalence=prevalence_labeled, + ) + print(estimate) + + +if __name__ == "__main__": + main() diff --git a/scripts/experiment_external_dataset2.py b/scripts/experiment_external_dataset2.py new file mode 100644 index 0000000..e143fef 
--- /dev/null +++ b/scripts/experiment_external_dataset2.py @@ -0,0 +1,179 @@ +import enum + +import arviz as az +import matplotlib.pyplot as plt +import numpy as np +import pymc as pm +import sklearn.datasets +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression + +import labelshift.datasets.split as split +import labelshift.summary_statistic as summ + +import labelshift.algorithms.api as algos +import labelshift.algorithms.ratio_estimator as re +import labelshift.algorithms.bayesian_discrete as bay +from labelshift.algorithms.expectation_maximization import expectation_maximization + +plt.rcParams.update({"font.size": 14}) + + +class Algorithm(enum.Enum): + EM = "EM" + CC = "CC" + BBSE_HARD = "BBSE" + RATIO_HARD = "IR: hard" + RATIO_SOFT = "IR: soft" + + +def get_estimate( + algorithm: Algorithm, + n_y_c_labeled: np.ndarray, + n_c_unlabeled: np.ndarray, + y_labeled: np.ndarray, + prob_c_labeled: np.ndarray, + prob_c_unlabeled: np.ndarray, + labeled_prevalence: np.ndarray, +) -> np.ndarray: + """Function running the (point) prevalence estimator. + + Args: + algorithm: estimator + n_y_c_labeled: matrix with counts of predictions and true values, shape (L, K) + n_c_unlabeled: vector with prediction counts on unlabeled data set, shape (K,) + y_labeled: true labels in the labeled data set, shape (N,) + prob_c_labeled: predictions of the classifier on the labeled data set, shape (N, K) + prob_c_unlabeled: predictions of the classifier on the unlabeled data set, shape (N', K) + labeled_prevalence: prevalence vector on the labeled distribution, shape (L,) + """ + summary_statistic = algos.SummaryStatistic( + n_y_labeled=None, n_y_and_c_labeled=n_y_c_labeled, n_c_unlabeled=n_c_unlabeled + ) + + if algorithm == Algorithm.EM: + return expectation_maximization( + predictions=prob_c_unlabeled, training_prevalences=labeled_prevalence + ) + elif algorithm == Algorithm.CC: + return algos.ClassifyAndCount().estimate_from_summary_statistic( + summary_statistic + ) + elif algorithm == Algorithm.BBSE_HARD: + return algos.BlackBoxShiftEstimator( + p_y_labeled=labeled_prevalence + ).estimate_from_summary_statistic(summary_statistic) + elif algorithm == Algorithm.RATIO_HARD: + return algos.InvariantRatioEstimator( + restricted=True + ).estimate_from_summary_statistic(summary_statistic) + elif algorithm == Algorithm.RATIO_SOFT: + return re.calculate_vector_and_matrix_from_predictions( + unlabeled_predictions=prob_c_unlabeled, + labeled_predictions=prob_c_labeled, + labeled_ground_truth=y_labeled, + ) + else: + raise ValueError(f"Algorithm {algorithm} not recognized.") + + +def main() -> None: + L = 2 + K = L + dataset = sklearn.datasets.load_breast_cancer() + print(len(dataset.target)) + + ymax: float = 7.0 + random_seed: int = 20 + n_training_examples: int = 200 + n_labeled_examples: int = 100 + n_unlabeled_examples: int = 150 + prevalence_labeled: np.ndarray = np.ones(2) / 2 + prevalence_unlabeled: np.ndarray = np.asarray([0.3, 0.7]) + + specification = split.SplitSpecification( + train=np.asarray(prevalence_labeled * n_training_examples, dtype=int).tolist(), + valid=np.asarray(prevalence_labeled * n_labeled_examples, dtype=int).tolist(), + test=np.asarray( + prevalence_unlabeled * n_unlabeled_examples, dtype=int + ).tolist(), + ) + + datasets = split.split_dataset( + dataset=dataset, specification=specification, random_seed=random_seed + ) + + # classifier = DecisionTreeClassifier(random_state=random_seed + 
1) + classifier = RandomForestClassifier(random_state=random_seed + 1) + # classifier = LogisticRegression(random_state=random_seed + 1) + classifier.fit(datasets.train_x, datasets.train_y) + + # The count values + n_y_c_labeled = summ.count_values_joint( + L, K, datasets.valid_y, classifier.predict(datasets.valid_x) + ) + n_c_unlabeled = summ.count_values(K, classifier.predict(datasets.test_x)) + + labeled_probabilities = classifier.predict_proba(datasets.valid_x) + unlabeled_probabilities = classifier.predict_proba(datasets.test_x) + + with bay.build_model( + n_y_and_c_labeled=n_y_c_labeled, + n_c_unlabeled=n_c_unlabeled, + ): + idata = pm.sample() + + fig, ax = plt.subplots(figsize=(6, 4)) + _, ax_trash = plt.subplots() + + az.plot_posterior(idata, ax=[ax, ax_trash], var_names=bay.P_TEST_Y) + ax.set_title(r"$\pi'_1$ posterior") + + ax.vlines( + x=prevalence_unlabeled[0], + ymin=0, + ymax=ymax, + label="Ground truth", + colors=["k"], + linestyles=["--"], + ) + + linestyles = [ + "dashdot", + (0, (1, 1)), + "solid", + "dashed", + (0, (3, 10, 1, 10)), + ] + + for i, alg in enumerate(Algorithm): + print(alg) + estimate = get_estimate( + algorithm=alg, + n_y_c_labeled=n_y_c_labeled, + n_c_unlabeled=n_c_unlabeled, + y_labeled=datasets.valid_y, + prob_c_labeled=labeled_probabilities, + prob_c_unlabeled=unlabeled_probabilities, + labeled_prevalence=prevalence_labeled, + ) + + ax.vlines( + estimate[0], + ymin=0, + ymax=ymax, + label=alg.value, + colors=[f"C{i+2}"], + linestyles=[linestyles[i]], + ) + + print(estimate) + + fig.legend() + fig.tight_layout() + fig.savefig("plot_cancer.pdf") + + +if __name__ == "__main__": + main() diff --git a/scripts/experiment_gaussian.py b/scripts/experiment_gaussian.py new file mode 100644 index 0000000..285980d --- /dev/null +++ b/scripts/experiment_gaussian.py @@ -0,0 +1,144 @@ +"""This experiment plots the posterior in the Gaussian mixture model as well +as a discretized version of that. 
+""" +import string +from typing import List + +import arviz as az +import matplotlib.pyplot as plt +import numpy as np +import pymc as pm +import seaborn as sns + +import labelshift.partition as part +import labelshift.summary_statistic as summ +import labelshift.algorithms.bayesian_discrete as discrete + + +plt.rcParams.update({"font.size": 22}) + + +def plot_distributions( + ax: plt.Axes, + X: np.ndarray, + X1: np.ndarray, + breakpoints: np.ndarray, + height: float = 1.0, +) -> None: + """ + + Args: + ax: axes where to draw the plot + X: points from the labeled distribution, shape (n_labeled,) + X1: points from the unlabeled distribution, shape (n_unlabeled,) + breakpoints: breakpoints to be plotted, shape (n_breakpoints,) + """ + sns.kdeplot(data=np.hstack(X), ax=ax) + sns.kdeplot(data=np.hstack(X1), ax=ax) + + for bp in breakpoints: + ax.axvline(bp, ymax=height, linestyle="--", c="k", alpha=0.5) + + +def gaussian_model( + labeled_data: List[np.ndarray], unlabeled_data: np.ndarray +) -> pm.Model: + """ + Args: + labeled_data: list of samples attributed to each Y: + [ + [a1, ..., a_n0], + [b1, ..., b_n1] + ] + unlabeled_data: array of shape (n_unlabeled,) + """ + with pm.Model() as model: + mu = pm.Normal("mu", mu=0, sigma=1, shape=2) + sigma = pm.HalfNormal("sigma", sigma=1, shape=2) + + for i in range(2): + pm.Normal( + f"X_labeled{i}", mu=mu[i], sigma=sigma[i], observed=labeled_data[i] + ) + + weights = pm.Dirichlet("P_unlabeled(Y)", np.ones(2)) + + pm.NormalMixture( + "X_unlabeled", w=weights, mu=mu, sigma=sigma, observed=unlabeled_data + ) + + return model + + +def main() -> None: + """The main method.""" + mus = [0.0, 1.0] + sigmas = [0.3, 0.4] + ns = [500, 500] + ns_ = [200, 800] + K = 7 + L = 2 + + partition = part.RealLinePartition(np.linspace(-0.5, 1.5, K - 1)) + print(partition.breakpoints) + + assert len(partition) == K + + rng = np.random.default_rng(42) + + X_stratified = [ + rng.normal(loc=mu, scale=sigma, size=n) for mu, sigma, n in zip(mus, sigmas, ns) + ] + X = np.hstack(X_stratified) + Y = np.hstack([[i] * n for i, n in enumerate(ns)]) + + C = partition.predict(X) + + X1_stratified = [ + rng.normal(loc=mu, scale=sigma, size=n_) + for mu, sigma, n_ in zip(mus, sigmas, ns_) + ] + X1 = np.hstack(X1_stratified) + C1 = partition.predict(X1) + + n_c_unlabeled = summ.count_values(K, C1) + n_y_c_labeled = summ.count_values_joint(L, K, Y, C) + + print(n_c_unlabeled) + print(n_y_c_labeled) + + fig, axs = plt.subplots(3, figsize=(6, 9)) + plot_distributions(ax=axs[0], X=X, X1=X1, breakpoints=partition.breakpoints) + + with gaussian_model(labeled_data=X_stratified, unlabeled_data=X1): + gaussian_data = pm.sample() + + _, ax_trash = plt.subplots() + + az.plot_posterior(gaussian_data, ax=[axs[1], ax_trash], var_names="P_unlabeled(Y)") + axs[1].set_title(r"$\pi'_1$ (Gaussian)") + + with discrete.build_model( + n_y_and_c_labeled=n_y_c_labeled, n_c_unlabeled=n_c_unlabeled + ): + discrete_data = pm.sample() + + az.plot_posterior(discrete_data, ax=[axs[2], ax_trash], var_names=discrete.P_TEST_Y) + axs[2].set_title(r"$\pi'_1$ (Discrete)") + + for n, ax in enumerate(axs): + ax.text( + -0.1, + 1.1, + string.ascii_uppercase[n], + transform=ax.transAxes, + size=20, + weight="bold", + ) + + fig.tight_layout() + fig.savefig("plot.pdf") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_categorical.py b/scripts/run_categorical.py new file mode 100644 index 0000000..496f58c --- /dev/null +++ b/scripts/run_categorical.py @@ -0,0 +1,256 @@ +"""Sample data directly from P(C|Y) 
distribution and run specified quantification estimator.""" +import argparse +import enum +from pathlib import Path +from typing import List + +import pydantic + +import labelshift.interfaces.point_estimators as pe +import labelshift.datasets.discrete_categorical as dc +import labelshift.algorithms.api as algo +import labelshift.experiments.api as exp + + +class Algorithm(enum.Enum): + CLASSIFY_AND_COUNT = "CC" + RATIO_ESTIMATOR = "IR" + BBSE = "BBSE" + BAYESIAN = "MAP" + + +def create_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + parser.add_argument( + "--n-labeled", type=int, default=1_000, help="Number of labeled examples." + ) + parser.add_argument( + "--n-unlabeled", type=int, default=1_000, help="Number of unlabeled examples." + ) + parser.add_argument( + "--quality", + type=float, + default=0.85, + help="Quality of the classifier on the labeled data.", + ) + parser.add_argument( + "--quality-unlabeled", + type=float, + default=None, + help="Quality of the classifier on the unlabeled data." + "Can be used to assess model misspecification. " + "If None, the quality will be the same for both labeled" + "and unlabeled data set (no misspecification).", + ) + parser.add_argument("--L", type=int, default=2, help="Number of classes L.") + parser.add_argument( + "--K", + type=int, + default=None, + help="Number of available predictions. Default: the same as L.", + ) + parser.add_argument( + "--prevalence-labeled", + type=float, + default=None, + help="Prevalence of the first class in the labeled data set. Default: 1/L (uniform).", + ) + parser.add_argument( + "--prevalence-unlabeled", + type=float, + default=None, + help="Prevalence of the first class in the unlabeled data set. Default: 1/L (uniform).", + ) + parser.add_argument( + "--seed", type=int, default=1, help="Random seed to sample the data." + ) + parser.add_argument("--algorithm", type=Algorithm, default=Algorithm.BAYESIAN) + parser.add_argument( + "--output", type=Path, default=Path(f"{exp.generate_name()}.json") + ) + parser.add_argument("--output-dir", type=Path, default=None) + + parser.add_argument( + "--bayesian-alpha", + type=float, + default=1.0, + help="Dirichlet prior specification for the Bayesian quantification.", + ) + parser.add_argument( + "--restricted", + type=bool, + default=True, + help="Whether to use restricted invariant ratio estimator.", + ) + + parser.add_argument( + "--tag", type=str, default="", help="Can be used to tag the run." 
+ ) + + parser.add_argument("--dry-run", action="store_true") + + return parser + + +class EstimatorArguments(pydantic.BaseModel): + bayesian_alpha: float + restricted: bool + + +class Arguments(pydantic.BaseModel): + p_y_labeled: pydantic.confloat(gt=0, lt=1) + p_y_unlabeled: pydantic.confloat(gt=0, lt=1) + + quality_labeled: pydantic.confloat(ge=0, le=1) + quality_unlabeled: pydantic.confloat(ge=0, le=1) + + n_y: pydantic.PositiveInt = pydantic.Field(description="Number of labels, L.") + n_c: pydantic.PositiveInt = pydantic.Field(description="Number of predictions, K.") + + n_labeled: pydantic.PositiveInt + n_unlabeled: pydantic.PositiveInt + + seed: int + + algorithm: Algorithm + tag: str + estimator_arguments: EstimatorArguments + + +def parse_args(args) -> Arguments: + n_y = args.L + n_c = exp.calculate_value(overwrite=args.K, default=n_y) + + quality_unlabeled = exp.calculate_value( + overwrite=args.quality_unlabeled, default=args.quality + ) + + p_y_labeled = exp.calculate_value( + overwrite=args.prevalence_labeled, default=1 / n_y + ) + p_y_unlabeled = exp.calculate_value( + overwrite=args.prevalence_unlabeled, default=1 / n_y + ) + + return Arguments( + p_y_labeled=p_y_labeled, + p_y_unlabeled=p_y_unlabeled, + quality_labeled=args.quality, + quality_unlabeled=quality_unlabeled, + n_y=n_y, + n_c=n_c, + seed=args.seed, + n_labeled=args.n_labeled, + n_unlabeled=args.n_unlabeled, + algorithm=args.algorithm, + tag=args.tag, + estimator_arguments=EstimatorArguments( + bayesian_alpha=args.bayesian_alpha, + restricted=args.restricted, + ), + ) + + +def create_sampler(args: Arguments) -> dc.DiscreteSampler: + L = args.n_y + p_y_labeled = dc.almost_eye(L, L, diagonal=args.p_y_labeled)[0, :] + p_y_unlabeled = dc.almost_eye(L, L, diagonal=args.p_y_unlabeled)[0, :] + + p_c_cond_y_labeled = dc.almost_eye( + y=L, + c=args.n_c, + diagonal=args.quality_labeled, + ) + p_c_cond_y_unlabeled = dc.almost_eye( + y=L, + c=args.n_c, + diagonal=args.quality_unlabeled, + ) + + return dc.discrete_sampler_factory( + p_y_labeled=p_y_labeled, + p_y_unlabeled=p_y_unlabeled, + p_c_cond_y_labeled=p_c_cond_y_labeled, + p_c_cond_y_unlabeled=p_c_cond_y_unlabeled, + ) + + +def get_estimator(args: Arguments) -> pe.SummaryStatisticPrevalenceEstimator: + if args.algorithm == Algorithm.CLASSIFY_AND_COUNT: + if args.n_c != args.n_y: + raise ValueError("For classify and count you need K = L.") + return algo.ClassifyAndCount() + elif args.algorithm == Algorithm.RATIO_ESTIMATOR: + return algo.InvariantRatioEstimator( + restricted=args.estimator_arguments.restricted, enforce_square=False + ) + elif args.algorithm == Algorithm.BBSE: + return algo.BlackBoxShiftEstimator(enforce_square=False) + elif args.algorithm == Algorithm.BAYESIAN: + return algo.DiscreteCategoricalMAPEstimator( + alpha_unlabeled=args.estimator_arguments.bayesian_alpha + ) + else: + raise ValueError(f"Algorithm {args.algorithm} not recognized.") + + +class Result(pydantic.BaseModel): + p_y_unlabeled_true: List[float] + p_y_unlabeled_estimate: List[float] + time: float + algorithm: Algorithm + + input_arguments: Arguments + + +def dry_run(args: Arguments) -> None: + print("-- Dry run --\nUsed settings:") + print(args) + print("Exiting...") + + +def main() -> None: + """The main function of the experiment.""" + raw_args = create_parser().parse_args() + args: Arguments = parse_args(raw_args) + + if raw_args.dry_run: + dry_run(args) + return + + sampler = create_sampler(args) + + summary_statistic = sampler.sample_summary_statistic( + n_labeled=args.n_labeled, + 
n_unlabeled=args.n_unlabeled, + seed=args.seed, + ) + + estimator = get_estimator(args) + timer = exp.Timer() + estimate = estimator.estimate_from_summary_statistic(summary_statistic) + elapsed_time = timer.check() + + result = Result( + algorithm=args.algorithm, + time=elapsed_time, + p_y_unlabeled_true=sampler.unlabeled.p_y.tolist(), + p_y_unlabeled_estimate=estimate.tolist(), + input_arguments=args, + ) + + if raw_args.output_dir is not None: + raw_args.output_dir.mkdir(exist_ok=True, parents=True) + output_path = raw_args.output_dir / raw_args.output + else: + output_path = raw_args.output + + with open(output_path, "w") as f: + f.write(result.json()) + + print(result) + print("Finished.") + + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg index 4aa5df7..e2b17dd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,6 +23,7 @@ setup_requires = install_requires = arviz numpy >= 1.12,<2 + petname pydantic pymc scikit-learn diff --git a/tests/algorithms/test_bayesian_discrete.py b/tests/algorithms/test_bayesian_discrete.py index 81d462a..e9e23ac 100644 --- a/tests/algorithms/test_bayesian_discrete.py +++ b/tests/algorithms/test_bayesian_discrete.py @@ -19,8 +19,10 @@ def test_right_values(n_labeled: int = 10_000, n_unlabeled: int = 10_000) -> Non ] ) - sampler = dc.DiscreteSampler( - p_y_labeled=[0.4, 0.6], p_y_unlabeled=p_y_unlabeled, p_c_cond_y=p_c_cond_y + sampler = dc.discrete_sampler_factory( + p_y_labeled=[0.4, 0.6], + p_y_unlabeled=p_y_unlabeled, + p_c_cond_y_labeled=p_c_cond_y, ) statistic = sampler.sample_summary_statistic( n_labeled=n_labeled, n_unlabeled=n_unlabeled, seed=111 diff --git a/tests/algorithms/test_bbse.py b/tests/algorithms/test_bbse.py index 75e0878..0c9e507 100644 --- a/tests/algorithms/test_bbse.py +++ b/tests/algorithms/test_bbse.py @@ -65,8 +65,10 @@ def test_bbse_from_sufficient_statistic( ] ) - sampler = dc.DiscreteSampler( - p_y_labeled=p_y_labeled, p_y_unlabeled=p_y_unlabeled, p_c_cond_y=p_c_cond_y + sampler = dc.discrete_sampler_factory( + p_y_labeled=p_y_labeled, + p_y_unlabeled=p_y_unlabeled, + p_c_cond_y_labeled=p_c_cond_y, ) statistic = sampler.sample_summary_statistic( n_labeled=n_labeled, n_unlabeled=n_unlabeled, seed=111 diff --git a/tests/algorithms/test_ratio_estimator.py b/tests/algorithms/test_ratio_estimator.py index c88c58b..a13a67a 100644 --- a/tests/algorithms/test_ratio_estimator.py +++ b/tests/algorithms/test_ratio_estimator.py @@ -27,8 +27,10 @@ def test_ratio_estimator_from_sufficient_statistic( ] ) - sampler = dc.DiscreteSampler( - p_y_labeled=p_y_labeled, p_y_unlabeled=p_y_unlabeled, p_c_cond_y=p_c_cond_y + sampler = dc.discrete_sampler_factory( + p_y_labeled=p_y_labeled, + p_y_unlabeled=p_y_unlabeled, + p_c_cond_y_labeled=p_c_cond_y, ) statistic = sampler.sample_summary_statistic( n_labeled=n_labeled, n_unlabeled=n_unlabeled, seed=111 diff --git a/tests/datasets/test_discrete_categorical.py b/tests/datasets/test_discrete_categorical.py index 1f30ec7..eb1f5d9 100644 --- a/tests/datasets/test_discrete_categorical.py +++ b/tests/datasets/test_discrete_categorical.py @@ -70,14 +70,14 @@ def test_p_y_labeled(self, n_c: int, n_labeled: int, p_y: Sequence[float]) -> No p_c_cond_y = rng.dirichlet(alpha=np.ones(n_c), size=len(p_y)) p_y_unlabeled = np.ones_like(p_y) / len(p_y) - sampler = dc.DiscreteSampler( + sampler = dc.discrete_sampler_factory( p_y_labeled=p_y, p_y_unlabeled=p_y_unlabeled, - p_c_cond_y=p_c_cond_y, + p_c_cond_y_labeled=p_c_cond_y, ) summary_stats = sampler.sample_summary_statistic( - n_labeled=n_labeled, 
n_unlabeled=1 + n_labeled=n_labeled, n_unlabeled=1, seed=12 ) assert summary_stats.n_y_labeled.sum() == n_labeled @@ -97,12 +97,14 @@ def test_p_c(self) -> None: [0.0, 1, 0], ] - sampler = dc.DiscreteSampler( - p_y_labeled=p_y_labeled, p_y_unlabeled=p_y_unlabeled, p_c_cond_y=p_c_cond_y + sampler = dc.discrete_sampler_factory( + p_y_labeled=p_y_labeled, + p_y_unlabeled=p_y_unlabeled, + p_c_cond_y_labeled=p_c_cond_y, ) - assert sampler.p_c_labeled == pytest.approx(np.array([0.1, 0.9, 0.0])) - assert sampler.p_c_unlabeled == pytest.approx(np.array([0.2, 0.8, 0.0])) + assert sampler.labeled.p_c == pytest.approx(np.array([0.1, 0.9, 0.0])) + assert sampler.unlabeled.p_c == pytest.approx(np.array([0.2, 0.8, 0.0])) @pytest.mark.parametrize("n_y", (2, 5)) @pytest.mark.parametrize("n_c", (2, 3)) @@ -114,15 +116,17 @@ def test_create_right_fields(self, n_c: int, n_y: int) -> None: p_y_unlabeled = rng.dirichlet(alpha=np.ones(n_y)) p_y_labeled = rng.dirichlet(alpha=np.ones(n_y)) - sampler = dc.DiscreteSampler( + sampler = dc.discrete_sampler_factory( p_y_labeled=p_y_labeled, p_y_unlabeled=p_y_unlabeled, - p_c_cond_y=p_c_cond_y, + p_c_cond_y_labeled=p_c_cond_y, ) - assert sampler.p_c_cond_y == pytest.approx(p_c_cond_y) - assert sampler.p_y_labeled == pytest.approx(p_y_labeled) - assert sampler.p_y_unlabeled == pytest.approx(p_y_unlabeled) + assert sampler.labeled.p_c_cond_y == pytest.approx(p_c_cond_y) + assert sampler.unlabeled.p_c_cond_y == pytest.approx(p_c_cond_y) + + assert sampler.labeled.p_y == pytest.approx(p_y_labeled) + assert sampler.unlabeled.p_y == pytest.approx(p_y_unlabeled) - assert sampler.p_c_labeled.shape == (n_c,) - assert sampler.p_c_unlabeled.shape == (n_c,) + assert sampler.labeled.p_c.shape == (n_c,) + assert sampler.unlabeled.p_c.shape == (n_c,) diff --git a/tests/datasets/test_split.py b/tests/datasets/test_split.py new file mode 100644 index 0000000..18ac1fb --- /dev/null +++ b/tests/datasets/test_split.py @@ -0,0 +1,37 @@ +"""Tests of `labelshift.datasets.split`.""" +import numpy as np +import pytest +from sklearn import datasets + +import labelshift.datasets.split as split + + +@pytest.mark.parametrize("seed_prevalence", range(5)) +@pytest.mark.parametrize("n_classes", [2, 10]) +def test_split_dataset( + seed_prevalence: int, n_classes: int, seed_split: int = 0 +) -> None: + """ + Args: + seed_prevalence: seed used to draw the prevalence vectors + """ + rng = np.random.default_rng(seed_prevalence) + dataset = datasets.load_digits(n_class=n_classes) + + spec = split.SplitSpecification( + train=rng.choice(range(4, 20), size=n_classes).tolist(), + valid=rng.choice(range(4, 20), size=n_classes).tolist(), + test=rng.choice(range(4, 20), size=n_classes).tolist(), + ) + + split_dataset = split.split_dataset( + dataset=dataset, specification=spec, random_seed=seed_split + ) + + assert len(split_dataset.train_x) == sum(spec.train) + assert len(split_dataset.valid_x) == sum(spec.valid) + assert len(split_dataset.test_x) == sum(spec.test) + + assert len(split_dataset.train_x) == len(split_dataset.train_y) + assert len(split_dataset.valid_x) == len(split_dataset.valid_y) + assert len(split_dataset.test_x) == len(split_dataset.test_y) diff --git a/tests/test_partition.py b/tests/test_partition.py new file mode 100644 index 0000000..6d44f44 --- /dev/null +++ b/tests/test_partition.py @@ -0,0 +1,81 @@ +"""Tests of the `partition` submodule.""" +import numpy as np +import pytest +from scipy import stats + +import labelshift.partition as part + + +class TestRealLinePartition: + """Tests of 
the partition class."""
+
+    @pytest.mark.parametrize("k", (2, 3, 10))
+    def test_length(self, k: int) -> None:
+        """Tests whether the length of the generated partition
+        is right.
+
+        Args:
+            k: the length of the partition
+        """
+        breakpoints = np.arange(k - 1)
+
+        partition = part.RealLinePartition(breakpoints)
+
+        assert len(partition) == k
+        for i in range(k):
+            a, b = partition.interval(i)
+            assert a < b
+
+        # Every index past the end of the partition should raise.
+        for i in range(k, k + 10):
+            with pytest.raises(IndexError):
+                partition.interval(i)
+
+    @pytest.mark.parametrize("a", [0.1, -0.5, 0.8])
+    def test_two_intervals(self, a: float) -> None:
+        """We use one breakpoint to split the real line
+        into two halves.
+
+        Args:
+            a: breakpoint
+        """
+        partition = part.RealLinePartition([a])
+
+        assert len(partition) == 2
+        assert partition.interval(0) == (-np.inf, a)
+        assert partition.interval(1) == (a, np.inf)
+
+        with pytest.raises(IndexError):
+            partition.interval(2)
+
+
+class TestGaussianProbabilityMasses:
+    """Tests of `gaussian_probability_masses` function."""
+
+    @pytest.mark.parametrize("mean", [0.1, 0.6, 10.0])
+    @pytest.mark.parametrize("sigma", [1.0, 2.0])
+    @pytest.mark.parametrize("a", [-0.3, 0.1, 1.0])
+    def test_one_gaussian_halves(self, mean: float, sigma: float, a: float) -> None:
+        """We have L = 1 and K = 2."""
+
+        result = part.gaussian_probability_masses(
+            means=[mean], sigmas=[sigma], partition=part.RealLinePartition([a])
+        )
+        assert result.shape == (1, 2)
+        assert result[0][0] == pytest.approx(stats.norm.cdf(a, loc=mean, scale=sigma))
+        assert result.sum() == pytest.approx(1.0)
+
+    @pytest.mark.parametrize("K", (2, 3, 10))
+    @pytest.mark.parametrize("L", (1, 2, 5))
+    def test_shape(self, K: int, L: int) -> None:
+        """Tests if the generated shape is right."""
+        partition = part.RealLinePartition(np.linspace(0, 1, K - 1))
+        assert len(partition) == K
+
+        means = np.linspace(-5, 5, L)
+        sigmas = np.linspace(0.1, 1.0, L)
+
+        output = part.gaussian_probability_masses(
+            means=means, sigmas=sigmas, partition=partition
+        )
+
+        assert output.shape == (L, K)
diff --git a/tests/test_timer.py b/tests/test_timer.py
index 2dbdbf5..b572143 100644
--- a/tests/test_timer.py
+++ b/tests/test_timer.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from labelshift.timer import Timer
+from labelshift.experiments.timer import Timer
 
 
 @pytest.mark.parametrize("t1", [0.1, 0.2])