From 83627ad817ac56b043459ff797db6861ba3c4672 Mon Sep 17 00:00:00 2001 From: Jordan Deklerk <111652310+jordandeklerk@users.noreply.github.com> Date: Thu, 24 Oct 2024 20:41:48 -0400 Subject: [PATCH] Complete overhaul of repo. Start modifying existing code to work with the scikit-learn API. --- .github/ISSUE_TEMPLATE/bug_report.md | 37 + .github/ISSUE_TEMPLATE/feature_request.md | 20 + .github/dependabot.yml | 21 + .github/pull_request_template.md | 22 + .github/workflows/ci.yml | 39 + .github/workflows/publish.yml | 29 + __init__.py => pygridge/__init__.py | 0 {src => pygridge/src}/__init__.py | 0 {src => pygridge/src}/bisection.py | 0 {src => pygridge/src}/blockridge.py | 364 ++++--- pygridge/src/covariance_design.py | 906 +++++++++++++++++ {src => pygridge/src}/group_lasso.py | 0 pygridge/src/groupedfeatures.py | 349 +++++++ .../src}/lambda_max_group_lasso.py | 0 {src => pygridge/src}/lambda_max_lasso.py | 0 .../src}/lambda_max_sparse_group_lasso.py | 2 +- {src => pygridge/src}/lasso.py | 0 {src => pygridge/src}/nnls.py | 0 {src => pygridge/src}/seagull.py | 12 +- {src => pygridge/src}/sparse_group_lasso.py | 0 {test => pygridge/test}/__init__.py | 0 {test => pygridge/test}/test_bisection.py | 0 .../test/test_blockridge.py | 139 +-- pygridge/test/test_covariance_design.py | 287 ++++++ {test => pygridge/test}/test_group_lasso.py | 0 pygridge/test/test_groupedfeatures.py | 192 ++++ .../test}/test_lambda_max_group_lasso.py | 0 .../test}/test_lambda_max_lasso.py | 0 .../test_lambda_max_sparse_group_lasso.py | 0 {test => pygridge/test}/test_lasso.py | 0 {test => pygridge/test}/test_nnls.py | 0 pygridge/test/test_seagull.py | 185 ++++ .../test}/test_sparse_group_lasso.py | 0 src/covariance_design.py | 914 ------------------ src/groupedfeatures.py | 297 ------ test/test_blockridge.py | 645 ------------ test/test_covariance.py | 668 ------------- test/test_groupfeatures.py | 310 ------ test/test_seagull.py | 181 ---- 39 files changed, 2408 insertions(+), 3211 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/dependabot.yml create mode 100644 .github/pull_request_template.md create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/publish.yml rename __init__.py => pygridge/__init__.py (100%) rename {src => pygridge/src}/__init__.py (100%) rename {src => pygridge/src}/bisection.py (100%) rename {src => pygridge/src}/blockridge.py (73%) create mode 100644 pygridge/src/covariance_design.py rename {src => pygridge/src}/group_lasso.py (100%) create mode 100644 pygridge/src/groupedfeatures.py rename {src => pygridge/src}/lambda_max_group_lasso.py (100%) rename {src => pygridge/src}/lambda_max_lasso.py (100%) rename {src => pygridge/src}/lambda_max_sparse_group_lasso.py (99%) rename {src => pygridge/src}/lasso.py (100%) rename {src => pygridge/src}/nnls.py (100%) rename {src => pygridge/src}/seagull.py (98%) rename {src => pygridge/src}/sparse_group_lasso.py (100%) rename {test => pygridge/test}/__init__.py (100%) rename {test => pygridge/test}/test_bisection.py (100%) rename test/test_blockridge_scikit.py => pygridge/test/test_blockridge.py (77%) create mode 100644 pygridge/test/test_covariance_design.py rename {test => pygridge/test}/test_group_lasso.py (100%) create mode 100644 pygridge/test/test_groupedfeatures.py rename {test => pygridge/test}/test_lambda_max_group_lasso.py (100%) rename {test => pygridge/test}/test_lambda_max_lasso.py (100%) rename {test => 
pygridge/test}/test_lambda_max_sparse_group_lasso.py (100%) rename {test => pygridge/test}/test_lasso.py (100%) rename {test => pygridge/test}/test_nnls.py (100%) create mode 100644 pygridge/test/test_seagull.py rename {test => pygridge/test}/test_sparse_group_lasso.py (100%) delete mode 100644 src/covariance_design.py delete mode 100644 src/groupedfeatures.py delete mode 100644 test/test_blockridge.py delete mode 100644 test/test_covariance.py delete mode 100644 test/test_groupfeatures.py delete mode 100644 test/test_seagull.py diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..2a4ccb1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,37 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '[BUG] ' +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Import module '...' +2. Create object '....' +3. Call method '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Code Example** +```python +# Minimal code example that reproduces the issue +``` + +**Environment (please complete the following information):** + - OS: [e.g. Ubuntu 20.04] + - Python version: [e.g. 3.8] + - PyGRidge version: [e.g. 0.1.0] + - Relevant package versions: + - numpy: [e.g. 1.21.0] + - scikit-learn: [e.g. 1.0.2] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..a81156a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '[FEATURE] ' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or code examples about the feature request here. diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..c0212b0 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,21 @@ +version: 2 +updates: + # Maintain dependencies for GitHub Actions as recommended in SPEC8: + # https://github.com/scientific-python/specs/pull/325 + # At the time of writing, release critical workflows such as + # pypa/gh-action-pypi-publish should use hash-based versioning for security + # reasons. This strategy may be generalized to all other github actions + # in the future. + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + groups: + actions: + patterns: + - "*" + labels: + - "Build / CI" + - "dependencies" + reviewers: + - "scikit-learn/core-devs" \ No newline at end of file diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..f6ff8da --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,22 @@ +## Description +Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. 
+ +Fixes # (issue) + +## Type of change +Please delete options that are not relevant. + +- [ ] Bug fix (non-breaking change which fixes an issue) +- [ ] New feature (non-breaking change which adds functionality) +- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) +- [ ] Documentation update + +## Checklist: +- [ ] My code follows the style guidelines of this project +- [ ] I have performed a self-review of my own code +- [ ] I have commented my code, particularly in hard-to-understand areas +- [ ] I have made corresponding changes to the documentation +- [ ] My changes generate no new warnings +- [ ] I have added tests that prove my fix is effective or that my feature works +- [ ] New and existing unit tests pass locally with my changes +- [ ] Any dependent changes have been merged and published in downstream modules diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..60eae8d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest pytest-cov + pip install -e . + + - name: Run tests + run: | + pytest --cov=pygridge --cov-report=xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + fail_ci_if_error: true diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..b3d3e48 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,29 @@ +name: Publish to PyPI + +on: + release: + types: [created] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build twine + + - name: Build and publish + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: | + python -m build + twine upload dist/* diff --git a/__init__.py b/pygridge/__init__.py similarity index 100% rename from __init__.py rename to pygridge/__init__.py diff --git a/src/__init__.py b/pygridge/src/__init__.py similarity index 100% rename from src/__init__.py rename to pygridge/src/__init__.py diff --git a/src/bisection.py b/pygridge/src/bisection.py similarity index 100% rename from src/bisection.py rename to pygridge/src/bisection.py diff --git a/src/blockridge.py b/pygridge/src/blockridge.py similarity index 73% rename from src/blockridge.py rename to pygridge/src/blockridge.py index 1356c6c..b91554f 100644 --- a/src/blockridge.py +++ b/pygridge/src/blockridge.py @@ -15,21 +15,25 @@ class RidgeRegressionError(Exception): """Base exception class for Ridge regression errors.""" + pass class InvalidDimensionsError(RidgeRegressionError): """Exception raised when matrix dimensions are incompatible.""" + pass class SingularMatrixError(RidgeRegressionError): """Exception raised when a matrix is singular or nearly singular.""" + pass class 
NumericalInstabilityError(RidgeRegressionError): """Exception raised when numerical instability is detected.""" + pass @@ -163,7 +167,15 @@ def __init__(self, X: np.ndarray): self.n_samples_, self.n_features_ = X.shape self.gram_matrix_ = np.dot(X.T, X) / self.n_samples_ - self.gram_reg_ = self.gram_matrix_ + np.eye(self.n_features_) + + # Check if gram matrix is nearly singular + cond = np.linalg.cond(self.gram_matrix_) + if cond > 1e15: + raise SingularMatrixError( + f"Gram matrix is nearly singular with condition number: {cond}" + ) + + self.gram_reg_ = self.gram_matrix_.copy() self._update_cholesky() self.lower_ = True @@ -179,25 +191,20 @@ def _update_cholesky(self): self.gram_reg_chol_ = np.linalg.cholesky(self.gram_reg_) except np.linalg.LinAlgError: raise SingularMatrixError( - "Failed to compute Cholesky decomposition. Matrix may not be positive " - "definite." + "Failed to compute Cholesky decomposition. Matrix may not be positive definite." ) def set_params(self, groups: GroupedFeatures, alpha: np.ndarray): - r"""Update the regularization parameters and recompute the Cholesky decomposition. + diag = groups.group_expand(alpha) + if not isinstance(diag, np.ndarray): + diag = np.array(diag) - Parameters - ---------- - groups : GroupedFeatures - The grouped features object. - alpha : np.ndarray - The new :math:`\alpha` values for each group. + # Ensure diag has the correct shape + if len(diag) != self.n_features_: + raise ValueError( + f"Alpha expansion length ({len(diag)}) must match the number of features ({self.n_features_})" + ) - Returns - ------- - None - """ - diag = groups.group_expand(alpha) self.gram_reg_ = self.gram_matrix_ + np.diag(diag) self._update_cholesky() @@ -300,6 +307,14 @@ def __init__(self, X: np.ndarray): self.n_samples_, self.n_features_ = X.shape self.X_ = X self.gram_matrix_ = X.T @ X + + # Check if gram matrix is nearly singular + cond = np.linalg.cond(self.gram_matrix_) + if cond > 1e15: + raise SingularMatrixError( + f"Gram matrix is nearly singular with condition number: {cond}" + ) + self.alpha_inv_ = np.eye(self.n_features_) self.U_ = X.T self.V_ = X @@ -327,6 +342,11 @@ def set_params(self, groups: GroupedFeatures, alpha: np.ndarray): raise ValueError("Alpha values must be positive.") diag = np.array(groups.group_expand(alpha)) + if len(diag) != self.n_features_: + raise ValueError( + f"Alpha expansion length ({len(diag)}) must match the number of features ({self.n_features_})" + ) + self.alpha_inv_ = np.diag(1 / diag) self._woodbury_update() @@ -445,6 +465,14 @@ def __init__(self, X: np.ndarray): self.n_samples_, self.n_features_ = X.shape self.X_ = X self.gram_matrix_ = self.X_.T @ X / self.n_samples_ + + # Check if gram matrix is nearly singular + cond = np.linalg.cond(self.gram_matrix_) + if cond > 1e15: + raise SingularMatrixError( + f"Gram matrix is nearly singular with condition number: {cond}" + ) + self.A_ = np.eye(self.n_features_) + self.gram_matrix_ self.A_inv_ = np.linalg.inv(self.A_) self.U_ = self.X_.T / np.sqrt(self.n_samples_) @@ -473,8 +501,19 @@ def set_params(self, groups: GroupedFeatures, alpha: np.ndarray): raise ValueError("Alpha values must be non-negative.") diag = groups.group_expand(alpha) + if len(diag) != self.n_features_: + raise ValueError( + f"Alpha expansion length ({len(diag)}) must match the number of features ({self.n_features_})" + ) + self.A_ = np.diag(diag) + self.gram_matrix_ try: + # Check condition number before inversion + cond = np.linalg.cond(self.A_) + if cond > 1e15: + raise SingularMatrixError( + 
f"Matrix is nearly singular with condition number: {cond}" + ) self.A_inv_ = np.linalg.inv(self.A_) except np.linalg.LinAlgError: raise SingularMatrixError("Failed to invert A. Matrix may be singular.") @@ -659,16 +698,26 @@ def fit(self, X: np.ndarray, y: np.ndarray): """ self._validate_params() X, y = check_X_y(X, y, accept_sparse=False) - self.y_ = y # Store y for later use + self.y_ = y self.n_features_in_ = X.shape[1] self.n_samples_, self.n_features_ = X.shape self.groups_ = self.groups + # Validate that groups match features + total_features = sum(self.groups.ps) + if total_features != self.n_features_: + raise ValueError( + f"Total features in groups ({total_features}) must match data features ({self.n_features_})" + ) + # Initialize predictor based on problem dimensions if self.n_features_ <= self.n_samples_: self.predictor_ = CholeskyRidgePredictor(X) - elif self.n_features_ > self.n_samples_ and self.n_features_ < 4 * self.n_samples_: + elif ( + self.n_features_ > self.n_samples_ + and self.n_features_ < 4 * self.n_samples_ + ): self.predictor_ = WoodburyRidgePredictor(X) else: self.predictor_ = ShermanMorrisonRidgePredictor(X) @@ -691,9 +740,7 @@ def fit(self, X: np.ndarray, y: np.ndarray): self.gram_reg_inv_ = self.predictor_._solve_system(np.eye(self.n_features_)) # Compute leverage scores - self.gram_reg_inv_X_ = ( - self.predictor_._solve_system(X.T).T / self.n_samples_ - ) + self.gram_reg_inv_X_ = self.predictor_._solve_system(X.T).T / self.n_samples_ self.leverage_ = np.sum(X * self.gram_reg_inv_X_, axis=1) return self @@ -778,7 +825,7 @@ def get_mse(self, X_test: np.ndarray, y_test: np.ndarray) -> float: return np.mean((y_test - self.predict(X_test)) ** 2) -def lambda_lolas_rule(rdg: GroupRidgeRegressor, multiplier: float = 0.1) -> float: +def lambda_lolas_rule(estimator: GroupRidgeRegressor, multiplier: float = 0.1) -> float: r"""Compute the regularization parameter using the Panagiotis Lolas rule. The Lolas rule provides a heuristic for selecting the regularization parameter @@ -791,10 +838,10 @@ def lambda_lolas_rule(rdg: GroupRidgeRegressor, multiplier: float = 0.1) -> floa Parameters ---------- - rdg : GroupRidgeRegressor - The Ridge regression estimator. + estimator : GroupRidgeRegressor + The fitted group ridge regression estimator. multiplier : float, default=0.1 - A scalar multiplier for the rule. + Scaling factor for the regularization parameter. Returns ------- @@ -804,58 +851,90 @@ def lambda_lolas_rule(rdg: GroupRidgeRegressor, multiplier: float = 0.1) -> floa Raises ------ ValueError - If multiplier is not positive or if trace(X^T X) is zero. + If multiplier is not positive or if the estimator is not fitted. + If trace(X^T X) is zero. """ + if not hasattr(estimator, "predictor_"): + raise ValueError("Estimator must be fitted before calling lambda_lolas_rule.") + + if not isinstance(multiplier, (int, float)): + raise TypeError("multiplier must be a number.") if multiplier <= 0: - raise ValueError("Multiplier must be positive.") + raise ValueError("multiplier must be positive.") - trace_gram = rdg.predictor_._trace_gram_matrix() - if trace_gram == 0: + trace_gram = estimator.predictor_._trace_gram_matrix() + if np.isclose(trace_gram, 0): raise ValueError("Trace of X^T X is zero, leading to division by zero.") - return multiplier * rdg.n_features_**2 / rdg.n_samples_ / trace_gram + return multiplier * estimator.n_features_**2 / estimator.n_samples_ / trace_gram class MomentTunerSetup: - r"""Setup for the moment-based tuning of regularization parameters. 
+ """Setup for moment-based tuning of regularization parameters. + + This class prepares the necessary statistics for tuning regularization parameters + using moment-based methods. It encapsulates the computation of coefficient norms, + gram matrix norms, and moment matrices needed for parameter selection. - This class prepares and computes moment-based statistics required for tuning - the regularization parameters (:math:`\lambda`) in Ridge regression. By leveraging - moments of the coefficients and the design matrix, it facilitates principled - selection of :math:`\lambda` values that balance bias and variance. + Parameters + ---------- + estimator : GroupRidgeRegressor + The fitted group ridge regression estimator. Attributes ---------- - groups : GroupedFeatures + groups_ : GroupedFeatures The grouped features object. - ps : np.ndarray - Array of the number of features in each group. - n : int - Number of samples. - beta_norms_squared : np.ndarray - Squared norms of coefficients for each group. - N_norms_squared : np.ndarray - Squared norms of the :math:`N` matrix for each group. - M_squared : np.ndarray - :math:`M^2` matrix computed as (:math:`p_s \times p_s^T) / n^2`. + n_features_per_group_ : ndarray of shape (n_groups,) + Number of features in each group. + n_samples_ : int + Number of samples in the training data. + coef_norms_squared_ : ndarray of shape (n_groups,) + Squared L2 norms of coefficients for each group. + gram_inv_norms_squared_ : ndarray of shape (n_groups,) + Squared Frobenius norms of gram matrix inverse blocks for each group. + moment_matrix_ : ndarray of shape (n_groups, n_groups) + Matrix of scaled outer products of group sizes. + + Raises + ------ + ValueError + If the estimator is not fitted or has incompatible dimensions. """ - def __init__(self, rdg: GroupRidgeRegressor): - self.groups_ = rdg.groups_ - self.n_features_per_group_ = np.array(rdg.groups_.ps) - self.n_samples_ = rdg.n_samples_ + def __init__(self, estimator: GroupRidgeRegressor): + if not hasattr(estimator, "predictor_"): + raise ValueError( + "Estimator must be fitted before MomentTunerSetup initialization." 
+ ) + + self.groups_ = estimator.groups_ + self.n_features_per_group_ = np.array(estimator.groups_.ps) + self.n_samples_ = estimator.n_samples_ + + # Compute coefficient norms per group self.coef_norms_squared_ = np.array( - rdg.groups_.group_summary(rdg.coef_, lambda x: np.sum(np.abs(x) ** 2)) + estimator.groups_.group_summary( + estimator.coef_, lambda x: np.sum(np.abs(x) ** 2) + ) ) - gram_inv_matrix = rdg.gram_reg_inv_ # Use the (p, p) inverse matrix - if gram_inv_matrix.shape[1] != self.n_features_per_group_.sum(): + + # Validate and compute gram matrix inverse norms + gram_inv_matrix = estimator.gram_reg_inv_ + total_features = self.n_features_per_group_.sum() + if gram_inv_matrix.shape[1] != total_features: raise ValueError( - f"Length of gram_inv_matrix ({gram_inv_matrix.shape[1]}) does not match " - f"number of features ({self.n_features_per_group_.sum()})" + f"Gram inverse matrix dimension ({gram_inv_matrix.shape[1]}) " + f"does not match total features ({total_features})" ) + self.gram_inv_norms_squared_ = np.array( - rdg.groups_.group_summary(gram_inv_matrix, lambda x: np.sum(np.abs(x) ** 2)) + estimator.groups_.group_summary( + gram_inv_matrix, lambda x: np.sum(np.abs(x) ** 2) + ) ) + + # Compute moment matrix self.moment_matrix_ = ( np.outer(self.n_features_per_group_, self.n_features_per_group_) / self.n_samples_**2 @@ -863,7 +942,11 @@ def __init__(self, rdg: GroupRidgeRegressor): def sigma_squared_path( - rdg: GroupRidgeRegressor, mom: MomentTunerSetup, sigma_s_squared: np.ndarray + estimator: GroupRidgeRegressor, + moment_tuner: MomentTunerSetup, + sigma_squared_values: np.ndarray, + max_iter: int = 100, + tol: float = 1e-4, ): r"""Compute the regularization path for different values of :math:`\sigma^2`. @@ -889,51 +972,75 @@ def sigma_squared_path( Parameters ---------- - rdg : GroupRidgeRegressor - The Ridge regression estimator. - mom : MomentTunerSetup + estimator : GroupRidgeRegressor + The group ridge regression estimator. + moment_tuner : MomentTunerSetup The moment tuner setup containing necessary statistics. - sigma_s_squared : np.ndarray of shape (n_sigmas,) - An array of :math:`\sigma^2` values to evaluate. + sigma_squared_values : ndarray of shape (n_values,) + Array of noise variance values to evaluate. + max_iter : int, default=100 + Maximum number of iterations for optimization. + tol : float, default=1e-4 + Tolerance for optimization convergence. Returns ------- dict - A dictionary containing the regularization path information: - - 'alphas' : np.ndarray of shape (n_sigmas, n_groups) - Array of :math:`\alpha` values for each :math:`\sigma^2`. - - 'errors' : np.ndarray of shape (n_sigmas,) - Array of LOO errors corresponding to each :math:`\sigma^2`. - - 'coefs' : np.ndarray of shape (n_sigmas, n_features) - Array of coefficient vectors corresponding to each :math:`\sigma^2`. + Dictionary containing: + - 'alphas': ndarray of shape (n_values, n_groups) + Regularization parameters for each sigma squared value. + - 'errors': ndarray of shape (n_values,) + Leave-one-out errors for each sigma squared value. + - 'coefs': ndarray of shape (n_values, n_features) + Model coefficients for each sigma squared value. Raises ------ ValueError - If sigma_s_squared contains negative values. + If sigma_squared_values contains negative values or + if max_iter or tol have invalid values. 
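+ + Examples + -------- + A hypothetical sketch of the intended workflow; ``model`` is assumed to be an already fitted ``GroupRidgeRegressor`` and is not defined in this module: + + >>> tuner = MomentTunerSetup(model) # doctest: +SKIP + >>> path = sigma_squared_path(model, tuner, np.array([0.1, 1.0, 10.0])) # doctest: +SKIP + >>> sorted(path.keys()) # doctest: +SKIP + ['alphas', 'coefs', 'errors']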
""" - if np.any(sigma_s_squared < 0): - raise ValueError("sigma_s_squared values must be non-negative.") + if not isinstance(sigma_squared_values, np.ndarray): + sigma_squared_values = np.asarray(sigma_squared_values) - n_sigmas = len(sigma_s_squared) - n_groups = rdg.get_n_groups() - errors = np.zeros(n_sigmas) - alphas = np.zeros((n_sigmas, n_groups)) - coefs = np.zeros((n_sigmas, rdg.groups_.p)) + if np.any(sigma_squared_values < 0): + raise ValueError("sigma_squared_values must be non-negative") - for i, sigma_sq in enumerate(sigma_s_squared): + if not isinstance(max_iter, int) or max_iter <= 0: + raise ValueError("max_iter must be a positive integer") + + if not isinstance(tol, float) or tol <= 0: + raise ValueError("tol must be a positive float") + + n_values = len(sigma_squared_values) + n_groups = len(moment_tuner.n_features_per_group_) + n_features = moment_tuner.n_features_per_group_.sum() + + # Initialize result arrays + alphas = np.zeros((n_values, n_groups)) + errors = np.zeros(n_values) + coefs = np.zeros((n_values, n_features)) + + for i, sigma_sq in enumerate(sigma_squared_values): try: - alpha_tmp = get_lambdas(mom, sigma_sq) - alphas[i, :] = alpha_tmp - errors[i] = rdg.fit(alpha_tmp) - coefs[i, :] = rdg.coef_ - except RidgeRegressionError as e: - print(f"Error at :math:`\sigma^2 = {sigma_sq}`: {str(e)}") + # Compute optimal regularization parameters + alpha_values = get_lambdas(moment_tuner, sigma_sq) + alphas[i] = alpha_values + + # Fit model and store results + estimator.set_params(alpha=alpha_values) + estimator.fit(estimator.X_, estimator.y_) + errors[i] = estimator.get_loo_error() + coefs[i] = estimator.coef_ + + except Exception as e: + warnings.warn(f"Error at sigma_squared={sigma_sq}: {str(e)}") + continue return {"alphas": alphas, "errors": errors, "coefs": coefs} -def get_lambdas(mom: MomentTunerSetup, sigma_sq: float) -> np.ndarray: +def get_lambdas(moment_tuner: MomentTunerSetup, sigma_squared: float) -> np.ndarray: r"""Compute :math:`\alpha` values for a given :math:`\sigma^2`. This function calculates the regularization parameters (:math:`\alpha`) for each @@ -954,45 +1061,47 @@ def get_lambdas(mom: MomentTunerSetup, sigma_sq: float) -> np.ndarray: Parameters ---------- - mom : MomentTunerSetup + moment_tuner : MomentTunerSetup The moment tuner setup containing necessary statistics. - sigma_sq : float - The :math:`\sigma^2` value for which to compute :math:`\alpha`. + sigma_squared : float + The noise variance value. Returns ------- - np.ndarray of shape (n_groups,) - The computed :math:`\alpha` values for each feature group. + ndarray of shape (n_groups,) + Optimal regularization parameters for each group. Raises ------ ValueError - If sigma_sq is negative. - NumericalInstabilityError - If division by zero occurs during alpha computation. + If sigma_squared is negative. 
""" - if sigma_sq < 0: - raise ValueError("sigma_sq must be non-negative.") + if not isinstance(sigma_squared, (int, float)): + raise TypeError("sigma_squared must be a number") + if sigma_squared < 0: + raise ValueError("sigma_squared must be non-negative") - alpha_squared = get_alpha_s_squared(mom, sigma_sq) - group_ratios = np.array(mom.n_features_per_group_) / mom.n_samples_ + # Compute alpha squared values + alpha_squared = get_alpha_s_squared(moment_tuner, sigma_squared) + group_ratios = moment_tuner.n_features_per_group_ / moment_tuner.n_samples_ + # Handle numerical stability LARGE_VALUE = 1e12 - with np.errstate(divide="ignore", invalid="ignore"): - alphas = sigma_sq * group_ratios / alpha_squared - zero_alpha = alpha_squared == 0 - if np.any(zero_alpha): + lambdas = sigma_squared * group_ratios / alpha_squared + zero_mask = np.isclose(alpha_squared, 0) + if np.any(zero_mask): warnings.warn( - f"alpha_squared has zero values for groups: {np.where(zero_alpha)[0]}. " - "Assigning large alpha values to these groups." + f"Assigning large regularization values to groups: {np.where(zero_mask)[0]}" ) - alphas = np.where(zero_alpha, LARGE_VALUE, alphas) + lambdas = np.where(zero_mask, LARGE_VALUE, lambdas) - return alphas + return lambdas -def get_alpha_s_squared(mom: MomentTunerSetup, sigma_sq: float) -> np.ndarray: +def get_alpha_s_squared( + moment_tuner: MomentTunerSetup, sigma_squared: float +) -> np.ndarray: r"""Compute :math:`\alpha^2` values for a given :math:`\sigma^2` using Non-Negative Least Squares (NNLS). @@ -1015,38 +1124,45 @@ def get_alpha_s_squared(mom: MomentTunerSetup, sigma_sq: float) -> np.ndarray: Parameters ---------- - mom : MomentTunerSetup + moment_tuner : MomentTunerSetup The moment tuner setup containing necessary statistics. - sigma_sq : float - The :math:`\sigma^2` value for which to compute :math:`\alpha^2`. + sigma_squared : float + The noise variance value. Returns ------- - np.ndarray of shape (n_groups,) - The computed :math:`\alpha^2` values for each feature group. + ndarray of shape (n_groups,) + Squared alpha values for each group. Raises ------ ValueError - If sigma_sq is negative or if any n_features_per_group is zero. + If sigma_squared is negative or if group sizes are invalid. NNLSError - If the NNLS algorithm fails to converge. + If the non-negative least squares optimization fails. 
""" - if sigma_sq < 0: - raise ValueError("sigma_sq must be non-negative.") - if np.any(mom.n_features_per_group_ == 0): - raise ValueError("All n_features_per_group values must be non-zero.") + if not isinstance(sigma_squared, (int, float)): + raise TypeError("sigma_squared must be a number") + if sigma_squared < 0: + raise ValueError("sigma_squared must be non-negative") + + if np.any(moment_tuner.n_features_per_group_ <= 0): + raise ValueError("All group sizes must be positive") - # Compute the right-hand side - rhs = mom.coef_norms_squared_ - sigma_sq * mom.gram_inv_norms_squared_ - rhs = np.maximum(rhs, 0) + # Compute right-hand side of the system + rhs = ( + moment_tuner.coef_norms_squared_ + - sigma_squared * moment_tuner.gram_inv_norms_squared_ + ) + rhs = np.maximum(rhs, 0) # Ensure non-negativity - # Solve the NNLS problem: moment_matrix * alpha_per_group ≈ rhs try: - alpha_per_group = nonneg_lsq(mom.moment_matrix_, rhs, alg="fnnls") + # Solve non-negative least squares problem + alpha_per_group = nonneg_lsq(moment_tuner.moment_matrix_, rhs, alg="fnnls") except NNLSError as e: - raise NNLSError(f"Failed to compute alpha_squared: {str(e)}") + raise NNLSError(f"Non-negative least squares optimization failed: {str(e)}") - alpha_squared = alpha_per_group * mom.n_features_per_group_ + # Scale by group sizes + alpha_squared = alpha_per_group * moment_tuner.n_features_per_group_ - return alpha_squared \ No newline at end of file + return alpha_squared diff --git a/pygridge/src/covariance_design.py b/pygridge/src/covariance_design.py new file mode 100644 index 0000000..4093ced --- /dev/null +++ b/pygridge/src/covariance_design.py @@ -0,0 +1,906 @@ +"""Covariance matrix designs for various statistical models.""" + +import numpy as np +from abc import ABC, abstractmethod +from typing import List, Callable, Union +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted + +from groupedfeatures import GroupedFeatures, fill + + +class DiscreteNonParametric: + """Represents a discrete non-parametric spectrum. + + Parameters + ---------- + eigs : list of float + A list of eigenvalues. + probs : list of float + A list of probabilities corresponding to each eigenvalue. + + Attributes + ---------- + eigs : ndarray of shape (n_eigenvalues,) + Numpy array of eigenvalues. + probs : ndarray of shape (n_eigenvalues,) + Numpy array of probabilities for each eigenvalue. 
+ + Examples + -------- + >>> from pygridge.src.covariance_design import DiscreteNonParametric + >>> spectrum = DiscreteNonParametric(eigs=[1.0, 2.0, 3.0], probs=[0.2, 0.3, 0.5]) + >>> spectrum.eigs + array([1., 2., 3.]) + >>> spectrum.probs + array([0.2, 0.3, 0.5]) + """ + + def __init__(self, eigs: List[float], probs: List[float]): + if not isinstance(eigs, list): + raise TypeError( + f"'eigs' must be a list of floats, got {type(eigs).__name__}" + ) + if not all(isinstance(e, (int, float)) for e in eigs): + raise ValueError("All elements in 'eigs' must be integers or floats.") + if not isinstance(probs, list): + raise TypeError( + f"'probs' must be a list of floats, got {type(probs).__name__}" + ) + if not all(isinstance(p, (int, float)) for p in probs): + raise ValueError("All elements in 'probs' must be integers or floats.") + if len(eigs) != len(probs): + raise ValueError("'eigs' and 'probs' must be of the same length.") + if not np.isclose(sum(probs), 1.0): + raise ValueError("The probabilities in 'probs' must sum to 1.") + if any(p < 0 for p in probs): + raise ValueError("Probabilities in 'probs' must be non-negative.") + + self.eigs = np.array(eigs) + self.probs = np.array(probs) + + +class CovarianceDesign(BaseEstimator, TransformerMixin): + """Base class for covariance matrix designs. + + This class defines the interface for all covariance designs and implements + the scikit-learn estimator interface. + + Parameters + ---------- + store_precision : bool, default=True + Whether to store the precision matrix. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + precision_ : ndarray of shape (n_features, n_features) + Estimated precision matrix (inverse of covariance). + Only stored if store_precision is True. + n_features_in_ : int + Number of features seen during fit. + + See Also + -------- + AR1Design : AutoRegressive model of order 1. + IdentityCovarianceDesign : Identity covariance matrix. + UniformScalingCovarianceDesign : Uniform scaling on diagonal. + ExponentialOrderStatsCovarianceDesign : Exponential order statistics. + BlockCovarianceDesign : Block diagonal covariance matrix. + """ + + _parameter_constraints = { + "store_precision": ["boolean"], + } + + def __init__(self, *, store_precision=True): + self.store_precision = store_precision + + @abstractmethod + def get_Sigma(self) -> np.ndarray: + """Construct and return the covariance matrix. + + Returns + ------- + Sigma : ndarray of shape (n_features, n_features) + The covariance matrix. + """ + pass + + @abstractmethod + def nfeatures(self) -> int: + """Get the number of features. + + Returns + ------- + n_features : int + Number of features. + """ + pass + + @abstractmethod + def spectrum(self) -> DiscreteNonParametric: + """Compute the spectral decomposition. + + Returns + ------- + spectrum : DiscreteNonParametric + Spectrum containing eigenvalues and probabilities. + """ + pass + + def get_precision(self): + """Get the precision matrix. + + Returns + ------- + precision : ndarray of shape (n_features, n_features) + The precision matrix. + """ + check_is_fitted(self, ["covariance_"]) + if self.store_precision and hasattr(self, "precision_"): + return self.precision_ + else: + return np.linalg.pinv(self.covariance_) + + def fit(self, X=None, y=None): + """Fit the covariance design. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features), default=None + Training data. Not used, present for API consistency. 
+ y : Ignored + Not used, present for API consistency. + + Returns + ------- + self : object + Returns the instance itself. + """ + if X is not None: + X = check_array(X) + self.n_features_in_ = X.shape[1] + + self.covariance_ = self.get_Sigma() + if self.store_precision: + self.precision_ = np.linalg.pinv(self.covariance_) + return self + + def transform(self, X): + """Transform data using the covariance design. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Returns + ------- + X_transformed : ndarray of shape (n_samples, n_features) + The transformed data. + """ + check_is_fitted(self, ["covariance_"]) + X = check_array(X) + return X @ self.covariance_ + + def score(self, X, y=None): + """Compute the log-likelihood of X under the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test data. + y : Ignored + Not used, present for API consistency. + + Returns + ------- + score : float + Log-likelihood of X under the model. + """ + check_is_fitted(self, ["covariance_"]) + X = check_array(X) + precision = self.get_precision() + n_samples, n_features = X.shape + # Sum of x_i^T precision x_i over samples, i.e. trace(X @ precision @ X.T) + log_likelihood = -np.sum((X @ precision) * X) + log_likelihood += n_samples * np.log(np.linalg.det(precision)) + log_likelihood -= n_samples * n_features * np.log(2 * np.pi) + log_likelihood /= 2.0 + return log_likelihood + + +class AR1Design(CovarianceDesign): + r"""Implements an AutoRegressive model of order 1 (AR(1)) for covariance matrix design. + + Parameters + ---------- + p : int, optional + Number of features. + rho : float, default=0.7 + AR(1) correlation coefficient. Must be in [0, 1). + store_precision : bool, default=True + Whether to store the precision matrix. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + AR(1) covariance matrix. + precision_ : ndarray of shape (n_features, n_features) + Precision matrix (inverse of covariance). + n_features_in_ : int + Number of features. + + Examples + -------- + >>> from pygridge.src.covariance_design import AR1Design + >>> design = AR1Design(p=3, rho=0.5) + >>> _ = design.fit() + >>> design.covariance_ + array([[1. , 0.5 , 0.25], + [0.5 , 1. , 0.5 ], + [0.25, 0.5 , 1. ]]) + """ + + _parameter_constraints = { + **CovarianceDesign._parameter_constraints, + "p": [int, None], + "rho": [float], + } + + def __init__(self, p=None, rho=0.7, *, store_precision=True): + super().__init__(store_precision=store_precision) + self.p = p + self.rho = rho + + if p is not None: + if not isinstance(p, int): + raise TypeError(f"'p' must be an integer, got {type(p).__name__}") + if p <= 0: + raise ValueError("'p' must be a positive integer.") + if not isinstance(rho, (int, float)): + raise TypeError(f"'rho' must be a float, got {type(rho).__name__}") + if not (0 <= rho < 1): + raise ValueError("'rho' must be in the interval [0, 1).") + + def get_Sigma(self): + """Construct the AR(1) covariance matrix. + + Returns + ------- + Sigma : ndarray of shape (n_features, n_features) + The AR(1) covariance matrix. + """ + if self.p is None: + raise ValueError("Number of features 'p' is not set.") + if self.p <= 0: + raise ValueError("'p' must be a positive integer.") + + indices = np.arange(self.p) + Sigma = self.rho ** np.abs(np.subtract.outer(indices, indices)) + return Sigma + + def nfeatures(self): + """Get the number of features. + + Returns + ------- + n_features : int + Number of features. 
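+ + Examples + -------- + A minimal check; no fit is required because ``p`` is fixed in the constructor: + + >>> from pygridge.src.covariance_design import AR1Design + >>> AR1Design(p=3, rho=0.5).nfeatures() + 3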
+ """ + if self.p is None: + raise ValueError("Number of features 'p' is not set.") + return self.p + + def spectrum(self): + """Compute the spectral decomposition. + + Returns + ------- + spectrum : DiscreteNonParametric + Spectrum containing eigenvalues and probabilities. + """ + Sigma = self.get_Sigma() + eigs = np.linalg.eigvals(Sigma) + probs = [1.0 / len(eigs)] * len(eigs) + return DiscreteNonParametric(eigs.tolist(), probs) + + +class DiagonalCovarianceDesign(CovarianceDesign): + """Base class for diagonal covariance matrix designs. + + Parameters + ---------- + p : int, optional + Number of features. + store_precision : bool, default=True + Whether to store the precision matrix. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Diagonal covariance matrix. + precision_ : ndarray of shape (n_features, n_features) + Precision matrix (inverse of covariance). + n_features_in_ : int + Number of features. + """ + + _parameter_constraints = { + **CovarianceDesign._parameter_constraints, + "p": [int, None], + } + + def __init__(self, p=None, *, store_precision=True): + super().__init__(store_precision=store_precision) + self.p = p + + if p is not None: + if not isinstance(p, int): + raise TypeError(f"'p' must be an integer, got {type(p).__name__}") + if p <= 0: + raise ValueError("'p' must be a positive integer.") + + def nfeatures(self): + """Get the number of features. + + Returns + ------- + n_features : int + Number of features. + """ + if self.p is None: + raise ValueError("Number of features 'p' is not set.") + return self.p + + +class IdentityCovarianceDesign(DiagonalCovarianceDesign): + """Identity covariance matrix design. + + Parameters + ---------- + p : int, optional + Number of features. + store_precision : bool, default=True + Whether to store the precision matrix. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Identity matrix. + precision_ : ndarray of shape (n_features, n_features) + Identity matrix (same as covariance). + n_features_in_ : int + Number of features. + + Examples + -------- + >>> from PyGRidge.src.covariance_design import IdentityCovarianceDesign + >>> design = IdentityCovarianceDesign(p=3) + >>> design.fit() + >>> design.covariance_ + array([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + """ + + def get_Sigma(self): + """Construct the identity covariance matrix. + + Returns + ------- + Sigma : ndarray of shape (n_features, n_features) + The identity matrix. + """ + if self.p is None: + raise ValueError("Number of features 'p' is not set.") + return np.identity(self.p) + + def spectrum(self): + """Compute the spectral decomposition. + + Returns + ------- + spectrum : DiscreteNonParametric + Spectrum containing eigenvalues (all 1) and equal probabilities. + """ + if self.p is None: + raise ValueError("Number of features 'p' is not set.") + eigs = [1.0] * self.p + probs = [1.0 / self.p] * self.p + return DiscreteNonParametric(eigs, probs) + + +class UniformScalingCovarianceDesign(DiagonalCovarianceDesign): + """Uniform scaling covariance matrix design. + + Parameters + ---------- + scaling : float, default=1.0 + Scaling factor for diagonal entries. + p : int, optional + Number of features. + store_precision : bool, default=True + Whether to store the precision matrix. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Scaled identity matrix. + precision_ : ndarray of shape (n_features, n_features) + Inverse of scaled identity matrix. 
+ n_features_in_ : int + Number of features. + + Examples + -------- + >>> from pygridge.src.covariance_design import UniformScalingCovarianceDesign + >>> design = UniformScalingCovarianceDesign(scaling=2.0, p=3) + >>> _ = design.fit() + >>> design.covariance_ + array([[2., 0., 0.], + [0., 2., 0.], + [0., 0., 2.]]) + """ + + _parameter_constraints = { + **DiagonalCovarianceDesign._parameter_constraints, + "scaling": [float], + } + + def __init__(self, scaling=1.0, p=None, *, store_precision=True): + super().__init__(p=p, store_precision=store_precision) + self.scaling = scaling + + if not isinstance(scaling, (int, float)): + raise TypeError(f"'scaling' must be a float, got {type(scaling).__name__}") + if scaling <= 0: + raise ValueError("'scaling' must be a positive number.") + + def get_Sigma(self): + """Construct the scaled identity covariance matrix. + + Returns + ------- + Sigma : ndarray of shape (n_features, n_features) + The scaled identity matrix. + """ + if self.p is None: + raise ValueError("Number of features 'p' is not set.") + return self.scaling * np.identity(self.p) + + def spectrum(self): + """Compute the spectral decomposition. + + Returns + ------- + spectrum : DiscreteNonParametric + Spectrum containing eigenvalues (all equal to scaling) and equal probabilities. + """ + if self.p is None: + raise ValueError("Number of features 'p' is not set.") + eigs = [self.scaling] * self.p + probs = [1.0 / self.p] * self.p + return DiscreteNonParametric(eigs, probs) + + +class ExponentialOrderStatsCovarianceDesign(DiagonalCovarianceDesign): + """Exponential order statistics covariance matrix design. + + Parameters + ---------- + p : int, optional + Number of features. + rate : float, default=1.0 + Rate parameter for exponential distribution. + store_precision : bool, default=True + Whether to store the precision matrix. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Diagonal matrix with exponential order statistics. + precision_ : ndarray of shape (n_features, n_features) + Inverse of covariance matrix. + n_features_in_ : int + Number of features. + + Examples + -------- + >>> from pygridge.src.covariance_design import ExponentialOrderStatsCovarianceDesign + >>> design = ExponentialOrderStatsCovarianceDesign(p=3, rate=1.0) + >>> _ = design.fit() + >>> design.covariance_ # doctest: +SKIP + array([[2.19..., 0. , 0. ], + [0. , 1.09..., 0. ], + [0. , 0. , 0.40...]]) + """ + + _parameter_constraints = { + **DiagonalCovarianceDesign._parameter_constraints, + "rate": [float], + } + + def __init__(self, p=None, rate=1.0, *, store_precision=True): + super().__init__(p=p, store_precision=store_precision) + self.rate = rate + + if not isinstance(rate, (int, float)): + raise TypeError(f"'rate' must be a float, got {type(rate).__name__}") + if rate <= 0: + raise ValueError("'rate' must be a positive number.") + + def spectrum(self): + """Compute the spectral decomposition. + + Returns + ------- + spectrum : DiscreteNonParametric + Spectrum containing exponential order statistics eigenvalues and equal probabilities. + """ + if self.p is None: + raise ValueError("Number of features 'p' is not set.") + + p = self.p + rate = self.rate + + tmp = np.linspace(1 / (2 * p), 1 - 1 / (2 * p), p) + eigs = (1 / rate) * np.log(1 / tmp) + probs = [1.0 / p] * p + + return DiscreteNonParametric(eigs.tolist(), probs) + + def get_Sigma(self): + """Construct the diagonal covariance matrix. 
+ + Returns + ------- + Sigma : ndarray of shape (n_features, n_features) + Diagonal matrix with exponential order statistics. + """ + spectrum = self.spectrum() + return np.diag(spectrum.eigs) + + +class BlockDiagonal: + """Block diagonal matrix. + + Parameters + ---------- + blocks : list of ndarray + List of square matrices to place on diagonal. + + Attributes + ---------- + blocks : list of ndarray + List of block matrices. + + Examples + -------- + >>> import numpy as np + >>> from pygridge.src.covariance_design import BlockDiagonal + >>> A = np.array([[1, 0], [0, 1]]) + >>> B = np.array([[2, 0], [0, 2]]) + >>> block_diag = BlockDiagonal([A, B]) + >>> block_diag.get_Sigma() + array([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 2, 0], + [0, 0, 0, 2]]) + """ + + def __init__(self, blocks): + if not isinstance(blocks, list): + raise TypeError( + f"'blocks' must be a list of numpy arrays, got {type(blocks).__name__}" + ) + if not all(isinstance(block, np.ndarray) for block in blocks): + raise TypeError("All blocks must be numpy.ndarray instances.") + if not all(block.ndim == 2 for block in blocks): + raise ValueError("All blocks must be 2-dimensional arrays.") + if not all(block.shape[0] == block.shape[1] for block in blocks): + raise ValueError("All blocks must be square matrices.") + + self.blocks = blocks + + def get_Sigma(self): + """Construct the block diagonal matrix. + + Returns + ------- + Sigma : ndarray + The block diagonal matrix. + """ + return block_diag(*self.blocks) + + +def block_diag(*arrs): + """Construct a block diagonal matrix. + + Parameters + ---------- + *arrs : ndarray + Square arrays to place on diagonal. + + Returns + ------- + out : ndarray + The block diagonal matrix. + + Examples + -------- + >>> import numpy as np + >>> from pygridge.src.covariance_design import block_diag + >>> A = np.array([[1, 0], [0, 1]]) + >>> B = np.array([[2, 0], [0, 2]]) + >>> block_diag(A, B) + array([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 2, 0], + [0, 0, 0, 2]]) + """ + if not all(isinstance(a, np.ndarray) for a in arrs): + raise TypeError("All arguments must be numpy.ndarray instances.") + if not all(a.ndim == 2 for a in arrs): + raise ValueError("All input arrays must be 2-dimensional.") + if not all(a.shape[0] == a.shape[1] for a in arrs): + raise ValueError("All input arrays must be square matrices.") + + if len(arrs) == 0: + return np.array([[]]) + + shapes = np.array([a.shape for a in arrs]) + out_shape = np.sum(shapes, axis=0) + out = np.zeros(out_shape, dtype=arrs[0].dtype) + r, c = 0, 0 + for a in arrs: + out[r:r + a.shape[0], c:c + a.shape[1]] = a + r += a.shape[0] + c += a.shape[1] + return out + + +class MixtureModel: + """Mixture model of multiple spectra. + + Parameters + ---------- + spectra : list of DiscreteNonParametric + List of component spectra. + mixing_prop : list of float + Mixing proportions for each spectrum. + + Attributes + ---------- + spectra : list of DiscreteNonParametric + Component spectra. + mixing_prop : list of float + Mixing proportions. 
+ + Examples + -------- + >>> from pygridge.src.covariance_design import DiscreteNonParametric, MixtureModel + >>> s1 = DiscreteNonParametric([1, 2], [0.5, 0.5]) + >>> s2 = DiscreteNonParametric([3, 4], [0.3, 0.7]) + >>> mix = MixtureModel([s1, s2], [0.6, 0.4]) + """ + + def __init__(self, spectra, mixing_prop): + if not isinstance(spectra, list): + raise TypeError( + "'spectra' must be a list of DiscreteNonParametric instances" + ) + if not all(isinstance(s, DiscreteNonParametric) for s in spectra): + raise TypeError( + "All elements in 'spectra' must be DiscreteNonParametric instances" + ) + if not isinstance(mixing_prop, list): + raise TypeError("'mixing_prop' must be a list of floats") + if not all(isinstance(p, (int, float)) for p in mixing_prop): + raise ValueError("All elements in 'mixing_prop' must be numbers") + if len(spectra) != len(mixing_prop): + raise ValueError("'spectra' and 'mixing_prop' must have same length") + if not np.isclose(sum(mixing_prop), 1.0): + raise ValueError("'mixing_prop' must sum to 1") + if any(p < 0 for p in mixing_prop): + raise ValueError("'mixing_prop' must be non-negative") + + self.spectra = spectra + self.mixing_prop = mixing_prop + + +class BlockCovarianceDesign(CovarianceDesign): + """Block diagonal covariance matrix design. + + Parameters + ---------- + blocks : list of CovarianceDesign + List of covariance designs for each block. + groups : GroupedFeatures, optional + Feature groupings. + store_precision : bool, default=True + Whether to store the precision matrix. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Block diagonal covariance matrix. + precision_ : ndarray of shape (n_features, n_features) + Precision matrix (inverse of covariance). + n_features_in_ : int + Total number of features. + + Examples + -------- + >>> from pygridge.src.covariance_design import BlockCovarianceDesign + >>> from pygridge.src.covariance_design import IdentityCovarianceDesign + >>> block1 = IdentityCovarianceDesign(p=2) + >>> block2 = IdentityCovarianceDesign(p=3) + >>> design = BlockCovarianceDesign([block1, block2]) + >>> _ = design.fit() + >>> design.covariance_ + array([[1., 0., 0., 0., 0.], + [0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.], + [0., 0., 0., 1., 0.], + [0., 0., 0., 0., 1.]]) + """ + + _parameter_constraints = { + **CovarianceDesign._parameter_constraints, + "blocks": [list], + "groups": [GroupedFeatures, None], + } + + def __init__(self, blocks, groups=None, *, store_precision=True): + super().__init__(store_precision=store_precision) + self.blocks = blocks + self.groups = groups + + if not isinstance(blocks, list): + raise TypeError("'blocks' must be a list of CovarianceDesign instances") + if not all(isinstance(block, CovarianceDesign) for block in blocks): + raise TypeError("All blocks must be CovarianceDesign instances") + if groups is not None: + if not isinstance(groups, GroupedFeatures): + raise TypeError("'groups' must be a GroupedFeatures instance") + if len(groups.ps) != len(blocks): + raise ValueError("Number of groups must match number of blocks") + + def get_Sigma(self): + """Construct the block diagonal covariance matrix. + + Returns + ------- + Sigma : ndarray of shape (n_features, n_features) + The block diagonal covariance matrix. + """ + block_matrices = [block.get_Sigma() for block in self.blocks] + return block_diag(*block_matrices) + + def nfeatures(self): + """Get the total number of features. + + Returns + ------- + n_features : int + Total number of features across all blocks. 
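+ + Examples + -------- + Block sizes are summed across all component designs: + + >>> from pygridge.src.covariance_design import BlockCovarianceDesign, IdentityCovarianceDesign + >>> design = BlockCovarianceDesign([IdentityCovarianceDesign(p=2), IdentityCovarianceDesign(p=3)]) + >>> design.nfeatures() + 5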
+ """ + return sum(block.nfeatures() for block in self.blocks) + + def spectrum(self): + """Compute the spectral decomposition. + + Returns + ------- + spectrum : MixtureModel + Mixture model of block spectra. + """ + spectra = [block.spectrum() for block in self.blocks] + + if self.groups is not None: + total_p = self.groups.p + mixing_prop = [ps / total_p for ps in self.groups.ps] + else: + mixing_prop = [1.0 / len(spectra)] * len(spectra) + + return MixtureModel(spectra, mixing_prop) + + +def simulate_rotated_design(cov, n, rotated_measure=None): + """Simulate a rotated design matrix. + + Parameters + ---------- + cov : CovarianceDesign + Covariance design to use. + n : int + Number of samples. + rotated_measure : callable, optional + Random number generator. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + Simulated design matrix. + + Examples + -------- + >>> from PyGRidge.src.covariance_design import AR1Design, simulate_rotated_design + >>> design = AR1Design(p=3, rho=0.5) + >>> X = simulate_rotated_design(design, n=100) + """ + if not isinstance(cov, CovarianceDesign): + raise TypeError("'cov' must be a CovarianceDesign instance") + if not isinstance(n, int): + raise TypeError("'n' must be an integer") + if n <= 0: + raise ValueError("'n' must be positive") + if rotated_measure is not None and not callable(rotated_measure): + raise TypeError("'rotated_measure' must be callable") + + if rotated_measure is None: + rotated_measure = np.random.normal + + Sigma = cov.get_Sigma() + try: + L = np.linalg.cholesky(Sigma) + except np.linalg.LinAlgError: + raise ValueError("Covariance matrix is not positive definite") + + p = cov.nfeatures() + Z = rotated_measure(size=(n, p)) + return Z @ L + + +def set_groups(design, groups_or_p): + """Set the number of features for a covariance design. + + Parameters + ---------- + design : CovarianceDesign + The covariance design to modify. + groups_or_p : int or GroupedFeatures + Number of features or feature groupings. + """ + if not isinstance(design, CovarianceDesign): + raise TypeError("'design' must be a CovarianceDesign instance") + + if isinstance(groups_or_p, int): + if groups_or_p <= 0: + raise ValueError("'groups_or_p' as an integer must be a positive value.") + + if isinstance(design, BlockCovarianceDesign): + n_blocks = len(design.blocks) + size_per_block = groups_or_p // n_blocks + remainder = groups_or_p % n_blocks + sizes = [ + size_per_block + (1 if i < remainder else 0) for i in range(n_blocks) + ] + groups = GroupedFeatures(ps=sizes) + + for block, size in zip(design.blocks, sizes): + block.p = size + design.groups = groups + else: + design.p = groups_or_p + return + + # Check if groups_or_p is a GroupedFeatures instance using __class__ comparison + if groups_or_p.__class__.__name__ == "GroupedFeatures": + if isinstance(design, BlockCovarianceDesign): + if len(groups_or_p.ps) != len(design.blocks): + raise ValueError( + "Number of groups must match number of blocks in BlockCovarianceDesign." 
+ ) + for block, group_size in zip(design.blocks, groups_or_p.ps): + block.p = group_size + design.groups = groups_or_p + else: + design.p = sum(groups_or_p.ps) + return + + # If we get here, the input type was invalid + raise TypeError("groups_or_p must be an instance of GroupedFeatures or int.") diff --git a/src/group_lasso.py b/pygridge/src/group_lasso.py similarity index 100% rename from src/group_lasso.py rename to pygridge/src/group_lasso.py diff --git a/pygridge/src/groupedfeatures.py b/pygridge/src/groupedfeatures.py new file mode 100644 index 0000000..834134f --- /dev/null +++ b/pygridge/src/groupedfeatures.py @@ -0,0 +1,349 @@ +"""Create and manage grouped feature structures for statistical modeling.""" + +from typing import Callable, Union, TypeVar, List, Optional +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_array, check_is_fitted + +T = TypeVar("T") + + +class GroupedFeatures(BaseEstimator, TransformerMixin): + r"""A class representing groups of features following scikit-learn's API. + + The first :math:`ps[0]` features are one group, the next :math:`ps[1]` features are the + second group, and so forth. + + Parameters + ---------- + ps : list of int + List of positive integers representing the size of each group. + group_operation : callable, default=None + Optional function to apply to each group during transformation. + If None, features are kept as is. + + Attributes + ---------- + ps_ : list of int + List of group sizes. + n_features_in_ : int + Total number of features, calculated as :math:`p = \sum ps`. + n_groups_ : int + Number of groups, denoted as :math:`G`. + feature_groups_ : list of range + List of range objects representing the indices for each group. + + Raises + ------ + TypeError + If `ps` is not a list or contains non-integer elements. + ValueError + If any group size is not positive. + """ + + def __init__(self, ps: List[int], group_operation: Optional[Callable] = None): + self.ps = ps + self.group_operation = group_operation + + def fit(self, X, y=None): + """Fit the GroupedFeatures transformer. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : None + Ignored. This parameter exists only for compatibility with + scikit-learn's transformer interface. + + Returns + ------- + self : object + Returns self. + """ + # Validate input + X = check_array(X, accept_sparse=True) + + if not isinstance(self.ps, list): + raise TypeError( + f"ps must be a list of positive integers, got {type(self.ps).__name__}" + ) + if not all(isinstance(p, int) for p in self.ps): + raise TypeError("All group sizes in ps must be integers") + if not all(p > 0 for p in self.ps): + raise ValueError("All group sizes in ps must be positive integers") + + # Store attributes + self.ps_ = self.ps + self.n_features_in_ = sum(self.ps_) + self.n_groups_ = len(self.ps_) + + # Validate feature dimensions + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but GroupedFeatures is expecting " + f"{self.n_features_in_} features as per the ps parameter." + ) + + # Precompute group indices + starts = np.cumsum([0] + self.ps_[:-1]).astype(int) + ends = np.cumsum(self.ps_).astype(int) + self.feature_groups_ = [range(start, end) for start, end in zip(starts, ends)] + + return self + + def transform(self, X): + """Transform the data by applying the group operation if specified. 
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Input data to be transformed.
+
+        Returns
+        -------
+        X_transformed : ndarray of shape (n_samples, n_features) or (n_samples, n_groups)
+            Transformed data. If group_operation is None, returns the original data.
+            Otherwise, returns the result of applying group_operation to each group.
+        """
+        check_is_fitted(self)
+        X = check_array(X, accept_sparse=True)
+
+        if X.shape[1] != self.n_features_in_:
+            raise ValueError(
+                f"X has {X.shape[1]} features, but GroupedFeatures is expecting "
+                f"{self.n_features_in_} features as per the fit method."
+            )
+
+        if self.group_operation is None:
+            return X
+
+        # Apply group operation to each group
+        transformed_groups = []
+        for group_range in self.feature_groups_:
+            group_data = X[:, group_range]
+            try:
+                transformed = self.group_operation(group_data)
+                if isinstance(transformed, np.ndarray):
+                    transformed = transformed.reshape(X.shape[0], -1)
+                transformed_groups.append(transformed)
+            except Exception as e:
+                raise RuntimeError(
+                    f"Error applying group_operation to group {group_range}: {e}"
+                )
+
+        return np.column_stack(transformed_groups)
+
+    def get_feature_names_out(self, feature_names_in=None):
+        """Get output feature names for transformation.
+
+        Parameters
+        ----------
+        feature_names_in : array-like of str or None, default=None
+            Input feature names. If None, returns generated names.
+
+        Returns
+        -------
+        feature_names_out : ndarray of str objects
+            Output feature names.
+        """
+        check_is_fitted(self)
+
+        if feature_names_in is None:
+            feature_names_in = [f"feature{i}" for i in range(self.n_features_in_)]
+
+        if len(feature_names_in) != self.n_features_in_:
+            raise ValueError(
+                f"Length of feature_names_in ({len(feature_names_in)}) does not match "
+                f"number of features ({self.n_features_in_})"
+            )
+
+        if self.group_operation is None:
+            return np.array(feature_names_in)
+
+        # If a group operation is specified, generate one name per group
+        return np.array([f"group{i}" for i in range(self.n_groups_)])
+
+    def group_idx(self, i: int) -> range:
+        """Get the range of feature indices for the i-th group.
+
+        Parameters
+        ----------
+        i : int
+            Index of the group (0-based).
+
+        Returns
+        -------
+        range
+            Range object representing the indices of the group.
+
+        Raises
+        ------
+        TypeError
+            If `i` is not an integer.
+        IndexError
+            If `i` is out of range [0, n_groups_ - 1].
+        """
+        check_is_fitted(self)
+
+        if not isinstance(i, int):
+            raise TypeError(f"Group index i must be an integer, got {type(i).__name__}")
+        if not (0 <= i < self.n_groups_):
+            raise IndexError(
+                f"Group index i={i} is out of range [0, {self.n_groups_ - 1}]"
+            )
+
+        return self.feature_groups_[i]
+
+    def group_summary(
+        self,
+        vec: Union[List[T], np.ndarray],
+        f: Callable[[Union[List[T], np.ndarray]], T],
+    ) -> List[T]:
+        """Apply a summary function to each group of features.
+
+        Parameters
+        ----------
+        vec : array-like of shape (n_features,) or (n_samples, n_features)
+            List or ndarray of features.
+        f : callable
+            Function that takes a list or ndarray of features and returns a
+            summary value.
+
+        Returns
+        -------
+        list
+            List of summary values, one per group.
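+
+        Examples
+        --------
+        A minimal illustrative sketch; the dummy ``fit`` call and the use of
+        the built-in ``sum`` as the summary function are arbitrary example
+        choices:
+
+        >>> import numpy as np
+        >>> groups = GroupedFeatures([2, 3]).fit(np.zeros((1, 5)))
+        >>> groups.group_summary([0.0, 1.0, 2.0, 3.0, 4.0], sum)
+        [1.0, 9.0]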
+ """ + check_is_fitted(self) + + if not callable(f): + raise TypeError("f must be a callable function") + + if isinstance(vec, np.ndarray): + vec = check_array(vec, ensure_2d=False) + if vec.ndim == 1: + if vec.shape[0] != self.n_features_in_: + raise ValueError( + f"Length of vec ({vec.shape[0]}) does not match number of " + f"features ({self.n_features_in_})" + ) + vec = vec.reshape(1, -1) + elif vec.shape[1] != self.n_features_in_: + raise ValueError( + f"Length of vec ({vec.shape[1]}) does not match number of " + f"features ({self.n_features_in_})" + ) + elif not isinstance(vec, list) or len(vec) != self.n_features_in_: + raise ValueError( + f"vec must be array-like with {self.n_features_in_} features" + ) + + summaries = [] + for group_range in self.feature_groups_: + try: + group_features = ( + [vec[j] for j in group_range] + if isinstance(vec, list) + else vec[:, group_range] + ) + summaries.append(f(group_features)) + except Exception as e: + raise RuntimeError(f"Error applying function to group {group_range}: {e}") + + return summaries + + def group_expand(self, vec_or_num: Union[List[T], T, np.ndarray]) -> List[T]: + """Expand a vector or number to a list of features. + + Parameters + ---------- + vec_or_num : int, float, list or ndarray + Either a single number or a list/ndarray with length equal to number + of groups. + + Returns + ------- + list + Expanded list of features. + """ + check_is_fitted(self) + + if isinstance(vec_or_num, (int, float)): + return [vec_or_num] * self.n_features_in_ + + if isinstance(vec_or_num, (list, np.ndarray)): + if len(vec_or_num) != self.n_groups_: + raise ValueError( + f"Length of vec_or_num ({len(vec_or_num)}) does not match number of " + f"groups ({self.n_groups_})" + ) + + if isinstance(vec_or_num, np.ndarray): + vec_or_num = vec_or_num.tolist() + + expanded = [] + for val, group_range in zip(vec_or_num, self.feature_groups_): + expanded.extend([val] * len(group_range)) + return expanded + + raise TypeError( + "vec_or_num must be either a number (int or float) or a list/ndarray, got " + f"{type(vec_or_num).__name__}" + ) + + @classmethod + def from_group_size(cls, group_size: int, num_groups: int): + """Create a GroupedFeatures instance with uniform group sizes. + + Parameters + ---------- + group_size : int + Size of each group. + num_groups : int + Number of groups. + + Returns + ------- + GroupedFeatures + Instance with uniform group sizes. + """ + if not isinstance(group_size, int): + raise TypeError( + f"group_size must be an integer, got {type(group_size).__name__}" + ) + if not isinstance(num_groups, int): + raise TypeError( + f"num_groups must be an integer, got {type(num_groups).__name__}" + ) + if group_size <= 0: + raise ValueError("group_size must be a positive integer") + if num_groups <= 0: + raise ValueError("num_groups must be a positive integer") + + return cls([group_size] * num_groups) + + +def fill(value: T, length: int) -> List[T]: + """Fill a list with a given value repeated length times. + + Parameters + ---------- + value : T + Value to fill the list with. + length : int + Number of times to repeat the value. + + Returns + ------- + list + List containing the value repeated length times. 
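+
+    Examples
+    --------
+    A minimal illustrative sketch:
+
+    >>> fill(2.0, 3)
+    [2.0, 2.0, 2.0]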
+ """ + if not isinstance(length, int): + raise TypeError(f"length must be an integer, got {type(length).__name__}") + if length < 0: + raise ValueError("length must be a non-negative integer") + return [value] * length diff --git a/src/lambda_max_group_lasso.py b/pygridge/src/lambda_max_group_lasso.py similarity index 100% rename from src/lambda_max_group_lasso.py rename to pygridge/src/lambda_max_group_lasso.py diff --git a/src/lambda_max_lasso.py b/pygridge/src/lambda_max_lasso.py similarity index 100% rename from src/lambda_max_lasso.py rename to pygridge/src/lambda_max_lasso.py diff --git a/src/lambda_max_sparse_group_lasso.py b/pygridge/src/lambda_max_sparse_group_lasso.py similarity index 99% rename from src/lambda_max_sparse_group_lasso.py rename to pygridge/src/lambda_max_sparse_group_lasso.py index 163cc81..a4eda9b 100644 --- a/src/lambda_max_sparse_group_lasso.py +++ b/pygridge/src/lambda_max_sparse_group_lasso.py @@ -1,7 +1,7 @@ """Compute the maximum lambda value for sparse group lasso regularization.""" import numpy as np -from bisection import bisection +from .bisection import bisection def lambda_max_sparse_group_lasso( diff --git a/src/lasso.py b/pygridge/src/lasso.py similarity index 100% rename from src/lasso.py rename to pygridge/src/lasso.py diff --git a/src/nnls.py b/pygridge/src/nnls.py similarity index 100% rename from src/nnls.py rename to pygridge/src/nnls.py diff --git a/src/seagull.py b/pygridge/src/seagull.py similarity index 98% rename from src/seagull.py rename to pygridge/src/seagull.py index 35a7fc0..afedfb9 100644 --- a/src/seagull.py +++ b/pygridge/src/seagull.py @@ -2,12 +2,12 @@ import numpy as np import warnings -from lambda_max_lasso import lambda_max_lasso -from lambda_max_group_lasso import lambda_max_group_lasso -from lambda_max_sparse_group_lasso import lambda_max_sparse_group_lasso -from lasso import lasso -from group_lasso import group_lasso -from sparse_group_lasso import sparse_group_lasso +from .lambda_max_lasso import lambda_max_lasso +from .lambda_max_group_lasso import lambda_max_group_lasso +from .lambda_max_sparse_group_lasso import lambda_max_sparse_group_lasso +from .lasso import lasso +from .group_lasso import group_lasso +from .sparse_group_lasso import sparse_group_lasso def seagull( diff --git a/src/sparse_group_lasso.py b/pygridge/src/sparse_group_lasso.py similarity index 100% rename from src/sparse_group_lasso.py rename to pygridge/src/sparse_group_lasso.py diff --git a/test/__init__.py b/pygridge/test/__init__.py similarity index 100% rename from test/__init__.py rename to pygridge/test/__init__.py diff --git a/test/test_bisection.py b/pygridge/test/test_bisection.py similarity index 100% rename from test/test_bisection.py rename to pygridge/test/test_bisection.py diff --git a/test/test_blockridge_scikit.py b/pygridge/test/test_blockridge.py similarity index 77% rename from test/test_blockridge_scikit.py rename to pygridge/test/test_blockridge.py index 518b360..fb5f652 100644 --- a/test/test_blockridge_scikit.py +++ b/pygridge/test/test_blockridge.py @@ -1,5 +1,3 @@ -#TODO: Fails 2 tests. Need to fix. 
- """Tests for scikit-learn compatible Ridge regression estimators.""" import pytest @@ -8,7 +6,7 @@ from sklearn.utils.estimator_checks import check_estimator from sklearn.exceptions import NotFittedError from sklearn.datasets import make_regression -from PyGRidge.src.blockridge import ( +from ..src.blockridge import ( GroupRidgeRegressor, CholeskyRidgePredictor, WoodburyRidgePredictor, @@ -21,17 +19,14 @@ SingularMatrixError, NumericalInstabilityError, ) -from PyGRidge.src.groupedfeatures import GroupedFeatures +from ..src.groupedfeatures import GroupedFeatures @pytest.fixture def sample_data(): """Generate sample regression data.""" X, y = make_regression( - n_samples=100, - n_features=20, - n_informative=10, - random_state=42 + n_samples=100, n_features=20, n_informative=10, random_state=42 ) return X, y @@ -39,7 +34,10 @@ def sample_data(): @pytest.fixture def sample_groups(): """Create sample feature groups.""" - return GroupedFeatures([5, 5, 5, 5]) # 4 groups of 5 features each + groups = GroupedFeatures([5, 5, 5, 5]) # 4 groups of 5 features each + # Fit with dummy data to initialize internal state + groups.fit(np.zeros((1, 20))) + return groups @pytest.fixture @@ -52,16 +50,21 @@ def fitted_model(sample_data, sample_groups): def test_scikit_learn_compatibility(): """Test scikit-learn estimator compatibility.""" - # Create a simple estimator with proper array alpha - groups = GroupedFeatures([3]) # Match number of features used by scikit-learn - alpha = np.array([1.0], dtype=float) # Ensure float array + # Create data with matching dimensions + n_samples, n_features = 10, 5 + X = np.random.randn(n_samples, n_features) + y = np.random.randn(n_samples) + + # Create and fit groups + groups = GroupedFeatures([n_features]) # One group with all features + groups.fit(X) # Fit groups with the data + alpha = np.array([1.0], dtype=float) + + # Create and fit model model = GroupRidgeRegressor(groups=groups, alpha=alpha) - - # Generate simple data for initial fit - X = np.random.randn(10, 3) # Match number of features with groups - y = np.random.randn(10) - model.fit(X, y) # Pre-fit to avoid initialization issues - + model.fit(X, y) + + # Run the scikit-learn compatibility checks check_estimator(model) @@ -87,6 +90,7 @@ def test_validate_params(self, sample_groups): # Invalid cases with pytest.raises(ValueError): empty_groups = GroupedFeatures([]) + empty_groups.fit(np.zeros((1, 0))) # Fit with empty data GroupRidgeRegressor(groups=empty_groups)._validate_params() with pytest.raises(ValueError): @@ -97,31 +101,37 @@ def test_fit(self, sample_data, sample_groups): """Test model fitting.""" X, y = sample_data model = GroupRidgeRegressor(groups=sample_groups) - + # Test successful fit model.fit(X, y) - assert hasattr(model, 'coef_') - assert hasattr(model, 'n_features_in_') + assert hasattr(model, "coef_") + assert hasattr(model, "n_features_in_") assert model.n_features_in_ == X.shape[1] - assert hasattr(model, 'y_') # Check y_ is stored - assert hasattr(model, 'gram_reg_inv_') # Check gram_reg_inv_ is stored + assert hasattr(model, "y_") + assert hasattr(model, "gram_reg_inv_") - # Test predictor selection + # Test predictor selection with small dimensions X_small = X[:, :10] - model_small = GroupRidgeRegressor(groups=GroupedFeatures([5, 5])) + small_groups = GroupedFeatures([5, 5]) + small_groups.fit(X_small) # Fit groups with the data + model_small = GroupRidgeRegressor(groups=small_groups) model_small.fit(X_small, y) assert isinstance(model_small.predictor_, CholeskyRidgePredictor) - X_large 
= np.random.randn(10, 50) - y_large = np.random.randn(10) - model_large = GroupRidgeRegressor(groups=GroupedFeatures([25, 25])) + # Test with larger dimensions + n_samples, n_features = 20, 10 + X_large = np.random.randn(n_samples, n_features) + y_large = np.random.randn(n_samples) + large_groups = GroupedFeatures([5, 5]) + large_groups.fit(X_large) # Fit groups with the data + model_large = GroupRidgeRegressor(groups=large_groups) model_large.fit(X_large, y_large) assert isinstance(model_large.predictor_, ShermanMorrisonRidgePredictor) def test_predict(self, fitted_model, sample_data): """Test prediction functionality.""" X, _ = sample_data - + # Test prediction shape y_pred = fitted_model.predict(X) assert y_pred.shape == (X.shape[0],) @@ -135,10 +145,11 @@ def test_get_set_params(self, sample_groups): """Test parameter getting and setting.""" model = GroupRidgeRegressor(groups=sample_groups) params = model.get_params() - assert 'groups' in params - assert 'alpha' in params + assert "groups" in params + assert "alpha" in params new_groups = GroupedFeatures([3, 3]) + new_groups.fit(np.zeros((1, 6))) # Fit with dummy data model.set_params(groups=new_groups) assert model.groups == new_groups @@ -165,6 +176,7 @@ def sample_matrices(self): """Generate sample matrices for testing.""" X = np.random.randn(10, 5) groups = GroupedFeatures([2, 3]) + groups.fit(X) # Fit groups with the data alpha = np.array([0.1, 0.2]) return X, groups, alpha @@ -172,7 +184,7 @@ def test_cholesky_predictor(self, sample_matrices): """Test CholeskyRidgePredictor.""" X, groups, alpha = sample_matrices predictor = CholeskyRidgePredictor(X) - + # Test initialization assert predictor.n_samples_ == X.shape[0] assert predictor.n_features_ == X.shape[1] @@ -180,29 +192,28 @@ def test_cholesky_predictor(self, sample_matrices): # Test parameter updates predictor.set_params(groups, alpha) - assert hasattr(predictor, 'gram_reg_chol_') + assert hasattr(predictor, "gram_reg_chol_") # Test error cases with pytest.raises(InvalidDimensionsError): CholeskyRidgePredictor(X.flatten()) - # Create a truly singular matrix + # Test with singular matrix X_singular = np.zeros((10, 5)) - X_singular[:, 0] = 1 # Make first column all ones - X_singular[:, 1] = 1 # Make second column identical to first - predictor_singular = CholeskyRidgePredictor(X_singular) - with pytest.raises((SingularMatrixError, np.linalg.LinAlgError)): - predictor_singular.set_params(groups, np.zeros_like(alpha)) + X_singular[:, 0] = 1 + X_singular[:, 1] = X_singular[:, 0] + with pytest.raises(SingularMatrixError): + predictor_singular = CholeskyRidgePredictor(X_singular) def test_woodbury_predictor(self, sample_matrices): """Test WoodburyRidgePredictor.""" X, groups, alpha = sample_matrices predictor = WoodburyRidgePredictor(X) - + # Test initialization assert predictor.n_samples_ == X.shape[0] assert predictor.n_features_ == X.shape[1] - assert hasattr(predictor, 'alpha_inv_') + assert hasattr(predictor, "alpha_inv_") # Test parameter updates predictor.set_params(groups, alpha) @@ -215,11 +226,11 @@ def test_sherman_morrison_predictor(self, sample_matrices): """Test ShermanMorrisonRidgePredictor.""" X, groups, alpha = sample_matrices predictor = ShermanMorrisonRidgePredictor(X) - + # Test initialization assert predictor.n_samples_ == X.shape[0] assert predictor.n_features_ == X.shape[1] - assert hasattr(predictor, 'A_inv_') + assert hasattr(predictor, "A_inv_") # Test parameter updates predictor.set_params(groups, alpha) @@ -227,9 +238,7 @@ def 
test_sherman_morrison_predictor(self, sample_matrices): # Test Sherman-Morrison formula u = np.random.randn(5) v = np.random.randn(5) - result = predictor._sherman_morrison_formula( - np.eye(5), u, v - ) + result = predictor._sherman_morrison_formula(np.eye(5), u, v) assert result.shape == (5, 5) @@ -239,11 +248,11 @@ class TestMomentTuner: def test_moment_tuner_setup(self, fitted_model): """Test MomentTunerSetup initialization and attributes.""" tuner = MomentTunerSetup(fitted_model) - - assert hasattr(tuner, 'groups_') - assert hasattr(tuner, 'n_features_per_group_') - assert hasattr(tuner, 'moment_matrix_') - + + assert hasattr(tuner, "groups_") + assert hasattr(tuner, "n_features_per_group_") + assert hasattr(tuner, "moment_matrix_") + # Test shape consistency n_groups = len(tuner.n_features_per_group_) assert tuner.moment_matrix_.shape == (n_groups, n_groups) @@ -253,7 +262,7 @@ def test_get_lambdas(self, fitted_model): """Test lambda parameter computation.""" tuner = MomentTunerSetup(fitted_model) sigma_sq = 0.1 - + lambdas = get_lambdas(tuner, sigma_sq) assert len(lambdas) == len(tuner.n_features_per_group_) assert np.all(lambdas >= 0) @@ -266,7 +275,7 @@ def test_get_alpha_squared(self, fitted_model): """Test alpha squared computation.""" tuner = MomentTunerSetup(fitted_model) sigma_sq = 0.1 - + alpha_squared = get_alpha_s_squared(tuner, sigma_sq) assert len(alpha_squared) == len(tuner.n_features_per_group_) assert np.all(alpha_squared >= 0) @@ -289,12 +298,11 @@ def test_lambda_lolas_rule(fitted_model): def test_edge_cases(sample_groups): """Test various edge cases and error conditions.""" - # Test with invalid input dimensions X_1d = np.random.randn(10) y_1d = np.random.randn(10) model = GroupRidgeRegressor(groups=sample_groups) - + with pytest.raises(ValueError): model.fit(X_1d, y_1d) @@ -302,24 +310,23 @@ def test_edge_cases(sample_groups): X_nan = np.random.randn(10, 20) X_nan[0, 0] = np.nan y_nan = np.random.randn(10) - + with pytest.raises(ValueError): model.fit(X_nan, y_nan) # Test with mismatched dimensions X = np.random.randn(10, 20) y_mismatched = np.random.randn(15) - + with pytest.raises(ValueError): model.fit(X, y_mismatched) # Test with singular matrix - # Create a matrix with perfect collinearity - X_singular = np.random.randn(10, 20) - X_singular[:, 1] = 2 * X_singular[:, 0] # Make second column linearly dependent + X_singular = np.zeros((10, 20)) + X_singular[:, 0] = 1 + X_singular[:, 1] = X_singular[:, 0] y = np.random.randn(10) - - # Use small alpha to ensure numerical issues + model_singular = GroupRidgeRegressor(groups=sample_groups) with pytest.raises((SingularMatrixError, np.linalg.LinAlgError)): model_singular.fit(X_singular, y) @@ -331,14 +338,16 @@ def test_numerical_stability(): X_large = np.random.randn(10, 5) * 1e10 y_large = np.random.randn(10) * 1e10 groups = GroupedFeatures([2, 3]) + groups.fit(X_large) # Fit groups with the data model = GroupRidgeRegressor(groups=groups) - + # This should still work despite large values model.fit(X_large, y_large) - + # Test with very small values X_small = np.random.randn(10, 5) * 1e-10 y_small = np.random.randn(10) * 1e-10 - + groups.fit(X_small) # Fit groups with the data + # This should still work despite small values model.fit(X_small, y_small) diff --git a/pygridge/test/test_covariance_design.py b/pygridge/test/test_covariance_design.py new file mode 100644 index 0000000..fd7bd2e --- /dev/null +++ b/pygridge/test/test_covariance_design.py @@ -0,0 +1,287 @@ +"""Tests for covariance matrix designs. 
+ +This module contains tests for the covariance matrix design implementations +in covariance_design.py. +""" + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils._testing import assert_almost_equal +from sklearn.exceptions import NotFittedError + +from ..src.covariance_design import ( + DiscreteNonParametric, + AR1Design, + IdentityCovarianceDesign, + UniformScalingCovarianceDesign, + ExponentialOrderStatsCovarianceDesign, + BlockDiagonal, + block_diag, + BlockCovarianceDesign, + MixtureModel, + simulate_rotated_design, + set_groups +) +from ..src.groupedfeatures import GroupedFeatures + + +@pytest.fixture +def sample_data(): + """Generate sample data for testing.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 5) + return X + + +def test_discrete_nonparametric_init(): + """Test initialization of DiscreteNonParametric.""" + # Valid initialization + spec = DiscreteNonParametric([1.0, 2.0], [0.3, 0.7]) + assert_array_equal(spec.eigs, np.array([1.0, 2.0])) + assert_array_equal(spec.probs, np.array([0.3, 0.7])) + + # Invalid inputs + with pytest.raises(TypeError): + DiscreteNonParametric(1.0, [0.5, 0.5]) # eigs not a list + with pytest.raises(TypeError): + DiscreteNonParametric([1.0, 2.0], 0.5) # probs not a list + with pytest.raises(ValueError): + DiscreteNonParametric([1.0], [0.5, 0.5]) # length mismatch + with pytest.raises(ValueError): + DiscreteNonParametric([1.0, 2.0], [0.3, 0.3]) # probs don't sum to 1 + with pytest.raises(ValueError): + DiscreteNonParametric([1.0, 2.0], [-0.1, 1.1]) # negative prob + + +def test_ar1_design(): + """Test AR1Design covariance matrix.""" + # Test initialization + design = AR1Design(p=3, rho=0.5) + design.fit() + + # Expected covariance matrix for AR(1) with rho=0.5 + expected = np.array([ + [1.0, 0.5, 0.25], + [0.5, 1.0, 0.5], + [0.25, 0.5, 1.0] + ]) + assert_array_almost_equal(design.covariance_, expected) + + # Test invalid parameters + with pytest.raises(ValueError): + AR1Design(p=3, rho=1.0) # rho must be < 1 + with pytest.raises(ValueError): + AR1Design(p=3, rho=-0.1) # rho must be >= 0 + with pytest.raises(ValueError): + AR1Design(p=-1, rho=0.5) # p must be positive + + +def test_identity_covariance_design(): + """Test IdentityCovarianceDesign.""" + design = IdentityCovarianceDesign(p=3) + design.fit() + + expected = np.eye(3) + assert_array_equal(design.covariance_, expected) + assert_array_equal(design.get_precision(), expected) + + # Test spectrum + spectrum = design.spectrum() + assert_array_equal(spectrum.eigs, np.ones(3)) + assert_array_equal(spectrum.probs, np.ones(3) / 3) + + +def test_uniform_scaling_covariance_design(): + """Test UniformScalingCovarianceDesign.""" + scaling = 2.0 + design = UniformScalingCovarianceDesign(scaling=scaling, p=3) + design.fit() + + expected = scaling * np.eye(3) + assert_array_equal(design.covariance_, expected) + assert_array_equal(design.get_precision(), np.eye(3) / scaling) + + # Test invalid parameters + with pytest.raises(ValueError): + UniformScalingCovarianceDesign(scaling=-1.0, p=3) # scaling must be positive + + +def test_exponential_order_stats_covariance_design(): + """Test ExponentialOrderStatsCovarianceDesign.""" + design = ExponentialOrderStatsCovarianceDesign(p=3, rate=1.0) + design.fit() + + # Test that covariance is diagonal and ordered + assert np.all(np.diag(design.covariance_) > 0) # positive diagonal + assert np.all(np.tril(design.covariance_, -1) == 0) # lower triangular is zero + assert 
np.all(np.triu(design.covariance_, 1) == 0) # upper triangular is zero + + # Test ordering of eigenvalues + eigs = np.diag(design.covariance_) + assert np.all(np.diff(eigs) < 0) # eigenvalues should be decreasing + + +def test_block_diagonal(): + """Test BlockDiagonal class and block_diag function.""" + A = np.array([[1, 0], [0, 1]]) + B = np.array([[2, 0], [0, 2]]) + + # Test BlockDiagonal class + block_diag_mat = BlockDiagonal([A, B]) + expected = np.array([ + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 2, 0], + [0, 0, 0, 2] + ]) + assert_array_equal(block_diag_mat.get_Sigma(), expected) + + # Test block_diag function + assert_array_equal(block_diag(A, B), expected) + + # Test invalid inputs + with pytest.raises(ValueError): + BlockDiagonal([A, np.array([1, 2])]) # non-square matrix + with pytest.raises(TypeError): + BlockDiagonal([A, "not a matrix"]) # invalid type + + +def test_mixture_model(): + """Test MixtureModel.""" + s1 = DiscreteNonParametric([1, 2], [0.5, 0.5]) + s2 = DiscreteNonParametric([3, 4], [0.3, 0.7]) + + # Valid initialization + mix = MixtureModel([s1, s2], [0.6, 0.4]) + assert len(mix.spectra) == 2 + assert_array_almost_equal(mix.mixing_prop, [0.6, 0.4]) + + # Invalid inputs + with pytest.raises(ValueError): + MixtureModel([s1, s2], [0.5, 0.6]) # mixing props don't sum to 1 + with pytest.raises(ValueError): + MixtureModel([s1], [0.5, 0.5]) # length mismatch + + +def test_block_covariance_design(): + """Test BlockCovarianceDesign.""" + block1 = IdentityCovarianceDesign(p=2) + block2 = UniformScalingCovarianceDesign(scaling=2.0, p=2) + + # Test without groups + design = BlockCovarianceDesign([block1, block2]) + design.fit() + + expected = np.array([ + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 2, 0], + [0, 0, 0, 2] + ]) + assert_array_almost_equal(design.covariance_, expected) + + # Test with groups + groups = GroupedFeatures(ps=[2, 2]) + groups.fit(np.zeros((1, 4))) # Fit with dummy data + design = BlockCovarianceDesign([block1, block2], groups=groups) + design.fit() + assert_array_almost_equal(design.covariance_, expected) + + +def test_simulate_rotated_design(sample_data): + """Test simulate_rotated_design function.""" + design = AR1Design(p=5, rho=0.5) + design.fit() # Ensure design is fitted + n_samples = 100 + + # Test with default random measure + X = simulate_rotated_design(design, n_samples) + assert X.shape == (n_samples, 5) + + # Test with custom random measure + def custom_measure(size): + return np.ones(size) + + X = simulate_rotated_design(design, n_samples, rotated_measure=custom_measure) + assert X.shape == (n_samples, 5) + + # Test with unfitted design + unfitted_design = AR1Design(p=5, rho=0.5) + with pytest.raises(NotFittedError): + simulate_rotated_design(unfitted_design, n_samples) + + # Test other invalid inputs + with pytest.raises(TypeError): + simulate_rotated_design("not a design", n_samples) + with pytest.raises(ValueError): + simulate_rotated_design(design, -1) + + +def test_set_groups(): + """Test set_groups function.""" + # Test with integer input + design = IdentityCovarianceDesign() + set_groups(design, 5) + assert design.p == 5 + + # Test with GroupedFeatures input + block1 = IdentityCovarianceDesign() + block2 = UniformScalingCovarianceDesign(scaling=2.0) + block_design = BlockCovarianceDesign([block1, block2]) + + groups = GroupedFeatures(ps=[2, 3]) + groups.fit(np.zeros((1, 5))) # Fit with dummy data + set_groups(block_design, groups) + assert block1.p == 2 + assert block2.p == 3 + + # Test with unfitted GroupedFeatures + unfitted_groups = 
GroupedFeatures(ps=[2, 3]) + with pytest.raises(NotFittedError): + set_groups(block_design, unfitted_groups) + + # Test other invalid inputs + with pytest.raises(TypeError): + set_groups(design, "not an int or GroupedFeatures") + with pytest.raises(ValueError): + set_groups(design, -1) + + +def test_covariance_design_transform(sample_data): + """Test transform method of CovarianceDesign.""" + design = AR1Design(p=5, rho=0.5) + design.fit() + + # Test successful transform + X_transformed = design.transform(sample_data) + assert X_transformed.shape == sample_data.shape + + # Test transform before fit + design_unfit = AR1Design(p=5, rho=0.5) + with pytest.raises(NotFittedError): + design_unfit.transform(sample_data) + + # Test with invalid input dimensions + with pytest.raises(ValueError): + design.transform(sample_data[:, :3]) # Wrong number of features + + +def test_covariance_design_score(sample_data): + """Test score method of CovarianceDesign.""" + design = AR1Design(p=5, rho=0.5) + design.fit() + + # Test successful score computation + score = design.score(sample_data) + assert isinstance(score, float) + assert score <= 0 # Log-likelihood should be non-positive + + # Test score before fit + design_unfit = AR1Design(p=5, rho=0.5) + with pytest.raises(NotFittedError): + design_unfit.score(sample_data) + + # Test with invalid input dimensions + with pytest.raises(ValueError): + design.score(sample_data[:, :3]) # Wrong number of features diff --git a/test/test_group_lasso.py b/pygridge/test/test_group_lasso.py similarity index 100% rename from test/test_group_lasso.py rename to pygridge/test/test_group_lasso.py diff --git a/pygridge/test/test_groupedfeatures.py b/pygridge/test/test_groupedfeatures.py new file mode 100644 index 0000000..49789da --- /dev/null +++ b/pygridge/test/test_groupedfeatures.py @@ -0,0 +1,192 @@ +"""Tests for GroupedFeatures class.""" + +import pytest +import numpy as np +from numpy.testing import assert_array_equal, assert_array_almost_equal +from sklearn.utils.estimator_checks import check_estimator +from sklearn.exceptions import NotFittedError + +from ..src.groupedfeatures import GroupedFeatures, fill + + +@pytest.fixture +def sample_data(): + """Generate sample data for testing.""" + rng = np.random.RandomState(42) + X = rng.randn(100, 6) # 6 features for 2 groups of 3 + return X + + +@pytest.fixture +def sample_groups(): + """Create sample GroupedFeatures instance.""" + return GroupedFeatures([3, 3]) # 2 groups of 3 features each + + +def test_scikit_learn_compatibility(): + """Test scikit-learn estimator compatibility.""" + groups = GroupedFeatures([2, 2]) + check_estimator(groups) + + +def test_initialization(): + """Test GroupedFeatures initialization.""" + # Valid initialization + groups = GroupedFeatures([2, 3]) + assert groups.ps == [2, 3] + + # Invalid initializations + with pytest.raises(TypeError): + GroupedFeatures("not a list") + with pytest.raises(TypeError): + GroupedFeatures([1.5, 2]) # non-integer + with pytest.raises(ValueError): + GroupedFeatures([0, 1]) # zero group size + with pytest.raises(ValueError): + GroupedFeatures([-1, 2]) # negative group size + + +def test_from_group_size(): + """Test from_group_size class method.""" + groups = GroupedFeatures.from_group_size(2, 3) + assert groups.ps == [2, 2, 2] + + with pytest.raises(TypeError): + GroupedFeatures.from_group_size(1.5, 2) # non-integer group_size + with pytest.raises(TypeError): + GroupedFeatures.from_group_size(2, 2.5) # non-integer num_groups + with pytest.raises(ValueError): + 
GroupedFeatures.from_group_size(0, 2) # zero group_size + with pytest.raises(ValueError): + GroupedFeatures.from_group_size(2, 0) # zero num_groups + + +def test_fit_transform(sample_data, sample_groups): + """Test fit and transform methods.""" + # Test fit + sample_groups.fit(sample_data) + assert hasattr(sample_groups, "n_features_in_") + assert sample_groups.n_features_in_ == 6 + assert hasattr(sample_groups, "feature_groups_") + assert len(sample_groups.feature_groups_) == 2 + + # Test transform without group_operation (should return same data) + X_transformed = sample_groups.transform(sample_data) + assert_array_equal(X_transformed, sample_data) + + # Test transform with group_operation + def mean_operation(group_data): + return np.mean(group_data, axis=1, keepdims=True) + + groups_with_op = GroupedFeatures([3, 3], group_operation=mean_operation) + groups_with_op.fit(sample_data) + X_transformed = groups_with_op.transform(sample_data) + assert X_transformed.shape == (100, 2) # One value per group + + # Test transform before fit + groups_unfit = GroupedFeatures([3, 3]) + with pytest.raises(NotFittedError): + groups_unfit.transform(sample_data) + + +def test_get_feature_names_out(sample_data, sample_groups): + """Test get_feature_names_out method.""" + sample_groups.fit(sample_data) + + # Test without input feature names + feature_names = sample_groups.get_feature_names_out() + assert len(feature_names) == 6 + assert all(isinstance(name, str) for name in feature_names) + + # Test with input feature names + input_names = [f"feat_{i}" for i in range(6)] + feature_names = sample_groups.get_feature_names_out(input_names) + assert_array_equal(feature_names, input_names) + + # Test with group_operation (should return group names) + def mean_operation(group_data): + return np.mean(group_data, axis=1, keepdims=True) + + groups_with_op = GroupedFeatures([3, 3], group_operation=mean_operation) + groups_with_op.fit(sample_data) + feature_names = groups_with_op.get_feature_names_out() + assert len(feature_names) == 2 + assert all("group" in name for name in feature_names) + + +def test_group_idx(sample_groups): + """Test group_idx method.""" + sample_groups.fit(np.random.randn(10, 6)) # Fit with dummy data + + # Test valid indices + idx0 = sample_groups.group_idx(0) + assert list(idx0) == [0, 1, 2] + idx1 = sample_groups.group_idx(1) + assert list(idx1) == [3, 4, 5] + + # Test invalid indices + with pytest.raises(TypeError): + sample_groups.group_idx(1.5) # non-integer + with pytest.raises(IndexError): + sample_groups.group_idx(2) # out of range + + +def test_group_summary(sample_data, sample_groups): + """Test group_summary method.""" + sample_groups.fit(sample_data) + + # Test with numpy array + def mean_summary(x): + return np.mean(x) + + summaries = sample_groups.group_summary(sample_data, mean_summary) + assert len(summaries) == 2 + + # Test with list + data_list = sample_data.tolist() + summaries_list = sample_groups.group_summary(data_list, mean_summary) + assert len(summaries_list) == 2 + + # Test invalid inputs + with pytest.raises(TypeError): + sample_groups.group_summary(sample_data, "not a function") + + +def test_group_expand(sample_groups): + """Test group_expand method.""" + sample_groups.fit(np.random.randn(10, 6)) # Fit with dummy data + + # Test with single number + expanded = sample_groups.group_expand(1.0) + assert len(expanded) == 6 + assert all(x == 1.0 for x in expanded) + + # Test with list + expanded = sample_groups.group_expand([1.0, 2.0]) + assert len(expanded) == 6 + 
assert expanded[:3] == [1.0, 1.0, 1.0] + assert expanded[3:] == [2.0, 2.0, 2.0] + + # Test with numpy array + expanded = sample_groups.group_expand(np.array([1.0, 2.0])) + assert len(expanded) == 6 + assert all(expanded[:3] == 1.0) + assert all(expanded[3:] == 2.0) + + # Test invalid inputs + with pytest.raises(ValueError): + sample_groups.group_expand([1.0]) # wrong length + with pytest.raises(TypeError): + sample_groups.group_expand("not a number or list") + + +def test_fill(): + """Test fill function.""" + filled = fill(1.0, 3) + assert len(filled) == 3 + assert all(x == 1.0 for x in filled) + + with pytest.raises(TypeError): + fill(1.0, 1.5) # non-integer length + with pytest.raises(ValueError): + fill(1.0, -1) # negative length diff --git a/test/test_lambda_max_group_lasso.py b/pygridge/test/test_lambda_max_group_lasso.py similarity index 100% rename from test/test_lambda_max_group_lasso.py rename to pygridge/test/test_lambda_max_group_lasso.py diff --git a/test/test_lambda_max_lasso.py b/pygridge/test/test_lambda_max_lasso.py similarity index 100% rename from test/test_lambda_max_lasso.py rename to pygridge/test/test_lambda_max_lasso.py diff --git a/test/test_lambda_max_sparse_group_lasso.py b/pygridge/test/test_lambda_max_sparse_group_lasso.py similarity index 100% rename from test/test_lambda_max_sparse_group_lasso.py rename to pygridge/test/test_lambda_max_sparse_group_lasso.py diff --git a/test/test_lasso.py b/pygridge/test/test_lasso.py similarity index 100% rename from test/test_lasso.py rename to pygridge/test/test_lasso.py diff --git a/test/test_nnls.py b/pygridge/test/test_nnls.py similarity index 100% rename from test/test_nnls.py rename to pygridge/test/test_nnls.py diff --git a/pygridge/test/test_seagull.py b/pygridge/test/test_seagull.py new file mode 100644 index 0000000..a07f23f --- /dev/null +++ b/pygridge/test/test_seagull.py @@ -0,0 +1,185 @@ +"""Unit tests for seagull estimator.""" + +import pytest +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import warnings +from ..src.seagull import seagull + + +@pytest.fixture +def sample_data(): + """Create sample data for testing. 
+ + Returns + ------- + tuple + Tuple containing (y, X, Z, weights_u, groups) where: + - y is the target variable + - X is the fixed effects design matrix + - Z is the random effects design matrix + - weights_u is the weights for random effects + - groups is the group labels + """ + rng = np.random.RandomState(42) + n_samples = 5 + + y = np.arange(1, n_samples + 1, dtype=np.float64) + X = rng.randn(n_samples, 2).astype(np.float64) + Z = np.column_stack([ + np.ones(n_samples), + np.arange(1, n_samples + 1), + np.arange(1, n_samples + 1) ** 2 + ]).astype(np.float64) + weights_u = np.ones(3, dtype=np.float64) + groups = np.array([1, 1, 2, 2, 3], dtype=np.float64) + + return y, X, Z, weights_u, groups + + +class TestSeagullInputValidation: + """Test input validation in seagull.""" + + def test_missing_y(self): + """Test if error is raised when y is missing.""" + with pytest.raises(ValueError, match="Vector y is missing"): + seagull(y=None, Z=np.array([[1, 2], [3, 4]])) + + def test_empty_y(self): + """Test if error is raised when y is empty.""" + with pytest.raises(ValueError, match="Vector y is empty"): + seagull(y=np.array([]), Z=np.array([[1, 2], [3, 4]])) + + def test_non_numeric_y(self): + """Test if error is raised when y contains non-numeric values.""" + with pytest.raises(ValueError, match="Non-numeric values detected"): + seagull(y=np.array(['a', 'b']), Z=np.array([[1, 2], [3, 4]])) + + def test_nan_inf_y(self): + """Test if error is raised when y contains NaN or Inf.""" + with pytest.raises(ValueError, match="NA, NaN, or Inf detected"): + seagull(y=np.array([1, np.nan, 3]), Z=np.array([[1, 2], [3, 4], [5, 6]])) + + def test_missing_Z(self): + """Test if error is raised when Z is missing.""" + with pytest.raises(ValueError, match="Matrix Z is missing"): + seagull(y=np.array([1, 2, 3]), Z=None) + + def test_non_numeric_Z(self): + """Test if error is raised when Z contains non-numeric values.""" + with pytest.raises(ValueError, match="Non-numeric values detected"): + seagull(y=np.array([1, 2]), Z=np.array([['a', 'b'], ['c', 'd']])) + + def test_nan_inf_Z(self): + """Test if error is raised when Z contains NaN or Inf.""" + with pytest.raises(ValueError, match="NA, NaN, or Inf detected"): + seagull(y=np.array([1, 2]), Z=np.array([[1, np.inf], [3, 4]])) + + def test_mismatching_dimensions(self): + """Test if error is raised when dimensions don't match.""" + with pytest.raises(ValueError, match="Mismatching dimensions"): + seagull(y=np.array([1, 2, 3]), Z=np.array([[1, 2], [3, 4]])) + + +class TestSeagullEstimator: + """Test seagull estimator functionality.""" + + def test_lasso_path(self, sample_data): + """Test lasso path computation.""" + y, X, Z, weights_u, _ = sample_data + result = seagull(y=y, X=X, Z=Z, weights_u=weights_u, alpha=1.0) + + assert result["result"] == "lasso" + assert isinstance(result["lambda_values"], np.ndarray) + assert isinstance(result["beta"], np.ndarray) + assert result["beta"].shape[0] == X.shape[1] + Z.shape[1] + + def test_group_lasso_path(self, sample_data): + """Test group lasso path computation.""" + y, X, Z, weights_u, groups = sample_data + result = seagull(y=y, X=X, Z=Z, weights_u=weights_u, + groups=groups, alpha=0.0) + + assert result["result"] == "group_lasso" + assert isinstance(result["lambda_values"], np.ndarray) + assert isinstance(result["beta"], np.ndarray) + + def test_sparse_group_lasso_path(self, sample_data): + """Test sparse group lasso path computation.""" + y, X, Z, weights_u, groups = sample_data + result = seagull(y=y, X=X, Z=Z, 
weights_u=weights_u, + groups=groups, alpha=0.5) + + assert result["result"] == "sparse_group_lasso" + assert isinstance(result["lambda_values"], np.ndarray) + assert isinstance(result["beta"], np.ndarray) + + +class TestSeagullParameters: + """Test seagull parameter validation and defaults.""" + + def test_invalid_alpha(self, sample_data): + """Test handling of invalid alpha parameter.""" + y, _, Z, _, _ = sample_data + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = seagull(y=y, Z=Z, alpha=2.0) + assert len(w) == 1 + assert "alpha is out of range" in str(w[0].message) + assert result["result"] == "lasso" + + def test_missing_groups_for_group_lasso(self, sample_data): + """Test if error is raised when groups are missing for group lasso.""" + y, X, Z, weights_u, _ = sample_data + with pytest.raises(ValueError, match="Vector groups is missing"): + seagull(y=y, X=X, Z=Z, weights_u=weights_u, alpha=0.5) + + def test_parameter_validation(self, sample_data): + """Test validation of numerical parameters.""" + y, _, Z, _, _ = sample_data + + # Test rel_acc validation + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + seagull(y=y, Z=Z, rel_acc=-0.1) + assert any("rel_acc is non-positive" in str(warn.message) for warn in w) + + # Test max_iter validation + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + seagull(y=y, Z=Z, max_iter=0) + assert any("max_iter is non-positive" in str(warn.message) for warn in w) + + # Test gamma_bls validation + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + seagull(y=y, Z=Z, gamma_bls=1.5) + assert any("gamma_bls is out of range" in str(warn.message) for warn in w) + + +class TestSeagullEdgeCases: + """Test seagull edge cases and corner cases.""" + + def test_single_feature(self): + """Test with single feature.""" + y = np.array([1., 2., 3.]) + Z = np.array([[1.], [2.], [3.]]) + result = seagull(y=y, Z=Z) + assert result is not None + assert isinstance(result, dict) + + def test_perfect_fit(self): + """Test with perfectly correlated data.""" + y = np.array([1., 2., 3.]) + Z = np.array([[1., 0.], [2., 0.], [3., 0.]]) + result = seagull(y=y, Z=Z) + assert result is not None + assert isinstance(result, dict) + + def test_constant_feature(self): + """Test with constant feature.""" + y = np.array([1., 2., 3.]) + Z = np.array([[1., 1.], [2., 1.], [3., 1.]]) + result = seagull(y=y, Z=Z) + assert result is not None + assert isinstance(result, dict) diff --git a/test/test_sparse_group_lasso.py b/pygridge/test/test_sparse_group_lasso.py similarity index 100% rename from test/test_sparse_group_lasso.py rename to pygridge/test/test_sparse_group_lasso.py diff --git a/src/covariance_design.py b/src/covariance_design.py deleted file mode 100644 index db2bbbe..0000000 --- a/src/covariance_design.py +++ /dev/null @@ -1,914 +0,0 @@ -"""Create covariance matrix designs for various statistical models.""" - -import numpy as np -from abc import ABC, abstractmethod -from typing import List, Callable, Union - -from groupedfeatures import GroupedFeatures, fill - - -class DiscreteNonParametric: - """Represents a discrete non-parametric spectrum characterized by eigenvalues and their - associated probabilities. - - Parameters - ---------- - eigs : list of float - A list of eigenvalues. - probs : list of float - A list of probabilities corresponding to each eigenvalue. 
- - Attributes - ---------- - eigs : numpy.ndarray - Numpy array of eigenvalues. - probs : numpy.ndarray - Numpy array of probabilities for each eigenvalue. - - Examples - -------- - >>> spectrum = DiscreteNonParametric(eigs=[1.0, 2.0, 3.0], probs=[0.2, 0.3, 0.5]) - """ - - def __init__(self, eigs: List[float], probs: List[float]): - if not isinstance(eigs, list): - raise TypeError( - f"'eigs' must be a list of floats, got {type(eigs).__name__}" - ) - if not all(isinstance(e, (int, float)) for e in eigs): - raise ValueError("All elements in 'eigs' must be integers or floats.") - if not isinstance(probs, list): - raise TypeError( - f"'probs' must be a list of floats, got {type(probs).__name__}" - ) - if not all(isinstance(p, (int, float)) for p in probs): - raise ValueError("All elements in 'probs' must be integers or floats.") - if len(eigs) != len(probs): - raise ValueError("'eigs' and 'probs' must be of the same length.") - if not np.isclose(sum(probs), 1.0): - raise ValueError("The probabilities in 'probs' must sum to 1.") - if any(p < 0 for p in probs): - raise ValueError("Probabilities in 'probs' must be non-negative.") - - self.eigs = np.array(eigs) - self.probs = np.array(probs) - - -class CovarianceDesign(ABC): - r"""Abstract base class defining the interface for covariance matrix designs. - - This class outlines the essential methods that any covariance design must implement: - - `get_Sigma`: Retrieve the covariance matrix. - - `nfeatures`: Obtain the number of features (dimensions) in the covariance matrix. - - `spectrum`: Get the spectral representation of the covariance matrix. - - Methods - ------- - get_Sigma() -> numpy.ndarray - Constructs and returns the covariance matrix :math:`\Sigma`. - - nfeatures() -> int - Retrieves the number of features (dimensions) in the covariance matrix :math:`\Sigma`. - - spectrum() -> DiscreteNonParametric - Computes the spectral decomposition of the covariance matrix :math:`\Sigma`. - """ - - def __init__(self): - pass - - @abstractmethod - def get_Sigma(self) -> np.ndarray: - r"""Constructs and returns the covariance matrix :math:`\Sigma`. - - Returns - ------- - numpy.ndarray - The covariance matrix. - """ - pass - - @abstractmethod - def nfeatures(self) -> int: - r"""Retrieves the number of features (dimensions) in the covariance matrix :math:`\Sigma`. - - Returns - ------- - int - Number of features. - """ - pass - - @abstractmethod - def spectrum(self) -> DiscreteNonParametric: - r"""Computes the spectral decomposition of the covariance matrix :math:`\Sigma`. - - Returns - ------- - DiscreteNonParametric - Spectrum containing eigenvalues and their probabilities. - """ - pass - - -class AR1Design(CovarianceDesign): - r"""Implements an AutoRegressive model of order 1 (AR(1)) for covariance matrix design. - - In an AR(1) model, each element :math:`\Sigma_{i,j}` of the covariance matrix :math:`\Sigma` is defined as: - - .. math:: - \Sigma_{i,j} = \rho^{|i - j|} - - where :math:`\rho` is the correlation coefficient between adjacent features. - - Parameters - ---------- - p : int, optional - Number of features (dimensions). Must be set before generating :math:`\Sigma`. - rho : float, default=0.7 - The AR(1) parameter representing the correlation coefficient. - - Attributes - ---------- - p : int - Number of features. - rho : float - AR(1) correlation coefficient. - - Methods - ------- - get_Sigma() -> numpy.ndarray - Constructs the AR(1) covariance matrix :math:`\Sigma`. - - nfeatures() -> int - Returns the number of features :math:`p`. 
- - spectrum() -> DiscreteNonParametric - Computes the eigenvalues of :math:`\Sigma` and assigns equal probability to each. - """ - - def __init__(self, p: int = None, rho: float = 0.7): - super().__init__() - if p is not None: - if not isinstance(p, int): - raise TypeError(f"'p' must be an integer, got {type(p).__name__}") - if p <= 0: - raise ValueError("'p' must be a positive integer.") - if not isinstance(rho, (int, float)): - raise TypeError(f"'rho' must be a float, got {type(rho).__name__}") - if not (0 <= rho < 1): - raise ValueError("'rho' must be in the interval [0, 1).") - self.p = p - self.rho = rho - - def get_Sigma(self) -> np.ndarray: - if self.p is None: - raise ValueError("Number of features 'p' is not set.") - if self.p <= 0: - raise ValueError("'p' must be a positive integer.") - - rho = self.rho - indices = np.arange(self.p) - - try: - Sigma = rho ** np.abs(np.subtract.outer(indices, indices)) - except Exception as e: - raise RuntimeError(f"Failed to compute covariance matrix: {e}") - - if not np.allclose(Sigma, Sigma.T): - raise ValueError("Covariance matrix :math:`\\Sigma` must be symmetric.") - - return Sigma - - def nfeatures(self) -> int: - if self.p is None: - raise ValueError("Number of features 'p' is not set.") - if not isinstance(self.p, int) or self.p <= 0: - raise ValueError("'p' must be a positive integer.") - return self.p - - def spectrum(self) -> DiscreteNonParametric: - Sigma = self.get_Sigma() - try: - eigs = np.linalg.eigvals(Sigma) - except np.linalg.LinAlgError as e: - raise RuntimeError(f"Eigenvalue computation failed: {e}") - - if eigs.size == 0: - raise ValueError("Covariance matrix :math:`\\Sigma` has no eigenvalues.") - - probs = fill(1.0 / len(eigs), len(eigs)) - if not np.isclose(np.sum(probs), 1.0): - raise ValueError("Probabilities must sum to 1.") - - return DiscreteNonParametric(eigs.tolist(), probs) - - -class DiagonalCovarianceDesign(CovarianceDesign): - r"""Abstract base class for covariance designs that produce diagonal covariance matrices. - - Since diagonal covariance matrices have non-zero entries only on the diagonal, this class - provides a common structure for such designs, managing the number of features. - - Parameters - ---------- - p : int, optional - Number of features (dimensions). Must be set before generating :math:`\Sigma`. - - Attributes - ---------- - p : int - Number of features. - - Methods - ------- - nfeatures() -> int - Returns the number of features :math:`p`. - """ - - def __init__(self, p: int = None): - super().__init__() - if p is not None: - if not isinstance(p, int): - raise TypeError(f"'p' must be an integer, got {type(p).__name__}") - if p <= 0: - raise ValueError("'p' must be a positive integer.") - self.p = p - - def nfeatures(self) -> int: - if self.p is None: - raise ValueError("Number of features 'p' is not set.") - if not isinstance(self.p, int) or self.p <= 0: - raise ValueError("'p' must be a positive integer.") - return self.p - - -class IdentityCovarianceDesign(DiagonalCovarianceDesign): - r"""Constructs an identity covariance matrix :math:`\Sigma`, where all diagonal entries are 1 and - off-diagonal entries are 0. - - The identity matrix represents uncorrelated features with unit variance. - - Parameters - ---------- - p : int, optional - Number of features (dimensions). Must be set before generating :math:`\Sigma`. - - Attributes - ---------- - p : int - Number of features. - - Methods - ------- - get_Sigma() -> numpy.ndarray - Returns the identity matrix of size :math:`p \times p`. 
- - spectrum() -> DiscreteNonParametric - Returns a spectrum where all eigenvalues are 1 with equal probabilities. - """ - - def __init__(self, p: int = None): - super().__init__(p) - - def get_Sigma(self) -> np.ndarray: - if self.p is None: - raise ValueError("Number of features 'p' is not set.") - if self.p <= 0: - raise ValueError("'p' must be a positive integer.") - - try: - Sigma = np.identity(self.p) - except Exception as e: - raise RuntimeError(f"Failed to create identity matrix: {e}") - - return Sigma - - def spectrum(self) -> DiscreteNonParametric: - if self.p is None: - raise ValueError("Number of features 'p' is not set.") - - eigs = [1.0] * self.p - probs = [1.0 / self.p] * self.p - - if not np.isclose(sum(probs), 1.0): - raise ValueError("Probabilities must sum to 1.") - - return DiscreteNonParametric(eigs, probs) - - -class UniformScalingCovarianceDesign(DiagonalCovarianceDesign): - r"""Constructs a diagonal covariance matrix :math:`\Sigma` with uniform scaling on the diagonal. - - Each diagonal entry :math:`\Sigma_{i,i}` is set to a constant scaling factor. - - Parameters - ---------- - scaling : float, default=1.0 - The scaling factor applied uniformly to all diagonal entries. - p : int, optional - Number of features (dimensions). Must be set before generating :math:`\Sigma`. - - Attributes - ---------- - scaling : float - Scaling factor for the diagonal entries. - p : int - Number of features. - - Methods - ------- - get_Sigma() -> numpy.ndarray - Returns a diagonal matrix with each diagonal entry equal to `scaling`. - - spectrum() -> DiscreteNonParametric - Returns a spectrum where all eigenvalues are equal to `scaling` with equal probabilities. - """ - - def __init__(self, scaling: float = 1.0, p: int = None): - super().__init__(p) - if not isinstance(scaling, (int, float)): - raise TypeError(f"'scaling' must be a float, got {type(scaling).__name__}") - if scaling <= 0: - raise ValueError("'scaling' must be a positive number.") - self.scaling = scaling - - def get_Sigma(self) -> np.ndarray: - if self.p is None: - raise ValueError("Number of features 'p' is not set.") - if self.p <= 0: - raise ValueError("'p' must be a positive integer.") - - try: - Sigma = self.scaling * np.identity(self.p) - except Exception as e: - raise RuntimeError(f"Failed to create scaled identity matrix: {e}") - - return Sigma - - def spectrum(self) -> DiscreteNonParametric: - if self.p is None: - raise ValueError("Number of features 'p' is not set.") - - eigs = [self.scaling] * self.p - probs = [1.0 / self.p] * self.p - - if not np.isclose(sum(probs), 1.0): - raise ValueError("Probabilities must sum to 1.") - - return DiscreteNonParametric(eigs, probs) - - -class ExponentialOrderStatsCovarianceDesign(DiagonalCovarianceDesign): - r"""Constructs a diagonal covariance matrix using exponential order statistics for the - eigenvalues. The eigenvalues are generated based on the order statistics of exponential - random variables with a specified rate. - - The i-th eigenvalue is computed as: - - .. math:: - \lambda_i = \frac{1}{\text{rate}} \log\left(\frac{1}{t_i}\right) - - where :math:`t_i` are uniformly spaced points in the interval \left(\frac{1}{2p}, 1 - \frac{1}{2p}\right). - - Parameters - ---------- - p : int, optional - Number of features (dimensions). Must be set before generating :math:`\Sigma`. - rate : float, default=1.0 - Rate parameter for the exponential distribution. - - Attributes - ---------- - rate : float - Rate parameter for generating eigenvalues. - p : int - Number of features. 
- - Methods - ------- - spectrum() -> DiscreteNonParametric - Generates the eigenvalues based on exponential order statistics and assigns equal - probabilities. - - get_Sigma() -> numpy.ndarray - Returns a diagonal matrix with the generated eigenvalues. - """ - - def __init__(self, p: int = None, rate: float = 1.0): - super().__init__(p) - if p is not None: - if not isinstance(p, int): - raise TypeError(f"'p' must be an integer, got {type(p).__name__}") - if p <= 0: - raise ValueError("'p' must be a positive integer.") - if not isinstance(rate, (int, float)): - raise TypeError(f"'rate' must be a float, got {type(rate).__name__}") - if rate <= 0: - raise ValueError("'rate' must be a positive number.") - self.rate = rate - - def spectrum(self) -> DiscreteNonParametric: - if self.p is None: - raise ValueError("Number of features 'p' is not set.") - if self.p <= 0: - raise ValueError("'p' must be a positive integer.") - - p = self.p - rate = self.rate - - try: - tmp = np.linspace(1 / (2 * p), 1 - 1 / (2 * p), p) - eigs = (1 / rate) * np.log(1 / tmp) - except Exception as e: - raise RuntimeError(f"Failed to compute eigenvalues: {e}") - - probs = fill(1.0 / p, p) - - if not np.isclose(np.sum(probs), 1.0): - raise ValueError("Probabilities must sum to 1.") - if len(eigs) != p: - raise ValueError("Number of eigenvalues must match the number of features.") - - return DiscreteNonParametric(eigs.tolist(), probs) - - def get_Sigma(self) -> np.ndarray: - spectrum = self.spectrum() - - try: - Sigma = np.diag(spectrum.eigs) - except Exception as e: - raise RuntimeError( - f"Failed to create diagonal matrix from eigenvalues: {e}" - ) - - if Sigma.shape[0] != Sigma.shape[1]: - raise ValueError("Covariance matrix :math:`\\Sigma` must be square.") - if not np.allclose(Sigma, Sigma.T): - raise ValueError("Covariance matrix :math:`\\Sigma` must be symmetric.") - - return Sigma - - -class BlockDiagonal: - r"""Represents a block diagonal matrix composed of smaller square matrices (blocks). - - The overall covariance matrix :math:`\Sigma` is constructed by placing each block along the - diagonal, with all off-diagonal blocks being zero matrices. - - For example, given blocks :math:`B_1, B_2, \ldots, B_k`, the block diagonal matrix :math:`\Sigma` is: - - .. math:: - \Sigma = \begin{bmatrix} - B_1 & 0 & \cdots & 0 \\ - 0 & B_2 & \cdots & 0 \\ - \vdots & \vdots & \ddots & \vdots \\ - 0 & 0 & \cdots & B_k - \end{bmatrix} - - Parameters - ---------- - blocks : list of numpy.ndarray - A list of square numpy arrays to be placed on the diagonal. - - Attributes - ---------- - blocks : list of numpy.ndarray - List of block matrices. - - Methods - ------- - get_Sigma() -> numpy.ndarray - Constructs and returns the block diagonal matrix :math:`\Sigma`. 
- """ - - def __init__(self, blocks: List[np.ndarray]): - if not isinstance(blocks, list): - raise TypeError( - f"'blocks' must be a list of numpy arrays, got {type(blocks).__name__}" - ) - if not all(isinstance(block, np.ndarray) for block in blocks): - raise TypeError("All elements in 'blocks' must be numpy.ndarray instances.") - if not all(block.ndim == 2 for block in blocks): - raise ValueError("All blocks must be 2-dimensional numpy arrays.") - if not all(block.shape[0] == block.shape[1] for block in blocks): - raise ValueError("All blocks must be square matrices.") - - self.blocks = blocks - - def get_Sigma(self) -> np.ndarray: - try: - Sigma = block_diag(*self.blocks) - except Exception as e: - raise RuntimeError(f"Failed to construct block diagonal matrix: {e}") - - if Sigma.size == 0: - raise ValueError("Resulting covariance matrix :math:`\\Sigma` is empty.") - if not np.allclose(Sigma, Sigma.T): - raise ValueError("Covariance matrix :math:`\\Sigma` must be symmetric.") - - return Sigma - - -def block_diag(*arrs): - """Constructs a block diagonal matrix from the provided input arrays. - - If no arrays are provided, returns an empty 2D array. - - Parameters - ---------- - *arrs : numpy.ndarray - Variable number of square 2D numpy arrays to be placed on the diagonal. - - Returns - ------- - numpy.ndarray - The resulting block diagonal matrix. - - Examples - -------- - >>> A = np.array([[1, 0], [0, 1]]) - >>> B = np.array([[2, 0], [0, 2]]) - >>> block_diag(A, B) - array([[1, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 2, 0], - [0, 0, 0, 2]]) - """ - if not all(isinstance(a, np.ndarray) for a in arrs): - raise TypeError("All arguments must be numpy.ndarray instances.") - if not all(a.ndim == 2 for a in arrs): - raise ValueError("All input arrays must be 2-dimensional.") - if not all(a.shape[0] == a.shape[1] for a in arrs): - raise ValueError("All input arrays must be square matrices.") - - if len(arrs) == 0: - return np.array([[]]) - - try: - shapes = np.array([a.shape for a in arrs]) - out_shape = np.sum(shapes, axis=0) - out = np.zeros(out_shape, dtype=arrs[0].dtype) - r, c = 0, 0 - for a in arrs: - out[r : r + a.shape[0], c : c + a.shape[1]] = a - r += a.shape[0] - c += a.shape[1] - except Exception as e: - raise RuntimeError(f"Failed to construct block diagonal matrix: {e}") - - return out - - -class MixtureModel: - r"""Represents a mixture model consisting of multiple spectra, each weighted by a - mixing proportion. - - The mixture model :math:`\Sigma` is defined as: - - .. math:: - \Sigma = \sum_{i=1}^{k} \pi_i \Sigma_i - - where :math:`\Sigma_i` are individual covariance matrices (spectra) and :math:`\pi_i` are their - corresponding mixing proportions. - - Parameters - ---------- - spectra : list of DiscreteNonParametric - A list of spectra that comprise the mixture. - mixing_prop : list of float - A list of mixing proportions corresponding to each spectrum. - - Attributes - ---------- - spectra : list of DiscreteNonParametric - List of component spectra. - mixing_prop : list of float - Mixing proportions for each spectrum. 
- - Examples - -------- - >>> spectrum1 = DiscreteNonParametric(eigs=[1, 2], probs=[0.5, 0.5]) - >>> spectrum2 = DiscreteNonParametric(eigs=[3, 4], probs=[0.3, 0.7]) - >>> mixture = MixtureModel(spectra=[spectrum1, spectrum2], mixing_prop=[0.6, 0.4]) - """ - - def __init__(self, spectra: List[DiscreteNonParametric], mixing_prop: List[float]): - if not isinstance(spectra, list): - raise TypeError( - "'spectra' must be a list of DiscreteNonParametric instances, got" - f" {type(spectra).__name__}" - ) - if not all(isinstance(s, DiscreteNonParametric) for s in spectra): - raise TypeError( - "All elements in 'spectra' must be instances of DiscreteNonParametric." - ) - if not isinstance(mixing_prop, list): - raise TypeError( - "'mixing_prop' must be a list of floats, got" - f" {type(mixing_prop).__name__}" - ) - if not all(isinstance(p, (int, float)) for p in mixing_prop): - raise ValueError( - "All elements in 'mixing_prop' must be integers or floats." - ) - if len(spectra) != len(mixing_prop): - raise ValueError("'spectra' and 'mixing_prop' must be of the same length.") - if not np.isclose(sum(mixing_prop), 1.0): - raise ValueError("The mixing proportions in 'mixing_prop' must sum to 1.") - if any(p < 0 for p in mixing_prop): - raise ValueError( - "Mixing proportions in 'mixing_prop' must be non-negative." - ) - - self.spectra = spectra - self.mixing_prop = mixing_prop - - -class BlockCovarianceDesign(CovarianceDesign): - r"""Constructs a block covariance matrix by composing multiple covariance designs. - - Each block within the block diagonal structure can have its own covariance characteristics, - allowing for complex covariance structures composed of simpler sub-components. - - Parameters - ---------- - blocks : list of CovarianceDesign - A list of covariance design instances, each representing a block. - groups : GroupedFeatures, optional - An instance specifying feature groupings. If provided, - it adjusts the mixing proportions based on group sizes. - - Attributes - ---------- - blocks : list of CovarianceDesign - List of covariance designs for each block. - groups : GroupedFeatures, optional - Feature groupings affecting mixing proportions. - - Methods - ------- - get_Sigma() -> numpy.ndarray - Constructs and returns the block diagonal covariance matrix :math:`\Sigma`. - - nfeatures() -> int - Returns the total number of features across all blocks. - - spectrum() -> MixtureModel - Combines the spectra of all blocks into a single mixture model, adjusting mixing - proportions based on group sizes if `groups` is provided. - """ - - def __init__(self, blocks: List[CovarianceDesign], groups: GroupedFeatures = None): - super().__init__() - if not isinstance(blocks, list): - raise TypeError( - "'blocks' must be a list of CovarianceDesign instances, got" - f" {type(blocks).__name__}" - ) - if not all(isinstance(block, CovarianceDesign) for block in blocks): - raise TypeError( - "All elements in 'blocks' must be instances of CovarianceDesign." - ) - if groups is not None and not isinstance(groups, GroupedFeatures): - raise TypeError( - "'groups' must be an instance of GroupedFeatures or None, got" - f" {type(groups).__name__}" - ) - - self.blocks = blocks - self.groups = groups - - if self.groups is not None: - if len(self.groups.ps) != len(self.blocks): - raise ValueError( - "Number of groups must match number of blocks in" - " BlockCovarianceDesign." 
- ) - - def get_Sigma(self) -> np.ndarray: - try: - block_matrices = [block.get_Sigma() for block in self.blocks] - except Exception as e: - raise RuntimeError( - f"Failed to retrieve covariance matrices from blocks: {e}" - ) - - try: - Sigma = block_diag(*block_matrices) - except Exception as e: - raise RuntimeError( - f"Failed to construct block diagonal covariance matrix: {e}" - ) - - if Sigma.size == 0: - raise ValueError("Resulting covariance matrix :math:`\\Sigma` is empty.") - if not np.allclose(Sigma, Sigma.T): - raise ValueError("Covariance matrix :math:`\\Sigma` must be symmetric.") - - return Sigma - - def nfeatures(self) -> int: - try: - total_features = sum(block.nfeatures() for block in self.blocks) - except Exception as e: - raise RuntimeError(f"Failed to compute total number of features: {e}") - - if total_features <= 0: - raise ValueError("Total number of features must be positive.") - - return total_features - - def spectrum(self) -> MixtureModel: - try: - spectra = [block.spectrum() for block in self.blocks] - except Exception as e: - raise RuntimeError(f"Failed to retrieve spectra from blocks: {e}") - - if self.groups is not None: - if len(self.groups.ps) != len(spectra): - raise ValueError("Number of groups must match number of blocks.") - - mixing_prop = [] - total_p = self.groups.p - - if total_p == 0: - raise ValueError( - "Total number of features across all groups must be positive." - ) - - for ps in self.groups.ps: - if not isinstance(ps, int): - raise TypeError("Group sizes in 'groups.ps' must be integers.") - if ps < 0: - raise ValueError("Group sizes in 'groups.ps' must be non-negative.") - mixing_prop.append(ps / total_p) - else: - num_spectra = len(spectra) - if num_spectra == 0: - raise ValueError( - "There must be at least one spectrum to form a mixture model." - ) - mixing_prop = [1.0 / num_spectra] * num_spectra - - if not np.isclose(sum(mixing_prop), 1.0): - raise ValueError("Mixing proportions must sum to 1.") - - return MixtureModel(spectra, mixing_prop) - - -def simulate_rotated_design( - cov: CovarianceDesign, n: int, rotated_measure: Callable = None -) -> np.ndarray: - r"""Simulates a rotated design matrix based on the provided covariance design. - - The simulation generates :math:`n` samples from a multivariate distribution with covariance - matrix :math:`\Sigma` specified by the `CovarianceDesign` instance. The rotation is achieved - using the Cholesky decomposition of :math:`\Sigma`, i.e., :math:`\Sigma = LL^T`, where :math:`L` is a lower - triangular matrix. - - The simulated design matrix :math:`X` is computed as: - - .. math:: - X = ZL - - where :math:`Z` is an :math:`n \times p` matrix with independent entries generated by `rotated_measure`. - - Parameters - ---------- - cov : CovarianceDesign - An instance of CovarianceDesign defining :math:`\Sigma`. - n : int - Number of samples to generate. - rotated_measure : Callable, optional - A callable that generates random samples. Defaults to `numpy.random.normal`. - - Returns - ------- - numpy.ndarray - An :math:`n \times p` simulated design matrix adhering to the covariance structure :math:`\Sigma`. - - Raises - ------ - ValueError - If :math:`\Sigma` is not positive definite, making the Cholesky decomposition impossible. - TypeError - If input types are incorrect. 
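The recipe in the docstring can be sketched directly in NumPy; the 2x2 covariance below is an illustrative stand-in, not part of the patch:

```python
import numpy as np

# Minimal sketch of X = Z @ L with Sigma = L @ L.T (illustrative values).
rng = np.random.default_rng(0)
Sigma = np.array([[1.0, 0.7], [0.7, 1.0]])
L = np.linalg.cholesky(Sigma)         # lower triangular, Sigma = L @ L.T
Z = rng.standard_normal((100, 2))     # independent standard-normal entries
X = Z @ L                             # rows have covariance L.T @ L, which has
                                      # the same spectrum as Sigma (a rotation)
```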
- - Examples - -------- - >>> ar1 = AR1Design(p=5, rho=0.7) - >>> X = simulate_rotated_design(ar1, n=100) - """ - if not isinstance(cov, CovarianceDesign): - raise TypeError( - f"'cov' must be an instance of CovarianceDesign, got {type(cov).__name__}" - ) - if not isinstance(n, int): - raise TypeError(f"'n' must be an integer, got {type(n).__name__}") - if n <= 0: - raise ValueError("'n' must be a positive integer.") - if rotated_measure is not None and not callable(rotated_measure): - raise TypeError("'rotated_measure' must be a callable or None.") - - if rotated_measure is None: - rotated_measure = np.random.normal - - Sigma = cov.get_Sigma() - if not isinstance(Sigma, np.ndarray): - raise TypeError("Covariance matrix :math:`\\Sigma` must be a numpy.ndarray.") - if Sigma.ndim != 2: - raise ValueError("Covariance matrix :math:`\\Sigma` must be 2-dimensional.") - if Sigma.shape[0] != Sigma.shape[1]: - raise ValueError("Covariance matrix :math:`\\Sigma` must be square.") - if np.any(np.isnan(Sigma)) or np.any(np.isinf(Sigma)): - raise ValueError( - "Covariance matrix :math:`\\Sigma` contains NaN or infinite values." - ) - - try: - Sigma_chol = np.linalg.cholesky(Sigma) - except np.linalg.LinAlgError: - raise ValueError( - "Sigma is not positive definite; Cholesky decomposition failed." - ) - - p = cov.nfeatures() - if not isinstance(p, int): - raise TypeError("'nfeatures()' must return an integer.") - if p <= 0: - raise ValueError("Number of features 'p' must be positive.") - - try: - Z = rotated_measure(size=(n, p)) - except Exception as e: - raise RuntimeError( - f"Failed to generate random samples using 'rotated_measure': {e}" - ) - - if not isinstance(Z, np.ndarray): - raise TypeError("Output from 'rotated_measure' must be a numpy.ndarray.") - if Z.shape != (n, p): - raise ValueError(f"Shape of Z must be ({n}, {p}), got {Z.shape}.") - - try: - X = Z @ Sigma_chol - except Exception as e: - raise RuntimeError(f"Failed to compute the simulated design matrix X: {e}") - - if not isinstance(X, np.ndarray): - raise TypeError("Simulated design matrix X must be a numpy.ndarray.") - if X.shape != (n, p): - raise ValueError( - f"Shape of simulated design matrix X must be ({n}, {p}), got {X.shape}." - ) - - return X - - -def set_groups( - design: CovarianceDesign, groups_or_p: Union[GroupedFeatures, int] -) -> None: - """Set the number of features for a covariance design.""" - if not isinstance(design, CovarianceDesign): - raise TypeError( - f"'design' must be an instance of CovarianceDesign, got {type(design).__name__}" - ) - - # Check if groups_or_p is an integer - if isinstance(groups_or_p, int): - if groups_or_p <= 0: - raise ValueError("'groups_or_p' as an integer must be a positive value.") - - if isinstance(design, BlockCovarianceDesign): - n_blocks = len(design.blocks) - size_per_block = groups_or_p // n_blocks - remainder = groups_or_p % n_blocks - sizes = [ - size_per_block + (1 if i < remainder else 0) for i in range(n_blocks) - ] - groups = GroupedFeatures(ps=sizes) - - for block, size in zip(design.blocks, sizes): - block.p = size - design.groups = groups - else: - design.p = groups_or_p - return - - # Check if groups_or_p is a GroupedFeatures instance using __class__ comparison - if groups_or_p.__class__.__name__ == "GroupedFeatures": - if isinstance(design, BlockCovarianceDesign): - if len(groups_or_p.ps) != len(design.blocks): - raise ValueError( - "Number of groups must match number of blocks in BlockCovarianceDesign." 
- ) - for block, group_size in zip(design.blocks, groups_or_p.ps): - block.p = group_size - design.groups = groups_or_p - else: - design.p = groups_or_p.nfeatures() - return - - # If we get here, the input type was invalid - raise TypeError("groups_or_p must be an instance of GroupedFeatures or int.") diff --git a/src/groupedfeatures.py b/src/groupedfeatures.py deleted file mode 100644 index 23c657e..0000000 --- a/src/groupedfeatures.py +++ /dev/null @@ -1,297 +0,0 @@ -"""Create and manage grouped feature structures for statistical modeling.""" - -from typing import Callable, Union, TypeVar -import numpy as np - -T = TypeVar("T") - - -class GroupedFeatures: - r"""A class representing groups of features. - - The first :math:`ps[0]` features are one group, the next :math:`ps[1]` features are the - second group, and so forth. - - Parameters - ---------- - ps : list of int - List of positive integers representing the size of each group. - - Attributes - ---------- - ps : list of int - List of group sizes. - p : int - Total number of features, calculated as :math:`p = \sum ps`. - num_groups : int - Number of groups, denoted as :math:`G`. - - Raises - ------ - TypeError - If `ps` is not a list or contains non-integer elements. - ValueError - If any group size is not positive. - """ - - def __init__(self, ps: list[int]): - if not isinstance(ps, list): - raise TypeError( - f"ps must be a list of positive integers, got {type(ps).__name__}" - ) - if not all(isinstance(p, int) for p in ps): - raise TypeError("All group sizes in ps must be integers") - if not all(p > 0 for p in ps): - raise ValueError("All group sizes in ps must be positive integers") - - self.ps = ps - self.p = sum(ps) - self.num_groups = len(ps) - - @classmethod - def from_group_size(cls, group_size: int, num_groups: int): - r"""Create a `GroupedFeatures` instance with uniform group sizes. - - The group sizes are all set to :math:`group\_size`. - - Parameters - ---------- - group_size : int - Size of each group. - num_groups : int - Number of groups. - - Returns - ------- - GroupedFeatures - Instance with uniform group sizes. - - Raises - ------ - TypeError - If `group_size` or `num_groups` is not an integer. - ValueError - If `group_size` or `num_groups` is not positive. - """ - if not isinstance(group_size, int): - raise TypeError( - f"group_size must be an integer, got {type(group_size).__name__}" - ) - if not isinstance(num_groups, int): - raise TypeError( - f"num_groups must be an integer, got {type(num_groups).__name__}" - ) - if group_size <= 0: - raise ValueError("group_size must be a positive integer") - if num_groups <= 0: - raise ValueError("num_groups must be a positive integer") - - return cls([group_size] * num_groups) - - def ngroups(self) -> int: - r"""Get the number of groups. - - Returns - ------- - int - Number of groups, :math:`G`. - """ - return self.num_groups - - def nfeatures(self) -> int: - r""" - Get the total number of features. - - Returns - ------- - int - Total number of features, :math:`p`. - """ - return self.p - - def group_idx(self, i: int) -> range: - r"""Get the range of feature indices for the :math:`i`-th group. - - Parameters - ---------- - i : int - Index of the group (0-based). - - Returns - ------- - range - Range object representing the indices of the group. - - Raises - ------ - TypeError - If `i` is not an integer. - IndexError - If `i` is out of range [0, :math:`num\_groups - 1`]. 
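The start/end arithmetic used by the `group_idx` implementation below can be previewed standalone (group sizes are illustrative):

```python
import numpy as np

# Group sizes [2, 3, 5] give feature index ranges [0:2], [2:5], [5:10].
ps = [2, 3, 5]
starts = np.cumsum([0] + ps[:-1])  # array([0, 2, 5])
ends = np.cumsum(ps)               # array([2, 5, 10])
print([range(int(s), int(e)) for s, e in zip(starts, ends)])
```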
- """ - if not isinstance(i, int): - raise TypeError(f"Group index i must be an integer, got {type(i).__name__}") - if not (0 <= i < self.num_groups): - raise IndexError( - f"Group index i={i} is out of range [0, {self.num_groups -1}]" - ) - - starts = np.cumsum([0] + self.ps[:-1]).astype(int) - ends = np.cumsum(self.ps).astype(int) - return range(starts[i], ends[i]) - - def group_summary( - self, - vec: Union[list[T], np.ndarray], - f: Callable[[Union[list[T], np.ndarray]], T], - ) -> list[T]: - r"""Apply a summary function to each group of features. - - For each group :math:`g`, the function :math:`f` is applied to the subset of - features in that group. - - Parameters - ---------- - vec : array-like of shape (n_features,) or (n_samples, n_features) - List or ndarray of features. - f : callable - Function that takes a list or ndarray of features and returns a - summary value. - - Returns - ------- - list - List of summary values, one per group. - - Raises - ------ - TypeError - If `vec` is neither a list nor an ndarray, or if `f` is not callable. - ValueError - If length of `vec` does not match total number of features. - """ - if not callable(f): - raise TypeError("f must be a callable function") - - if isinstance(vec, np.ndarray): - if vec.ndim == 2: - if vec.shape[1] != self.p: - raise ValueError( - f"Length of vec ({vec.shape[1]}) does not match number of" - f" features ({self.p})" - ) - elif vec.ndim == 1: - if vec.shape[0] != self.p: - raise ValueError( - f"Length of vec ({vec.shape[0]}) does not match number of" - f" features ({self.p})" - ) - vec = vec.reshape(1, -1) - else: - raise ValueError(f"vec must be 1D or 2D, got {vec.ndim}D") - elif not isinstance(vec, list): - raise TypeError( - f"vec must be either a list or ndarray, got {type(vec).__name__}" - ) - else: - if len(vec) != self.p: - raise ValueError( - f"Length of vec ({len(vec)}) does not match number of features" - f" ({self.p})" - ) - - summaries = [] - for i in range(self.num_groups): - idx = self.group_idx(i) - try: - group_features = ( - [vec[j] for j in idx] - if isinstance(vec, list) - else vec[:, idx.start : idx.stop] - ) - summaries.append(f(group_features)) - except Exception as e: - raise RuntimeError( - f"Error applying function to group {idx.start}-{idx.stop}: {e}" - ) - - return summaries - - def group_expand(self, vec_or_num: Union[list[T], T, np.ndarray]) -> list[T]: - r""" - Expand a vector or number to a list of features. - - If `vec_or_num` is a number, replicate it across all features. - If it is a list or ndarray, each element corresponds to a group and is - replicated within the group. - - Parameters - ---------- - vec_or_num : int, float, list or ndarray - Either a single number or a list/ndarray with length equal to number - of groups. - - Returns - ------- - list - Expanded list of features. - - Raises - ------ - TypeError - If `vec_or_num` is neither a number nor a list/ndarray. - ValueError - If `vec_or_num` is a list/ndarray but its length does not match number - of groups. 
- """ - if isinstance(vec_or_num, (int, float)): - return [vec_or_num] * self.p - - if isinstance(vec_or_num, (list, np.ndarray)): - if len(vec_or_num) != self.num_groups: - raise ValueError( - f"Length of vec_or_num ({len(vec_or_num)}) does not match number of" - f" groups ({self.num_groups})" - ) - # If it's an ndarray, convert to list - if isinstance(vec_or_num, np.ndarray): - vec_or_num = vec_or_num.tolist() - arr = [0.0] * self.p - for i in range(self.num_groups): - idx = self.group_idx(i) - arr[idx.start : idx.stop] = [vec_or_num[i]] * len(idx) - return arr - - raise TypeError( - "vec_or_num must be either a number (int or float) or a list/ndarray, got" - f" {type(vec_or_num).__name__}" - ) - - -def fill(value: T, length: int) -> list[T]: - """Fill a list with a given value repeated :math:`length` times. - - Parameters - ---------- - value : T - Value to fill the list with. - length : int - Number of times to repeat the value. - - Returns - ------- - list - List containing the value repeated :math:`length` times. - - Raises - ------ - TypeError - If `length` is not an integer. - ValueError - If `length` is negative. - """ - if not isinstance(length, int): - raise TypeError(f"length must be an integer, got {type(length).__name__}") - if length < 0: - raise ValueError("length must be a non-negative integer") - return [value] * length diff --git a/test/test_blockridge.py b/test/test_blockridge.py deleted file mode 100644 index 5ce2687..0000000 --- a/test/test_blockridge.py +++ /dev/null @@ -1,645 +0,0 @@ -# TODO: Remove this file soon - -import pytest -import numpy as np -import time -from scipy.linalg import cho_solve - -from ..src.blockridge import ( - CholeskyRidgePredictor, - WoodburyRidgePredictor, - ShermanMorrisonRidgePredictor, - GroupRidgeRegressor, - lambda_lolas_rule, - MomentTunerSetup, - sigma_squared_path, - get_lambdas, - get_alpha_s_squared, -) - -from ..src.groupedfeatures import GroupedFeatures -from ..src.nnls import NNLSError - - -@pytest.fixture -def X(): - """Generate a random matrix X for testing.""" - np.random.seed(0) - return np.random.randn(100, 10) - - -@pytest.fixture -def Y(X): - """Generate a response vector Y based on X for testing.""" - np.random.seed(1) - beta_true = np.random.randn(X.shape[1]) - noise = np.random.randn(X.shape[0]) - return X @ beta_true + noise - - -# Group configurations -@pytest.fixture( - params=[ - [10], # Single group - [5, 5], # Two groups - [2, 3, 5], # Three groups - [1] * 10, # Ten groups - [3, 2, 2, 3], # Four groups with varying sizes - [4, 6], # Two groups with different sizes - [2, 2, 2, 2, 2], # Five groups - ] -) -def groups(request): - """Generate different group configurations for testing.""" - return GroupedFeatures(request.param) - - -def test_cholesky_ridge_predictor_initialization(X): - """Test the initialization of CholeskyRidgePredictor.""" - predictor = CholeskyRidgePredictor(X) - expected_XtX = np.dot(X.T, X) / X.shape[0] - np.testing.assert_allclose(predictor.XtX, expected_XtX, atol=1e-6) - assert ( - predictor.XtXp_lambda_chol is not None - ) # Cholesky factor should be initialized - expected_XtXp_lambda = expected_XtX + np.eye(X.shape[1]) - np.testing.assert_allclose(predictor.XtXp_lambda, expected_XtXp_lambda, atol=1e-6) - - -def test_cholesky_ridge_predictor_set_params(X, groups): - """Test updating lambda values in CholeskyRidgePredictor.""" - predictor = CholeskyRidgePredictor(X) - # Generate lambdas based on the number of groups - lambdas = np.linspace(0.5, 1.5, groups.num_groups) - 
predictor.set_params(groups, lambdas) - expected_diag = groups.group_expand(lambdas) - expected_XtXp_lambda = predictor.XtX + np.diag(expected_diag) - np.testing.assert_allclose(predictor.XtXp_lambda, expected_XtXp_lambda, atol=1e-6) - # Verify Cholesky factor by reconstructing - reconstructed = predictor.XtXp_lambda_chol @ predictor.XtXp_lambda_chol.T - np.testing.assert_allclose(reconstructed, expected_XtXp_lambda, atol=1e-6) - - -def test_cholesky_ridge_predictor_solve_system(X): - """Test the _solve_system operation of CholeskyRidgePredictor.""" - predictor = CholeskyRidgePredictor(X) - B = np.random.randn(X.shape[1], 3) - result = predictor._solve_system(B) - expected = np.linalg.solve(predictor.XtXp_lambda, B) - np.testing.assert_allclose(result, expected, atol=1e-6) - - -def test_woodbury_ridge_predictor_initialization(X): - """Test the initialization of WoodburyRidgePredictor.""" - predictor = WoodburyRidgePredictor(X) - p = X.shape[1] - assert predictor.A_inv.shape == (p, p) - assert predictor.U.shape == X.T.shape - assert predictor.V.shape == X.shape - - -def test_woodbury_ridge_predictor_set_params(X, groups): - """Test updating lambda values in WoodburyRidgePredictor.""" - predictor = WoodburyRidgePredictor(X) - # Generate lambdas based on the number of groups - lambdas = np.linspace(0.5, 1.5, groups.num_groups) - predictor.set_params(groups, lambdas) - - # Create the expected matrix - expected_A = np.diag(groups.group_expand(lambdas)) + X.T @ X - expected_A_inv = np.linalg.inv(expected_A) - - np.testing.assert_allclose(predictor.A_inv, expected_A_inv, atol=1e-6) - - # Test _solve_system operation - B = np.random.randn(X.shape[1], 3) - result = predictor._solve_system(B) - expected = np.linalg.solve(expected_A, B) - np.testing.assert_allclose(result, expected, atol=1e-6) - - -def test_group_ridge_regressor_initialization(X, Y, groups): - """Test the initialization of GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - assert regressor.n == X.shape[0] - assert regressor.p == X.shape[1] - assert regressor.groups == groups - assert regressor.XtY.shape == (X.shape[1],) - assert regressor.lambdas.shape == (groups.num_groups,) - assert regressor.beta_current.shape == (X.shape[1],) - assert regressor.Y_hat.shape == (X.shape[0],) - assert regressor.XtXp_lambda_div_Xt.shape == (X.shape[1], X.shape[0]) - - -def test_group_ridge_regressor_fit(X, Y, groups): - """Test the fit method of GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - # Generate lambdas based on the number of groups - lambdas = np.linspace(1.0, 1.0, groups.num_groups) - regressor.fit(lambdas) - assert regressor.beta_current.shape == (groups.p,) - expected_Y_hat = X @ regressor.beta_current - np.testing.assert_allclose(regressor.Y_hat, expected_Y_hat, atol=1e-6) - - -def test_group_ridge_regressor_set_params(X, Y, groups): - """Test updating lambda values in GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - new_lambdas = np.random.rand(groups.num_groups) - regressor.set_params(new_lambdas) - np.testing.assert_allclose(regressor.lambdas, new_lambdas) - - -def test_group_ridge_regressor_get_n_groups(X, Y, groups): - """Test the get_n_groups method of GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - assert regressor.get_n_groups() == groups.num_groups - - -def test_group_ridge_regressor_get_coef(X, Y, groups): - """Test the get_coef method of GroupRidgeRegressor.""" - regressor = 
GroupRidgeRegressor(X=X, Y=Y, groups=groups) - assert np.allclose(regressor.get_coef(), regressor.beta_current) - - -def test_group_ridge_regressor_is_linear(X, Y, groups): - """Test the is_linear method of GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - assert regressor.is_linear() == True - - -def test_group_ridge_regressor_get_leverage(X, Y, groups): - """Test the get_leverage method of GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - regressor.fit(np.ones(groups.num_groups)) - leverage = regressor.get_leverage() - assert leverage.shape == (X.shape[0],) - assert np.all(leverage >= 0) and np.all(leverage <= 1) - - -def test_group_ridge_regressor_get_model_matrix(X, Y, groups): - """Test the get_model_matrix method of GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - np.testing.assert_allclose(regressor.get_model_matrix(), X) - - -def test_group_ridge_regressor_get_response(X, Y, groups): - """Test the get_response method of GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - np.testing.assert_allclose(regressor.get_response(), Y) - - -def test_group_ridge_regressor_fit_with_dict(X, Y, groups): - """Test fitting GroupRidgeRegressor with a dictionary of lambda values.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - lambda_dict = { - f"group_{i}": val for i, val in enumerate(np.random.rand(groups.num_groups)) - } - loo_error = regressor.fit(lambda_dict) - assert isinstance(loo_error, float) - assert loo_error >= 0 - - -def test_group_ridge_regressor_predict_new_data(X, Y, groups): - """Test predicting with new data using GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - regressor.fit(np.ones(groups.num_groups)) - X_new = np.random.randn(10, X.shape[1]) - predictions = regressor.predict(X_new) - assert predictions.shape == (10,) - - -def test_lambda_lolas_rule(X, Y, groups): - """Test the lambda LOLAS rule calculation.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - regressor.fit(np.ones(groups.num_groups)) - mom = MomentTunerSetup(regressor) - rule_lambda = lambda_lolas_rule(regressor, multiplier=0.1) - expected_lambda = ( - 0.1 * regressor.p**2 / regressor.n / regressor.predictor._trace_gram_matrix() - ) - assert np.isclose(rule_lambda, expected_lambda, atol=1e-6) - - -def test_moment_tuner_setup(X, Y, groups): - """Test the initialization of MomentTunerSetup.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - mom = MomentTunerSetup(regressor) - assert mom.ps.tolist() == groups.ps - assert mom.n == X.shape[0] - assert len(mom.beta_norms_squared) == groups.num_groups - assert len(mom.N_norms_squared) == groups.num_groups - assert mom.M_squared.shape == (groups.num_groups, groups.num_groups) - - -def test_sigma_squared_path(X, Y, groups): - """Test the sigma squared path calculation.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - lambdas = np.ones(groups.num_groups) - regressor.fit(lambdas) - mom = MomentTunerSetup(regressor) - sigma_s_squared = np.linspace(0.1, 2.0, 10) - path = sigma_squared_path(regressor, mom, sigma_s_squared) - assert "lambdas" in path - assert "loos" in path - assert "betas" in path - assert path["lambdas"].shape == (10, groups.num_groups) - assert path["loos"].shape == (10,) - assert path["betas"].shape == (10, groups.p) - - -def test_get_lambdas(X, Y, groups): - """Test the get_lambdas function.""" - regressor = 
GroupRidgeRegressor(X=X, Y=Y, groups=groups) - lambdas = np.linspace(1.0, 1.0, groups.num_groups) - regressor.fit(lambdas) - mom = MomentTunerSetup(regressor) - sigma_sq = 1.0 - lambdas_out = get_lambdas(mom, sigma_sq) - - # Assert that lambdas_out does not contain LARGE_VALUE unless expected - LARGE_VALUE = 1e12 - assert not np.any( - lambdas_out > LARGE_VALUE - ), "Lambda contains values exceeding LARGE_VALUE." - - -def test_get_alpha_s_squared(X, Y, groups): - """Test the get_alpha_s_squared function.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - lambdas = np.linspace(1.0, 1.0, groups.num_groups) - regressor.fit(lambdas) - mom = MomentTunerSetup(regressor) - sigma_sq = 1.0 - try: - alpha_sq = get_alpha_s_squared(mom, sigma_sq) - except NNLSError as e: - pytest.fail(f"NNLS failed with error: {e}") - - assert alpha_sq.shape == (groups.num_groups,) - # Check that alpha_sq are non-negative - assert np.all(alpha_sq >= 0), "alpha_sq contains negative values." - - -def test_predict(X, Y, groups): - """Test the predict method of GroupRidgeRegressor.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - lambdas = np.linspace(1.0, 1.0, groups.num_groups) - regressor.fit(lambdas) - predictions = regressor.predict(X) - np.testing.assert_allclose(predictions, regressor.Y_hat, atol=1e-6) - - -def test_get_loo_error(X, Y, groups): - """Test the leave-one-out error calculation.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - lambdas = np.linspace(1.0, 1.0, groups.num_groups) - loo_error = regressor.fit(lambdas) - assert isinstance(loo_error, float) - # Since it's a mean of squared errors, it should be non-negative - assert loo_error >= 0 - - -def test_get_mse(X, Y, groups): - """Test the mean squared error calculation for ridge regression.""" - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - lambdas = np.linspace(1.0, 1.0, groups.num_groups) - regressor.fit(lambdas) - X_test = np.random.randn(50, X.shape[1]) - Y_test = X_test @ regressor.beta_current + np.random.randn(50) - mse = regressor.get_mse(X_test, Y_test) - assert isinstance(mse, float) - assert mse >= 0 - - -@pytest.fixture -def high_dim_data(): - """Generate high-dimensional data for testing.""" - np.random.seed(42) - n = 100 # Number of observations - p = 1000 # Number of features (p >> n) - X = np.random.randn(n, p) - beta_true = np.random.randn(p) - Y = X @ beta_true + np.random.randn(n) * 0.1 - groups = GroupedFeatures([50] * 20) # 20 groups of 50 features each - return X, Y, groups - - -def test_high_dimensional_case(high_dim_data): - """Test various components of the ridge regression in high-dimensional settings.""" - X, Y, groups = high_dim_data - regressor = GroupRidgeRegressor(X=X, Y=Y, groups=groups) - - # Test initialization - assert regressor.n == X.shape[0] - assert regressor.p == X.shape[1] - assert regressor.groups.num_groups == 20 - - # Test fitting - lambdas = np.ones(groups.num_groups) - loo_error = regressor.fit(lambdas) - assert isinstance(loo_error, float) - assert loo_error >= 0 - - # Test prediction - predictions = regressor.predict(X) - assert predictions.shape == (X.shape[0],) - - # Test MSE - mse = regressor.get_mse(X, Y) - assert isinstance(mse, float) - assert mse >= 0 - - # Test Cholesky predictor - chol_predictor = CholeskyRidgePredictor(X) - chol_predictor.set_params(groups, lambdas) - assert chol_predictor.XtXp_lambda_chol.shape == (X.shape[1], X.shape[1]) - - # Test Woodbury predictor - wood_predictor = WoodburyRidgePredictor(X) - 
wood_predictor.set_params(groups, lambdas) - assert wood_predictor.A_inv.shape == (X.shape[1], X.shape[1]) - - # Test MomentTunerSetup - mom = MomentTunerSetup(regressor) - assert mom.ps.shape == (groups.num_groups,) - assert mom.M_squared.shape == (groups.num_groups, groups.num_groups) - - # Test sigma_squared_path - sigma_s_squared = np.linspace(0.1, 2.0, 5) - path = sigma_squared_path(regressor, mom, sigma_s_squared) - assert path["lambdas"].shape == (5, groups.num_groups) - assert path["loos"].shape == (5,) - assert path["betas"].shape == (5, X.shape[1]) - - -def test_high_dimensional_predictor_speed(high_dim_data): - """Compare the speed of Cholesky and Woodbury predictors in high-dimensional settings.""" - X, Y, groups = high_dim_data - lambdas = np.ones(groups.num_groups) - n, p = X.shape - - print(f"Data dimensions: n={n}, p={p}") - - # Test Cholesky predictor speed - chol_start = time.time() - chol_predictor = CholeskyRidgePredictor(X) - chol_predictor.set_params(groups, lambdas) - chol_end = time.time() - chol_time = chol_end - chol_start - - # Test Woodbury predictor speed - wood_start = time.time() - wood_predictor = WoodburyRidgePredictor(X) - wood_predictor.set_params(groups, lambdas) - wood_end = time.time() - wood_time = wood_end - wood_start - - print(f"Cholesky predictor time: {chol_time:.4f} seconds") - print(f"Woodbury predictor time: {wood_time:.4f} seconds") - - # Test matrix operations - B = np.random.randn(p, 5) - - chol_op_start = time.time() - chol_result = chol_predictor._solve_system(B) - chol_op_end = time.time() - chol_op_time = chol_op_end - chol_op_start - - wood_op_start = time.time() - wood_result = wood_predictor._solve_system(B) - wood_op_end = time.time() - wood_op_time = wood_op_end - wood_op_start - - print(f"Cholesky operation time: {chol_op_time:.4f} seconds") - print(f"Woodbury operation time: {wood_op_time:.4f} seconds") - - np.testing.assert_allclose(chol_result, wood_result, rtol=1e-1, atol=1e-1) - - -@pytest.fixture -def very_high_dim_data(): - """Generate very high-dimensional data for testing.""" - np.random.seed(42) - n = 100 # Number of observations - p = 10000 # Number of features (p >>> n) - X = np.random.randn(n, p) - beta_true = np.random.randn(p) - Y = X @ beta_true + np.random.randn(n) * 0.1 - groups = GroupedFeatures([500] * 20) # 20 groups of 500 features each - return X, Y, groups - - -def test_very_high_dimensional_predictor_speed(very_high_dim_data): - """Compare the speed of Cholesky, Woodbury, and Sherman-Morrison predictors in very high-dimensional settings.""" - X, Y, groups = very_high_dim_data - lambdas = np.ones(groups.num_groups) - n, p = X.shape - - print(f"Data dimensions: n={n}, p={p}") - - # Test Cholesky predictor speed - chol_start = time.time() - chol_predictor = CholeskyRidgePredictor(X) - chol_predictor.set_params(groups, lambdas) - chol_end = time.time() - chol_time = chol_end - chol_start - - # Test Woodbury predictor speed - wood_start = time.time() - wood_predictor = WoodburyRidgePredictor(X) - wood_predictor.set_params(groups, lambdas) - wood_end = time.time() - wood_time = wood_end - wood_start - - # Test Sherman-Morrison predictor speed - sm_start = time.time() - sm_predictor = ShermanMorrisonRidgePredictor(X) - sm_predictor.set_params(groups, lambdas) - sm_end = time.time() - sm_time = sm_end - sm_start - - print(f"Cholesky predictor time: {chol_time:.4f} seconds") - print(f"Woodbury predictor time: {wood_time:.4f} seconds") - print(f"Sherman-Morrison predictor time: {sm_time:.4f} seconds") - - # Test 
matrix operations - B = np.random.randn(p, 5) - - chol_op_start = time.time() - chol_result = chol_predictor._solve_system(B) - chol_op_end = time.time() - chol_op_time = chol_op_end - chol_op_start - - wood_op_start = time.time() - wood_result = wood_predictor._solve_system(B) - wood_op_end = time.time() - wood_op_time = wood_op_end - wood_op_start - - sm_op_start = time.time() - sm_result = sm_predictor._solve_system(B) - sm_op_end = time.time() - sm_op_time = sm_op_end - sm_op_start - - print(f"Cholesky operation time: {chol_op_time:.4f} seconds") - print(f"Woodbury operation time: {wood_op_time:.4f} seconds") - print(f"Sherman-Morrison operation time: {sm_op_time:.4f} seconds") - - np.testing.assert_allclose(chol_result, sm_result, rtol=1e-1, atol=1e-1) - np.testing.assert_allclose(wood_result, sm_result, rtol=1e-1, atol=1e-1) - - assert wood_time < chol_time, ( - "Woodbury predictor should be faster in very high-dimensional case. Woodbury:" - f" {wood_time:.4f}, Cholesky: {chol_time:.4f}" - ) - - -def test_sherman_morrison_ridge_predictor_initialization(X): - """Test the initialization of ShermanMorrisonRidgePredictor.""" - predictor = ShermanMorrisonRidgePredictor(X) - expected_A = np.eye(X.shape[1]) + (X.T @ X) / X.shape[0] - expected_A_inv = np.linalg.inv(expected_A) - - assert predictor.A_inv.shape == (X.shape[1], X.shape[1]) - np.testing.assert_allclose(predictor.A_inv, expected_A_inv, atol=1e-6) - - -def test_sherman_morrison_ridge_predictor_set_params(X, groups): - """Test the set_params method of ShermanMorrisonRidgePredictor.""" - predictor = ShermanMorrisonRidgePredictor(X) - lambdas = np.linspace(0.5, 1.5, groups.num_groups) - predictor.set_params(groups, lambdas) - - # Create the expected matrix - expected_A = np.diag(groups.group_expand(lambdas)) + X.T @ X / X.shape[0] - expected_A_inv = np.linalg.inv(expected_A) - - np.testing.assert_allclose(predictor.A_inv, expected_A_inv, atol=1e-6) - - -def test_sherman_morrison_single_update(X): - """Test a single Sherman-Morrison update.""" - predictor = ShermanMorrisonRidgePredictor(X) - u = np.random.randn(X.shape[1]) - v = np.random.randn(X.shape[1]) - - A_inv_initial = predictor.A_inv.copy() - - # Compute expected A_inv after Sherman-Morrison update manually - denominator = 1.0 + v @ (A_inv_initial @ u) - expected_A_inv = ( - A_inv_initial - np.outer(A_inv_initial @ u, v @ A_inv_initial) / denominator - ) - - # Apply Sherman-Morrison update - predictor._sherman_morrison_update(u, v) - - # Assert that predictor.A_inv matches the manually computed inverse - np.testing.assert_allclose(predictor.A_inv, expected_A_inv, atol=1e-6) - - -def test_sherman_morrison_ridge_predictor_solve_system(X): - """Test the _solve_system method of ShermanMorrisonRidgePredictor.""" - predictor = ShermanMorrisonRidgePredictor(X) - B = np.random.randn(X.shape[1], 3) - result = predictor._solve_system(B) - expected = predictor.A_inv @ B - np.testing.assert_allclose(result, expected, atol=1e-6) - - -def test_sherman_morrison_formula(X): - """Test the Sherman-Morrison formula implementation.""" - predictor = ShermanMorrisonRidgePredictor(X) - - # Initialize A and compute its inverse - A = np.random.randn(X.shape[1], X.shape[1]) - A = A @ A.T # Make A positive definite - A_inv = np.linalg.inv(A) - predictor.A_inv = A_inv.copy() - - u = np.random.randn(X.shape[1]) - v = np.random.randn(X.shape[1]) - - denominator = 1.0 + v @ (A_inv @ u) - expected_A_inv = A_inv - np.outer(A_inv @ u, v @ A_inv) / denominator - - predictor._sherman_morrison_update(u, v) - 
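For reference, the identity that the assertion which follows checks is the classic Sherman-Morrison formula:

.. math::
    (A + uv^T)^{-1} = A^{-1} - \frac{A^{-1} u v^T A^{-1}}{1 + v^T A^{-1} u}

which is exactly the `expected_A_inv` computed above from `denominator` and the outer product.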
np.testing.assert_allclose(predictor.A_inv, expected_A_inv, atol=1e-6) - - -def test_sherman_morrison_predictor_in_high_dimensional_case(high_dim_data): - """Test ShermanMorrisonRidgePredictor in high-dimensional settings.""" - X, Y, groups = high_dim_data - predictor = ShermanMorrisonRidgePredictor(X) - - # Test initialization - assert predictor.A_inv.shape == (X.shape[1], X.shape[1]) - - # Test updating lambda_s - lambdas = np.ones(groups.num_groups) - predictor.set_params(groups, lambdas) - - # Test _solve_system operation - B = np.random.randn(X.shape[1], 5) - result = predictor._solve_system(B) - assert result.shape == B.shape - - -def test_very_high_dimensional_predictor_speed(very_high_dim_data): - """Compare the speed of Cholesky, Woodbury, and Sherman-Morrison predictors in very high-dimensional settings.""" - X, Y, groups = very_high_dim_data - lambdas = np.ones(groups.num_groups) - n, p = X.shape - - print(f"Data dimensions: n={n}, p={p}") - - # Test Cholesky predictor speed - chol_start = time.time() - chol_predictor = CholeskyRidgePredictor(X) - chol_predictor.set_params(groups, lambdas) - chol_end = time.time() - chol_time = chol_end - chol_start - - # Test Woodbury predictor speed - wood_start = time.time() - wood_predictor = WoodburyRidgePredictor(X) - wood_predictor.set_params(groups, lambdas) - wood_end = time.time() - wood_time = wood_end - wood_start - - print(f"Cholesky predictor time: {chol_time:.4f} seconds") - print(f"Woodbury predictor time: {wood_time:.4f} seconds") - - # Test Sherman-Morrison predictor speed - sm_start = time.time() - sm_predictor = ShermanMorrisonRidgePredictor(X) - sm_predictor.set_params(groups, lambdas) - sm_end = time.time() - sm_time = sm_end - sm_start - - print(f"Sherman-Morrison predictor time: {sm_time:.4f} seconds") - - # Test matrix operations - B = np.random.randn(p, 5) - - chol_op_start = time.time() - chol_result = chol_predictor._trace_gram_matrix() - chol_op_end = time.time() - chol_op_time = chol_op_end - chol_op_start - - wood_op_start = time.time() - wood_result = wood_predictor._trace_gram_matrix() - wood_op_end = time.time() - wood_op_time = wood_op_end - wood_op_start - - sm_op_start = time.time() - sm_result = sm_predictor._trace_gram_matrix() - sm_op_end = time.time() - sm_op_time = sm_op_end - sm_op_start - - print(f"Cholesky operation time: {chol_op_time:.4f} seconds") - print(f"Woodbury operation time: {wood_op_time:.4f} seconds") - print(f"Sherman-Morrison operation time: {sm_op_time:.4f} seconds") - - np.testing.assert_allclose(chol_result, sm_result, rtol=1e-1, atol=1e-1) - np.testing.assert_allclose(wood_result, sm_result, rtol=1e-1, atol=1e-1) diff --git a/test/test_covariance.py b/test/test_covariance.py deleted file mode 100644 index f2cebd0..0000000 --- a/test/test_covariance.py +++ /dev/null @@ -1,668 +0,0 @@ -from typing import Union -import pytest -import numpy as np -from unittest.mock import Mock, patch - -from ..src.covariance_design import ( - DiscreteNonParametric, - CovarianceDesign, - AR1Design, - DiagonalCovarianceDesign, - IdentityCovarianceDesign, - UniformScalingCovarianceDesign, - ExponentialOrderStatsCovarianceDesign, - BlockDiagonal, - MixtureModel, - BlockCovarianceDesign, - block_diag, - simulate_rotated_design, - set_groups, -) - -from ..src.groupedfeatures import GroupedFeatures - - -def test_discrete_non_parametric_valid(): - """Test valid initialization of DiscreteNonParametric.""" - eigs = [1.0, 2.0, 3.0] - probs = [0.2, 0.3, 0.5] - spectrum = DiscreteNonParametric(eigs, probs) - 
np.testing.assert_array_equal(spectrum.eigs, np.array(eigs)) - np.testing.assert_array_equal(spectrum.probs, np.array(probs)) - - -def test_discrete_non_parametric_invalid_eigs_type(): - """Test TypeError for invalid eigenvalues type.""" - with pytest.raises(TypeError): - DiscreteNonParametric(eigs="not_a_list", probs=[0.5, 0.5]) - - -def test_discrete_non_parametric_invalid_eigs_elements(): - """Test ValueError for invalid elements in eigenvalues.""" - with pytest.raises(ValueError): - DiscreteNonParametric(eigs=[1.0, "two", 3.0], probs=[0.3, 0.4, 0.3]) - - -def test_discrete_non_parametric_invalid_probs_type(): - """Test TypeError for invalid probabilities type.""" - with pytest.raises(TypeError): - DiscreteNonParametric(eigs=[1.0, 2.0], probs="not_a_list") - - -def test_discrete_non_parametric_probs_length_mismatch(): - """Test ValueError for mismatched lengths of eigenvalues and probabilities.""" - with pytest.raises(ValueError): - DiscreteNonParametric(eigs=[1.0, 2.0], probs=[0.5]) - - -def test_discrete_non_parametric_probs_sum_not_one(): - """Test ValueError for probabilities that do not sum to one.""" - with pytest.raises(ValueError): - DiscreteNonParametric(eigs=[1.0, 2.0], probs=[0.6, 0.5]) - - -def test_discrete_non_parametric_negative_probs(): - """Test ValueError for negative probabilities.""" - with pytest.raises(ValueError): - DiscreteNonParametric(eigs=[1.0, 2.0], probs=[0.7, -0.7]) - - -def test_ar1_design_valid(): - """Test valid initialization of AR1Design.""" - p = 5 - rho = 0.6 - ar1 = AR1Design(p=p, rho=rho) - assert ar1.p == p - assert ar1.rho == rho - Sigma = ar1.get_Sigma() - expected_Sigma = rho ** np.abs(np.subtract.outer(np.arange(p), np.arange(p))) - np.testing.assert_array_almost_equal(Sigma, expected_Sigma) - assert ar1.nfeatures() == p - spectrum = ar1.spectrum() - assert isinstance(spectrum, DiscreteNonParametric) - assert len(spectrum.eigs) == p - assert np.isclose(np.sum(spectrum.probs), 1.0) - - -def test_ar1_design_invalid_p_type(): - """Test TypeError for invalid p type in AR1Design.""" - with pytest.raises(TypeError): - AR1Design(p="five", rho=0.6) - - -def test_ar1_design_invalid_p_value(): - """Test ValueError for invalid p value in AR1Design.""" - with pytest.raises(ValueError): - AR1Design(p=0, rho=0.6) - - -def test_ar1_design_invalid_rho_type(): - """Test TypeError for invalid rho type in AR1Design.""" - with pytest.raises(TypeError): - AR1Design(p=5, rho="0.6") - - -def test_ar1_design_invalid_rho_value(): - """Test ValueError for invalid rho value in AR1Design.""" - with pytest.raises(ValueError): - AR1Design(p=5, rho=1.5) - - -def test_ar1_design_missing_p(): - """Test ValueError when p is None in AR1Design.""" - ar1 = AR1Design(p=None, rho=0.6) - with pytest.raises(ValueError): - ar1.get_Sigma() - - -def test_identity_covariance_design_valid(): - """Test valid initialization of IdentityCovarianceDesign.""" - p = 4 - identity = IdentityCovarianceDesign(p=p) - Sigma = identity.get_Sigma() - expected_Sigma = np.identity(p) - np.testing.assert_array_equal(Sigma, expected_Sigma) - assert identity.nfeatures() == p - spectrum = identity.spectrum() - assert isinstance(spectrum, DiscreteNonParametric) - assert all(e == 1.0 for e in spectrum.eigs) - assert np.allclose(spectrum.probs, np.full(p, 1.0 / p)) - - -def test_identity_covariance_design_invalid_p_type(): - """Test TypeError for invalid p type in IdentityCovarianceDesign.""" - with pytest.raises(TypeError): - IdentityCovarianceDesign(p="four") - - -def 
test_identity_covariance_design_invalid_p_value(): - """Test ValueError for invalid p value in IdentityCovarianceDesign.""" - with pytest.raises(ValueError): - IdentityCovarianceDesign(p=-2) - - -def test_identity_covariance_design_missing_p(): - """Test ValueError when p is None in IdentityCovarianceDesign.""" - identity = IdentityCovarianceDesign() - with pytest.raises(ValueError): - identity.get_Sigma() - with pytest.raises(ValueError): - identity.nfeatures() - with pytest.raises(ValueError): - identity.spectrum() - - -def test_uniform_scaling_covariance_design_valid(): - """Test valid initialization of UniformScalingCovarianceDesign.""" - p = 3 - scaling = 2.5 - uniform = UniformScalingCovarianceDesign(scaling=scaling, p=p) - Sigma = uniform.get_Sigma() - expected_Sigma = scaling * np.identity(p) - np.testing.assert_array_equal(Sigma, expected_Sigma) - assert uniform.scaling == scaling - assert uniform.nfeatures() == p - spectrum = uniform.spectrum() - assert isinstance(spectrum, DiscreteNonParametric) - assert all(e == scaling for e in spectrum.eigs) - assert np.allclose(spectrum.probs, np.full(p, 1.0 / p)) - - -def test_uniform_scaling_covariance_design_invalid_scaling_type(): - """Test TypeError for invalid scaling type in UniformScalingCovarianceDesign.""" - with pytest.raises(TypeError): - UniformScalingCovarianceDesign(scaling="2.5", p=3) - - -def test_uniform_scaling_covariance_design_invalid_scaling_value(): - """Test ValueError for invalid scaling value in UniformScalingCovarianceDesign.""" - with pytest.raises(ValueError): - UniformScalingCovarianceDesign(scaling=-1.0, p=3) - - -def test_uniform_scaling_covariance_design_invalid_p_type(): - """Test TypeError for invalid p type in UniformScalingCovarianceDesign.""" - with pytest.raises(TypeError): - UniformScalingCovarianceDesign(scaling=1.0, p="three") - - -def test_uniform_scaling_covariance_design_invalid_p_value(): - """Test ValueError for invalid p value in UniformScalingCovarianceDesign.""" - with pytest.raises(ValueError): - UniformScalingCovarianceDesign(scaling=1.0, p=0) - - -def test_exponential_order_stats_covariance_design_valid(): - """Test valid initialization of ExponentialOrderStatsCovarianceDesign.""" - p = 4 - rate = 1.5 - exp_design = ExponentialOrderStatsCovarianceDesign(p=p, rate=rate) - spectrum = exp_design.spectrum() - assert isinstance(spectrum, DiscreteNonParametric) - assert len(spectrum.eigs) == p - assert np.all(spectrum.eigs > 0) - assert np.isclose(np.sum(spectrum.probs), 1.0) - Sigma = exp_design.get_Sigma() - assert np.allclose(Sigma, np.diag(spectrum.eigs)) - - -def test_exponential_order_stats_covariance_design_invalid_rate_type(): - """Test TypeError for invalid rate type in ExponentialOrderStatsCovarianceDesign.""" - with pytest.raises(TypeError): - ExponentialOrderStatsCovarianceDesign(p=3, rate="1.0") - - -def test_exponential_order_stats_covariance_design_invalid_rate_value(): - """Test ValueError for invalid rate value in ExponentialOrderStatsCovarianceDesign.""" - with pytest.raises(ValueError): - ExponentialOrderStatsCovarianceDesign(p=3, rate=0) - - -def test_exponential_order_stats_covariance_design_invalid_p_type(): - """Test TypeError for invalid p type in ExponentialOrderStatsCovarianceDesign.""" - with pytest.raises(TypeError): - ExponentialOrderStatsCovarianceDesign(p="three", rate=1.0) - - -def test_exponential_order_stats_covariance_design_invalid_p_value(): - """Test ValueError for invalid p value in ExponentialOrderStatsCovarianceDesign.""" - with pytest.raises(ValueError): 
- ExponentialOrderStatsCovarianceDesign(p=-1, rate=1.0) - - -def test_exponential_order_stats_covariance_design_missing_p(): - """Test ValueError when p is None in ExponentialOrderStatsCovarianceDesign.""" - exp_design = ExponentialOrderStatsCovarianceDesign(p=None, rate=1.0) - with pytest.raises(ValueError): - exp_design.get_Sigma() - with pytest.raises(ValueError): - exp_design.spectrum() - with pytest.raises(ValueError): - exp_design.nfeatures() - - -def test_block_diagonal_valid(): - """Test valid initialization of BlockDiagonal.""" - block1 = np.array([[1, 0], [0, 1]]) - block2 = np.array([[2, 0], [0, 2]]) - block_diag_obj = BlockDiagonal(blocks=[block1, block2]) - Sigma = block_diag_obj.get_Sigma() - expected_Sigma = block_diag(block1, block2) - np.testing.assert_array_equal(Sigma, expected_Sigma) - - -def test_block_diagonal_empty_blocks(): - """Test ValueError for empty blocks in BlockDiagonal.""" - block_diag_obj = BlockDiagonal(blocks=[]) - with pytest.raises( - ValueError, match=r"Resulting covariance matrix :math:`\\Sigma` is empty\." - ): - Sigma = block_diag_obj.get_Sigma() - - -def test_block_diagonal_invalid_blocks_type(): - """Test TypeError for invalid blocks type in BlockDiagonal.""" - with pytest.raises(TypeError): - BlockDiagonal(blocks="not_a_list") - - -def test_block_diagonal_invalid_blocks_elements(): - """Test TypeError for invalid elements in blocks of BlockDiagonal.""" - with pytest.raises(TypeError): - BlockDiagonal(blocks=[np.array([[1, 0]]), "not_an_array"]) - - -def test_block_diagonal_non_square_blocks(): - """Test ValueError for non-square blocks in BlockDiagonal.""" - with pytest.raises(ValueError): - BlockDiagonal(blocks=[np.array([[1, 2, 3]]), np.array([[4, 5], [6, 7]])]) - - -def test_mixture_model_valid(): - """Test valid initialization of MixtureModel.""" - eigs1 = [1.0, 2.0] - probs1 = [0.5, 0.5] - spectrum1 = DiscreteNonParametric(eigs=eigs1, probs=probs1) - - eigs2 = [3.0, 4.0] - probs2 = [0.3, 0.7] - spectrum2 = DiscreteNonParametric(eigs=eigs2, probs=probs2) - - mixture = MixtureModel(spectra=[spectrum1, spectrum2], mixing_prop=[0.6, 0.4]) - assert mixture.spectra == [spectrum1, spectrum2] - assert mixture.mixing_prop == [0.6, 0.4] - - -def test_mixture_model_invalid_spectra_type(): - """Test TypeError for invalid spectra type in MixtureModel.""" - with pytest.raises(TypeError): - MixtureModel(spectra="not_a_list", mixing_prop=[0.5, 0.5]) - - -def test_mixture_model_invalid_spectra_elements(): - """Test TypeError for invalid elements in spectra of MixtureModel.""" - with pytest.raises(TypeError): - MixtureModel(spectra=[Mock(), Mock()], mixing_prop=[0.5, 0.5]) - - -def test_mixture_model_invalid_mixing_prop_type(): - """Test TypeError for invalid mixing_prop type in MixtureModel.""" - spectrum = DiscreteNonParametric(eigs=[1.0], probs=[1.0]) - with pytest.raises(TypeError): - MixtureModel(spectra=[spectrum], mixing_prop="not_a_list") - - -def test_mixture_model_invalid_mixing_prop_elements(): - """Test ValueError for invalid elements in mixing_prop of MixtureModel.""" - spectrum = DiscreteNonParametric(eigs=[1.0], probs=[1.0]) - with pytest.raises(ValueError): - MixtureModel(spectra=[spectrum], mixing_prop=[0.7, "0.3"]) - - -def test_mixture_model_length_mismatch(): - """Test ValueError for length mismatch between spectra and mixing_prop in MixtureModel.""" - spectrum = DiscreteNonParametric(eigs=[1.0], probs=[1.0]) - with pytest.raises(ValueError): - MixtureModel(spectra=[spectrum], mixing_prop=[0.5, 0.5]) - - -def 
test_mixture_model_mixing_prop_sum_not_one(): - """Test ValueError for mixing_prop that does not sum to one in MixtureModel.""" - spectrum = DiscreteNonParametric(eigs=[1.0], probs=[1.0]) - with pytest.raises(ValueError): - MixtureModel(spectra=[spectrum], mixing_prop=[0.6, 0.5]) - - -def test_mixture_model_negative_mixing_prop(): - """Test ValueError for negative mixing_prop in MixtureModel.""" - spectrum = DiscreteNonParametric(eigs=[1.0], probs=[1.0]) - with pytest.raises(ValueError): - MixtureModel(spectra=[spectrum], mixing_prop=[-0.5, 1.5]) - - -def test_block_covariance_design_valid(): - """Test valid initialization of BlockCovarianceDesign.""" - # Create covariance designs - block1 = IdentityCovarianceDesign(p=2) - block2 = UniformScalingCovarianceDesign(scaling=3.0, p=3) - - # Create BlockCovarianceDesign without groups first - block_design = BlockCovarianceDesign(blocks=[block1, block2]) - - # Then set groups - groups = GroupedFeatures(ps=[2, 3]) - set_groups(block_design, groups) - - # Get Sigma and construct expected Sigma using block_diag - Sigma = block_design.get_Sigma() - expected_Sigma = block_diag(block1.get_Sigma(), block2.get_Sigma()) - np.testing.assert_array_equal(Sigma, expected_Sigma) - - assert block_design.nfeatures() == 5 - spectrum = block_design.spectrum() - - assert isinstance(spectrum, MixtureModel) - assert len(spectrum.spectra) == 2 - np.testing.assert_allclose(spectrum.mixing_prop, [2 / 5, 3 / 5]) - - -def test_block_covariance_design_with_groups(): - """Test BlockCovarianceDesign with specified groups.""" - # Create covariance designs - block1 = IdentityCovarianceDesign(p=2) - block2 = UniformScalingCovarianceDesign(scaling=3.0, p=3) - - # Create BlockCovarianceDesign without groups first - block_design = BlockCovarianceDesign(blocks=[block1, block2]) - - # Then set groups - groups = GroupedFeatures(ps=[2, 3]) - set_groups(block_design, groups) - - Sigma = block_design.get_Sigma() - expected_Sigma = block_diag(block1.get_Sigma(), block2.get_Sigma()) - np.testing.assert_array_equal(Sigma, expected_Sigma) - - spectrum = block_design.spectrum() - assert isinstance(spectrum, MixtureModel) - assert len(spectrum.spectra) == 2 - np.testing.assert_allclose(spectrum.mixing_prop, [2 / 5, 3 / 5]) - - -def test_block_covariance_design_invalid_blocks_type(): - """Test TypeError for invalid blocks type in BlockCovarianceDesign.""" - with pytest.raises(TypeError): - BlockCovarianceDesign(blocks="not_a_list") - - -def test_block_covariance_design_invalid_blocks_elements(): - """Test TypeError for invalid elements in blocks of BlockCovarianceDesign.""" - with pytest.raises(TypeError): - BlockCovarianceDesign(blocks=[IdentityCovarianceDesign(p=2), "not_a_design"]) - - -def test_block_covariance_design_invalid_groups_type(): - """Test TypeError for invalid groups type in BlockCovarianceDesign.""" - block1 = IdentityCovarianceDesign(p=2) - with pytest.raises(TypeError): - BlockCovarianceDesign(blocks=[block1], groups="not_grouped") - - -def test_block_covariance_design_groups_blocks_mismatch(): - """Test ValueError for mismatch between groups and blocks in BlockCovarianceDesign.""" - block1 = IdentityCovarianceDesign(p=2) - block2 = UniformScalingCovarianceDesign(scaling=3.0, p=3) - block_design = BlockCovarianceDesign(blocks=[block1, block2]) - - # Try to set mismatched groups - groups = GroupedFeatures(ps=[2, 3, 1]) # 3 groups, but only 2 blocks - with pytest.raises(ValueError): - set_groups(block_design, groups) - - -def 
-    """Test ValueError for incorrect mixing properties in BlockCovarianceDesign."""
-    # Mock a BlockCovarianceDesign with incorrect mixing properties
-    block1 = Mock(spec=CovarianceDesign)
-    block1.spectrum.return_value = DiscreteNonParametric(eigs=[1.0], probs=[1.0])
-    block2 = Mock(spec=CovarianceDesign)
-    block2.spectrum.return_value = DiscreteNonParametric(eigs=[2.0], probs=[1.0])
-
-    block_design = BlockCovarianceDesign(blocks=[block1, block2])
-
-    with pytest.raises(ValueError):
-        # Manually set mixing_prop to not sum to 1
-        block_design.spectrum = Mock(
-            return_value=MixtureModel(
-                spectra=[block1.spectrum(), block2.spectrum()], mixing_prop=[0.6, 0.5]
-            )
-        )
-        block_design.spectrum()
-
-
-def test_block_diag_valid():
-    """Test valid block diagonal construction."""
-    A = np.array([[1, 0], [0, 1]])
-    B = np.array([[2, 0], [0, 2]])
-    result = block_diag(A, B)
-    expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 2]])
-    np.testing.assert_array_equal(result, expected)
-
-
-def test_block_diag_no_blocks():
-    """Test block_diag with no blocks."""
-    result = block_diag()
-    expected = np.array([[]])
-    np.testing.assert_array_equal(result, expected)
-
-
-def test_block_diag_non_numpy_input():
-    """Test TypeError for non-numpy input in block_diag."""
-    A = [[1, 0], [0, 1]]
-    B = np.array([[2, 0], [0, 2]])
-    with pytest.raises(TypeError):
-        block_diag(A, B)
-
-
-def test_block_diag_non_2d_input():
-    """Test ValueError for non-2D input in block_diag."""
-    A = np.array([1, 2, 3])
-    B = np.array([[2, 0], [0, 2]])
-    with pytest.raises(ValueError):
-        block_diag(A, B)
-
-
-def test_block_diag_non_square_blocks():
-    """Test ValueError for non-square blocks in block_diag."""
-    A = np.array([[1, 0, 0], [0, 1, 0]])
-    B = np.array([[2, 0], [0, 2]])
-    with pytest.raises(ValueError):
-        block_diag(A, B)
-
-
-def test_block_diag_exception_handling():
-    """Test exception handling in block_diag."""
-    A = np.array([[1, 0], [0, 1]])
-    B = np.array([[2, 0], [0, 2]])
-    block_diag_obj = BlockDiagonal(blocks=[A, B])
-
-    with patch(
-        "PyGRidge.src.covariance_design.block_diag",
-        side_effect=Exception("Mocked exception"),
-    ):
-        with pytest.raises(
-            RuntimeError,
-            match="Failed to construct block diagonal matrix: Mocked exception",
-        ):
-            Sigma = block_diag_obj.get_Sigma()
-
-
-def test_simulate_rotated_design_valid():
-    """Test valid simulation of rotated design."""
-    p = 3
-    n = 10
-    rho = 0.5
-    ar1 = AR1Design(p=p, rho=rho)
-    X = simulate_rotated_design(cov=ar1, n=n)
-    assert isinstance(X, np.ndarray)
-    assert X.shape == (n, p)
-
-
-def test_simulate_rotated_design_invalid_cov_type():
-    """Test TypeError for invalid covariance type in simulate_rotated_design."""
-    with pytest.raises(TypeError):
-        simulate_rotated_design(cov="not_a_cov_design", n=10)
-
-
-def test_simulate_rotated_design_invalid_n_type():
-    """Test TypeError for invalid n type in simulate_rotated_design."""
-    ar1 = AR1Design(p=3, rho=0.5)
-    with pytest.raises(TypeError):
-        simulate_rotated_design(cov=ar1, n="ten")
-
-
-def test_simulate_rotated_design_invalid_n_value():
-    """Test ValueError for invalid n value in simulate_rotated_design."""
-    ar1 = AR1Design(p=3, rho=0.5)
-    with pytest.raises(ValueError):
-        simulate_rotated_design(cov=ar1, n=0)
-
-
-def test_simulate_rotated_design_invalid_rotated_measure():
-    """Test TypeError for invalid rotated measure in simulate_rotated_design."""
-    ar1 = AR1Design(p=3, rho=0.5)
-    with pytest.raises(TypeError):
-        simulate_rotated_design(cov=ar1, n=10, rotated_measure="not_callable")
-
-
-def test_simulate_rotated_design_sigma_not_positive_definite():
-    """Test ValueError for non-positive definite Sigma in simulate_rotated_design."""
-    # Create a covariance design that returns a non-positive definite matrix
-    mock_cov = Mock(spec=CovarianceDesign)
-    mock_cov.get_Sigma.return_value = np.array(
-        [[1, 2], [2, 1]]
-    )  # Not positive definite
-    mock_cov.nfeatures.return_value = 2
-    with pytest.raises(ValueError):
-        simulate_rotated_design(cov=mock_cov, n=5)
-
-
-def test_simulate_rotated_design_invalid_Sigma_type():
-    """Test TypeError for invalid Sigma type in simulate_rotated_design."""
-    mock_cov = Mock(spec=CovarianceDesign)
-    mock_cov.get_Sigma.return_value = "not_an_array"
-    mock_cov.nfeatures.return_value = 2
-    with pytest.raises(TypeError):
-        simulate_rotated_design(cov=mock_cov, n=5)
-
-
-def test_simulate_rotated_design_invalid_Sigma_dimensions():
-    """Test ValueError for invalid Sigma dimensions in simulate_rotated_design."""
-    mock_cov = Mock(spec=CovarianceDesign)
-    mock_cov.get_Sigma.return_value = np.array([1, 2, 3])
-    mock_cov.nfeatures.return_value = 3
-    with pytest.raises(ValueError):
-        simulate_rotated_design(cov=mock_cov, n=5)
-
-
-def test_simulate_rotated_design_invalid_Sigma_shape():
-    """Test ValueError for invalid Sigma shape in simulate_rotated_design."""
-    mock_cov = Mock(spec=CovarianceDesign)
-    mock_cov.get_Sigma.return_value = np.array([[1, 2], [3, 4]])
-    mock_cov.nfeatures.return_value = 3
-    with pytest.raises(ValueError):
-        simulate_rotated_design(cov=mock_cov, n=5)
-
-
-def test_simulate_rotated_design_rotated_measure_output_invalid():
-    """Test TypeError for invalid output from rotated measure in simulate_rotated_design."""
-
-    def faulty_measure(size):
-        return "not_an_array"
-
-    ar1 = AR1Design(p=2, rho=0.5)
-    with pytest.raises(TypeError):
-        simulate_rotated_design(cov=ar1, n=5, rotated_measure=faulty_measure)
-
-
-def test_simulate_rotated_design_rotated_measure_shape_mismatch():
-    """Test ValueError for shape mismatch in rotated measure output in simulate_rotated_design."""
-
-    def faulty_measure(size):
-        return np.random.normal(size=(size[0], size[1] + 1))  # Incorrect shape
-
-    ar1 = AR1Design(p=2, rho=0.5)
-    with pytest.raises(ValueError):
-        simulate_rotated_design(cov=ar1, n=5, rotated_measure=faulty_measure)
-
-
-def test_set_groups_with_GroupedFeatures_non_BlockCovarianceDesign():
-    """Test setting groups with GroupedFeatures for non-BlockCovarianceDesign."""
-    groups = GroupedFeatures(ps=[2, 3])
-    ar1 = AR1Design(p=None, rho=0.5)
-    set_groups(ar1, groups)
-    assert ar1.p == sum(groups.ps)  # Should be 5
-
-
-def test_set_groups_with_GroupedFeatures_BlockCovarianceDesign():
-    """Test setting groups with GroupedFeatures for BlockCovarianceDesign."""
-    groups = GroupedFeatures(ps=[2, 3])
-    block1 = IdentityCovarianceDesign(p=None)
-    block2 = UniformScalingCovarianceDesign(scaling=3.0, p=None)
-    block_design = BlockCovarianceDesign(blocks=[block1, block2])
-    set_groups(block_design, groups)
-    assert block_design.groups == groups
-    assert block1.p == 2
-    assert block2.p == 3
-
-
-def test_set_groups_with_GroupedFeatures_BlockCovarianceDesign_mismatch():
-    """Test ValueError for mismatch between groups and blocks in BlockCovarianceDesign."""
-    groups = GroupedFeatures(ps=[2, 3, 1])  # 3 groups
-    block1 = IdentityCovarianceDesign(p=None)
-    block2 = UniformScalingCovarianceDesign(scaling=3.0, p=None)
-    block_design = BlockCovarianceDesign(blocks=[block1, block2])  # 2 blocks
-    with pytest.raises(ValueError):
-        set_groups(block_design, groups)
-
-
-def test_set_groups_with_int():
-    """Test setting groups with an integer value."""
-    ar1 = AR1Design(p=None, rho=0.5)
-    set_groups(ar1, 4)
-    assert ar1.p == 4
-
-
-def test_set_groups_with_invalid_groups_or_p_type():
-    """Test TypeError for invalid groups or p type in set_groups."""
-    ar1 = AR1Design(p=None, rho=0.5)
-    with pytest.raises(TypeError):
-        set_groups(ar1, "invalid_type")
-
-
-def test_set_groups_with_GroupedFeatures_invalid_group_sizes():
-    """Test ValueError for invalid group sizes in GroupedFeatures."""
-    with pytest.raises(
-        ValueError, match="All group sizes in ps must be positive integers"
-    ):
-        GroupedFeatures(ps=[2, -3])  # Negative group size
-
-
-def test_set_groups_with_int_invalid_value():
-    """Test ValueError for invalid integer value in set_groups."""
-    ar1 = AR1Design(p=None, rho=0.5)
-    with pytest.raises(ValueError):
-        set_groups(ar1, 0)
-    with pytest.raises(ValueError):
-        set_groups(ar1, -1)
-
-
-def test_set_groups_with_BlockCovarianceDesign_non_GroupedFeatures():
-    """Test setting groups with an integer for BlockCovarianceDesign."""
-    block1 = IdentityCovarianceDesign(p=None)
-    block2 = UniformScalingCovarianceDesign(scaling=3.0, p=None)
-    block_design = BlockCovarianceDesign(blocks=[block1, block2])
-
-    set_groups(block_design, 5)
-
-    assert block_design.groups
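For reference, the surface the deleted suite above exercised now lives in `pygridge/src/covariance_design.py`. A minimal usage sketch follows, assuming the post-rename `pygridge.src` import path introduced by this PR; every class name and signature is taken from the test calls above, nothing else:

```python
# Minimal sketch of the covariance-design API exercised by the deleted
# tests. The pygridge.src import path is an assumption based on this
# PR's renames; signatures mirror the test calls above.
from pygridge.src.covariance_design import (
    AR1Design,
    BlockCovarianceDesign,
    IdentityCovarianceDesign,
    UniformScalingCovarianceDesign,
    set_groups,
    simulate_rotated_design,
)
from pygridge.src.groupedfeatures import GroupedFeatures

# Two blocks: a 2x2 identity and a 3x3 scaled identity.
design = BlockCovarianceDesign(
    blocks=[
        IdentityCovarianceDesign(p=2),
        UniformScalingCovarianceDesign(scaling=3.0, p=3),
    ]
)
set_groups(design, GroupedFeatures(ps=[2, 3]))

Sigma = design.get_Sigma()    # 5x5 block-diagonal covariance
spectrum = design.spectrum()  # MixtureModel with mixing_prop == [2/5, 3/5]

# Draw an n x p design whose rows follow an AR(1) covariance.
X = simulate_rotated_design(cov=AR1Design(p=3, rho=0.5), n=10)
assert X.shape == (10, 3)
```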
diff --git a/test/test_groupfeatures.py b/test/test_groupfeatures.py
deleted file mode 100644
index 51f5587..0000000
--- a/test/test_groupfeatures.py
+++ /dev/null
@@ -1,310 +0,0 @@
-import pytest
-from typing import List, Callable
-
-from ..src.groupedfeatures import GroupedFeatures, fill
-
-
-# Define a simple summary function for testing
-def sum_func(group: List[int]) -> int:
-    return sum(group)
-
-
-def invalid_func(group: List[int]) -> int:
-    raise ValueError("Invalid function for testing")
-
-
-class TestGroupedFeatures:
-    # Tests for __init__
-
-    def test_init_valid(self):
-        """Test valid initialization of GroupedFeatures."""
-        ps = [2, 3, 5]
-        gf = GroupedFeatures(ps)
-        assert gf.ps == ps
-        assert gf.p == 10
-        assert gf.num_groups == 3
-
-    @pytest.mark.parametrize(
-        "ps, expected_exception, message",
-        [
-            (None, TypeError, "ps must be a list"),
-            ("not a list", TypeError, "ps must be a list"),
-            ([1.5, 2, 3], TypeError, "All group sizes in ps must be integers"),
-            ([1, -2, 3], ValueError, "All group sizes in ps must be positive integers"),
-            ([0, 1, 2], ValueError, "All group sizes in ps must be positive integers"),
-        ],
-    )
-    def test_init_invalid(self, ps, expected_exception, message):
-        """Test invalid initialization of GroupedFeatures."""
-        with pytest.raises(expected_exception, match=message):
-            GroupedFeatures(ps)
-
-    # Tests for from_group_size
-
-    def test_from_group_size_valid(self):
-        """Test valid creation of GroupedFeatures from group size."""
-        group_size = 4
-        num_groups = 5
-        gf = GroupedFeatures.from_group_size(group_size, num_groups)
-        assert gf.ps == [4] * 5
-        assert gf.p == 20
-        assert gf.num_groups == 5
-
-    @pytest.mark.parametrize(
-        "group_size, num_groups, expected_exception, message",
-        [
-            ("4", 5, TypeError, "group_size must be an integer"),
-            (4, "5", TypeError, "num_groups must be an integer"),
-            (0, 5, ValueError, "group_size must be a positive integer"),
-            (-1, 5, ValueError, "group_size must be a positive integer"),
-            (4, 0, ValueError, "num_groups must be a positive integer"),
-            (4, -2, ValueError, "num_groups must be a positive integer"),
-        ],
-    )
-    def test_from_group_size_invalid(
-        self, group_size, num_groups, expected_exception, message
-    ):
-        """Test invalid creation of GroupedFeatures from group size."""
-        with pytest.raises(expected_exception, match=message):
-            GroupedFeatures.from_group_size(group_size, num_groups)
-
-    # Tests for ngroups and nfeatures
-
-    def test_ngroups_nfeatures(self):
-        """Test the number of groups and features in GroupedFeatures."""
-        ps = [1, 2, 3]
-        gf = GroupedFeatures(ps)
-        assert gf.ngroups() == 3
-        assert gf.nfeatures() == 6
-
-    # Tests for group_idx
-
-    @pytest.mark.parametrize(
-        "ps, i, expected_range",
-        [
-            ([2, 3, 5], 0, range(0, 2)),
-            ([2, 3, 5], 1, range(2, 5)),
-            ([2, 3, 5], 2, range(5, 10)),
-        ],
-    )
-    def test_group_idx_valid(self, ps, i, expected_range):
-        """Test valid group index retrieval."""
-        gf = GroupedFeatures(ps)
-        idx = gf.group_idx(i)
-        assert list(idx) == list(expected_range)
-
-    @pytest.mark.parametrize(
-        "ps, i, expected_exception, message",
-        [
-            ([2, 3, 5], "1", TypeError, "Group index i must be an integer"),
-            ([2, 3, 5], 3, IndexError, "Group index i=3 is out of range"),
-            ([2, 3, 5], -1, IndexError, "Group index i=-1 is out of range"),
-            ([2, 3, 5], 100, IndexError, "Group index i=100 is out of range"),
-        ],
-    )
-    def test_group_idx_invalid(self, ps, i, expected_exception, message):
-        """Test invalid group index retrieval."""
-        gf = GroupedFeatures(ps)
-        with pytest.raises(expected_exception, match=message):
-            gf.group_idx(i)
-
-    # Tests for group_summary
-
-    def test_group_summary_valid(self):
-        """Test valid group summary calculation."""
-        ps = [2, 3]
-        gf = GroupedFeatures(ps)
-        vec = [1, 2, 3, 4, 5]
-        summary = gf.group_summary(vec, sum_func)
-        assert summary == [3, 12]  # sum([1,2])=3 and sum([3,4,5])=12
-
-    @pytest.mark.parametrize(
-        "ps, vec, f, expected_exception, message",
-        [
-            (
-                [2, 3],
-                "not a list",
-                sum_func,
-                TypeError,
-                "vec must be either a list or ndarray, got str",
-            ),
-            (
-                [2, 3],
-                [1, 2],
-                sum_func,
-                ValueError,
-                "Length of vec \\(2\\) does not match number of features \\(5\\)",
-            ),
-            (
-                [2, 3],
-                [1, 2, 3, 4, 5],
-                "not callable",
-                TypeError,
-                "f must be a callable function",
-            ),
-        ],
-    )
-    def test_group_summary_invalid(self, ps, vec, f, expected_exception, message):
-        """Test invalid group summary calculation."""
-        gf = GroupedFeatures(ps)
-        with pytest.raises(expected_exception, match=message):
-            gf.group_summary(vec, f)
-
-    def test_group_summary_function_exception(self):
-        """Test exception raised by invalid summary function."""
-        ps = [2, 3]
-        gf = GroupedFeatures(ps)
-        vec = [1, 2, 3, 4, 5]
-        with pytest.raises(
-            RuntimeError,
-            match="Error applying function to group 0-2: Invalid function for testing",
-        ):
-            gf.group_summary(vec, invalid_func)
-
-    # Tests for group_expand
-
-    def test_group_expand_with_number(self):
-        """Test group expansion with a single number."""
-        ps = [2, 3]
-        gf = GroupedFeatures(ps)
-        num = 7
-        expanded = gf.group_expand(num)
-        assert expanded == [7, 7, 7, 7, 7]
-
-    def test_group_expand_with_list(self):
-        """Test group expansion with a list of numbers."""
-        ps = [2, 3]
-        gf = GroupedFeatures(ps)
-        vec_or_num = [10, 20]
-        expanded = gf.group_expand(vec_or_num)
-        assert expanded == [10, 10, 20, 20, 20]
-
-    @pytest.mark.parametrize(
-        "ps, vec_or_num, expected_exception, message",
-        [
-            (
-                [2, 3],
-                [1],
-                ValueError,
-                r"Length of vec_or_num \(1\) does not match number of groups \(2\)",
-            ),
-            (
-                [2, 3],
-                "not a number or list",
-                TypeError,
-                r"vec_or_num must be either a number",
-            ),
-            ([2, 3], None, TypeError, r"vec_or_num must be either a number"),
-        ],
-    )
-    def test_group_expand_invalid(self, ps, vec_or_num, expected_exception, message):
-        """Test invalid group expansion input."""
-        gf = GroupedFeatures(ps)
-        with pytest.raises(expected_exception, match=message):
-            gf.group_expand(vec_or_num)
-
-    # Tests for fill
-
-    @pytest.mark.parametrize(
-        "value, length, expected",
-        [
-            (5, 3, [5, 5, 5]),
-            ("a", 2, ["a", "a"]),
-            (None, 0, []),
-        ],
-    )
-    def test_fill_valid(self, value, length, expected):
-        """Test valid fill function behavior."""
-        assert fill(value, length) == expected
-
-    @pytest.mark.parametrize(
-        "value, length, expected_exception, message",
-        [
-            (5, -1, ValueError, "length must be a non-negative integer"),
-            ("a", 2.5, TypeError, "length must be an integer"),
-            (None, "3", TypeError, "length must be an integer"),
-        ],
-    )
-    def test_fill_invalid(self, value, length, expected_exception, message):
-        """Test invalid fill function input."""
-        with pytest.raises(expected_exception, match=message):
-            fill(value, length)
-
-
-class TestGroupedFeaturesEdgeCases:
-    def test_empty_groups(self):
-        """Test behavior with empty group sizes."""
-        ps = []
-        gf = GroupedFeatures(ps)
-        assert gf.ngroups() == 0
-        assert gf.nfeatures() == 0
-        # group_summary on empty groups
-        summary = gf.group_summary([], sum_func)
-        assert summary == []
-
-    def test_single_group(self):
-        """Test behavior with a single group."""
-        ps = [5]
-        gf = GroupedFeatures(ps)
-        assert gf.ngroups() == 1
-        assert gf.nfeatures() == 5
-        assert list(gf.group_idx(0)) == list(range(0, 5))
-        expanded = gf.group_expand([10])
-        assert expanded == [10, 10, 10, 10, 10]
-
-    def test_group_expand_with_floats(self):
-        """Test group expansion with float values."""
-        ps = [1, 2]
-        gf = GroupedFeatures(ps)
-        vec_or_num = [3.5, 4.5]
-        expanded = gf.group_expand(vec_or_num)
-        assert expanded == [3.5, 4.5, 4.5]
-
-    def test_group_summary_empty(self):
-        """Test group summary on empty input."""
-        ps = []
-        gf = GroupedFeatures(ps)
-        vec = []
-        summary = gf.group_summary(vec, sum_func)
-        assert summary == []
-
-    def test_group_expand_empty(self):
-        """Test group expansion with empty input."""
-        ps = []
-        gf = GroupedFeatures(ps)
-        expanded_num = gf.group_expand(5)
-        expanded_list = gf.group_expand([])
-        assert expanded_num == []
-        assert expanded_list == []
-
-    def test_group_summary_non_numeric(self):
-        """Test group summary with non-numeric values."""
-        ps = [2, 2]
-        gf = GroupedFeatures(ps)
-        vec = ["a", "b", "c", "d"]
-
-        def concat(group: List[str]) -> str:
-            return "".join(group)
-
-        summary = gf.group_summary(vec, concat)
-        assert summary == ["ab", "cd"]
-
-    def test_group_expand_with_non_numeric_list(self):
-        """Test group expansion with a list of non-numeric values."""
-        ps = [2, 3]
-        gf = GroupedFeatures(ps)
-        vec_or_num = ["x", "y"]
-        expanded = gf.group_expand(vec_or_num)
-        assert expanded == ["x", "x", "y", "y", "y"]
-
-    def test_group_summary_with_different_types(self):
-        """Test group summary with mixed types."""
-        ps = [1, 1]
-        gf = GroupedFeatures(ps)
-        vec = [True, False]
-
-        def any_func(group: List[bool]) -> bool:
-            return any(group)
-
-        summary = gf.group_summary(vec, any_func)
-        assert summary == [True, False]
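The same coverage moves to `pygridge/test/test_groupedfeatures.py`. As a compact reference, here is the `GroupedFeatures` behaviour the deleted tests pinned down, in a sketch that again assumes the new `pygridge.src` layout:

```python
# Behaviour pinned down by the deleted GroupedFeatures tests; the
# pygridge.src import path is assumed from this PR's renames.
from pygridge.src.groupedfeatures import GroupedFeatures, fill

gf = GroupedFeatures([2, 3])                    # two groups, sizes 2 and 3
assert gf.ngroups() == 2 and gf.nfeatures() == 5
assert list(gf.group_idx(1)) == [2, 3, 4]       # feature indices of group 1

# Per-group reduction and group-wise broadcasting.
assert gf.group_summary([1, 2, 3, 4, 5], sum) == [3, 12]
assert gf.group_expand([10, 20]) == [10, 10, 20, 20, 20]
assert gf.group_expand(7) == [7, 7, 7, 7, 7]
assert fill("a", 2) == ["a", "a"]
```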
diff --git a/test/test_seagull.py b/test/test_seagull.py
deleted file mode 100644
index 9e90b4f..0000000
--- a/test/test_seagull.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import pytest
-import numpy as np
-from ..src.seagull import seagull
-from numpy.testing import assert_allclose
-import warnings
-
-
-# TODO: Some of these tests fail. Need to fix this.
-@pytest.fixture(autouse=True)
-def mock_imports(monkeypatch):
-    import numpy as np
-
-    def mock_lambda_max(*args, **kwargs):
-        return 1.0
-
-    def mock_lasso(*args, **kwargs):
-        # Return dictionary with proper structure
-        return {
-            "result": "lasso",
-            "lambda_values": np.array([1.0], dtype=np.float64),
-            "beta": np.zeros(2, dtype=np.float64),
-        }
-
-    def mock_group_lasso(*args, **kwargs):
-        return {
-            "result": "group_lasso",
-            "lambda_values": np.array([1.0], dtype=np.float64),
-            "beta": np.zeros(2, dtype=np.float64),
-        }
-
-    def mock_sparse_group_lasso(*args, **kwargs):
-        return {
-            "result": "sparse_group_lasso",
-            "lambda_values": np.array([1.0], dtype=np.float64),
-            "beta": np.zeros(2, dtype=np.float64),
-        }
-
-    # Use full package paths instead of relative imports
-    monkeypatch.setattr(
-        "PyGRidge.src.lambda_max_lasso.lambda_max_lasso", mock_lambda_max
-    )
-    monkeypatch.setattr(
-        "PyGRidge.src.lambda_max_group_lasso.lambda_max_group_lasso", mock_lambda_max
-    )
-    monkeypatch.setattr(
-        "PyGRidge.src.lambda_max_sparse_group_lasso.lambda_max_sparse_group_lasso",
-        mock_lambda_max,
-    )
-    monkeypatch.setattr("PyGRidge.src.lasso.lasso", mock_lasso)
-    monkeypatch.setattr("PyGRidge.src.group_lasso.group_lasso", mock_group_lasso)
-    monkeypatch.setattr(
-        "PyGRidge.src.sparse_group_lasso.sparse_group_lasso", mock_sparse_group_lasso
-    )
-
-    # Also patch the functions directly in the seagull module
-    monkeypatch.setattr("PyGRidge.src.seagull.lambda_max_lasso", mock_lambda_max)
-    monkeypatch.setattr("PyGRidge.src.seagull.lambda_max_group_lasso", mock_lambda_max)
-    monkeypatch.setattr(
-        "PyGRidge.src.seagull.lambda_max_sparse_group_lasso", mock_lambda_max
-    )
-    monkeypatch.setattr("PyGRidge.src.seagull.lasso", mock_lasso)
-    monkeypatch.setattr("PyGRidge.src.seagull.group_lasso", mock_group_lasso)
-    monkeypatch.setattr(
-        "PyGRidge.src.seagull.sparse_group_lasso", mock_sparse_group_lasso
-    )
-
-
-# Fixture for common test data
-@pytest.fixture
-def test_data():
-    # Ensure all arrays are float64 to avoid casting issues
-    y = np.array([1, 2, 3, 4, 5], dtype=np.float64)
-    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], dtype=np.float64)
-    Z = np.array(
-        [[1, 1, 1], [1, 2, 3], [1, 3, 6], [1, 4, 10], [1, 5, 15]], dtype=np.float64
-    )
-    weights_u = np.array([1, 1, 1], dtype=np.float64)
-    groups = np.array([1, 1, 2, 2, 3], dtype=np.float64)
-    return y, X, Z, weights_u, groups
-
-
-# Test input validation
-def test_missing_y():
-    with pytest.raises(ValueError, match="Vector y is missing"):
-        seagull(y=None, Z=np.array([[1, 2], [3, 4]]))
-
-
-def test_empty_y():
-    with pytest.raises(ValueError, match="Vector y is empty"):
-        seagull(y=np.array([]), Z=np.array([[1, 2], [3, 4]]))
-
-
-def test_non_numeric_y():
-    with pytest.raises(ValueError, match="Non-numeric values detected in vector y"):
-        seagull(y=np.array(["a", "b"]), Z=np.array([[1, 2], [3, 4]]))
-
-
-def test_nan_inf_y():
-    with pytest.raises(ValueError, match="NA, NaN, or Inf detected in vector y"):
-        seagull(y=np.array([1, np.nan, 3]), Z=np.array([[1, 2], [3, 4], [5, 6]]))
-
-
-def test_missing_Z():
-    with pytest.raises(ValueError, match="Matrix Z is missing"):
-        seagull(y=np.array([1, 2, 3]), Z=None)
-
-
-def test_non_numeric_Z():
-    with pytest.raises(ValueError, match="Non-numeric values detected in matrix Z"):
-        seagull(y=np.array([1, 2]), Z=np.array([["a", "b"], ["c", "d"]]))
-
-
-def test_nan_inf_Z():
-    with pytest.raises(ValueError, match="NA, NaN, or Inf detected in matrix Z"):
-        seagull(y=np.array([1, 2]), Z=np.array([[1, np.inf], [3, 4]]))
-
-
-def test_mismatching_dimensions_y_Z():
-    with pytest.raises(
-        ValueError, match="Mismatching dimensions of vector y and matrix Z"
-    ):
-        seagull(y=np.array([1, 2, 3]), Z=np.array([[1, 2], [3, 4]]))
-
-
-# Test core functionality
-def test_lasso(test_data):
-    y, X, Z, weights_u, _ = test_data
-    result = seagull(y=y, X=X, Z=Z, weights_u=weights_u, alpha=1.0)
-    assert result["result"] == "lasso"
-
-
-def test_group_lasso(test_data):
-    y, X, Z, weights_u, groups = test_data
-    result = seagull(y=y, X=X, Z=Z, weights_u=weights_u, groups=groups, alpha=0.0)
-    assert result["result"] == "group_lasso"
-
-
-def test_sparse_group_lasso(test_data):
-    y, X, Z, weights_u, groups = test_data
-    result = seagull(y=y, X=X, Z=Z, weights_u=weights_u, groups=groups, alpha=0.5)
-    assert result["result"] == "sparse_group_lasso"
-
-
-# Test edge cases and parameter validation
-def test_invalid_alpha():
-    y = np.array([1, 2, 3], dtype=np.float64)
-    Z = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
-    with pytest.warns(UserWarning, match="The parameter alpha is out of range"):
-        result = seagull(y, Z=Z, alpha=2.0)
-    assert result["result"] == "lasso"
-
-
-def test_missing_groups_for_group_lasso():
-    y = np.array([1, 2, 3])
-    Z = np.array([[1, 2], [3, 4], [5, 6]])
-    with pytest.raises(ValueError, match="Vector groups is missing"):
-        seagull(y, Z=Z, alpha=0.5)
-
-
-def test_invalid_rel_acc():
-    y = np.array([1, 2, 3], dtype=np.float64)
-    Z = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
-    with pytest.warns(UserWarning, match="The parameter rel_acc is non-positive"):
-        result = seagull(y, Z=Z, rel_acc=-0.1, alpha=1.0)
-    assert result["result"] == "lasso"
-
-
-def test_invalid_max_iter():
-    y = np.array([1, 2, 3], dtype=np.float64)
-    Z = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
-    with pytest.warns(UserWarning, match="The parameter max_iter is non-positive"):
-        result = seagull(y, Z=Z, max_iter=0, alpha=1.0)
-    assert result["result"] == "lasso"
-
-
-def test_invalid_gamma_bls():
-    y = np.array([1, 2, 3], dtype=np.float64)
-    Z = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
-    with pytest.warns(UserWarning, match="The parameter gamma_bls is out of range"):
-        result = seagull(y, Z=Z, gamma_bls=1.5, alpha=1.0)
-    assert result["result"] == "lasso"
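As these tests show, the `seagull` driver dispatches on `alpha`: 1.0 selects the lasso, 0.0 the group lasso, and intermediate values the sparse-group lasso, with a `lambda_max_*` helper computed first. For orientation, a hedged sketch of an un-mocked call; the `pygridge.src` import path is assumed from this PR's renames, and the returned keys (`result`, `lambda_values`, `beta`) are taken from the mocks above, so the real payload may carry more:

```python
# Sketch of an un-mocked seagull call. The pygridge.src import path is
# assumed, and the returned dict is assumed to carry at least the keys
# the mocked fixtures above reproduce.
import numpy as np

from pygridge.src.seagull import seagull

y = np.array([1, 2, 3, 4, 5], dtype=np.float64)
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], dtype=np.float64)
Z = np.array(
    [[1, 1, 1], [1, 2, 3], [1, 3, 6], [1, 4, 10], [1, 5, 15]], dtype=np.float64
)
groups = np.array([1, 1, 2, 2, 3], dtype=np.float64)

# alpha=0.5 mixes the lasso and group-lasso penalties (sparse-group lasso).
fit = seagull(y=y, X=X, Z=Z, weights_u=np.ones(3), groups=groups, alpha=0.5)
print(fit["result"], fit["lambda_values"], fit["beta"])
```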