diff --git a/LICENSE.txt b/LICENSE.txt
index 9eab1af7..66c93078 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-PyUol Copyright (c) 2019, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
+PyUoI Copyright (c) 2019, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 
diff --git a/README.md b/README.md
index ed602a16..facb1219 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ PyUoI requires
 
 * numpy>=1.14
 * h5py>=2.8
-* scikit-learn>=0.20
+* scikit-learn>=0.24
 
 and optionally
 
@@ -89,8 +89,8 @@ Please see our ReadTheDocs
 
 # Copyright
 
-PyUol Copyright (c) 2019, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
+PyUoI Copyright (c) 2019, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
 
-If you have questions about your rights to use or distribute this software, please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov referring to " PyUol" (LBNL Ref 2019-157)."
+If you have questions about your rights to use or distribute this software, please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov referring to " PyUoI" (LBNL Ref 2019-157)."
 
 NOTICE. This software was developed under funding from the U.S. Department of Energy. As such, the U.S. Government has been granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, and perform publicly and display publicly. The U.S. Government is granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, distribute copies to the public, perform publicly and display publicly, and to permit others to do so.
diff --git a/bin/generate_build.sh b/bin/generate_build.sh
old mode 100644
new mode 100755
index 0d7854e0..34f9776d
--- a/bin/generate_build.sh
+++ b/bin/generate_build.sh
@@ -1,6 +1,6 @@
 eval "$(conda shell.bash hook)"
 mkdir dist
-for py in 3.6 3.7; do
+for py in 3.6 3.7 3.8; do
     git clone https://github.com/BouchardLab/pyuoi.git
     cd pyuoi
     conda create -y -n temp_build_env python=$py
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 6c0ef1d3..d05f75d0 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -28,7 +28,7 @@ PyUoI requires
 
   * numpy>=1.14
   * h5py>=2.8
-  * scikit-learn>=0.20
+  * scikit-learn>=0.24
 
 and optionally
 
diff --git a/pyuoi/datasets/__init__.py b/pyuoi/datasets/__init__.py
index 45f4ae3e..9d3ec6b4 100644
--- a/pyuoi/datasets/__init__.py
+++ b/pyuoi/datasets/__init__.py
@@ -126,7 +126,10 @@ def make_classification(n_samples=100, n_features=20, n_informative=2,
     if isinstance(random_state, int):
         rng = np.random.RandomState(random_state)
     else:
-        rng = random_state
+        if random_state is None:
+            rng = np.random
+        else:
+            rng = random_state
 
     n_not_informative = n_features - n_informative
     X = rng.randn(n_samples, n_features)
diff --git a/pyuoi/linear_model/logistic.py b/pyuoi/linear_model/logistic.py
index 483b43b1..145e8d6d 100644
--- a/pyuoi/linear_model/logistic.py
+++ b/pyuoi/linear_model/logistic.py
@@ -541,9 +541,10 @@
     _, n_features = X.shape
 
     classes = np.unique(y)
+    n_classes = len(classes)
 
     if multi_class == 'auto':
-        if len(classes) > 2:
+        if n_classes > 2:
             multi_class = 'multinomial'
         else:
             multi_class = 'ovr'
@@ -616,9 +617,19 @@
         target = Y_multi
         if penalty == 'l2':
             w0 = w0.ravel()
+            if coef_mask is not None:
+                x0 = np.zeros_like(w0)
 
             def func(x, *args):
-                return _multinomial_loss_grad(x, *args)[0:2]
+                mask = args[3]
+                if mask is not None:
+                    x0[mask] = x
+                    args = args[:3] + (None,) + (args[-1],)
+                    f, df = _multinomial_loss_grad(x0, *args)[0:2]
+                    df = df[mask]
+                else:
+                    f, df = _multinomial_loss_grad(x, *args)[0:2]
+                return f, df
         else:
             w0 = w0.T.ravel().copy()
 
@@ -634,7 +645,18 @@ def func(x, g, *args):
     else:
         target = y_bin
         if penalty == 'l2':
-            func = _logistic_loss_and_grad
+            x0 = np.zeros_like(w0)
+
+            def func(x, *args):
+                mask = args[3]
+                if mask is not None:
+                    x0[mask] = x
+                    args = args[:3] + (None,) + (args[-1],)
+                    f, df = _logistic_loss_and_grad(x0, *args)
+                    df = df[mask]
+                else:
+                    f, df = _logistic_loss_and_grad(x, *args)
+                return f, df
         else:
             def func(x, g, *args):
                 loss, grad = _logistic_loss_and_grad(x, *args)
@@ -644,63 +666,79 @@ def func(x, g, *args):
     coefs = list()
     n_iter = np.zeros(len(Cs), dtype=np.int32)
     for i, C in enumerate(Cs):
-        iprint = [-1, 50, 1, 100, 101][
-            np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
-        if penalty == 'l2':
-            w0, loss, info = optimize.fmin_l_bfgs_b(
-                func, w0, fprime=None,
-                args=(X, target, 1. / C, coef_mask, sample_weight),
-                iprint=iprint, pgtol=tol, maxiter=max_iter)
-        else:
-            zeros_seen = [0]
-
-            def zero_coef(x, *args):
-                if multi_class == 'multinomial':
-                    x = x.reshape(-1, classes.size)[:-1]
-                else:
-                    x = x[:-1]
-                now_zeros = np.array_equiv(x, 0.)
-                if now_zeros:
-                    zeros_seen[0] += 1
+        if coef_mask is None or coef_mask.sum():
+            iprint = [-1, 50, 1, 100, 101][
+                np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
+            if penalty == 'l2':
+                if coef_mask is None:
+                    w0, loss, info = optimize.fmin_l_bfgs_b(
+                        func, w0, fprime=None,
+                        args=(X, target, 1. / C, coef_mask, sample_weight),
+                        iprint=iprint, pgtol=tol, maxiter=max_iter)
                 else:
-                    zeros_seen[0] = 0
-                if zeros_seen[0] > 1:
-                    return -2048
-            try:
-                w0 = fmin_lbfgs(func, w0, orthantwise_c=1. / C,
-                                args=(X, target, 0., coef_mask, sample_weight),
-                                max_iterations=max_iter,
-                                epsilon=tol,
-                                orthantwise_end=coef_size,
-                                progress=zero_coef)
-            except AllZeroLBFGSError:
-                w0 *= 0.
-                info = None
-        if info is not None and info["warnflag"] == 1:
-            warnings.warn("lbfgs failed to converge. Increase the number "
-                          "of iterations.", ConvergenceWarning)
-        # In scipy <= 1.0.0, nit may exceed maxiter.
-        # See https://github.com/scipy/scipy/issues/7854.
-        if info is None:
-            n_iter_i = -1
-        else:
-            n_iter_i = min(info['nit'], max_iter)
+                    if fit_intercept:
+                        if multi_class == 'multinomial':
+                            mask = [coef_mask,
+                                    np.ones(n_classes)[:, np.newaxis]]
+                            mask = np.concatenate(mask, axis=1)
+                        else:
+                            mask = np.concatenate([coef_mask, np.ones(1)])
+                    else:
+                        mask = coef_mask
+                    mask = np.nonzero(mask.ravel())[0]
+                    wp = w0[mask]
+                    wp, loss, info = optimize.fmin_l_bfgs_b(
+                        func, wp, fprime=None,
+                        args=(X, target, 1. / C, mask, sample_weight),
+                        iprint=iprint, pgtol=tol, maxiter=max_iter)
+                    w0 = np.zeros_like(w0)
+                    w0[mask] = wp
+
+            else:
+                zeros_seen = [0]
+
+                def zero_coef(x, *args):
+                    if multi_class == 'multinomial':
+                        x = x.reshape(-1, classes.size)[:-1]
+                    else:
+                        x = x[:-1]
+                    now_zeros = np.array_equiv(x, 0.)
+                    if now_zeros:
+                        zeros_seen[0] += 1
+                    else:
+                        zeros_seen[0] = 0
+                    if zeros_seen[0] > 1:
+                        return -2048
+                try:
+                    args = (X, target, 0., coef_mask, sample_weight)
+                    w0 = fmin_lbfgs(func, w0, orthantwise_c=1. / C,
+                                    args=args,
+                                    max_iterations=max_iter,
+                                    epsilon=tol,
+                                    orthantwise_end=coef_size,
+                                    progress=zero_coef)
+                except AllZeroLBFGSError:
+                    w0 *= 0.
+                    info = None
+            if info is not None and info["warnflag"] == 1:
+                warnings.warn("lbfgs failed to converge. Increase the number "
+                              "of iterations.", ConvergenceWarning)
+            # In scipy <= 1.0.0, nit may exceed maxiter.
+            # See https://github.com/scipy/scipy/issues/7854.
+            if info is None:
+                n_iter_i = -1
+            else:
+                n_iter_i = min(info['nit'], max_iter)
+
+            n_iter[i] = n_iter_i
 
         if multi_class == 'multinomial':
             n_classes = max(2, classes.size)
             if penalty == 'l2':
-                multi_w0 = np.reshape(w0, (n_classes, -1))
+                w0 = np.reshape(w0, (n_classes, -1))
             else:
-                multi_w0 = np.reshape(w0, (-1, n_classes)).T
-            if coef_mask is not None:
-                multi_w0[:, :n_features] *= coef_mask
-            coefs.append(multi_w0.copy())
-        else:
-            if coef_mask is not None:
-                w0[:n_features] *= coef_mask
-            coefs.append(w0.copy())
-
-        n_iter[i] = n_iter_i
+                w0 = np.reshape(w0, (-1, n_classes)).T
+        coefs.append(w0.copy())
 
     return np.array(coefs), np.array(Cs), n_iter
 
diff --git a/requirements.txt b/requirements.txt
index 851c57c7..369f4430 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 numpy>=1.14
 h5py>=2.8
-scikit-learn>=0.20
+scikit-learn>=0.24
diff --git a/setup.py b/setup.py
index 4f1ed957..dcd72854 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@ def finalize_options(self):
     # Versions should comply with PEP440. For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='1.0.0',
+    version='1.1.0',
 
     description='The Union of Intersections framework in Python.',
     long_description=long_description,
diff --git a/tests/test_uoi_l1logistic.py b/tests/test_uoi_l1logistic.py
index 397b551f..84a7d8a7 100644
--- a/tests/test_uoi_l1logistic.py
+++ b/tests/test_uoi_l1logistic.py
@@ -1,18 +1,274 @@
-import pytest
+import pytest, numbers, warnings
 import numpy as np
 from numpy.testing import assert_array_equal, assert_allclose, assert_equal
 from scipy.sparse import rand as sprand
+from scipy import optimize
 
 from pyuoi import UoI_L1Logistic
 from pyuoi.linear_model.logistic import (fit_intercept_fixed_coef,
                                          MaskedCoefLogisticRegression,
-                                         LogisticInterceptFitterNoFeatures)
+                                         LogisticInterceptFitterNoFeatures,
+                                         _logistic_regression_path,
+                                         _multinomial_loss_grad,
+                                         _logistic_loss_and_grad)
 from sklearn.metrics import accuracy_score
-from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.utils import (compute_class_weight,
+                           check_consistent_length, check_array)
+from sklearn.exceptions import ConvergenceWarning
 
 from pyuoi.datasets import make_classification
+from pyuoi.lbfgs import fmin_lbfgs, AllZeroLBFGSError
+
+
+def _logistic_regression_path_old(X, y, Cs=48, fit_intercept=True,
+                                  max_iter=100, tol=1e-4, verbose=0, coef=None,
+                                  class_weight=None, penalty='l2',
+                                  multi_class='auto',
+                                  check_input=True,
+                                  sample_weight=None,
+                                  l1_ratio=None, coef_mask=None):
+    """Compute a Logistic Regression model for a list of regularization
+    parameters.
+
+    This is the original masking-based implementation, kept so that the new
+    indexing-based version can be checked against it.
+
+    Parameters
+    ----------
+    X : array-like or sparse matrix, shape (n_samples, n_features)
+        Input data.
+    y : array-like, shape (n_samples,) or (n_samples, n_targets)
+        Input data, target values.
+    Cs : int | array-like, shape (n_cs,)
+        List of values for the regularization parameter or integer specifying
+        the number of regularization parameters that should be used. In this
+        case, the parameters will be chosen in a logarithmic scale between
+        1e-4 and 1e4.
+    fit_intercept : bool
+        Whether to fit an intercept for the model. In this case the shape of
+        the returned array is (n_cs, n_features + 1).
+    max_iter : int
+        Maximum number of iterations for the solver.
+    tol : float
+        Stopping criterion. For the newton-cg and lbfgs solvers, the iteration
+        will stop when ``max{|g_i | i = 1, ..., n} <= tol``
+        where ``g_i`` is the i-th component of the gradient.
+    verbose : int
+        For the liblinear and lbfgs solvers set verbose to any positive
+        number for verbosity.
+    coef : array-like, shape (n_features,), default None
+        Initialization value for coefficients of logistic regression.
+        Useless for liblinear solver.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``.
+        Note that these weights will be multiplied with sample_weight (passed
+        through the fit method) if sample_weight is specified.
+    multi_class : str, {'multinomial', 'auto'}, default: 'auto'
+        For 'multinomial' the loss minimised is the multinomial loss fit
+        across the entire probability distribution, *even when the data is
+        binary*. 'auto' selects binary if the data is binary
+        and otherwise selects 'multinomial'.
+    check_input : bool, default True
+        If False, the input arrays X and y will not be checked.
+    sample_weight : array-like, shape (n_samples,), optional
+        Array of weights that are assigned to individual samples.
+        If not provided, then each sample is given unit weight.
+    coef_mask : array-like, shape (n_features,) or (n_classes, n_features), optional
+        Masking array for coef.
+    Returns
+    -------
+    coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1)
+        List of coefficients for the Logistic Regression model. If
+        fit_intercept is set to True then the second dimension will be
+        n_features + 1, where the last item represents the intercept. For
+        ``multiclass='multinomial'``, the shape is (n_classes, n_cs,
+        n_features) or (n_classes, n_cs, n_features + 1).
+    Cs : ndarray
+        Grid of Cs used for cross-validation.
+    n_iter : array, shape (n_cs,)
+        Actual number of iterations for each C.
+    """
+    if isinstance(Cs, numbers.Integral):
+        Cs = np.logspace(-4, 4, Cs)
+
+    # Preprocessing.
+    if check_input:
+        X = check_array(X, accept_sparse='csr', dtype=np.float64,
+                        accept_large_sparse=True)
+        y = check_array(y, ensure_2d=False, dtype=None)
+        check_consistent_length(X, y)
+    _, n_features = X.shape
+
+    classes = np.unique(y)
+
+    if multi_class == 'auto':
+        if len(classes) > 2:
+            multi_class = 'multinomial'
+        else:
+            multi_class = 'ovr'
+
+    # If sample weights exist, convert them to array (support for lists)
+    # and check length.
+    # Otherwise set them to 1 for all examples.
+    if sample_weight is not None:
+        sample_weight = np.array(sample_weight, dtype=X.dtype, order='C')
+        check_consistent_length(y, sample_weight)
+    else:
+        sample_weight = np.ones(X.shape[0], dtype=X.dtype)
+
+    # If class_weights is a dict (provided by the user), the weights
+    # are assigned to the original labels. If it is "balanced", then
+    # the class_weights are assigned after masking the labels with an OvR.
+    le = LabelEncoder()
+    if isinstance(class_weight, dict) or multi_class == 'multinomial':
+        class_weight_ = compute_class_weight(class_weight, classes=classes, y=y)
+        sample_weight *= class_weight_[le.fit_transform(y)]
+
+    # For doing an OvR, we need to mask the labels first. For the
+    # multinomial case this is not necessary.
+    if multi_class == 'ovr':
+        coef_size = n_features
+        w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)
+        mask_classes = np.array([-1, 1])
+        mask = (y == 1)
+        y_bin = np.ones(y.shape, dtype=X.dtype)
+        y_bin[~mask] = -1.
+        # for compute_class_weight
+
+        if class_weight == "balanced":
+            class_weight_ = compute_class_weight(class_weight,
+                                                 classes=mask_classes,
+                                                 y=y_bin)
+            sample_weight *= class_weight_[le.fit_transform(y_bin)]
+
+    else:
+        coef_size = classes.size * n_features
+        lbin = OneHotEncoder(categories=[range(classes.size)], sparse=False)
+        Y_multi = lbin.fit_transform(y[:, np.newaxis])
+        if Y_multi.shape[1] == 1:
+            Y_multi = np.hstack([1 - Y_multi, Y_multi])
+        w0 = np.zeros((classes.size, n_features + int(fit_intercept)),
+                      dtype=X.dtype)
+        w0[:, -1] = LogisticInterceptFitterNoFeatures(y,
+                                                      classes.size).intercept_
+
+    if coef is not None:
+        # it must work both when the bias term is given and when it is not
+        if multi_class == 'ovr':
+            if coef.size not in (n_features, w0.size):
+                raise ValueError(
+                    'Initialization coef is of shape %d, expected shape '
+                    '%d or %d' % (coef.size, n_features, w0.size))
+            w0[:coef.size] = coef
+        else:
+            w0[:, :coef.shape[1]] = coef
+
+    # Mask initial array
+    if coef_mask is not None:
+        if multi_class == 'ovr':
+            w0[:n_features] *= coef_mask
+        else:
+            w0[:, :n_features] *= coef_mask
+
+    if multi_class == 'multinomial':
+        # fmin_l_bfgs_b and newton-cg accept only ravelled parameters.
+        target = Y_multi
+        if penalty == 'l2':
+            w0 = w0.ravel()
+
+            def func(x, *args):
+                return _multinomial_loss_grad(x, *args)[0:2]
+        else:
+            w0 = w0.T.ravel().copy()
+
+            def inner_func(x, *args):
+                return _multinomial_loss_grad(x, *args)[0:2]
+
+            def func(x, g, *args):
+                x = x.reshape(-1, classes.size).T.ravel()
+                loss, grad = inner_func(x, *args)
+                grad = grad.reshape(classes.size, -1).T.ravel()
+                g[:] = grad
+                return loss
+    else:
+        target = y_bin
+        if penalty == 'l2':
+            func = _logistic_loss_and_grad
+        else:
+            def func(x, g, *args):
+                loss, grad = _logistic_loss_and_grad(x, *args)
+                g[:] = grad
+                return loss
+
+    coefs = list()
+    n_iter = np.zeros(len(Cs), dtype=np.int32)
+    for i, C in enumerate(Cs):
+        iprint = [-1, 50, 1, 100, 101][
+            np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
+        if penalty == 'l2':
+            w0, loss, info = optimize.fmin_l_bfgs_b(
+                func, w0, fprime=None,
+                args=(X, target, 1. / C, coef_mask, sample_weight),
+                iprint=iprint, pgtol=tol, maxiter=max_iter)
+        else:
+            zeros_seen = [0]
+
+            def zero_coef(x, *args):
+                if multi_class == 'multinomial':
+                    x = x.reshape(-1, classes.size)[:-1]
+                else:
+                    x = x[:-1]
+                now_zeros = np.array_equiv(x, 0.)
+                if now_zeros:
+                    zeros_seen[0] += 1
+                else:
+                    zeros_seen[0] = 0
+                if zeros_seen[0] > 1:
+                    return -2048
+            try:
+                w0 = fmin_lbfgs(func, w0, orthantwise_c=1. / C,
+                                args=(X, target, 0., coef_mask, sample_weight),
+                                max_iterations=max_iter,
+                                epsilon=tol,
+                                orthantwise_end=coef_size,
+                                progress=zero_coef)
+            except AllZeroLBFGSError:
+                w0 *= 0.
+                info = None
+        if info is not None and info["warnflag"] == 1:
+            warnings.warn("lbfgs failed to converge. Increase the number "
+                          "of iterations.", ConvergenceWarning)
+        # In scipy <= 1.0.0, nit may exceed maxiter.
+        # See https://github.com/scipy/scipy/issues/7854.
+        if info is None:
+            n_iter_i = -1
+        else:
+            n_iter_i = min(info['nit'], max_iter)
+
+        if multi_class == 'multinomial':
+            n_classes = max(2, classes.size)
+            if penalty == 'l2':
+                multi_w0 = np.reshape(w0, (n_classes, -1))
+            else:
+                multi_w0 = np.reshape(w0, (-1, n_classes)).T
+            if coef_mask is not None:
+                multi_w0[:, :n_features] *= coef_mask
+            coefs.append(multi_w0.copy())
+        else:
+            if coef_mask is not None:
+                w0[:n_features] *= coef_mask
+            coefs.append(w0.copy())
+
+        n_iter[i] = n_iter_i
+
+    return np.array(coefs), np.array(Cs), n_iter
 
 
 def test_fit_intercept_fixed_coef():
@@ -218,6 +474,72 @@ def test_masked_logistic_standardize():
         lr.fit(X, y, coef_mask=mask)
 
 
+@pytest.mark.parametrize("n_classes,penalty,fit_intercept", [(3, "l2", True),
+                                                             (3, "l2", False),
+                                                             (3, "l1", True),
+                                                             (3, "l1", False),
+                                                             (2, "l2", True),
+                                                             (2, "l2", False),
+                                                             (2, "l1", True),
+                                                             (2, "l1", False)])
+def test_masking_with_indexing(n_classes, penalty, fit_intercept):
+    """Check that the indexing-based implementation gives the same results
+    as the masking-based logistic regression.
+    """
+    X, y, w, intercept = make_classification(n_samples=1000,
+                                             n_classes=n_classes,
+                                             n_features=20,
+                                             n_informative=10,
+                                             random_state=0)
+    mask = w != 0.
+    if n_classes == 2:
+        mask = mask.ravel()
+    coefs, _, _ = _logistic_regression_path(X, y, [10.], coef_mask=mask,
+                                            penalty=penalty,
+                                            fit_intercept=fit_intercept)
+    coefs_old, _, _ = _logistic_regression_path_old(X, y, [10.], coef_mask=mask,
+                                                    penalty=penalty,
+                                                    fit_intercept=fit_intercept)
+    assert_allclose(coefs, coefs_old)
+    coefs, _, _ = _logistic_regression_path(X, y, [10.],
+                                            penalty=penalty,
+                                            fit_intercept=fit_intercept)
+    coefs_old, _, _ = _logistic_regression_path_old(X, y, [10.],
+                                                    penalty=penalty,
+                                                    fit_intercept=fit_intercept)
+    assert_allclose(coefs, coefs_old)
+
+
+@pytest.mark.parametrize("n_classes,penalty,fit_intercept", [(3, "l2", True),
+                                                             (3, "l2", False),
+                                                             (3, "l1", True),
+                                                             (3, "l1", False),
+                                                             (2, "l2", True),
+                                                             (2, "l2", False),
+                                                             (2, "l1", True),
+                                                             (2, "l1", False)])
+def test_all_masked_with_indexing(n_classes, penalty, fit_intercept):
+    """Check that masking all coefficients leaves at most the intercept nonzero.
+    """
+    X, y, w, intercept = make_classification(n_samples=1000,
+                                             n_classes=n_classes,
+                                             n_features=20,
+                                             n_informative=10,
+                                             random_state=0)
+    mask = np.zeros_like(w)
+    if n_classes == 2:
+        mask = mask.ravel()
+    coefs, _, _ = _logistic_regression_path(X, y, [10.], coef_mask=mask,
+                                            fit_intercept=fit_intercept)
+    if fit_intercept:
+        if n_classes == 2:
+            assert_equal(coefs[0][:-1], 0.)
+        else:
+            assert_equal(coefs[0][:, :-1], 0.)
+    else:
+        assert_equal(coefs[0], 0.)
+
+
 def test_estimation_score_usage():
     """Test the ability to change the estimation score in UoI L1Logistic"""
     methods = ('acc', 'log', 'BIC', 'AIC', 'AICc')
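
Reviewer note on the core change above: the new _logistic_regression_path no longer multiplies coefficients and gradients by a 0/1 coef_mask on every call. Instead it converts the mask to flat indices, hands L-BFGS-B only the unmasked entries, and scatters the result back into the full weight vector once the solver finishes. Below is a minimal, runnable sketch of that pattern; it is an illustration only, and all names in it (toy_loss_and_grad, full_dim, A, b) are hypothetical rather than taken from PyUoI.

# Sketch of the indexing strategy used in the patch, on an assumed toy problem.
import numpy as np
from scipy import optimize

rng = np.random.RandomState(0)
full_dim, n_samples = 10, 50
A = rng.randn(n_samples, full_dim)
b = rng.randn(n_samples)


def toy_loss_and_grad(w):
    # Simple least-squares loss standing in for _logistic_loss_and_grad.
    r = A.dot(w) - b
    return 0.5 * r.dot(r), A.T.dot(r)


coef_mask = np.zeros(full_dim, dtype=bool)
coef_mask[[1, 4, 7]] = True       # only these coefficients are free
idx = np.nonzero(coef_mask)[0]    # as in mask = np.nonzero(mask.ravel())[0]
x0 = np.zeros(full_dim)           # masked entries stay exactly zero


def masked_func(x_sub):
    # Scatter the reduced variables into the full vector, evaluate the
    # full loss, and return only the gradient entries being optimized.
    x0[idx] = x_sub
    f, g = toy_loss_and_grad(x0)
    return f, g[idx]


wp, f_min, info = optimize.fmin_l_bfgs_b(masked_func, x0[idx])
w_full = np.zeros(full_dim)
w_full[idx] = wp                  # reassemble, as w0[mask] = wp in the patch

The design payoff: masked entries are zero by construction rather than zeroed after the fact, and the solver works in a space whose dimension is the number of surviving coefficients, which is small in UoI's intersection step.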
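The intercept handling in the masked l2 branch is the subtle part: coef_mask covers only the features, so an always-on column (multinomial) or trailing entry (binary/OvR) must be appended before converting to indices. The following shape check is again a sketch with made-up values, mirroring the np.concatenate lines in the patch.

# Hedged sketch of the mask construction when fit_intercept=True.
import numpy as np

n_classes, n_features = 3, 5
coef_mask = np.random.RandomState(0).rand(n_classes, n_features) > 0.5

# Multinomial + intercept: one always-unmasked column per class.
mask = np.concatenate([coef_mask, np.ones(n_classes)[:, np.newaxis]], axis=1)
idx = np.nonzero(mask.ravel())[0]   # flat indices handed to the solver
assert mask.shape == (n_classes, n_features + 1)

# Binary/OvR + intercept: a single trailing entry for the intercept.
mask_1d = np.concatenate([coef_mask[0], np.ones(1)])
assert mask_1d.shape == (n_features + 1,)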