From b1488457cbb9a8f275a32bb735511c1e58e3d856 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Fri, 13 May 2022 15:57:17 +0100 Subject: [PATCH 1/9] Add option to make state.hessian a sparse COO matrix --- src/glum/_solvers.py | 68 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index 2f359dfb..bb693cee 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -145,15 +145,31 @@ def update_hessian(state, data, active_set): # 4) just like an update, we only update the active_set hessian_init = build_hessian_delta( data.X, - state.hessian_rows, + state.hessian_rows, # this is the d that goes into sandwich data.fit_intercept, data.P2, np.arange(data.X.shape[0], dtype=np.int32), active_set, ) - state.hessian[np.ix_(active_set, active_set)] = hessian_init + + # In the sparse Hessian case, stsate.hessian is a coo_matrix. + # hessian_init is a numpy array of size active_set x active_set; + # we can convert this to a coo_matrix and simply add it to the state.hessian + if state.use_sparse_hessian: + coo_data = hessian_init.flatten() + coo_rows = np.repeat(active_set, active_set.shape[0]) + coo_cols = np.tile(active_set, active_set.shape[0]) + + state.hessian = sparse.coo_matrix( + (coo_data, (coo_rows, coo_cols)), + shape=(state.coef.shape[0], state.coef.shape[0]), + ) + else: + state.hessian[np.ix_(active_set, active_set)] = hessian_init + state.hessian_initialized = True n_active_rows = data.X.shape[0] + else: # In an update iteration, we want to: @@ -176,13 +192,31 @@ def update_hessian(state, data, active_set): active_rows=active_rows, active_cols=active_set, ) - state.hessian[np.ix_(active_set, active_set)] += hessian_delta + + if state.use_sparse_hessian: + coo_data = hessian_delta.flatten() + coo_rows = np.repeat(active_set, active_set.shape[0]) + coo_cols = np.tile(active_set, active_set.shape[0]) + + state.hessian += sparse.coo_matrix( + (coo_data, (coo_rows, coo_cols)), + shape=(state.coef.shape[0], state.coef.shape[0]), + ) + else: + state.hessian[np.ix_(active_set, active_set)] += hessian_delta + n_active_rows = active_rows.shape[0] - return ( - state.hessian[np.ix_(active_set, active_set)], - n_active_rows, - ) + # If state.hessian is in COO form, we have to convert to CSC in order to index the + # active set elements, which we then convert to a numpy array for compatibility with + # the rest of the code + if state.use_sparse_hessian: + return ( + state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(), + n_active_rows, + ) + else: + return state.hessian[np.ix_(active_set, active_set)], n_active_rows def _is_subset(x, y): @@ -208,6 +242,8 @@ def build_hessian_delta( """ idx = 1 if intercept else 0 active_cols_non_intercept = active_cols[idx:] - idx + # Here is the safe sandwich dot that calls sandwich + # active_cols is small even for a large number of columns (2304 for 1M) delta = _safe_sandwich_dot( X, hessian_rows, active_rows, active_cols_non_intercept, intercept ) @@ -271,7 +307,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L ---------- inner_solver A least squares solver that can handle the appropriate penalties. With - an L1 penalty, this will _cd_solver. With only an L2 penalty, + an L1 penalty, this will be _cd_solver. With only an L2 penalty, _least_squares_solver will be more efficient. coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. 
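
Aside: the hunks above scatter a dense (k, k) active-set block into the full n x n
sparse Hessian, using np.repeat/np.tile to build the COO coordinates. A minimal
NumPy/SciPy sketch of that scatter, with illustrative names (`scatter_active_block`,
`n` for the total number of coefficients) that are not part of the patch:

    import numpy as np
    from scipy import sparse

    def scatter_active_block(hessian_init, active_set, n):
        # Entry (i, j) of the dense block lands at (active_set[i], active_set[j])
        # in the full n x n Hessian, stored as a sparse COO matrix.
        k = active_set.shape[0]
        coo_data = hessian_init.ravel()
        coo_rows = np.repeat(active_set, k)
        coo_cols = np.tile(active_set, k)
        return sparse.coo_matrix((coo_data, (coo_rows, coo_cols)), shape=(n, n))

    # Example: a 2-element active set inside a 5-coefficient problem.
    block = np.array([[2.0, 1.0], [1.0, 3.0]])
    H = scatter_active_block(block, np.array([1, 3]), 5).tocsc()
    assert H[1, 3] == 1.0 and H[3, 3] == 3.0

Adding a second COO block to the stored Hessian, as the update branch above does,
reproduces the dense `+=` on the indexed sub-block.
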
@@ -528,9 +564,19 @@ def __init__(self, coef, data): self.old_hessian_rows = np.zeros(data.X.shape[0], dtype=data.X.dtype) self.gradient_rows = None self.hessian_rows = None - self.hessian = np.zeros( - (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype - ) + + # In order to avoid memory issues for very wide datasets (e.g. Issue #485), we + # can instantiate the Hessian as a sparse COO matrix. + self.use_sparse_hessian = True + if self.use_sparse_hessian: + self.hessian = sparse.coo_matrix( + (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype + ) + else: + self.hessian = np.zeros( + (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype + ) + self.hessian_initialized = False self.coef_P2 = None self.norm_min_subgrad = None From 2bddab6f4f88167bde0052a7addfdbb9504a12fd Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Tue, 17 May 2022 21:37:36 +0100 Subject: [PATCH 2/9] fisher_diag seems to work --- src/glum/_cd_fast.pyx | 301 +++++++++++++++++++++++++++++++++++++++++- src/glum/_solvers.py | 122 +++++++++++++---- 2 files changed, 391 insertions(+), 32 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index b4bcf19b..0a2ed5b6 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -9,7 +9,7 @@ # # cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport fabs -cimport numpy as np +cimport numpy as np # is it a problem that these two imports have same name??? import numpy as np import numpy.linalg as linalg from numpy.math cimport INFINITY @@ -107,6 +107,295 @@ def enet_coordinate_descent_gram(int[::1] active_set, floating[:] lower_bounds, bint has_upper_bounds, floating[:] upper_bounds): + """Cython version of the coordinate descent algorithm + for Elastic-Net regression + We minimize + (1/2) * w^T Q w - q^T w + P1 norm(w, 1) + which amounts to the L1 problem when: + Q = X^T X (Gram matrix) + q = X^T y + """ + + # get the data information into easy vars + cdef unsigned int n_active_features = active_set.shape[0] + cdef unsigned int n_features = Q.shape[0] # are you sure this is CORRECT? + # n_features is exactly now the same as n_active_features; should it be q.shape[0]? + # however, the variable is never used again, so I guess we are okay + + cdef floating w_ii + cdef floating P1_ii + cdef floating qii_temp + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating d_w_tol = tol + cdef floating norm_min_subgrad = 0 + cdef floating max_min_subgrad + cdef unsigned int active_set_ii, active_set_jj + cdef unsigned int ii, jj + cdef int n_iter = 0 + cdef unsigned int f_iter + cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef UINT32_t* rand_r_state = &rand_r_state_seed + + with nogil: + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_active_features): # Loop over coordinates + if random: + active_set_ii = rand_int(n_active_features, rand_r_state) + else: + active_set_ii = f_iter # _ii is an index + ii = active_set[active_set_ii] # odd syntax; gets the active set's row + + if ii < intercept: # but `intercept` is binary!? 
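+                    # `intercept` is a bint (0 or 1), so this branch is taken only for
+                    # the intercept coefficient (ii == 0 when an intercept is fitted),
+                    # which is never L1-penalized.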
+ P1_ii = 0.0 + else: + P1_ii = P1[ii - intercept] + + if Q[active_set_ii, active_set_ii] == 0.0: # no need to multiply zeros + continue + + w_ii = w[ii] # Store previous value, the iith element of w + + qii_temp = q[ii] - w[ii] * Q[active_set_ii, active_set_ii] + w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q[active_set_ii, active_set_ii] + + if ii >= intercept: + if has_lower_bounds: + if w[ii] < lower_bounds[ii - intercept]: + w[ii] = lower_bounds[ii - intercept] + if has_upper_bounds: + if w[ii] > upper_bounds[ii - intercept]: + w[ii] = upper_bounds[ii - intercept] + + if w[ii] != 0.0 or w_ii != 0.0: + # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) + for active_set_jj in range(n_active_features): + jj = active_set[active_set_jj] + q[jj] += (w[ii] - w_ii) * Q[active_set_ii, active_set_jj] + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + if d_w_ii > d_w_max: + d_w_max = d_w_ii + + if fabs(w[ii]) > w_max: + w_max = fabs(w[ii]) + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the minimum norm subgradient as the + # ultimate stopping criterion + cython_norm_min_subgrad( + active_set, + w, q, P1, intercept, + has_lower_bounds, lower_bounds, has_upper_bounds, upper_bounds, + &norm_min_subgrad, &max_min_subgrad + ) + if norm_min_subgrad <= tol: + break + else: + # for/else, runs if for doesn't end with a `break` + with gil: + warnings.warn("Coordinate descent did not converge. You might want to " + "increase the number of iterations. Minimum norm " + "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), + ConvergenceWarning) + + return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 + +def enet_coordinate_descent_gram_diag_fisher( + #floating[:,:] X, + #floating[:] hessian_rows, + np.ndarray[np.float64_t, ndim=2] X, + np.ndarray[np.float64_t, ndim=1] hessian_rows, + int[::1] active_set, + floating[::1] w, + floating[::1] P1, + floating[:,:] Q, + floating[::1] q, + int max_iter, floating tol, object rng, + bint intercept, bint random, + bint has_lower_bounds, + floating[:] lower_bounds, + bint has_upper_bounds, + floating[:] upper_bounds): + """Cython version of the coordinate descent algorithm + for Elastic-Net regression + We minimize + (1/2) * w^T Q w - q^T w + P1 norm(w, 1) + which amounts to the L1 problem when: + Q = X^T X (Gram matrix) + q = X^T y + """ + # The CD that never calculates the entire Hessian matrix; only rows, when they are + # necessary. 
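+    # In other words, instead of forming Q = X^T diag(hessian_rows) X up front,
+    # each coordinate update recomputes just the single row it needs,
+    #     Q[active_set_ii, :] = X^T (hessian_rows * x_j),
+    # where x_j is the column of X matching the coordinate being updated.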
+ + # get the data information into easy vars + cdef unsigned int n_active_features = active_set.shape[0] + # cdef unsigned int n_features = q.shape[0] # check that this is the same as Q.shape[0] + + cdef floating w_ii + cdef floating P1_ii + cdef floating qii_temp + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating d_w_tol = tol + cdef floating norm_min_subgrad = 0 + cdef floating max_min_subgrad + cdef unsigned int active_set_ii, active_set_jj + cdef unsigned int ii, jj + cdef int n_iter = 0 + cdef unsigned int f_iter + cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef UINT32_t* rand_r_state = &rand_r_state_seed + + #cdef np.ndarray[np.float64_t, dtype=q.dtype] Qj # will hold rows of the Q matrix + # cdef floating[:] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) + cdef np.ndarray[np.float64_t, ndim=1] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) + # is empty bad? + + # used to check correctness of code - compare with Q from non-diag-fisher + cdef np.ndarray[np.float64_t, ndim=2] Q_check = np.empty((n_active_features, n_active_features), dtype=np.float64) + + # THE inefficiency here is that we must re-calculate the rows of Q for ever iteration. + + # with nogil: + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_active_features): # Loop over coordinates + if random: + active_set_ii = rand_int(n_active_features, rand_r_state) + else: + active_set_ii = f_iter # _ii is an index + ii = active_set[active_set_ii] # odd syntax; gets an active row + + # remember, Q hasn't been created yet. + # and the only row we need for this iteration is Q[active_set_ii] + # also remember, sandwich product is between data.X, state.hessian_rows + # only use first column of X in the X^T part of the sandwich + + # if intercept == 1: # maybe this should go into the next if statement? + # Q_active_set_ii[0] = hessian_rows.sum() + # this is perfectly valid because hessian_rows has length 16087 + + if active_set[0] < intercept: + if ii == 0: + Q_active_set_ii[0] = hessian_rows.sum() + Q_active_set_ii[1:] = np.matmul(hessian_rows, X) + else: + Q_active_set_ii[0] = np.dot(hessian_rows, X[:, active_set_ii - 1]) + Q_active_set_ii[1:] = np.matmul((hessian_rows * X[:, active_set_ii - 1]), X) + + # if ii < intercept: + # # Q_active_set_ii[1:] = np.matmul(hessian_rows, X[:, 1:]) + # # Q_active_set_ii[0] = hessian_rows.sum() + # Q_active_set_ii[1:] = np.matmul(hessian_rows, X) + + # the next condition in the original code checks for sparseness. + # we won't worry about that here. but we will have to go elem by elem + # actually no, we don't have to go elem by elem. Cython can handle. + # do i have to worry about intercept term? or does X already ignore it? + + # what about the active features? do we only need to multiply by those? + # correct. i believe we only need to multiply by those + # so X can actually be passed in as X[(active_set_ii) cartesian product (active_set_ii)], right? + # the only problem is that it would mess up the active_set_ii / active_set_jj logic... + + # NB! We care about the intercept column as well, and must treat differently. + # does feature_list in sklearnfork include the intercept column? 
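+                # (When an intercept is fitted, coefficient 0 has no column in X: its
+                # implicit column is all ones, hence hessian_rows.sum() for the (0, 0)
+                # entry and the `- 1` shift into X for the remaining entries.)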
+ else: + Q_active_set_ii = np.matmul((hessian_rows * X[:, active_set_ii]), X) + # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii - intercept]), X) + # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii]), X[:, intercept:]) + # CHECK that we've covered all the cases!!!!! + + + # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, intercept + active_set_ii]), X) + + Q_check[active_set_ii] = Q_active_set_ii + + # for col_idx in range(X.shape[1]): + # for row_idx in range(X.shape[0]): + # Q_active_set_ii[intercept + col_idx] = + + # the next if statement checks for P2, but we're only using this for L1 + # regularization, so we don't need to worry about it. and that's all! + + # If the above works, can merge the following into the above if-else!!!! + if ii < intercept: # equiv. to (ii=0 and intercept=1) + P1_ii = 0.0 + else: + P1_ii = P1[ii - intercept] + + if Q_active_set_ii[active_set_ii] == 0.0: # no need to multiply zeros + continue + + w_ii = w[ii] # Store previous value, the iith element of w + + qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] ## + w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q_active_set_ii[active_set_ii] + + if ii >= intercept: # this is big-brain logic + if has_lower_bounds: + if w[ii] < lower_bounds[ii - intercept]: + w[ii] = lower_bounds[ii - intercept] + if has_upper_bounds: + if w[ii] > upper_bounds[ii - intercept]: + w[ii] = upper_bounds[ii - intercept] + + if w[ii] != 0.0 or w_ii != 0.0: + # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) + for active_set_jj in range(n_active_features): + jj = active_set[active_set_jj] + # this is equivalent to A += Bj * z + q[jj] += (w[ii] - w_ii) * Q_active_set_ii[active_set_jj] + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + if d_w_ii > d_w_max: + d_w_max = d_w_ii + + if fabs(w[ii]) > w_max: + w_max = fabs(w[ii]) + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the minimum norm subgradient as the + # ultimate stopping criterion + cython_norm_min_subgrad( + active_set, + w, q, P1, intercept, + has_lower_bounds, lower_bounds, has_upper_bounds, upper_bounds, + &norm_min_subgrad, &max_min_subgrad + ) + if norm_min_subgrad <= tol: + break + else: + # for/else, runs if for doesn't end with a `break` + # with gil: + warnings.warn("Coordinate descent did not converge. You might want to " + "increase the number of iterations. Minimum norm " + "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), + ConvergenceWarning) + + return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1, Q_check + +def enet_coordinate_descent_gram_diag_only(int[::1] active_set, + floating[::1] w, + floating[::1] P1, + floating[:,:] Q, + floating[::1] q, + int max_iter, floating tol, object rng, + bint intercept, bint random, + bint has_lower_bounds, + floating[:] lower_bounds, + bint has_upper_bounds, + floating[:] upper_bounds): """Cython version of the coordinate descent algorithm for Elastic-Net regression We minimize @@ -115,6 +404,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, Q = X^T X (Gram matrix) q = X^T y """ + # In this version, we don't access the off-diagonal elements. And see what happens. 
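+    # Concretely: each update of w[ii] divides by Q[active_set_ii, active_set_ii]
+    # alone, and q is never refreshed with off-diagonal terms (the q[jj] update loop
+    # below is commented out), so this behaves like a diagonal (Jacobi-style)
+    # approximation rather than exact coordinate descent.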
# get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] @@ -168,11 +458,10 @@ def enet_coordinate_descent_gram(int[::1] active_set, if w[ii] > upper_bounds[ii - intercept]: w[ii] = upper_bounds[ii - intercept] - if w[ii] != 0.0 or w_ii != 0.0: - # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) - for active_set_jj in range(n_active_features): - jj = active_set[active_set_jj] - q[jj] += (w[ii] - w_ii) * Q[active_set_ii, active_set_jj] + # if w[ii] != 0.0 or w_ii != 0.0: + # for active_set_jj in range(n_active_features): + # jj = active_set[active_set_jj] + # q[jj] += (w[ii] - w_ii) * Q[active_set_ii, active_set_jj] # update the maximum absolute coefficient update d_w_ii = fabs(w[ii] - w_ii) diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index bb693cee..950daef8 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -15,6 +15,7 @@ from ._cd_fast import ( _norm_min_subgrad, enet_coordinate_descent_gram, + enet_coordinate_descent_gram_diag_fisher, identify_active_rows, ) from ._distribution import ExponentialDispersionModel @@ -56,23 +57,62 @@ def _least_squares_solver(state, data, hessian): @timeit("inner_solver_runtime") -def _cd_solver(state, data, active_hessian): - new_coef, gap, _, _, n_cycles = enet_coordinate_descent_gram( - state.active_set, - state.coef.copy(), - data.P1, - active_hessian, - -state.score, - data.max_inner_iter, - state.inner_tol, - data.random_state, - data.fit_intercept, - data.selection == "random", - data.has_lower_bounds, - data._lower_bounds, - data.has_upper_bounds, - data._upper_bounds, - ) +def _cd_solver(state, data, active_hessian, diag_fisher=False): + if diag_fisher: + if data.fit_intercept: + if 0 in state.active_set: + # remove the zero, subtract one from everything else + X_active = data.X[:, state.active_set[1:] - 1].A + else: + X_active = data.X[:, state.active_set - 1].A + else: + X_active = data.X[:, state.active_set].A + # instead of passing in data.X, we should pass in + # but remember! X is sparse. But X is not necessarily square + # we may need active rows and columns + # X_active = data.X[:, state.active_set].A + ( + new_coef, + gap, + _, + _, + n_cycles, + Q_check, + ) = enet_coordinate_descent_gram_diag_fisher( + X_active, # new + state.hessian_rows, # new + state.active_set, + state.coef.copy(), + data.P1, + active_hessian, + -state.score, + data.max_inner_iter, + state.inner_tol, + data.random_state, + data.fit_intercept, + data.selection == "random", + data.has_lower_bounds, + data._lower_bounds, + data.has_upper_bounds, + data._upper_bounds, + ) + else: + new_coef, gap, _, _, n_cycles = enet_coordinate_descent_gram( + state.active_set, + state.coef.copy(), + data.P1, + active_hessian, + -state.score, + data.max_inner_iter, + state.inner_tol, + data.random_state, + data.fit_intercept, + data.selection == "random", + data.has_lower_bounds, + data._lower_bounds, + data.has_upper_bounds, + data._upper_bounds, + ) return new_coef - state.coef, n_cycles @@ -143,16 +183,18 @@ def update_hessian(state, data, active_set): # 2) use all the rows # 3) Include the P2 components # 4) just like an update, we only update the active_set + + # this is not needed for diag_fisher! 
hessian_init = build_hessian_delta( data.X, state.hessian_rows, # this is the d that goes into sandwich data.fit_intercept, data.P2, - np.arange(data.X.shape[0], dtype=np.int32), - active_set, + np.arange(data.X.shape[0], dtype=np.int32), # this uses all rows (16087) + active_set, # (17650) ) - # In the sparse Hessian case, stsate.hessian is a coo_matrix. + # In the sparse Hessian case, state.hessian is a coo_matrix. # hessian_init is a numpy array of size active_set x active_set; # we can convert this to a coo_matrix and simply add it to the state.hessian if state.use_sparse_hessian: @@ -169,6 +211,7 @@ def update_hessian(state, data, active_set): state.hessian_initialized = True n_active_rows = data.X.shape[0] + # fortunately, none of these above variables are required else: @@ -178,6 +221,9 @@ def update_hessian(state, data, active_set): # 3) Ignore the P2 components because those don't change and have # already been added # 4) only update the active set subset of the hessian. + + # in the diag_fisher case, we don't want to use the diff, we use the whole thing + hessian_rows_diff, active_rows = identify_active_rows( state.gradient_rows, state.hessian_rows, @@ -301,6 +347,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L Repeat steps 1-3 until convergence. Note: Use hessian matrix instead of Hessian for H. + This used to say "Use Fisher matrix instead of Hessian for H." Note: f' = -score, H = hessian matrix Parameters @@ -346,12 +393,35 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L state.old_active_set = state.active_set state.active_set = identify_active_set(state, data) - # 0) Build the hessian - hessian, state.n_active_rows = update_hessian(state, data, state.active_set) + diag_fisher = True + + if not diag_fisher: + # 0) Build the hessian + hessian, state.n_active_rows = update_hessian( + state, data, state.active_set + ) + # state.n_active_rows never gets used again in this function + # may be used in logging, but we'll ignore for diag_fisher case + + # 1) Solve the L1 and L2 penalized least squares problem + d, n_cycles_this_iter = inner_solver( + state, data, hessian, diag_fisher=diag_fisher + ) + state.n_cycles += n_cycles_this_iter - # 1) Solve the L1 and L2 penalized least squares problem - d, n_cycles_this_iter = inner_solver(state, data, hessian) - state.n_cycles += n_cycles_this_iter + else: + # we need a hessian update step here + # but then we'd store mxm, which is undesired + # we'll instead store hessian diag, but have to update it first + # update_hessian_diag_fisher() # will update state.hessian_rows + # not needed! because of update_quadratic() below + + # sandwich product is between data.X, state.hessian_rows + # only use first row of X in the X^T part of the sandwich + d, n_cycles_this_iter = inner_solver( + state, data, None, diag_fisher=diag_fisher + ) + state.n_cycles += n_cycles_this_iter # 2) Line search state.old_hessian_rows[:] = state.hessian_rows @@ -368,7 +438,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L # 3) Update the quadratic approximation state.gradient_rows, state.score, state.hessian_rows = update_quadratic( state, data, coef_P2 - ) + ) # is this where we update state.hessian_rows?? 
if so, no need for diff :) # 4) Check if we've converged ( From 01fab32fdbf4b66d3fc8162f6c4a15f84cdea591 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Wed, 18 May 2022 15:08:30 +0100 Subject: [PATCH 3/9] Pass sparse array into _cd_fast.pyx instead of dense array. --- src/glum/_cd_fast.pyx | 36 +++++++++++++++++++++++++++--------- src/glum/_solvers.py | 13 ++++++++----- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index 0a2ed5b6..2d4415e7 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -209,7 +209,8 @@ def enet_coordinate_descent_gram(int[::1] active_set, def enet_coordinate_descent_gram_diag_fisher( #floating[:,:] X, #floating[:] hessian_rows, - np.ndarray[np.float64_t, ndim=2] X, + #np.ndarray[np.float64_t, ndim=2] X, + X, np.ndarray[np.float64_t, ndim=1] hessian_rows, int[::1] active_set, floating[::1] w, @@ -258,10 +259,12 @@ def enet_coordinate_descent_gram_diag_fisher( cdef np.ndarray[np.float64_t, ndim=1] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) # is empty bad? + cdef cur_col # makes sure we can overwrite this and not take up too much memory? + # used to check correctness of code - compare with Q from non-diag-fisher - cdef np.ndarray[np.float64_t, ndim=2] Q_check = np.empty((n_active_features, n_active_features), dtype=np.float64) + # cdef np.ndarray[np.float64_t, ndim=2] Q_check = np.empty((n_active_features, n_active_features), dtype=np.float64) - # THE inefficiency here is that we must re-calculate the rows of Q for ever iteration. + # THE inefficiency here is that we must re-calculate the rows of Q for every iteration. # with nogil: for n_iter in range(max_iter): @@ -286,10 +289,20 @@ def enet_coordinate_descent_gram_diag_fisher( if active_set[0] < intercept: if ii == 0: Q_active_set_ii[0] = hessian_rows.sum() - Q_active_set_ii[1:] = np.matmul(hessian_rows, X) + Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows) + # Q_active_set_ii[1:] = np.matmul(hessian_rows, X) else: - Q_active_set_ii[0] = np.dot(hessian_rows, X[:, active_set_ii - 1]) - Q_active_set_ii[1:] = np.matmul((hessian_rows * X[:, active_set_ii - 1]), X) + cur_col = X[:, active_set_ii - 1] + Q_active_set_ii[0] = cur_col.transpose_matvec(hessian_rows) + Q_active_set_ii[1:] = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) + + # Q_active_set_ii[0] = X[:, active_set_ii - 1].transpose_matvec(hessian_rows) + # Q_active_set_ii[1:] = X.transpose_matvec(X[:, active_set_ii - 1].multiply(hessian_rows).ravel()) + + # Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows * X[:, active_set_ii - 1].A.ravel()) + # Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows * X.A[:, active_set_ii - 1]) + # Q_active_set_ii[0] = np.dot(hessian_rows, X[:, active_set_ii - 1]) + # Q_active_set_ii[1:] = np.matmul((hessian_rows * X[:, active_set_ii - 1]), X) # use matvec # if ii < intercept: # # Q_active_set_ii[1:] = np.matmul(hessian_rows, X[:, 1:]) @@ -309,7 +322,12 @@ def enet_coordinate_descent_gram_diag_fisher( # NB! We care about the intercept column as well, and must treat differently. # does feature_list in sklearnfork include the intercept column? 
else: - Q_active_set_ii = np.matmul((hessian_rows * X[:, active_set_ii]), X) + cur_col = X[:, active_set_ii] + Q_active_set_ii = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) + + # Q_active_set_ii = X.transpose_matvec(hessian_rows * X[:, active_set_ii].A.ravel()) + # Q_active_set_ii = X.transpose_matvec(hessian_rows * X.A[:, active_set_ii]) + # Q_active_set_ii = np.matmul((hessian_rows * X[:, active_set_ii]), X) # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii - intercept]), X) # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii]), X[:, intercept:]) # CHECK that we've covered all the cases!!!!! @@ -317,7 +335,7 @@ def enet_coordinate_descent_gram_diag_fisher( # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, intercept + active_set_ii]), X) - Q_check[active_set_ii] = Q_active_set_ii + # Q_check[active_set_ii] = Q_active_set_ii # for checking correctness # for col_idx in range(X.shape[1]): # for row_idx in range(X.shape[0]): @@ -383,7 +401,7 @@ def enet_coordinate_descent_gram_diag_fisher( "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), ConvergenceWarning) - return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1, Q_check + return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 # , Q_check def enet_coordinate_descent_gram_diag_only(int[::1] active_set, floating[::1] w, diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index 950daef8..662cd13f 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -60,13 +60,16 @@ def _least_squares_solver(state, data, hessian): def _cd_solver(state, data, active_hessian, diag_fisher=False): if diag_fisher: if data.fit_intercept: - if 0 in state.active_set: + # the following line should be equivalent to the if/else below it + # X_active = data.X[:, state.active_set[(state.active_set[0] == 0):] - 1].A + if state.active_set[0] == 0: # remove the zero, subtract one from everything else - X_active = data.X[:, state.active_set[1:] - 1].A + X_active = data.X[:, state.active_set[1:] - 1] # .A else: - X_active = data.X[:, state.active_set - 1].A + X_active = data.X[:, state.active_set - 1] # .A else: - X_active = data.X[:, state.active_set].A + # for some reason, active_set here is massive (everything!) + X_active = data.X[:, state.active_set] # .A # instead of passing in data.X, we should pass in # but remember! X is sparse. But X is not necessarily square # we may need active rows and columns @@ -77,7 +80,7 @@ def _cd_solver(state, data, active_hessian, diag_fisher=False): _, _, n_cycles, - Q_check, + # Q_check, ) = enet_coordinate_descent_gram_diag_fisher( X_active, # new state.hessian_rows, # new From 149f890ff2e8ba44f427ce78c1d08fe5f1327d88 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Wed, 18 May 2022 17:57:15 +0100 Subject: [PATCH 4/9] Cleaned and ready for review. --- src/glum/_cd_fast.pyx | 231 ++++++------------------------------------ src/glum/_glm.py | 17 ++++ src/glum/_solvers.py | 80 +++++---------- 3 files changed, 69 insertions(+), 259 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index 2d4415e7..73aea8cb 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -9,7 +9,7 @@ # # cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport fabs -cimport numpy as np # is it a problem that these two imports have same name??? 
+cimport numpy as np import numpy as np import numpy.linalg as linalg from numpy.math cimport INFINITY @@ -116,9 +116,8 @@ def enet_coordinate_descent_gram(int[::1] active_set, q = X^T y """ - # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] - cdef unsigned int n_features = Q.shape[0] # are you sure this is CORRECT? + cdef unsigned int n_features = Q.shape[0] # are you sure this is correct? # n_features is exactly now the same as n_active_features; should it be q.shape[0]? # however, the variable is never used again, so I guess we are okay @@ -142,22 +141,22 @@ def enet_coordinate_descent_gram(int[::1] active_set, for n_iter in range(max_iter): w_max = 0.0 d_w_max = 0.0 - for f_iter in range(n_active_features): # Loop over coordinates + for f_iter in range(n_active_features): if random: active_set_ii = rand_int(n_active_features, rand_r_state) else: - active_set_ii = f_iter # _ii is an index - ii = active_set[active_set_ii] # odd syntax; gets the active set's row + active_set_ii = f_iter + ii = active_set[active_set_ii] - if ii < intercept: # but `intercept` is binary!? + if ii < intercept: P1_ii = 0.0 else: P1_ii = P1[ii - intercept] - if Q[active_set_ii, active_set_ii] == 0.0: # no need to multiply zeros + if Q[active_set_ii, active_set_ii] == 0.0: continue - w_ii = w[ii] # Store previous value, the iith element of w + w_ii = w[ii] qii_temp = q[ii] - w[ii] * Q[active_set_ii, active_set_ii] w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q[active_set_ii, active_set_ii] @@ -207,15 +206,11 @@ def enet_coordinate_descent_gram(int[::1] active_set, return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 def enet_coordinate_descent_gram_diag_fisher( - #floating[:,:] X, - #floating[:] hessian_rows, - #np.ndarray[np.float64_t, ndim=2] X, X, np.ndarray[np.float64_t, ndim=1] hessian_rows, int[::1] active_set, floating[::1] w, floating[::1] P1, - floating[:,:] Q, floating[::1] q, int max_iter, floating tol, object rng, bint intercept, bint random, @@ -230,13 +225,13 @@ def enet_coordinate_descent_gram_diag_fisher( which amounts to the L1 problem when: Q = X^T X (Gram matrix) q = X^T y + + This version, for diag_fisher=True, never calculates the entire Hessian matrix; + only rows, when necessary. The inefficiency here is that we must re-calculate + the rows of Q for every iteration up to max_iter. """ - # The CD that never calculates the entire Hessian matrix; only rows, when they are - # necessary. - # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] - # cdef unsigned int n_features = q.shape[0] # check that this is the same as Q.shape[0] cdef floating w_ii cdef floating P1_ii @@ -254,111 +249,51 @@ def enet_coordinate_descent_gram_diag_fisher( cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) cdef UINT32_t* rand_r_state = &rand_r_state_seed - #cdef np.ndarray[np.float64_t, dtype=q.dtype] Qj # will hold rows of the Q matrix - # cdef floating[:] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) + # Equivalent to Q[active_set_ii] in the diag_fisher=False version cdef np.ndarray[np.float64_t, ndim=1] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) - # is empty bad? - - cdef cur_col # makes sure we can overwrite this and not take up too much memory? 
- - # used to check correctness of code - compare with Q from non-diag-fisher - # cdef np.ndarray[np.float64_t, ndim=2] Q_check = np.empty((n_active_features, n_active_features), dtype=np.float64) - # THE inefficiency here is that we must re-calculate the rows of Q for every iteration. - - # with nogil: + cdef cur_col # Does the cdef here (rather than using it as a local variable) reduce memory? + + # nogil is incompatible with our slicing, numpy, and sparse matrix operations, but + # may be necessary to increase performance + # with nogil: for n_iter in range(max_iter): w_max = 0.0 d_w_max = 0.0 - for f_iter in range(n_active_features): # Loop over coordinates + for f_iter in range(n_active_features): if random: active_set_ii = rand_int(n_active_features, rand_r_state) else: - active_set_ii = f_iter # _ii is an index - ii = active_set[active_set_ii] # odd syntax; gets an active row - - # remember, Q hasn't been created yet. - # and the only row we need for this iteration is Q[active_set_ii] - # also remember, sandwich product is between data.X, state.hessian_rows - # only use first column of X in the X^T part of the sandwich + active_set_ii = f_iter + ii = active_set[active_set_ii] - # if intercept == 1: # maybe this should go into the next if statement? - # Q_active_set_ii[0] = hessian_rows.sum() - # this is perfectly valid because hessian_rows has length 16087 - - if active_set[0] < intercept: + # Check logic here + if active_set[0] < intercept: if ii == 0: Q_active_set_ii[0] = hessian_rows.sum() Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows) - # Q_active_set_ii[1:] = np.matmul(hessian_rows, X) else: - cur_col = X[:, active_set_ii - 1] + cur_col = X[:, active_set_ii - 1] # Does this reduce memory? Q_active_set_ii[0] = cur_col.transpose_matvec(hessian_rows) Q_active_set_ii[1:] = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) - - # Q_active_set_ii[0] = X[:, active_set_ii - 1].transpose_matvec(hessian_rows) - # Q_active_set_ii[1:] = X.transpose_matvec(X[:, active_set_ii - 1].multiply(hessian_rows).ravel()) - - # Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows * X[:, active_set_ii - 1].A.ravel()) - # Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows * X.A[:, active_set_ii - 1]) - # Q_active_set_ii[0] = np.dot(hessian_rows, X[:, active_set_ii - 1]) - # Q_active_set_ii[1:] = np.matmul((hessian_rows * X[:, active_set_ii - 1]), X) # use matvec - - # if ii < intercept: - # # Q_active_set_ii[1:] = np.matmul(hessian_rows, X[:, 1:]) - # # Q_active_set_ii[0] = hessian_rows.sum() - # Q_active_set_ii[1:] = np.matmul(hessian_rows, X) - - # the next condition in the original code checks for sparseness. - # we won't worry about that here. but we will have to go elem by elem - # actually no, we don't have to go elem by elem. Cython can handle. - # do i have to worry about intercept term? or does X already ignore it? - - # what about the active features? do we only need to multiply by those? - # correct. i believe we only need to multiply by those - # so X can actually be passed in as X[(active_set_ii) cartesian product (active_set_ii)], right? - # the only problem is that it would mess up the active_set_ii / active_set_jj logic... - - # NB! We care about the intercept column as well, and must treat differently. - # does feature_list in sklearnfork include the intercept column? 
else: cur_col = X[:, active_set_ii] Q_active_set_ii = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) - # Q_active_set_ii = X.transpose_matvec(hessian_rows * X[:, active_set_ii].A.ravel()) - # Q_active_set_ii = X.transpose_matvec(hessian_rows * X.A[:, active_set_ii]) - # Q_active_set_ii = np.matmul((hessian_rows * X[:, active_set_ii]), X) - # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii - intercept]), X) - # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii]), X[:, intercept:]) - # CHECK that we've covered all the cases!!!!! - - - # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, intercept + active_set_ii]), X) - - # Q_check[active_set_ii] = Q_active_set_ii # for checking correctness - - # for col_idx in range(X.shape[1]): - # for row_idx in range(X.shape[0]): - # Q_active_set_ii[intercept + col_idx] = - - # the next if statement checks for P2, but we're only using this for L1 - # regularization, so we don't need to worry about it. and that's all! - - # If the above works, can merge the following into the above if-else!!!! - if ii < intercept: # equiv. to (ii=0 and intercept=1) + if ii < intercept: P1_ii = 0.0 else: P1_ii = P1[ii - intercept] - if Q_active_set_ii[active_set_ii] == 0.0: # no need to multiply zeros + if Q_active_set_ii[active_set_ii] == 0.0: continue - w_ii = w[ii] # Store previous value, the iith element of w + w_ii = w[ii] - qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] ## + qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q_active_set_ii[active_set_ii] - if ii >= intercept: # this is big-brain logic + if ii >= intercept: if has_lower_bounds: if w[ii] < lower_bounds[ii - intercept]: w[ii] = lower_bounds[ii - intercept] @@ -370,7 +305,6 @@ def enet_coordinate_descent_gram_diag_fisher( # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) for active_set_jj in range(n_active_features): jj = active_set[active_set_jj] - # this is equivalent to A += Bj * z q[jj] += (w[ii] - w_ii) * Q_active_set_ii[active_set_jj] # update the maximum absolute coefficient update @@ -403,113 +337,6 @@ def enet_coordinate_descent_gram_diag_fisher( return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 # , Q_check -def enet_coordinate_descent_gram_diag_only(int[::1] active_set, - floating[::1] w, - floating[::1] P1, - floating[:,:] Q, - floating[::1] q, - int max_iter, floating tol, object rng, - bint intercept, bint random, - bint has_lower_bounds, - floating[:] lower_bounds, - bint has_upper_bounds, - floating[:] upper_bounds): - """Cython version of the coordinate descent algorithm - for Elastic-Net regression - We minimize - (1/2) * w^T Q w - q^T w + P1 norm(w, 1) - which amount to the Elastic-Net problem when: - Q = X^T X (Gram matrix) - q = X^T y - """ - # In this version, we don't access the off-diagonal elements. And see what happens. 
- - # get the data information into easy vars - cdef unsigned int n_active_features = active_set.shape[0] - cdef unsigned int n_features = Q.shape[0] - - cdef floating w_ii - cdef floating P1_ii - cdef floating qii_temp - cdef floating d_w_max - cdef floating w_max - cdef floating d_w_ii - cdef floating d_w_tol = tol - cdef floating norm_min_subgrad = 0 - cdef floating max_min_subgrad - cdef unsigned int active_set_ii, active_set_jj - cdef unsigned int ii, jj - cdef int n_iter = 0 - cdef unsigned int f_iter - cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) - cdef UINT32_t* rand_r_state = &rand_r_state_seed - - with nogil: - for n_iter in range(max_iter): - w_max = 0.0 - d_w_max = 0.0 - for f_iter in range(n_active_features): # Loop over coordinates - if random: - active_set_ii = rand_int(n_active_features, rand_r_state) - else: - active_set_ii = f_iter - ii = active_set[active_set_ii] - - if ii < intercept: - P1_ii = 0.0 - else: - P1_ii = P1[ii - intercept] - - if Q[active_set_ii, active_set_ii] == 0.0: - continue - - w_ii = w[ii] # Store previous value - - qii_temp = q[ii] - w[ii] * Q[active_set_ii, active_set_ii] - w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q[active_set_ii, active_set_ii] - - if ii >= intercept: - if has_lower_bounds: - if w[ii] < lower_bounds[ii - intercept]: - w[ii] = lower_bounds[ii - intercept] - if has_upper_bounds: - if w[ii] > upper_bounds[ii - intercept]: - w[ii] = upper_bounds[ii - intercept] - - # if w[ii] != 0.0 or w_ii != 0.0: - # for active_set_jj in range(n_active_features): - # jj = active_set[active_set_jj] - # q[jj] += (w[ii] - w_ii) * Q[active_set_ii, active_set_jj] - - # update the maximum absolute coefficient update - d_w_ii = fabs(w[ii] - w_ii) - if d_w_ii > d_w_max: - d_w_max = d_w_ii - - if fabs(w[ii]) > w_max: - w_max = fabs(w[ii]) - - if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: - # the biggest coordinate update of this iteration was smaller than - # the tolerance: check the minimum norm subgradient as the - # ultimate stopping criterion - cython_norm_min_subgrad( - active_set, - w, q, P1, intercept, - has_lower_bounds, lower_bounds, has_upper_bounds, upper_bounds, - &norm_min_subgrad, &max_min_subgrad - ) - if norm_min_subgrad <= tol: - break - else: - # for/else, runs if for doesn't end with a `break` - with gil: - warnings.warn("Coordinate descent did not converge. You might want to " - "increase the number of iterations. 
Minimum norm " - "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), - ConvergenceWarning) - - return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 cdef void cython_norm_min_subgrad( int[::1] active_set, diff --git a/src/glum/_glm.py b/src/glum/_glm.py index a4bd39f9..381f453a 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -672,6 +672,7 @@ def __init__( A_ineq: Optional[np.ndarray] = None, b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, + diag_fisher=False, ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -701,6 +702,7 @@ def __init__( self.A_ineq = A_ineq self.b_ineq = b_ineq self.force_all_finite = force_all_finite + self.diag_fisher = diag_fisher @property def family_instance(self) -> ExponentialDispersionModel: @@ -969,6 +971,7 @@ def _solve( lower_bounds=lower_bounds, upper_bounds=upper_bounds, verbose=self.verbose > 0, + diag_fisher=self.diag_fisher, ) if self._solver == "irls-ls": coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver( @@ -976,6 +979,7 @@ def _solve( ) # 4.2 coordinate descent ############################################## elif self._solver == "irls-cd": + # [Alan] This is the case we're concerned with for wide problems. coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver( _cd_solver, coef, irls_data ) @@ -2088,6 +2092,17 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): ``A_ineq w <= b_ineq``. Refer to the documentation of ``A_ineq`` for details. + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + (Text taken directly from the orig_sklearn_fork.) 
+ Attributes ---------- coef_ : numpy.array, shape (n_features,) @@ -2168,6 +2183,7 @@ def __init__( A_ineq: Optional[np.ndarray] = None, b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, + diag_fisher=False, ): self.alphas = alphas self.alpha = alpha @@ -2200,6 +2216,7 @@ def __init__( A_ineq=A_ineq, b_ineq=b_ineq, force_all_finite=force_all_finite, + diag_fisher=diag_fisher, ) def _validate_hyperparameters(self) -> None: diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index 662cd13f..1363f80c 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -57,37 +57,24 @@ def _least_squares_solver(state, data, hessian): @timeit("inner_solver_runtime") -def _cd_solver(state, data, active_hessian, diag_fisher=False): - if diag_fisher: +def _cd_solver(state, data, active_hessian): + if data.diag_fisher: if data.fit_intercept: - # the following line should be equivalent to the if/else below it - # X_active = data.X[:, state.active_set[(state.active_set[0] == 0):] - 1].A if state.active_set[0] == 0: - # remove the zero, subtract one from everything else - X_active = data.X[:, state.active_set[1:] - 1] # .A + # if the intercept column of X is in the active set, leave it out + # (we deal with intercepts separately in _cd_fast.pyx) + X_active = data.X[:, state.active_set[1:] - 1] else: - X_active = data.X[:, state.active_set - 1] # .A + X_active = data.X[:, state.active_set - 1] else: - # for some reason, active_set here is massive (everything!) - X_active = data.X[:, state.active_set] # .A - # instead of passing in data.X, we should pass in - # but remember! X is sparse. But X is not necessarily square - # we may need active rows and columns - # X_active = data.X[:, state.active_set].A - ( - new_coef, - gap, - _, - _, - n_cycles, - # Q_check, - ) = enet_coordinate_descent_gram_diag_fisher( - X_active, # new - state.hessian_rows, # new + # with no intercept, active_set here can be massive on first iteration + X_active = data.X[:, state.active_set] + (new_coef, gap, _, _, n_cycles,) = enet_coordinate_descent_gram_diag_fisher( + X_active, + state.hessian_rows, state.active_set, state.coef.copy(), data.P1, - active_hessian, -state.score, data.max_inner_iter, state.inner_tol, @@ -187,14 +174,13 @@ def update_hessian(state, data, active_set): # 3) Include the P2 components # 4) just like an update, we only update the active_set - # this is not needed for diag_fisher! hessian_init = build_hessian_delta( data.X, - state.hessian_rows, # this is the d that goes into sandwich + state.hessian_rows, data.fit_intercept, data.P2, - np.arange(data.X.shape[0], dtype=np.int32), # this uses all rows (16087) - active_set, # (17650) + np.arange(data.X.shape[0], dtype=np.int32), + active_set, ) # In the sparse Hessian case, state.hessian is a coo_matrix. @@ -214,7 +200,6 @@ def update_hessian(state, data, active_set): state.hessian_initialized = True n_active_rows = data.X.shape[0] - # fortunately, none of these above variables are required else: @@ -225,8 +210,6 @@ def update_hessian(state, data, active_set): # already been added # 4) only update the active set subset of the hessian. 
- # in the diag_fisher case, we don't want to use the diff, we use the whole thing - hessian_rows_diff, active_rows = identify_active_rows( state.gradient_rows, state.hessian_rows, @@ -260,6 +243,7 @@ def update_hessian(state, data, active_set): # active set elements, which we then convert to a numpy array for compatibility with # the rest of the code if state.use_sparse_hessian: + # this is a little awkward return ( state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(), n_active_rows, @@ -291,8 +275,6 @@ def build_hessian_delta( """ idx = 1 if intercept else 0 active_cols_non_intercept = active_cols[idx:] - idx - # Here is the safe sandwich dot that calls sandwich - # active_cols is small even for a large number of columns (2304 for 1M) delta = _safe_sandwich_dot( X, hessian_rows, active_rows, active_cols_non_intercept, intercept ) @@ -396,35 +378,17 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L state.old_active_set = state.active_set state.active_set = identify_active_set(state, data) - diag_fisher = True - - if not diag_fisher: + if not data.diag_fisher: # 0) Build the hessian hessian, state.n_active_rows = update_hessian( state, data, state.active_set ) - # state.n_active_rows never gets used again in this function - # may be used in logging, but we'll ignore for diag_fisher case - # 1) Solve the L1 and L2 penalized least squares problem - d, n_cycles_this_iter = inner_solver( - state, data, hessian, diag_fisher=diag_fisher - ) - state.n_cycles += n_cycles_this_iter - + d, n_cycles_this_iter = inner_solver(state, data, hessian) else: - # we need a hessian update step here - # but then we'd store mxm, which is undesired - # we'll instead store hessian diag, but have to update it first - # update_hessian_diag_fisher() # will update state.hessian_rows - # not needed! because of update_quadratic() below - - # sandwich product is between data.X, state.hessian_rows - # only use first row of X in the X^T part of the sandwich - d, n_cycles_this_iter = inner_solver( - state, data, None, diag_fisher=diag_fisher - ) - state.n_cycles += n_cycles_this_iter + # 0 & 1) Build the hessian row by row, solving as we go + d, n_cycles_this_iter = inner_solver(state, data, None) + state.n_cycles += n_cycles_this_iter # 2) Line search state.old_hessian_rows[:] = state.hessian_rows @@ -441,7 +405,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L # 3) Update the quadratic approximation state.gradient_rows, state.score, state.hessian_rows = update_quadratic( state, data, coef_P2 - ) # is this where we update state.hessian_rows?? 
if so, no need for diff :) + ) # 4) Check if we've converged ( @@ -544,6 +508,7 @@ def __init__( lower_bounds: Optional[np.ndarray] = None, upper_bounds: Optional[np.ndarray] = None, verbose: bool = False, + diag_fisher: bool = False, ): self.X = X self.y = y @@ -575,6 +540,7 @@ def __init__( self.intercept_offset = 1 if self.fit_intercept else 0 self.verbose = verbose + self.diag_fisher = diag_fisher self._check_data() From a58dfa1bad56d57943b8dd8c1dda8810a363de59 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 20:37:26 +0100 Subject: [PATCH 5/9] Address comments; about to split into two PRs --- src/glum_benchmarks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum_benchmarks/README.md b/src/glum_benchmarks/README.md index c3ac43e2..23321713 100644 --- a/src/glum_benchmarks/README.md +++ b/src/glum_benchmarks/README.md @@ -6,7 +6,7 @@ Python package to benchmark GLM implementations. ## Running the benchmarks -After installing the package, you should have two CLI tools: `glm_benchmarks_run` and `glm_benchmarks_analyze`. Use the `--help` flag for full details. Look in `src/glum/problems.py` to see the list of problems that will be run through each library. +After installing the package, you should have two CLI tools: `glm_benchmarks_run` and `glm_benchmarks_analyze`. Use the `--help` flag for full details. Look in `src/glum/glum_benchmarks/problems.py` to see the list of problems that will be run through each library. To run the full benchmarking suite, just run `glm_benchmarks_run` with no flags. This will probably take a very long time. From c5f566774bd53bf05f229c957a0745b409039dda Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 20:39:58 +0100 Subject: [PATCH 6/9] Address comments; about to split into two PRs --- src/glum/_cd_fast.pyx | 14 +++++--------- src/glum/_glm.py | 17 ++++++++++++++--- src/glum/_solvers.py | 24 +++++++++++++----------- 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index 73aea8cb..a75c33ea 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -117,9 +117,6 @@ def enet_coordinate_descent_gram(int[::1] active_set, """ cdef unsigned int n_active_features = active_set.shape[0] - cdef unsigned int n_features = Q.shape[0] # are you sure this is correct? - # n_features is exactly now the same as n_active_features; should it be q.shape[0]? - # however, the variable is never used again, so I guess we are okay cdef floating w_ii cdef floating P1_ii @@ -141,7 +138,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, for n_iter in range(max_iter): w_max = 0.0 d_w_max = 0.0 - for f_iter in range(n_active_features): + for f_iter in range(n_active_features): # Loop over coordinates if random: active_set_ii = rand_int(n_active_features, rand_r_state) else: @@ -227,8 +224,8 @@ def enet_coordinate_descent_gram_diag_fisher( q = X^T y This version, for diag_fisher=True, never calculates the entire Hessian matrix; - only rows, when necessary. The inefficiency here is that we must re-calculate - the rows of Q for every iteration up to max_iter. + only rows, when necessary. This requires less memory. However, the inefficiency + is that we must re-calculate the rows of Q for every iteration up to max_iter. 
""" cdef unsigned int n_active_features = active_set.shape[0] @@ -260,14 +257,13 @@ def enet_coordinate_descent_gram_diag_fisher( for n_iter in range(max_iter): w_max = 0.0 d_w_max = 0.0 - for f_iter in range(n_active_features): + for f_iter in range(n_active_features): # Loop over coordinates if random: active_set_ii = rand_int(n_active_features, rand_r_state) else: active_set_ii = f_iter ii = active_set[active_set_ii] - # Check logic here if active_set[0] < intercept: if ii == 0: Q_active_set_ii[0] = hessian_rows.sum() @@ -335,7 +331,7 @@ def enet_coordinate_descent_gram_diag_fisher( "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), ConvergenceWarning) - return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 # , Q_check + return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 cdef void cython_norm_min_subgrad( diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 381f453a..e54140f0 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -672,7 +672,8 @@ def __init__( A_ineq: Optional[np.ndarray] = None, b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, - diag_fisher=False, + use_sparse_hessian: bool = True, + diag_fisher: bool = False, ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -702,6 +703,7 @@ def __init__( self.A_ineq = A_ineq self.b_ineq = b_ineq self.force_all_finite = force_all_finite + self.use_sparse_hessian = use_sparse_hessian self.diag_fisher = diag_fisher @property @@ -971,6 +973,7 @@ def _solve( lower_bounds=lower_bounds, upper_bounds=upper_bounds, verbose=self.verbose > 0, + use_sparse_hessian=self.use_sparse_hessian, diag_fisher=self.diag_fisher, ) if self._solver == "irls-ls": @@ -979,7 +982,7 @@ def _solve( ) # 4.2 coordinate descent ############################################## elif self._solver == "irls-cd": - # [Alan] This is the case we're concerned with for wide problems. + # This is the case we're concerned with for wide problems. coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver( _cd_solver, coef, irls_data ) @@ -2092,6 +2095,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): ``A_ineq w <= b_ineq``. Refer to the documentation of ``A_ineq`` for details. + use_sparse_hessian : boolean, optional, (default=True) + If ``True``, stores the current state of the Hessian as a sparse COO matrix. + If ``False``, stores as a dense matrix of size `num_cols` x `num_cols`, where + `num_cols` is the number of columns in the data X. Use ``True`` to avoid memory + issues when working with extremely wide data. + diag_fisher : boolean, optional, (default=False) Only relevant for solver 'cd' (see also ``start_params='guess'``). 
If ``False``, the full Fisher matrix (expected Hessian) is computed in @@ -2183,7 +2192,8 @@ def __init__( A_ineq: Optional[np.ndarray] = None, b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, - diag_fisher=False, + use_sparse_hessian: bool = True, + diag_fisher: bool = False, ): self.alphas = alphas self.alpha = alpha @@ -2216,6 +2226,7 @@ def __init__( A_ineq=A_ineq, b_ineq=b_ineq, force_all_finite=force_all_finite, + use_sparse_hessian=use_sparse_hessian, diag_fisher=diag_fisher, ) diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index 1363f80c..b1350e39 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -69,6 +69,7 @@ def _cd_solver(state, data, active_hessian): else: # with no intercept, active_set here can be massive on first iteration X_active = data.X[:, state.active_set] + (new_coef, gap, _, _, n_cycles,) = enet_coordinate_descent_gram_diag_fisher( X_active, state.hessian_rows, @@ -103,6 +104,7 @@ def _cd_solver(state, data, active_hessian): data.has_upper_bounds, data._upper_bounds, ) + return new_coef - state.coef, n_cycles @@ -184,10 +186,10 @@ def update_hessian(state, data, active_set): ) # In the sparse Hessian case, state.hessian is a coo_matrix. - # hessian_init is a numpy array of size active_set x active_set; + # hessian_init is np array of size active_set.shape[0] x active_set.shape[0]; # we can convert this to a coo_matrix and simply add it to the state.hessian - if state.use_sparse_hessian: - coo_data = hessian_init.flatten() + if data.use_sparse_hessian: + coo_data = hessian_init.ravel(order="C") coo_rows = np.repeat(active_set, active_set.shape[0]) coo_cols = np.tile(active_set, active_set.shape[0]) @@ -225,8 +227,8 @@ def update_hessian(state, data, active_set): active_cols=active_set, ) - if state.use_sparse_hessian: - coo_data = hessian_delta.flatten() + if data.use_sparse_hessian: + coo_data = hessian_delta.ravel(order="C") coo_rows = np.repeat(active_set, active_set.shape[0]) coo_cols = np.tile(active_set, active_set.shape[0]) @@ -241,9 +243,9 @@ def update_hessian(state, data, active_set): # If state.hessian is in COO form, we have to convert to CSC in order to index the # active set elements, which we then convert to a numpy array for compatibility with - # the rest of the code - if state.use_sparse_hessian: - # this is a little awkward + # the rest of the code. This numpy array is dense, but we care about problems in + # which the active set is usually much smaller than the number of data columns. + if data.use_sparse_hessian: return ( state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(), n_active_rows, @@ -332,7 +334,6 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L Repeat steps 1-3 until convergence. Note: Use hessian matrix instead of Hessian for H. - This used to say "Use Fisher matrix instead of Hessian for H." Note: f' = -score, H = hessian matrix Parameters @@ -508,6 +509,7 @@ def __init__( lower_bounds: Optional[np.ndarray] = None, upper_bounds: Optional[np.ndarray] = None, verbose: bool = False, + use_sparse_hessian: bool = True, diag_fisher: bool = False, ): self.X = X @@ -540,6 +542,7 @@ def __init__( self.intercept_offset = 1 if self.fit_intercept else 0 self.verbose = verbose + self.use_sparse_hessian = use_sparse_hessian self.diag_fisher = diag_fisher self._check_data() @@ -606,8 +609,7 @@ def __init__(self, coef, data): # In order to avoid memory issues for very wide datasets (e.g. Issue #485), we # can instantiate the Hessian as a sparse COO matrix. 
- self.use_sparse_hessian = True - if self.use_sparse_hessian: + if data.use_sparse_hessian: self.hessian = sparse.coo_matrix( (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype ) From 684de67a47c00568a20f8187c6508f292bfe8b84 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 20:46:39 +0100 Subject: [PATCH 7/9] Put back some old comments that were mistakenly removed --- src/glum/_cd_fast.pyx | 8 +++++--- src/glum_benchmarks/README.md | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index a75c33ea..d40b41f4 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -111,11 +111,12 @@ def enet_coordinate_descent_gram(int[::1] active_set, for Elastic-Net regression We minimize (1/2) * w^T Q w - q^T w + P1 norm(w, 1) - which amounts to the L1 problem when: + which amounts to the Elastic-Net problem when: Q = X^T X (Gram matrix) q = X^T y """ + # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] cdef floating w_ii @@ -153,7 +154,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, if Q[active_set_ii, active_set_ii] == 0.0: continue - w_ii = w[ii] + w_ii = w[ii] # Store previous value qii_temp = q[ii] - w[ii] * Q[active_set_ii, active_set_ii] w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q[active_set_ii, active_set_ii] @@ -228,6 +229,7 @@ def enet_coordinate_descent_gram_diag_fisher( is that we must re-calculate the rows of Q for every iteration up to max_iter. """ + # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] cdef floating w_ii @@ -284,7 +286,7 @@ def enet_coordinate_descent_gram_diag_fisher( if Q_active_set_ii[active_set_ii] == 0.0: continue - w_ii = w[ii] + w_ii = w[ii] # Store previous value qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q_active_set_ii[active_set_ii] diff --git a/src/glum_benchmarks/README.md b/src/glum_benchmarks/README.md index 23321713..c3ac43e2 100644 --- a/src/glum_benchmarks/README.md +++ b/src/glum_benchmarks/README.md @@ -6,7 +6,7 @@ Python package to benchmark GLM implementations. ## Running the benchmarks -After installing the package, you should have two CLI tools: `glm_benchmarks_run` and `glm_benchmarks_analyze`. Use the `--help` flag for full details. Look in `src/glum/glum_benchmarks/problems.py` to see the list of problems that will be run through each library. +After installing the package, you should have two CLI tools: `glm_benchmarks_run` and `glm_benchmarks_analyze`. Use the `--help` flag for full details. Look in `src/glum/problems.py` to see the list of problems that will be run through each library. To run the full benchmarking suite, just run `glm_benchmarks_run` with no flags. This will probably take a very long time. 
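For reference, the COO bookkeeping that update_hessian performs in the hunks above can be exercised on its own. The following sketch is not part of the patch series; it only illustrates, under assumed names (active_set holding the active column indices, hessian_block a dense len(active_set) x len(active_set) array, n_coef the total coefficient count), how a dense active-set block is scattered into the full sparse Hessian and later read back densely through CSC indexing.

import numpy as np
from scipy import sparse

def scatter_active_block(hessian_sparse, hessian_block, active_set, n_coef):
    # Flatten the dense block row-major and build matching (row, col) pairs
    # with repeat/tile, so entry (active_set[i], active_set[j]) receives
    # hessian_block[i, j] -- the same pattern used in update_hessian above.
    k = active_set.shape[0]
    coo_data = hessian_block.ravel(order="C")
    coo_rows = np.repeat(active_set, k)
    coo_cols = np.tile(active_set, k)
    delta = sparse.coo_matrix(
        (coo_data, (coo_rows, coo_cols)), shape=(n_coef, n_coef)
    )
    # SciPy converts to a compressed format internally to perform the addition.
    return hessian_sparse + delta

def read_active_block(hessian_sparse, active_set):
    # COO does not support fancy indexing, so convert to CSC first and densify
    # only the (small) active-set block that is handed to the inner solver.
    return hessian_sparse.tocsc()[np.ix_(active_set, active_set)].toarray()

Because the active set is typically much smaller than the number of columns, the densified block stays modest even when n_coef is large, which is the regime the sparse-Hessian option targets.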
From 4d628a4f0b7576cb90d8d7f11d1cceb2fae28f16 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 20:48:09 +0100 Subject: [PATCH 8/9] One-letter fix --- src/glum/_cd_fast.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index d40b41f4..5d5d702e 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -111,7 +111,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, for Elastic-Net regression We minimize (1/2) * w^T Q w - q^T w + P1 norm(w, 1) - which amounts to the Elastic-Net problem when: + which amount to the Elastic-Net problem when: Q = X^T X (Gram matrix) q = X^T y """ From 39c2b10b886f53bf135daa48686c71bb98424850 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 21:41:35 +0100 Subject: [PATCH 9/9] use_sparse_hessian code isolated in this branch --- src/glum/_cd_fast.pyx | 134 +----------------------------------------- src/glum/_glm.py | 16 ----- src/glum/_solvers.py | 81 +++++++------------------ 3 files changed, 22 insertions(+), 209 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index 5d5d702e..b4bcf19b 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -118,6 +118,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] + cdef unsigned int n_features = Q.shape[0] cdef floating w_ii cdef floating P1_ii @@ -203,139 +204,6 @@ def enet_coordinate_descent_gram(int[::1] active_set, return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 -def enet_coordinate_descent_gram_diag_fisher( - X, - np.ndarray[np.float64_t, ndim=1] hessian_rows, - int[::1] active_set, - floating[::1] w, - floating[::1] P1, - floating[::1] q, - int max_iter, floating tol, object rng, - bint intercept, bint random, - bint has_lower_bounds, - floating[:] lower_bounds, - bint has_upper_bounds, - floating[:] upper_bounds): - """Cython version of the coordinate descent algorithm - for Elastic-Net regression - We minimize - (1/2) * w^T Q w - q^T w + P1 norm(w, 1) - which amounts to the L1 problem when: - Q = X^T X (Gram matrix) - q = X^T y - - This version, for diag_fisher=True, never calculates the entire Hessian matrix; - only rows, when necessary. This requires less memory. However, the inefficiency - is that we must re-calculate the rows of Q for every iteration up to max_iter. - """ - - # get the data information into easy vars - cdef unsigned int n_active_features = active_set.shape[0] - - cdef floating w_ii - cdef floating P1_ii - cdef floating qii_temp - cdef floating d_w_max - cdef floating w_max - cdef floating d_w_ii - cdef floating d_w_tol = tol - cdef floating norm_min_subgrad = 0 - cdef floating max_min_subgrad - cdef unsigned int active_set_ii, active_set_jj - cdef unsigned int ii, jj - cdef int n_iter = 0 - cdef unsigned int f_iter - cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) - cdef UINT32_t* rand_r_state = &rand_r_state_seed - - # Equivalent to Q[active_set_ii] in the diag_fisher=False version - cdef np.ndarray[np.float64_t, ndim=1] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) - - cdef cur_col # Does the cdef here (rather than using it as a local variable) reduce memory? 
- - # nogil is incompatible with our slicing, numpy, and sparse matrix operations, but - # may be necessary to increase performance - # with nogil: - for n_iter in range(max_iter): - w_max = 0.0 - d_w_max = 0.0 - for f_iter in range(n_active_features): # Loop over coordinates - if random: - active_set_ii = rand_int(n_active_features, rand_r_state) - else: - active_set_ii = f_iter - ii = active_set[active_set_ii] - - if active_set[0] < intercept: - if ii == 0: - Q_active_set_ii[0] = hessian_rows.sum() - Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows) - else: - cur_col = X[:, active_set_ii - 1] # Does this reduce memory? - Q_active_set_ii[0] = cur_col.transpose_matvec(hessian_rows) - Q_active_set_ii[1:] = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) - else: - cur_col = X[:, active_set_ii] - Q_active_set_ii = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) - - if ii < intercept: - P1_ii = 0.0 - else: - P1_ii = P1[ii - intercept] - - if Q_active_set_ii[active_set_ii] == 0.0: - continue - - w_ii = w[ii] # Store previous value - - qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] - w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q_active_set_ii[active_set_ii] - - if ii >= intercept: - if has_lower_bounds: - if w[ii] < lower_bounds[ii - intercept]: - w[ii] = lower_bounds[ii - intercept] - if has_upper_bounds: - if w[ii] > upper_bounds[ii - intercept]: - w[ii] = upper_bounds[ii - intercept] - - if w[ii] != 0.0 or w_ii != 0.0: - # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) - for active_set_jj in range(n_active_features): - jj = active_set[active_set_jj] - q[jj] += (w[ii] - w_ii) * Q_active_set_ii[active_set_jj] - - # update the maximum absolute coefficient update - d_w_ii = fabs(w[ii] - w_ii) - if d_w_ii > d_w_max: - d_w_max = d_w_ii - - if fabs(w[ii]) > w_max: - w_max = fabs(w[ii]) - - if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: - # the biggest coordinate update of this iteration was smaller than - # the tolerance: check the minimum norm subgradient as the - # ultimate stopping criterion - cython_norm_min_subgrad( - active_set, - w, q, P1, intercept, - has_lower_bounds, lower_bounds, has_upper_bounds, upper_bounds, - &norm_min_subgrad, &max_min_subgrad - ) - if norm_min_subgrad <= tol: - break - else: - # for/else, runs if for doesn't end with a `break` - # with gil: - warnings.warn("Coordinate descent did not converge. You might want to " - "increase the number of iterations. 
Minimum norm " - "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), - ConvergenceWarning) - - return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 - - cdef void cython_norm_min_subgrad( int[::1] active_set, floating[::1] coef, diff --git a/src/glum/_glm.py b/src/glum/_glm.py index e54140f0..6e12cc25 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -673,7 +673,6 @@ def __init__( b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, use_sparse_hessian: bool = True, - diag_fisher: bool = False, ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -704,7 +703,6 @@ def __init__( self.b_ineq = b_ineq self.force_all_finite = force_all_finite self.use_sparse_hessian = use_sparse_hessian - self.diag_fisher = diag_fisher @property def family_instance(self) -> ExponentialDispersionModel: @@ -974,7 +972,6 @@ def _solve( upper_bounds=upper_bounds, verbose=self.verbose > 0, use_sparse_hessian=self.use_sparse_hessian, - diag_fisher=self.diag_fisher, ) if self._solver == "irls-ls": coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver( @@ -2101,17 +2098,6 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): `num_cols` is the number of columns in the data X. Use ``True`` to avoid memory issues when working with extremely wide data. - diag_fisher : boolean, optional, (default=False) - Only relevant for solver 'cd' (see also ``start_params='guess'``). - If ``False``, the full Fisher matrix (expected Hessian) is computed in - each outer iteration (Newton iteration). If ``True``, only a diagonal - matrix (stored as 1d array) is computed, such that - fisher = X.T @ diag @ X. This saves memory and matrix-matrix - multiplications, but needs more matrix-vector multiplications. If you - use large sparse X or if you have many features, - i.e. n_features >> n_samples, you might set this option to ``True``. - (Text taken directly from the orig_sklearn_fork.) 
- Attributes ---------- coef_ : numpy.array, shape (n_features,) @@ -2193,7 +2179,6 @@ def __init__( b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, use_sparse_hessian: bool = True, - diag_fisher: bool = False, ): self.alphas = alphas self.alpha = alpha @@ -2227,7 +2212,6 @@ def __init__( b_ineq=b_ineq, force_all_finite=force_all_finite, use_sparse_hessian=use_sparse_hessian, - diag_fisher=diag_fisher, ) def _validate_hyperparameters(self) -> None: diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index b1350e39..e05e2113 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -15,7 +15,6 @@ from ._cd_fast import ( _norm_min_subgrad, enet_coordinate_descent_gram, - enet_coordinate_descent_gram_diag_fisher, identify_active_rows, ) from ._distribution import ExponentialDispersionModel @@ -58,53 +57,22 @@ def _least_squares_solver(state, data, hessian): @timeit("inner_solver_runtime") def _cd_solver(state, data, active_hessian): - if data.diag_fisher: - if data.fit_intercept: - if state.active_set[0] == 0: - # if the intercept column of X is in the active set, leave it out - # (we deal with intercepts separately in _cd_fast.pyx) - X_active = data.X[:, state.active_set[1:] - 1] - else: - X_active = data.X[:, state.active_set - 1] - else: - # with no intercept, active_set here can be massive on first iteration - X_active = data.X[:, state.active_set] - - (new_coef, gap, _, _, n_cycles,) = enet_coordinate_descent_gram_diag_fisher( - X_active, - state.hessian_rows, - state.active_set, - state.coef.copy(), - data.P1, - -state.score, - data.max_inner_iter, - state.inner_tol, - data.random_state, - data.fit_intercept, - data.selection == "random", - data.has_lower_bounds, - data._lower_bounds, - data.has_upper_bounds, - data._upper_bounds, - ) - else: - new_coef, gap, _, _, n_cycles = enet_coordinate_descent_gram( - state.active_set, - state.coef.copy(), - data.P1, - active_hessian, - -state.score, - data.max_inner_iter, - state.inner_tol, - data.random_state, - data.fit_intercept, - data.selection == "random", - data.has_lower_bounds, - data._lower_bounds, - data.has_upper_bounds, - data._upper_bounds, - ) - + new_coef, gap, _, _, n_cycles = enet_coordinate_descent_gram( + state.active_set, + state.coef.copy(), + data.P1, + active_hessian, + -state.score, + data.max_inner_iter, + state.inner_tol, + data.random_state, + data.fit_intercept, + data.selection == "random", + data.has_lower_bounds, + data._lower_bounds, + data.has_upper_bounds, + data._upper_bounds, + ) return new_coef - state.coef, n_cycles @@ -379,16 +347,11 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L state.old_active_set = state.active_set state.active_set = identify_active_set(state, data) - if not data.diag_fisher: - # 0) Build the hessian - hessian, state.n_active_rows = update_hessian( - state, data, state.active_set - ) - # 1) Solve the L1 and L2 penalized least squares problem - d, n_cycles_this_iter = inner_solver(state, data, hessian) - else: - # 0 & 1) Build the hessian row by row, solving as we go - d, n_cycles_this_iter = inner_solver(state, data, None) + # 0) Build the hessian + hessian, state.n_active_rows = update_hessian(state, data, state.active_set) + + # 1) Solve the L1 and L2 penalized least squares problem + d, n_cycles_this_iter = inner_solver(state, data, hessian) state.n_cycles += n_cycles_this_iter # 2) Line search @@ -510,7 +473,6 @@ def __init__( upper_bounds: Optional[np.ndarray] = None, verbose: bool = False, use_sparse_hessian: bool 
= True, - diag_fisher: bool = False, ): self.X = X self.y = y @@ -543,7 +505,6 @@ def __init__( self.intercept_offset = 1 if self.fit_intercept else 0 self.verbose = verbose self.use_sparse_hessian = use_sparse_hessian - self.diag_fisher = diag_fisher self._check_data()
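Taken together, the end state of this series exposes a single new estimator option. A call along the following lines should exercise it against a build of glum that includes these patches; the data shape, the Poisson family, and the penalty settings are illustrative assumptions, not anything prescribed by the diffs above.

import numpy as np
from glum import GeneralizedLinearRegressor

rng = np.random.default_rng(0)
n_rows, n_cols = 500, 20_000                 # deliberately wide design matrix
X = rng.standard_normal((n_rows, n_cols))
y = rng.poisson(1.0, size=n_rows).astype(float)

model = GeneralizedLinearRegressor(
    family="poisson",
    alpha=0.01,
    l1_ratio=1.0,             # L1 penalty, so the coordinate-descent (irls-cd) path is used
    use_sparse_hessian=True,  # new flag from this series (default True): keep the Hessian sparse
)
model.fit(X, y)
print(np.count_nonzero(model.coef_), "non-zero coefficients")

With use_sparse_hessian=False the solver falls back to the original dense num_cols x num_cols Hessian, which is the allocation that becomes prohibitive for very wide X.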