From b1488457cbb9a8f275a32bb735511c1e58e3d856 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Fri, 13 May 2022 15:57:17 +0100 Subject: [PATCH 1/9] Add option to make state.hessian a sparse COO matrix --- src/glum/_solvers.py | 68 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index 2f359dfb..bb693cee 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -145,15 +145,31 @@ def update_hessian(state, data, active_set): # 4) just like an update, we only update the active_set hessian_init = build_hessian_delta( data.X, - state.hessian_rows, + state.hessian_rows, # this is the d that goes into sandwich data.fit_intercept, data.P2, np.arange(data.X.shape[0], dtype=np.int32), active_set, ) - state.hessian[np.ix_(active_set, active_set)] = hessian_init + + # In the sparse Hessian case, stsate.hessian is a coo_matrix. + # hessian_init is a numpy array of size active_set x active_set; + # we can convert this to a coo_matrix and simply add it to the state.hessian + if state.use_sparse_hessian: + coo_data = hessian_init.flatten() + coo_rows = np.repeat(active_set, active_set.shape[0]) + coo_cols = np.tile(active_set, active_set.shape[0]) + + state.hessian = sparse.coo_matrix( + (coo_data, (coo_rows, coo_cols)), + shape=(state.coef.shape[0], state.coef.shape[0]), + ) + else: + state.hessian[np.ix_(active_set, active_set)] = hessian_init + state.hessian_initialized = True n_active_rows = data.X.shape[0] + else: # In an update iteration, we want to: @@ -176,13 +192,31 @@ def update_hessian(state, data, active_set): active_rows=active_rows, active_cols=active_set, ) - state.hessian[np.ix_(active_set, active_set)] += hessian_delta + + if state.use_sparse_hessian: + coo_data = hessian_delta.flatten() + coo_rows = np.repeat(active_set, active_set.shape[0]) + coo_cols = np.tile(active_set, active_set.shape[0]) + + state.hessian += sparse.coo_matrix( + (coo_data, (coo_rows, coo_cols)), + shape=(state.coef.shape[0], state.coef.shape[0]), + ) + else: + state.hessian[np.ix_(active_set, active_set)] += hessian_delta + n_active_rows = active_rows.shape[0] - return ( - state.hessian[np.ix_(active_set, active_set)], - n_active_rows, - ) + # If state.hessian is in COO form, we have to convert to CSC in order to index the + # active set elements, which we then convert to a numpy array for compatibility with + # the rest of the code + if state.use_sparse_hessian: + return ( + state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(), + n_active_rows, + ) + else: + return state.hessian[np.ix_(active_set, active_set)], n_active_rows def _is_subset(x, y): @@ -208,6 +242,8 @@ def build_hessian_delta( """ idx = 1 if intercept else 0 active_cols_non_intercept = active_cols[idx:] - idx + # Here is the safe sandwich dot that calls sandwich + # active_cols is small even for a large number of columns (2304 for 1M) delta = _safe_sandwich_dot( X, hessian_rows, active_rows, active_cols_non_intercept, intercept ) @@ -271,7 +307,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L ---------- inner_solver A least squares solver that can handle the appropriate penalties. With - an L1 penalty, this will _cd_solver. With only an L2 penalty, + an L1 penalty, this will be _cd_solver. With only an L2 penalty, _least_squares_solver will be more efficient. coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. 
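
Aside: the hunks above scatter a dense (k, k) active-set block into the full n x n
sparse Hessian, using np.repeat/np.tile to build the COO coordinates. A minimal
NumPy/SciPy sketch of that scatter, with illustrative names (`scatter_active_block`,
`n` for the total number of coefficients) that are not part of the patch:

    import numpy as np
    from scipy import sparse

    def scatter_active_block(hessian_init, active_set, n):
        # Entry (i, j) of the dense block lands at (active_set[i], active_set[j])
        # in the full n x n Hessian, stored as a sparse COO matrix.
        k = active_set.shape[0]
        coo_data = hessian_init.ravel()
        coo_rows = np.repeat(active_set, k)
        coo_cols = np.tile(active_set, k)
        return sparse.coo_matrix((coo_data, (coo_rows, coo_cols)), shape=(n, n))

    # Example: a 2-element active set inside a 5-coefficient problem.
    block = np.array([[2.0, 1.0], [1.0, 3.0]])
    H = scatter_active_block(block, np.array([1, 3]), 5).tocsc()
    assert H[1, 3] == 1.0 and H[3, 3] == 3.0

Adding a second COO block to the stored Hessian, as the update branch above does,
reproduces the dense `+=` on the indexed sub-block.
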
@@ -528,9 +564,19 @@ def __init__(self, coef, data): self.old_hessian_rows = np.zeros(data.X.shape[0], dtype=data.X.dtype) self.gradient_rows = None self.hessian_rows = None - self.hessian = np.zeros( - (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype - ) + + # In order to avoid memory issues for very wide datasets (e.g. Issue #485), we + # can instantiate the Hessian as a sparse COO matrix. + self.use_sparse_hessian = True + if self.use_sparse_hessian: + self.hessian = sparse.coo_matrix( + (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype + ) + else: + self.hessian = np.zeros( + (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype + ) + self.hessian_initialized = False self.coef_P2 = None self.norm_min_subgrad = None From 2bddab6f4f88167bde0052a7addfdbb9504a12fd Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Tue, 17 May 2022 21:37:36 +0100 Subject: [PATCH 2/9] fisher_diag seems to work --- src/glum/_cd_fast.pyx | 301 +++++++++++++++++++++++++++++++++++++++++- src/glum/_solvers.py | 122 +++++++++++++---- 2 files changed, 391 insertions(+), 32 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index b4bcf19b..0a2ed5b6 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -9,7 +9,7 @@ # # cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport fabs -cimport numpy as np +cimport numpy as np # is it a problem that these two imports have same name??? import numpy as np import numpy.linalg as linalg from numpy.math cimport INFINITY @@ -107,6 +107,295 @@ def enet_coordinate_descent_gram(int[::1] active_set, floating[:] lower_bounds, bint has_upper_bounds, floating[:] upper_bounds): + """Cython version of the coordinate descent algorithm + for Elastic-Net regression + We minimize + (1/2) * w^T Q w - q^T w + P1 norm(w, 1) + which amounts to the L1 problem when: + Q = X^T X (Gram matrix) + q = X^T y + """ + + # get the data information into easy vars + cdef unsigned int n_active_features = active_set.shape[0] + cdef unsigned int n_features = Q.shape[0] # are you sure this is CORRECT? + # n_features is exactly now the same as n_active_features; should it be q.shape[0]? + # however, the variable is never used again, so I guess we are okay + + cdef floating w_ii + cdef floating P1_ii + cdef floating qii_temp + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating d_w_tol = tol + cdef floating norm_min_subgrad = 0 + cdef floating max_min_subgrad + cdef unsigned int active_set_ii, active_set_jj + cdef unsigned int ii, jj + cdef int n_iter = 0 + cdef unsigned int f_iter + cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef UINT32_t* rand_r_state = &rand_r_state_seed + + with nogil: + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_active_features): # Loop over coordinates + if random: + active_set_ii = rand_int(n_active_features, rand_r_state) + else: + active_set_ii = f_iter # _ii is an index + ii = active_set[active_set_ii] # odd syntax; gets the active set's row + + if ii < intercept: # but `intercept` is binary!? 
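+                    # `intercept` is a bint (0 or 1), so this branch is taken only for
+                    # the intercept coefficient (ii == 0 when an intercept is fitted),
+                    # which is never L1-penalized.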
+ P1_ii = 0.0 + else: + P1_ii = P1[ii - intercept] + + if Q[active_set_ii, active_set_ii] == 0.0: # no need to multiply zeros + continue + + w_ii = w[ii] # Store previous value, the iith element of w + + qii_temp = q[ii] - w[ii] * Q[active_set_ii, active_set_ii] + w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q[active_set_ii, active_set_ii] + + if ii >= intercept: + if has_lower_bounds: + if w[ii] < lower_bounds[ii - intercept]: + w[ii] = lower_bounds[ii - intercept] + if has_upper_bounds: + if w[ii] > upper_bounds[ii - intercept]: + w[ii] = upper_bounds[ii - intercept] + + if w[ii] != 0.0 or w_ii != 0.0: + # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) + for active_set_jj in range(n_active_features): + jj = active_set[active_set_jj] + q[jj] += (w[ii] - w_ii) * Q[active_set_ii, active_set_jj] + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + if d_w_ii > d_w_max: + d_w_max = d_w_ii + + if fabs(w[ii]) > w_max: + w_max = fabs(w[ii]) + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the minimum norm subgradient as the + # ultimate stopping criterion + cython_norm_min_subgrad( + active_set, + w, q, P1, intercept, + has_lower_bounds, lower_bounds, has_upper_bounds, upper_bounds, + &norm_min_subgrad, &max_min_subgrad + ) + if norm_min_subgrad <= tol: + break + else: + # for/else, runs if for doesn't end with a `break` + with gil: + warnings.warn("Coordinate descent did not converge. You might want to " + "increase the number of iterations. Minimum norm " + "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), + ConvergenceWarning) + + return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 + +def enet_coordinate_descent_gram_diag_fisher( + #floating[:,:] X, + #floating[:] hessian_rows, + np.ndarray[np.float64_t, ndim=2] X, + np.ndarray[np.float64_t, ndim=1] hessian_rows, + int[::1] active_set, + floating[::1] w, + floating[::1] P1, + floating[:,:] Q, + floating[::1] q, + int max_iter, floating tol, object rng, + bint intercept, bint random, + bint has_lower_bounds, + floating[:] lower_bounds, + bint has_upper_bounds, + floating[:] upper_bounds): + """Cython version of the coordinate descent algorithm + for Elastic-Net regression + We minimize + (1/2) * w^T Q w - q^T w + P1 norm(w, 1) + which amounts to the L1 problem when: + Q = X^T X (Gram matrix) + q = X^T y + """ + # The CD that never calculates the entire Hessian matrix; only rows, when they are + # necessary. 
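+    # In other words, instead of forming Q = X^T diag(hessian_rows) X up front,
+    # each coordinate update recomputes just the single row it needs,
+    #     Q[active_set_ii, :] = X^T (hessian_rows * x_j),
+    # where x_j is the column of X matching the coordinate being updated.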
+ + # get the data information into easy vars + cdef unsigned int n_active_features = active_set.shape[0] + # cdef unsigned int n_features = q.shape[0] # check that this is the same as Q.shape[0] + + cdef floating w_ii + cdef floating P1_ii + cdef floating qii_temp + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating d_w_tol = tol + cdef floating norm_min_subgrad = 0 + cdef floating max_min_subgrad + cdef unsigned int active_set_ii, active_set_jj + cdef unsigned int ii, jj + cdef int n_iter = 0 + cdef unsigned int f_iter + cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef UINT32_t* rand_r_state = &rand_r_state_seed + + #cdef np.ndarray[np.float64_t, dtype=q.dtype] Qj # will hold rows of the Q matrix + # cdef floating[:] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) + cdef np.ndarray[np.float64_t, ndim=1] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) + # is empty bad? + + # used to check correctness of code - compare with Q from non-diag-fisher + cdef np.ndarray[np.float64_t, ndim=2] Q_check = np.empty((n_active_features, n_active_features), dtype=np.float64) + + # THE inefficiency here is that we must re-calculate the rows of Q for ever iteration. + + # with nogil: + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_active_features): # Loop over coordinates + if random: + active_set_ii = rand_int(n_active_features, rand_r_state) + else: + active_set_ii = f_iter # _ii is an index + ii = active_set[active_set_ii] # odd syntax; gets an active row + + # remember, Q hasn't been created yet. + # and the only row we need for this iteration is Q[active_set_ii] + # also remember, sandwich product is between data.X, state.hessian_rows + # only use first column of X in the X^T part of the sandwich + + # if intercept == 1: # maybe this should go into the next if statement? + # Q_active_set_ii[0] = hessian_rows.sum() + # this is perfectly valid because hessian_rows has length 16087 + + if active_set[0] < intercept: + if ii == 0: + Q_active_set_ii[0] = hessian_rows.sum() + Q_active_set_ii[1:] = np.matmul(hessian_rows, X) + else: + Q_active_set_ii[0] = np.dot(hessian_rows, X[:, active_set_ii - 1]) + Q_active_set_ii[1:] = np.matmul((hessian_rows * X[:, active_set_ii - 1]), X) + + # if ii < intercept: + # # Q_active_set_ii[1:] = np.matmul(hessian_rows, X[:, 1:]) + # # Q_active_set_ii[0] = hessian_rows.sum() + # Q_active_set_ii[1:] = np.matmul(hessian_rows, X) + + # the next condition in the original code checks for sparseness. + # we won't worry about that here. but we will have to go elem by elem + # actually no, we don't have to go elem by elem. Cython can handle. + # do i have to worry about intercept term? or does X already ignore it? + + # what about the active features? do we only need to multiply by those? + # correct. i believe we only need to multiply by those + # so X can actually be passed in as X[(active_set_ii) cartesian product (active_set_ii)], right? + # the only problem is that it would mess up the active_set_ii / active_set_jj logic... + + # NB! We care about the intercept column as well, and must treat differently. + # does feature_list in sklearnfork include the intercept column? 
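+                # (When an intercept is fitted, coefficient 0 has no column in X: its
+                # implicit column is all ones, hence hessian_rows.sum() for the (0, 0)
+                # entry and the `- 1` shift into X for the remaining entries.)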
+ else: + Q_active_set_ii = np.matmul((hessian_rows * X[:, active_set_ii]), X) + # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii - intercept]), X) + # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii]), X[:, intercept:]) + # CHECK that we've covered all the cases!!!!! + + + # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, intercept + active_set_ii]), X) + + Q_check[active_set_ii] = Q_active_set_ii + + # for col_idx in range(X.shape[1]): + # for row_idx in range(X.shape[0]): + # Q_active_set_ii[intercept + col_idx] = + + # the next if statement checks for P2, but we're only using this for L1 + # regularization, so we don't need to worry about it. and that's all! + + # If the above works, can merge the following into the above if-else!!!! + if ii < intercept: # equiv. to (ii=0 and intercept=1) + P1_ii = 0.0 + else: + P1_ii = P1[ii - intercept] + + if Q_active_set_ii[active_set_ii] == 0.0: # no need to multiply zeros + continue + + w_ii = w[ii] # Store previous value, the iith element of w + + qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] ## + w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q_active_set_ii[active_set_ii] + + if ii >= intercept: # this is big-brain logic + if has_lower_bounds: + if w[ii] < lower_bounds[ii - intercept]: + w[ii] = lower_bounds[ii - intercept] + if has_upper_bounds: + if w[ii] > upper_bounds[ii - intercept]: + w[ii] = upper_bounds[ii - intercept] + + if w[ii] != 0.0 or w_ii != 0.0: + # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) + for active_set_jj in range(n_active_features): + jj = active_set[active_set_jj] + # this is equivalent to A += Bj * z + q[jj] += (w[ii] - w_ii) * Q_active_set_ii[active_set_jj] + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + if d_w_ii > d_w_max: + d_w_max = d_w_ii + + if fabs(w[ii]) > w_max: + w_max = fabs(w[ii]) + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the minimum norm subgradient as the + # ultimate stopping criterion + cython_norm_min_subgrad( + active_set, + w, q, P1, intercept, + has_lower_bounds, lower_bounds, has_upper_bounds, upper_bounds, + &norm_min_subgrad, &max_min_subgrad + ) + if norm_min_subgrad <= tol: + break + else: + # for/else, runs if for doesn't end with a `break` + # with gil: + warnings.warn("Coordinate descent did not converge. You might want to " + "increase the number of iterations. Minimum norm " + "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), + ConvergenceWarning) + + return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1, Q_check + +def enet_coordinate_descent_gram_diag_only(int[::1] active_set, + floating[::1] w, + floating[::1] P1, + floating[:,:] Q, + floating[::1] q, + int max_iter, floating tol, object rng, + bint intercept, bint random, + bint has_lower_bounds, + floating[:] lower_bounds, + bint has_upper_bounds, + floating[:] upper_bounds): """Cython version of the coordinate descent algorithm for Elastic-Net regression We minimize @@ -115,6 +404,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, Q = X^T X (Gram matrix) q = X^T y """ + # In this version, we don't access the off-diagonal elements. And see what happens. 
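+    # Concretely: each update of w[ii] divides by Q[active_set_ii, active_set_ii]
+    # alone, and q is never refreshed with off-diagonal terms (the q[jj] update loop
+    # below is commented out), so this behaves like a diagonal (Jacobi-style)
+    # approximation rather than exact coordinate descent.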
# get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] @@ -168,11 +458,10 @@ def enet_coordinate_descent_gram(int[::1] active_set, if w[ii] > upper_bounds[ii - intercept]: w[ii] = upper_bounds[ii - intercept] - if w[ii] != 0.0 or w_ii != 0.0: - # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) - for active_set_jj in range(n_active_features): - jj = active_set[active_set_jj] - q[jj] += (w[ii] - w_ii) * Q[active_set_ii, active_set_jj] + # if w[ii] != 0.0 or w_ii != 0.0: + # for active_set_jj in range(n_active_features): + # jj = active_set[active_set_jj] + # q[jj] += (w[ii] - w_ii) * Q[active_set_ii, active_set_jj] # update the maximum absolute coefficient update d_w_ii = fabs(w[ii] - w_ii) diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index bb693cee..950daef8 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -15,6 +15,7 @@ from ._cd_fast import ( _norm_min_subgrad, enet_coordinate_descent_gram, + enet_coordinate_descent_gram_diag_fisher, identify_active_rows, ) from ._distribution import ExponentialDispersionModel @@ -56,23 +57,62 @@ def _least_squares_solver(state, data, hessian): @timeit("inner_solver_runtime") -def _cd_solver(state, data, active_hessian): - new_coef, gap, _, _, n_cycles = enet_coordinate_descent_gram( - state.active_set, - state.coef.copy(), - data.P1, - active_hessian, - -state.score, - data.max_inner_iter, - state.inner_tol, - data.random_state, - data.fit_intercept, - data.selection == "random", - data.has_lower_bounds, - data._lower_bounds, - data.has_upper_bounds, - data._upper_bounds, - ) +def _cd_solver(state, data, active_hessian, diag_fisher=False): + if diag_fisher: + if data.fit_intercept: + if 0 in state.active_set: + # remove the zero, subtract one from everything else + X_active = data.X[:, state.active_set[1:] - 1].A + else: + X_active = data.X[:, state.active_set - 1].A + else: + X_active = data.X[:, state.active_set].A + # instead of passing in data.X, we should pass in + # but remember! X is sparse. But X is not necessarily square + # we may need active rows and columns + # X_active = data.X[:, state.active_set].A + ( + new_coef, + gap, + _, + _, + n_cycles, + Q_check, + ) = enet_coordinate_descent_gram_diag_fisher( + X_active, # new + state.hessian_rows, # new + state.active_set, + state.coef.copy(), + data.P1, + active_hessian, + -state.score, + data.max_inner_iter, + state.inner_tol, + data.random_state, + data.fit_intercept, + data.selection == "random", + data.has_lower_bounds, + data._lower_bounds, + data.has_upper_bounds, + data._upper_bounds, + ) + else: + new_coef, gap, _, _, n_cycles = enet_coordinate_descent_gram( + state.active_set, + state.coef.copy(), + data.P1, + active_hessian, + -state.score, + data.max_inner_iter, + state.inner_tol, + data.random_state, + data.fit_intercept, + data.selection == "random", + data.has_lower_bounds, + data._lower_bounds, + data.has_upper_bounds, + data._upper_bounds, + ) return new_coef - state.coef, n_cycles @@ -143,16 +183,18 @@ def update_hessian(state, data, active_set): # 2) use all the rows # 3) Include the P2 components # 4) just like an update, we only update the active_set + + # this is not needed for diag_fisher! 
hessian_init = build_hessian_delta( data.X, state.hessian_rows, # this is the d that goes into sandwich data.fit_intercept, data.P2, - np.arange(data.X.shape[0], dtype=np.int32), - active_set, + np.arange(data.X.shape[0], dtype=np.int32), # this uses all rows (16087) + active_set, # (17650) ) - # In the sparse Hessian case, stsate.hessian is a coo_matrix. + # In the sparse Hessian case, state.hessian is a coo_matrix. # hessian_init is a numpy array of size active_set x active_set; # we can convert this to a coo_matrix and simply add it to the state.hessian if state.use_sparse_hessian: @@ -169,6 +211,7 @@ def update_hessian(state, data, active_set): state.hessian_initialized = True n_active_rows = data.X.shape[0] + # fortunately, none of these above variables are required else: @@ -178,6 +221,9 @@ def update_hessian(state, data, active_set): # 3) Ignore the P2 components because those don't change and have # already been added # 4) only update the active set subset of the hessian. + + # in the diag_fisher case, we don't want to use the diff, we use the whole thing + hessian_rows_diff, active_rows = identify_active_rows( state.gradient_rows, state.hessian_rows, @@ -301,6 +347,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L Repeat steps 1-3 until convergence. Note: Use hessian matrix instead of Hessian for H. + This used to say "Use Fisher matrix instead of Hessian for H." Note: f' = -score, H = hessian matrix Parameters @@ -346,12 +393,35 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L state.old_active_set = state.active_set state.active_set = identify_active_set(state, data) - # 0) Build the hessian - hessian, state.n_active_rows = update_hessian(state, data, state.active_set) + diag_fisher = True + + if not diag_fisher: + # 0) Build the hessian + hessian, state.n_active_rows = update_hessian( + state, data, state.active_set + ) + # state.n_active_rows never gets used again in this function + # may be used in logging, but we'll ignore for diag_fisher case + + # 1) Solve the L1 and L2 penalized least squares problem + d, n_cycles_this_iter = inner_solver( + state, data, hessian, diag_fisher=diag_fisher + ) + state.n_cycles += n_cycles_this_iter - # 1) Solve the L1 and L2 penalized least squares problem - d, n_cycles_this_iter = inner_solver(state, data, hessian) - state.n_cycles += n_cycles_this_iter + else: + # we need a hessian update step here + # but then we'd store mxm, which is undesired + # we'll instead store hessian diag, but have to update it first + # update_hessian_diag_fisher() # will update state.hessian_rows + # not needed! because of update_quadratic() below + + # sandwich product is between data.X, state.hessian_rows + # only use first row of X in the X^T part of the sandwich + d, n_cycles_this_iter = inner_solver( + state, data, None, diag_fisher=diag_fisher + ) + state.n_cycles += n_cycles_this_iter # 2) Line search state.old_hessian_rows[:] = state.hessian_rows @@ -368,7 +438,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L # 3) Update the quadratic approximation state.gradient_rows, state.score, state.hessian_rows = update_quadratic( state, data, coef_P2 - ) + ) # is this where we update state.hessian_rows?? 
if so, no need for diff :) # 4) Check if we've converged ( From 01fab32fdbf4b66d3fc8162f6c4a15f84cdea591 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Wed, 18 May 2022 15:08:30 +0100 Subject: [PATCH 3/9] Pass sparse array into _cd_fast.pyx instead of dense array. --- src/glum/_cd_fast.pyx | 36 +++++++++++++++++++++++++++--------- src/glum/_solvers.py | 13 ++++++++----- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index 0a2ed5b6..2d4415e7 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -209,7 +209,8 @@ def enet_coordinate_descent_gram(int[::1] active_set, def enet_coordinate_descent_gram_diag_fisher( #floating[:,:] X, #floating[:] hessian_rows, - np.ndarray[np.float64_t, ndim=2] X, + #np.ndarray[np.float64_t, ndim=2] X, + X, np.ndarray[np.float64_t, ndim=1] hessian_rows, int[::1] active_set, floating[::1] w, @@ -258,10 +259,12 @@ def enet_coordinate_descent_gram_diag_fisher( cdef np.ndarray[np.float64_t, ndim=1] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) # is empty bad? + cdef cur_col # makes sure we can overwrite this and not take up too much memory? + # used to check correctness of code - compare with Q from non-diag-fisher - cdef np.ndarray[np.float64_t, ndim=2] Q_check = np.empty((n_active_features, n_active_features), dtype=np.float64) + # cdef np.ndarray[np.float64_t, ndim=2] Q_check = np.empty((n_active_features, n_active_features), dtype=np.float64) - # THE inefficiency here is that we must re-calculate the rows of Q for ever iteration. + # THE inefficiency here is that we must re-calculate the rows of Q for every iteration. # with nogil: for n_iter in range(max_iter): @@ -286,10 +289,20 @@ def enet_coordinate_descent_gram_diag_fisher( if active_set[0] < intercept: if ii == 0: Q_active_set_ii[0] = hessian_rows.sum() - Q_active_set_ii[1:] = np.matmul(hessian_rows, X) + Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows) + # Q_active_set_ii[1:] = np.matmul(hessian_rows, X) else: - Q_active_set_ii[0] = np.dot(hessian_rows, X[:, active_set_ii - 1]) - Q_active_set_ii[1:] = np.matmul((hessian_rows * X[:, active_set_ii - 1]), X) + cur_col = X[:, active_set_ii - 1] + Q_active_set_ii[0] = cur_col.transpose_matvec(hessian_rows) + Q_active_set_ii[1:] = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) + + # Q_active_set_ii[0] = X[:, active_set_ii - 1].transpose_matvec(hessian_rows) + # Q_active_set_ii[1:] = X.transpose_matvec(X[:, active_set_ii - 1].multiply(hessian_rows).ravel()) + + # Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows * X[:, active_set_ii - 1].A.ravel()) + # Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows * X.A[:, active_set_ii - 1]) + # Q_active_set_ii[0] = np.dot(hessian_rows, X[:, active_set_ii - 1]) + # Q_active_set_ii[1:] = np.matmul((hessian_rows * X[:, active_set_ii - 1]), X) # use matvec # if ii < intercept: # # Q_active_set_ii[1:] = np.matmul(hessian_rows, X[:, 1:]) @@ -309,7 +322,12 @@ def enet_coordinate_descent_gram_diag_fisher( # NB! We care about the intercept column as well, and must treat differently. # does feature_list in sklearnfork include the intercept column? 
else: - Q_active_set_ii = np.matmul((hessian_rows * X[:, active_set_ii]), X) + cur_col = X[:, active_set_ii] + Q_active_set_ii = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) + + # Q_active_set_ii = X.transpose_matvec(hessian_rows * X[:, active_set_ii].A.ravel()) + # Q_active_set_ii = X.transpose_matvec(hessian_rows * X.A[:, active_set_ii]) + # Q_active_set_ii = np.matmul((hessian_rows * X[:, active_set_ii]), X) # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii - intercept]), X) # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii]), X[:, intercept:]) # CHECK that we've covered all the cases!!!!! @@ -317,7 +335,7 @@ def enet_coordinate_descent_gram_diag_fisher( # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, intercept + active_set_ii]), X) - Q_check[active_set_ii] = Q_active_set_ii + # Q_check[active_set_ii] = Q_active_set_ii # for checking correctness # for col_idx in range(X.shape[1]): # for row_idx in range(X.shape[0]): @@ -383,7 +401,7 @@ def enet_coordinate_descent_gram_diag_fisher( "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), ConvergenceWarning) - return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1, Q_check + return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 # , Q_check def enet_coordinate_descent_gram_diag_only(int[::1] active_set, floating[::1] w, diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index 950daef8..662cd13f 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -60,13 +60,16 @@ def _least_squares_solver(state, data, hessian): def _cd_solver(state, data, active_hessian, diag_fisher=False): if diag_fisher: if data.fit_intercept: - if 0 in state.active_set: + # the following line should be equivalent to the if/else below it + # X_active = data.X[:, state.active_set[(state.active_set[0] == 0):] - 1].A + if state.active_set[0] == 0: # remove the zero, subtract one from everything else - X_active = data.X[:, state.active_set[1:] - 1].A + X_active = data.X[:, state.active_set[1:] - 1] # .A else: - X_active = data.X[:, state.active_set - 1].A + X_active = data.X[:, state.active_set - 1] # .A else: - X_active = data.X[:, state.active_set].A + # for some reason, active_set here is massive (everything!) + X_active = data.X[:, state.active_set] # .A # instead of passing in data.X, we should pass in # but remember! X is sparse. But X is not necessarily square # we may need active rows and columns @@ -77,7 +80,7 @@ def _cd_solver(state, data, active_hessian, diag_fisher=False): _, _, n_cycles, - Q_check, + # Q_check, ) = enet_coordinate_descent_gram_diag_fisher( X_active, # new state.hessian_rows, # new From 149f890ff2e8ba44f427ce78c1d08fe5f1327d88 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Wed, 18 May 2022 17:57:15 +0100 Subject: [PATCH 4/9] Cleaned and ready for review. --- src/glum/_cd_fast.pyx | 231 ++++++------------------------------------ src/glum/_glm.py | 17 ++++ src/glum/_solvers.py | 80 +++++---------- 3 files changed, 69 insertions(+), 259 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index 2d4415e7..73aea8cb 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -9,7 +9,7 @@ # # cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport fabs -cimport numpy as np # is it a problem that these two imports have same name??? 
+cimport numpy as np import numpy as np import numpy.linalg as linalg from numpy.math cimport INFINITY @@ -116,9 +116,8 @@ def enet_coordinate_descent_gram(int[::1] active_set, q = X^T y """ - # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] - cdef unsigned int n_features = Q.shape[0] # are you sure this is CORRECT? + cdef unsigned int n_features = Q.shape[0] # are you sure this is correct? # n_features is exactly now the same as n_active_features; should it be q.shape[0]? # however, the variable is never used again, so I guess we are okay @@ -142,22 +141,22 @@ def enet_coordinate_descent_gram(int[::1] active_set, for n_iter in range(max_iter): w_max = 0.0 d_w_max = 0.0 - for f_iter in range(n_active_features): # Loop over coordinates + for f_iter in range(n_active_features): if random: active_set_ii = rand_int(n_active_features, rand_r_state) else: - active_set_ii = f_iter # _ii is an index - ii = active_set[active_set_ii] # odd syntax; gets the active set's row + active_set_ii = f_iter + ii = active_set[active_set_ii] - if ii < intercept: # but `intercept` is binary!? + if ii < intercept: P1_ii = 0.0 else: P1_ii = P1[ii - intercept] - if Q[active_set_ii, active_set_ii] == 0.0: # no need to multiply zeros + if Q[active_set_ii, active_set_ii] == 0.0: continue - w_ii = w[ii] # Store previous value, the iith element of w + w_ii = w[ii] qii_temp = q[ii] - w[ii] * Q[active_set_ii, active_set_ii] w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q[active_set_ii, active_set_ii] @@ -207,15 +206,11 @@ def enet_coordinate_descent_gram(int[::1] active_set, return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 def enet_coordinate_descent_gram_diag_fisher( - #floating[:,:] X, - #floating[:] hessian_rows, - #np.ndarray[np.float64_t, ndim=2] X, X, np.ndarray[np.float64_t, ndim=1] hessian_rows, int[::1] active_set, floating[::1] w, floating[::1] P1, - floating[:,:] Q, floating[::1] q, int max_iter, floating tol, object rng, bint intercept, bint random, @@ -230,13 +225,13 @@ def enet_coordinate_descent_gram_diag_fisher( which amounts to the L1 problem when: Q = X^T X (Gram matrix) q = X^T y + + This version, for diag_fisher=True, never calculates the entire Hessian matrix; + only rows, when necessary. The inefficiency here is that we must re-calculate + the rows of Q for every iteration up to max_iter. """ - # The CD that never calculates the entire Hessian matrix; only rows, when they are - # necessary. - # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] - # cdef unsigned int n_features = q.shape[0] # check that this is the same as Q.shape[0] cdef floating w_ii cdef floating P1_ii @@ -254,111 +249,51 @@ def enet_coordinate_descent_gram_diag_fisher( cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) cdef UINT32_t* rand_r_state = &rand_r_state_seed - #cdef np.ndarray[np.float64_t, dtype=q.dtype] Qj # will hold rows of the Q matrix - # cdef floating[:] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) + # Equivalent to Q[active_set_ii] in the diag_fisher=False version cdef np.ndarray[np.float64_t, ndim=1] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) - # is empty bad? - - cdef cur_col # makes sure we can overwrite this and not take up too much memory? 
- - # used to check correctness of code - compare with Q from non-diag-fisher - # cdef np.ndarray[np.float64_t, ndim=2] Q_check = np.empty((n_active_features, n_active_features), dtype=np.float64) - # THE inefficiency here is that we must re-calculate the rows of Q for every iteration. - - # with nogil: + cdef cur_col # Does the cdef here (rather than using it as a local variable) reduce memory? + + # nogil is incompatible with our slicing, numpy, and sparse matrix operations, but + # may be necessary to increase performance + # with nogil: for n_iter in range(max_iter): w_max = 0.0 d_w_max = 0.0 - for f_iter in range(n_active_features): # Loop over coordinates + for f_iter in range(n_active_features): if random: active_set_ii = rand_int(n_active_features, rand_r_state) else: - active_set_ii = f_iter # _ii is an index - ii = active_set[active_set_ii] # odd syntax; gets an active row - - # remember, Q hasn't been created yet. - # and the only row we need for this iteration is Q[active_set_ii] - # also remember, sandwich product is between data.X, state.hessian_rows - # only use first column of X in the X^T part of the sandwich + active_set_ii = f_iter + ii = active_set[active_set_ii] - # if intercept == 1: # maybe this should go into the next if statement? - # Q_active_set_ii[0] = hessian_rows.sum() - # this is perfectly valid because hessian_rows has length 16087 - - if active_set[0] < intercept: + # Check logic here + if active_set[0] < intercept: if ii == 0: Q_active_set_ii[0] = hessian_rows.sum() Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows) - # Q_active_set_ii[1:] = np.matmul(hessian_rows, X) else: - cur_col = X[:, active_set_ii - 1] + cur_col = X[:, active_set_ii - 1] # Does this reduce memory? Q_active_set_ii[0] = cur_col.transpose_matvec(hessian_rows) Q_active_set_ii[1:] = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) - - # Q_active_set_ii[0] = X[:, active_set_ii - 1].transpose_matvec(hessian_rows) - # Q_active_set_ii[1:] = X.transpose_matvec(X[:, active_set_ii - 1].multiply(hessian_rows).ravel()) - - # Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows * X[:, active_set_ii - 1].A.ravel()) - # Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows * X.A[:, active_set_ii - 1]) - # Q_active_set_ii[0] = np.dot(hessian_rows, X[:, active_set_ii - 1]) - # Q_active_set_ii[1:] = np.matmul((hessian_rows * X[:, active_set_ii - 1]), X) # use matvec - - # if ii < intercept: - # # Q_active_set_ii[1:] = np.matmul(hessian_rows, X[:, 1:]) - # # Q_active_set_ii[0] = hessian_rows.sum() - # Q_active_set_ii[1:] = np.matmul(hessian_rows, X) - - # the next condition in the original code checks for sparseness. - # we won't worry about that here. but we will have to go elem by elem - # actually no, we don't have to go elem by elem. Cython can handle. - # do i have to worry about intercept term? or does X already ignore it? - - # what about the active features? do we only need to multiply by those? - # correct. i believe we only need to multiply by those - # so X can actually be passed in as X[(active_set_ii) cartesian product (active_set_ii)], right? - # the only problem is that it would mess up the active_set_ii / active_set_jj logic... - - # NB! We care about the intercept column as well, and must treat differently. - # does feature_list in sklearnfork include the intercept column? 
else: cur_col = X[:, active_set_ii] Q_active_set_ii = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) - # Q_active_set_ii = X.transpose_matvec(hessian_rows * X[:, active_set_ii].A.ravel()) - # Q_active_set_ii = X.transpose_matvec(hessian_rows * X.A[:, active_set_ii]) - # Q_active_set_ii = np.matmul((hessian_rows * X[:, active_set_ii]), X) - # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii - intercept]), X) - # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, active_set_ii]), X[:, intercept:]) - # CHECK that we've covered all the cases!!!!! - - - # Q_active_set_ii[intercept:] = np.matmul((hessian_rows * X[:, intercept + active_set_ii]), X) - - # Q_check[active_set_ii] = Q_active_set_ii # for checking correctness - - # for col_idx in range(X.shape[1]): - # for row_idx in range(X.shape[0]): - # Q_active_set_ii[intercept + col_idx] = - - # the next if statement checks for P2, but we're only using this for L1 - # regularization, so we don't need to worry about it. and that's all! - - # If the above works, can merge the following into the above if-else!!!! - if ii < intercept: # equiv. to (ii=0 and intercept=1) + if ii < intercept: P1_ii = 0.0 else: P1_ii = P1[ii - intercept] - if Q_active_set_ii[active_set_ii] == 0.0: # no need to multiply zeros + if Q_active_set_ii[active_set_ii] == 0.0: continue - w_ii = w[ii] # Store previous value, the iith element of w + w_ii = w[ii] - qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] ## + qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q_active_set_ii[active_set_ii] - if ii >= intercept: # this is big-brain logic + if ii >= intercept: if has_lower_bounds: if w[ii] < lower_bounds[ii - intercept]: w[ii] = lower_bounds[ii - intercept] @@ -370,7 +305,6 @@ def enet_coordinate_descent_gram_diag_fisher( # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) for active_set_jj in range(n_active_features): jj = active_set[active_set_jj] - # this is equivalent to A += Bj * z q[jj] += (w[ii] - w_ii) * Q_active_set_ii[active_set_jj] # update the maximum absolute coefficient update @@ -403,113 +337,6 @@ def enet_coordinate_descent_gram_diag_fisher( return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 # , Q_check -def enet_coordinate_descent_gram_diag_only(int[::1] active_set, - floating[::1] w, - floating[::1] P1, - floating[:,:] Q, - floating[::1] q, - int max_iter, floating tol, object rng, - bint intercept, bint random, - bint has_lower_bounds, - floating[:] lower_bounds, - bint has_upper_bounds, - floating[:] upper_bounds): - """Cython version of the coordinate descent algorithm - for Elastic-Net regression - We minimize - (1/2) * w^T Q w - q^T w + P1 norm(w, 1) - which amount to the Elastic-Net problem when: - Q = X^T X (Gram matrix) - q = X^T y - """ - # In this version, we don't access the off-diagonal elements. And see what happens. 
- - # get the data information into easy vars - cdef unsigned int n_active_features = active_set.shape[0] - cdef unsigned int n_features = Q.shape[0] - - cdef floating w_ii - cdef floating P1_ii - cdef floating qii_temp - cdef floating d_w_max - cdef floating w_max - cdef floating d_w_ii - cdef floating d_w_tol = tol - cdef floating norm_min_subgrad = 0 - cdef floating max_min_subgrad - cdef unsigned int active_set_ii, active_set_jj - cdef unsigned int ii, jj - cdef int n_iter = 0 - cdef unsigned int f_iter - cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) - cdef UINT32_t* rand_r_state = &rand_r_state_seed - - with nogil: - for n_iter in range(max_iter): - w_max = 0.0 - d_w_max = 0.0 - for f_iter in range(n_active_features): # Loop over coordinates - if random: - active_set_ii = rand_int(n_active_features, rand_r_state) - else: - active_set_ii = f_iter - ii = active_set[active_set_ii] - - if ii < intercept: - P1_ii = 0.0 - else: - P1_ii = P1[ii - intercept] - - if Q[active_set_ii, active_set_ii] == 0.0: - continue - - w_ii = w[ii] # Store previous value - - qii_temp = q[ii] - w[ii] * Q[active_set_ii, active_set_ii] - w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q[active_set_ii, active_set_ii] - - if ii >= intercept: - if has_lower_bounds: - if w[ii] < lower_bounds[ii - intercept]: - w[ii] = lower_bounds[ii - intercept] - if has_upper_bounds: - if w[ii] > upper_bounds[ii - intercept]: - w[ii] = upper_bounds[ii - intercept] - - # if w[ii] != 0.0 or w_ii != 0.0: - # for active_set_jj in range(n_active_features): - # jj = active_set[active_set_jj] - # q[jj] += (w[ii] - w_ii) * Q[active_set_ii, active_set_jj] - - # update the maximum absolute coefficient update - d_w_ii = fabs(w[ii] - w_ii) - if d_w_ii > d_w_max: - d_w_max = d_w_ii - - if fabs(w[ii]) > w_max: - w_max = fabs(w[ii]) - - if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: - # the biggest coordinate update of this iteration was smaller than - # the tolerance: check the minimum norm subgradient as the - # ultimate stopping criterion - cython_norm_min_subgrad( - active_set, - w, q, P1, intercept, - has_lower_bounds, lower_bounds, has_upper_bounds, upper_bounds, - &norm_min_subgrad, &max_min_subgrad - ) - if norm_min_subgrad <= tol: - break - else: - # for/else, runs if for doesn't end with a `break` - with gil: - warnings.warn("Coordinate descent did not converge. You might want to " - "increase the number of iterations. 
Minimum norm " - "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), - ConvergenceWarning) - - return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 cdef void cython_norm_min_subgrad( int[::1] active_set, diff --git a/src/glum/_glm.py b/src/glum/_glm.py index a4bd39f9..381f453a 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -672,6 +672,7 @@ def __init__( A_ineq: Optional[np.ndarray] = None, b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, + diag_fisher=False, ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -701,6 +702,7 @@ def __init__( self.A_ineq = A_ineq self.b_ineq = b_ineq self.force_all_finite = force_all_finite + self.diag_fisher = diag_fisher @property def family_instance(self) -> ExponentialDispersionModel: @@ -969,6 +971,7 @@ def _solve( lower_bounds=lower_bounds, upper_bounds=upper_bounds, verbose=self.verbose > 0, + diag_fisher=self.diag_fisher, ) if self._solver == "irls-ls": coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver( @@ -976,6 +979,7 @@ def _solve( ) # 4.2 coordinate descent ############################################## elif self._solver == "irls-cd": + # [Alan] This is the case we're concerned with for wide problems. coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver( _cd_solver, coef, irls_data ) @@ -2088,6 +2092,17 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): ``A_ineq w <= b_ineq``. Refer to the documentation of ``A_ineq`` for details. + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + (Text taken directly from the orig_sklearn_fork.) 
+ Attributes ---------- coef_ : numpy.array, shape (n_features,) @@ -2168,6 +2183,7 @@ def __init__( A_ineq: Optional[np.ndarray] = None, b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, + diag_fisher=False, ): self.alphas = alphas self.alpha = alpha @@ -2200,6 +2216,7 @@ def __init__( A_ineq=A_ineq, b_ineq=b_ineq, force_all_finite=force_all_finite, + diag_fisher=diag_fisher, ) def _validate_hyperparameters(self) -> None: diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index 662cd13f..1363f80c 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -57,37 +57,24 @@ def _least_squares_solver(state, data, hessian): @timeit("inner_solver_runtime") -def _cd_solver(state, data, active_hessian, diag_fisher=False): - if diag_fisher: +def _cd_solver(state, data, active_hessian): + if data.diag_fisher: if data.fit_intercept: - # the following line should be equivalent to the if/else below it - # X_active = data.X[:, state.active_set[(state.active_set[0] == 0):] - 1].A if state.active_set[0] == 0: - # remove the zero, subtract one from everything else - X_active = data.X[:, state.active_set[1:] - 1] # .A + # if the intercept column of X is in the active set, leave it out + # (we deal with intercepts separately in _cd_fast.pyx) + X_active = data.X[:, state.active_set[1:] - 1] else: - X_active = data.X[:, state.active_set - 1] # .A + X_active = data.X[:, state.active_set - 1] else: - # for some reason, active_set here is massive (everything!) - X_active = data.X[:, state.active_set] # .A - # instead of passing in data.X, we should pass in - # but remember! X is sparse. But X is not necessarily square - # we may need active rows and columns - # X_active = data.X[:, state.active_set].A - ( - new_coef, - gap, - _, - _, - n_cycles, - # Q_check, - ) = enet_coordinate_descent_gram_diag_fisher( - X_active, # new - state.hessian_rows, # new + # with no intercept, active_set here can be massive on first iteration + X_active = data.X[:, state.active_set] + (new_coef, gap, _, _, n_cycles,) = enet_coordinate_descent_gram_diag_fisher( + X_active, + state.hessian_rows, state.active_set, state.coef.copy(), data.P1, - active_hessian, -state.score, data.max_inner_iter, state.inner_tol, @@ -187,14 +174,13 @@ def update_hessian(state, data, active_set): # 3) Include the P2 components # 4) just like an update, we only update the active_set - # this is not needed for diag_fisher! hessian_init = build_hessian_delta( data.X, - state.hessian_rows, # this is the d that goes into sandwich + state.hessian_rows, data.fit_intercept, data.P2, - np.arange(data.X.shape[0], dtype=np.int32), # this uses all rows (16087) - active_set, # (17650) + np.arange(data.X.shape[0], dtype=np.int32), + active_set, ) # In the sparse Hessian case, state.hessian is a coo_matrix. @@ -214,7 +200,6 @@ def update_hessian(state, data, active_set): state.hessian_initialized = True n_active_rows = data.X.shape[0] - # fortunately, none of these above variables are required else: @@ -225,8 +210,6 @@ def update_hessian(state, data, active_set): # already been added # 4) only update the active set subset of the hessian. 
- # in the diag_fisher case, we don't want to use the diff, we use the whole thing - hessian_rows_diff, active_rows = identify_active_rows( state.gradient_rows, state.hessian_rows, @@ -260,6 +243,7 @@ def update_hessian(state, data, active_set): # active set elements, which we then convert to a numpy array for compatibility with # the rest of the code if state.use_sparse_hessian: + # this is a little awkward return ( state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(), n_active_rows, @@ -291,8 +275,6 @@ def build_hessian_delta( """ idx = 1 if intercept else 0 active_cols_non_intercept = active_cols[idx:] - idx - # Here is the safe sandwich dot that calls sandwich - # active_cols is small even for a large number of columns (2304 for 1M) delta = _safe_sandwich_dot( X, hessian_rows, active_rows, active_cols_non_intercept, intercept ) @@ -396,35 +378,17 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L state.old_active_set = state.active_set state.active_set = identify_active_set(state, data) - diag_fisher = True - - if not diag_fisher: + if not data.diag_fisher: # 0) Build the hessian hessian, state.n_active_rows = update_hessian( state, data, state.active_set ) - # state.n_active_rows never gets used again in this function - # may be used in logging, but we'll ignore for diag_fisher case - # 1) Solve the L1 and L2 penalized least squares problem - d, n_cycles_this_iter = inner_solver( - state, data, hessian, diag_fisher=diag_fisher - ) - state.n_cycles += n_cycles_this_iter - + d, n_cycles_this_iter = inner_solver(state, data, hessian) else: - # we need a hessian update step here - # but then we'd store mxm, which is undesired - # we'll instead store hessian diag, but have to update it first - # update_hessian_diag_fisher() # will update state.hessian_rows - # not needed! because of update_quadratic() below - - # sandwich product is between data.X, state.hessian_rows - # only use first row of X in the X^T part of the sandwich - d, n_cycles_this_iter = inner_solver( - state, data, None, diag_fisher=diag_fisher - ) - state.n_cycles += n_cycles_this_iter + # 0 & 1) Build the hessian row by row, solving as we go + d, n_cycles_this_iter = inner_solver(state, data, None) + state.n_cycles += n_cycles_this_iter # 2) Line search state.old_hessian_rows[:] = state.hessian_rows @@ -441,7 +405,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L # 3) Update the quadratic approximation state.gradient_rows, state.score, state.hessian_rows = update_quadratic( state, data, coef_P2 - ) # is this where we update state.hessian_rows?? 
if so, no need for diff :) + ) # 4) Check if we've converged ( @@ -544,6 +508,7 @@ def __init__( lower_bounds: Optional[np.ndarray] = None, upper_bounds: Optional[np.ndarray] = None, verbose: bool = False, + diag_fisher: bool = False, ): self.X = X self.y = y @@ -575,6 +540,7 @@ def __init__( self.intercept_offset = 1 if self.fit_intercept else 0 self.verbose = verbose + self.diag_fisher = diag_fisher self._check_data() From a58dfa1bad56d57943b8dd8c1dda8810a363de59 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 20:37:26 +0100 Subject: [PATCH 5/9] Address comments; about to split into two PRs --- src/glum_benchmarks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum_benchmarks/README.md b/src/glum_benchmarks/README.md index c3ac43e2..23321713 100644 --- a/src/glum_benchmarks/README.md +++ b/src/glum_benchmarks/README.md @@ -6,7 +6,7 @@ Python package to benchmark GLM implementations. ## Running the benchmarks -After installing the package, you should have two CLI tools: `glm_benchmarks_run` and `glm_benchmarks_analyze`. Use the `--help` flag for full details. Look in `src/glum/problems.py` to see the list of problems that will be run through each library. +After installing the package, you should have two CLI tools: `glm_benchmarks_run` and `glm_benchmarks_analyze`. Use the `--help` flag for full details. Look in `src/glum/glum_benchmarks/problems.py` to see the list of problems that will be run through each library. To run the full benchmarking suite, just run `glm_benchmarks_run` with no flags. This will probably take a very long time. From c5f566774bd53bf05f229c957a0745b409039dda Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 20:39:58 +0100 Subject: [PATCH 6/9] Address comments; about to split into two PRs --- src/glum/_cd_fast.pyx | 14 +++++--------- src/glum/_glm.py | 17 ++++++++++++++--- src/glum/_solvers.py | 24 +++++++++++++----------- 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index 73aea8cb..a75c33ea 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -117,9 +117,6 @@ def enet_coordinate_descent_gram(int[::1] active_set, """ cdef unsigned int n_active_features = active_set.shape[0] - cdef unsigned int n_features = Q.shape[0] # are you sure this is correct? - # n_features is exactly now the same as n_active_features; should it be q.shape[0]? - # however, the variable is never used again, so I guess we are okay cdef floating w_ii cdef floating P1_ii @@ -141,7 +138,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, for n_iter in range(max_iter): w_max = 0.0 d_w_max = 0.0 - for f_iter in range(n_active_features): + for f_iter in range(n_active_features): # Loop over coordinates if random: active_set_ii = rand_int(n_active_features, rand_r_state) else: @@ -227,8 +224,8 @@ def enet_coordinate_descent_gram_diag_fisher( q = X^T y This version, for diag_fisher=True, never calculates the entire Hessian matrix; - only rows, when necessary. The inefficiency here is that we must re-calculate - the rows of Q for every iteration up to max_iter. + only rows, when necessary. This requires less memory. However, the inefficiency + is that we must re-calculate the rows of Q for every iteration up to max_iter. 
""" cdef unsigned int n_active_features = active_set.shape[0] @@ -260,14 +257,13 @@ def enet_coordinate_descent_gram_diag_fisher( for n_iter in range(max_iter): w_max = 0.0 d_w_max = 0.0 - for f_iter in range(n_active_features): + for f_iter in range(n_active_features): # Loop over coordinates if random: active_set_ii = rand_int(n_active_features, rand_r_state) else: active_set_ii = f_iter ii = active_set[active_set_ii] - # Check logic here if active_set[0] < intercept: if ii == 0: Q_active_set_ii[0] = hessian_rows.sum() @@ -335,7 +331,7 @@ def enet_coordinate_descent_gram_diag_fisher( "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), ConvergenceWarning) - return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 # , Q_check + return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 cdef void cython_norm_min_subgrad( diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 381f453a..e54140f0 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -672,7 +672,8 @@ def __init__( A_ineq: Optional[np.ndarray] = None, b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, - diag_fisher=False, + use_sparse_hessian: bool = True, + diag_fisher: bool = False, ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -702,6 +703,7 @@ def __init__( self.A_ineq = A_ineq self.b_ineq = b_ineq self.force_all_finite = force_all_finite + self.use_sparse_hessian = use_sparse_hessian self.diag_fisher = diag_fisher @property @@ -971,6 +973,7 @@ def _solve( lower_bounds=lower_bounds, upper_bounds=upper_bounds, verbose=self.verbose > 0, + use_sparse_hessian=self.use_sparse_hessian, diag_fisher=self.diag_fisher, ) if self._solver == "irls-ls": @@ -979,7 +982,7 @@ def _solve( ) # 4.2 coordinate descent ############################################## elif self._solver == "irls-cd": - # [Alan] This is the case we're concerned with for wide problems. + # This is the case we're concerned with for wide problems. coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver( _cd_solver, coef, irls_data ) @@ -2092,6 +2095,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): ``A_ineq w <= b_ineq``. Refer to the documentation of ``A_ineq`` for details. + use_sparse_hessian : boolean, optional, (default=True) + If ``True``, stores the current state of the Hessian as a sparse COO matrix. + If ``False``, stores as a dense matrix of size `num_cols` x `num_cols`, where + `num_cols` is the number of columns in the data X. Use ``True`` to avoid memory + issues when working with extremely wide data. + diag_fisher : boolean, optional, (default=False) Only relevant for solver 'cd' (see also ``start_params='guess'``). 
If ``False``, the full Fisher matrix (expected Hessian) is computed in @@ -2183,7 +2192,8 @@ def __init__( A_ineq: Optional[np.ndarray] = None, b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, - diag_fisher=False, + use_sparse_hessian: bool = True, + diag_fisher: bool = False, ): self.alphas = alphas self.alpha = alpha @@ -2216,6 +2226,7 @@ def __init__( A_ineq=A_ineq, b_ineq=b_ineq, force_all_finite=force_all_finite, + use_sparse_hessian=use_sparse_hessian, diag_fisher=diag_fisher, ) diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index 1363f80c..b1350e39 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -69,6 +69,7 @@ def _cd_solver(state, data, active_hessian): else: # with no intercept, active_set here can be massive on first iteration X_active = data.X[:, state.active_set] + (new_coef, gap, _, _, n_cycles,) = enet_coordinate_descent_gram_diag_fisher( X_active, state.hessian_rows, @@ -103,6 +104,7 @@ def _cd_solver(state, data, active_hessian): data.has_upper_bounds, data._upper_bounds, ) + return new_coef - state.coef, n_cycles @@ -184,10 +186,10 @@ def update_hessian(state, data, active_set): ) # In the sparse Hessian case, state.hessian is a coo_matrix. - # hessian_init is a numpy array of size active_set x active_set; + # hessian_init is np array of size active_set.shape[0] x active_set.shape[0]; # we can convert this to a coo_matrix and simply add it to the state.hessian - if state.use_sparse_hessian: - coo_data = hessian_init.flatten() + if data.use_sparse_hessian: + coo_data = hessian_init.ravel(order="C") coo_rows = np.repeat(active_set, active_set.shape[0]) coo_cols = np.tile(active_set, active_set.shape[0]) @@ -225,8 +227,8 @@ def update_hessian(state, data, active_set): active_cols=active_set, ) - if state.use_sparse_hessian: - coo_data = hessian_delta.flatten() + if data.use_sparse_hessian: + coo_data = hessian_delta.ravel(order="C") coo_rows = np.repeat(active_set, active_set.shape[0]) coo_cols = np.tile(active_set, active_set.shape[0]) @@ -241,9 +243,9 @@ def update_hessian(state, data, active_set): # If state.hessian is in COO form, we have to convert to CSC in order to index the # active set elements, which we then convert to a numpy array for compatibility with - # the rest of the code - if state.use_sparse_hessian: - # this is a little awkward + # the rest of the code. This numpy array is dense, but we care about problems in + # which the active set is usually much smaller than the number of data columns. + if data.use_sparse_hessian: return ( state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(), n_active_rows, @@ -332,7 +334,6 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L Repeat steps 1-3 until convergence. Note: Use hessian matrix instead of Hessian for H. - This used to say "Use Fisher matrix instead of Hessian for H." Note: f' = -score, H = hessian matrix Parameters @@ -508,6 +509,7 @@ def __init__( lower_bounds: Optional[np.ndarray] = None, upper_bounds: Optional[np.ndarray] = None, verbose: bool = False, + use_sparse_hessian: bool = True, diag_fisher: bool = False, ): self.X = X @@ -540,6 +542,7 @@ def __init__( self.intercept_offset = 1 if self.fit_intercept else 0 self.verbose = verbose + self.use_sparse_hessian = use_sparse_hessian self.diag_fisher = diag_fisher self._check_data() @@ -606,8 +609,7 @@ def __init__(self, coef, data): # In order to avoid memory issues for very wide datasets (e.g. Issue #485), we # can instantiate the Hessian as a sparse COO matrix. 
- self.use_sparse_hessian = True - if self.use_sparse_hessian: + if data.use_sparse_hessian: self.hessian = sparse.coo_matrix( (self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype ) From 684de67a47c00568a20f8187c6508f292bfe8b84 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 20:46:39 +0100 Subject: [PATCH 7/9] Put back some old comments that were mistakenly removed --- src/glum/_cd_fast.pyx | 8 +++++--- src/glum_benchmarks/README.md | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index a75c33ea..d40b41f4 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -111,11 +111,12 @@ def enet_coordinate_descent_gram(int[::1] active_set, for Elastic-Net regression We minimize (1/2) * w^T Q w - q^T w + P1 norm(w, 1) - which amounts to the L1 problem when: + which amounts to the Elastic-Net problem when: Q = X^T X (Gram matrix) q = X^T y """ + # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] cdef floating w_ii @@ -153,7 +154,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, if Q[active_set_ii, active_set_ii] == 0.0: continue - w_ii = w[ii] + w_ii = w[ii] # Store previous value qii_temp = q[ii] - w[ii] * Q[active_set_ii, active_set_ii] w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q[active_set_ii, active_set_ii] @@ -228,6 +229,7 @@ def enet_coordinate_descent_gram_diag_fisher( is that we must re-calculate the rows of Q for every iteration up to max_iter. """ + # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] cdef floating w_ii @@ -284,7 +286,7 @@ def enet_coordinate_descent_gram_diag_fisher( if Q_active_set_ii[active_set_ii] == 0.0: continue - w_ii = w[ii] + w_ii = w[ii] # Store previous value qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q_active_set_ii[active_set_ii] diff --git a/src/glum_benchmarks/README.md b/src/glum_benchmarks/README.md index 23321713..c3ac43e2 100644 --- a/src/glum_benchmarks/README.md +++ b/src/glum_benchmarks/README.md @@ -6,7 +6,7 @@ Python package to benchmark GLM implementations. ## Running the benchmarks -After installing the package, you should have two CLI tools: `glm_benchmarks_run` and `glm_benchmarks_analyze`. Use the `--help` flag for full details. Look in `src/glum/glum_benchmarks/problems.py` to see the list of problems that will be run through each library. +After installing the package, you should have two CLI tools: `glm_benchmarks_run` and `glm_benchmarks_analyze`. Use the `--help` flag for full details. Look in `src/glum/problems.py` to see the list of problems that will be run through each library. To run the full benchmarking suite, just run `glm_benchmarks_run` with no flags. This will probably take a very long time. 
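For reference, the COO bookkeeping that update_hessian performs in the hunks above can be exercised on its own. The following sketch is not part of the patch series; it only illustrates, under assumed names (active_set holding the active column indices, hessian_block a dense len(active_set) x len(active_set) array, n_coef the total coefficient count), how a dense active-set block is scattered into the full sparse Hessian and later read back densely through CSC indexing.

import numpy as np
from scipy import sparse

def scatter_active_block(hessian_sparse, hessian_block, active_set, n_coef):
    # Flatten the dense block row-major and build matching (row, col) pairs
    # with repeat/tile, so entry (active_set[i], active_set[j]) receives
    # hessian_block[i, j] -- the same pattern used in update_hessian above.
    k = active_set.shape[0]
    coo_data = hessian_block.ravel(order="C")
    coo_rows = np.repeat(active_set, k)
    coo_cols = np.tile(active_set, k)
    delta = sparse.coo_matrix(
        (coo_data, (coo_rows, coo_cols)), shape=(n_coef, n_coef)
    )
    # SciPy converts to a compressed format internally to perform the addition.
    return hessian_sparse + delta

def read_active_block(hessian_sparse, active_set):
    # COO does not support fancy indexing, so convert to CSC first and densify
    # only the (small) active-set block that is handed to the inner solver.
    return hessian_sparse.tocsc()[np.ix_(active_set, active_set)].toarray()

Because the active set is typically much smaller than the number of columns, the densified block stays modest even when n_coef is large, which is the regime the sparse-Hessian option targets.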
From 4d628a4f0b7576cb90d8d7f11d1cceb2fae28f16 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 20:48:09 +0100 Subject: [PATCH 8/9] One-letter fix --- src/glum/_cd_fast.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index d40b41f4..5d5d702e 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -111,7 +111,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, for Elastic-Net regression We minimize (1/2) * w^T Q w - q^T w + P1 norm(w, 1) - which amounts to the Elastic-Net problem when: + which amount to the Elastic-Net problem when: Q = X^T X (Gram matrix) q = X^T y """ From 39c2b10b886f53bf135daa48686c71bb98424850 Mon Sep 17 00:00:00 2001 From: Alan Tu Date: Thu, 19 May 2022 21:41:35 +0100 Subject: [PATCH 9/9] use_sparse_hessian code isolated in this branch --- src/glum/_cd_fast.pyx | 134 +----------------------------------------- src/glum/_glm.py | 16 ----- src/glum/_solvers.py | 81 +++++++------------------ 3 files changed, 22 insertions(+), 209 deletions(-) diff --git a/src/glum/_cd_fast.pyx b/src/glum/_cd_fast.pyx index 5d5d702e..b4bcf19b 100644 --- a/src/glum/_cd_fast.pyx +++ b/src/glum/_cd_fast.pyx @@ -118,6 +118,7 @@ def enet_coordinate_descent_gram(int[::1] active_set, # get the data information into easy vars cdef unsigned int n_active_features = active_set.shape[0] + cdef unsigned int n_features = Q.shape[0] cdef floating w_ii cdef floating P1_ii @@ -203,139 +204,6 @@ def enet_coordinate_descent_gram(int[::1] active_set, return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 -def enet_coordinate_descent_gram_diag_fisher( - X, - np.ndarray[np.float64_t, ndim=1] hessian_rows, - int[::1] active_set, - floating[::1] w, - floating[::1] P1, - floating[::1] q, - int max_iter, floating tol, object rng, - bint intercept, bint random, - bint has_lower_bounds, - floating[:] lower_bounds, - bint has_upper_bounds, - floating[:] upper_bounds): - """Cython version of the coordinate descent algorithm - for Elastic-Net regression - We minimize - (1/2) * w^T Q w - q^T w + P1 norm(w, 1) - which amounts to the L1 problem when: - Q = X^T X (Gram matrix) - q = X^T y - - This version, for diag_fisher=True, never calculates the entire Hessian matrix; - only rows, when necessary. This requires less memory. However, the inefficiency - is that we must re-calculate the rows of Q for every iteration up to max_iter. - """ - - # get the data information into easy vars - cdef unsigned int n_active_features = active_set.shape[0] - - cdef floating w_ii - cdef floating P1_ii - cdef floating qii_temp - cdef floating d_w_max - cdef floating w_max - cdef floating d_w_ii - cdef floating d_w_tol = tol - cdef floating norm_min_subgrad = 0 - cdef floating max_min_subgrad - cdef unsigned int active_set_ii, active_set_jj - cdef unsigned int ii, jj - cdef int n_iter = 0 - cdef unsigned int f_iter - cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) - cdef UINT32_t* rand_r_state = &rand_r_state_seed - - # Equivalent to Q[active_set_ii] in the diag_fisher=False version - cdef np.ndarray[np.float64_t, ndim=1] Q_active_set_ii = np.empty(n_active_features, dtype=np.float64) - - cdef cur_col # Does the cdef here (rather than using it as a local variable) reduce memory? 
- - # nogil is incompatible with our slicing, numpy, and sparse matrix operations, but - # may be necessary to increase performance - # with nogil: - for n_iter in range(max_iter): - w_max = 0.0 - d_w_max = 0.0 - for f_iter in range(n_active_features): # Loop over coordinates - if random: - active_set_ii = rand_int(n_active_features, rand_r_state) - else: - active_set_ii = f_iter - ii = active_set[active_set_ii] - - if active_set[0] < intercept: - if ii == 0: - Q_active_set_ii[0] = hessian_rows.sum() - Q_active_set_ii[1:] = X.transpose_matvec(hessian_rows) - else: - cur_col = X[:, active_set_ii - 1] # Does this reduce memory? - Q_active_set_ii[0] = cur_col.transpose_matvec(hessian_rows) - Q_active_set_ii[1:] = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) - else: - cur_col = X[:, active_set_ii] - Q_active_set_ii = X.transpose_matvec(cur_col.multiply(hessian_rows).ravel()) - - if ii < intercept: - P1_ii = 0.0 - else: - P1_ii = P1[ii - intercept] - - if Q_active_set_ii[active_set_ii] == 0.0: - continue - - w_ii = w[ii] # Store previous value - - qii_temp = q[ii] - w[ii] * Q_active_set_ii[active_set_ii] - w[ii] = fsign(-qii_temp) * fmax(fabs(qii_temp) - P1_ii, 0) / Q_active_set_ii[active_set_ii] - - if ii >= intercept: - if has_lower_bounds: - if w[ii] < lower_bounds[ii - intercept]: - w[ii] = lower_bounds[ii - intercept] - if has_upper_bounds: - if w[ii] > upper_bounds[ii - intercept]: - w[ii] = upper_bounds[ii - intercept] - - if w[ii] != 0.0 or w_ii != 0.0: - # q += (w[ii] - w_ii) * Q[ii] # Update q = X.T (X w - y) - for active_set_jj in range(n_active_features): - jj = active_set[active_set_jj] - q[jj] += (w[ii] - w_ii) * Q_active_set_ii[active_set_jj] - - # update the maximum absolute coefficient update - d_w_ii = fabs(w[ii] - w_ii) - if d_w_ii > d_w_max: - d_w_max = d_w_ii - - if fabs(w[ii]) > w_max: - w_max = fabs(w[ii]) - - if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: - # the biggest coordinate update of this iteration was smaller than - # the tolerance: check the minimum norm subgradient as the - # ultimate stopping criterion - cython_norm_min_subgrad( - active_set, - w, q, P1, intercept, - has_lower_bounds, lower_bounds, has_upper_bounds, upper_bounds, - &norm_min_subgrad, &max_min_subgrad - ) - if norm_min_subgrad <= tol: - break - else: - # for/else, runs if for doesn't end with a `break` - # with gil: - warnings.warn("Coordinate descent did not converge. You might want to " - "increase the number of iterations. 
Minimum norm " - "subgradient: {}, tolerance: {}".format(norm_min_subgrad, tol), - ConvergenceWarning) - - return np.asarray(w), norm_min_subgrad, max_min_subgrad, tol, n_iter + 1 - - cdef void cython_norm_min_subgrad( int[::1] active_set, floating[::1] coef, diff --git a/src/glum/_glm.py b/src/glum/_glm.py index e54140f0..6e12cc25 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -673,7 +673,6 @@ def __init__( b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, use_sparse_hessian: bool = True, - diag_fisher: bool = False, ): self.l1_ratio = l1_ratio self.P1 = P1 @@ -704,7 +703,6 @@ def __init__( self.b_ineq = b_ineq self.force_all_finite = force_all_finite self.use_sparse_hessian = use_sparse_hessian - self.diag_fisher = diag_fisher @property def family_instance(self) -> ExponentialDispersionModel: @@ -974,7 +972,6 @@ def _solve( upper_bounds=upper_bounds, verbose=self.verbose > 0, use_sparse_hessian=self.use_sparse_hessian, - diag_fisher=self.diag_fisher, ) if self._solver == "irls-ls": coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver( @@ -2101,17 +2098,6 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase): `num_cols` is the number of columns in the data X. Use ``True`` to avoid memory issues when working with extremely wide data. - diag_fisher : boolean, optional, (default=False) - Only relevant for solver 'cd' (see also ``start_params='guess'``). - If ``False``, the full Fisher matrix (expected Hessian) is computed in - each outer iteration (Newton iteration). If ``True``, only a diagonal - matrix (stored as 1d array) is computed, such that - fisher = X.T @ diag @ X. This saves memory and matrix-matrix - multiplications, but needs more matrix-vector multiplications. If you - use large sparse X or if you have many features, - i.e. n_features >> n_samples, you might set this option to ``True``. - (Text taken directly from the orig_sklearn_fork.) 
- Attributes ---------- coef_ : numpy.array, shape (n_features,) @@ -2193,7 +2179,6 @@ def __init__( b_ineq: Optional[np.ndarray] = None, force_all_finite: bool = True, use_sparse_hessian: bool = True, - diag_fisher: bool = False, ): self.alphas = alphas self.alpha = alpha @@ -2227,7 +2212,6 @@ def __init__( b_ineq=b_ineq, force_all_finite=force_all_finite, use_sparse_hessian=use_sparse_hessian, - diag_fisher=diag_fisher, ) def _validate_hyperparameters(self) -> None: diff --git a/src/glum/_solvers.py b/src/glum/_solvers.py index b1350e39..e05e2113 100644 --- a/src/glum/_solvers.py +++ b/src/glum/_solvers.py @@ -15,7 +15,6 @@ from ._cd_fast import ( _norm_min_subgrad, enet_coordinate_descent_gram, - enet_coordinate_descent_gram_diag_fisher, identify_active_rows, ) from ._distribution import ExponentialDispersionModel @@ -58,53 +57,22 @@ def _least_squares_solver(state, data, hessian): @timeit("inner_solver_runtime") def _cd_solver(state, data, active_hessian): - if data.diag_fisher: - if data.fit_intercept: - if state.active_set[0] == 0: - # if the intercept column of X is in the active set, leave it out - # (we deal with intercepts separately in _cd_fast.pyx) - X_active = data.X[:, state.active_set[1:] - 1] - else: - X_active = data.X[:, state.active_set - 1] - else: - # with no intercept, active_set here can be massive on first iteration - X_active = data.X[:, state.active_set] - - (new_coef, gap, _, _, n_cycles,) = enet_coordinate_descent_gram_diag_fisher( - X_active, - state.hessian_rows, - state.active_set, - state.coef.copy(), - data.P1, - -state.score, - data.max_inner_iter, - state.inner_tol, - data.random_state, - data.fit_intercept, - data.selection == "random", - data.has_lower_bounds, - data._lower_bounds, - data.has_upper_bounds, - data._upper_bounds, - ) - else: - new_coef, gap, _, _, n_cycles = enet_coordinate_descent_gram( - state.active_set, - state.coef.copy(), - data.P1, - active_hessian, - -state.score, - data.max_inner_iter, - state.inner_tol, - data.random_state, - data.fit_intercept, - data.selection == "random", - data.has_lower_bounds, - data._lower_bounds, - data.has_upper_bounds, - data._upper_bounds, - ) - + new_coef, gap, _, _, n_cycles = enet_coordinate_descent_gram( + state.active_set, + state.coef.copy(), + data.P1, + active_hessian, + -state.score, + data.max_inner_iter, + state.inner_tol, + data.random_state, + data.fit_intercept, + data.selection == "random", + data.has_lower_bounds, + data._lower_bounds, + data.has_upper_bounds, + data._upper_bounds, + ) return new_coef - state.coef, n_cycles @@ -379,16 +347,11 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L state.old_active_set = state.active_set state.active_set = identify_active_set(state, data) - if not data.diag_fisher: - # 0) Build the hessian - hessian, state.n_active_rows = update_hessian( - state, data, state.active_set - ) - # 1) Solve the L1 and L2 penalized least squares problem - d, n_cycles_this_iter = inner_solver(state, data, hessian) - else: - # 0 & 1) Build the hessian row by row, solving as we go - d, n_cycles_this_iter = inner_solver(state, data, None) + # 0) Build the hessian + hessian, state.n_active_rows = update_hessian(state, data, state.active_set) + + # 1) Solve the L1 and L2 penalized least squares problem + d, n_cycles_this_iter = inner_solver(state, data, hessian) state.n_cycles += n_cycles_this_iter # 2) Line search @@ -510,7 +473,6 @@ def __init__( upper_bounds: Optional[np.ndarray] = None, verbose: bool = False, use_sparse_hessian: bool 
= True, - diag_fisher: bool = False, ): self.X = X self.y = y @@ -543,7 +505,6 @@ def __init__( self.intercept_offset = 1 if self.fit_intercept else 0 self.verbose = verbose self.use_sparse_hessian = use_sparse_hessian - self.diag_fisher = diag_fisher self._check_data()
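Taken together, the end state of this series exposes a single new estimator option. A call along the following lines should exercise it against a build of glum that includes these patches; the data shape, the Poisson family, and the penalty settings are illustrative assumptions, not anything prescribed by the diffs above.

import numpy as np
from glum import GeneralizedLinearRegressor

rng = np.random.default_rng(0)
n_rows, n_cols = 500, 20_000                 # deliberately wide design matrix
X = rng.standard_normal((n_rows, n_cols))
y = rng.poisson(1.0, size=n_rows).astype(float)

model = GeneralizedLinearRegressor(
    family="poisson",
    alpha=0.01,
    l1_ratio=1.0,             # L1 penalty, so the coordinate-descent (irls-cd) path is used
    use_sparse_hessian=True,  # new flag from this series (default True): keep the Hessian sparse
)
model.fit(X, y)
print(np.count_nonzero(model.coef_), "non-zero coefficients")

With use_sparse_hessian=False the solver falls back to the original dense num_cols x num_cols Hessian, which is the allocation that becomes prohibitive for very wide X.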