Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add use_sparse_hessian functionality to prevent a dense Hessian from being instantiated #543

Closed
wants to merge 9 commits into from
12 changes: 12 additions & 0 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,7 @@ def __init__(
A_ineq: Optional[np.ndarray] = None,
b_ineq: Optional[np.ndarray] = None,
force_all_finite: bool = True,
use_sparse_hessian: bool = True,
):
self.l1_ratio = l1_ratio
self.P1 = P1
Expand Down Expand Up @@ -701,6 +702,7 @@ def __init__(
self.A_ineq = A_ineq
self.b_ineq = b_ineq
self.force_all_finite = force_all_finite
self.use_sparse_hessian = use_sparse_hessian

@property
def family_instance(self) -> ExponentialDispersionModel:
Expand Down Expand Up @@ -969,13 +971,15 @@ def _solve(
lower_bounds=lower_bounds,
upper_bounds=upper_bounds,
verbose=self.verbose > 0,
use_sparse_hessian=self.use_sparse_hessian,
)
if self._solver == "irls-ls":
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_least_squares_solver, coef, irls_data
)
# 4.2 coordinate descent ##############################################
elif self._solver == "irls-cd":
# This is the case we're concerned with for wide problems.
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_cd_solver, coef, irls_data
)
Expand Down Expand Up @@ -2088,6 +2092,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
``A_ineq w <= b_ineq``. Refer to the documentation of ``A_ineq`` for
details.

use_sparse_hessian : boolean, optional, (default=True)
If ``True``, stores the current state of the Hessian as a sparse COO matrix.
If ``False``, stores as a dense matrix of size `num_cols` x `num_cols`, where
`num_cols` is the number of columns in the data X. Use ``True`` to avoid memory
issues when working with extremely wide data.

Attributes
----------
coef_ : numpy.array, shape (n_features,)
Expand Down Expand Up @@ -2168,6 +2178,7 @@ def __init__(
A_ineq: Optional[np.ndarray] = None,
b_ineq: Optional[np.ndarray] = None,
force_all_finite: bool = True,
use_sparse_hessian: bool = True,
):
self.alphas = alphas
self.alpha = alpha
Expand Down Expand Up @@ -2200,6 +2211,7 @@ def __init__(
A_ineq=A_ineq,
b_ineq=b_ineq,
force_all_finite=force_all_finite,
use_sparse_hessian=use_sparse_hessian,
)

def _validate_hyperparameters(self) -> None:
Expand Down
68 changes: 58 additions & 10 deletions src/glum/_solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def update_hessian(state, data, active_set):
# 2) use all the rows
# 3) Include the P2 components
# 4) just like an update, we only update the active_set

hessian_init = build_hessian_delta(
data.X,
state.hessian_rows,
Expand All @@ -151,9 +152,25 @@ def update_hessian(state, data, active_set):
np.arange(data.X.shape[0], dtype=np.int32),
active_set,
)
state.hessian[np.ix_(active_set, active_set)] = hessian_init

# In the sparse Hessian case, state.hessian is a coo_matrix.
# hessian_init is np array of size active_set.shape[0] x active_set.shape[0];
# we can convert this to a coo_matrix and simply add it to the state.hessian
if data.use_sparse_hessian:
coo_data = hessian_init.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

state.hessian = sparse.coo_matrix(
(coo_data, (coo_rows, coo_cols)),
shape=(state.coef.shape[0], state.coef.shape[0]),
)
else:
state.hessian[np.ix_(active_set, active_set)] = hessian_init

state.hessian_initialized = True
n_active_rows = data.X.shape[0]

else:

# In an update iteration, we want to:
Expand All @@ -162,6 +179,7 @@ def update_hessian(state, data, active_set):
# 3) Ignore the P2 components because those don't change and have
# already been added
# 4) only update the active set subset of the hessian.

hessian_rows_diff, active_rows = identify_active_rows(
state.gradient_rows,
state.hessian_rows,
Expand All @@ -176,13 +194,32 @@ def update_hessian(state, data, active_set):
active_rows=active_rows,
active_cols=active_set,
)
state.hessian[np.ix_(active_set, active_set)] += hessian_delta

if data.use_sparse_hessian:
coo_data = hessian_delta.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

state.hessian += sparse.coo_matrix(
(coo_data, (coo_rows, coo_cols)),
shape=(state.coef.shape[0], state.coef.shape[0]),
)
else:
state.hessian[np.ix_(active_set, active_set)] += hessian_delta

n_active_rows = active_rows.shape[0]

return (
state.hessian[np.ix_(active_set, active_set)],
n_active_rows,
)
# If state.hessian is in COO form, we have to convert to CSC in order to index the
# active set elements, which we then convert to a numpy array for compatibility with
# the rest of the code. This numpy array is dense, but we care about problems in
# which the active set is usually much smaller than the number of data columns.
if data.use_sparse_hessian:
return (
state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(),
n_active_rows,
)
else:
return state.hessian[np.ix_(active_set, active_set)], n_active_rows


def _is_subset(x, y):
Expand Down Expand Up @@ -271,7 +308,7 @@ def _irls_solver(inner_solver, coef, data) -> Tuple[np.ndarray, int, int, List[L
----------
inner_solver
A least squares solver that can handle the appropriate penalties. With
an L1 penalty, this will _cd_solver. With only an L2 penalty,
an L1 penalty, this will be _cd_solver. With only an L2 penalty,
_least_squares_solver will be more efficient.
coef : ndarray, shape (c,)
If fit_intercept=False, shape c=X.shape[1].
Expand Down Expand Up @@ -435,6 +472,7 @@ def __init__(
lower_bounds: Optional[np.ndarray] = None,
upper_bounds: Optional[np.ndarray] = None,
verbose: bool = False,
use_sparse_hessian: bool = True,
):
self.X = X
self.y = y
Expand Down Expand Up @@ -466,6 +504,7 @@ def __init__(

self.intercept_offset = 1 if self.fit_intercept else 0
self.verbose = verbose
self.use_sparse_hessian = use_sparse_hessian

self._check_data()

Expand Down Expand Up @@ -528,9 +567,18 @@ def __init__(self, coef, data):
self.old_hessian_rows = np.zeros(data.X.shape[0], dtype=data.X.dtype)
self.gradient_rows = None
self.hessian_rows = None
self.hessian = np.zeros(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)

# In order to avoid memory issues for very wide datasets (e.g. Issue #485), we
# can instantiate the Hessian as a sparse COO matrix.
if data.use_sparse_hessian:
self.hessian = sparse.coo_matrix(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)
else:
self.hessian = np.zeros(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)

self.hessian_initialized = False
self.coef_P2 = None
self.norm_min_subgrad = None
Expand Down