Get rid of backtracking #1

Open
wants to merge 3 commits into base: master
Changes from all commits
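The diffs below replace the per-batch Armijo (backtracking) line search in both optimizers with a fixed step controlled by a new learning_rate argument. A minimal standalone sketch of the two step rules on a toy quadratic loss; all names here are illustrative stand-ins, not the repository's helpers:

```python
import numpy as np

# Toy stand-ins for the loss and gradient; the real code works on TT tensors.
loss = lambda w: 0.5 * np.dot(w, w)
grad = lambda w: w

w = np.ones(5)
g = grad(w)

# Before this PR: backtracking picks the step size per batch via the
# Armijo sufficient-decrease condition f(w - s*g) <= f(w) - rho*s*||g||^2.
step, beta, rho = 1.0, 0.5, 0.1
while step > 1e-10:
    new_w = w - step * g
    if loss(new_w) <= loss(w) - rho * step * np.dot(g, g):
        break
    step *= beta
w_backtracking = new_w

# After this PR: one fixed-step update with a user-supplied learning rate.
learning_rate = 0.1
w_fixed = w - learning_rate * g
```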
342 changes: 295 additions & 47 deletions experiments/Riemannian_vs_baseline_car.ipynb

Large diffs are not rendered by default.

2,289 changes: 2,289 additions & 0 deletions experiments/Riemannian_vs_baseline_hiv.ipynb

Large diffs are not rendered by default.

Binary file modified experiments/data/riemannian_vs_baseline_car.pickle
Binary file not shown.
17 changes: 8 additions & 9 deletions src/TTRegression.py
@@ -45,7 +45,7 @@ class TTRegression(BaseEstimator, LinearClassifierMixin):
Contains all the logged details (e.g. loss on each iteration).
"""

- def __init__(self, tt_model, loss_name, rank,
+ def __init__(self, tt_model, loss_name, rank, learning_rate,
solver='riemannian-sgd', batch_size=-1, fit_intercept=True,
reg=0., exp_reg=1.0, dropout=None, max_iter=100, verbose=0,
persuit_init=False, coef0=None, intercept0=None):
@@ -55,6 +55,7 @@ def __init__(self, tt_model, loss_name, rank,
self.tt_model = tt_model
self.loss_name = loss_name
self.rank = rank
+ self.learning_rate = learning_rate
self.solver = solver
self.batch_size = batch_size
self.fit_intercept = fit_intercept
@@ -226,31 +227,29 @@ def fit_log_val(self, X_, y_, val_X_=None, val_y_=None):
if self.solver == 'riemannian-sgd':
from optimizers.riemannian_sgd import riemannian_sgd
w, b = riemannian_sgd(X, y, self.tt_dot, self.loss, self.loss_grad,
- self.project, w0=self.coef_,
- intercept0=self.intercept_,
+ self.project, self.learning_rate,
+ w0=self.coef_, intercept0=self.intercept_,
fit_intercept=self.fit_intercept,
val_x=val_X, val_y=val_y,
reg=self.reg, exp_reg=self.exp_reg,
dropout=self.dropout,
batch_size=self.batch_size,
num_passes=self.max_iter,
- logger=self.logger, verbose_period=1,
- beta=0.5, rho=0.1)
+ logger=self.logger, verbose_period=1)
self.coef_, self.intercept_ = w, b
elif self.solver == 'sgd':
if self.dropout is not None:
print('WARNING: dropout for "sgd" solver is not supported.')

from optimizers.core_sgd import core_sgd
w, b = core_sgd(X, y, self.tt_dot, self.loss, self.loss_grad,
- self.gradient_wrt_cores, w0=self.coef_,
- intercept0=self.intercept_,
+ self.gradient_wrt_cores, self.learning_rate,
+ w0=self.coef_, intercept0=self.intercept_,
fit_intercept=self.fit_intercept,
val_x=val_X, val_y=val_y, reg=self.reg,
batch_size=self.batch_size,
num_passes=self.max_iter,
- logger=self.logger, verbose_period=1,
- beta=0.5, rho=0.1)
+ logger=self.logger, verbose_period=1)
self.coef_, self.intercept_ = w, b
else:
raise ValueError("Only 'riemannian-sgd' and 'sgd' solvers are supported.")
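With the change above, learning_rate becomes a required positional argument of TTRegression (it has no default in the new signature), so existing construction code needs to pass it explicitly. A hypothetical usage sketch with made-up argument values, assuming only the signature shown in the diff:

```python
from TTRegression import TTRegression

# All values below are illustrative; only the argument names and their order
# come from the diff above.
model = TTRegression(tt_model='all-subsets', loss_name='logistic', rank=4,
                     learning_rate=1e-3, solver='riemannian-sgd',
                     batch_size=128, max_iter=50, verbose=1)
# model.fit(X_train, y_train)  # training now uses the fixed step size
```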
30 changes: 5 additions & 25 deletions src/optimizers/core_sgd.py
@@ -4,13 +4,14 @@


# TODO: add Adam scheme support.
+ # TODO: support fit_intercept
+ # TODO: debug?
def core_sgd(train_x, train_y, vectorized_tt_dot_h, loss_h,
- loss_grad_h, grad_wrt_cores_h, w0, intercept0=0,
+ loss_grad_h, grad_wrt_cores_h, learning_rate, w0, intercept0=0,
fit_intercept=True, val_x=None, val_y=None, reg=0,
batch_size=-1, num_passes=30, seed=None,
logger=None, verbose_period=1,
- debug=False,
- beta=0.5, rho=0.1):
+ debug=False):
"""SGD w.r.t. TT-cores optimization for a linear model with weights in TT.

The objective function is
@@ -54,34 +55,13 @@ def core_sgd(train_x, train_y, vectorized_tt_dot_h, loss_h,
batch_y = train_y[curr_idx]
batch_w_x = vectorized_tt_dot_h(w, train_x[curr_idx, :])
batch_linear_o = batch_w_x + b
- batch_loss_arr = loss_h(batch_linear_o, batch_y)
- wcore_wcore = w.core.dot(w.core)
- batch_loss = np.sum(batch_loss_arr) + reg * wcore_wcore / 2.0
batch_grad_coef = loss_grad_h(batch_linear_o, batch_y)
w_cores = tt.tensor.to_list(w)
gradient = grad_wrt_cores_h(w_cores, train_x[curr_idx, :], batch_grad_coef)
gradient += reg * w.core

- # Armiho step choosing.
- step_prev_w = step_w
- gradient_norm = np.linalg.norm(gradient)
- while step_w > 1e-10:
- new_w = w.copy()
- new_w.core += -step_w * gradient
- new_w_x = vectorized_tt_dot_h(new_w, train_x[curr_idx, :])
+ w.core += -learning_rate * gradient

- if fit_intercept:
- b_objective = lambda b: np.sum(loss_h(new_w_x + b, batch_y))
- m = minimize_scalar(b_objective)
- b = m.x
- new_loss = m.fun
- else:
- new_loss = np.sum(loss_h(new_w_x + b, batch_y))
- new_loss += reg * new_w.core.dot(new_w.core) / 2.0
- if new_loss <= batch_loss - rho * step_w * gradient_norm**2:
- break
- step_w *= beta
- w = new_w

if (logger is not None) and e % verbose_period == 0:
logger.after_each_iter(e, train_x, train_y, w, lambda w, x: vectorized_tt_dot_h(w, x) + b, stage='train')
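After this change the inner loop of core_sgd no longer evaluates the batch loss or refits the intercept while searching for a step; it reduces to a plain gradient step on the TT-core vector, which is presumably why the "# TODO: support fit_intercept" note was added above. A standalone NumPy sketch of the simplified update (the real gradient comes from grad_wrt_cores_h plus the L2 term, as in the diff):

```python
import numpy as np

def core_sgd_step(core, gradient, learning_rate, reg=0.0):
    """One fixed-step SGD update on the flattened TT-cores (stand-in for w.core)."""
    gradient = gradient + reg * core          # L2 regularization on the core vector
    return core - learning_rate * gradient    # no Armijo loop, no loss evaluation

# Illustrative call with random data.
core = np.random.randn(100)
gradient = np.random.randn(100)
core = core_sgd_step(core, gradient, learning_rate=1e-2, reg=1e-4)
```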
52 changes: 3 additions & 49 deletions src/optimizers/riemannian_sgd.py
@@ -70,7 +70,7 @@ def build_reg_tens(n, exp_reg):


def riemannian_sgd(train_x, train_y, vectorized_tt_dot_h, loss_h,
- loss_grad_h, project_h, w0, intercept0=0,
+ loss_grad_h, project_h, learning_rate, w0, intercept0=0,
fit_intercept=True, val_x=None, val_y=None,
reg=0., exp_reg=1., dropout=None, batch_size=-1,
num_passes=30, seed=None, logger=None, verbose_period=1,
@@ -123,59 +123,13 @@ def riemannian_sgd(train_x, train_y, vectorized_tt_dot_h, loss_h,
batch_y = train_y[curr_idx]
batch_w_x = vectorized_tt_dot_h(w, curr_batch)
batch_linear_o = batch_w_x + b
- batch_loss_arr = loss_h(batch_linear_o, batch_y)
wreg = w * reg_tens
wregreg = w * reg_tens * reg_tens
- wreg_wreg = wreg.norm()**2
- batch_loss = np.sum(batch_loss_arr) + reg * wreg_wreg / 2.0
batch_grad_coef = loss_grad_h(batch_linear_o, batch_y)
batch_gradient_b = np.sum(batch_grad_coef)
direction = project_h(w, curr_batch, batch_grad_coef, reg=0)
direction = riemannian.project(w, [direction, reg * wregreg])
- batch_dir_x = vectorized_tt_dot_h(direction, curr_batch)

- dir_dir = direction.norm()**2
- wreg_dir = tt.dot(wreg, direction)
- if fit_intercept:
- # TODO: Use classical Newton-Raphson (with hessian).
- step_objective = lambda s: _regularized_loss_step(s, loss_h, batch_y, batch_w_x, batch_dir_x, b, batch_gradient_b, reg, wreg_wreg, wreg_dir, dir_dir)
- step_gradient = lambda s: _regularized_loss_step_grad(s, loss_grad_h, batch_y, batch_w_x, batch_dir_x, b, batch_gradient_b, reg, wreg_dir, dir_dir)
- step0_w, step0_b = fmin_bfgs(step_objective, np.ones(2), fprime=step_gradient, gtol=1e-10, disp=logger.disp())
- else:
- def w_step_objective(w_step):
- steps = np.array([w_step, 0])
- obj = _regularized_loss_step(steps, loss_h, batch_y,
- batch_w_x, batch_dir_x, b,
- batch_gradient_b, reg, wreg_wreg,
- wreg_dir, dir_dir)
- return obj
- step0_w = minimize_scalar(w_step_objective).x


- # TODO: consider using Probabilistic Line Searches for Stochastic Optimization.
- # Armiho step choosing.
- step_w = step0_w
- # <gradient, direction> =
- # = <(\sum_i coef[i] * x_i + reg * w), direction> =
- # = \sum_i coef[i] <x_i, direction> + reg * <w, direction>
- grad_times_direction = batch_dir_x.dot(batch_grad_coef) + reg * wreg_dir
- while step_w > 1e-10:
- new_w = (w - step_w * direction).round(eps=0, rmax=max(w.r))
- new_w_x = vectorized_tt_dot_h(new_w, curr_batch)

- if fit_intercept:
- b_objective = lambda b: np.sum(loss_h(new_w_x + b, batch_y))
- m = minimize_scalar(b_objective)
- b = m.x
- new_loss = m.fun
- else:
- new_loss = np.sum(loss_h(new_w_x + b, batch_y))
- new_wreg = new_w * reg_tens
- new_loss += reg * new_wreg.norm()**2 / 2.0
- if new_loss <= batch_loss - rho * step_w * grad_times_direction:
- break
- step_w *= beta
- w = new_w

+ w = (w - learning_rate * direction).round(eps=0, rmax=max(w.r))

if (logger is not None) and e % verbose_period == 0:
logger.after_each_iter(e, train_x, train_y, w, lambda w, x: vectorized_tt_dot_h(w, x) + b, stage='train')
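The Riemannian solver gets the same simplification: the per-batch BFGS/scalar step search and the Armijo loop are removed, leaving an update that projects the gradient, takes a fixed step, and retracts by TT-rounding. A sketch of the new step rule written as a standalone function; project_h, the tangent-space projector riemannian_project, reg_tens, and the TT object w are assumed to be set up exactly as in the source file, so this is an illustration rather than a drop-in replacement:

```python
def riemannian_sgd_step(w, project_h, riemannian_project, curr_batch,
                        batch_grad_coef, reg, reg_tens, learning_rate):
    """One fixed-step Riemannian SGD update (illustrative, mirrors the diff above)."""
    # Project the data term of the loss gradient onto the tangent space at w.
    direction = project_h(w, curr_batch, batch_grad_coef, reg=0)
    # Add the element-wise weighted regularization component and re-project.
    wregreg = w * reg_tens * reg_tens
    direction = riemannian_project(w, [direction, reg * wregreg])
    # Fixed-step move, then retract to the low-rank manifold via TT-rounding
    # with the ranks capped at their previous maximum.
    return (w - learning_rate * direction).round(eps=0, rmax=max(w.r))
```

The trade-off is the usual one: a constant step removes the extra loss evaluations per batch, but convergence now depends on choosing learning_rate well, whereas the removed backtracking adapted the step automatically at the cost of several forward passes per batch.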