diff --git a/doc/modules/api.rst b/doc/modules/api.rst
index 0d939ffee..85926f4d4 100644
--- a/doc/modules/api.rst
+++ b/doc/modules/api.rst
@@ -235,6 +235,7 @@ Batch solvers
    optim.solver.AGD
    optim.solver.BFGS
    optim.solver.GFB
+   optim.solver.SCPG

 Stochastic solvers
 ------------------
@@ -245,9 +246,10 @@ Stochastic solvers
    :template: class.rst

    optim.solver.SGD
+   optim.solver.AdaGrad
    optim.solver.SVRG
+   optim.solver.SAGA
    optim.solver.SDCA
-   optim.solver.AdaGrad

 History
 -------
diff --git a/doc/modules/optim.rst b/doc/modules/optim.rst
index b36ab12f8..b09290708 100644
--- a/doc/modules/optim.rst
+++ b/doc/modules/optim.rst
@@ -487,7 +487,7 @@
 and :math:`+\infty` otherwise). Note that depending on the problem,
 :math:`g` might actually use only a subset of entries of :math:`w`.
 For instance, for generalized linear models, :math:`w` contains the model weights and
-an intercept, which is not penalized, see :ref:`generalized linear models `.
+an intercept, which is not penalized, see :ref:`generalized linear models <linear-models>`.
 Indeed, in all ``prox`` classes, an optional ``range`` parameter is available,
 to apply the regularization only to a subset of entries of :math:`w`.
@@ -703,9 +703,11 @@
 Proximal gradient descent                               :class:`GD <tick.optim.solver.GD>`
 Broyden, Fletcher, Goldfarb, and Shanno (quasi-Newton)  :class:`BFGS <tick.optim.solver.BFGS>`
 Self-Concordant Proximal Gradient Descent               :class:`SCPG <tick.optim.solver.SCPG>`
+Generalized Forward-Backward                            :class:`GFB <tick.optim.solver.GFB>`
 Stochastic Gradient Descent                             :class:`SGD <tick.optim.solver.SGD>`
 Adaptive Gradient Descent solver                        :class:`AdaGrad <tick.optim.solver.AdaGrad>`
 Stochastic Variance Reduced Descent                     :class:`SVRG <tick.optim.solver.SVRG>`
+Stochastic Average Gradient Descent                     :class:`SAGA <tick.optim.solver.SAGA>`
 Stochastic Dual Coordinate Ascent                       :class:`SDCA <tick.optim.solver.SDCA>`
 ======================================================= ========================================
diff --git a/tick/inference/hawkes_em.py b/tick/inference/hawkes_em.py
index 1dedfafac..46b9e4ada 100644
--- a/tick/inference/hawkes_em.py
+++ b/tick/inference/hawkes_em.py
@@ -276,7 +276,7 @@ def score(self, events=None, end_times=None, baseline=None, kernel=None):
             Baseline vector for which the score is measured
             If `None` baseline obtained during fitting is used

-        kernel : `None` or `np.ndarray', shape=(n_nodes, n_nodes, kernel_size), default=None
+        kernel : `None` or `np.ndarray`, shape=(n_nodes, n_nodes, kernel_size), default=None
             Used to force start values for kernel parameter
             If `None` kernel obtained during fitting is used
diff --git a/tick/optim/solver/adagrad.py b/tick/optim/solver/adagrad.py
index d92f7b758..7f125fe2b 100644
--- a/tick/optim/solver/adagrad.py
+++ b/tick/optim/solver/adagrad.py
@@ -7,66 +7,125 @@
 class AdaGrad(SolverFirstOrderSto):
-    """
-    Adaptive Gradient Descent solver
-
-    Based on the works by:
-
-    Duchi, J., Hazan, E., & Singer, Y. (2011).
-    Adaptive Subgradient Methods for Online Learning and
-    Stochastic Optimization. Journal of Machine Learning Research.
+    """Adaptive stochastic gradient descent solver
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable and separable, namely
+
+    .. math::
+        g(w) = \\sum_{j=1}^d g_j(w_j)
+
+    where :math:`g_j` are prox-capable scalar functions of a single coordinate
+    :math:`w_j` of the vector of weights :math:`w \\in \\mathbb R^d`.
+    Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method). The given prox must be, as
+    explained above, separable.
+    One iteration of :class:`AdaGrad <tick.optim.solver.AdaGrad>` corresponds to
+    the following iteration applied ``epoch_size`` times:
+
+    .. math::
+        \\begin{align*}
+        &\\text{for } j=1, \\ldots, d \\; \\text{ do the following:} \\\\
+        & \\quad g_j \\gets ( \\nabla f_i(w) )_j \\\\
+        & \\quad d_j \\gets d_j + g_j^2 \\\\
+        & \\quad w_j \\gets w_j - \\frac{\\eta}{\\sqrt{d_j + 10^{-6}}} \\; g_j \\\\
+        & \\quad w_j \\gets \\mathrm{prox}_{\\eta_j g_j}(w_j)
+        \\end{align*}
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration, where :math:`\\eta_j = \\eta / \\sqrt{d_j + 10^{-6}}` is the
+    coordinate-wise step and :math:`\\eta > 0` can be tuned with ``step``.
+    The random number generator used to sample :math:`i` can be seeded with
+    ``seed``.
+    The iterations stop whenever tolerance ``tol`` is achieved, or after
+    ``max_iter`` epochs (namely ``max_iter`` :math:`\\times` ``epoch_size``
+    iterations).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver.

     Parameters
     ----------
-    step : `float` default=0.01
-        Step-size of the algorithm
+    step : `float`, default=1e-2
+        Step-size parameter, the most important parameter of the solver.
+        A try-and-improve approach should be used.
+
+    tol : `float`, default=1e-10
+        The tolerance of the solver (iterations stop when the stopping
+        criterion is below it)

-    epoch_size : `int`
-        Epoch size
+    max_iter : `int`, default=100
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

-    rand_type : `str`
-        Type of random sampling
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data

-        * if ``"unif"`` samples are uniformly drawn among all possibilities
-        * if ``"perm"`` a random permutation of all possibilities is
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
          generated and samples are sequentially taken from it. Once all of
          them have been taken, a new random permutation is generated

-    tol : `float`, default=0
-        The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
-
-    max_iter : `int`
-        Maximum number of iterations of the solver
-
     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    print_every : `int`, default = 10
+    print_every : `int`, default=10
         Print history information every time the iteration number is a
-        multiple of ``print_every``
+        multiple of ``print_every``. Used only if ``verbose`` is True

-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

-    seed : `int`
+    seed : `int`, default=-1
         The seed of the random sampling.
        If it is negative then a random seed
        (different at each run) will be chosen.

+    epoch_size : `int`, default given by model
+        Epoch size, namely the number of iterations performed in one epoch.
+        By default, this is automatically tuned using information from the
+        model object passed through ``set_model``.
+
     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * J. Duchi, E. Hazan, Y. Singer, Adaptive Subgradient Methods for Online
+      Learning and Stochastic Optimization, *Journal of Machine Learning
+      Research* (2011)
     """

-    def __init__(self, step: float = 0.01, epoch_size: int = None,
-                 rand_type: str = "unif", tol: float = 0.,
+    def __init__(self, step: float = 1e-2, epoch_size: int = None,
+                 rand_type: str = 'unif', tol: float = 1e-10,
                  max_iter: int = 100, verbose: bool = True,
                  print_every: int = 10, record_every: int = 1,
                  seed: int = -1):
diff --git a/tick/optim/solver/agd.py b/tick/optim/solver/agd.py
index 1a7f977b5..3e9c108e9 100644
--- a/tick/optim/solver/agd.py
+++ b/tick/optim/solver/agd.py
@@ -8,53 +8,85 @@
 class AGD(SolverFirstOrder):
-    """
-    AGD (accelerated proximal gradient descent) algorithm.
+    """Accelerated proximal gradient descent
+
+    For the minimization of objectives of the form
+
+    .. math::
+        f(w) + g(w),
+
+    where :math:`f` has a smooth gradient and :math:`g` is prox-capable.
+    Function :math:`f` corresponds to the ``model.loss`` method of the model
+    (passed with ``set_model`` to the solver) and :math:`g` corresponds to
+    the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
+    One iteration of :class:`AGD <tick.optim.solver.AGD>` is as follows:
+
+    .. math::
+        w^{k} &\\gets \\mathrm{prox}_{\\eta g} \\big(z^k - \\eta \\nabla f(z^k)
+        \\big) \\\\
+        t_{k+1} &\\gets \\frac{1 + \\sqrt{1 + 4 t_k^2}}{2} \\\\
+        z^{k+1} &\\gets w^k + \\frac{t_k - 1}{t_{k+1}} (w^k - w^{k-1})
+
+    where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
+    ``model.grad`` method and :math:`\\mathrm{prox}_{\\eta g}` is given by the
+    ``prox.call`` method. The step-size :math:`\\eta` can be tuned with
+    ``step``. The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver.

     Parameters
     ----------
-    step : `float` default=None
-        Step-size of the algorithm. If ``linesearch=True``, this is the
-        first step-size to be used in the linesearch
-        (typically taken too large). Otherwise, it's the constant step
-        to be used along iterations.
-
-    tol : `float`, default=0.
+    step : `float`, default=None
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be automatically tuned as
+        ``step = 1 / model.get_lip_best()``.
+        If ``linesearch=True``, this is the first step-size to be used in
+        the linesearch (which is typically taken too large).
+
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

     max_iter : `int`, default=100
-        Maximum number of iterations of the solver
+        Maximum number of iterations of the solver.

     linesearch : `bool`, default=True
-        Use backtracking linesearch
-
-    linesearch_step_increase : `float`, default=2.
-        Factor of step increase when using linesearch
-
-    linesearch_step_decrease : `float`, default=0.5
-        Factor of step decrease when using linesearch
+        If `True`, use backtracking linesearch to tune the step automatically.

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

     print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

     record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+        Save history information every time the iteration number is a
+        multiple of ``record_every``
+
+    linesearch_step_increase : `float`, default=2.
+        Factor of step increase when using linesearch
+
+    linesearch_step_decrease : `float`, default=0.5
+        Factor of step decrease when using linesearch

     Attributes
     ----------
     model : `Model`
-        The model to solve
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method

     time_start : `str`
         Start date of the call to ``solve()``
@@ -64,9 +96,15 @@ class AGD(SolverFirstOrder):
     time_end : `str`
         End date of the call to ``solve()``
+
+    References
+    ----------
+    * A. Beck and M. Teboulle, A fast iterative shrinkage-thresholding
+      algorithm for linear inverse problems,
+      *SIAM Journal on Imaging Sciences*, 2009
     """

-    def __init__(self, step: float = None, tol: float = 0.,
+    def __init__(self, step: float = None, tol: float = 1e-10,
                  max_iter: int = 100, linesearch: bool = True,
                  linesearch_step_increase: float = 2.,
                  linesearch_step_decrease: float = 0.5,
diff --git a/tick/optim/solver/base/first_order.py b/tick/optim/solver/base/first_order.py
index 6e164a91c..c6faccad1 100644
--- a/tick/optim/solver/base/first_order.py
+++ b/tick/optim/solver/base/first_order.py
@@ -166,7 +166,7 @@ def _initialize_values(self, x0: np.ndarray = None, step: float = None,
         return tuple(result)

     def set_prox(self, prox: Prox):
-        """Set proximal operator in the solver.
+        """Set proximal operator in the solver

         Parameters
         ----------
@@ -181,7 +181,7 @@ def set_prox(self, prox: Prox):
         Notes
         -----
         In some solvers, ``set_model`` must be called before
-        ``set_prox``, otherwise and error might be raised.
+        ``set_prox``, otherwise an error might be raised
         """
         if not isinstance(prox, Prox):
             raise ValueError('Passed object of class %s is not a '
@@ -198,12 +198,12 @@ def _as_dict(self):
         return dd

     def objective(self, coeffs, loss: float=None):
-        """Compute the objective minimized by the solver at ``coeffs``
+        """Compute the objective function

         Parameters
         ----------
-        coeffs : `numpy.ndarray`, shape=(n_coeffs,)
-            The objective is computed at this point
+        coeffs : `np.array`, shape=(n_coeffs,)
+            Point where the objective is computed

         loss : `float`, default=`None`
             Gives the value of the loss if already known (allows to
@@ -234,12 +234,13 @@ def solve(self, x0=None, step=None):
             Starting point of the solver

         step : `float`, default=`None`
-            Step-size or learning rate for the solver
+            Step-size or learning rate for the solver. This can also be tuned
+            using the ``step`` attribute

         Returns
         -------
         output : `np.array`, shape=(n_coeffs,)
-            Obtained minimizer for the problem
+            Obtained minimizer for the problem, same as the ``solution``
+            attribute
         """
         if self.model is None:
             raise ValueError('You must first set the model using '
diff --git a/tick/optim/solver/base/first_order_sto.py b/tick/optim/solver/base/first_order_sto.py
index 9841343fe..263f6bab5 100644
--- a/tick/optim/solver/base/first_order_sto.py
+++ b/tick/optim/solver/base/first_order_sto.py
@@ -116,19 +116,23 @@ def set_model(self, model: Model):
         return self

     def set_prox(self, prox: Prox):
-        """Set prox in the solver
+        """Set proximal operator in the solver

         Parameters
         ----------
         prox : `Prox`
-            Sets the prox in the solver
+            The proximal operator of the penalization function

         Returns
         -------
         output : `Solver`
-            The `Solver` with given prox
-        """
+            The solver with given prox
+
+        Notes
+        -----
+        In some solvers, ``set_model`` must be called before
+        ``set_prox``, otherwise an error might be raised
+        """
         SolverFirstOrder.set_prox(self, prox)
         SolverSto.set_prox(self, prox)
         return self
diff --git a/tick/optim/solver/bfgs.py b/tick/optim/solver/bfgs.py
index bd5ec2042..5aa776fae 100644
--- a/tick/optim/solver/bfgs.py
+++ b/tick/optim/solver/bfgs.py
@@ -10,40 +10,63 @@
 class BFGS(SolverFirstOrder):
-    """
-    BFGS (Broyden, Fletcher, Goldfarb, and Shanno ) algorithm.
-
-    This is a simple wrapping of `scipy.optimize.fmin_bfgs`
+    """Broyden, Fletcher, Goldfarb, and Shanno algorithm
+
+    This solver is actually a simple wrapping of `scipy.optimize.fmin_bfgs`,
+    the BFGS (Broyden, Fletcher, Goldfarb, and Shanno) algorithm. This is a
+    quasi-Newton algorithm that iteratively builds approximations of the
+    inverse Hessian. This solver can be used to minimize objectives of the form
+
+    .. math::
+        f(w) + g(w),
+
+    for :math:`f` with a smooth gradient and only :math:`g` corresponding to
+    the zero penalization (namely :class:`ProxZero <tick.optim.prox.ProxZero>`)
+    or ridge penalization (namely :class:`ProxL2sq <tick.optim.prox.ProxL2sq>`).
+    Function :math:`f` corresponds to the ``model.loss`` method of the model
+    (passed with ``set_model`` to the solver) and :math:`g` corresponds to
+    the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
+    The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver.

     Parameters
     ----------
-    tol : `float`, default=0.
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

-    max_iter : `int`, default=100
+    max_iter : `int`, default=10
         Maximum number of iterations of the solver

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
+    print_every : `int`, default=1
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

     record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     Attributes
     ----------
     model : `Model`
-        The model to solve
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method

     time_start : `str`
         Start date of the call to ``solve()``
@@ -56,9 +79,8 @@ class BFGS(SolverFirstOrder):
     References
     ----------
-    Quasi-Newton method of Broyden, Fletcher, Goldfarb,
-    and Shanno (BFGS), see
-    Wright, and Nocedal 'Numerical Optimization', 1999, pg. 198.
+    * Quasi-Newton method of Broyden, Fletcher, Goldfarb and Shanno (BFGS),
+      see Wright and Nocedal, 'Numerical Optimization', 1999, pg. 198.
     """

     _attrinfos = {
        }
    }

-    def __init__(self, tol: float = 0.,
-                 max_iter: int = 100, verbose: bool = True,
-                 print_every: int = 10, record_every: int = 1):
+    def __init__(self, tol: float = 1e-10,
+                 max_iter: int = 10, verbose: bool = True,
+                 print_every: int = 1, record_every: int = 1):
         SolverFirstOrder.__init__(self, step=None, tol=tol,
                                   max_iter=max_iter, verbose=verbose,
                                   print_every=print_every,
diff --git a/tick/optim/solver/gd.py b/tick/optim/solver/gd.py
index 0dc36b88c..ec41753a4 100644
--- a/tick/optim/solver/gd.py
+++ b/tick/optim/solver/gd.py
@@ -8,53 +8,82 @@
 class GD(SolverFirstOrder):
-    """
-    GD (proximal gradient descent) algorithm.
+    """Proximal gradient descent
+
+    For the minimization of objectives of the form
+
+    .. math::
+        f(w) + g(w),
+
+    where :math:`f` has a smooth gradient and :math:`g` is prox-capable.
+    Function :math:`f` corresponds to the ``model.loss`` method of the model
+    (passed with ``set_model`` to the solver) and :math:`g` corresponds to
+    the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
+    One iteration of :class:`GD <tick.optim.solver.GD>` is as follows:
+
+    .. math::
+        w \\gets \\mathrm{prox}_{\\eta g} \\big(w - \\eta \\nabla f(w) \\big),
+
+    where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
+    ``model.grad`` method and :math:`\\mathrm{prox}_{\\eta g}` is given by the
+    ``prox.call`` method. The step-size :math:`\\eta` can be tuned with
+    ``step``. The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver.
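+
+    A minimal usage sketch is given below (``ModelLinReg``, its ``fit``
+    method and the exact constructor signatures are assumptions made for
+    illustration and may differ across versions; any model exposing ``loss``
+    and ``grad`` can be plugged in the same way):
+
+    >>> import numpy as np
+    >>> from tick.optim.model import ModelLinReg
+    >>> from tick.optim.prox import ProxL2sq
+    >>> from tick.optim.solver import GD
+    >>> X = np.random.randn(200, 5)
+    >>> y = X.dot(np.ones(5)) + np.random.randn(200)
+    >>> model = ModelLinReg(fit_intercept=False).fit(X, y)
+    >>> prox = ProxL2sq(strength=1e-2)
+    >>> gd = GD(max_iter=100, verbose=False)
+    >>> gd = gd.set_model(model).set_prox(prox)
+    >>> coeffs = gd.solve()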

     Parameters
     ----------
-    step : `float` default=None
-        Step-size of the algorithm. If ``linesearch=True``, this is the
-        first step-size to be used in the linesearch
-        (typically taken too large). Otherwise, it's the constant step
-        to be used along iterations.
-
-    tol : `float`, default=0.
+    step : `float`, default=None
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be automatically tuned as
+        ``step = 1 / model.get_lip_best()``. If ``linesearch=True``, this is
+        the first step-size to be used in the linesearch (which is typically
+        taken too large).
+
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

     max_iter : `int`, default=100
-        Maximum number of iterations of the solver
+        Maximum number of iterations of the solver.

     linesearch : `bool`, default=True
-        Use backtracking linesearch
-
-    linesearch_step_increase : `float`, default=2.
-        Factor of step increase when using linesearch
-
-    linesearch_step_decrease : `float`, default=0.5
-        Factor of step decrease when using linesearch
+        If `True`, use backtracking linesearch to tune the step automatically.

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

     print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

     record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+        Save history information every time the iteration number is a
+        multiple of ``record_every``
+
+    linesearch_step_increase : `float`, default=2.
+        Factor of step increase when using linesearch
+
+    linesearch_step_decrease : `float`, default=0.5
+        Factor of step decrease when using linesearch

     Attributes
     ----------
     model : `Model`
-        The model to solve
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method

     time_start : `str`
         Start date of the call to ``solve()``
@@ -64,6 +93,12 @@ class GD(SolverFirstOrder):
     time_end : `str`
         End date of the call to ``solve()``
+
+    References
+    ----------
+    * A. Beck and M. Teboulle, A fast iterative shrinkage-thresholding
+      algorithm for linear inverse problems,
+      *SIAM Journal on Imaging Sciences*, 2009
     """

-    def __init__(self, step: float = None, tol: float = 0.,
+    def __init__(self, step: float = None, tol: float = 1e-10,
diff --git a/tick/optim/solver/gfb.py b/tick/optim/solver/gfb.py
index cc6fcdd03..a874a228e 100644
--- a/tick/optim/solver/gfb.py
+++ b/tick/optim/solver/gfb.py
@@ -12,7 +12,7 @@ class CompositeProx(Prox):

     Parameters
     ----------
-    prox_list : List[Prox]
+    prox_list : `list` of `Prox`
         List of prox that are wrapped

     Attributes
@@ -61,46 +61,99 @@ def value(self, coeffs: np.ndarray):

 class GFB(SolverFirstOrder):
     """Generalized Forward-Backward algorithm
-
-    Minimize the objective
-    .. math:: f(x) + sum_i g_i(x)
-
-    using generalized forward-backward. This algorithm assumes that f and all
-    g_i are convex, that f has a Lipschitz gradient and that all g_i are
-    prox-capable.
+
+    For the minimization of objectives of the form
+
+    .. math::
+        f(x) + \\sum_{p=1}^P g_p(x),
+
+    where :math:`f` has a smooth gradient and :math:`g_1, \\ldots, g_P` are
+    prox-capable. Function :math:`f` corresponds to the ``model.loss`` method
+    of the model (passed with ``set_model`` to the solver) and
+    :math:`g_1, \\ldots, g_P` correspond to the list of prox passed with the
+    ``set_prox`` method.
+    One iteration of :class:`GFB <tick.optim.solver.GFB>` is as follows:
+
+    .. math::
+        \\begin{align*}
+        &\\text{for } p=1, \\ldots, P \\; \\text{ do the following:} \\\\
+        & \\quad z_p \\gets \\mathrm{prox}_{P \\eta g_p} \\Big(2 w - z_p^{\\text{old}}
+        - \\eta \\nabla f(w) \\Big) \\\\
+        & \\quad z_p \\gets z_p^{\\text{old}} + \\beta (z_p - w) \\\\
+        &w \\gets \\frac 1P \\sum_{p=1}^P z_p
+        \\end{align*}
+
+    where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
+    ``model.grad`` method and :math:`\\mathrm{prox}_{\\eta g_p}` is given by the
+    ``prox[p].call`` method. The step-size :math:`\\eta` can be tuned with
+    ``step``. The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver. The level of sur-relaxation :math:`\\beta` can be tuned
+    with the ``surrelax`` parameter.

     Parameters
     ----------
-    step : `float` default=None
-        Step-size of the algorithm.
+    step : `float`, default=None
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be tuned as
+        ``step = 1 / model.get_lip_best()``

-    tol : `float`, default=0.
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)
+
+    max_iter : `int`, default=500
+        Maximum number of iterations of the solver.

-    max_iter : `int`, default=1000
-        Maximum number of iterations of the solver
+    surrelax : `float`, default=1
+        Level of sur-relaxation to use in the algorithm.

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

     print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

     record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

-    surrelax : `float`, default=1
-        Relaxation parameter
+    Attributes
+    ----------
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method
+
+    prox : `list` of `Prox`
+        List of proximal operators used by the solver, passed with the
+        ``set_prox`` method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * H. Raguet, J. Fadili, G. Peyré, A generalized forward-backward splitting,
+      *SIAM Journal on Imaging Sciences* (2013)
     """

-    def __init__(self, step: float = None, tol: float = 0.,
-                 max_iter: int = 1000, surrelax=1., verbose: bool = True,
+    def __init__(self, step: float = None, tol: float = 1e-10,
+                 max_iter: int = 500, surrelax=1., verbose: bool = True,
                  print_every: int = 10, record_every: int = 1):
         SolverFirstOrder.__init__(self, step=step, tol=tol,
                                   max_iter=max_iter, verbose=verbose,
@@ -112,7 +165,7 @@ def set_prox(self, prox: list):
         """
         Parameters
         ----------
-        prox : list[Prox]
+        prox : `list` of `Prox`
             List of all proximal operators of the model
         """
         prox = CompositeProx(prox)
@@ -143,7 +196,7 @@ def _solve(self, x0: np.ndarray, step: float):
             # Relaxation step
             z[:] = z_old + self.surrelax * (z - x_old)

-        x[:] = 1./n_prox * sum(z_list)
+        x[:] = 1. / n_prox * sum(z_list)
         rel_delta = relative_distance(x, x_old)
         obj = self.objective(x)
         rel_obj = abs(obj - obj_old) / abs(obj_old)
diff --git a/tick/optim/solver/saga.py b/tick/optim/solver/saga.py
index b7f8a3f77..de8924c3c 100644
--- a/tick/optim/solver/saga.py
+++ b/tick/optim/solver/saga.py
@@ -14,61 +14,137 @@
 class SAGA(SolverFirstOrderSto):
-    """Stochastic average gradient algorithm
+    """Stochastic Average Gradient solver
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable. Note that :class:`SAGA <tick.optim.solver.SAGA>` works only
+    with linear models, see :ref:`linear-models` where all linear models are
+    listed.
+    Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method).
+    One iteration of :class:`SAGA <tick.optim.solver.SAGA>` corresponds to the
+    following iteration applied ``epoch_size`` times:
+
+    .. math::
+        \\begin{align*}
+        w &\\gets \\mathrm{prox}_{\\eta g} \\Big(w - \\eta \\Big(\\nabla f_i(w)
+        - \\delta_i + \\frac 1n \\sum_{i'=1}^n \\delta_{i'} \\Big) \\Big) \\\\
+        \\delta_i &\\gets \\nabla f_i(w)
+        \\end{align*}
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration, and where the phase iterate used for variance reduction is
+    updated at the beginning of each epoch, with a strategy that depends on
+    the ``variance_reduction`` parameter.
+    The step-size :math:`\\eta` can be
+    tuned with ``step``, and the random number generator used to sample
+    :math:`i` can be seeded with ``seed``. The iterations stop
+    whenever tolerance ``tol`` is achieved, or after ``max_iter`` epochs
+    (namely ``max_iter`` :math:`\\times` ``epoch_size`` iterates).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver.
+
+    Internally, :class:`SAGA <tick.optim.solver.SAGA>` has dedicated code when
+    the model is a generalized linear model with sparse features, and a
+    separable proximal operator: in this case, each iteration works only in the
+    set of non-zero features, leading to much faster iterates.

     Parameters
     ----------
-    epoch_size : `int`
-        Epoch size
+    step : `float`
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be automatically tuned as
+        ``step = 1 / model.get_lip_max()``. Otherwise, use a try-and-improve
+        approach

-    rand_type : `str`
-        Type of random sampling
-
-        * if ``"unif"`` samples are uniformly drawn among all possibilities
-        * if ``"perm"`` a random permutation of all possibilities is
-          generated and samples are sequentially taken from it. Once all of
-          them have been taken, a new random permutation is generated
-
-    tol : `float`, default=0
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

-    max_iter : `int`
-        Maximum number of iterations of the solver
+    max_iter : `int`, default=10
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway
+
+    seed : `int`, default=-1
+        The seed of the random sampling. If it is negative then a random seed
+        (different at each run) will be chosen.
+
+    epoch_size : `int`, default given by model
+        Epoch size, namely how many iterations are made before updating the
+        variance reducing term. By default, this is automatically tuned using
+        information from the model object passed through ``set_model``.
+
+    variance_reduction : {'last', 'avg', 'rand'}, default='last'
+        Strategy used for the computation of the iterate used in
+        variance reduction (also called phase iterate).
+        A warning will be
+        raised if the ``'avg'`` strategy is used when the model is a
+        generalized linear model with sparse features, since it is strongly
+        sub-optimal in this case
+
+        * ``'last'`` : the phase iterate is the last iterate of the previous
+          epoch
+        * ``'avg'`` : the phase iterate is the average over the iterates in the
+          past epoch
+        * ``'rand'``: the phase iterate is a random iterate of the previous
+          epoch

-    print_every : `int`, default = 10
-        Print history information every time the iteration number is a
-        multiple of ``print_every``
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data

-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
+          generated and samples are sequentially taken from it. Once all of
+          them have been taken, a new random permutation is generated

-    variance_reduction : {'last', 'avg', 'rand'}, default = last
-        Determine what is used as phase iterate for variance reduction.
+    print_every : `int`, default=1
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

-        * 'last' : the phase iterate is the last iterate of the previous epoch
-        * 'avg' : the phase iterate is the average over the iterates in the past
-          epoch
-        * 'rand': the phase iterate is a random iterate of the previous epoch
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``

     References
     ----------
-    A. Defazio, F. Bach, S. Lacoste-Julien, SAGA: A fast incremental gradient
-    method with support for non-strongly convex composite objectives, NIPS 2014
+    * A. Defazio, F. Bach, S. Lacoste-Julien, SAGA: A fast incremental gradient
+      method with support for non-strongly convex composite objectives,
+      NIPS 2014
     """

     def __init__(self, step: float = None, epoch_size: int = None,
@@ -130,4 +206,4 @@ def set_model(self, model: ModelGeneralizedLinear):
             return SolverFirstOrderSto.set_model(self, model)
         else:
             raise ValueError("SAGA accepts only childs of "
-                             "`ModelGeneralizedLinear`")
\ No newline at end of file
+                             "`ModelGeneralizedLinear`")
diff --git a/tick/optim/solver/scpg.py b/tick/optim/solver/scpg.py
index 063d045c4..2f75da383 100644
--- a/tick/optim/solver/scpg.py
+++ b/tick/optim/solver/scpg.py
@@ -10,38 +10,64 @@
 from tick.optim.solver.base import SolverFirstOrder
 from tick.optim.solver.base.utils import relative_distance

-__author__ = 'MartinBompaire'
-
 class SCPG(SolverFirstOrder):
-    """
-    Minimize the objective
-
-    .. math:: f(x) + g(x)
-
-    using the Proximal Gradient algorithm
-    This algorithm assumes that f and g are both convex, that
-    f is standard self-concordant and that g is prox-capable.
+    """Self-Concordant Proximal Gradient descent
+
+    For the minimization of objectives of the form
+
+    .. math::
+        f(w) + g(w),
+
+    where :math:`f` is self-concordant and :math:`g` is prox-capable.
+    Function :math:`f` corresponds to the ``model.loss`` method of the model
+    (passed with ``set_model`` to the solver) and :math:`g` corresponds to
+    the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
+    One iteration of :class:`SCPG <tick.optim.solver.SCPG>` is as follows:
+
+    .. math::
+        y^k &\\gets \\mathrm{prox}_{g / L_k} \\big( w^k - \\tfrac{1}{L_k}
+        \\nabla f(w^k) \\big) \\\\
+        d^k &\\gets y^k - w^k \\\\
+        \\beta_k &\\gets \\sqrt{L_k} \\| d^k \\|_2 \\\\
+        \\lambda_k &\\gets \\sqrt{{d^k}^\\top \\nabla^2 f(w^k) d^k} \\\\
+        \\alpha_k &\\gets \\frac{\\beta_k}{\\lambda_k (\\lambda_k +
+        \\beta_k^2)} \\\\
+        w^{k + 1} &\\gets w^{k} + \\alpha_k d^k
+
+    where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
+    ``model.grad`` method and :math:`\\mathrm{prox}_{g / L_k}` is given by the
+    ``prox.call`` method. The first step-size :math:`1 / L_k` can be tuned with
+    ``step``; it is then updated with Barzilai-Borwein steps and linesearch.
+    The iterations stop whenever tolerance ``tol`` is achieved, or after
+    ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver.

     Parameters
     ----------
+    step : `float`, default=None
+        Step-size parameter that will be used at the first iteration. Then
+        the linesearch algorithm will compute the next steps.

-    modified : `bool` (default False)
-        Weather or not using the modified version fo the algorithm.
-
-    step : `float` default=None
-        Step-size of the algorithm. If ``linesearch=True``, this is the
-        first step-size to be used in the linesearch
-        (typically taken too large). Otherwise, it's the constant step
-        to be used along iterations.
-
-    tol : `float`, default=0.
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

     max_iter : `int`, default=100
-        Maximum number of iterations of the solver
+        Maximum number of iterations of the solver.
+
+    verbose : `bool`, default=True
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway
+
+    print_every : `int`, default=10
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True
+
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     linesearch_step_increase : `float`, default=2.
        Factor of step increase when using linesearch
@@ -49,25 +75,26 @@ class SCPG(SolverFirstOrder):
     linesearch_step_decrease : `float`, default=0.5
         Factor of step decrease when using linesearch

-    verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
-
-    print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
-
-    record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+    modified : `bool`, default=False
+        Enables the modified version of the algorithm. The modified version
+        consists in evaluating :math:`f` at :math:`y^k` and :math:`w^{k+1}`
+        and keeping the iterate that gives the smallest value of :math:`f`.

     Attributes
     ----------
     model : `Model`
-        The model to solve
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method

     time_start : `str`
         Start date of the call to ``solve()``
@@ -77,6 +104,18 @@ class SCPG(SolverFirstOrder):
     time_end : `str`
         End date of the call to ``solve()``
+
+    Notes
+    -----
+    This algorithm is designed to work properly with self-concordant losses
+    such as Poisson regression with linear link
+    :class:`ModelPoisReg <tick.optim.model.ModelPoisReg>` or Hawkes processes
+    likelihood
+    :class:`ModelHawkesFixedSumExpKernLogLik <tick.optim.model.ModelHawkesFixedSumExpKernLogLik>`.
+
+    References
+    ----------
+    * Q. Tran-Dinh, A. Kyrillidis, V. Cevher, Composite self-concordant
+      minimization, *The Journal of Machine Learning Research* (2015)
     """

     _attrinfos = {
@@ -155,9 +194,10 @@ def value(self, coeffs: np.ndarray):
             return self.original_prox.value(coeffs) * self.sc_corr

-    def __init__(self, step: float = None, tol: float = 0.,
-                 max_iter: int = 100, linesearch_step_increase: float = 2.,
-                 linesearch_step_decrease: float = 0.5, verbose: bool = True,
+    def __init__(self, step: float = None, tol: float = 1e-10,
+                 max_iter: int = 100, verbose: bool = True,
                  print_every: int = 10, record_every: int = 1,
+                 linesearch_step_increase: float = 2.,
+                 linesearch_step_decrease: float = 0.5,
                  modified=False):
         SolverFirstOrder.__init__(self, step=step, tol=tol,
                                   max_iter=max_iter, verbose=verbose,
                                   print_every=print_every,
@@ -296,7 +336,7 @@ def _gradient_step(self, x, prev_x, grad_x_ssc, prev_grad_x_ssc,
         # during function's run
         return x_new, y_k, alpha_k, beta_k, lambda_k, l_k

-    def _solve(self, x0: np.ndarray = None, step: float = 1e5):
+    def _solve(self, x0: np.ndarray = None, step: float = None):
         step, obj, x, prev_x, prev_grad_x_ssc, grad_x_ssc = \
             self._initialize_values(x0, step=step)
diff --git a/tick/optim/solver/sdca.py b/tick/optim/solver/sdca.py
index f6901c90c..6a950640b 100644
--- a/tick/optim/solver/sdca.py
+++ b/tick/optim/solver/sdca.py
@@ -7,54 +7,144 @@
 class SDCA(SolverFirstOrderSto):
-    """Stochastic Dual Coordinate Ascent solver
+    """Stochastic Dual Coordinate Ascent
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w^\\top x_i) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable. This solver actually requires more than that, since it is
+    working in a Fenchel dual formulation of the primal problem given above.
+
+    First, it requires that some ridge penalization is used, hence the
+    mandatory parameter ``l_l2sq`` below: SDCA will actually minimize the
+    objective
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(x_i^\\top w) + g(w) + \\frac{\\lambda}{2}
+        \\| w \\|_2^2,
+
+    where :math:`\\lambda` is tuned with the ``l_l2sq`` parameter (see below).
+    Now, putting :math:`h(w) = g(w) + \\lambda \\|w\\|_2^2 / 2`, SDCA maximizes
+    the Fenchel dual problem
+
+    .. math::
+        D(\\alpha) = \\frac 1n \\sum_{i=1}^n - f_i^*(-\\alpha_i) - \\lambda
+        h^*\\Big( \\frac{1}{\\lambda n} \\sum_{i=1}^n \\alpha_i x_i \\Big),
+
+    where :math:`f_i^*` and :math:`h^*` are the Fenchel duals of :math:`f_i`
+    and :math:`h` respectively.
+    Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method). One iteration of
+    :class:`SDCA <tick.optim.solver.SDCA>` corresponds to the
+    following iteration applied ``epoch_size`` times:
+
+    .. math::
+        \\begin{align*}
+        \\delta_i &\\gets \\arg\\min_{\\delta} \\Big[ \\; f_i^*(-\\alpha_i -
+        \\delta) + w^\\top x_i \\delta + \\frac{1}{2 \\lambda n} \\| x_i\\|_2^2
+        \\delta^2 \\Big] \\\\
+        \\alpha_i &\\gets \\alpha_i + \\delta_i \\\\
+        v &\\gets v + \\frac{1}{\\lambda n} \\delta_i x_i \\\\
+        w &\\gets \\nabla g^*(v)
+        \\end{align*}
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration. The ridge regularization :math:`\\lambda` can be tuned with
+    ``l_l2sq``, and the random number generator used to sample :math:`i` can be
+    seeded with ``seed``. The iterations stop whenever tolerance ``tol`` is
+    achieved, or after ``max_iter`` epochs
+    (namely ``max_iter`` :math:`\\times` ``epoch_size`` iterates).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver. The dual solution
+    :math:`\\alpha` is stored in the ``dual_solution`` attribute.
+
+    Internally, :class:`SDCA <tick.optim.solver.SDCA>` has dedicated code when
+    the model is a generalized linear model with sparse features, and a
+    separable proximal operator: in this case, each iteration works only in the
+    set of non-zero features, leading to much faster iterates.

     Parameters
     ----------
     l_l2sq : `float`
         Level of L2 penalization. L2 penalization is mandatory for SDCA.
+        Convergence properties of this solver are deeply connected to this
+        parameter, which should be understood as the "step" used by the
+        algorithm.

-    epoch_size : `int`
-        Epoch size
+    tol : `float`, default=1e-10
+        The tolerance of the solver (iterations stop when the stopping
+        criterion is below it)

-    rand_type : `str`
-        Type of random sampling
+    max_iter : `int`, default=10
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

-        * if ``"unif"`` samples are uniformly drawn among all possibilities
-        * if ``"perm"`` a random permutation of all possibilities is
-          generated and samples are sequentially taken from it. Once all of
-          them have been taken, a new random permutation is generated
+    verbose : `bool`, default=True
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    tol : `float`, default=0
-        The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+    seed : `int`, default=-1
+        The seed of the random sampling. If it is negative then a random seed
+        (different at each run) will be chosen.

-    max_iter : `int`
-        Maximum number of iterations of the solver
+    epoch_size : `int`, default given by model
+        Epoch size, namely the number of iterations performed in one epoch.
+        By default, this is automatically tuned using information from the
+        model object passed through ``set_model``.

-    verbose : `bool`, default=`True`
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data

-    print_every : `int`, default = 10
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
+          generated and samples are sequentially taken from it. Once all of
+          them have been taken, a new random permutation is generated
+
+    print_every : `int`, default=1
         Print history information every time the iteration number is a
-        multiple of ``print_every``
+        multiple of ``print_every``. Used only if ``verbose`` is True

-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    dual_solution : `numpy.array`
+        Dual vector corresponding to the primal solution obtained by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``

-    dual_solution : `np.ndarray`
-        Dual vector to which the solver has converged
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * S. Shalev-Shwartz and T. Zhang, Accelerated proximal stochastic dual
+      coordinate ascent for regularized loss minimization, *ICML 2014*
     """

     _attrinfos = {
@@ -62,9 +152,9 @@ class SDCA(SolverFirstOrderSto):
     }

     def __init__(self, l_l2sq: float, epoch_size: int = None,
-                 rand_type: str = "unif", tol: float = 0.,
-                 max_iter: int = 100, verbose: bool = True,
-                 print_every: int = 10, record_every: int = 1,
+                 rand_type: str = 'unif', tol: float = 1e-10,
+                 max_iter: int = 10, verbose: bool = True,
+                 print_every: int = 1, record_every: int = 1,
                  seed: int = -1):
         SolverFirstOrderSto.__init__(self, step=0,
                                      epoch_size=epoch_size,
@@ -101,6 +191,18 @@ def objective(self, coeffs, loss: float = None):
         return SolverFirstOrderSto.objective(self, coeffs, loss) + prox_l2_value

     def dual_objective(self, dual_coeffs):
+        """Compute the dual objective at ``dual_coeffs``
+
+        Parameters
+        ----------
+        dual_coeffs : `numpy.ndarray`, shape=(n_samples,)
+            The dual objective is computed at this point
+
+        Returns
+        -------
+        output : `float`
+            Value of the dual objective at given ``dual_coeffs``
+        """
         primal = self.model._sdca_primal_dual_relation(self.l_l2sq, dual_coeffs)
         prox_l2_value = 0.5 * self.l_l2sq * np.linalg.norm(primal) ** 2
         return self.model.dual_loss(dual_coeffs) - prox_l2_value
diff --git a/tick/optim/solver/sgd.py b/tick/optim/solver/sgd.py
index df60a5b9e..7298d4882 100644
--- a/tick/optim/solver/sgd.py
+++ b/tick/optim/solver/sgd.py
@@ -10,57 +10,111 @@
 class SGD(SolverFirstOrderSto):
-    """
-    Stochastic Gradient Descent solver
+    """Stochastic gradient descent solver
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable. Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method).
+    One iteration of :class:`SGD <tick.optim.solver.SGD>` corresponds to the
+    following iteration applied ``epoch_size`` times:
+
+    .. math::
+        w^{t+1} \\gets \\mathrm{prox}_{\\eta_t g} \\big(w^t - \\eta_t
+        \\nabla f_i(w^t) \\big),
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration, where :math:`\\eta_t = \\eta / (t + 1)`, with
+    :math:`\\eta > 0` a parameter that can be tuned with ``step``. The random
+    number generator used to sample :math:`i` can be seeded with
+    ``seed``. The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` epochs (namely ``max_iter`` :math:`\\times`
+    ``epoch_size`` iterations).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver.

     Parameters
     ----------
-    epoch_size : `int`
-        Epoch size
-
-    rand_type : `str`
-        Type of random sampling
-
-        * if ``"unif"`` samples are uniformly drawn among all possibilities
-        * if ``"perm"`` a random permutation of all possibilities is
-          generated and samples are sequentially taken from it. Once all of
-          them have been taken, a new random permutation is generated
+    step : `float`
+        Step-size parameter, the most important parameter of the solver.
+        A try-and-improve approach should be used.

-    tol : `float`, default=0
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

-    max_iter : `int`
-        Maximum number of iterations of the solver
+    max_iter : `int`, default=100
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
-
-    print_every : `int`, default = 10
-        Print history information every time the iteration number is a
-        multiple of ``print_every``
-
-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    seed : `int`
+    seed : `int`, default=-1
         The seed of the random sampling. If it is negative then a random seed
         (different at each run) will be chosen.

+    epoch_size : `int`, default given by model
+        Epoch size, namely the number of iterations performed in one epoch.
+        By default, this is automatically tuned using information from the
+        model object passed through ``set_model``.
+
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data
+
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
+          generated and samples are sequentially taken from it. Once all of
+          them have been taken, a new random permutation is generated
+
+    print_every : `int`, default=10
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True
+
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``
+
     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations.
+        It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * https://en.wikipedia.org/wiki/Stochastic_gradient_descent
     """

     def __init__(self, step: float = None, epoch_size: int = None,
-                 rand_type: str = "unif", tol: float = 0.,
+                 rand_type: str = "unif", tol: float = 1e-10,
                  max_iter: int = 100, verbose: bool = True,
                  print_every: int = 10, record_every: int = 1,
                  seed: int = -1):
diff --git a/tick/optim/solver/svrg.py b/tick/optim/solver/svrg.py
index b35d62609..dad3c12b3 100644
--- a/tick/optim/solver/svrg.py
+++ b/tick/optim/solver/svrg.py
@@ -7,8 +7,6 @@
 __author__ = "Stephane Gaiffas"

-# TODO: preparer methodes pour set et get attributes
-
 variance_reduction_methods_mapper = {
     'last': _SVRG.VarianceReductionMethod_Last,
     'avg': _SVRG.VarianceReductionMethod_Average,
@@ -17,74 +15,151 @@
 class SVRG(SolverFirstOrderSto):
-    """
-    Stochastic variance reduced gradient
+    """Stochastic Variance Reduced Gradient solver
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable. Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method).
+    One iteration of :class:`SVRG <tick.optim.solver.SVRG>` corresponds to the
+    following iteration applied ``epoch_size`` times:
+
+    .. math::
+        w \\gets \\mathrm{prox}_{\\eta g} \\big(w - \\eta (\\nabla f_i(w) -
+        \\nabla f_i(\\bar{w}) + \\nabla f(\\bar{w})) \\big),
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration, and where :math:`\\bar w` and :math:`\\nabla f(\\bar w)`
+    are updated at the beginning of each epoch, with a strategy that depends on
+    the ``variance_reduction`` parameter. The step-size :math:`\\eta` can be
+    tuned with ``step``, and the random number generator used to sample
+    :math:`i` can be seeded with ``seed``. The iterations stop
+    whenever tolerance ``tol`` is achieved, or after ``max_iter`` epochs
+    (namely ``max_iter`` :math:`\\times` ``epoch_size`` iterates).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver.
+
+    Internally, :class:`SVRG <tick.optim.solver.SVRG>` has dedicated code when
+    the model is a generalized linear model with sparse features, and a
+    separable proximal operator: in this case, each iteration works only in the
+    set of non-zero features, leading to much faster iterates.
+
+    Moreover, when ``n_threads`` > 1, this class actually implements parallel
+    and asynchronous updates of :math:`w`, which is likely to accelerate
+    optimization, depending on the sparsity of the dataset, and the number of
+    available cores.

     Parameters
     ----------
-    epoch_size : `int`
-        Epoch size
-
-    rand_type : `str`
-        Type of random sampling
-
-        * if ``'unif'`` samples are uniformly drawn among all possibilities
-        * if ``'perm'`` a random permutation of all possibilities is
-          generated and samples are sequentially taken from it. Once all of
-          them have been taken, a new random permutation is generated
+    step : `float`
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be automatically tuned as
+        ``step = 1 / model.get_lip_max()``. Otherwise, use a try-and-improve
+        approach

-    tol : `float`, default=0
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

-    max_iter : `int`
-        Maximum number of iterations of the solver
+    max_iter : `int`, default=10
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    print_every : `int`, default = 10
-        Print history information every time the iteration number is a
-        multiple of ``print_every``
+    seed : `int`, default=-1
+        The seed of the random sampling. If it is negative then a random seed
+        (different at each run) will be chosen.

-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+    n_threads : `int`, default=1
+        Number of threads to use for parallel optimization. The strategy used
+        for this is asynchronous updates of the iterates.
+
+    epoch_size : `int`, default given by model
+        Epoch size, namely how many iterations are made before updating the
+        variance reducing term. By default, this is automatically tuned using
+        information from the model object passed through ``set_model``.

-    Other Parameters
-    ----------------
     variance_reduction : {'last', 'avg', 'rand'}, default='last'
-        Determine what is used as phase iterate for variance reduction.
+        Strategy used for the computation of the iterate used in
+        variance reduction (also called phase iterate). A warning will be
+        raised if the ``'avg'`` strategy is used when the model is a
+        generalized linear model with sparse features, since it is strongly
+        sub-optimal in this case
+
+        * ``'last'`` : the phase iterate is the last iterate of the previous
+          epoch
+        * ``'avg'`` : the phase iterate is the average over the iterates in the
+          past epoch
+        * ``'rand'``: the phase iterate is a random iterate of the previous
+          epoch
+
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data
+
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
+          generated and samples are sequentially taken from it. Once all of
+          them have been taken, a new random permutation is generated
+
+    print_every : `int`, default=1
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

-        * 'last' : the phase iterate is the last iterate of the previous epoch
-        * 'avg' : the phase iterate is the average over the iterates in the past
-          epoch. This is really a bad idea when using sparse datasets, a
-          warning will be raised in this case
-        * 'rand': the phase iterate is a random iterate of the previous epoch
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * L. Xiao and T. Zhang, A proximal stochastic gradient method with
+      progressive variance reduction, *SIAM Journal on Optimization* (2014)
     """

     def __init__(self, step: float = None, epoch_size: int = None,
-                 rand_type: str = 'unif', tol: float = 0.,
-                 max_iter: int = 100, verbose: bool = True,
-                 print_every: int = 10, record_every: int = 1,
+                 rand_type: str = 'unif', tol: float = 1e-10,
+                 max_iter: int = 10, verbose: bool = True,
+                 print_every: int = 1, record_every: int = 1,
                  seed: int = -1, variance_reduction: str = 'last',
                  n_threads: int = 1):
-
-        self.n_threads = n_threads
-
         SolverFirstOrderSto.__init__(self, step, epoch_size, rand_type,
                                      tol, max_iter, verbose, print_every,
                                      record_every, seed=seed)
+        self.n_threads = n_threads

         step = self.step
         if step is None:
             step = 0.