diff --git a/doc/modules/api.rst b/doc/modules/api.rst
index 0d939ffee..85926f4d4 100644
--- a/doc/modules/api.rst
+++ b/doc/modules/api.rst
@@ -235,6 +235,7 @@ Batch solvers
    optim.solver.AGD
    optim.solver.BFGS
    optim.solver.GFB
+   optim.solver.SCPG

 Stochastic solvers
 ------------------
@@ -245,9 +246,10 @@ Stochastic solvers
    :template: class.rst

    optim.solver.SGD
+   optim.solver.AdaGrad
    optim.solver.SVRG
+   optim.solver.SAGA
    optim.solver.SDCA
-   optim.solver.AdaGrad

 History
 -------
diff --git a/doc/modules/optim.rst b/doc/modules/optim.rst
index b36ab12f8..b09290708 100644
--- a/doc/modules/optim.rst
+++ b/doc/modules/optim.rst
@@ -487,7 +487,7 @@
 and :math:`+\infty` otherwise). Note that depending on the problem,
 :math:`g` might actually use only a subset of entries of :math:`w`.
 For instance, for generalized linear models, :math:`w` contains the model weights and
-an intercept, which is not penalized, see :ref:`generalized linear models `.
+an intercept, which is not penalized, see :ref:`generalized linear models <linear-models>`.
 Indeed, in all ``prox`` classes, an optional ``range`` parameter is available,
 to apply the regularization only to a subset of entries of :math:`w`.
@@ -703,9 +703,11 @@
 Proximal gradient descent                               :class:`GD <tick.optim.solver.GD>`
 Broyden, Fletcher, Goldfarb, and Shanno (quasi-Newton)  :class:`BFGS <tick.optim.solver.BFGS>`
 Self-Concordant Proximal Gradient Descent               :class:`SCPG <tick.optim.solver.SCPG>`
+Generalized Forward-Backward                            :class:`GFB <tick.optim.solver.GFB>`
 Stochastic Gradient Descent                             :class:`SGD <tick.optim.solver.SGD>`
 Adaptive Gradient Descent solver                        :class:`AdaGrad <tick.optim.solver.AdaGrad>`
 Stochastic Variance Reduced Descent                     :class:`SVRG <tick.optim.solver.SVRG>`
+Stochastic Average Gradient Descent                     :class:`SAGA <tick.optim.solver.SAGA>`
 Stochastic Dual Coordinate Ascent                       :class:`SDCA <tick.optim.solver.SDCA>`
 ======================================================= ========================================
diff --git a/tick/inference/hawkes_em.py b/tick/inference/hawkes_em.py
index 1dedfafac..46b9e4ada 100644
--- a/tick/inference/hawkes_em.py
+++ b/tick/inference/hawkes_em.py
@@ -276,7 +276,7 @@ def score(self, events=None, end_times=None, baseline=None, kernel=None):
             Baseline vector for which the score is measured
             If `None` baseline obtained during fitting is used

-        kernel : `None` or `np.ndarray', shape=(n_nodes, n_nodes, kernel_size), default=None
+        kernel : `None` or `np.ndarray`, shape=(n_nodes, n_nodes, kernel_size), default=None
             Used to force start values for kernel parameter
             If `None` kernel obtained during fitting is used
diff --git a/tick/optim/solver/adagrad.py b/tick/optim/solver/adagrad.py
index d92f7b758..7f125fe2b 100644
--- a/tick/optim/solver/adagrad.py
+++ b/tick/optim/solver/adagrad.py
@@ -7,66 +7,125 @@
 class AdaGrad(SolverFirstOrderSto):
-    """
-    Adaptive Gradient Descent solver
-
-    Based on the works by:
-
-    Duchi, J., Hazan, E., & Singer, Y. (2011).
-    Adaptive Subgradient Methods for Online Learning and
-    Stochastic Optimization. Journal of Machine Learning Research.
+    """Adaptive stochastic gradient descent solver
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable and separable, namely
+
+    .. math::
+        g(w) = \\sum_{j=1}^d g_j(w_j)
+
+    where :math:`g_j` are prox-capable scalar functions of a single coordinate
+    :math:`w_j` of the vector of weights :math:`w \\in \\mathbb R^d`.
+    Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method). The given prox must be, as
+    explained above, separable.
+    One iteration of :class:`AdaGrad <tick.optim.solver.AdaGrad>` corresponds to
+    the following iteration applied ``epoch_size`` times:
+
+    .. math::
+        \\begin{align*}
+        &\\text{for } j=1, \\ldots, d \\; \\text{ do the following:} \\\\
+        & \\quad g_j \\gets ( \\nabla f_i(w) )_j \\\\
+        & \\quad d_j \\gets d_j + g_j^2 \\\\
+        & \\quad w_j \\gets w_j - \\frac{\\eta}{\\sqrt{d_j + 10^{-6}}} \\; g_j \\\\
+        & \\quad w_j \\gets \\mathrm{prox}_{\\eta_j g_j}(w_j)
+        \\end{align*}
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration, where :math:`\\eta_j = \\eta / \\sqrt{d_j + 10^{-6}}` is the
+    coordinate-wise step and :math:`\\eta > 0` can be tuned with ``step``.
+    The random number generator used to sample :math:`i` can be seeded with
+    ``seed``.
+    The iterations stop whenever tolerance ``tol`` is achieved, or after
+    ``max_iter`` epochs (namely ``max_iter`` :math:`\\times` ``epoch_size``
+    iterations).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver.

     Parameters
     ----------
-    step : `float` default=0.01
-        Step-size of the algorithm
+    step : `float`, default=1e-2
+        Step-size parameter, the most important parameter of the solver.
+        A try-and-improve approach should be used.
+
+    tol : `float`, default=1e-10
+        The tolerance of the solver (iterations stop when the stopping
+        criterion is below it)

-    epoch_size : `int`
-        Epoch size
+    max_iter : `int`, default=100
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

-    rand_type : `str`
-        Type of random sampling
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data

-        * if ``"unif"`` samples are uniformly drawn among all possibilities
-        * if ``"perm"`` a random permutation of all possibilities is
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
          generated and samples are sequentially taken from it. Once all of
          them have been taken, a new random permutation is generated

-    tol : `float`, default=0
-        The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
-
-    max_iter : `int`
-        Maximum number of iterations of the solver
-
     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    print_every : `int`, default = 10
+    print_every : `int`, default=10
         Print history information every time the iteration number is a
-        multiple of ``print_every``
+        multiple of ``print_every``. Used only if ``verbose`` is True

-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

-    seed : `int`
+    seed : `int`, default=-1
         The seed of the random sampling.
        If it is negative then a random seed
        (different at each run) will be chosen.

+    epoch_size : `int`, default given by model
+        Epoch size, namely the number of iterations performed in one epoch.
+        By default, this is automatically tuned using information from the
+        model object passed through ``set_model``.
+
     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * J. Duchi, E. Hazan, Y. Singer, Adaptive Subgradient Methods for Online
+      Learning and Stochastic Optimization, *Journal of Machine Learning
+      Research* (2011)
     """

-    def __init__(self, step: float = 0.01, epoch_size: int = None,
-                 rand_type: str = "unif", tol: float = 0.,
+    def __init__(self, step: float = 1e-2, epoch_size: int = None,
+                 rand_type: str = 'unif', tol: float = 1e-10,
                  max_iter: int = 100, verbose: bool = True,
                  print_every: int = 10, record_every: int = 1,
                  seed: int = -1):
diff --git a/tick/optim/solver/agd.py b/tick/optim/solver/agd.py
index 1a7f977b5..3e9c108e9 100644
--- a/tick/optim/solver/agd.py
+++ b/tick/optim/solver/agd.py
@@ -8,53 +8,85 @@
 class AGD(SolverFirstOrder):
-    """
-    AGD (accelerated proximal gradient descent) algorithm.
+    """Accelerated proximal gradient descent
+
+    For the minimization of objectives of the form
+
+    .. math::
+        f(w) + g(w),
+
+    where :math:`f` has a smooth gradient and :math:`g` is prox-capable.
+    Function :math:`f` corresponds to the ``model.loss`` method of the model
+    (passed with ``set_model`` to the solver) and :math:`g` corresponds to
+    the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
+    One iteration of :class:`AGD <tick.optim.solver.AGD>` is as follows:
+
+    .. math::
+        w^{k} &\\gets \\mathrm{prox}_{\\eta g} \\big(z^k - \\eta \\nabla f(z^k)
+        \\big) \\\\
+        t_{k+1} &\\gets \\frac{1 + \\sqrt{1 + 4 t_k^2}}{2} \\\\
+        z^{k+1} &\\gets w^k + \\frac{t_k - 1}{t_{k+1}} (w^k - w^{k-1})
+
+    where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
+    ``model.grad`` method and :math:`\\mathrm{prox}_{\\eta g}` is given by the
+    ``prox.call`` method. The step-size :math:`\\eta` can be tuned with
+    ``step``. The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver.

     Parameters
     ----------
-    step : `float` default=None
-        Step-size of the algorithm. If ``linesearch=True``, this is the
-        first step-size to be used in the linesearch
-        (typically taken too large). Otherwise, it's the constant step
-        to be used along iterations.
-
-    tol : `float`, default=0.
+    step : `float`, default=None
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be automatically tuned as
+        ``step = 1 / model.get_lip_best()``.
+        If ``linesearch=True``, this is the first step-size to be used in
+        the linesearch (which is typically taken too large).
+
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

     max_iter : `int`, default=100
-        Maximum number of iterations of the solver
+        Maximum number of iterations of the solver.

     linesearch : `bool`, default=True
-        Use backtracking linesearch
-
-    linesearch_step_increase : `float`, default=2.
-        Factor of step increase when using linesearch
-
-    linesearch_step_decrease : `float`, default=0.5
-        Factor of step decrease when using linesearch
+        If `True`, use backtracking linesearch to tune the step automatically.

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

     print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

     record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+        Save history information every time the iteration number is a
+        multiple of ``record_every``
+
+    linesearch_step_increase : `float`, default=2.
+        Factor of step increase when using linesearch
+
+    linesearch_step_decrease : `float`, default=0.5
+        Factor of step decrease when using linesearch

     Attributes
     ----------
     model : `Model`
-        The model to solve
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method

     time_start : `str`
         Start date of the call to ``solve()``
@@ -64,9 +96,15 @@ class AGD(SolverFirstOrder):
     time_end : `str`
         End date of the call to ``solve()``
+
+    References
+    ----------
+    * A. Beck and M. Teboulle, A fast iterative shrinkage-thresholding
+      algorithm for linear inverse problems,
+      *SIAM Journal on Imaging Sciences*, 2009
     """

-    def __init__(self, step: float = None, tol: float = 0.,
+    def __init__(self, step: float = None, tol: float = 1e-10,
                  max_iter: int = 100, linesearch: bool = True,
                  linesearch_step_increase: float = 2.,
                  linesearch_step_decrease: float = 0.5,
diff --git a/tick/optim/solver/base/first_order.py b/tick/optim/solver/base/first_order.py
index 6e164a91c..c6faccad1 100644
--- a/tick/optim/solver/base/first_order.py
+++ b/tick/optim/solver/base/first_order.py
@@ -166,7 +166,7 @@ def _initialize_values(self, x0: np.ndarray = None, step: float = None,
         return tuple(result)

     def set_prox(self, prox: Prox):
-        """Set proximal operator in the solver.
+        """Set proximal operator in the solver

         Parameters
         ----------
@@ -181,7 +181,7 @@ def set_prox(self, prox: Prox):
         Notes
         -----
         In some solvers, ``set_model`` must be called before
-        ``set_prox``, otherwise and error might be raised.
+        ``set_prox``, otherwise an error might be raised
         """
         if not isinstance(prox, Prox):
             raise ValueError('Passed object of class %s is not a '
@@ -198,12 +198,12 @@ def _as_dict(self):
         return dd

     def objective(self, coeffs, loss: float=None):
-        """Compute the objective minimized by the solver at ``coeffs``
+        """Compute the objective function

         Parameters
         ----------
-        coeffs : `numpy.ndarray`, shape=(n_coeffs,)
-            The objective is computed at this point
+        coeffs : `np.array`, shape=(n_coeffs,)
+            Point where the objective is computed

         loss : `float`, default=`None`
             Gives the value of the loss if already known (allows to
@@ -234,12 +234,13 @@ def solve(self, x0=None, step=None):
             Starting point of the solver

         step : `float`, default=`None`
-            Step-size or learning rate for the solver
+            Step-size or learning rate for the solver. This can also be tuned
+            using the ``step`` attribute

         Returns
         -------
         output : `np.array`, shape=(n_coeffs,)
-            Obtained minimizer for the problem
+            Obtained minimizer for the problem, same as the ``solution``
+            attribute
         """
         if self.model is None:
             raise ValueError('You must first set the model using '
diff --git a/tick/optim/solver/base/first_order_sto.py b/tick/optim/solver/base/first_order_sto.py
index 9841343fe..263f6bab5 100644
--- a/tick/optim/solver/base/first_order_sto.py
+++ b/tick/optim/solver/base/first_order_sto.py
@@ -116,19 +116,23 @@ def set_model(self, model: Model):
         return self

     def set_prox(self, prox: Prox):
-        """Set prox in the solver
+        """Set proximal operator in the solver

         Parameters
         ----------
         prox : `Prox`
-            Sets the prox in the solver
+            The proximal operator of the penalization function

         Returns
         -------
         output : `Solver`
-            The `Solver` with given prox
-        """
+            The solver with given prox
+
+        Notes
+        -----
+        In some solvers, ``set_model`` must be called before
+        ``set_prox``, otherwise an error might be raised
+        """
         SolverFirstOrder.set_prox(self, prox)
         SolverSto.set_prox(self, prox)
         return self
diff --git a/tick/optim/solver/bfgs.py b/tick/optim/solver/bfgs.py
index bd5ec2042..5aa776fae 100644
--- a/tick/optim/solver/bfgs.py
+++ b/tick/optim/solver/bfgs.py
@@ -10,40 +10,63 @@
 class BFGS(SolverFirstOrder):
-    """
-    BFGS (Broyden, Fletcher, Goldfarb, and Shanno ) algorithm.
-
-    This is a simple wrapping of `scipy.optimize.fmin_bfgs`
+    """Broyden, Fletcher, Goldfarb, and Shanno algorithm
+
+    This solver is actually a simple wrapping of `scipy.optimize.fmin_bfgs`,
+    the BFGS (Broyden, Fletcher, Goldfarb, and Shanno) algorithm. This is a
+    quasi-Newton algorithm that iteratively builds approximations of the
+    inverse Hessian. This solver can be used to minimize objectives of the form
+
+    .. math::
+        f(w) + g(w),
+
+    for :math:`f` with a smooth gradient and only :math:`g` corresponding to
+    the zero penalization (namely :class:`ProxZero <tick.optim.prox.ProxZero>`)
+    or ridge penalization (namely :class:`ProxL2sq <tick.optim.prox.ProxL2sq>`).
+    Function :math:`f` corresponds to the ``model.loss`` method of the model
+    (passed with ``set_model`` to the solver) and :math:`g` corresponds to
+    the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
+    The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver.

     Parameters
     ----------
-    tol : `float`, default=0.
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

-    max_iter : `int`, default=100
+    max_iter : `int`, default=10
         Maximum number of iterations of the solver

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
+    print_every : `int`, default=1
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

     record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     Attributes
     ----------
     model : `Model`
-        The model to solve
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method

     time_start : `str`
         Start date of the call to ``solve()``
@@ -56,9 +79,8 @@ class BFGS(SolverFirstOrder):
     References
     ----------
-    Quasi-Newton method of Broyden, Fletcher, Goldfarb,
-    and Shanno (BFGS), see
-    Wright, and Nocedal 'Numerical Optimization', 1999, pg. 198.
+    * Quasi-Newton method of Broyden, Fletcher, Goldfarb and Shanno (BFGS),
+      see Wright and Nocedal, 'Numerical Optimization', 1999, pg. 198.
     """

     _attrinfos = {
        }
    }

-    def __init__(self, tol: float = 0.,
-                 max_iter: int = 100, verbose: bool = True,
-                 print_every: int = 10, record_every: int = 1):
+    def __init__(self, tol: float = 1e-10,
+                 max_iter: int = 10, verbose: bool = True,
+                 print_every: int = 1, record_every: int = 1):
         SolverFirstOrder.__init__(self, step=None, tol=tol,
                                   max_iter=max_iter, verbose=verbose,
                                   print_every=print_every,
diff --git a/tick/optim/solver/gd.py b/tick/optim/solver/gd.py
index 0dc36b88c..ec41753a4 100644
--- a/tick/optim/solver/gd.py
+++ b/tick/optim/solver/gd.py
@@ -8,53 +8,82 @@
 class GD(SolverFirstOrder):
-    """
-    GD (proximal gradient descent) algorithm.
+    """Proximal gradient descent
+
+    For the minimization of objectives of the form
+
+    .. math::
+        f(w) + g(w),
+
+    where :math:`f` has a smooth gradient and :math:`g` is prox-capable.
+    Function :math:`f` corresponds to the ``model.loss`` method of the model
+    (passed with ``set_model`` to the solver) and :math:`g` corresponds to
+    the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
+    One iteration of :class:`GD <tick.optim.solver.GD>` is as follows:
+
+    .. math::
+        w \\gets \\mathrm{prox}_{\\eta g} \\big(w - \\eta \\nabla f(w) \\big),
+
+    where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
+    ``model.grad`` method and :math:`\\mathrm{prox}_{\\eta g}` is given by the
+    ``prox.call`` method. The step-size :math:`\\eta` can be tuned with
+    ``step``. The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver.
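+
+    A minimal usage sketch is given below (``ModelLinReg``, its ``fit``
+    method and the exact constructor signatures are assumptions made for
+    illustration and may differ across versions; any model exposing ``loss``
+    and ``grad`` can be plugged in the same way):
+
+    >>> import numpy as np
+    >>> from tick.optim.model import ModelLinReg
+    >>> from tick.optim.prox import ProxL2sq
+    >>> from tick.optim.solver import GD
+    >>> X = np.random.randn(200, 5)
+    >>> y = X.dot(np.ones(5)) + np.random.randn(200)
+    >>> model = ModelLinReg(fit_intercept=False).fit(X, y)
+    >>> prox = ProxL2sq(strength=1e-2)
+    >>> gd = GD(max_iter=100, verbose=False)
+    >>> gd = gd.set_model(model).set_prox(prox)
+    >>> coeffs = gd.solve()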

     Parameters
     ----------
-    step : `float` default=None
-        Step-size of the algorithm. If ``linesearch=True``, this is the
-        first step-size to be used in the linesearch
-        (typically taken too large). Otherwise, it's the constant step
-        to be used along iterations.
-
-    tol : `float`, default=0.
+    step : `float`, default=None
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be automatically tuned as
+        ``step = 1 / model.get_lip_best()``. If ``linesearch=True``, this is
+        the first step-size to be used in the linesearch (which is typically
+        taken too large).
+
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

     max_iter : `int`, default=100
-        Maximum number of iterations of the solver
+        Maximum number of iterations of the solver.

     linesearch : `bool`, default=True
-        Use backtracking linesearch
-
-    linesearch_step_increase : `float`, default=2.
-        Factor of step increase when using linesearch
-
-    linesearch_step_decrease : `float`, default=0.5
-        Factor of step decrease when using linesearch
+        If `True`, use backtracking linesearch to tune the step automatically.

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

     print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

     record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+        Save history information every time the iteration number is a
+        multiple of ``record_every``
+
+    linesearch_step_increase : `float`, default=2.
+        Factor of step increase when using linesearch
+
+    linesearch_step_decrease : `float`, default=0.5
+        Factor of step decrease when using linesearch

     Attributes
     ----------
     model : `Model`
-        The model to solve
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method

     time_start : `str`
         Start date of the call to ``solve()``
@@ -64,6 +93,12 @@ class GD(SolverFirstOrder):
     time_end : `str`
         End date of the call to ``solve()``
+
+    References
+    ----------
+    * A. Beck and M. Teboulle, A fast iterative shrinkage-thresholding
+      algorithm for linear inverse problems,
+      *SIAM Journal on Imaging Sciences*, 2009
     """

-    def __init__(self, step: float = None, tol: float = 0.,
+    def __init__(self, step: float = None, tol: float = 1e-10,
diff --git a/tick/optim/solver/gfb.py b/tick/optim/solver/gfb.py
index cc6fcdd03..a874a228e 100644
--- a/tick/optim/solver/gfb.py
+++ b/tick/optim/solver/gfb.py
@@ -12,7 +12,7 @@ class CompositeProx(Prox):

     Parameters
     ----------
-    prox_list : List[Prox]
+    prox_list : `list` of `Prox`
         List of prox that are wrapped

     Attributes
@@ -61,46 +61,99 @@ def value(self, coeffs: np.ndarray):

 class GFB(SolverFirstOrder):
     """Generalized Forward-Backward algorithm
-
-    Minimize the objective
-    .. math:: f(x) + sum_i g_i(x)
-
-    using generalized forward-backward. This algorithm assumes that f and all
-    g_i are convex, that f has a Lipschitz gradient and that all g_i are
-    prox-capable.
+
+    For the minimization of objectives of the form
+
+    .. math::
+        f(x) + \\sum_{p=1}^P g_p(x),
+
+    where :math:`f` has a smooth gradient and :math:`g_1, \\ldots, g_P` are
+    prox-capable. Function :math:`f` corresponds to the ``model.loss`` method
+    of the model (passed with ``set_model`` to the solver) and
+    :math:`g_1, \\ldots, g_P` correspond to the list of prox passed with the
+    ``set_prox`` method.
+    One iteration of :class:`GFB <tick.optim.solver.GFB>` is as follows:
+
+    .. math::
+        \\begin{align*}
+        &\\text{for } p=1, \\ldots, P \\; \\text{ do the following:} \\\\
+        & \\quad z_p \\gets \\mathrm{prox}_{P \\eta g_p} \\Big(2 w - z_p^{\\text{old}}
+        - \\eta \\nabla f(w) \\Big) \\\\
+        & \\quad z_p \\gets z_p^{\\text{old}} + \\beta (z_p - w) \\\\
+        &w \\gets \\frac 1P \\sum_{p=1}^P z_p
+        \\end{align*}
+
+    where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
+    ``model.grad`` method and :math:`\\mathrm{prox}_{\\eta g_p}` is given by the
+    ``prox[p].call`` method. The step-size :math:`\\eta` can be tuned with
+    ``step``. The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver. The level of sur-relaxation :math:`\\beta` can be tuned
+    with the ``surrelax`` parameter.

     Parameters
     ----------
-    step : `float` default=None
-        Step-size of the algorithm.
+    step : `float`, default=None
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be tuned as
+        ``step = 1 / model.get_lip_best()``

-    tol : `float`, default=0.
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)
+
+    max_iter : `int`, default=500
+        Maximum number of iterations of the solver.

-    max_iter : `int`, default=1000
-        Maximum number of iterations of the solver
+    surrelax : `float`, default=1
+        Level of sur-relaxation to use in the algorithm.

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

     print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

     record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

-    surrelax : `float`, default=1
-        Relaxation parameter
+    Attributes
+    ----------
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method
+
+    prox : `list` of `Prox`
+        List of proximal operators used by the solver, passed with the
+        ``set_prox`` method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * H. Raguet, J. Fadili, G. Peyré, A generalized forward-backward splitting,
+      *SIAM Journal on Imaging Sciences* (2013)
     """

-    def __init__(self, step: float = None, tol: float = 0.,
-                 max_iter: int = 1000, surrelax=1., verbose: bool = True,
+    def __init__(self, step: float = None, tol: float = 1e-10,
+                 max_iter: int = 500, surrelax=1., verbose: bool = True,
                  print_every: int = 10, record_every: int = 1):
         SolverFirstOrder.__init__(self, step=step, tol=tol,
                                   max_iter=max_iter, verbose=verbose,
@@ -112,7 +165,7 @@ def set_prox(self, prox: list):
         """
         Parameters
         ----------
-        prox : list[Prox]
+        prox : `list` of `Prox`
             List of all proximal operators of the model
         """
         prox = CompositeProx(prox)
@@ -143,7 +196,7 @@ def _solve(self, x0: np.ndarray, step: float):
             # Relaxation step
             z[:] = z_old + self.surrelax * (z - x_old)

-        x[:] = 1./n_prox * sum(z_list)
+        x[:] = 1. / n_prox * sum(z_list)
         rel_delta = relative_distance(x, x_old)
         obj = self.objective(x)
         rel_obj = abs(obj - obj_old) / abs(obj_old)
diff --git a/tick/optim/solver/saga.py b/tick/optim/solver/saga.py
index b7f8a3f77..de8924c3c 100644
--- a/tick/optim/solver/saga.py
+++ b/tick/optim/solver/saga.py
@@ -14,61 +14,137 @@
 class SAGA(SolverFirstOrderSto):
-    """Stochastic average gradient algorithm
+    """Stochastic Average Gradient solver
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable. Note that :class:`SAGA <tick.optim.solver.SAGA>` works only
+    with linear models, see :ref:`linear-models` where all linear models are
+    listed.
+    Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method).
+    One iteration of :class:`SAGA <tick.optim.solver.SAGA>` corresponds to the
+    following iteration applied ``epoch_size`` times:
+
+    .. math::
+        \\begin{align*}
+        w &\\gets \\mathrm{prox}_{\\eta g} \\Big(w - \\eta \\Big(\\nabla f_i(w)
+        - \\delta_i + \\frac 1n \\sum_{i'=1}^n \\delta_{i'} \\Big) \\Big) \\\\
+        \\delta_i &\\gets \\nabla f_i(w)
+        \\end{align*}
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration, and where the phase iterate used for variance reduction is
+    updated at the beginning of each epoch, with a strategy that depends on
+    the ``variance_reduction`` parameter.
+    The step-size :math:`\\eta` can be
+    tuned with ``step``, and the random number generator used to sample
+    :math:`i` can be seeded with ``seed``. The iterations stop
+    whenever tolerance ``tol`` is achieved, or after ``max_iter`` epochs
+    (namely ``max_iter`` :math:`\\times` ``epoch_size`` iterates).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver.
+
+    Internally, :class:`SAGA <tick.optim.solver.SAGA>` has dedicated code when
+    the model is a generalized linear model with sparse features, and a
+    separable proximal operator: in this case, each iteration works only in the
+    set of non-zero features, leading to much faster iterates.

     Parameters
     ----------
-    epoch_size : `int`
-        Epoch size
+    step : `float`
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be automatically tuned as
+        ``step = 1 / model.get_lip_max()``. Otherwise, use a try-and-improve
+        approach

-    rand_type : `str`
-        Type of random sampling
-
-        * if ``"unif"`` samples are uniformly drawn among all possibilities
-        * if ``"perm"`` a random permutation of all possibilities is
-          generated and samples are sequentially taken from it. Once all of
-          them have been taken, a new random permutation is generated
-
-    tol : `float`, default=0
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

-    max_iter : `int`
-        Maximum number of iterations of the solver
+    max_iter : `int`, default=10
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway
+
+    seed : `int`, default=-1
+        The seed of the random sampling. If it is negative then a random seed
+        (different at each run) will be chosen.
+
+    epoch_size : `int`, default given by model
+        Epoch size, namely how many iterations are made before updating the
+        variance reducing term. By default, this is automatically tuned using
+        information from the model object passed through ``set_model``.
+
+    variance_reduction : {'last', 'avg', 'rand'}, default='last'
+        Strategy used for the computation of the iterate used in
+        variance reduction (also called phase iterate).
+        A warning will be
+        raised if the ``'avg'`` strategy is used when the model is a
+        generalized linear model with sparse features, since it is strongly
+        sub-optimal in this case
+
+        * ``'last'`` : the phase iterate is the last iterate of the previous
+          epoch
+        * ``'avg'`` : the phase iterate is the average over the iterates in the
+          past epoch
+        * ``'rand'``: the phase iterate is a random iterate of the previous
+          epoch

-    print_every : `int`, default = 10
-        Print history information every time the iteration number is a
-        multiple of ``print_every``
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data

-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
+          generated and samples are sequentially taken from it. Once all of
+          them have been taken, a new random permutation is generated

-    variance_reduction : {'last', 'avg', 'rand'}, default = last
-        Determine what is used as phase iterate for variance reduction.
+    print_every : `int`, default=1
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

-        * 'last' : the phase iterate is the last iterate of the previous epoch
-        * 'avg' : the phase iterate is the average over the iterates in the past
-          epoch
-        * 'rand': the phase iterate is a random iterate of the previous epoch
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``

     References
     ----------
-    A. Defazio, F. Bach, S. Lacoste-Julien, SAGA: A fast incremental gradient
-    method with support for non-strongly convex composite objectives, NIPS 2014
+    * A. Defazio, F. Bach, S. Lacoste-Julien, SAGA: A fast incremental gradient
+      method with support for non-strongly convex composite objectives,
+      NIPS 2014
     """

     def __init__(self, step: float = None, epoch_size: int = None,
@@ -130,4 +206,4 @@ def set_model(self, model: ModelGeneralizedLinear):
             return SolverFirstOrderSto.set_model(self, model)
         else:
             raise ValueError("SAGA accepts only childs of "
-                             "`ModelGeneralizedLinear`")
\ No newline at end of file
+                             "`ModelGeneralizedLinear`")
diff --git a/tick/optim/solver/scpg.py b/tick/optim/solver/scpg.py
index 063d045c4..2f75da383 100644
--- a/tick/optim/solver/scpg.py
+++ b/tick/optim/solver/scpg.py
@@ -10,38 +10,64 @@
 from tick.optim.solver.base import SolverFirstOrder
 from tick.optim.solver.base.utils import relative_distance

-__author__ = 'MartinBompaire'
-
 class SCPG(SolverFirstOrder):
-    """
-    Minimize the objective
-
-    .. math:: f(x) + g(x)
-
-    using the Proximal Gradient algorithm
-    This algorithm assumes that f and g are both convex, that
-    f is standard self-concordant and that g is prox-capable.
+    """Self-Concordant Proximal Gradient descent
+
+    For the minimization of objectives of the form
+
+    .. math::
+        f(w) + g(w),
+
+    where :math:`f` is self-concordant and :math:`g` is prox-capable.
+    Function :math:`f` corresponds to the ``model.loss`` method of the model
+    (passed with ``set_model`` to the solver) and :math:`g` corresponds to
+    the ``prox.value`` method of the prox (passed with the ``set_prox`` method).
+    One iteration of :class:`SCPG <tick.optim.solver.SCPG>` is as follows:
+
+    .. math::
+        y^k &\\gets \\mathrm{prox}_{g / L_k} \\big( w^k - \\tfrac{1}{L_k}
+        \\nabla f(w^k) \\big) \\\\
+        d^k &\\gets y^k - w^k \\\\
+        \\beta_k &\\gets \\sqrt{L_k} \\| d^k \\|_2 \\\\
+        \\lambda_k &\\gets \\sqrt{{d^k}^\\top \\nabla^2 f(w^k) d^k} \\\\
+        \\alpha_k &\\gets \\frac{\\beta_k}{\\lambda_k (\\lambda_k +
+        \\beta_k^2)} \\\\
+        w^{k + 1} &\\gets w^{k} + \\alpha_k d^k
+
+    where :math:`\\nabla f(w)` is the gradient of :math:`f` given by the
+    ``model.grad`` method and :math:`\\mathrm{prox}_{g / L_k}` is given by the
+    ``prox.call`` method. The first step-size :math:`1 / L_k` can be tuned with
+    ``step``; it is then updated with Barzilai-Borwein steps and linesearch.
+    The iterations stop whenever tolerance ``tol`` is achieved, or after
+    ``max_iter`` iterations. The obtained solution :math:`w` is returned
+    by the ``solve`` method, and is also stored in the ``solution`` attribute
+    of the solver.

     Parameters
     ----------
+    step : `float`, default=None
+        Step-size parameter that will be used at the first iteration. Then
+        the linesearch algorithm will compute the next steps.

-    modified : `bool` (default False)
-        Weather or not using the modified version fo the algorithm.
-
-    step : `float` default=None
-        Step-size of the algorithm. If ``linesearch=True``, this is the
-        first step-size to be used in the linesearch
-        (typically taken too large). Otherwise, it's the constant step
-        to be used along iterations.
-
-    tol : `float`, default=0.
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

     max_iter : `int`, default=100
-        Maximum number of iterations of the solver
+        Maximum number of iterations of the solver.
+
+    verbose : `bool`, default=True
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway
+
+    print_every : `int`, default=10
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True
+
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     linesearch_step_increase : `float`, default=2.
        Factor of step increase when using linesearch
@@ -49,25 +75,26 @@ class SCPG(SolverFirstOrder):
     linesearch_step_decrease : `float`, default=0.5
         Factor of step decrease when using linesearch

-    verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
-
-    print_every : `int`, default=10
-        Print history information when ``n_iter`` (iteration number) is
-        a multiple of ``print_every``
-
-    record_every : `int`, default=1
-        Record history information when ``n_iter`` (iteration number) is
-        a multiple of ``record_every``
+    modified : `bool`, default=False
+        Enables the modified version of the algorithm. The modified version
+        consists in evaluating :math:`f` at :math:`y^k` and :math:`w^{k+1}`
+        and keeping the iterate that gives the smallest value of :math:`f`.

     Attributes
     ----------
     model : `Model`
-        The model to solve
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method

     time_start : `str`
         Start date of the call to ``solve()``
@@ -77,6 +104,18 @@ class SCPG(SolverFirstOrder):
     time_end : `str`
         End date of the call to ``solve()``
+
+    Notes
+    -----
+    This algorithm is designed to work properly with self-concordant losses
+    such as Poisson regression with linear link
+    :class:`ModelPoisReg <tick.optim.model.ModelPoisReg>` or Hawkes processes
+    likelihood
+    :class:`ModelHawkesFixedSumExpKernLogLik <tick.optim.model.ModelHawkesFixedSumExpKernLogLik>`.
+
+    References
+    ----------
+    * Q. Tran-Dinh, A. Kyrillidis, V. Cevher, Composite self-concordant
+      minimization, *The Journal of Machine Learning Research* (2015)
     """

     _attrinfos = {
@@ -155,9 +194,10 @@ def value(self, coeffs: np.ndarray):
             return self.original_prox.value(coeffs) * self.sc_corr

-    def __init__(self, step: float = None, tol: float = 0.,
-                 max_iter: int = 100, linesearch_step_increase: float = 2.,
-                 linesearch_step_decrease: float = 0.5, verbose: bool = True,
+    def __init__(self, step: float = None, tol: float = 1e-10,
+                 max_iter: int = 100, verbose: bool = True,
                  print_every: int = 10, record_every: int = 1,
+                 linesearch_step_increase: float = 2.,
+                 linesearch_step_decrease: float = 0.5,
                  modified=False):
         SolverFirstOrder.__init__(self, step=step, tol=tol,
                                   max_iter=max_iter, verbose=verbose,
                                   print_every=print_every,
@@ -296,7 +336,7 @@ def _gradient_step(self, x, prev_x, grad_x_ssc, prev_grad_x_ssc,
         # during function's run
         return x_new, y_k, alpha_k, beta_k, lambda_k, l_k

-    def _solve(self, x0: np.ndarray = None, step: float = 1e5):
+    def _solve(self, x0: np.ndarray = None, step: float = None):
         step, obj, x, prev_x, prev_grad_x_ssc, grad_x_ssc = \
             self._initialize_values(x0, step=step)
diff --git a/tick/optim/solver/sdca.py b/tick/optim/solver/sdca.py
index f6901c90c..6a950640b 100644
--- a/tick/optim/solver/sdca.py
+++ b/tick/optim/solver/sdca.py
@@ -7,54 +7,144 @@
 class SDCA(SolverFirstOrderSto):
-    """Stochastic Dual Coordinate Ascent solver
+    """Stochastic Dual Coordinate Ascent
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w^\\top x_i) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable. This solver actually requires more than that, since it is
+    working in a Fenchel dual formulation of the primal problem given above.
+
+    First, it requires that some ridge penalization is used, hence the
+    mandatory parameter ``l_l2sq`` below: SDCA will actually minimize the
+    objective
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(x_i^\\top w) + g(w) + \\frac{\\lambda}{2}
+        \\| w \\|_2^2,
+
+    where :math:`\\lambda` is tuned with the ``l_l2sq`` parameter (see below).
+    Now, putting :math:`h(w) = g(w) + \\lambda \\|w\\|_2^2 / 2`, SDCA maximizes
+    the Fenchel dual problem
+
+    .. math::
+        D(\\alpha) = \\frac 1n \\sum_{i=1}^n - f_i^*(-\\alpha_i) - \\lambda
+        h^*\\Big( \\frac{1}{\\lambda n} \\sum_{i=1}^n \\alpha_i x_i \\Big),
+
+    where :math:`f_i^*` and :math:`h^*` are the Fenchel duals of :math:`f_i`
+    and :math:`h` respectively.
+    Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method). One iteration of
+    :class:`SDCA <tick.optim.solver.SDCA>` corresponds to the
+    following iteration applied ``epoch_size`` times:
+
+    .. math::
+        \\begin{align*}
+        \\delta_i &\\gets \\arg\\min_{\\delta} \\Big[ \\; f_i^*(-\\alpha_i -
+        \\delta) + w^\\top x_i \\delta + \\frac{1}{2 \\lambda n} \\| x_i\\|_2^2
+        \\delta^2 \\Big] \\\\
+        \\alpha_i &\\gets \\alpha_i + \\delta_i \\\\
+        v &\\gets v + \\frac{1}{\\lambda n} \\delta_i x_i \\\\
+        w &\\gets \\nabla g^*(v)
+        \\end{align*}
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration. The ridge regularization :math:`\\lambda` can be tuned with
+    ``l_l2sq``, and the random number generator used to sample :math:`i` can be
+    seeded with ``seed``. The iterations stop whenever tolerance ``tol`` is
+    achieved, or after ``max_iter`` epochs
+    (namely ``max_iter`` :math:`\\times` ``epoch_size`` iterates).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver. The dual solution
+    :math:`\\alpha` is stored in the ``dual_solution`` attribute.
+
+    Internally, :class:`SDCA <tick.optim.solver.SDCA>` has dedicated code when
+    the model is a generalized linear model with sparse features, and a
+    separable proximal operator: in this case, each iteration works only in the
+    set of non-zero features, leading to much faster iterates.

     Parameters
     ----------
     l_l2sq : `float`
         Level of L2 penalization. L2 penalization is mandatory for SDCA.
+        Convergence properties of this solver are deeply connected to this
+        parameter, which should be understood as the "step" used by the
+        algorithm.

-    epoch_size : `int`
-        Epoch size
+    tol : `float`, default=1e-10
+        The tolerance of the solver (iterations stop when the stopping
+        criterion is below it)

-    rand_type : `str`
-        Type of random sampling
+    max_iter : `int`, default=10
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

-        * if ``"unif"`` samples are uniformly drawn among all possibilities
-        * if ``"perm"`` a random permutation of all possibilities is
-          generated and samples are sequentially taken from it. Once all of
-          them have been taken, a new random permutation is generated
+    verbose : `bool`, default=True
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    tol : `float`, default=0
-        The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+    seed : `int`, default=-1
+        The seed of the random sampling. If it is negative then a random seed
+        (different at each run) will be chosen.

-    max_iter : `int`
-        Maximum number of iterations of the solver
+    epoch_size : `int`, default given by model
+        Epoch size, namely the number of iterations performed in one epoch.
+        By default, this is automatically tuned using information from the
+        model object passed through ``set_model``.

-    verbose : `bool`, default=`True`
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data

-    print_every : `int`, default = 10
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
+          generated and samples are sequentially taken from it. Once all of
+          them have been taken, a new random permutation is generated
+
+    print_every : `int`, default=1
         Print history information every time the iteration number is a
-        multiple of ``print_every``
+        multiple of ``print_every``. Used only if ``verbose`` is True

-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    dual_solution : `numpy.array`
+        Dual vector corresponding to the primal solution obtained by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``

-    dual_solution : `np.ndarray`
-        Dual vector to which the solver has converged
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * S. Shalev-Shwartz and T. Zhang, Accelerated proximal stochastic dual
+      coordinate ascent for regularized loss minimization, *ICML 2014*
     """

     _attrinfos = {
@@ -62,9 +152,9 @@ class SDCA(SolverFirstOrderSto):
     }

     def __init__(self, l_l2sq: float, epoch_size: int = None,
-                 rand_type: str = "unif", tol: float = 0.,
-                 max_iter: int = 100, verbose: bool = True,
-                 print_every: int = 10, record_every: int = 1,
+                 rand_type: str = 'unif', tol: float = 1e-10,
+                 max_iter: int = 10, verbose: bool = True,
+                 print_every: int = 1, record_every: int = 1,
                  seed: int = -1):
         SolverFirstOrderSto.__init__(self, step=0,
                                      epoch_size=epoch_size,
@@ -101,6 +191,18 @@ def objective(self, coeffs, loss: float = None):
         return SolverFirstOrderSto.objective(self, coeffs, loss) + prox_l2_value

     def dual_objective(self, dual_coeffs):
+        """Compute the dual objective at ``dual_coeffs``
+
+        Parameters
+        ----------
+        dual_coeffs : `numpy.ndarray`, shape=(n_samples,)
+            The dual objective is computed at this point
+
+        Returns
+        -------
+        output : `float`
+            Value of the dual objective at given ``dual_coeffs``
+        """
         primal = self.model._sdca_primal_dual_relation(self.l_l2sq, dual_coeffs)
         prox_l2_value = 0.5 * self.l_l2sq * np.linalg.norm(primal) ** 2
         return self.model.dual_loss(dual_coeffs) - prox_l2_value
diff --git a/tick/optim/solver/sgd.py b/tick/optim/solver/sgd.py
index df60a5b9e..7298d4882 100644
--- a/tick/optim/solver/sgd.py
+++ b/tick/optim/solver/sgd.py
@@ -10,57 +10,111 @@
 class SGD(SolverFirstOrderSto):
-    """
-    Stochastic Gradient Descent solver
+    """Stochastic gradient descent solver
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable. Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method).
+    One iteration of :class:`SGD <tick.optim.solver.SGD>` corresponds to the
+    following iteration applied ``epoch_size`` times:
+
+    .. math::
+        w^{t+1} \\gets \\mathrm{prox}_{\\eta_t g} \\big(w^t - \\eta_t
+        \\nabla f_i(w^t) \\big),
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration, where :math:`\\eta_t = \\eta / (t + 1)`, with
+    :math:`\\eta > 0` a parameter that can be tuned with ``step``. The random
+    number generator used to sample :math:`i` can be seeded with
+    ``seed``. The iterations stop whenever tolerance ``tol`` is achieved, or
+    after ``max_iter`` epochs (namely ``max_iter`` :math:`\\times`
+    ``epoch_size`` iterations).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver.

     Parameters
     ----------
-    epoch_size : `int`
-        Epoch size
-
-    rand_type : `str`
-        Type of random sampling
-
-        * if ``"unif"`` samples are uniformly drawn among all possibilities
-        * if ``"perm"`` a random permutation of all possibilities is
-          generated and samples are sequentially taken from it. Once all of
-          them have been taken, a new random permutation is generated
+    step : `float`
+        Step-size parameter, the most important parameter of the solver.
+        A try-and-improve approach should be used.

-    tol : `float`, default=0
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

-    max_iter : `int`
-        Maximum number of iterations of the solver
+    max_iter : `int`, default=100
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
-
-    print_every : `int`, default = 10
-        Print history information every time the iteration number is a
-        multiple of ``print_every``
-
-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    seed : `int`
+    seed : `int`, default=-1
         The seed of the random sampling. If it is negative then a random seed
         (different at each run) will be chosen.

+    epoch_size : `int`, default given by model
+        Epoch size, namely the number of iterations performed in one epoch.
+        By default, this is automatically tuned using information from the
+        model object passed through ``set_model``.
+
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data
+
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
+          generated and samples are sequentially taken from it. Once all of
+          them have been taken, a new random permutation is generated
+
+    print_every : `int`, default=10
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True
+
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``
+
     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations.
+        It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * https://en.wikipedia.org/wiki/Stochastic_gradient_descent
     """

     def __init__(self, step: float = None, epoch_size: int = None,
-                 rand_type: str = "unif", tol: float = 0.,
+                 rand_type: str = "unif", tol: float = 1e-10,
                  max_iter: int = 100, verbose: bool = True,
                  print_every: int = 10, record_every: int = 1,
                  seed: int = -1):
diff --git a/tick/optim/solver/svrg.py b/tick/optim/solver/svrg.py
index b35d62609..dad3c12b3 100644
--- a/tick/optim/solver/svrg.py
+++ b/tick/optim/solver/svrg.py
@@ -7,8 +7,6 @@
 __author__ = "Stephane Gaiffas"

-# TODO: preparer methodes pour set et get attributes
-
 variance_reduction_methods_mapper = {
     'last': _SVRG.VarianceReductionMethod_Last,
     'avg': _SVRG.VarianceReductionMethod_Average,
@@ -17,74 +15,151 @@
 class SVRG(SolverFirstOrderSto):
-    """
-    Stochastic variance reduced gradient
+    """Stochastic Variance Reduced Gradient solver
+
+    For the minimization of objectives of the form
+
+    .. math::
+        \\frac 1n \\sum_{i=1}^n f_i(w) + g(w),
+
+    where the functions :math:`f_i` have smooth gradients and :math:`g` is
+    prox-capable. Function :math:`f = \\frac 1n \\sum_{i=1}^n f_i` corresponds
+    to the ``model.loss`` method of the model (passed with ``set_model`` to the
+    solver) and :math:`g` corresponds to the ``prox.value`` method of the
+    prox (passed with the ``set_prox`` method).
+    One iteration of :class:`SVRG <tick.optim.solver.SVRG>` corresponds to the
+    following iteration applied ``epoch_size`` times:
+
+    .. math::
+        w \\gets \\mathrm{prox}_{\\eta g} \\big(w - \\eta (\\nabla f_i(w) -
+        \\nabla f_i(\\bar{w}) + \\nabla f(\\bar{w})) \\big),
+
+    where :math:`i` is sampled at random (strategy depends on ``rand_type``) at
+    each iteration, and where :math:`\\bar w` and :math:`\\nabla f(\\bar w)`
+    are updated at the beginning of each epoch, with a strategy that depends on
+    the ``variance_reduction`` parameter. The step-size :math:`\\eta` can be
+    tuned with ``step``, and the random number generator used to sample
+    :math:`i` can be seeded with ``seed``. The iterations stop
+    whenever tolerance ``tol`` is achieved, or after ``max_iter`` epochs
+    (namely ``max_iter`` :math:`\\times` ``epoch_size`` iterates).
+    The obtained solution :math:`w` is returned by the ``solve`` method, and is
+    also stored in the ``solution`` attribute of the solver.
+
+    Internally, :class:`SVRG <tick.optim.solver.SVRG>` has dedicated code when
+    the model is a generalized linear model with sparse features, and a
+    separable proximal operator: in this case, each iteration works only in the
+    set of non-zero features, leading to much faster iterates.
+
+    Moreover, when ``n_threads`` > 1, this class actually implements parallel
+    and asynchronous updates of :math:`w`, which is likely to accelerate
+    optimization, depending on the sparsity of the dataset, and the number of
+    available cores.

     Parameters
     ----------
-    epoch_size : `int`
-        Epoch size
-
-    rand_type : `str`
-        Type of random sampling
-
-        * if ``'unif'`` samples are uniformly drawn among all possibilities
-        * if ``'perm'`` a random permutation of all possibilities is
-          generated and samples are sequentially taken from it. Once all of
-          them have been taken, a new random permutation is generated
+    step : `float`
+        Step-size parameter, the most important parameter of the solver.
+        Whenever possible, this can be automatically tuned as
+        ``step = 1 / model.get_lip_max()``. Otherwise, use a try-and-improve
+        approach

-    tol : `float`, default=0
+    tol : `float`, default=1e-10
         The tolerance of the solver (iterations stop when the stopping
-        criterion is below it). By default the solver does ``max_iter``
-        iterations
+        criterion is below it)

-    max_iter : `int`
-        Maximum number of iterations of the solver
+    max_iter : `int`, default=10
+        Maximum number of iterations of the solver, namely maximum number of
+        epochs (by default full pass over the data, unless ``epoch_size`` has
+        been modified from default)

     verbose : `bool`, default=True
-        If `True`, we verbose things, otherwise the solver does not
-        print anything (but records information in history anyway)
+        If `True`, the solver prints history information, otherwise nothing
+        is displayed, but history is recorded anyway

-    print_every : `int`, default = 10
-        Print history information every time the iteration number is a
-        multiple of ``print_every``
+    seed : `int`, default=-1
+        The seed of the random sampling. If it is negative then a random seed
+        (different at each run) will be chosen.

-    record_every : `int`, default = 1
-        Information along iteration is recorded in history each time the
-        iteration number of a multiple of ``record_every``
+    n_threads : `int`, default=1
+        Number of threads to use for parallel optimization. The strategy used
+        for this is asynchronous updates of the iterates.
+
+    epoch_size : `int`, default given by model
+        Epoch size, namely how many iterations are made before updating the
+        variance reducing term. By default, this is automatically tuned using
+        information from the model object passed through ``set_model``.

-    Other Parameters
-    ----------------
     variance_reduction : {'last', 'avg', 'rand'}, default='last'
-        Determine what is used as phase iterate for variance reduction.
+        Strategy used for the computation of the iterate used in
+        variance reduction (also called phase iterate). A warning will be
+        raised if the ``'avg'`` strategy is used when the model is a
+        generalized linear model with sparse features, since it is strongly
+        sub-optimal in this case
+
+        * ``'last'`` : the phase iterate is the last iterate of the previous
+          epoch
+        * ``'avg'`` : the phase iterate is the average over the iterates in the
+          past epoch
+        * ``'rand'``: the phase iterate is a random iterate of the previous
+          epoch
+
+    rand_type : {'unif', 'perm'}, default='unif'
+        How samples are randomly selected from the data
+
+        * if ``'unif'`` samples are uniformly drawn among all possibilities
+        * if ``'perm'`` a random permutation of all possibilities is
+          generated and samples are sequentially taken from it. Once all of
+          them have been taken, a new random permutation is generated
+
+    print_every : `int`, default=1
+        Print history information every time the iteration number is a
+        multiple of ``print_every``. Used only if ``verbose`` is True

-        * 'last' : the phase iterate is the last iterate of the previous epoch
-        * 'avg' : the phase iterate is the average over the iterates in the past
-          epoch. This is really a bad idea when using sparse datasets, a
-          warning will be raised in this case
-        * 'rand': the phase iterate is a random iterate of the previous epoch
+    record_every : `int`, default=1
+        Save history information every time the iteration number is a
+        multiple of ``record_every``

     Attributes
     ----------
-    model : `Solver`
-        The model to solve
+    model : `Model`
+        The model used by the solver, passed with the ``set_model`` method

     prox : `Prox`
-        Proximal operator to solve
+        Proximal operator used by the solver, passed with the ``set_prox``
+        method
+
+    solution : `numpy.array`, shape=(n_coeffs,)
+        Minimizer found by the solver
+
+    history : `dict`-like
+        A dict-type of object that contains history of the solver along
+        iterations. It should be accessed using the ``get_history`` method
+
+    time_start : `str`
+        Start date of the call to ``solve()``
+
+    time_elapsed : `float`
+        Duration of the call to ``solve()``, in seconds
+
+    time_end : `str`
+        End date of the call to ``solve()``
+
+    References
+    ----------
+    * L. Xiao and T. Zhang, A proximal stochastic gradient method with
+      progressive variance reduction, *SIAM Journal on Optimization* (2014)
     """

     def __init__(self, step: float = None, epoch_size: int = None,
-                 rand_type: str = 'unif', tol: float = 0.,
-                 max_iter: int = 100, verbose: bool = True,
-                 print_every: int = 10, record_every: int = 1,
+                 rand_type: str = 'unif', tol: float = 1e-10,
+                 max_iter: int = 10, verbose: bool = True,
+                 print_every: int = 1, record_every: int = 1,
                  seed: int = -1, variance_reduction: str = 'last',
                  n_threads: int = 1):
-
-        self.n_threads = n_threads
-
         SolverFirstOrderSto.__init__(self, step, epoch_size, rand_type,
                                      tol, max_iter, verbose, print_every,
                                      record_every, seed=seed)
+        self.n_threads = n_threads

         step = self.step
         if step is None:
             step = 0.