From a4593a3eb9051683d22b1d1aea2727c048d78a11 Mon Sep 17 00:00:00 2001 From: mrava87 Date: Fri, 23 Aug 2024 10:30:05 -0500 Subject: [PATCH] feat: added AndersonProximalGradient --- docs/source/api/index.rst | 1 + pyproximal/optimization/__init__.py | 2 + pyproximal/optimization/primal.py | 202 +++++++++++++++++++++++++++- pyproximal/proximal/Simplex.py | 2 +- tutorials/twist.py | 34 +++-- 5 files changed, 229 insertions(+), 12 deletions(-) diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 4000054..492124c 100755 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -157,6 +157,7 @@ Primal AcceleratedProximalGradient ADMM ADMML2 + AndersonProximalGradient GeneralizedProximalGradient HQS LinearizedADMM diff --git a/pyproximal/optimization/__init__.py b/pyproximal/optimization/__init__.py index 6dd04b7..f3beba8 100644 --- a/pyproximal/optimization/__init__.py +++ b/pyproximal/optimization/__init__.py @@ -11,6 +11,8 @@ ProximalPoint Proximal point algorithm (or proximal min.) ProximalGradient Proximal gradient algorithm AcceleratedProximalGradient Accelerated Proximal gradient algorithm + AndersonProximalGradient Proximal gradient algorithm with Anderson acceleration + GeneralizedProximalGradient Generalized Proximal gradient algorithm HQS Half Quadrating Splitting ADMM Alternating Direction Method of Multipliers ADMML2 ADMM with L2 misfit term diff --git a/pyproximal/optimization/primal.py b/pyproximal/optimization/primal.py index 1bee7e0..2a163b8 100644 --- a/pyproximal/optimization/primal.py +++ b/pyproximal/optimization/primal.py @@ -156,7 +156,7 @@ def ProximalGradient(proxf, proxg, x0, epsg=1., Proximal operator of g function x0 : :obj:`numpy.ndarray` Initial vector - epsg : :obj:`float` or :obj:`np.ndarray`, optional + epsg : :obj:`float` or :obj:`numpy.ndarray`, optional Scaling factor of g function tau : :obj:`float` or :obj:`numpy.ndarray`, optional Positive scalar weight, which should satisfy the following condition @@ -195,7 +195,7 @@ def ProximalGradient(proxf, proxg, x0, epsg=1., Notes ----- - The Proximal point algorithm can be expressed by the following recursion: + The Proximal gradient algorithm can be expressed by the following recursion: .. math:: @@ -359,6 +359,202 @@ def AcceleratedProximalGradient(proxf, proxg, x0, tau=None, beta=0.5, callback=callback, show=show) +def AndersonProximalGradient(proxf, proxg, x0, epsg=1., + tau=None, niter=10, + nhistory=10, epsr=1e-10, + safeguard=False, tol=None, + callback=None, show=False): + r"""Proximal gradient with Anderson acceleration + + Solves the following minimization problem using the Proximal + gradient algorithm with Anderson acceleration: + + .. math:: + + \mathbf{x} = \argmin_\mathbf{x} f(\mathbf{x}) + \epsilon g(\mathbf{x}) + + where :math:`f(\mathbf{x})` is a smooth convex function with a uniquely + defined gradient and :math:`g(\mathbf{x})` is any convex function that + has a known proximal operator. + + Parameters + ---------- + proxf : :obj:`pyproximal.ProxOperator` + Proximal operator of f function (must have ``grad`` implemented) + proxg : :obj:`pyproximal.ProxOperator` + Proximal operator of g function + x0 : :obj:`numpy.ndarray` + Initial vector + epsg : :obj:`float` or :obj:`numpy.ndarray`, optional + Scaling factor of g function + tau : :obj:`float` or :obj:`numpy.ndarray`, optional + Positive scalar weight, which should satisfy the following condition + to guarantees convergence: :math:`\tau \in (0, 1/L]` where ``L`` is + the Lipschitz constant of :math:`\nabla f`. When ``tau=None``, + backtracking is used to adaptively estimate the best tau at each + iteration. Finally, note that :math:`\tau` can be chosen to be a vector + when dealing with problems with multiple right-hand-sides + niter : :obj:`int`, optional + Number of iterations of iterative scheme + nhistory : :obj:`int`, optional + Number of previous iterates to be kept in memory (to compute the scaling factors + epsr : :obj:`float`, optional + Scaling factor for regularization added to the inverse of :math:\mathbf{R}^T \mathbf{R}` + safeguard : :obj:`bool`, optional + Apply safeguarding strategy to the update (``True``) or not (``False``) + tol : :obj:`float`, optional + Tolerance on change of objective function (used as stopping criterion). If + ``tol=None``, run until ``niter`` is reached + callback : :obj:`callable`, optional + Function with signature (``callback(x)``) to call after each iteration + where ``x`` is the current model vector + show : :obj:`bool`, optional + Display iterations log + + Returns + ------- + x : :obj:`numpy.ndarray` + Inverted model + + Notes + ----- + The Proximal gradient algorithm with Anderson acceleration can be expressed by the + following recursion [1]_: + + .. math:: + m_k = min(m, k)\\ + \mathbf{g}^{k} = \mathbf{x}^{k} - \tau^k \nabla f(\mathbf{x}^k)\\ + \mathbf{r}^{k} = \mathbf{g}^{k} - \mathbf{g}^{k}\\ + \mathbf{G}^{k} = [\mathbf{g}^{k},..., \mathbf{g}^{k-m_k}]\\ + \mathbf{R}^{k} = [\mathbf{r}^{k},..., \mathbf{r}^{k-m_k}]\\ + \alpha_k = (\mathbf{R}^{kT} \mathbf{R}^{k})^{-1} \mathbf{1} / \mathbf{1}^T + (\mathbf{R}^{kT} \mathbf{R}^{k})^{-1} \mathbf{1}\\ + \mathbf{y}^{k+1} = \mathbf{G}^{k} \alpha_k\\ + \mathbf{x}^{k+1} = \prox_{\tau^{k+1} g}(\mathbf{y}^{k+1}) + + where :math:`m` equals ``nhistory``, :math:`k=1,2,...,n_{iter}`, :math:`\mathbf{y}^{0}=\mathbf{x}^{0}`, + :math:`\mathbf{y}^{1}=\mathbf{x}^{0} - \tau^0 \nabla f(\mathbf{x}^0)`, + :math:`\mathbf{x}^{1}=\prox_{\tau^k g}(\mathbf{y}^{1})`, and + :math:`\mathbf{g}^{0}=\mathbf{y}^{1}`. + + Refer to [1]_ for the guarded version of the algorithm (when ``safeguard=True``). + + .. [1] Mai, V., and Johansson, M. "Anderson Acceleration of Proximal Gradient + Methods", 2020. + + """ + # check if epgs is a vector + if np.asarray(epsg).size == 1.: + epsg = epsg * np.ones(niter) + epsg_print = str(epsg[0]) + else: + epsg_print = 'Multi' + + if show: + tstart = time.time() + print('Proximal Gradient with Anderson Acceleration \n' + '---------------------------------------------------------\n' + 'Proximal operator (f): %s\n' + 'Proximal operator (g): %s\n' + 'tau = %s\t\tepsg = %s\tniter = %d\n' + 'nhist = %d\tepsr = %s\n' + 'guard = %s\ttol = %s\n' % (type(proxf), type(proxg), + str(tau), epsg_print, niter, + nhistory, str(epsr), + str(safeguard), str(tol))) + head = ' Itn x[0] f g J=f+eps*g tau' + print(head) + + # initialize model + y = x0 - tau * proxf.grad(x0) + x = proxg.prox(y, epsg[0] * tau) + g = y.copy() + r = g - x0 + R, G = [g, ], [r, ] + pf = proxf(x) + pfg = np.inf + tolbreak = False + + # iterate + for iiter in range(niter): + + # update fix point + g = x - tau * proxf.grad(x) + r = g - y + + # update history vectors + R.insert(0, r) + G.insert(0, g) + if iiter >= nhistory - 1: + R.pop(-1) + G.pop(-1) + + # solve for alpha coefficients + Rstack = np.vstack(R) + Rinv = np.linalg.pinv(Rstack @ Rstack.T + epsr * np.linalg.norm(Rstack) ** 2) + ones = np.ones(min(nhistory, iiter + 2)) + Rinvones = Rinv @ ones + alpha = Rinvones / (ones[None] @ Rinvones) + + if not safeguard: + # update auxiliary variable + y = np.vstack(G).T @ alpha + + # update main variable + x = proxg.prox(y, epsg[iiter] * tau) + + else: + # update auxiliary variable + ytest = np.vstack(G).T @ alpha + + # update main variable + xtest = proxg.prox(ytest, epsg[iiter] * tau) + + # check if function is decreased, otherwise do basic PG step + pfold, pf = pf, proxf(xtest) + if pf <= pfold - tau * np.linalg.norm(proxf.grad(x)) ** 2 / 2: + y = ytest + x = xtest + else: + x = proxg.prox(g, epsg[iiter] * tau) + y = g + + # run callback + if callback is not None: + callback(x) + + # tolerance check: break iterations if overall + # objective does not decrease below tolerance + if tol is not None: + pfgold = pfg + pf, pg = proxf(x), proxg(x) + pfg = pf + np.sum(epsg[iiter] * pg) + if np.abs(1.0 - pfg / pfgold) < tol: + tolbreak = True + + # show iteration logger + if show: + if iiter < 10 or niter - iiter < 10 or iiter % (niter // 10) == 0: + if tol is None: + pf, pg = proxf(x), proxg(x) + pfg = pf + np.sum(epsg[iiter] * pg) + msg = '%6g %12.5e %10.3e %10.3e %10.3e %10.3e' % \ + (iiter + 1, np.real(to_numpy(x[0])) if x.ndim == 1 else np.real(to_numpy(x[0, 0])), + pf, pg, + pfg, + tau) + print(msg) + + # break if tolerance condition is met + if tolbreak: + break + + if show: + print('\nTotal time (s) = %.2f' % (time.time() - tstart)) + print('---------------------------------------------------------\n') + return x + + def GeneralizedProximalGradient(proxfs, proxgs, x0, tau, epsg=1., weights=None, eta=1., niter=10, @@ -390,7 +586,7 @@ def GeneralizedProximalGradient(proxfs, proxgs, x0, tau, Positive scalar weight, which should satisfy the following condition to guarantees convergence: :math:`\tau \in (0, 1/L]` where ``L`` is the Lipschitz constant of :math:`\sum_{i=1}^n \nabla f_i`. - epsg : :obj:`float` or :obj:`np.ndarray`, optional + epsg : :obj:`float` or :obj:`numpy.ndarray`, optional Scaling factor(s) of ``g`` function(s) weights : :obj:`float`, optional Weighting factors of ``g`` functions. Must sum to 1. diff --git a/pyproximal/proximal/Simplex.py b/pyproximal/proximal/Simplex.py index c3d10a7..8850264 100644 --- a/pyproximal/proximal/Simplex.py +++ b/pyproximal/proximal/Simplex.py @@ -204,7 +204,7 @@ def Simplex(n, radius, dims=None, axis=-1, maxiter=100, Notes ----- As the Simplex is an indicator function, the proximal operator corresponds - to its orthogonal projection (see :class:`pyprox.projection.SimplexProj` + to its orthogonal projection (see :class:`pyproximal.projection.SimplexProj` for details. Note that ``tau`` does not have effect for this proximal operator, any diff --git a/tutorials/twist.py b/tutorials/twist.py index 0d75c59..a6ce8a0 100644 --- a/tutorials/twist.py +++ b/tutorials/twist.py @@ -1,12 +1,13 @@ r""" -IHT, ISTA, FISTA, and TWIST for Compressive sensing -=================================================== +IHT, ISTA, FISTA, AA-ISTA, and TWIST for Compressive sensing +============================================================ -In this example we want to compare four popular solvers in compressive -sensing problem, namely IHT, ISTA, FISTA, and TwIST. The first three can +In this example we want to compare five popular solvers in compressive +sensing problem, namely IHT, ISTA, FISTA, AA-ISTA, and TwIST. The first three can be implemented using the same solver, namely the :py:class:`pyproximal.optimization.primal.ProximalGradient`, whilst the latter -is implemented in :py:class:`pyproximal.optimization.primal.TwIST`. +two are implemented using the :py:class:`pyproximal.optimization.primal.AndersonProximalGradient` and +:py:class:`pyproximal.optimization.primal.TwIST` solvers, respectively The IHT solver tries to solve the following unconstrained problem with a L0Ball regularization term: @@ -48,7 +49,6 @@ def callback(x, pf, pg, eps, cost): A = np.random.randn(N, M) A = A / np.linalg.norm(A, axis=0) Aop = pylops.MatrixMult(A) -Aop.explicit = False # temporary solution whilst PyLops gets updated x = np.random.rand(M) x[x < 0.9] = 0 @@ -88,6 +88,17 @@ def callback(x, pf, pg, eps, cost): callback=lambda x: callback(x, l2, l1, eps, costf)) niterf = len(costf) +# Anderson accelerated ISTA +l1 = pyproximal.proximal.L1() +l2 = pyproximal.proximal.L2(Op=Aop, b=y) +costa = [] +x_ander = \ + pyproximal.optimization.primal.AndersonProximalGradient(l2, l1, tau=tau, x0=np.zeros(M), + epsg=eps, niter=maxit, + nhistory=5, show=False, + callback=lambda x: callback(x, l2, l1, eps, costa)) +nitera = len(costa) + # TWIST (Note that since the smallest eigenvalue is zero, we arbitrarily # choose a small value for the solver to converge stably) l1 = pyproximal.proximal.L1(sigma=eps) @@ -111,6 +122,9 @@ def callback(x, pf, pg, eps, cost): m, s, b = ax.stem(x_fista, linefmt='--g', basefmt='--g', markerfmt='go', label='FISTA') plt.setp(m, markersize=7) +m, s, b = ax.stem(x_ander, linefmt='--m', basefmt='--m', + markerfmt='mo', label='AA-ISTA') +plt.setp(m, markersize=7) m, s, b = ax.stem(x_twist, linefmt='--b', basefmt='--b', markerfmt='bo', label='TWIST') plt.setp(m, markersize=7) @@ -118,9 +132,13 @@ def callback(x, pf, pg, eps, cost): ax.legend() plt.tight_layout() +############################################################################### +# Finally, let's compare the converge behaviour of the different algorithms + fig, ax = plt.subplots(1, 1, figsize=(8, 3)) ax.semilogy(costi, 'r', lw=2, label=r'$x_{ISTA} (niter=%d)$' % niteri) ax.semilogy(costf, 'g', lw=2, label=r'$x_{FISTA} (niter=%d)$' % niterf) +ax.semilogy(costa, 'm', lw=2, label=r'$x_{AA-ISTA} (niter=%d)$' % nitera) ax.semilogy(costt, 'b', lw=2, label=r'$x_{TWIST} (niter=%d)$' % maxit) ax.set_title('Cost function', size=15, fontweight='bold') ax.set_xlabel('Iteration') @@ -134,6 +152,6 @@ def callback(x, pf, pg, eps, cost): # cost function since this is a constrained problem. This is however greatly influenced # by the fact that we assume exact knowledge of the number of non-zero coefficients. # When this information is not available, IHT may become suboptimal. In this case the -# FISTA solver should always be preferred (over ISTA) and TwIST represents an -# alternative worth checking. +# FISTA or AA-ISTA solvers should always be preferred (over ISTA) and TwIST represents +# an alternative worth checking.