From a4593a3eb9051683d22b1d1aea2727c048d78a11 Mon Sep 17 00:00:00 2001
From: mrava87
Date: Fri, 23 Aug 2024 10:30:05 -0500
Subject: [PATCH 1/7] feat: added AndersonProximalGradient

---
 docs/source/api/index.rst           |   1 +
 pyproximal/optimization/__init__.py |   2 +
 pyproximal/optimization/primal.py   | 202 +++++++++++++++++++++++++++-
 pyproximal/proximal/Simplex.py      |   2 +-
 tutorials/twist.py                  |  34 +++--
 5 files changed, 229 insertions(+), 12 deletions(-)

diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
index 4000054..492124c 100755
--- a/docs/source/api/index.rst
+++ b/docs/source/api/index.rst
@@ -157,6 +157,7 @@ Primal
    AcceleratedProximalGradient
    ADMM
    ADMML2
+   AndersonProximalGradient
    GeneralizedProximalGradient
    HQS
    LinearizedADMM
diff --git a/pyproximal/optimization/__init__.py b/pyproximal/optimization/__init__.py
index 6dd04b7..f3beba8 100644
--- a/pyproximal/optimization/__init__.py
+++ b/pyproximal/optimization/__init__.py
@@ -11,6 +11,8 @@
     ProximalPoint                  Proximal point algorithm (or proximal min.)
     ProximalGradient               Proximal gradient algorithm
     AcceleratedProximalGradient    Accelerated Proximal gradient algorithm
+    AndersonProximalGradient       Proximal gradient algorithm with Anderson acceleration
+    GeneralizedProximalGradient    Generalized Proximal gradient algorithm
     HQS                            Half Quadratic Splitting
     ADMM                           Alternating Direction Method of Multipliers
     ADMML2                         ADMM with L2 misfit term
diff --git a/pyproximal/optimization/primal.py b/pyproximal/optimization/primal.py
index 1bee7e0..2a163b8 100644
--- a/pyproximal/optimization/primal.py
+++ b/pyproximal/optimization/primal.py
@@ -156,7 +156,7 @@ def ProximalGradient(proxf, proxg, x0, epsg=1.,
         Proximal operator of g function
     x0 : :obj:`numpy.ndarray`
         Initial vector
-    epsg : :obj:`float` or :obj:`np.ndarray`, optional
+    epsg : :obj:`float` or :obj:`numpy.ndarray`, optional
         Scaling factor of g function
     tau : :obj:`float` or :obj:`numpy.ndarray`, optional
         Positive scalar weight, which should satisfy the following condition
@@ -195,7 +195,7 @@ def ProximalGradient(proxf, proxg, x0, epsg=1.,
 
     Notes
     -----
-    The Proximal point algorithm can be expressed by the following recursion:
+    The Proximal gradient algorithm can be expressed by the following recursion:
 
     .. math::
 
@@ -359,6 +359,202 @@ def AcceleratedProximalGradient(proxf, proxg, x0, tau=None, beta=0.5,
                               callback=callback, show=show)
 
 
+def AndersonProximalGradient(proxf, proxg, x0, epsg=1.,
+                             tau=None, niter=10,
+                             nhistory=10, epsr=1e-10,
+                             safeguard=False, tol=None,
+                             callback=None, show=False):
+    r"""Proximal gradient with Anderson acceleration
+
+    Solves the following minimization problem using the Proximal
+    gradient algorithm with Anderson acceleration:
+
+    .. math::
+
+        \mathbf{x} = \argmin_\mathbf{x} f(\mathbf{x}) + \epsilon g(\mathbf{x})
+
+    where :math:`f(\mathbf{x})` is a smooth convex function with a uniquely
+    defined gradient and :math:`g(\mathbf{x})` is any convex function that
+    has a known proximal operator.
+
+    Parameters
+    ----------
+    proxf : :obj:`pyproximal.ProxOperator`
+        Proximal operator of f function (must have ``grad`` implemented)
+    proxg : :obj:`pyproximal.ProxOperator`
+        Proximal operator of g function
+    x0 : :obj:`numpy.ndarray`
+        Initial vector
+    epsg : :obj:`float` or :obj:`numpy.ndarray`, optional
+        Scaling factor of g function
+    tau : :obj:`float` or :obj:`numpy.ndarray`, optional
+        Positive scalar weight, which should satisfy the following condition
+        to guarantee convergence: :math:`\tau \in (0, 1/L]` where ``L`` is
+        the Lipschitz constant of :math:`\nabla f`. Unlike
+        :func:`pyproximal.optimization.primal.ProximalGradient`, no backtracking
+        is performed when ``tau=None``, so ``tau`` should always be provided.
+        Finally, note that :math:`\tau` can be chosen to be a vector
+        when dealing with problems with multiple right-hand-sides
+    niter : :obj:`int`, optional
+        Number of iterations of iterative scheme
+    nhistory : :obj:`int`, optional
+        Number of previous iterates to be kept in memory (used to compute
+        the scaling factors :math:`\alpha_k`)
+    epsr : :obj:`float`, optional
+        Scaling factor of the regularization added to
+        :math:`\mathbf{R}^{k,T} \mathbf{R}^{k}` before inversion
+    safeguard : :obj:`bool`, optional
+        Apply safeguarding strategy to the update (``True``) or not (``False``)
+    tol : :obj:`float`, optional
+        Tolerance on change of objective function (used as stopping criterion). If
+        ``tol=None``, run until ``niter`` is reached
+    callback : :obj:`callable`, optional
+        Function with signature (``callback(x)``) to call after each iteration
+        where ``x`` is the current model vector
+    show : :obj:`bool`, optional
+        Display iterations log
+
+    Returns
+    -------
+    x : :obj:`numpy.ndarray`
+        Inverted model
+
+    Notes
+    -----
+    The Proximal gradient algorithm with Anderson acceleration can be expressed by the
+    following recursion [1]_:
+
+    .. math::
+
+        m_k = \min(m, k)\\
+        \mathbf{g}^{k} = \mathbf{x}^{k} - \tau^k \nabla f(\mathbf{x}^k)\\
+        \mathbf{r}^{k} = \mathbf{g}^{k} - \mathbf{y}^{k}\\
+        \mathbf{G}^{k} = [\mathbf{g}^{k},..., \mathbf{g}^{k-m_k}]\\
+        \mathbf{R}^{k} = [\mathbf{r}^{k},..., \mathbf{r}^{k-m_k}]\\
+        \alpha_k = \frac{(\mathbf{R}^{k,T} \mathbf{R}^{k})^{-1} \mathbf{1}}
+        {\mathbf{1}^T (\mathbf{R}^{k,T} \mathbf{R}^{k})^{-1} \mathbf{1}}\\
+        \mathbf{y}^{k+1} = \mathbf{G}^{k} \alpha_k\\
+        \mathbf{x}^{k+1} = \prox_{\tau^{k+1} g}(\mathbf{y}^{k+1})
+
+    where :math:`m` equals ``nhistory``, :math:`k=1,2,...,n_{iter}`, :math:`\mathbf{y}^{0}=\mathbf{x}^{0}`,
+    :math:`\mathbf{y}^{1}=\mathbf{x}^{0} - \tau^0 \nabla f(\mathbf{x}^0)`,
+    :math:`\mathbf{x}^{1}=\prox_{\tau^0 g}(\mathbf{y}^{1})`, and
+    :math:`\mathbf{g}^{0}=\mathbf{y}^{1}`.
+
+    Refer to [1]_ for the guarded version of the algorithm (when ``safeguard=True``).
+
+    .. [1] Mai, V., and Johansson, M. "Anderson Acceleration of Proximal Gradient
+       Methods", 2020.
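+
+    A minimal usage sketch (for illustration only; the operators and the
+    step-size heuristic below are borrowed from the ``tutorials/twist.py``
+    example updated later in this patch):
+
+    .. code-block:: python
+
+        import numpy as np
+        import pylops
+        import pyproximal
+
+        A = np.random.randn(30, 50)
+        Aop = pylops.MatrixMult(A)
+        y = Aop @ np.random.rand(50)
+
+        l2 = pyproximal.proximal.L2(Op=Aop, b=y)  # smooth f (has grad)
+        l1 = pyproximal.proximal.L1()             # non-smooth g
+        tau = 0.95 / np.abs((Aop.H * Aop).eigs(1)[0])
+
+        xinv = pyproximal.optimization.primal.AndersonProximalGradient(
+            l2, l1, x0=np.zeros(50), tau=tau, epsg=5e-3,
+            niter=100, nhistory=5)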
+ + """ + # check if epgs is a vector + if np.asarray(epsg).size == 1.: + epsg = epsg * np.ones(niter) + epsg_print = str(epsg[0]) + else: + epsg_print = 'Multi' + + if show: + tstart = time.time() + print('Proximal Gradient with Anderson Acceleration \n' + '---------------------------------------------------------\n' + 'Proximal operator (f): %s\n' + 'Proximal operator (g): %s\n' + 'tau = %s\t\tepsg = %s\tniter = %d\n' + 'nhist = %d\tepsr = %s\n' + 'guard = %s\ttol = %s\n' % (type(proxf), type(proxg), + str(tau), epsg_print, niter, + nhistory, str(epsr), + str(safeguard), str(tol))) + head = ' Itn x[0] f g J=f+eps*g tau' + print(head) + + # initialize model + y = x0 - tau * proxf.grad(x0) + x = proxg.prox(y, epsg[0] * tau) + g = y.copy() + r = g - x0 + R, G = [g, ], [r, ] + pf = proxf(x) + pfg = np.inf + tolbreak = False + + # iterate + for iiter in range(niter): + + # update fix point + g = x - tau * proxf.grad(x) + r = g - y + + # update history vectors + R.insert(0, r) + G.insert(0, g) + if iiter >= nhistory - 1: + R.pop(-1) + G.pop(-1) + + # solve for alpha coefficients + Rstack = np.vstack(R) + Rinv = np.linalg.pinv(Rstack @ Rstack.T + epsr * np.linalg.norm(Rstack) ** 2) + ones = np.ones(min(nhistory, iiter + 2)) + Rinvones = Rinv @ ones + alpha = Rinvones / (ones[None] @ Rinvones) + + if not safeguard: + # update auxiliary variable + y = np.vstack(G).T @ alpha + + # update main variable + x = proxg.prox(y, epsg[iiter] * tau) + + else: + # update auxiliary variable + ytest = np.vstack(G).T @ alpha + + # update main variable + xtest = proxg.prox(ytest, epsg[iiter] * tau) + + # check if function is decreased, otherwise do basic PG step + pfold, pf = pf, proxf(xtest) + if pf <= pfold - tau * np.linalg.norm(proxf.grad(x)) ** 2 / 2: + y = ytest + x = xtest + else: + x = proxg.prox(g, epsg[iiter] * tau) + y = g + + # run callback + if callback is not None: + callback(x) + + # tolerance check: break iterations if overall + # objective does not decrease below tolerance + if tol is not None: + pfgold = pfg + pf, pg = proxf(x), proxg(x) + pfg = pf + np.sum(epsg[iiter] * pg) + if np.abs(1.0 - pfg / pfgold) < tol: + tolbreak = True + + # show iteration logger + if show: + if iiter < 10 or niter - iiter < 10 or iiter % (niter // 10) == 0: + if tol is None: + pf, pg = proxf(x), proxg(x) + pfg = pf + np.sum(epsg[iiter] * pg) + msg = '%6g %12.5e %10.3e %10.3e %10.3e %10.3e' % \ + (iiter + 1, np.real(to_numpy(x[0])) if x.ndim == 1 else np.real(to_numpy(x[0, 0])), + pf, pg, + pfg, + tau) + print(msg) + + # break if tolerance condition is met + if tolbreak: + break + + if show: + print('\nTotal time (s) = %.2f' % (time.time() - tstart)) + print('---------------------------------------------------------\n') + return x + + def GeneralizedProximalGradient(proxfs, proxgs, x0, tau, epsg=1., weights=None, eta=1., niter=10, @@ -390,7 +586,7 @@ def GeneralizedProximalGradient(proxfs, proxgs, x0, tau, Positive scalar weight, which should satisfy the following condition to guarantees convergence: :math:`\tau \in (0, 1/L]` where ``L`` is the Lipschitz constant of :math:`\sum_{i=1}^n \nabla f_i`. - epsg : :obj:`float` or :obj:`np.ndarray`, optional + epsg : :obj:`float` or :obj:`numpy.ndarray`, optional Scaling factor(s) of ``g`` function(s) weights : :obj:`float`, optional Weighting factors of ``g`` functions. Must sum to 1. 
diff --git a/pyproximal/proximal/Simplex.py b/pyproximal/proximal/Simplex.py
index c3d10a7..8850264 100644
--- a/pyproximal/proximal/Simplex.py
+++ b/pyproximal/proximal/Simplex.py
@@ -204,7 +204,7 @@ def Simplex(n, radius, dims=None, axis=-1, maxiter=100,
     Notes
     -----
     As the Simplex is an indicator function, the proximal operator corresponds
-    to its orthogonal projection (see :class:`pyprox.projection.SimplexProj`
+    to its orthogonal projection (see :class:`pyproximal.projection.SimplexProj`
     for details).
 
     Note that ``tau`` does not have effect for this proximal operator, any
diff --git a/tutorials/twist.py b/tutorials/twist.py
index 0d75c59..a6ce8a0 100644
--- a/tutorials/twist.py
+++ b/tutorials/twist.py
@@ -1,12 +1,13 @@
 r"""
-IHT, ISTA, FISTA, and TWIST for Compressive sensing
-===================================================
+IHT, ISTA, FISTA, AA-ISTA, and TWIST for Compressive sensing
+============================================================
 
-In this example we want to compare four popular solvers in compressive
-sensing problem, namely IHT, ISTA, FISTA, and TwIST. The first three can
+In this example we want to compare five popular solvers for a compressive
+sensing problem, namely IHT, ISTA, FISTA, AA-ISTA, and TwIST. The first three can
 be implemented using the same solver, namely the
 :py:class:`pyproximal.optimization.primal.ProximalGradient`, whilst the latter
-is implemented in :py:class:`pyproximal.optimization.primal.TwIST`.
+two are implemented using the :py:class:`pyproximal.optimization.primal.AndersonProximalGradient` and
+:py:class:`pyproximal.optimization.primal.TwIST` solvers, respectively.
 
 The IHT solver tries to solve the following unconstrained problem with a
 L0Ball regularization term:
@@ -48,7 +49,6 @@ def callback(x, pf, pg, eps, cost):
 A = np.random.randn(N, M)
 A = A / np.linalg.norm(A, axis=0)
 Aop = pylops.MatrixMult(A)
-Aop.explicit = False  # temporary solution whilst PyLops gets updated
 
 x = np.random.rand(M)
 x[x < 0.9] = 0
@@ -88,6 +88,17 @@ def callback(x, pf, pg, eps, cost):
                                  callback=lambda x: callback(x, l2, l1, eps, costf))
 niterf = len(costf)
 
+# Anderson accelerated ISTA
+l1 = pyproximal.proximal.L1()
+l2 = pyproximal.proximal.L2(Op=Aop, b=y)
+costa = []
+x_ander = \
+    pyproximal.optimization.primal.AndersonProximalGradient(l2, l1, tau=tau, x0=np.zeros(M),
+                                                            epsg=eps, niter=maxit,
+                                                            nhistory=5, show=False,
+                                                            callback=lambda x: callback(x, l2, l1, eps, costa))
+nitera = len(costa)
+
 # TWIST (Note that since the smallest eigenvalue is zero, we arbitrarily
 # choose a small value for the solver to converge stably)
 l1 = pyproximal.proximal.L1(sigma=eps)
@@ -111,6 +122,9 @@ def callback(x, pf, pg, eps, cost):
 m, s, b = ax.stem(x_fista, linefmt='--g', basefmt='--g',
                   markerfmt='go', label='FISTA')
 plt.setp(m, markersize=7)
+m, s, b = ax.stem(x_ander, linefmt='--m', basefmt='--m',
+                  markerfmt='mo', label='AA-ISTA')
+plt.setp(m, markersize=7)
 m, s, b = ax.stem(x_twist, linefmt='--b', basefmt='--b',
                   markerfmt='bo', label='TWIST')
 plt.setp(m, markersize=7)
@@ -118,9 +132,13 @@ def callback(x, pf, pg, eps, cost):
 ax.legend()
 plt.tight_layout()
 
+###############################################################################
+# Finally, let's compare the convergence behaviour of the different algorithms
+
 fig, ax = plt.subplots(1, 1, figsize=(8, 3))
 ax.semilogy(costi, 'r', lw=2, label=r'$x_{ISTA} (niter=%d)$' % niteri)
 ax.semilogy(costf, 'g', lw=2, label=r'$x_{FISTA} (niter=%d)$' % niterf)
+ax.semilogy(costa, 'm', lw=2, label=r'$x_{AA-ISTA} (niter=%d)$' % nitera)
 ax.semilogy(costt, 'b', lw=2, label=r'$x_{TWIST} (niter=%d)$' % maxit)
 ax.set_title('Cost function', size=15, fontweight='bold')
 ax.set_xlabel('Iteration')
@@ -134,6 +152,6 @@ def callback(x, pf, pg, eps, cost):
 # cost function since this is a constrained problem. This is however greatly influenced
 # by the fact that we assume exact knowledge of the number of non-zero coefficients.
 # When this information is not available, IHT may become suboptimal. In this case the
-# FISTA solver should always be preferred (over ISTA) and TwIST represents an
-# alternative worth checking.
+# FISTA or AA-ISTA solvers should always be preferred (over ISTA) and TwIST represents
+# an alternative worth checking.

From f17bb1301b9920644d9c1b68ad38ac4f4dfa77f4 Mon Sep 17 00:00:00 2001
From: mrava87
Date: Fri, 23 Aug 2024 10:43:03 -0500
Subject: [PATCH 2/7] feat: added fungrad to Nonlinear

---
 pyproximal/proximal/Nonlinear.py | 12 ++++++++++++
 pytests/test_grads.py            |  1 +
 2 files changed, 13 insertions(+)

diff --git a/pyproximal/proximal/Nonlinear.py b/pyproximal/proximal/Nonlinear.py
index 4bf318c..4564732 100644
--- a/pyproximal/proximal/Nonlinear.py
+++ b/pyproximal/proximal/Nonlinear.py
@@ -14,6 +14,8 @@ class Nonlinear(ProxOperator):
 
     - ``fun``: a method evaluating the generic function :math:`f`
     - ``grad``: a method evaluating the gradient of the generic function
       :math:`f`
+    - ``fungrad``: a method evaluating both the generic function :math:`f`
+      and its gradient
     - ``optimize``: a method that solves the optimization problem associated
       with the proximal operator of :math:`f`. Note that the ``gradprox``
       method must be used (instead of ``grad``) as this will
@@ -58,6 +60,12 @@ class Nonlinear(ProxOperator):
     def _gradprox(self, x, tau):
         return self.grad(x) + 1. / tau * (x - self.y)
 
+    def _fungradprox(self, x, tau):
+        f, g = self.fungrad(x)
+        f = f + 1. / (2 * tau) * ((x - self.y) ** 2).sum()
+        g = g + 1. / tau * (x - self.y)
+        return f, g
+
     def fun(self, x):
         raise NotImplementedError('The method fun has not been implemented.'
                                   'Refer to the documentation for details on '
                                   'how to subclass this operator.')
     def grad(self, x):
         raise NotImplementedError('The method grad has not been implemented.'
                                   'Refer to the documentation for details on '
                                   'how to subclass this operator.')
+    def fungrad(self, x):
+        raise NotImplementedError('The method fungrad has not been implemented.'
+                                  'Refer to the documentation for details on '
+                                  'how to subclass this operator.')
     def optimize(self):
         raise NotImplementedError('The method optimize has not been implemented.'
                                   'Refer to the documentation for details on '
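
A hypothetical subclass sketch (illustration only, not part of the patch:
``QuadraticNL`` is a made-up name, and both the ``Nonlinear(x0)`` constructor
call and setting ``y`` by hand are assumptions based on the methods shown above):

    import numpy as np
    from pyproximal.proximal import Nonlinear

    class QuadraticNL(Nonlinear):
        # f(x) = 0.5 ||x||_2^2
        def fun(self, x):
            return 0.5 * np.sum(x ** 2)

        def grad(self, x):
            return x

        def fungrad(self, x):
            # one call returning both the value and the gradient
            return self.fun(x), self.grad(x)

    op = QuadraticNL(np.zeros(4))
    op.y = np.ones(4)  # assumption: prox pivot normally set inside prox()
    f, g = op._fungradprox(np.zeros(4), tau=1.)  # f = 2.0, g = -1 * ones(4)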
diff --git a/pytests/test_grads.py b/pytests/test_grads.py
index 88b8fd4..b121593 100644
--- a/pytests/test_grads.py
+++ b/pytests/test_grads.py
@@ -52,6 +52,7 @@ def test_l2(par):
                   raiseerror=True, atol=1e-3,
                   verb=False)
 
+
 @pytest.mark.parametrize("par", [(par1), (par2), (par1j), (par2j)])
 def test_lowrank(par):
     """LowRankFactorizedMatrix gradient

From 179064aaa4f92599eee5651f5b4468d3a022fbfd Mon Sep 17 00:00:00 2001
From: mrava87
Date: Fri, 23 Aug 2024 10:58:21 -0500
Subject: [PATCH 3/7] doc: increased iterations in twist tutorial

---
 tutorials/twist.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tutorials/twist.py b/tutorials/twist.py
index a6ce8a0..c83414a 100644
--- a/tutorials/twist.py
+++ b/tutorials/twist.py
@@ -59,7 +59,7 @@ def callback(x, pf, pg, eps, cost):
 L = np.abs((Aop.H * Aop).eigs(1)[0])
 tau = 0.95 / L
 eps = 5e-3
-maxit = 100
+maxit = 300
 
 # IHT
 l0 = pyproximal.proximal.L0Ball(3)
@@ -99,6 +99,7 @@ def callback(x, pf, pg, eps, cost):
                                                             callback=lambda x: callback(x, l2, l1, eps, costa))
 nitera = len(costa)
 
+
 # TWIST (Note that since the smallest eigenvalue is zero, we arbitrarily
 # choose a small value for the solver to converge stably)
 l1 = pyproximal.proximal.L1(sigma=eps)
@@ -135,11 +136,11 @@ def callback(x, pf, pg, eps, cost):
 ###############################################################################
 # Finally, let's compare the convergence behaviour of the different algorithms
 
-fig, ax = plt.subplots(1, 1, figsize=(8, 3))
-ax.semilogy(costi, 'r', lw=2, label=r'$x_{ISTA} (niter=%d)$' % niteri)
-ax.semilogy(costf, 'g', lw=2, label=r'$x_{FISTA} (niter=%d)$' % niterf)
-ax.semilogy(costa, 'm', lw=2, label=r'$x_{AA-ISTA} (niter=%d)$' % nitera)
-ax.semilogy(costt, 'b', lw=2, label=r'$x_{TWIST} (niter=%d)$' % maxit)
+fig, ax = plt.subplots(1, 1, figsize=(8, 4))
+ax.loglog(costi, 'r', lw=2, label=r'$x_{ISTA} (niter=%d)$' % niteri)
+ax.loglog(costf, 'g', lw=2, label=r'$x_{FISTA} (niter=%d)$' % niterf)
+ax.loglog(costa, 'm', lw=2, label=r'$x_{AA-ISTA} (niter=%d)$' % nitera)
+ax.loglog(costt, 'b', lw=2, label=r'$x_{TWIST} (niter=%d)$' % maxit)
 ax.set_title('Cost function', size=15, fontweight='bold')
 ax.set_xlabel('Iteration')
 ax.legend()
@@ -154,4 +155,3 @@ def callback(x, pf, pg, eps, cost):
 # When this information is not available, IHT may become suboptimal. In this case the
 # FISTA or AA-ISTA solvers should always be preferred (over ISTA) and TwIST represents
 # an alternative worth checking.
-

From 9acab7e5982713edc35bf64d0b60011ea517e103 Mon Sep 17 00:00:00 2001
From: Matteo Ravasi
Date: Fri, 13 Sep 2024 22:00:53 +0300
Subject: [PATCH 4/7] Numpy v2 support (#188)

* build: remove requirement to use numpy<2.0.0
* build: remove python 3.8 from GA
* build: temporarily remove mac from GA

---
 .github/workflows/build.yaml |  8 ++++----
 .readthedocs.yaml            |  2 +-
 .travis.yml                  | 22 ----------------------
 azure-pipelines.yml          |  6 +++---
 environment-dev-arm.yml      |  4 ++--
 environment-dev.yml          |  4 ++--
 environment.yml              |  4 ++--
 pyproject.toml               |  4 ++--
 requirements-dev.txt         |  4 ++--
 requirements-doc.txt         | 20 ++++++++++++++++++++
 10 files changed, 38 insertions(+), 40 deletions(-)
 delete mode 100644 .travis.yml
 create mode 100644 requirements-doc.txt

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 8a6de9d..624bcdc 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -6,18 +6,18 @@ jobs:
   build:
     strategy:
       matrix:
-        platform: [ ubuntu-latest, macos-latest ]
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        platform: [ ubuntu-latest ]  # temporarily removed macos-latest because of different behavior with numpy v2
+        python-version: ["3.9", "3.10", "3.11"]
 
     runs-on: ${{ matrix.platform }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Get history and tags for SCM versioning to work
        run: |
          git fetch --prune --unshallow
          git fetch --depth=1 origin +refs/tags/*:refs/tags/*
      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 7edf0cb..364e88d 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -18,6 +18,6 @@ sphinx:
 # Declare the Python requirements required to build your docs
 python:
   install:
-    - requirements: requirements-dev.txt
+    - requirements: requirements-doc.txt
     - method: pip
       path: .
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index cb1179a..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-language: python
-python:
-- '3.8'
-os:
-- linux
-script:
-- pip3 install --upgrade setuptools
-- pip3 install -r requirements-dev.txt
-- pip3 install coverage codacy-coverage
-- pip3 install .
-- python3 setup.py test
-deploy:
-  provider: pypi
-  user: __token__
-  distributions: sdist bdist_wheel
-  skip_existing: true
-  skip_upload_docs: true
-  skip_cleanup: true
-  on:
-    tags: true
-  password:
-    secure: Er4noQo2sMKhtvtheKvvKKOfj8uNEhpjhBRf54y0S5e3HNK1U38nMZuKT7GXRwCDjPXzWRF6oZdlAyi1SRB0hPGtT4Ucm31k6qAEzi9Ph43T9BNIMIuIcEIwY3X2D9g5ySHyfEYLgGNAYzxttiKIhMPOk7vafUMsFUSh5ldZbY/ykSpcB8DiLKw+Z6+AV9pM5YTFn1Djn4pfC88G7tzFySw+b8BL9d8/hLAWmw70Kczh20l86zIOFV/CaW6ph9irssx9nrxz7W3Kb6YFl/QaOh34mEC0ZKoiz6LMdNAGX0RI0iCwtPHdUlPi2qSXXHLvLgqPXKjoqc2bTI19n39EBoSnIveyTYP1wUj3jRLG5pqeFr/Bo2Ti8By8Hye8Iqqdx2PT5wR8bWLiy+M1FMkeJjMT/ZvUudy00gg+7J/xRutBWhRmk2bZt6aCBG0NwpAVoN9UqssoXcYwFmRcOpleGNHo6Wi/0Rg59oAlN0PI+SCWMOcW1veKbsOgSi6nXisffgnZEsFWfEVZB0sLGMqFtBLAmY56PPKbEKqJXDZZ2MRcHDiZB0YfczewCeKdlPiRIQlpLDPu/xaUCiEAnvFZS1EKkpZ9F8/gpJey3e4UPUL+PPmAyMNA/4yBM5xrSjKVl09wuyzqvyTktBGDjaMHvZw0kwnysrmsdP5p30Rf3y8=
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 5e7d229..cdd0fa1 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -25,7 +25,7 @@ jobs:
 #   steps:
 #   - task: UsePythonVersion@0
 #     inputs:
-#       versionSpec: '3.8'
+#       versionSpec: '3.9'
 #       architecture: 'x64'
 #
 #   - script: |
@@ -54,7 +54,7 @@ jobs:
   steps:
   - task: UsePythonVersion@0
     inputs:
-      versionSpec: '3.8'
+      versionSpec: '3.9'
       architecture: 'x64'
 
   - script: |
@@ -83,7 +83,7 @@ jobs:
   steps:
   - task: UsePythonVersion@0
     inputs:
-      versionSpec: '3.8'
+      versionSpec: '3.9'
       architecture: 'x64'
 
   - script: |
diff --git a/environment-dev-arm.yml b/environment-dev-arm.yml
index 6e2f743..9402b0f 100644
--- a/environment-dev-arm.yml
+++ b/environment-dev-arm.yml
@@ -5,8 +5,8 @@ channels:
   - numba
 dependencies:
   - python>=3.8.12
-  - numpy>=1.15.0, <2.0.0
-  - scipy>=1.8.0
+  - numpy>=1.21.0
+  - scipy>=1.11.0
   - pylops>=2.0.0
   - scikit-image
   - matplotlib
diff --git a/environment-dev.yml b/environment-dev.yml
index 3855772..8095e9c 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -5,8 +5,8 @@ channels:
   - numba
 dependencies:
   - python>=3.8.12
-  - numpy>=1.15.0, <2.0.0
-  - scipy>=1.8.0
+  - numpy>=1.21.0
+  - scipy>=1.11.0
   - pylops>=2.0.0
   - scikit-image
  - matplotlib
diff --git a/environment.yml b/environment.yml
index 4ca4350..8f14a35 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,6 +3,6 @@ channels:
   - defaults
 dependencies:
   - python>=3.8.12
-  - numpy>=1.15.0, <2.0.0
-  - scipy>=1.8.0
+  - numpy>=1.21.0
+  - scipy>=1.11.0
   - pylops>=2.0.0
diff --git a/pyproject.toml b/pyproject.toml
index 0b61eea..2d49437 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,8 +30,8 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Mathematics",
 ]
 dependencies = [
-    "numpy >= 1.15.0, <2.0.0",
-    "scipy >= 1.8.0",
+    "numpy >= 1.21.0",
+    "scipy >= 1.11.0",
     "pylops >= 2.0.0",
 ]
 dynamic = ["version"]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index ad0b3f5..b762f5c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,5 +1,5 @@
-numpy>=1.15.0, <2.0.0
-scipy>=1.8.0
+numpy>=1.21.0
+scipy>=1.11.0
 pylops>=2.0.0
 numba
 scikit-image
diff --git a/requirements-doc.txt b/requirements-doc.txt
new file mode 100644
index 0000000..b762f5c
--- /dev/null
+++ b/requirements-doc.txt
@@ -0,0 +1,20 @@
+numpy>=1.21.0
+scipy>=1.11.0
+pylops>=2.0.0
+numba
+scikit-image
+matplotlib
+ipython
+bm4d<4.2.4  # temporary as gclib problem arises in readthedocs
+bm3d<4.0.2  # temporary as gclib problem arises in readthedocs
+pytest
+pytest-runner
+setuptools_scm
+docutils<0.18
+Sphinx
+sphinx-gallery
+pydata-sphinx-theme
+numpydoc
+nbsphinx
+image
+sphinxemoji
\ No newline at end of file
From 660bce700167c0fa2475f5375aec0c817292699f Mon Sep 17 00:00:00 2001
From: mrava87
Date: Tue, 10 Dec 2024 16:39:22 +0300
Subject: [PATCH 5/7] doc: small change in definition of Huber prox

---
 pyproximal/proximal/Huber.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproximal/proximal/Huber.py b/pyproximal/proximal/Huber.py
index 21ba6cb..c0b6ec6 100644
--- a/pyproximal/proximal/Huber.py
+++ b/pyproximal/proximal/Huber.py
@@ -32,8 +32,8 @@ class Huber(ProxOperator):
 
     .. math::
 
-        \prox^*_{\tau H_\alpha(\cdot)}(\mathbf{x}) =
-        \left( 1 - \frac{\tau}{\max\{\|\mathbf{x}\|_2, \tau\} + \alpha} \right) \mathbf{x}
+        \prox_{\tau H_\alpha(\cdot)}(\mathbf{x}) =
+        \left( 1 - \frac{\tau}{\max\{\|\mathbf{x}\|_2, \tau + \alpha\}} \right) \mathbf{x}
 
     """
     def __init__(self, alpha):

From 38d865090bc3eeb3f32cb9cfca648384a0bac578 Mon Sep 17 00:00:00 2001
From: mrava87
Date: Wed, 11 Dec 2024 09:42:59 +0300
Subject: [PATCH 6/7] feat: added HuberCircular

---
 docs/source/api/index.rst         |  1 +
 pyproximal/optimization/primal.py |  2 +-
 pyproximal/proximal/Huber.py      | 79 ++++++++++++++++++++++++++++---
 pyproximal/proximal/__init__.py   |  3 ++-
 pytests/test_norms.py             | 20 +++++++-
 5 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
index 492124c..4ca4c77 100755
--- a/docs/source/api/index.rst
+++ b/docs/source/api/index.rst
@@ -66,6 +66,7 @@ Convex
    EuclideanBall
    Hankel
    Huber
+   HuberCircular
    Intersection
    L0
    L0Ball
diff --git a/pyproximal/optimization/primal.py b/pyproximal/optimization/primal.py
index 2a163b8..c2c6dcb 100644
--- a/pyproximal/optimization/primal.py
+++ b/pyproximal/optimization/primal.py
@@ -345,7 +345,7 @@ def AcceleratedProximalGradient(proxf, proxg, x0, tau=None, beta=0.5,
     r"""Accelerated Proximal gradient
 
     This is a thin wrapper around :func:`pyproximal.optimization.primal.ProximalGradient` with
-    ``vandenberghe`` or ``fista``acceleration. See :func:`pyproximal.optimization.primal.ProximalGradient`
+    ``vandenberghe`` or ``fista`` acceleration. See :func:`pyproximal.optimization.primal.ProximalGradient`
     for details.
 
     """
diff --git a/pyproximal/proximal/Huber.py b/pyproximal/proximal/Huber.py
index c0b6ec6..cc32aad 100644
--- a/pyproximal/proximal/Huber.py
+++ b/pyproximal/proximal/Huber.py
@@ -1,14 +1,78 @@
 import numpy as np
 
 from pyproximal.ProxOperator import _check_tau
-from pyproximal.projection import BoxProj
 from pyproximal import ProxOperator
+from pyproximal.proximal import L2, L1
 
 
 class Huber(ProxOperator):
     r"""Huber norm proximal operator.
 
-    Proximal operator of the Huber norm defined as:
+    Proximal operator of the Huber norm defined as
+    :math:`H_\alpha(\mathbf{x}) = \sum_i H_\alpha(x_i)` where:
+
+    .. math::
+
+        H_\alpha(x_i) =
+        \begin{cases}
+        \frac{|x_i|^2}{2 \alpha}, & |x_i| \leq \alpha \\
+        |x_i| - \frac{\alpha}{2}, & |x_i| > \alpha
+        \end{cases}
+
+    which behaves like an :math:`\ell_2` norm for :math:`|x_i| \leq \alpha` and an
+    :math:`\ell_1` norm for :math:`|x_i| > \alpha`.
+
+    Parameters
+    ----------
+    alpha : :obj:`float`
+        Huber parameter
+
+    Notes
+    -----
+    The Huber proximal operator is defined as:
+
+    .. math::
+
+        \prox_{\tau H_\alpha(\cdot)}(\mathbf{x}) =
+        \begin{cases}
+        \prox_{\frac{\tau}{2 \alpha} |x_i|^2}(x_i), & |x_i| \leq \alpha \\
+        \prox_{\tau |x_i|}(x_i), & |x_i| > \alpha
+        \end{cases}
+
+    """
+    def __init__(self, alpha):
+        super().__init__(None, False)
+        self.alpha = alpha
+        self.l2 = L2(sigma=1. / self.alpha)
+        self.l1 = L1()
+
+    def __call__(self, x):
+        h = np.zeros_like(x)
+        xabs = np.abs(x)
+        mask = xabs > self.alpha
+        h[~mask] = xabs[~mask] ** 2 / (2. * self.alpha)
+        h[mask] = xabs[mask] - self.alpha / 2.
+        return np.sum(h)
+
+    @_check_tau
+    def prox(self, x, tau):
+        y = np.zeros_like(x)
+        xabs = np.abs(x)
+        mask = xabs > self.alpha
+        y[~mask] = self.l2.prox(x[~mask], tau)
+        y[mask] = self.l1.prox(x[mask], tau)
+        # alternative from https://math.stackexchange.com/questions/1650411/
+        # proximal-operator-of-the-huber-loss-function... currently commented
+        # as it does not provide the same result
+        # y = (1. - tau / np.maximum(np.abs(x), tau + self.alpha)) * x
+        return y
+
+
+class HuberCircular(ProxOperator):
+    r"""Circular Huber norm proximal operator.
+
+    Proximal operator of the Circular Huber norm defined as:
 
     .. math::
 
         H_\alpha(\mathbf{x}) =
         \begin{cases}
         \frac{\|\mathbf{x}\|_2^2}{2 \alpha}, & \|\mathbf{x}\|_2 \leq \alpha \\
         \|\mathbf{x}\|_2 - \frac{\alpha}{2}, & \|\mathbf{x}\|_2 > \alpha \\
         \end{cases}
 
-    which behaves like a :math:`\ell_2` norm for :math:`|x_i| \leq \alpha` and a
-    :math:`\ell_1` norm for :math:`|x_i| < \alpha`.
+    which behaves like an :math:`\ell_2` norm for :math:`\|\mathbf{x}\|_2 \leq \alpha` and an
+    :math:`\ell_1` norm for :math:`\|\mathbf{x}\|_2 > \alpha`.
 
     Parameters
     ----------
     alpha : :obj:`float`
         Huber parameter
 
     Notes
     -----
-    The Huber proximal operator is defined as:
+    The Circular Huber proximal operator is defined as [1]_:
 
     .. math::
 
         \prox_{\tau H_\alpha(\cdot)}(\mathbf{x}) =
         \left( 1 - \frac{\tau}{\max\{\|\mathbf{x}\|_2, \tau + \alpha\}} \right) \mathbf{x}
 
+    .. [1] O'Donoghue, B., Stathopoulos, G., and Boyd, S. "A Splitting Method
+       for Optimal Control", IEEE Transactions on Control Systems Technology, 2013.
+
     """
     def __init__(self, alpha):
         super().__init__(None, False)
@@ -51,4 +118,4 @@ def __call__(self, x):
 
     @_check_tau
     def prox(self, x, tau):
         x = (1. - tau / max(np.linalg.norm(x), tau + self.alpha)) * x
-        return x
\ No newline at end of file
+        return x
diff --git a/pyproximal/proximal/__init__.py b/pyproximal/proximal/__init__.py
index 26ef53f..22d803b 100644
--- a/pyproximal/proximal/__init__.py
+++ b/pyproximal/proximal/__init__.py
@@ -22,6 +22,7 @@
     L21                  L2,1 Norm
     L21_plus_L1          L2,1 + L1 mixed-norm
     Huber                Huber Norm
+    HuberCircular        Circular Huber Norm
     TV                   Total Variation Norm
     RelaxedMumfordShah   Relaxed Mumford Shah Norm
     Nuclear              Nuclear Norm
@@ -69,7 +70,7 @@
 __all__ = ['Box', 'Simplex', 'Intersection', 'AffineSet', 'Quadratic',
            'Euclidean', 'EuclideanBall', 'L0', 'L0Ball', 'L01Ball',
            'L1', 'L1Ball', 'L2',
-           'L2Convolve', 'L21', 'L21_plus_L1', 'Huber', 'TV', 'RelaxedMumfordShah',
+           'L2Convolve', 'L21', 'L21_plus_L1', 'Huber', 'HuberCircular', 'TV', 'RelaxedMumfordShah',
            'Nuclear', 'NuclearBall', 'Orthogonal', 'VStack', 'Nonlinear', 'SCAD',
            'Log', 'Log1', 'ETP', 'Geman', 'QuadraticEnvelopeCard',
            'SingularValuePenalty', 'QuadraticEnvelopeCardIndicator',
           'QuadraticEnvelopeRankL2',
diff --git a/pytests/test_norms.py b/pytests/test_norms.py
index 104fe1f..4f0725b 100644
--- a/pytests/test_norms.py
+++ b/pytests/test_norms.py
@@ -5,8 +5,8 @@
 from pylops.basicoperators import Identity, Diagonal, MatrixMult, FirstDerivative
 
 from pyproximal.utils import moreau
-from pyproximal.proximal import Box, Euclidean, L2, L1, L21, L21_plus_L1, \
-    Huber, Nuclear, RelaxedMumfordShah, TV
+from pyproximal.proximal import Euclidean, L2, L1, L21, L21_plus_L1, \
+    Huber, HuberCircular, Nuclear, RelaxedMumfordShah, TV
 
 par1 = {'nx': 10, 'sigma': 1., 'dtype': 'float32'}  # even float32
 par2 = {'nx': 11, 'sigma': 2., 'dtype': 'float64'}  # odd float64
@@ -182,8 +182,24 @@ def test_Huber(par):
     """
     hub = Huber(alpha=par['sigma'])
 
+    # norm
+    x = np.random.uniform(0., 0.1 * par['sigma'], par['nx']).astype(par['dtype'])
+    assert hub(x) == np.sum(x ** 2) / (2 * par['sigma'])
+
+    # prox / dualprox
+    tau = 2.
+    assert moreau(hub, x, tau)
+
+
+@pytest.mark.parametrize("par", [(par1), (par2)])
+def test_HuberCircular(par):
+    """Circular Huber norm and proximal/dual proximal
+    """
+    hub = HuberCircular(alpha=par['sigma'])
+
     # norm
     x = np.random.uniform(0., 0.1, par['nx']).astype(par['dtype'])
+    x = (0.1 * par['sigma']) * x / np.linalg.norm(x)  # to ensure that it is smaller than sigma
     assert hub(x) == np.linalg.norm(x) ** 2 / (2 * par['sigma'])
 
     # prox / dualprox

From f69c50030001e2cfaa037429da1600cc05b7eebc Mon Sep 17 00:00:00 2001
From: mrava87
Date: Wed, 11 Dec 2024 10:03:45 +0300
Subject: [PATCH 7/7] doc: prepare for release v0.10.0

---
 CHANGELOG.md              | 21 +++++++++++++++------
 docs/source/changelog.rst | 11 +++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 327925b..406545f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,14 +1,23 @@
 Changelog
 =========
 
+# 0.10.0
+
+* Added ``pyproximal.optimization.primal.AndersonProximalGradient`` solver
+* Added ``pyproximal.proximal.HuberCircular`` operator
+* Added `fungrad` method to ``pyproximal.proximal.Nonlinear``
+* Modified ``pyproximal.proximal.Huber`` operator, which was previously
+  erroneously implemented using the definition of the Circular Huber norm
+
+
 # 0.9.0
 
-* Added :py:class:`pyproximal.optimization.palm.iPALM` solver
-* Added :py:func:`pyproximal.optimization.palm._backtracking` method to be used when `gammaf=None` and/or `gammag=None`
-* Added :py:func:`pyproximal.utils.gradtest.gradtest_proximal` and :py:func:`pyproximal.utils.gradtest.gradtest_bilinear` methods
-* Added `tol` to :py:class:`pyproximal.optimization.primal.ProximalPoint` and
-  :py:class:`pyproximal.optimization.primal.ProximalGradient` solvers
-* Modified :py:class:`pyproximal.ProxOperator.precomposition` to allow `b` being also a vector
+* Added ``pyproximal.optimization.palm.iPALM`` solver
+* Added ``pyproximal.optimization.palm._backtracking`` method to be used when `gammaf=None` and/or `gammag=None`
+* Added ``pyproximal.utils.gradtest.gradtest_proximal`` and ``pyproximal.utils.gradtest.gradtest_bilinear`` methods
+* Added `tol` to ``pyproximal.optimization.primal.ProximalPoint`` and
+  ``pyproximal.optimization.primal.ProximalGradient`` solvers
+* Modified ``pyproximal.ProxOperator.precomposition`` to allow `b` to also be a vector
 
 # 0.8.0
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index a0fc72b..e878bc4 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -3,6 +3,17 @@ Changelog
 =========
 
+Version 0.10.0
+--------------
+*Released on: 11/12/2024*
+
+* Added :py:func:`pyproximal.optimization.primal.AndersonProximalGradient` solver
+* Added :py:class:`pyproximal.proximal.HuberCircular` operator
+* Added `fungrad` method to :py:class:`pyproximal.proximal.Nonlinear`
+* Modified :py:class:`pyproximal.proximal.Huber` operator, which was previously
+  erroneously implemented using the definition of the Circular Huber norm
+
+
 Version 0.9.0
 --------------
 *Released on: 16/08/2024*
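
A quick numerical sketch of the behavioural change recorded in the last
changelog entry (arbitrary example values; they follow directly from the
definitions introduced in PATCH 6):

    import numpy as np
    from pyproximal.proximal import Huber, HuberCircular

    x = np.array([0.1, 3.0])
    alpha = 1.0

    # elementwise Huber: sum of per-component penalties
    # 0.1 ** 2 / (2 * 1.0) + (3.0 - 1.0 / 2) = 2.505
    print(Huber(alpha)(x))

    # circular Huber: a single penalty on the Euclidean norm of x
    # np.sqrt(0.1 ** 2 + 3.0 ** 2) - 1.0 / 2 ~= 2.5017
    print(HuberCircular(alpha)(x))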