From 5412f0de4ce4597c922abb69a8e1f9d0dcfb15d7 Mon Sep 17 00:00:00 2001
From: mrava87 <matteoravasi@gmail.com>
Date: Mon, 18 Nov 2024 09:29:37 +0300
Subject: [PATCH] feat: ToCupy operator

---
 docs/source/api/index.rst          |   2 +
 docs/source/gpu.rst                | 109 ++++++++++++++++++++++++++---
 pylops/basicoperators/__init__.py  |   4 ++
 pylops/basicoperators/blockdiag.py |  22 +++++-
 pylops/basicoperators/hstack.py    |  23 +++++-
 pylops/basicoperators/tocupy.py    |  60 ++++++++++++++++
 pylops/basicoperators/vstack.py    |  23 +++++-
 pylops/optimization/cls_basic.py   |  21 ++++--
 pylops/utils/backend.py            |  44 ++++++++++++
 9 files changed, 284 insertions(+), 24 deletions(-)
 create mode 100644 pylops/basicoperators/tocupy.py

diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
index 59e40e51..da9beb36 100755
--- a/docs/source/api/index.rst
+++ b/docs/source/api/index.rst
@@ -62,6 +62,8 @@ Basic operators
     Real
     Imag
     Conj
+    ToCupy
+    
 
 Smoothing and derivatives
 ~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/gpu.rst b/docs/source/gpu.rst
index ecc87120..a409532b 100755
--- a/docs/source/gpu.rst
+++ b/docs/source/gpu.rst
@@ -409,11 +409,21 @@ Geophysical subsurface characterization:
    when ``explicit=False``.
 
 
-Example
--------
+Examples
+--------
+
+Finally, let's briefly look at some example. 
+
+End-to-end GPU powered inverse problems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Finally, let's briefly look at an example. First we write a code snippet using
-``numpy`` arrays which PyLops will run on your CPU:
+First we consider the most common scenario when both the model and data 
+vectors fit onto the GPU memory. We can therefore simply replace all our 
+``numpy`` arrays with ``cupy`` arrays and solve the inverse problem of 
+interest end-to-end on the GPU. 
+
+Let's first write a code snippet using ``numpy`` arrays, which PyLops 
+will run on your CPU:
 
 .. code-block:: python
 
@@ -421,11 +431,14 @@ Finally, let's briefly look at an example. First we write a code snippet using
    G = np.random.normal(0, 1, (ny, nx)).astype(np.float32)
    x = np.ones(nx, dtype=np.float32)
 
+   # Create operator
    Gop = MatrixMult(G, dtype='float32')
-   y = Gop * x
+   
+   # Create data and invert
+   y = Gop @ x
    xest = Gop / y
 
-Now we write a code snippet using ``cupy`` arrays which PyLops will run on
+Now we write a code snippet using ``cupy`` arrays, which PyLops will run on
 your GPU:
 
 .. code-block:: python
@@ -434,8 +447,11 @@ your GPU:
    G = cp.random.normal(0, 1, (ny, nx)).astype(np.float32)
    x = cp.ones(nx, dtype=np.float32)
 
+   # Create operator
    Gop = MatrixMult(G, dtype='float32')
-   y = Gop * x
+   
+   # Create data and invert
+   y = Gop @ x
    xest = Gop / y
 
 The code is almost unchanged apart from the fact that we now use ``cupy`` arrays,
@@ -450,15 +466,90 @@ your GPU/TPU:
    G = jnp.array(np.random.normal(0, 1, (ny, nx)).astype(np.float32))
    x = jnp.ones(nx, dtype=np.float32)
 
+   # Create operator
    Gop = JaxOperator(MatrixMult(G, dtype='float32'))
-   y = Gop * x
+   
+   # Create data and invert
+   y = Gop @ x
    xest = Gop / y
 
    # Adjoint via AD
    xadj = Gop.rmatvecad(x, y)
 
+Again, the code is almost unchanged apart from the fact that we now use ``jax`` arrays.
+
+
+Mixed CPU-GPU powered inverse problems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Let us now consider a more intricate scenario where we have acess to 
+a GPU-powered operator, however the model and/or data vectors are too large 
+to fit onto the memory. 
+
+For the sake of clarity, we consider a problem where 
+the operator can be written as a :class:`pylops.basicoperators.BlockDiag` of 
+PyLops operators. Note how, by simply sandwitching any of the GPU-powered 
+operator within two :class:`pylops.basicoperators.ToCupy` operators, we are 
+able to tell PyLops to transfer to the GPU only the part of the model vector 
+required by a given operator and transfer back the output to the  CPU before 
+forming the combine output vector (i.e., the output vector of the 
+:class:`pylops.basicoperators.BlockDiag`)
+
+.. code-block:: python
+
+   nops, n = 5, 4
+   Ms = [np.diag((i + 1) * np.ones(n, dtype=dtype)) \
+            for i in range(nops)]
+   Ms = [M.T @ M for M in Ms]
+
+   # Create operator
+   Mops = []
+   for iop in range(nops):
+      Mop = MatrixMult(cp.asarray(Ms[iop], dtype=dtype))
+      Top = ToCupy(Mop.dims, dtype=dtype)
+      Top1 = ToCupy(Mop.dimsd, dtype=dtype)
+      Mop = Top1.H @ Mop @ Top
+      Mops.append(Mop)
+   Mops = BlockDiag(Mops, forceflat=True)
+
+   # Create data and invert
+   x = np.ones(n * nops, dtype=dtype)
+   y = Mops @ x.ravel()
+   xest = Mops / y
+
+
+Finally, let us consider a problem where 
+the operator can be written as a :class:`pylops.basicoperators.VStack` of 
+PyLops operators and the model vector can be fully transferred to the GPU. 
+We can use again the :class:`pylops.basicoperators.ToCupy` operator, however this 
+time we will only use it to move the output of each operator to the CPU. 
+Since we are now in a special scenario, where the input of the overall 
+operator sits on the GPU and the output on the
+CPU, we need to inform the :class:`pylops.basicoperators.VStack` operator about this.
+This can be easily done using the additional ``inoutengine`` parameter. Let's
+see this with an example:
+
+.. code-block:: python
+
+   nops, n, m = 3, 4, 5
+   Ms = [np.random.normal(0, 1, (n, m)) for _ in range(nops)]
+
+   # Create operator
+   Mops = []
+   for iop in range(nops):
+      Mop = MatrixMult(cp.asarray(Ms[iop]), dtype=dtype)
+      Top1 = ToCupy(Mop.dimsd, dtype=dtype)
+      Mop = Top1.H @ Mop
+      Mops.append(Mop)
+   Mops = VStack(Mops, inoutengine=("numpy", "cupy"))
+
+   # Create data and invert
+   x = cp.ones(m, dtype=dtype)
+   y = Mops @ x.ravel()
+   xest = pylops_cgls(Mops, y, x0=cp.zeros_like(x))[0]
+
+**Note:**: this feature is currently not available for ``jax`` arrays.
 
-Again, the code is almost unchanged apart from the fact that we now use ``jax`` arrays,
 
 .. note::
 
diff --git a/pylops/basicoperators/__init__.py b/pylops/basicoperators/__init__.py
index d654e50d..5d3a3d3e 100755
--- a/pylops/basicoperators/__init__.py
+++ b/pylops/basicoperators/__init__.py
@@ -38,6 +38,7 @@
     Gradient                        Gradient.
     FirstDirectionalDerivative      First Directional derivative.
     SecondDirectionalDerivative     Second Directional derivative.
+    ToCupy                          Convert to CuPy.
 """
 
 from .functionoperator import *
@@ -72,6 +73,8 @@
 from .laplacian import *
 from .gradient import *
 from .directionalderivative import *
+from .tocupy import *
+
 
 __all__ = [
     "FunctionOperator",
@@ -107,4 +110,5 @@
     "Gradient",
     "FirstDirectionalDerivative",
     "SecondDirectionalDerivative",
+    "ToCupy",
 ]
diff --git a/pylops/basicoperators/blockdiag.py b/pylops/basicoperators/blockdiag.py
index 166ae137..4d9fbc36 100644
--- a/pylops/basicoperators/blockdiag.py
+++ b/pylops/basicoperators/blockdiag.py
@@ -21,7 +21,7 @@
 
 from pylops import LinearOperator
 from pylops.basicoperators import MatrixMult
-from pylops.utils.backend import get_array_module, inplace_set
+from pylops.utils.backend import get_array_module, get_module, inplace_set
 from pylops.utils.typing import DTypeLike, NDArray
 
 
@@ -48,6 +48,12 @@ class BlockDiag(LinearOperator):
         .. versionadded:: 2.2.0
 
         Force an array to be flattened after matvec and rmatvec.
+    inoutengine : :obj:`tuple`, optional
+        .. versionadded:: 2.4.0
+
+        Type of output vectors of `matvec` and `rmatvec. If ``None``, this is
+        inferred directly from the input vectors. Note that this is ignored
+        if ``nproc>1``.
     dtype : :obj:`str`, optional
         Type of elements in input array.
 
@@ -113,6 +119,7 @@ def __init__(
         ops: Sequence[LinearOperator],
         nproc: int = 1,
         forceflat: bool = None,
+        inoutengine: Optional[tuple] = None,
         dtype: Optional[DTypeLike] = None,
     ) -> None:
         self.ops = ops
@@ -149,6 +156,7 @@ def __init__(
         if self.nproc > 1:
             self.pool = mp.Pool(processes=nproc)
 
+        self.inoutengine = inoutengine
         dtype = _get_dtype(ops) if dtype is None else np.dtype(dtype)
         clinear = all([getattr(oper, "clinear", True) for oper in self.ops])
         super().__init__(
@@ -172,7 +180,11 @@ def nproc(self, nprocnew: int) -> None:
         self._nproc = nprocnew
 
     def _matvec_serial(self, x: NDArray) -> NDArray:
-        ncp = get_array_module(x)
+        ncp = (
+            get_array_module(x)
+            if self.inoutengine is None
+            else get_module(self.inoutengine[0])
+        )
         y = ncp.zeros(self.nops, dtype=self.dtype)
         for iop, oper in enumerate(self.ops):
             y = inplace_set(
@@ -183,7 +195,11 @@ def _matvec_serial(self, x: NDArray) -> NDArray:
         return y
 
     def _rmatvec_serial(self, x: NDArray) -> NDArray:
-        ncp = get_array_module(x)
+        ncp = (
+            get_array_module(x)
+            if self.inoutengine is None
+            else get_module(self.inoutengine[1])
+        )
         y = ncp.zeros(self.mops, dtype=self.dtype)
         for iop, oper in enumerate(self.ops):
             y = inplace_set(
diff --git a/pylops/basicoperators/hstack.py b/pylops/basicoperators/hstack.py
index b71e8723..4cd450d5 100644
--- a/pylops/basicoperators/hstack.py
+++ b/pylops/basicoperators/hstack.py
@@ -21,7 +21,7 @@
 
 from pylops import LinearOperator
 from pylops.basicoperators import MatrixMult
-from pylops.utils.backend import get_array_module, inplace_add, inplace_set
+from pylops.utils.backend import get_array_module, get_module, inplace_add, inplace_set
 from pylops.utils.typing import NDArray
 
 
@@ -48,6 +48,12 @@ class HStack(LinearOperator):
         .. versionadded:: 2.2.0
 
         Force an array to be flattened after matvec.
+    inoutengine : :obj:`tuple`, optional
+        .. versionadded:: 2.4.0
+
+        Type of output vectors of `matvec` and `rmatvec. If ``None``, this is
+        inferred directly from the input vectors. Note that this is ignored
+        if ``nproc>1``.
     dtype : :obj:`str`, optional
         Type of elements in input array.
 
@@ -112,6 +118,7 @@ def __init__(
         ops: Sequence[LinearOperator],
         nproc: int = 1,
         forceflat: bool = None,
+        inoutengine: Optional[tuple] = None,
         dtype: Optional[str] = None,
     ) -> None:
         self.ops = ops
@@ -139,6 +146,8 @@ def __init__(
         self.pool = None
         if self.nproc > 1:
             self.pool = mp.Pool(processes=nproc)
+
+        self.inoutengine = inoutengine
         dtype = _get_dtype(self.ops) if dtype is None else np.dtype(dtype)
         clinear = all([getattr(oper, "clinear", True) for oper in self.ops])
         super().__init__(
@@ -162,7 +171,11 @@ def nproc(self, nprocnew: int):
         self._nproc = nprocnew
 
     def _matvec_serial(self, x: NDArray) -> NDArray:
-        ncp = get_array_module(x)
+        ncp = (
+            get_array_module(x)
+            if self.inoutengine is None
+            else get_module(self.inoutengine[0])
+        )
         y = ncp.zeros(self.nops, dtype=self.dtype)
         for iop, oper in enumerate(self.ops):
             y = inplace_add(
@@ -173,7 +186,11 @@ def _matvec_serial(self, x: NDArray) -> NDArray:
         return y
 
     def _rmatvec_serial(self, x: NDArray) -> NDArray:
-        ncp = get_array_module(x)
+        ncp = (
+            get_array_module(x)
+            if self.inoutengine is None
+            else get_module(self.inoutengine[1])
+        )
         y = ncp.zeros(self.mops, dtype=self.dtype)
         for iop, oper in enumerate(self.ops):
             y = inplace_set(
diff --git a/pylops/basicoperators/tocupy.py b/pylops/basicoperators/tocupy.py
new file mode 100644
index 00000000..8a3a7c6b
--- /dev/null
+++ b/pylops/basicoperators/tocupy.py
@@ -0,0 +1,60 @@
+__all__ = ["ToCupy"]
+
+from typing import Union
+
+import numpy as np
+
+from pylops import LinearOperator
+from pylops.utils._internal import _value_or_sized_to_tuple
+from pylops.utils.backend import to_cupy, to_numpy
+from pylops.utils.typing import DTypeLike, InputDimsLike, NDArray
+
+
+class ToCupy(LinearOperator):
+    r"""Convert to CuPy.
+
+    Convert an input array to CuPy in forward mode,
+    and convert back to NumPy in adjoint mode.
+
+    Parameters
+    ----------
+    dims : :obj:`list` or :obj:`int`
+        Number of samples for each dimension
+    dtype : :obj:`str`, optional
+        Type of elements in input array.
+    name : :obj:`str`, optional
+        Name of operator (to be used by :func:`pylops.utils.describe.describe`)
+
+    Attributes
+    ----------
+    shape : :obj:`tuple`
+        Operator shape
+    explicit : :obj:`bool`
+        Operator contains a matrix that can be solved explicitly
+        (``True``) or not (``False``)
+
+    Notes
+    -----
+    The ToCupy operator is a special operator that does not perform
+    any transformation on the input arrays other than converting
+    them from NumPy to CuPy. This operator can be used when one
+    is interested to create a chain of operators where only one
+    (or some of them) act on CuPy arrays, whilst other operate
+    on NumPy arrays.
+
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, InputDimsLike],
+        dtype: DTypeLike = "float64",
+        name: str = "C",
+    ) -> None:
+        dims = _value_or_sized_to_tuple(dims)
+        super().__init__(dtype=np.dtype(dtype), dims=dims, dimsd=dims, name=name)
+
+    def _matvec(self, x: NDArray) -> NDArray:
+        return to_cupy(x)
+
+    def _rmatvec(self, x: NDArray) -> NDArray:
+        return to_numpy(x)
diff --git a/pylops/basicoperators/vstack.py b/pylops/basicoperators/vstack.py
index 0d66642e..55341411 100644
--- a/pylops/basicoperators/vstack.py
+++ b/pylops/basicoperators/vstack.py
@@ -21,7 +21,7 @@
 
 from pylops import LinearOperator
 from pylops.basicoperators import MatrixMult
-from pylops.utils.backend import get_array_module, inplace_add, inplace_set
+from pylops.utils.backend import get_array_module, get_module, inplace_add, inplace_set
 from pylops.utils.typing import DTypeLike, NDArray
 
 
@@ -48,6 +48,12 @@ class VStack(LinearOperator):
         .. versionadded:: 2.2.0
 
         Force an array to be flattened after rmatvec.
+    inoutengine : :obj:`tuple`, optional
+        .. versionadded:: 2.4.0
+
+        Type of output vectors of `matvec` and `rmatvec. If ``None``, this is
+        inferred directly from the input vectors. Note that this is ignored
+        if ``nproc>1``.
     dtype : :obj:`str`, optional
         Type of elements in input array.
 
@@ -112,6 +118,7 @@ def __init__(
         ops: Sequence[LinearOperator],
         nproc: int = 1,
         forceflat: bool = None,
+        inoutengine: Optional[tuple] = None,
         dtype: Optional[DTypeLike] = None,
     ) -> None:
         self.ops = ops
@@ -139,6 +146,8 @@ def __init__(
         self.pool = None
         if self.nproc > 1:
             self.pool = mp.Pool(processes=nproc)
+
+        self.inoutengine = inoutengine
         dtype = _get_dtype(self.ops) if dtype is None else np.dtype(dtype)
         clinear = all([getattr(oper, "clinear", True) for oper in self.ops])
         super().__init__(
@@ -162,7 +171,11 @@ def nproc(self, nprocnew: int):
         self._nproc = nprocnew
 
     def _matvec_serial(self, x: NDArray) -> NDArray:
-        ncp = get_array_module(x)
+        ncp = (
+            get_array_module(x)
+            if self.inoutengine is None
+            else get_module(self.inoutengine[0])
+        )
         y = ncp.zeros(self.nops, dtype=self.dtype)
         for iop, oper in enumerate(self.ops):
             y = inplace_set(
@@ -171,7 +184,11 @@ def _matvec_serial(self, x: NDArray) -> NDArray:
         return y
 
     def _rmatvec_serial(self, x: NDArray) -> NDArray:
-        ncp = get_array_module(x)
+        ncp = (
+            get_array_module(x)
+            if self.inoutengine is None
+            else get_module(self.inoutengine[1])
+        )
         y = ncp.zeros(self.mops, dtype=self.dtype)
         for iop, oper in enumerate(self.ops):
             y = inplace_add(
diff --git a/pylops/optimization/cls_basic.py b/pylops/optimization/cls_basic.py
index 2e848fcf..b7c98e0a 100644
--- a/pylops/optimization/cls_basic.py
+++ b/pylops/optimization/cls_basic.py
@@ -10,7 +10,12 @@
 import numpy as np
 
 from pylops.optimization.basesolver import Solver
-from pylops.utils.backend import get_array_module, to_numpy
+from pylops.utils.backend import (
+    get_array_module,
+    to_cupy_conditional,
+    to_numpy,
+    to_numpy_conditional,
+)
 from pylops.utils.typing import NDArray
 
 if TYPE_CHECKING:
@@ -131,10 +136,10 @@ def step(self, x: NDArray, show: bool = False) -> NDArray:
             Updated model vector
 
         """
-        Opc = self.Op.matvec(self.c)
+        Opc = self.Op.matvec(to_cupy_conditional(x, self.c))
         cOpc = self.ncp.abs(self.c.dot(Opc.conj()))
         a = self.kold / cOpc
-        x += a * self.c
+        x += to_cupy_conditional(x, a) * to_cupy_conditional(x, self.c)
         self.r -= a * Opc
         k = self.ncp.abs(self.r.dot(self.r.conj()))
         b = k / self.kold
@@ -386,7 +391,7 @@ def step(self, x: NDArray, show: bool = False) -> NDArray:
             self.q.dot(self.q.conj()) + self.damp * self.c.dot(self.c.conj())
         )
         x = x + a * self.c
-        self.s = self.s - a * self.q
+        self.s = self.s - to_numpy_conditional(self.q, a) * self.q
         r = self.Op.rmatvec(self.s) - self.damp * x
         k = self.ncp.abs(r.dot(r.conj()))
         b = k / self.kold
@@ -773,7 +778,9 @@ def step(self, x: NDArray, show: bool = False) -> NDArray:
         # next beta, u, alfa, v. These satisfy the relations
         # beta*u = Op*v - alfa*u,
         # alfa*v = Op'*u - beta*v'
-        self.u = self.Op.matvec(self.v) - self.alfa * self.u
+        self.u = (
+            self.Op.matvec(self.v) - to_numpy_conditional(self.u, self.alfa) * self.u
+        )
         self.beta = self.ncp.linalg.norm(self.u)
         if self.beta > 0:
             self.u = self.u / self.beta
@@ -812,7 +819,9 @@ def step(self, x: NDArray, show: bool = False) -> NDArray:
         self.w = self.v + self.t2 * self.w
         self.ddnorm = self.ddnorm + self.ncp.linalg.norm(self.dk) ** 2
         if self.calc_var:
-            self.var = self.var + self.ncp.dot(self.dk, self.dk)
+            self.var = self.var + to_numpy_conditional(
+                self.var, self.ncp.dot(self.dk, self.dk)
+            )
 
         # use a plane rotation on the right to eliminate the
         # super-diagonal element (theta) of the upper-bidiagonal matrix.
diff --git a/pylops/utils/backend.py b/pylops/utils/backend.py
index 9b7d3b2d..31836265 100644
--- a/pylops/utils/backend.py
+++ b/pylops/utils/backend.py
@@ -17,8 +17,10 @@
     "get_sp_fft",
     "get_complex_dtype",
     "get_real_dtype",
+    "to_cupy",
     "to_numpy",
     "to_cupy_conditional",
+    "to_numpy_conditional",
     "inplace_set",
     "inplace_add",
     "inplace_multiply",
@@ -484,6 +486,26 @@ def get_real_dtype(dtype: DTypeLike) -> DTypeLike:
     return np.real(np.ones(1, dtype)).dtype
 
 
+def to_cupy(x: NDArray) -> NDArray:
+    """Convert x to cupy array
+
+    Parameters
+    ----------
+    x : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
+        Array to evaluate
+
+    Returns
+    -------
+    x : :obj:`numpy.ndarray`
+        Converted array
+
+    """
+    if deps.cupy_enabled:
+        if cp.get_array_module(x) == np:
+            x = cp.asarray(x)
+    return x
+
+
 def to_numpy(x: NDArray) -> NDArray:
     """Convert x to numpy array
 
@@ -527,6 +549,28 @@ def to_cupy_conditional(x: npt.ArrayLike, y: npt.ArrayLike) -> NDArray:
     return y
 
 
+def to_numpy_conditional(x: npt.ArrayLike, y: npt.ArrayLike) -> NDArray:
+    """Convert y to numpy array conditional to x being a numpy array
+
+    Parameters
+    ----------
+    x : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
+        Array to evaluate
+    y : :obj:`numpy.ndarray`
+        Array to convert
+
+    Returns
+    -------
+    y : :obj:`cupy.ndarray`
+        Converted array
+
+    """
+    if deps.cupy_enabled:
+        if cp.get_array_module(x) == np and cp.get_array_module(y) == cp:
+            y = cp.asnumpy(y)
+    return y
+
+
 def inplace_set(x: npt.ArrayLike, y: npt.ArrayLike, idx: list) -> NDArray:
     """Perform inplace set based on input