Basic CUDA support #68

Merged
merged 17 commits on Oct 6, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -34,4 +34,4 @@ jobs:

- name: Pytest
run: |
pytest tests/ --cov
pytest tests/ --cov -k "not cuda"
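For context, pytest's -k expression filters by test name, so CUDA-only tests are excluded from the CPU-only GitHub Actions run here and selected on the GPU runner below with -k "cuda". A hypothetical test whose name this expression matches (the test name and skip condition are illustrative, not taken from this repository's suite):

import pytest
import torch

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires a CUDA device")
def test_nufft_type1_cuda():
    # -k "cuda" matches the substring in the test name; -k "not cuda" deselects it
    ...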
90 changes: 90 additions & 0 deletions Jenkinsfile
@@ -0,0 +1,90 @@
pipeline {
agent none
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(numToKeepStr: '8', daysToKeepStr: '20'))
timeout(time: 1, unit: 'HOURS')
}
stages {
stage('main') {
agent {
dockerfile {
filename 'ci/docker/Dockerfile-cuda11.8'
args '--gpus 2'
label 'v100'
}
}
environment {
HOME = "$WORKSPACE"
PYBIN = "/opt/python/cp38-cp38/bin"
LIBRARY_PATH = "$WORKSPACE/finufft/build"
LD_LIBRARY_PATH = "$WORKSPACE/finufft/build"
}
steps {

// TODO - reconsider install strategy once finufft/cufinufft 2.2 is released
checkout([$class: 'GitSCM',
branches: [[name: '*/master']],
userRemoteConfigs: [[url: "https://github.com/flatironinstitute/finufft"]]]
)

sh '''#!/bin/bash -ex
nvidia-smi
'''
sh '''#!/bin/bash -ex
echo $HOME
'''
sh '''#!/bin/bash -ex
cd finufft
# v100 cuda arch
cuda_arch="70"

cmake -B build . -DFINUFFT_USE_CUDA=ON \
-DFINUFFT_USE_CPU=OFF \
-DFINUFFT_BUILD_TESTS=ON \
-DCMAKE_CUDA_ARCHITECTURES="$cuda_arch" \
-DBUILD_TESTING=ON
cd build
make -j4
'''

sh '${PYBIN}/python3 -m venv $HOME'
sh '''#!/bin/bash -ex
source $HOME/bin/activate
python3 -m pip install --upgrade pip
# we could also move pytorch install inside docker
python3 -m pip install "torch~=2.1.0" --index-url https://download.pytorch.org/whl/cu118
python3 -m pip install finufft/python/cufinufft
python3 -m pip install finufft/python/finufft

python3 -m pip install -e .[dev]

python3 -m pytest -k "cuda" tests/ --cov
'''
}
}
}
post {
failure {
emailext subject: '$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS',
body: '''$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS

Check console output at $BUILD_URL to view full results.

Building $BRANCH_NAME for $CAUSE
$JOB_DESCRIPTION

Changes:
$CHANGES

End of build log:
${BUILD_LOG,maxLines=200}
''',
recipientProviders: [
[$class: 'DevelopersRecipientProvider'],
],
replyTo: '$DEFAULT_REPLYTO',
to: '[email protected]'
}
}
}
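Side note on the hard-coded cuda_arch="70" in the build step above: 70 is the compute capability of the V100 nodes this stage runs on. If the runner's GPU ever changes, one way to derive the value at runtime is via torch (a sketch, assuming torch is already installed in the environment):

import torch

# compute capability of GPU 0, e.g. (7, 0) on a V100
major, minor = torch.cuda.get_device_capability(0)
print(f"{major}{minor}")  # "70", usable as -DCMAKE_CUDA_ARCHITECTURES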
69 changes: 69 additions & 0 deletions ci/docker/Dockerfile-cuda11.8
@@ -0,0 +1,69 @@
# Based on https://github.com/flatironinstitute/finufft/blob/master/tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64

FROM quay.io/pypa/manylinux2014_x86_64
LABEL maintainer "Brian Ward"

ENV CUDA_MAJOR 11
ENV CUDA_MINOR 8
ENV CUDA_DASH_VERSION ${CUDA_MAJOR}-${CUDA_MINOR}
ENV CUDA_DOT_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}

# ---- The following block adds layers for CUDA --- #
# base
RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict -

COPY ci/docker/cuda.repo /etc/yum.repos.d/cuda.repo

# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a
RUN yum install -y \
cuda-cudart-${CUDA_DASH_VERSION} \
cuda-compat-${CUDA_DASH_VERSION} && \
ln -s cuda-${CUDA_DOT_VERSION} /usr/local/cuda && \
rm -rf /var/cache/yum/*

# nvidia-docker 1.0
RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf

ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64

# nvidia-container-runtime
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=${CUDA_DOT_VERSION} brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441"

# runtime
RUN yum install -y \
cuda-libraries-${CUDA_DASH_VERSION} \
cuda-nvtx-${CUDA_DASH_VERSION} && \
rm -rf /var/cache/yum/*

# devel
RUN yum install -y \
cuda-cudart-devel-${CUDA_DASH_VERSION} \
cuda-libraries-devel-${CUDA_DASH_VERSION} \
cuda-nvprof-${CUDA_DASH_VERSION} \
cuda-nvcc-${CUDA_DASH_VERSION} && \
rm -rf /var/cache/yum/*

ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs

# /CUDA #

# CUDA 11 doesn't work on gcc/g++ newer than v9
RUN yum install -y \
devtoolset-9-gcc \
devtoolset-9-gcc-c++ && \
rm -rf /var/cache/yum/*

ENV PATH /opt/rh/devtoolset-9/root/usr/bin:${PATH}

# finufft reqs
RUN yum install -y \
cmake && \
rm -rf /var/cache/yum/*

6 changes: 6 additions & 0 deletions ci/docker/cuda.repo
@@ -0,0 +1,6 @@
[cuda]
name=cuda
baseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA
120 changes: 72 additions & 48 deletions pytorch_finufft/functional.py
@@ -6,6 +6,13 @@

import numpy as np
import finufft

try:
import cufinufft

CUFINUFFT_AVAIL = True
except ImportError:
CUFINUFFT_AVAIL = False
import torch

import pytorch_finufft._err as err
@@ -1595,27 +1602,40 @@ def backward(
)





###############################################################################
# Consolidated forward function for all 1D, 2D, and 3D problems for nufft type 1
###############################################################################

def get_nufft_func(dim, nufft_type):
return getattr(finufft, f"nufft{dim}d{nufft_type}")

def get_nufft_func(dim, nufft_type, device_type):
if device_type == "cuda":
return getattr(cufinufft, f"nufft{dim}d{nufft_type}")

# CPU needs extra work to go to/from torch and numpy
finufft_func = getattr(finufft, f"nufft{dim}d{nufft_type}")

def f(*args, **kwargs):
new_args = [arg for arg in args]
for i in range(len(new_args)):
if isinstance(new_args[i], torch.Tensor):
new_args[i] = new_args[i].data.numpy()

return torch.from_numpy(finufft_func(*new_args, **kwargs))

return f
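
For illustration, roughly how this dispatcher gets used (a sketch; the problem size, tolerance, and sign are arbitrary):

import torch

# 2D type-1 problem on CPU; the wrapper converts tensors to numpy and back
points = 2 * torch.pi * torch.rand((2, 1000), dtype=torch.float64) - torch.pi
values = torch.randn(1000, dtype=torch.complex128)

nufft2d1 = get_nufft_func(2, 1, points.device.type)
out = nufft2d1(*points, values, (64, 64), isign=-1, eps=1e-8)  # (64, 64) complex tensor

# with CUDA tensors the same call would dispatch to cufinufft.nufft2d1
# and operate on torch CUDA tensors directly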


class finufft_type1(torch.autograd.Function):
@staticmethod
def forward(
ctx: Any,
points: torch.Tensor,
values: torch.Tensor,
output_shape: Union[int, tuple[int, int], tuple[int, int, int]],
out: Optional[torch.Tensor]=None,
fftshift: bool=False,
finufftkwargs: dict[str, Union[int, float]]=None):
ctx: Any,
points: torch.Tensor,
values: torch.Tensor,
output_shape: Union[int, tuple[int, int], tuple[int, int, int]],
out: Optional[torch.Tensor] = None,
fftshift: bool = False,
finufftkwargs: dict[str, Union[int, float]] = None,
):
"""
Evaluates the Type 1 NUFFT on the inputs.

@@ -1626,8 +1646,11 @@ def forward(
# All this requires is a check on the out array to make sure it is the
# correct shape.

err._type1_checks(points, values, output_shape) # revisit these error checks to take into account the shape of points instead of passing them separately
err._type1_checks(
points, values, output_shape
) # revisit these error checks to take into account the shape of points instead of passing them separately
# ^ make sure these checks check for consistency between output shape and len(points)
# need device checks

if finufftkwargs is None:
finufftkwargs = dict()
Expand All @@ -1654,17 +1677,13 @@ def forward(
ndim = points.shape[0]
assert len(output_shape) == ndim

nufft_func = get_nufft_func(ndim, 1)
finufft_out = torch.from_numpy(
nufft_func(
*points.data.numpy(),
values.data.numpy(),
output_shape,
modeord=_mode_ordering,
isign=_i_sign,
**finufftkwargs,
)
nufft_func = get_nufft_func(ndim, 1, points.device.type)
finufft_out = nufft_func(
*points, values, output_shape, isign=_i_sign, **finufftkwargs
)
# because modeord is missing from cufinufft
if _mode_ordering:
finufft_out = torch.fft.ifftshift(finufft_out)

return finufft_out
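
For intuition on the ifftshift above: the CPU finufft flag modeord=0 returns Fourier modes in increasing (centered) order, while modeord=1 returns FFT-style order, and the two differ exactly by an ifftshift, which is why the shift can stand in for the flag that cufinufft lacks. A minimal CPU-only sanity check of that equivalence (assuming finufft and numpy are installed; the problem size is arbitrary):

import numpy as np
import finufft

rng = np.random.default_rng(0)
pts = rng.uniform(-np.pi, np.pi, 100)
strengths = rng.normal(size=100) + 1j * rng.normal(size=100)

centered = finufft.nufft1d1(pts, strengths, 16, modeord=0)   # modes -8 .. 7
fft_order = finufft.nufft1d1(pts, strengths, 16, modeord=1)  # modes 0 .. 7, -8 .. -1

assert np.allclose(np.fft.ifftshift(centered), fft_order)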

@@ -1695,52 +1714,57 @@ def backward(

start_points = -(np.array(grad_output.shape) // 2)
end_points = start_points + grad_output.shape
slices = tuple(slice(start, end) for start, end in zip(start_points, end_points))
slices = tuple(
slice(start, end) for start, end in zip(start_points, end_points)
)

# CPU idiosyncrasy that needs to be done differently
coord_ramps = torch.from_numpy(np.mgrid[slices])
coord_ramps = torch.from_numpy(np.mgrid[slices]).to(points.device)
Collaborator:

This will work, but it allocates an array on CPU, then sends it to GPU. We may want to borrow from the prior code that uses torch.meshgrid(x_vals, y_vals, z_vals) after allocating x_vals = torch.arange(start, end, device=device) etc., so that it gets created on the GPU.

This is probably a second-order optimization, since there will likely be other bottlenecks to fix beforehand, so keep as is for now.
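
A sketch of that suggestion, under the assumption that the ramps can be rebuilt from the start_points/end_points computed above (the helper name is hypothetical):

import torch

def make_coord_ramps(start_points, end_points, device):
    # allocate each axis directly on the target device, then stack into the
    # same (ndim, *grad_output.shape) layout that np.mgrid[slices] produces
    axes = [
        torch.arange(int(start), int(end), device=device)
        for start, end in zip(start_points, end_points)
    ]
    return torch.stack(torch.meshgrid(*axes, indexing="ij"))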


grads_points = None
grad_values = None

ndim = points.shape[0]

nufft_func = get_nufft_func(ndim, 2)
nufft_func = get_nufft_func(ndim, 2, points.device.type)
Collaborator:

Sending the points.device object doesn't work?

BTW, do we know anything about how well cufinufft interacts with multiple devices?

Collaborator Author:

I'm guessing cufinufft does not like multiple devices, but I haven't tried.

We definitely need more checks that the arrays are both on the same device (at least cpu/cuda, if not also checking they're on the same index of cuda).

Collaborator Author:

Oh, and we could use points.device, but the only thing we care about for now is whether it is cuda or cpu, so sending the type seemed simplest.
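
A minimal sketch of the kind of device check being discussed above (hypothetical helper, not part of this PR):

import torch

def _check_same_device(points: torch.Tensor, values: torch.Tensor) -> None:
    # compare full devices (type and index) so cpu/cuda mixes and
    # tensors on two different GPUs are both rejected
    if points.device != values.device:
        raise ValueError(
            f"points and values must be on the same device, "
            f"got {points.device} and {values.device}"
        )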


if ctx.needs_input_grad[0]:
# wrt points

if _mode_ordering != 0:
coord_ramps = torch.fft.ifftshift(coord_ramps, dim=tuple(range(1, ndim+1)))

if _mode_ordering:
coord_ramps = torch.fft.ifftshift(
coord_ramps, dim=tuple(range(1, ndim + 1))
)

ramped_grad_output = coord_ramps * grad_output[np.newaxis] * 1j * _i_sign

grads_points = []
for ramp in ramped_grad_output: # we can batch this into finufft
backprop_ramp = torch.from_numpy(
nufft_func(
*points.numpy(),
ramp.data.numpy(),
isign=_i_sign,
modeord=_mode_ordering,
**finufftkwargs,
))
for ramp in ramped_grad_output: # we can batch this into finufft
if _mode_ordering:
ramp = torch.fft.fftshift(ramp)

backprop_ramp = nufft_func(
*points,
ramp,
isign=_i_sign,
**finufftkwargs,
)

grad_points = (backprop_ramp.conj() * values).real

grads_points.append(grad_points)

grads_points = torch.stack(grads_points)

if ctx.needs_input_grad[1]:
np_grad_output = grad_output.data.numpy()
if _mode_ordering:
grad_output = torch.fft.fftshift(grad_output)

grad_values = torch.from_numpy(
nufft_func(
*points.numpy(),
np_grad_output,
isign=_i_sign,
modeord=_mode_ordering,
**finufftkwargs,
)
grad_values = nufft_func(
*points,
grad_output,
isign=_i_sign,
**finufftkwargs,
)

return (