Basic CUDA support #68

Merged
merged 17 commits on Oct 6, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -34,4 +34,4 @@ jobs:

- name: Pytest
run: |
pytest tests/ --cov
pytest tests/ --cov -k "not cuda"
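For context, pytest's -k expression filters by test name, so CUDA-only tests are excluded from the CPU-only GitHub Actions run here and selected on the GPU runner below with -k "cuda". A hypothetical test whose name this expression matches (the test name and skip condition are illustrative, not taken from this repository's suite):

import pytest
import torch

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires a CUDA device")
def test_nufft_type1_cuda():
    # -k "cuda" matches the substring in the test name; -k "not cuda" deselects it
    ...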
90 changes: 90 additions & 0 deletions Jenkinsfile
@@ -0,0 +1,90 @@
pipeline {
agent none
options {
disableConcurrentBuilds()
buildDiscarder(logRotator(numToKeepStr: '8', daysToKeepStr: '20'))
timeout(time: 1, unit: 'HOURS')
}
stages {
stage('main') {
agent {
dockerfile {
filename 'ci/docker/Dockerfile-cuda11.8'
args '--gpus 2'
label 'v100'
}
}
environment {
HOME = "$WORKSPACE"
PYBIN = "/opt/python/cp38-cp38/bin"
LIBRARY_PATH = "$WORKSPACE/finufft/build"
LD_LIBRARY_PATH = "$WORKSPACE/finufft/build"
}
steps {

// TODO - reconsider install strategy once finufft/cufinufft 2.2 is released
checkout([$class: 'GitSCM',
branches: [[name: '*/master']],
userRemoteConfigs: [[url: "https://github.com/flatironinstitute/finufft"]]]
)

sh '''#!/bin/bash -ex
nvidia-smi
'''
sh '''#!/bin/bash -ex
echo $HOME
'''
sh '''#!/bin/bash -ex
cd finufft
# v100 cuda arch
cuda_arch="70"

cmake -B build . -DFINUFFT_USE_CUDA=ON \
-DFINUFFT_USE_CPU=OFF \
-DFINUFFT_BUILD_TESTS=ON \
-DCMAKE_CUDA_ARCHITECTURES="$cuda_arch" \
-DBUILD_TESTING=ON
cd build
make -j4
'''

sh '${PYBIN}/python3 -m venv $HOME'
sh '''#!/bin/bash -ex
source $HOME/bin/activate
python3 -m pip install --upgrade pip
# we could also move pytorch install inside docker
python3 -m pip install "torch~=2.1.0" --index-url https://download.pytorch.org/whl/cu118
python3 -m pip install finufft/python/cufinufft
python3 -m pip install finufft/python/finufft

python3 -m pip install -e .[dev]

python3 -m pytest -k "cuda" tests/ --cov
'''
}
}
}
post {
failure {
emailext subject: '$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS',
body: '''$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS

Check console output at $BUILD_URL to view full results.

Building $BRANCH_NAME for $CAUSE
$JOB_DESCRIPTION

Changes:
$CHANGES

End of build log:
${BUILD_LOG,maxLines=200}
''',
recipientProviders: [
[$class: 'DevelopersRecipientProvider'],
],
replyTo: '$DEFAULT_REPLYTO',
to: '[email protected]'
}
}
}
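Side note on the hard-coded cuda_arch="70" in the build step above: 70 is the compute capability of the V100 nodes this stage runs on. If the runner's GPU ever changes, one way to derive the value at runtime is via torch (a sketch, assuming torch is already installed in the environment):

import torch

# compute capability of GPU 0, e.g. (7, 0) on a V100
major, minor = torch.cuda.get_device_capability(0)
print(f"{major}{minor}")  # "70", usable as -DCMAKE_CUDA_ARCHITECTURES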
69 changes: 69 additions & 0 deletions ci/docker/Dockerfile-cuda11.8
@@ -0,0 +1,69 @@
# Based on https://github.com/flatironinstitute/finufft/blob/master/tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64

FROM quay.io/pypa/manylinux2014_x86_64
LABEL maintainer "Brian Ward"

ENV CUDA_MAJOR 11
ENV CUDA_MINOR 8
ENV CUDA_DASH_VERSION ${CUDA_MAJOR}-${CUDA_MINOR}
ENV CUDA_DOT_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}

# ---- The following block adds layers for CUDA --- #
# base
RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict -

COPY ci/docker/cuda.repo /etc/yum.repos.d/cuda.repo

# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a
RUN yum install -y \
cuda-cudart-${CUDA_DASH_VERSION} \
cuda-compat-${CUDA_DASH_VERSION} && \
ln -s cuda-${CUDA_DOT_VERSION} /usr/local/cuda && \
rm -rf /var/cache/yum/*

# nvidia-docker 1.0
RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf

ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64

# nvidia-container-runtime
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=${CUDA_DOT_VERSION} brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441"

# runtime
RUN yum install -y \
cuda-libraries-${CUDA_DASH_VERSION} \
cuda-nvtx-${CUDA_DASH_VERSION} && \
rm -rf /var/cache/yum/*

# devel
RUN yum install -y \
cuda-cudart-devel-${CUDA_DASH_VERSION} \
cuda-libraries-devel-${CUDA_DASH_VERSION} \
cuda-nvprof-${CUDA_DASH_VERSION} \
cuda-nvcc-${CUDA_DASH_VERSION} && \
rm -rf /var/cache/yum/*

ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs

# /CUDA #

# CUDA 11 doesn't work on gcc/g++ newer than v9
RUN yum install -y \
devtoolset-9-gcc \
devtoolset-9-gcc-c++ && \
rm -rf /var/cache/yum/*

ENV PATH /opt/rh/devtoolset-9/root/usr/bin:${PATH}

# finufft reqs
RUN yum install -y \
cmake && \
rm -rf /var/cache/yum/*

6 changes: 6 additions & 0 deletions ci/docker/cuda.repo
@@ -0,0 +1,6 @@
[cuda]
name=cuda
baseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA
120 changes: 72 additions & 48 deletions pytorch_finufft/functional.py
@@ -6,6 +6,13 @@

import numpy as np
import finufft

try:
import cufinufft

CUFINUFFT_AVAIL = True
except ImportError:
CUFINUFFT_AVAIL = False
import torch

import pytorch_finufft._err as err
@@ -1595,27 +1602,40 @@ def backward(
)





###############################################################################
# Consolidated forward function for all 1D, 2D, and 3D problems for nufft type 1
###############################################################################

def get_nufft_func(dim, nufft_type):
return getattr(finufft, f"nufft{dim}d{nufft_type}")

def get_nufft_func(dim, nufft_type, device_type):
if device_type == "cuda":
return getattr(cufinufft, f"nufft{dim}d{nufft_type}")

# CPU needs extra work to go to/from torch and numpy
finufft_func = getattr(finufft, f"nufft{dim}d{nufft_type}")

def f(*args, **kwargs):
new_args = [arg for arg in args]
for i in range(len(new_args)):
if isinstance(new_args[i], torch.Tensor):
new_args[i] = new_args[i].data.numpy()

return torch.from_numpy(finufft_func(*new_args, **kwargs))

return f
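
For illustration, roughly how this dispatcher gets used (a sketch; the problem size, tolerance, and sign are arbitrary):

import torch

# 2D type-1 problem on CPU; the wrapper converts tensors to numpy and back
points = 2 * torch.pi * torch.rand((2, 1000), dtype=torch.float64) - torch.pi
values = torch.randn(1000, dtype=torch.complex128)

nufft2d1 = get_nufft_func(2, 1, points.device.type)
out = nufft2d1(*points, values, (64, 64), isign=-1, eps=1e-8)  # (64, 64) complex tensor

# with CUDA tensors the same call would dispatch to cufinufft.nufft2d1
# and operate on torch CUDA tensors directly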


class finufft_type1(torch.autograd.Function):
@staticmethod
def forward(
ctx: Any,
points: torch.Tensor,
values: torch.Tensor,
output_shape: Union[int, tuple[int, int], tuple[int, int, int]],
out: Optional[torch.Tensor]=None,
fftshift: bool=False,
finufftkwargs: dict[str, Union[int, float]]=None):
ctx: Any,
points: torch.Tensor,
values: torch.Tensor,
output_shape: Union[int, tuple[int, int], tuple[int, int, int]],
out: Optional[torch.Tensor] = None,
fftshift: bool = False,
finufftkwargs: dict[str, Union[int, float]] = None,
):
"""
Evaluates the Type 1 NUFFT on the inputs.

@@ -1626,8 +1646,11 @@ def forward(
# All this requires is a check on the out array to make sure it is the
# correct shape.

err._type1_checks(points, values, output_shape) # revisit these error checks to take into account the shape of points instead of passing them separately
err._type1_checks(
points, values, output_shape
) # revisit these error checks to take into account the shape of points instead of passing them separately
# ^ make sure these checks check for consistency between output shape and len(points)
# need device checks

if finufftkwargs is None:
finufftkwargs = dict()
Expand All @@ -1654,17 +1677,13 @@ def forward(
ndim = points.shape[0]
assert len(output_shape) == ndim

nufft_func = get_nufft_func(ndim, 1)
finufft_out = torch.from_numpy(
nufft_func(
*points.data.numpy(),
values.data.numpy(),
output_shape,
modeord=_mode_ordering,
isign=_i_sign,
**finufftkwargs,
)
nufft_func = get_nufft_func(ndim, 1, points.device.type)
finufft_out = nufft_func(
*points, values, output_shape, isign=_i_sign, **finufftkwargs
)
# because modeord is missing from cufinufft
if _mode_ordering:
finufft_out = torch.fft.ifftshift(finufft_out)

return finufft_out
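
For intuition on the ifftshift above: the CPU finufft flag modeord=0 returns Fourier modes in increasing (centered) order, while modeord=1 returns FFT-style order, and the two differ exactly by an ifftshift, which is why the shift can stand in for the flag that cufinufft lacks. A minimal CPU-only sanity check of that equivalence (assuming finufft and numpy are installed; the problem size is arbitrary):

import numpy as np
import finufft

rng = np.random.default_rng(0)
pts = rng.uniform(-np.pi, np.pi, 100)
strengths = rng.normal(size=100) + 1j * rng.normal(size=100)

centered = finufft.nufft1d1(pts, strengths, 16, modeord=0)   # modes -8 .. 7
fft_order = finufft.nufft1d1(pts, strengths, 16, modeord=1)  # modes 0 .. 7, -8 .. -1

assert np.allclose(np.fft.ifftshift(centered), fft_order)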

@@ -1695,52 +1714,57 @@ def backward(

start_points = -(np.array(grad_output.shape) // 2)
end_points = start_points + grad_output.shape
slices = tuple(slice(start, end) for start, end in zip(start_points, end_points))
slices = tuple(
slice(start, end) for start, end in zip(start_points, end_points)
)

# CPU idiosyncrasy that needs to be done differently
coord_ramps = torch.from_numpy(np.mgrid[slices])
coord_ramps = torch.from_numpy(np.mgrid[slices]).to(points.device)
Collaborator:

This will work, but it allocates an array on CPU, then sends it to GPU. We may want to borrow from the prior code that uses torch.meshgrid(x_vals, y_vals, z_vals) after allocating x_vals = torch.arange(start, end, device=device) etc., so that it gets created on the GPU.

This is probably a second-order optimization, since there will likely be other bottlenecks to fix beforehand, so keep as is for now.
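
A sketch of that suggestion, under the assumption that the ramps can be rebuilt from the start_points/end_points computed above (the helper name is hypothetical):

import torch

def make_coord_ramps(start_points, end_points, device):
    # allocate each axis directly on the target device, then stack into the
    # same (ndim, *grad_output.shape) layout that np.mgrid[slices] produces
    axes = [
        torch.arange(int(start), int(end), device=device)
        for start, end in zip(start_points, end_points)
    ]
    return torch.stack(torch.meshgrid(*axes, indexing="ij"))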


grads_points = None
grad_values = None

ndim = points.shape[0]

nufft_func = get_nufft_func(ndim, 2)
nufft_func = get_nufft_func(ndim, 2, points.device.type)
Collaborator:

Sending the points.device object doesn't work?

BTW, do we know anything about how well cufinufft interacts with multiple devices?

Collaborator Author:

I'm guessing cufinufft does not like multiple devices, but I haven't tried.

We definitely need more checks that the arrays are both on the same device (at least cpu/cuda, if not also checking they're on the same index of cuda).

Collaborator Author:

Oh, and we could use points.device, but the only thing we care about for now is whether it is cuda or cpu, so sending the type seemed simplest.
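
A minimal sketch of the kind of device check being discussed above (hypothetical helper, not part of this PR):

import torch

def _check_same_device(points: torch.Tensor, values: torch.Tensor) -> None:
    # compare full devices (type and index) so cpu/cuda mixes and
    # tensors on two different GPUs are both rejected
    if points.device != values.device:
        raise ValueError(
            f"points and values must be on the same device, "
            f"got {points.device} and {values.device}"
        )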


if ctx.needs_input_grad[0]:
# wrt points

if _mode_ordering != 0:
coord_ramps = torch.fft.ifftshift(coord_ramps, dim=tuple(range(1, ndim+1)))

if _mode_ordering:
coord_ramps = torch.fft.ifftshift(
coord_ramps, dim=tuple(range(1, ndim + 1))
)

ramped_grad_output = coord_ramps * grad_output[np.newaxis] * 1j * _i_sign

grads_points = []
for ramp in ramped_grad_output: # we can batch this into finufft
backprop_ramp = torch.from_numpy(
nufft_func(
*points.numpy(),
ramp.data.numpy(),
isign=_i_sign,
modeord=_mode_ordering,
**finufftkwargs,
))
for ramp in ramped_grad_output: # we can batch this into finufft
if _mode_ordering:
ramp = torch.fft.fftshift(ramp)

backprop_ramp = nufft_func(
*points,
ramp,
isign=_i_sign,
**finufftkwargs,
)

grad_points = (backprop_ramp.conj() * values).real

grads_points.append(grad_points)

grads_points = torch.stack(grads_points)

if ctx.needs_input_grad[1]:
np_grad_output = grad_output.data.numpy()
if _mode_ordering:
grad_output = torch.fft.fftshift(grad_output)

grad_values = torch.from_numpy(
nufft_func(
*points.numpy(),
np_grad_output,
isign=_i_sign,
modeord=_mode_ordering,
**finufftkwargs,
)
grad_values = nufft_func(
*points,
grad_output,
isign=_i_sign,
**finufftkwargs,
)

return (