Gpu type 3 (#517)
* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft
* Removed the CPU fseries computation (it was only used for benchmarking and is no longer needed).
* Added complex arithmetic support for the cuda_complex type.
* Added tests for type 3 in 1D, 2D, and 3D, and for cuda_complex arithmetic.
* Integrated flipwind for types 1 and 2 on the GPU.
* Minor fixes to the GPU code:
   - fixed memory leaks on error paths
   - renamed maxbatchsize to batchsize
   - renamed the fseries and nuft functions to match the CPU code
DiamonDinoia authored Sep 12, 2024
1 parent e77225d commit 481b70e
Showing 38 changed files with 1,764 additions and 733 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG
@@ -2,9 +2,15 @@ List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

Master (9/10/24)

* Reduced roundoff error in the a[n] phase calculation in CPU onedim_fseries_kernel().
  #534 (Barnett).
* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517).
  - Removed the CPU fseries computation (it was only used for benchmarking and is no longer needed).
  - Added complex arithmetic support for the cuda_complex type.
  - Added tests for type 3 in 1D, 2D, and 3D, and for cuda_complex arithmetic.
  - Minor fixes to the GPU code:
    a) fixed memory leaks on error paths
    b) renamed maxbatchsize to batchsize

V 2.3.0 (9/5/24)

2 changes: 2 additions & 0 deletions docs/devnotes.rst
@@ -54,6 +54,8 @@ Developer notes

* CMake compiling on Linux at Flatiron Institute (Rusty cluster): We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3``; otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.

* Note for developers using nvcc: compiling with device debug symbols (``-G``) causes a stack overflow that goes undetected at both compile time and run time. The problem only manifests for ns>=10 with dim=3; for ns<10 or dim<3 one can use ``-G`` and debug the code with cuda-gdb. The way to avoid it is to not use debug symbols; ``--generate-line-info`` might work instead (not tested). As a side note, compute-sanitizer does not detect the issue.

* Testing cufinufft (for FI, mostly):

.. code-block:: sh
28 changes: 17 additions & 11 deletions include/cufinufft/common.h
@@ -7,31 +7,37 @@
#include <finufft_errors.h>
#include <finufft_spread_opts.h>

#include <complex.h>
#include <complex>

namespace cufinufft {
namespace common {
template<typename T>
__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f,
cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2,
__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a,
T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3,
int ns);
template<typename T>
__global__ void cu_nuft_kernel_compute(int nf1, int nf2, int nf3, T *f, T *z, T *kx,
T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2,
T *fwkerhalf3, int ns);
template<typename T>
int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f,
cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2,
T *d_fwkerhalf3, int ns, cudaStream_t stream);
int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_phase,
T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns,
cudaStream_t stream);
template<typename T>
int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_z, T *d_kx,
T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2,
T *d_fwkerhalf3, int ns, cudaStream_t stream);
template<typename T>
int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts);

void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts,
CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b);

template<typename T>
void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts);
template<typename T>
void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
finufft_spread_opts opts);
template<typename T>
void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
T *fwkerhalf, finufft_spread_opts opts);
void onedim_nuft_kernel_precomp(T *f, T *zout, finufft_spread_opts opts);

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
173 changes: 173 additions & 0 deletions include/cufinufft/contrib/helper_math.h
@@ -0,0 +1,173 @@
#ifndef FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
#define FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H

#include <cuComplex.h>

// This header provides some helper functions for cuComplex types.
// It mainly wraps existing CUDA implementations to provide operator overloads;
// e.g. cuCadd, cuCsub, cuCmul, cuCdiv, cuCreal, cuCimag, cuCabs, and cuConj are
// all provided by CUDA.

// Addition for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator+(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
return cuCadd(a, b);
}

// Subtraction for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator-(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
return cuCsub(a, b);
}

// Multiplication for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator*(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
return cuCmul(a, b);
}

// Division for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator/(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
return cuCdiv(a, b);
}

// Equality for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ bool operator==(const cuDoubleComplex &a,
const cuDoubleComplex &b) noexcept {
return cuCreal(a) == cuCreal(b) && cuCimag(a) == cuCimag(b);
}

// Inequality for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ bool operator!=(const cuDoubleComplex &a,
const cuDoubleComplex &b) noexcept {
return !(a == b);
}

// Addition for cuDoubleComplex (double) with double
__host__ __device__ __forceinline__ cuDoubleComplex operator+(const cuDoubleComplex &a,
double b) noexcept {
return make_cuDoubleComplex(cuCreal(a) + b, cuCimag(a));
}

__host__ __device__ __forceinline__ cuDoubleComplex operator+(
double a, const cuDoubleComplex &b) noexcept {
return make_cuDoubleComplex(a + cuCreal(b), cuCimag(b));
}

// Subtraction for cuDoubleComplex (double) with double
__host__ __device__ __forceinline__ cuDoubleComplex operator-(const cuDoubleComplex &a,
double b) noexcept {
return make_cuDoubleComplex(cuCreal(a) - b, cuCimag(a));
}

__host__ __device__ __forceinline__ cuDoubleComplex operator-(
double a, const cuDoubleComplex &b) noexcept {
return make_cuDoubleComplex(a - cuCreal(b), -cuCimag(b));
}

// Multiplication for cuDoubleComplex (double) with double
__host__ __device__ __forceinline__ cuDoubleComplex operator*(const cuDoubleComplex &a,
double b) noexcept {
return make_cuDoubleComplex(cuCreal(a) * b, cuCimag(a) * b);
}

__host__ __device__ __forceinline__ cuDoubleComplex operator*(
double a, const cuDoubleComplex &b) noexcept {
return make_cuDoubleComplex(a * cuCreal(b), a * cuCimag(b));
}

// Division for cuDoubleComplex (double) with double
__host__ __device__ __forceinline__ cuDoubleComplex operator/(const cuDoubleComplex &a,
double b) noexcept {
return make_cuDoubleComplex(cuCreal(a) / b, cuCimag(a) / b);
}

__host__ __device__ __forceinline__ cuDoubleComplex operator/(
double a, const cuDoubleComplex &b) noexcept {
double denom = cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b);
return make_cuDoubleComplex((a * cuCreal(b)) / denom, (-a * cuCimag(b)) / denom);
}

// Addition for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ cuFloatComplex operator+(
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
return cuCaddf(a, b);
}

// Subtraction for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ cuFloatComplex operator-(
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
return cuCsubf(a, b);
}

// Multiplication for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ cuFloatComplex operator*(
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
return cuCmulf(a, b);
}

// Division for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ cuFloatComplex operator/(
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
return cuCdivf(a, b);
}

// Equality for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ bool operator==(const cuFloatComplex &a,
const cuFloatComplex &b) noexcept {
return cuCrealf(a) == cuCrealf(b) && cuCimagf(a) == cuCimagf(b);
}

// Inequality for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ bool operator!=(const cuFloatComplex &a,
const cuFloatComplex &b) noexcept {
return !(a == b);
}

// Addition for cuFloatComplex (float) with float
__host__ __device__ __forceinline__ cuFloatComplex operator+(const cuFloatComplex &a,
float b) noexcept {
return make_cuFloatComplex(cuCrealf(a) + b, cuCimagf(a));
}

__host__ __device__ __forceinline__ cuFloatComplex operator+(
float a, const cuFloatComplex &b) noexcept {
return make_cuFloatComplex(a + cuCrealf(b), cuCimagf(b));
}

// Subtraction for cuFloatComplex (float) with float
__host__ __device__ __forceinline__ cuFloatComplex operator-(const cuFloatComplex &a,
float b) noexcept {
return make_cuFloatComplex(cuCrealf(a) - b, cuCimagf(a));
}

__host__ __device__ __forceinline__ cuFloatComplex operator-(
float a, const cuFloatComplex &b) noexcept {
return make_cuFloatComplex(a - cuCrealf(b), -cuCimagf(b));
}

// Multiplication for cuFloatComplex (float) with float
__host__ __device__ __forceinline__ cuFloatComplex operator*(const cuFloatComplex &a,
float b) noexcept {
return make_cuFloatComplex(cuCrealf(a) * b, cuCimagf(a) * b);
}

__host__ __device__ __forceinline__ cuFloatComplex operator*(
float a, const cuFloatComplex &b) noexcept {
return make_cuFloatComplex(a * cuCrealf(b), a * cuCimagf(b));
}

// Division for cuFloatComplex (float) with float
__host__ __device__ __forceinline__ cuFloatComplex operator/(const cuFloatComplex &a,
float b) noexcept {
return make_cuFloatComplex(cuCrealf(a) / b, cuCimagf(a) / b);
}

__host__ __device__ __forceinline__ cuFloatComplex operator/(
float a, const cuFloatComplex &b) noexcept {
float denom = cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b);
return make_cuFloatComplex((a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom);
}

#endif // FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
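
As an aside (not part of this commit), here is a minimal sketch of how these overloads are meant to be used from device code. The kernel name and parameters below are purely illustrative, and the include path assumes the repository's include/ directory is on the compiler's include path:

    #include <cuComplex.h>
    #include <cufinufft/contrib/helper_math.h>

    // Illustrative kernel: y[i] = a * x[i] + b on complex data. The expressions
    // resolve to the operator overloads above (cuCmul via operator*, plus the
    // complex/real operator+) instead of spelling out the cuComplex helpers by hand.
    __global__ void axpb_example(const cuDoubleComplex *x, cuDoubleComplex *y,
                                 cuDoubleComplex a, double b, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) y[i] = a * x[i] + b;
    }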
8 changes: 5 additions & 3 deletions include/cufinufft/defs.h
@@ -2,14 +2,16 @@
#define CUFINUFFT_DEFS_H

#include <limits>

// constants needed within common
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
// common
#define MAX_NSPREAD 16

// max number of positive quadr nodes
#define MAX_NQUAD 100

// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3
#define ARRAYWIDCEN_GROWFRAC 0.1

// FIXME: If cufft ever takes N > INT_MAX...
constexpr int32_t MAX_NF = std::numeric_limits<int32_t>::max();