
Gpu type 3 #517

Merged (82 commits) on Sep 12, 2024
Commits
45333fa
basic benchmarks
DiamonDinoia Jul 3, 2024
b95a082
added plotting script
DiamonDinoia Jul 4, 2024
ae55ca5
optimised plotting
DiamonDinoia Jul 8, 2024
16e27f0
fixed plotting and metrics
DiamonDinoia Jul 8, 2024
49d1f21
fixed the plot script
DiamonDinoia Jul 8, 2024
2fdae68
bin_size_x is a function of the shared memory available
DiamonDinoia Jul 8, 2024
c0d9923
bin_size_x is a function of the shared memory available
DiamonDinoia Jul 8, 2024
907797c
minor optimizations in 1D
DiamonDinoia Jul 9, 2024
60f4780
optimized nupts driven
DiamonDinoia Jul 12, 2024
35dcc66
Optimized 1D and 2D
DiamonDinoia Jul 15, 2024
e1ad9bb
Merge branch 'master' into gpu-optimizations
DiamonDinoia Jul 15, 2024
366295d
3D integer operations
DiamonDinoia Jul 18, 2024
24bf6be
3D SM and GM optimized
DiamonDinoia Jul 18, 2024
960117a
bump cuda version
DiamonDinoia Jul 18, 2024
4295a86
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
c1b14c6
changed matlab to generate necessary cuda upsampfact files
DiamonDinoia Jul 23, 2024
f300d2d
added new coeffs
DiamonDinoia Jul 23, 2024
e86c762
Merge remote-tracking branch 'refs/remotes/origin/gpu-optimizations' …
DiamonDinoia Jul 23, 2024
db0457a
restoring .m from master
DiamonDinoia Jul 23, 2024
d0ce11e
updated hook
DiamonDinoia Jul 23, 2024
513ce4b
updated matlab upsampfact
DiamonDinoia Jul 23, 2024
798717d
updated coefficients
DiamonDinoia Jul 23, 2024
282baf5
new coeffs
DiamonDinoia Jul 23, 2024
12822a2
updated cufinufft to new coeff
DiamonDinoia Jul 23, 2024
badf22f
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
bf6328b
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
ae783da
picked good defaults for method
DiamonDinoia Jul 24, 2024
d29fcf5
update configuration
DiamonDinoia Jul 24, 2024
73f937b
updated build system
DiamonDinoia Jul 25, 2024
0724866
fixing jenkins
DiamonDinoia Jul 25, 2024
8cd50fc
using cuda 11.2
DiamonDinoia Jul 25, 2024
49a9d7e
using sm90 atomics
DiamonDinoia Jul 25, 2024
041a536
updated script
DiamonDinoia Jul 25, 2024
54683c3
fixed bin sizes
DiamonDinoia Jul 26, 2024
4f19103
Merge branch 'master' into gpu-optimizations
DiamonDinoia Jul 26, 2024
dc3a628
using floor in fold_rescale; updated changelog
DiamonDinoia Jul 26, 2024
b3237f7
fixed a mistake
DiamonDinoia Jul 26, 2024
db80aad
added comments for review
DiamonDinoia Jul 26, 2024
c225fb5
fixing review comments
DiamonDinoia Jul 31, 2024
394550f
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 31, 2024
5606aa0
merged master
DiamonDinoia Jul 31, 2024
74ccd71
fixed cmake
DiamonDinoia Jul 31, 2024
ee28d05
Gcc-9 fixes; Ker size fixed too
DiamonDinoia Aug 1, 2024
466ddff
windows compatibility tweak; unit testing the 1.25 upsampfact
DiamonDinoia Aug 1, 2024
3f60ca4
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Aug 1, 2024
fb48ff8
added forgotten c++17 flag
DiamonDinoia Aug 1, 2024
8c42061
Merge remote-tracking branch 'flatiron/master' into gpu-type-3
DiamonDinoia Aug 2, 2024
b64f68e
Preliminary type 3 commit. Incomplete setpts but greatly simplifies t…
DiamonDinoia Aug 8, 2024
7c810a5
Merge remote-tracking branch 'flatiron/master' into gpu-type-3
DiamonDinoia Aug 8, 2024
9d44993
testing
DiamonDinoia Aug 13, 2024
074dda5
Adding prephase and deconv with tests
DiamonDinoia Aug 14, 2024
332b5b7
first 3D working version
DiamonDinoia Aug 15, 2024
53a7c63
First working version, Horner breaks
DiamonDinoia Aug 15, 2024
9f517e3
Type 3 working
DiamonDinoia Aug 16, 2024
096cf1e
added 1D & 2D type 3
DiamonDinoia Aug 20, 2024
3cfe406
fixed tests for type3
DiamonDinoia Aug 20, 2024
1842f68
fixed possible memory leaks
DiamonDinoia Aug 21, 2024
c13a6a9
minor changes, mainly for debug
DiamonDinoia Aug 21, 2024
f0a0fa4
small fixes
DiamonDinoia Aug 22, 2024
066906e
adding debug prints
DiamonDinoia Aug 23, 2024
6da956b
testing inner plan2 & using cudamemcpyasync
DiamonDinoia Aug 23, 2024
6098edc
testing the inner type 2 completely
DiamonDinoia Aug 26, 2024
e89a4f9
fixed type 3 without horner
DiamonDinoia Aug 27, 2024
fe1da53
type3 many support
DiamonDinoia Aug 27, 2024
d415f0d
type3 many tests for one target
DiamonDinoia Aug 28, 2024
289fb4f
updated docstring
DiamonDinoia Aug 28, 2024
bca0a73
removed small transf tests
DiamonDinoia Aug 28, 2024
d29cbba
Merge remote-tracking branch 'flatiron/master' into gpu-type-3
DiamonDinoia Aug 28, 2024
71ad464
added extended lambda flag to tests
DiamonDinoia Aug 28, 2024
a494518
CleanUP
DiamonDinoia Aug 28, 2024
5788320
Updated changelog
DiamonDinoia Aug 28, 2024
4c7388e
fixed printf warning
DiamonDinoia Aug 28, 2024
46eb1d4
restored fftw behaviour
DiamonDinoia Aug 28, 2024
0ada7a0
Added devnotes on the issue
DiamonDinoia Aug 28, 2024
671e4ac
removed spurious changes
DiamonDinoia Aug 28, 2024
7a7cff5
Minor cleanup
DiamonDinoia Aug 28, 2024
9b0da66
fixed math test
DiamonDinoia Aug 29, 2024
d3d4d34
Addressed review comments
DiamonDinoia Sep 4, 2024
52cd6cc
Merge remote-tracking branch 'flatiron/master' into gpu-type-3
DiamonDinoia Sep 11, 2024
1355818
splitting onedim_f_series in two functions
DiamonDinoia Sep 11, 2024
bc64a92
GPU flipwind type 1-2; fseries and nuft renaming to match CPU code
DiamonDinoia Sep 12, 2024
96980d3
fixed complex math test
DiamonDinoia Sep 12, 2024
8 changes: 7 additions & 1 deletion CHANGELOG
@@ -2,9 +2,15 @@ List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

Master (9/10/24)

* reduced roundoff error in a[n] phase calc in CPU onedim_fseries_kernel().
#534 (Barnett).
* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517).
- Removed the CPU fseries computation (it was only used for benchmarking and is no longer needed).
- Added complex arithmetic support for the cuda_complex type.
- Added tests for type 3 in 1D, 2D, and 3D, and for cuda_complex arithmetic.
- Minor fixes on the GPU code:
a) removed memory leaks in case of errors
b) renamed maxbatchsize to batchsize

V 2.3.0 (9/5/24)

2 changes: 2 additions & 0 deletions docs/devnotes.rst
@@ -54,6 +54,8 @@ Developer notes

* CMake compiling on linux at Flatiron Institute (Rusty cluster): We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3`` otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.

* Note for nvcc developers: compiling with debug symbols causes a stack overflow that is detected neither at compile time nor at runtime. The problem only manifests for ``ns>=10`` and ``dim=3``; for ``ns<10`` or ``dim<3`` one can use ``-G`` and debug the code with cuda-gdb. The way to avoid it is to not use debug symbols; possibly ``--generate-line-info`` might work instead (not tested). As a side note, compute-sanitizer does not detect the issue.

* Testing cufinufft (for FI, mostly):

.. code-block:: sh
28 changes: 17 additions & 11 deletions include/cufinufft/common.h
@@ -7,31 +7,37 @@
#include <finufft_errors.h>
#include <finufft_spread_opts.h>

#include <complex.h>
#include <complex>

namespace cufinufft {
namespace common {
template<typename T>
__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f,
cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2,
__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a,
T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3,
int ns);
template<typename T>
__global__ void cu_nuft_kernel_compute(int nf1, int nf2, int nf3, T *f, T *z, T *kx,
T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2,
T *fwkerhalf3, int ns);
template<typename T>
int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f,
cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2,
T *d_fwkerhalf3, int ns, cudaStream_t stream);
int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_phase,
T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns,
cudaStream_t stream);
template<typename T>
int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_z, T *d_kx,
T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2,
T *d_fwkerhalf3, int ns, cudaStream_t stream);
template<typename T>
int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts);

void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts,
CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b);

template<typename T>
void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts);
template<typename T>
void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
finufft_spread_opts opts);
template<typename T>
void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
T *fwkerhalf, finufft_spread_opts opts);
void onedim_nuft_kernel_precomp(T *f, T *zout, finufft_spread_opts opts);

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
173 changes: 173 additions & 0 deletions include/cufinufft/contrib/helper_math.h
@@ -0,0 +1,173 @@
#ifndef FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
#define FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H

#include <cuComplex.h>

// This header provides helper functions for the cuComplex types.
// It mainly wraps the existing CUDA implementations as operator overloads,
// e.g. cuCadd, cuCsub, cuCmul, cuCdiv, cuCreal, cuCimag, cuCabs, cuConj are
// all provided by CUDA.

// Addition for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator+(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
return cuCadd(a, b);
}

// Subtraction for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator-(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
return cuCsub(a, b);
}

// Multiplication for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator*(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
return cuCmul(a, b);
}

// Division for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator/(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
return cuCdiv(a, b);
}

// Equality for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ bool operator==(const cuDoubleComplex &a,
const cuDoubleComplex &b) noexcept {
return cuCreal(a) == cuCreal(b) && cuCimag(a) == cuCimag(b);
}

// Inequality for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ bool operator!=(const cuDoubleComplex &a,
const cuDoubleComplex &b) noexcept {
return !(a == b);
}

// Addition for cuDoubleComplex (double) with double
__host__ __device__ __forceinline__ cuDoubleComplex operator+(const cuDoubleComplex &a,
double b) noexcept {
return make_cuDoubleComplex(cuCreal(a) + b, cuCimag(a));
}

__host__ __device__ __forceinline__ cuDoubleComplex operator+(
double a, const cuDoubleComplex &b) noexcept {
return make_cuDoubleComplex(a + cuCreal(b), cuCimag(b));
}

// Subtraction for cuDoubleComplex (double) with double
__host__ __device__ __forceinline__ cuDoubleComplex operator-(const cuDoubleComplex &a,
double b) noexcept {
return make_cuDoubleComplex(cuCreal(a) - b, cuCimag(a));
}

__host__ __device__ __forceinline__ cuDoubleComplex operator-(
double a, const cuDoubleComplex &b) noexcept {
return make_cuDoubleComplex(a - cuCreal(b), -cuCimag(b));
}

// Multiplication for cuDoubleComplex (double) with double
__host__ __device__ __forceinline__ cuDoubleComplex operator*(const cuDoubleComplex &a,
double b) noexcept {
return make_cuDoubleComplex(cuCreal(a) * b, cuCimag(a) * b);
}

__host__ __device__ __forceinline__ cuDoubleComplex operator*(
double a, const cuDoubleComplex &b) noexcept {
return make_cuDoubleComplex(a * cuCreal(b), a * cuCimag(b));
}

// Division for cuDoubleComplex (double) with double
__host__ __device__ __forceinline__ cuDoubleComplex operator/(const cuDoubleComplex &a,
double b) noexcept {
return make_cuDoubleComplex(cuCreal(a) / b, cuCimag(a) / b);
}

__host__ __device__ __forceinline__ cuDoubleComplex operator/(
double a, const cuDoubleComplex &b) noexcept {
double denom = cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b);
return make_cuDoubleComplex((a * cuCreal(b)) / denom, (-a * cuCimag(b)) / denom);
}

// Addition for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ cuFloatComplex operator+(
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
return cuCaddf(a, b);
}

// Subtraction for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ cuFloatComplex operator-(
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
return cuCsubf(a, b);
}

// Multiplication for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ cuFloatComplex operator*(
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
return cuCmulf(a, b);
}

// Division for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ cuFloatComplex operator/(
const cuFloatComplex &a, const cuFloatComplex &b) noexcept {
return cuCdivf(a, b);
}

// Equality for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ bool operator==(const cuFloatComplex &a,
const cuFloatComplex &b) noexcept {
return cuCrealf(a) == cuCrealf(b) && cuCimagf(a) == cuCimagf(b);
}

// Inequality for cuFloatComplex (float) with cuFloatComplex (float)
__host__ __device__ __forceinline__ bool operator!=(const cuFloatComplex &a,
const cuFloatComplex &b) noexcept {
return !(a == b);
}

// Addition for cuFloatComplex (float) with float
__host__ __device__ __forceinline__ cuFloatComplex operator+(const cuFloatComplex &a,
float b) noexcept {
return make_cuFloatComplex(cuCrealf(a) + b, cuCimagf(a));
}

__host__ __device__ __forceinline__ cuFloatComplex operator+(
float a, const cuFloatComplex &b) noexcept {
return make_cuFloatComplex(a + cuCrealf(b), cuCimagf(b));
}

// Subtraction for cuFloatComplex (float) with float
__host__ __device__ __forceinline__ cuFloatComplex operator-(const cuFloatComplex &a,
float b) noexcept {
return make_cuFloatComplex(cuCrealf(a) - b, cuCimagf(a));
}

__host__ __device__ __forceinline__ cuFloatComplex operator-(
float a, const cuFloatComplex &b) noexcept {
return make_cuFloatComplex(a - cuCrealf(b), -cuCimagf(b));
}

// Multiplication for cuFloatComplex (float) with float
__host__ __device__ __forceinline__ cuFloatComplex operator*(const cuFloatComplex &a,
float b) noexcept {
return make_cuFloatComplex(cuCrealf(a) * b, cuCimagf(a) * b);
}

__host__ __device__ __forceinline__ cuFloatComplex operator*(
float a, const cuFloatComplex &b) noexcept {
return make_cuFloatComplex(a * cuCrealf(b), a * cuCimagf(b));
}

// Division for cuFloatComplex (float) with float
__host__ __device__ __forceinline__ cuFloatComplex operator/(const cuFloatComplex &a,
float b) noexcept {
return make_cuFloatComplex(cuCrealf(a) / b, cuCimagf(a) / b);
}

__host__ __device__ __forceinline__ cuFloatComplex operator/(
float a, const cuFloatComplex &b) noexcept {
float denom = cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b);
return make_cuFloatComplex((a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom);
}

#endif // FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
8 changes: 5 additions & 3 deletions include/cufinufft/defs.h
@@ -2,14 +2,16 @@
#define CUFINUFFT_DEFS_H

#include <limits>

// constants needed within common
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
// common
#define MAX_NSPREAD 16
#define MAX_NSPREAD 16

// max number of positive quadr nodes
#define MAX_NQUAD 100
#define MAX_NQUAD 100

// Fractional growth cut-off in utils::arraywidcen; sets when to translate in type-3
#define ARRAYWIDCEN_GROWFRAC 0.1

// FIXME: If cufft ever takes N > INT_MAX...
constexpr int32_t MAX_NF = std::numeric_limits<int32_t>::max();