From 481b70eb9d683d1d1838f8c0504ff070fb0c5b9c Mon Sep 17 00:00:00 2001
From: Marco Barbone
Date: Thu, 12 Sep 2024 13:07:35 -0400
Subject: [PATCH] Gpu type 3 (#517)

* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft.
* Removed the CPU fseries computation (it was only used for benchmarking and is
  no longer needed).
* Added complex arithmetic support for the cuda_complex type.
* Added tests for type 3 in 1D, 2D, and 3D, and for cuda_complex arithmetic.
* Integrated flipwind for types 1-2 on the GPU.
* Minor fixes on the GPU code:
  - removed memory leaks in case of errors
  - renamed maxbatchsize to batchsize
  - renamed the fseries and nuft routines to match the CPU code
---
 CHANGELOG                                 |   8 +-
 docs/devnotes.rst                         |   2 +
 include/cufinufft/common.h                |  28 +-
 include/cufinufft/contrib/helper_math.h   | 173 +++++
 include/cufinufft/defs.h                  |   8 +-
 include/cufinufft/impl.h                  | 779 +++++++++++++++++-----
 include/cufinufft/precision_independent.h |   4 -
 include/cufinufft/spreadinterp.h          |  20 +-
 include/cufinufft/types.h                 |  64 +-
 include/cufinufft/utils.h                 | 159 +++--
 include/cufinufft_opts.h                  |   2 +
 include/finufft_errors.h                  |   3 +-
 matlab/finufft.mw                         |   4 +-
 perftest/cuda/CMakeLists.txt              |   2 +
 src/cuda/1d/cufinufft1d.cu                |  91 ++-
 src/cuda/2d/cufinufft2d.cu                |  85 ++-
 src/cuda/2d/spread2d_wrapper.cu           |   5 +-
 src/cuda/2d/spreadinterp2d.cuh            |   4 +-
 src/cuda/3d/cufinufft3d.cu                |  92 ++-
 src/cuda/3d/interp3d_wrapper.cu           |  11 +-
 src/cuda/3d/spread3d_wrapper.cu           |  11 +-
 src/cuda/3d/spreadinterp3d.cuh            |  18 +-
 src/cuda/CMakeLists.txt                   |  20 +-
 src/cuda/common.cu                        | 255 +++----
 src/cuda/cufinufft.cu                     |   3 +-
 src/cuda/deconvolve_wrapper.cu            |   6 +-
 src/cuda/memtransfer_wrapper.cu           |  56 +-
 src/cuda/precision_independent.cu         |  14 -
 src/cuda/spreadinterp.cpp                 |   6 +-
 src/finufft.cpp                           |  26 +-
 test/cuda/CMakeLists.txt                  |  48 +-
 test/cuda/cufinufft1d_test.cu             |  45 +-
 test/cuda/cufinufft2d_test.cu             |  38 +-
 test/cuda/cufinufft2dmany_test.cu         |  43 +-
 test/cuda/cufinufft3d_test.cu             |  40 +-
 test/cuda/cufinufft_math_test.cu          | 137 ++++
 test/cuda/fseries_kernel_test.cu          | 158 -----
 test/cuda/fseriesperf.sh                  |  29 -
 38 files changed, 1764 insertions(+), 733 deletions(-)
 create mode 100644 include/cufinufft/contrib/helper_math.h
 create mode 100644 test/cuda/cufinufft_math_test.cu
 delete mode 100644 test/cuda/fseries_kernel_test.cu
 delete mode 100755 test/cuda/fseriesperf.sh

diff --git a/CHANGELOG b/CHANGELOG
index 6b34c4b46..fe852d418 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,9 +2,15 @@ List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
 Master (9/10/24)
-
 * reduced roundoff error in a[n] phase calc in CPU onedim_fseries_kernel(). #534 (Barnett).
+* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517).
+  - Removed the CPU fseries computation (it was only used for benchmarking and
+    is no longer needed).
+  - Added complex arithmetic support for the cuda_complex type.
+  - Added tests for type 3 in 1D, 2D, and 3D, and for cuda_complex arithmetic.
+  - Minor fixes on the GPU code:
+    a) removed memory leaks in case of errors
+    b) renamed maxbatchsize to batchsize
 
 V 2.3.0 (9/5/24)
 
diff --git a/docs/devnotes.rst b/docs/devnotes.rst
index adbaddc30..733d38302 100644
--- a/docs/devnotes.rst
+++ b/docs/devnotes.rst
@@ -54,6 +54,8 @@ Developer notes
 * CMake compiling on linux at Flatiron Institute (Rusty cluster): We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3`` otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.
 
+* Note to nvcc developers: 
nvcc with debug symbols (``-G``) causes a stack overflow that goes undetected at both compile time and run time. The overflow only manifests for ns>=10 and dim=3; for ns<10 or dim<3 one can use ``-G`` and debug the code with cuda-gdb. The way to avoid it is to not build with debug symbols; using ``--generate-line-info`` instead might work (not tested). As a side note, compute-sanitizer does not detect the issue.
+
 * Testing cufinufft (for FI, mostly):
 
   .. code-block:: sh
 
diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h
index e1149df1f..5c11e8815 100644
--- a/include/cufinufft/common.h
+++ b/include/cufinufft/common.h
@@ -7,31 +7,37 @@
 #include
 #include
 
-#include
+#include
 
 namespace cufinufft {
 namespace common {
 template<typename T>
-__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f,
-                                       cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2,
+__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a,
+                                       T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3,
+                                       int ns);
+template<typename T>
+__global__ void cu_nuft_kernel_compute(int nf1, int nf2, int nf3, T *f, T *z, T *kx,
+                                       T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2,
                                        T *fwkerhalf3, int ns);
 template<typename T>
-int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f,
-                           cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2,
-                           T *d_fwkerhalf3, int ns, cudaStream_t stream);
+int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_phase,
+                           T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns,
+                           cudaStream_t stream);
+template<typename T>
+int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_z, T *d_kx,
+                        T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2,
+                        T *d_fwkerhalf3, int ns, cudaStream_t stream);
 template<typename T>
 int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts);
 
 void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts,
                    CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b);
+
 template<typename T>
-void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts);
-template<typename T>
-void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
+void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
                                    finufft_spread_opts opts);
 template<typename T>
-void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
-                                   T *fwkerhalf, finufft_spread_opts opts);
+void onedim_nuft_kernel_precomp(T *f, T *zout, finufft_spread_opts opts);
 
 template<typename T>
 std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
diff --git a/include/cufinufft/contrib/helper_math.h b/include/cufinufft/contrib/helper_math.h
new file mode 100644
index 000000000..119aca6b6
--- /dev/null
+++ b/include/cufinufft/contrib/helper_math.h
@@ -0,0 +1,173 @@
+#ifndef FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
+#define FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H
+
+#include <cuComplex.h>
+
+// This header provides some helper functions for cuComplex types.
+// It mainly wraps existing CUDA implementations to provide operator overloads,
+// e.g. 
cuAdd, cuSub, cuMul, cuDiv, cuCreal, cuCimag, cuCabs, cuCarg, cuConj are all +// provided by CUDA + +// Addition for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ cuDoubleComplex operator+( + const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { + return cuCadd(a, b); +} + +// Subtraction for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ cuDoubleComplex operator-( + const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { + return cuCsub(a, b); +} + +// Multiplication for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ cuDoubleComplex operator*( + const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { + return cuCmul(a, b); +} + +// Division for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ cuDoubleComplex operator/( + const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept { + return cuCdiv(a, b); +} + +// Equality for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ bool operator==(const cuDoubleComplex &a, + const cuDoubleComplex &b) noexcept { + return cuCreal(a) == cuCreal(b) && cuCimag(a) == cuCimag(b); +} + +// Inequality for cuDoubleComplex (double) with cuDoubleComplex (double) +__host__ __device__ __forceinline__ bool operator!=(const cuDoubleComplex &a, + const cuDoubleComplex &b) noexcept { + return !(a == b); +} + +// Addition for cuDoubleComplex (double) with double +__host__ __device__ __forceinline__ cuDoubleComplex operator+(const cuDoubleComplex &a, + double b) noexcept { + return make_cuDoubleComplex(cuCreal(a) + b, cuCimag(a)); +} + +__host__ __device__ __forceinline__ cuDoubleComplex operator+( + double a, const cuDoubleComplex &b) noexcept { + return make_cuDoubleComplex(a + cuCreal(b), cuCimag(b)); +} + +// Subtraction for cuDoubleComplex (double) with double +__host__ __device__ __forceinline__ cuDoubleComplex operator-(const cuDoubleComplex &a, + double b) noexcept { + return make_cuDoubleComplex(cuCreal(a) - b, cuCimag(a)); +} + +__host__ __device__ __forceinline__ cuDoubleComplex operator-( + double a, const cuDoubleComplex &b) noexcept { + return make_cuDoubleComplex(a - cuCreal(b), -cuCimag(b)); +} + +// Multiplication for cuDoubleComplex (double) with double +__host__ __device__ __forceinline__ cuDoubleComplex operator*(const cuDoubleComplex &a, + double b) noexcept { + return make_cuDoubleComplex(cuCreal(a) * b, cuCimag(a) * b); +} + +__host__ __device__ __forceinline__ cuDoubleComplex operator*( + double a, const cuDoubleComplex &b) noexcept { + return make_cuDoubleComplex(a * cuCreal(b), a * cuCimag(b)); +} + +// Division for cuDoubleComplex (double) with double +__host__ __device__ __forceinline__ cuDoubleComplex operator/(const cuDoubleComplex &a, + double b) noexcept { + return make_cuDoubleComplex(cuCreal(a) / b, cuCimag(a) / b); +} + +__host__ __device__ __forceinline__ cuDoubleComplex operator/( + double a, const cuDoubleComplex &b) noexcept { + double denom = cuCreal(b) * cuCreal(b) + cuCimag(b) * cuCimag(b); + return make_cuDoubleComplex((a * cuCreal(b)) / denom, (-a * cuCimag(b)) / denom); +} + +// Addition for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ cuFloatComplex operator+( + const cuFloatComplex &a, const cuFloatComplex &b) noexcept { + return cuCaddf(a, b); +} + +// Subtraction for cuFloatComplex (float) with cuFloatComplex (float) +__host__ 
__device__ __forceinline__ cuFloatComplex operator-( + const cuFloatComplex &a, const cuFloatComplex &b) noexcept { + return cuCsubf(a, b); +} + +// Multiplication for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ cuFloatComplex operator*( + const cuFloatComplex &a, const cuFloatComplex &b) noexcept { + return cuCmulf(a, b); +} + +// Division for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ cuFloatComplex operator/( + const cuFloatComplex &a, const cuFloatComplex &b) noexcept { + return cuCdivf(a, b); +} + +// Equality for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ bool operator==(const cuFloatComplex &a, + const cuFloatComplex &b) noexcept { + return cuCrealf(a) == cuCrealf(b) && cuCimagf(a) == cuCimagf(b); +} + +// Inequality for cuFloatComplex (float) with cuFloatComplex (float) +__host__ __device__ __forceinline__ bool operator!=(const cuFloatComplex &a, + const cuFloatComplex &b) noexcept { + return !(a == b); +} + +// Addition for cuFloatComplex (float) with float +__host__ __device__ __forceinline__ cuFloatComplex operator+(const cuFloatComplex &a, + float b) noexcept { + return make_cuFloatComplex(cuCrealf(a) + b, cuCimagf(a)); +} + +__host__ __device__ __forceinline__ cuFloatComplex operator+( + float a, const cuFloatComplex &b) noexcept { + return make_cuFloatComplex(a + cuCrealf(b), cuCimagf(b)); +} + +// Subtraction for cuFloatComplex (float) with float +__host__ __device__ __forceinline__ cuFloatComplex operator-(const cuFloatComplex &a, + float b) noexcept { + return make_cuFloatComplex(cuCrealf(a) - b, cuCimagf(a)); +} + +__host__ __device__ __forceinline__ cuFloatComplex operator-( + float a, const cuFloatComplex &b) noexcept { + return make_cuFloatComplex(a - cuCrealf(b), -cuCimagf(b)); +} + +// Multiplication for cuFloatComplex (float) with float +__host__ __device__ __forceinline__ cuFloatComplex operator*(const cuFloatComplex &a, + float b) noexcept { + return make_cuFloatComplex(cuCrealf(a) * b, cuCimagf(a) * b); +} + +__host__ __device__ __forceinline__ cuFloatComplex operator*( + float a, const cuFloatComplex &b) noexcept { + return make_cuFloatComplex(a * cuCrealf(b), a * cuCimagf(b)); +} + +// Division for cuFloatComplex (float) with float +__host__ __device__ __forceinline__ cuFloatComplex operator/(const cuFloatComplex &a, + float b) noexcept { + return make_cuFloatComplex(cuCrealf(a) / b, cuCimagf(a) / b); +} + +__host__ __device__ __forceinline__ cuFloatComplex operator/( + float a, const cuFloatComplex &b) noexcept { + float denom = cuCrealf(b) * cuCrealf(b) + cuCimagf(b) * cuCimagf(b); + return make_cuFloatComplex((a * cuCrealf(b)) / denom, (-a * cuCimagf(b)) / denom); +} + +#endif // FINUFFT_INCLUDE_CUFINUFFT_CONTRIB_HELPER_MATH_H diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h index 6b2a075ea..630989a26 100644 --- a/include/cufinufft/defs.h +++ b/include/cufinufft/defs.h @@ -2,14 +2,16 @@ #define CUFINUFFT_DEFS_H #include - // constants needed within common // upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for // common -#define MAX_NSPREAD 16 +#define MAX_NSPREAD 16 // max number of positive quadr nodes -#define MAX_NQUAD 100 +#define MAX_NQUAD 100 + +// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 +#define ARRAYWIDCEN_GROWFRAC 0.1 // FIXME: If cufft ever takes N > INT_MAX... 
constexpr int32_t MAX_NF = std::numeric_limits<int32_t>::max();
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 3a9fd6877..0913a404e 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -4,9 +4,9 @@
 #include
 #include
+#include
 #include
-#include
 #include
 #include
 #include
@@ -14,6 +14,7 @@
 #include
 #include
+#include
 
 // 1d
 template<typename T>
 int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
@@ -22,7 +23,9 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 template<typename T>
 int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
                       cufinufft_plan_t<T> *d_plan);
-
+template<typename T>
+int cufinufft1d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
 // 2d
 template<typename T>
 int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
@@ -31,6 +34,9 @@ template<typename T>
 int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
                       cufinufft_plan_t<T> *d_plan);
 
+template<typename T>
+int cufinufft2d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
 // 3d
 template<typename T>
 int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
@@ -38,6 +44,9 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 template<typename T>
 int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
                       cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cufinufft3d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
 
 template<typename T>
 int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol,
@@ -62,11 +71,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
    Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21.
    Marco Barbone 07/26/24. Using SM when shared memory available is enough.
 */
+  using namespace cufinufft::common;
   int ier;
-  cuDoubleComplex *d_a = nullptr; // fseries temp data
-  T *d_f               = nullptr; // fseries temp data
-
-  if (type < 1 || type > 2) {
+  if (type < 1 || type > 3) {
     fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
     return FINUFFT_ERR_TYPE_NOTVALID;
   }
@@ -76,205 +83,253 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     return FINUFFT_ERR_NTRANS_NOTVALID;
   }
 
-  // Mult-GPU support: set the CUDA Device ID:
-  const int device_id = opts == nullptr ? 0 : opts->gpu_device_id;
-  cufinufft::utils::WithCudaDevice device_swapper(device_id);
-
   /* allocate the plan structure, assign address to user pointer. */
   auto *d_plan = new cufinufft_plan_t<T>;
-  *d_plan_ptr = d_plan;
-  // Zero out your struct, (sets all pointers to NULL)
   memset(d_plan, 0, sizeof(*d_plan));
+  *d_plan_ptr = d_plan;
+
+  // Zero out your struct (sets all pointers to NULL).
+  // Set nf1, nf2, nf3 to 1 for type 3; type 1 and type 2 will overwrite this.
+  d_plan->nf1 = 1;
+  d_plan->nf2 = 1;
+  d_plan->nf3 = 1;
+  d_plan->tol = tol;
   /* If a user has not supplied their own options, assign defaults for them. */
   if (opts == nullptr) { // use default opts
     cufinufft_default_opts(&(d_plan->opts));
   } else { // or read from what's passed in
     d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
   }
-
-  // cudaMallocAsync isn't supported for all devices, regardless of cuda version. Check
-  // for support
-  cudaDeviceGetAttribute(&d_plan->supports_pools, cudaDevAttrMemoryPoolsSupported,
-                         device_id);
-  static bool warned = false;
-  if (!warned && !d_plan->supports_pools && d_plan->opts.gpu_stream != nullptr) {
-    fprintf(stderr,
-            "[cufinufft] Warning: cudaMallocAsync not supported on this device. 
Use of " - "CUDA streams may not perform optimally.\n"); - warned = true; + d_plan->dim = dim; + d_plan->opts.gpu_maxbatchsize = std::max(d_plan->opts.gpu_maxbatchsize, 1); + + if (type != 3) { + d_plan->ms = nmodes[0]; + d_plan->mt = nmodes[1]; + d_plan->mu = nmodes[2]; + printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu); + } else { + d_plan->opts.gpu_spreadinterponly = 1; } - auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; - using namespace cufinufft::common; - /* Setup Spreader */ + int fftsign = (iflag >= 0) ? 1 : -1; + d_plan->iflag = fftsign; + d_plan->ntransf = ntransf; - // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK - if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { - delete *d_plan_ptr; - *d_plan_ptr = nullptr; - return ier; - } + int batchsize = (opts != nullptr) ? opts->gpu_maxbatchsize : 0; + // TODO: check if this is the right heuristic + if (batchsize == 0) // implies: use a heuristic. + batchsize = std::min(ntransf, 8); // heuristic from test codes + d_plan->batchsize = batchsize; - d_plan->dim = dim; - d_plan->ms = nmodes[0]; - d_plan->mt = nmodes[1]; - d_plan->mu = nmodes[2]; + const auto stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; - cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); - RETURN_IF_CUDA_ERROR - - CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; - set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, - d_plan->opts.gpu_obinsizex); - if (dim > 1) - set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, - d_plan->opts.gpu_obinsizey); - if (dim > 2) - set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, - d_plan->opts.gpu_obinsizez); - - // dynamically request the maximum amount of shared memory available - // for the spreader - - /* Automatically set GPU method. */ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster - * if there is enough shared memory available. Otherwise, we default to GM. - * - * For type 2, we always default to method 1 (GM). - */ - if (type == 2) { - d_plan->opts.gpu_method = 1; - } else { - // query the device for the amount of shared memory available - int shared_mem_per_block{}; - cudaDeviceGetAttribute(&shared_mem_per_block, - cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - RETURN_IF_CUDA_ERROR - // compute the amount of shared memory required for the method - const auto shared_mem_required = shared_memory_required( - dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - if ((shared_mem_required > shared_mem_per_block)) { - d_plan->opts.gpu_method = 1; - } else { - d_plan->opts.gpu_method = 2; - } + // Mult-GPU support: set the CUDA Device ID: + const int device_id = d_plan->opts.gpu_device_id; + const cufinufft::utils::WithCudaDevice FromID{device_id}; + + // cudaMallocAsync isn't supported for all devices, regardless of cuda version. Check + // for support + { + cudaDeviceGetAttribute(&d_plan->supports_pools, cudaDevAttrMemoryPoolsSupported, + device_id); + static bool warned = false; + if (!warned && !d_plan->supports_pools && d_plan->opts.gpu_stream != nullptr) { + fprintf(stderr, + "[cufinufft] Warning: cudaMallocAsync not supported on this device. Use of " + "CUDA streams may not perform optimally.\n"); + warned = true; } } - int fftsign = (iflag >= 0) ? 
1 : -1; - - d_plan->nf1 = nf1; - d_plan->nf2 = nf2; - d_plan->nf3 = nf3; - d_plan->iflag = fftsign; - d_plan->ntransf = ntransf; - int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; - if (maxbatchsize == 0) // implies: use a heuristic. - maxbatchsize = std::min(ntransf, 8); // heuristic from test codes - d_plan->maxbatchsize = maxbatchsize; - d_plan->type = type; - - if (d_plan->type == 1) d_plan->spopts.spread_direction = 1; - if (d_plan->type == 2) d_plan->spopts.spread_direction = 2; - - using namespace cufinufft::memtransfer; - switch (d_plan->dim) { - case 1: { - if ((ier = allocgpumem1d_plan(d_plan))) goto finalize; - } break; - case 2: { - if ((ier = allocgpumem2d_plan(d_plan))) goto finalize; - } break; - case 3: { - if ((ier = allocgpumem3d_plan(d_plan))) goto finalize; - } break; + // simple check to use upsampfac=1.25 if tol is big + // FIXME: since cufft is really fast we should use 1.25 only if we run out of vram + if (d_plan->opts.upsampfac == 0.0) { // indicates auto-choose + d_plan->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (T)1E-9 && type == 3) { // the tol sigma=5/4 can reach + d_plan->opts.upsampfac = 1.25; + } + if (d_plan->opts.debug) { + printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac); + } } - cufftHandle fftplan; - cufftResult_t cufft_status; - switch (d_plan->dim) { - case 1: { - int n[] = {(int)nf1}; - int inembed[] = {(int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, - inembed[0], cufft_type(), maxbatchsize); - } break; - case 2: { - int n[] = {(int)nf2, (int)nf1}; - int inembed[] = {(int)nf2, (int)nf1}; + /* Setup Spreader */ + if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { + // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK + goto finalize; + } - cufft_status = - cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, - inembed[0] * inembed[1], cufft_type(), maxbatchsize); - } break; - case 3: { - int n[] = {(int)nf3, (int)nf2, (int)nf1}; - int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; + d_plan->type = type; + d_plan->spopts.spread_direction = d_plan->type; - cufft_status = cufftPlanMany( - &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); - } break; + if (d_plan->opts.debug) { + // print the spreader options + printf("[cufinufft] spreader options:\n"); + printf("[cufinufft] nspread: %d\n", d_plan->spopts.nspread); } - if (cufft_status != CUFFT_SUCCESS) { - fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, - cufftGetErrorString(cufft_status)); - ier = FINUFFT_ERR_CUDA_FAILURE; + cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); + if (ier = cudaGetLastError(), ier != cudaSuccess) { goto finalize; } - cufftSetStream(fftplan, stream); - - d_plan->fftplan = fftplan; - { - std::complex *a = d_plan->fseries_precomp_a; - T *f = d_plan->fseries_precomp_f; + if (d_plan->opts.debug) { + printf("[cufinufft] bin size x: %d", d_plan->opts.gpu_binsizex); + if (dim > 1) printf(" bin size y: %d", d_plan->opts.gpu_binsizey); + if (dim > 2) printf(" bin size z: %d", d_plan->opts.gpu_binsizez); + printf("\n"); + // shared memory required for the spreader vs available shared memory + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, + device_id); + const auto mem_required = + shared_memory_required(dim, 
d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required); + } - onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); + if (type == 1 || type == 2) { + CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; + set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, + d_plan->opts.gpu_obinsizex); if (dim > 1) - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); + set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, + d_plan->opts.gpu_obinsizey); if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, - d_plan->spopts); + set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, + d_plan->opts.gpu_obinsizez); + + // dynamically request the maximum amount of shared memory available + // for the spreader + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster + * if there is enough shared memory available. Otherwise, we default to GM. + * + * For type 2, we always default to method 1 (GM). + */ + if (type == 2) { + d_plan->opts.gpu_method = 1; + } else { + // query the device for the amount of shared memory available + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + // compute the amount of shared memory required for the method + const auto shared_mem_required = shared_memory_required( + dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + } else { + d_plan->opts.gpu_method = 2; + } + } + } - if ((ier = checkCudaErrors( - cudaMallocWrapper(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream, - d_plan->supports_pools)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocWrapper(&d_f, dim * MAX_NQUAD * sizeof(T), - stream, d_plan->supports_pools)))) - goto finalize; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), - cudaMemcpyHostToDevice, stream)))) + if ((ier = cudaGetLastError())) { goto finalize; - if ((ier = checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), - cudaMemcpyHostToDevice, stream)))) + } + + d_plan->nf1 = nf1; + d_plan->nf2 = nf2; + d_plan->nf3 = nf3; + d_plan->nf = nf1 * nf2 * nf3; + if (d_plan->opts.debug) { + printf("[cufinufft] (nf1,nf2,nf3) = (%d, %d, %d)\n", d_plan->nf1, d_plan->nf2, + d_plan->nf3); + } + + using namespace cufinufft::memtransfer; + switch (d_plan->dim) { + case 1: { + if ((ier = allocgpumem1d_plan(d_plan))) goto finalize; + } break; + case 2: { + if ((ier = allocgpumem2d_plan(d_plan))) goto finalize; + } break; + case 3: { + if ((ier = allocgpumem3d_plan(d_plan))) goto finalize; + } break; + } + + cufftHandle fftplan; + cufftResult_t cufft_status; + switch (d_plan->dim) { + case 1: { + int n[] = {(int)nf1}; + int inembed[] = {(int)nf1}; + + cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, + inembed[0], cufft_type(), batchsize); + } break; + case 2: { + int n[] = {(int)nf2, (int)nf1}; + int inembed[] = {(int)nf2, (int)nf1}; + + cufft_status = + cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, + inembed[0] * inembed[1], cufft_type(), batchsize); + } break; + case 3: { + int n[] = {(int)nf3, 
(int)nf2, (int)nf1}; + int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; + + cufft_status = cufftPlanMany( + &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, + inembed[0] * inembed[1] * inembed[2], cufft_type(), batchsize); + } break; + } + + if (cufft_status != CUFFT_SUCCESS) { + fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, + cufftGetErrorString(cufft_status)); + ier = FINUFFT_ERR_CUDA_FAILURE; goto finalize; - if ((ier = cufserieskernelcompute( - d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2, - d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) + } + cufftSetStream(fftplan, stream); + + d_plan->fftplan = fftplan; + + // compute up to 3 * NQUAD precomputed values on CPU + T fseries_precomp_phase[3 * MAX_NQUAD]; + T fseries_precomp_f[3 * MAX_NQUAD]; + thrust::device_vector d_fseries_precomp_phase(3 * MAX_NQUAD); + thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); + onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, + fseries_precomp_phase, d_plan->spopts); + if (d_plan->dim > 1) + onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, + fseries_precomp_phase + MAX_NQUAD, d_plan->spopts); + if (d_plan->dim > 2) + onedim_fseries_kernel_precomp(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, + fseries_precomp_phase + 2 * MAX_NQUAD, + d_plan->spopts); + // copy the precomputed data to the device using thrust + thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD, + d_fseries_precomp_phase.begin()); + thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, + d_fseries_precomp_f.begin()); + // the full fseries is done on the GPU here + if ((ier = fseries_kernel_compute( + d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, + d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, + d_plan->spopts.nspread, stream))) goto finalize; } - finalize: - cudaFreeWrapper(d_a, stream, d_plan->supports_pools); - cudaFreeWrapper(d_f, stream, d_plan->supports_pools); - if (ier > 1) { - delete *d_plan_ptr; + cufinufft_destroy_impl(*d_plan_ptr); *d_plan_ptr = nullptr; } - return ier; } template -int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, - cufinufft_plan_t *d_plan) +int cufinufft_setpts_12_impl(int M, T *d_kx, T *d_ky, T *d_kz, + cufinufft_plan_t *d_plan) /* "setNUpts" stage (in single or double precision). @@ -312,7 +367,7 @@ Notes: the type T means either single or double, matching the Melody Shih 07/25/19; Barnett 2/16/21 moved out docs. */ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + const cufinufft::utils::WithCudaDevice FromID(d_plan->opts.gpu_device_id); int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; @@ -378,9 +433,381 @@ Notes: the type T means either single or double, matching the } break; } + if (d_plan->opts.debug) { + printf("[cufinufft] plan->M=%d\n", M); + } return ier; } +template +int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, + cufinufft_plan_t *d_plan) { + // type 1 and type 2 setpts + if (d_plan->type == 1 || d_plan->type == 2) { + return cufinufft_setpts_12_impl(M, d_kx, d_ky, d_kz, d_plan); + } + // type 3 setpts + + // This code follows the same implementation of the CPU code in finufft and uses similar + // variables names where possible. However, the use of GPU routines and paradigms make + // it harder to follow. 
To understand the code, it is recommended to read the CPU code + // first. + + if (d_plan->type != 3) { + fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, + d_plan->type); + return FINUFFT_ERR_TYPE_NOTVALID; + } + if (N < 0) { + fprintf(stderr, "[cufinufft] Invalid N (%d): cannot be negative.\n", N); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + if (N > MAX_NF) { + fprintf(stderr, "[cufinufft] Invalid N (%d): cannot be greater than %d.\n", N, + MAX_NF); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + const auto stream = d_plan->stream; + d_plan->N = N; + if (d_plan->dim > 0 && d_s == nullptr) { + fprintf(stderr, "[%s] Error: d_s is nullptr but dim > 0.\n", __func__); + return FINUFFT_ERR_INVALID_ARGUMENT; + } + d_plan->d_Sp = d_plan->dim > 0 ? d_s : nullptr; + + if (d_plan->dim > 1 && d_t == nullptr) { + fprintf(stderr, "[%s] Error: d_t is nullptr but dim > 1.\n", __func__); + return FINUFFT_ERR_INVALID_ARGUMENT; + } + d_plan->d_Tp = d_plan->dim > 1 ? d_t : nullptr; + + if (d_plan->dim > 2 && d_u == nullptr) { + fprintf(stderr, "[%s] Error: d_u is nullptr but dim > 2.\n", __func__); + return FINUFFT_ERR_INVALID_ARGUMENT; + } + d_plan->d_Up = d_plan->dim > 2 ? d_u : nullptr; + + const auto dim = d_plan->dim; + // no need to set the params to zero, as they are already zeroed out in the plan + // memset(d_plan->type3_params, 0, sizeof(d_plan->type3_params)); + using namespace cufinufft::utils; + if (d_plan->dim > 0) { + const auto [x1, c1] = arraywidcen(M, d_kx, stream); + d_plan->type3_params.X1 = x1; + d_plan->type3_params.C1 = c1; + const auto [S1, D1] = arraywidcen(N, d_s, stream); + const auto [nf1, h1, gam1] = set_nhg_type3(S1, x1, d_plan->opts, d_plan->spopts); + d_plan->nf1 = nf1; + d_plan->type3_params.S1 = S1; + d_plan->type3_params.D1 = D1; + d_plan->type3_params.h1 = h1; + d_plan->type3_params.gam1 = gam1; + } + if (d_plan->dim > 1) { + const auto [x2, c2] = arraywidcen(M, d_ky, stream); + d_plan->type3_params.X2 = x2; + d_plan->type3_params.C2 = c2; + const auto [S2, D2] = arraywidcen(N, d_t, stream); + const auto [nf2, h2, gam2] = set_nhg_type3(S2, x2, d_plan->opts, d_plan->spopts); + d_plan->nf2 = nf2; + d_plan->type3_params.S2 = S2; + d_plan->type3_params.D2 = D2; + d_plan->type3_params.h2 = h2; + d_plan->type3_params.gam2 = gam2; + } + if (d_plan->dim > 2) { + const auto [x3, c3] = arraywidcen(M, d_kz, stream); + d_plan->type3_params.X3 = x3; + d_plan->type3_params.C3 = c3; + const auto [S3, D3] = arraywidcen(N, d_u, stream); + const auto [nf3, h3, gam3] = set_nhg_type3(S3, x3, d_plan->opts, d_plan->spopts); + d_plan->nf3 = nf3; + d_plan->type3_params.S3 = S3; + d_plan->type3_params.D3 = D3; + d_plan->type3_params.h3 = h3; + d_plan->type3_params.gam3 = gam3; + } + if (d_plan->opts.debug) { + printf("[%s]", __func__); + printf("\tM=%d N=%d\n", M, N); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%d h1=%.3g\t\n", + d_plan->type3_params.X1, d_plan->type3_params.C1, d_plan->type3_params.S1, + d_plan->type3_params.D1, d_plan->type3_params.gam1, d_plan->nf1, + d_plan->type3_params.h1); + if (d_plan->dim > 1) { + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%d h2=%.3g\n", + d_plan->type3_params.X2, d_plan->type3_params.C2, d_plan->type3_params.S2, + d_plan->type3_params.D2, d_plan->type3_params.gam2, d_plan->nf2, + d_plan->type3_params.h2); + } + if (d_plan->dim > 2) { + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%d h3=%.3g\n", + d_plan->type3_params.X3, d_plan->type3_params.C3, d_plan->type3_params.S3, + 
d_plan->type3_params.D3, d_plan->type3_params.gam3, d_plan->nf3, + d_plan->type3_params.h3); + } + } + d_plan->nf = d_plan->nf1 * d_plan->nf2 * d_plan->nf3; + + // FIXME: MAX_NF might be too small... + if (d_plan->nf * d_plan->opts.gpu_maxbatchsize > MAX_NF) { + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + return FINUFFT_ERR_MAXNALLOC; + } + + // A macro might be better as it has access to __line__ and __func__ + const auto checked_free = [stream, pool = d_plan->supports_pools](auto x) constexpr { + if (!x) return cudaFreeWrapper(x, stream, pool); + return cudaSuccess; + }; + const auto checked_realloc = [checked_free, pool = d_plan->supports_pools, stream]( + auto &x, const auto size) constexpr { + if (auto ier = checked_free(x); ier != cudaSuccess) return ier; + return cudaMallocWrapper(&x, size, stream, pool); + }; + // FIXME: check the size of the allocs for the batch interface + if (checked_realloc(d_plan->fw, sizeof(cuda_complex) * d_plan->nf * + d_plan->batchsize) != cudaSuccess) + goto finalize; + if (checked_realloc(d_plan->CpBatch, sizeof(cuda_complex) * M * d_plan->batchsize) != + cudaSuccess) + goto finalize; + if (checked_realloc(d_plan->kx, sizeof(T) * M) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_Sp, sizeof(T) * N) != cudaSuccess) goto finalize; + if (d_plan->dim > 1) { + if (checked_realloc(d_plan->ky, sizeof(T) * M) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_Tp, sizeof(T) * N) != cudaSuccess) goto finalize; + } + if (d_plan->dim > 2) { + if (checked_realloc(d_plan->kz, sizeof(T) * M) != cudaSuccess) goto finalize; + if (checked_realloc(d_plan->d_Up, sizeof(T) * N) != cudaSuccess) goto finalize; + } + if (checked_realloc(d_plan->prephase, sizeof(cuda_complex) * M) != cudaSuccess) + goto finalize; + if (checked_realloc(d_plan->deconv, sizeof(cuda_complex) * N) != cudaSuccess) + goto finalize; + + // NOTE: init-captures are not allowed for extended __host__ __device__ lambdas + + if (d_plan->dim > 0) { + const auto ig1 = T(1) / d_plan->type3_params.gam1; + const auto C1 = -d_plan->type3_params.C1; + thrust::transform( + thrust::cuda::par.on(stream), d_kx, d_kx + M, d_plan->kx, + [ig1, C1] __host__ __device__(const T x) -> T { return (x + C1) * ig1; }); + } + if (d_plan->dim > 1) { + const auto ig2 = T(1) / d_plan->type3_params.gam2; + const auto C2 = -d_plan->type3_params.C2; + thrust::transform( + thrust::cuda::par.on(stream), d_ky, d_ky + M, d_plan->ky, + [ig2, C2] __host__ __device__(const T x) -> T { return (x + C2) * ig2; }); + } + if (d_plan->dim > 2) { + const auto ig3 = T(1) / d_plan->type3_params.gam3; + const auto C3 = -d_plan->type3_params.C3; + thrust::transform( + thrust::cuda::par.on(stream), d_kz, d_kz + M, d_plan->kz, + [ig3, C3] __host__ __device__(const T x) -> T { return (x + C3) * ig3; }); + } + if (d_plan->type3_params.D1 != 0 || d_plan->type3_params.D2 != 0 || + d_plan->type3_params.D3 != 0) { + // if ky is null, use kx for ky and kz + // this is not the most efficient implementation, but it is the most compact + const auto iterator = + thrust::make_zip_iterator(thrust::make_tuple(d_kx, + // to avoid out of bounds access, use + // kx if ky is null + (d_plan->dim > 1) ? d_ky : d_kx, + // same idea as above + (d_plan->dim > 2) ? 
d_kz : d_kx)); + const auto D1 = d_plan->type3_params.D1; + const auto D2 = d_plan->type3_params.D2; // this should be 0 if dim < 2 + const auto D3 = d_plan->type3_params.D3; // this should be 0 if dim < 3 + const auto realsign = d_plan->iflag >= 0 ? T(1) : T(-1); + thrust::transform( + thrust::cuda::par.on(stream), iterator, iterator + M, d_plan->prephase, + [D1, D2, D3, realsign] __host__ __device__( + const thrust::tuple &tuple) -> cuda_complex { + const auto x = thrust::get<0>(tuple); + const auto y = thrust::get<1>(tuple); + const auto z = thrust::get<2>(tuple); + // no branching because D2 and D3 are 0 if dim < 2 and dim < 3 + // this is generally faster on GPU + const auto phase = D1 * x + D2 * y + D3 * z; + // TODO: nvcc should have the sincos function + // check the cos + i*sin + // ref: https://en.wikipedia.org/wiki/Cis_(mathematics) + return cuda_complex{std::cos(phase), std::sin(phase) * realsign}; + }); + } else { + thrust::fill(thrust::cuda::par.on(stream), d_plan->prephase, d_plan->prephase + M, + cuda_complex{1, 0}); + } + + if (d_plan->dim > 0) { + const auto scale = d_plan->type3_params.h1 * d_plan->type3_params.gam1; + const auto D1 = -d_plan->type3_params.D1; + thrust::transform( + thrust::cuda::par.on(stream), d_s, d_s + N, d_plan->d_Sp, + [scale, D1] __host__ __device__(const T s) -> T { return scale * (s + D1); }); + } + if (d_plan->dim > 1) { + const auto scale = d_plan->type3_params.h2 * d_plan->type3_params.gam2; + const auto D2 = -d_plan->type3_params.D2; + thrust::transform( + thrust::cuda::par.on(stream), d_t, d_t + N, d_plan->d_Tp, + [scale, D2] __host__ __device__(const T t) -> T { return scale * (t + D2); }); + } + if (d_plan->dim > 2) { + const auto scale = d_plan->type3_params.h3 * d_plan->type3_params.gam3; + const auto D3 = -d_plan->type3_params.D3; + thrust::transform( + thrust::cuda::par.on(stream), d_u, d_u + N, d_plan->d_Up, + [scale, D3] __host__ __device__(const T u) -> T { return scale * (u + D3); }); + } + { // here we declare phi_hat1, phi_hat2, and phi_hat3 + // and the precomputed data for the fseries kernel + using namespace cufinufft::common; + + std::array nuft_precomp_z{}; + std::array nuft_precomp_f{}; + thrust::device_vector d_nuft_precomp_z(3 * MAX_NQUAD); + thrust::device_vector d_nuft_precomp_f(3 * MAX_NQUAD); + thrust::device_vector phi_hat1, phi_hat2, phi_hat3; + if (d_plan->dim > 0) { + phi_hat1.resize(N); + } + if (d_plan->dim > 1) { + phi_hat2.resize(N); + } + if (d_plan->dim > 2) { + phi_hat3.resize(N); + } + onedim_nuft_kernel_precomp(nuft_precomp_f.data(), nuft_precomp_z.data(), + d_plan->spopts); + if (d_plan->dim > 1) { + onedim_nuft_kernel_precomp(nuft_precomp_f.data() + MAX_NQUAD, + nuft_precomp_z.data() + MAX_NQUAD, + d_plan->spopts); + } + if (d_plan->dim > 2) { + onedim_nuft_kernel_precomp(nuft_precomp_f.data() + 2 * MAX_NQUAD, + nuft_precomp_z.data() + 2 * MAX_NQUAD, + d_plan->spopts); + } + // copy the precomputed data to the device using thrust + thrust::copy(nuft_precomp_z.begin(), nuft_precomp_z.end(), d_nuft_precomp_z.begin()); + thrust::copy(nuft_precomp_f.begin(), nuft_precomp_f.end(), d_nuft_precomp_f.begin()); + // sync the stream before calling the kernel might be needed + if (nuft_kernel_compute(d_plan->dim, N, N, N, d_nuft_precomp_f.data().get(), + d_nuft_precomp_z.data().get(), d_plan->d_Sp, d_plan->d_Tp, + d_plan->d_Up, phi_hat1.data().get(), phi_hat2.data().get(), + phi_hat3.data().get(), d_plan->spopts.nspread, stream)) + goto finalize; + + const auto is_c_finite = std::isfinite(d_plan->type3_params.C1) 
&& + std::isfinite(d_plan->type3_params.C2) && + std::isfinite(d_plan->type3_params.C3); + const auto is_c_nonzero = d_plan->type3_params.C1 != 0 || + d_plan->type3_params.C2 != 0 || + d_plan->type3_params.C3 != 0; + + const auto phi_hat_iterator = thrust::make_zip_iterator( + thrust::make_tuple(phi_hat1.begin(), + // to avoid out of bounds access, use phi_hat1 if dim < 2 + dim > 1 ? phi_hat2.begin() : phi_hat1.begin(), + // to avoid out of bounds access, use phi_hat1 if dim < 3 + dim > 2 ? phi_hat3.begin() : phi_hat1.begin())); + thrust::transform( + thrust::cuda::par.on(stream), phi_hat_iterator, phi_hat_iterator + N, + d_plan->deconv, + [dim] __host__ __device__(const thrust::tuple tuple) -> cuda_complex { + auto phiHat = thrust::get<0>(tuple); + // in case dim < 2 or dim < 3, multiply by 1 + phiHat *= (dim > 1) ? thrust::get<1>(tuple) : T(1); + phiHat *= (dim > 2) ? thrust::get<2>(tuple) : T(1); + return {T(1) / phiHat, T(0)}; + }); + + if (is_c_finite && is_c_nonzero) { + const auto c1 = d_plan->type3_params.C1; + const auto c2 = d_plan->type3_params.C2; + const auto c3 = d_plan->type3_params.C3; + const auto d1 = -d_plan->type3_params.D1; + const auto d2 = -d_plan->type3_params.D2; + const auto d3 = -d_plan->type3_params.D3; + const auto realsign = d_plan->iflag >= 0 ? T(1) : T(-1); + // passing d_s three times if dim == 1 because d_t and d_u are not allocated + // passing d_s and d_t if dim == 2 because d_u is not allocated + const auto phase_iterator = thrust::make_zip_iterator( + thrust::make_tuple(d_s, dim > 1 ? d_t : d_s, dim > 2 ? d_u : d_s)); + thrust::transform( + thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, + d_plan->deconv, d_plan->deconv, + [c1, c2, c3, d1, d2, d3, realsign] __host__ __device__( + const thrust::tuple tuple, + cuda_complex deconv) -> cuda_complex { + // d2 and d3 are 0 if dim < 2 and dim < 3 + const auto phase = c1 * (thrust::get<0>(tuple) + d1) + + c2 * (thrust::get<1>(tuple) + d2) + + c3 * (thrust::get<2>(tuple) + d3); + return cuda_complex{std::cos(phase), realsign * std::sin(phase)} * deconv; + }); + } + // exiting the block frees the memory allocated for phi_hat1, phi_hat2, and phi_hat3 + // and the precomputed data for the fseries kernel + // since GPU memory is expensive, we should free it as soon as possible + } + + using namespace cufinufft::memtransfer; + switch (d_plan->dim) { + case 1: { + if ((allocgpumem1d_plan(d_plan))) goto finalize; + } break; + case 2: { + if ((allocgpumem2d_plan(d_plan))) goto finalize; + } break; + case 3: { + if ((allocgpumem3d_plan(d_plan))) goto finalize; + } break; + } + if (cufinufft_setpts_12_impl(M, d_plan->kx, d_plan->ky, d_plan->kz, d_plan)) { + fprintf(stderr, "[%s] cufinufft_setpts_12_impl failed\n", __func__); + goto finalize; + } + { + int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3}; + cufinufft_opts t2opts = d_plan->opts; + t2opts.gpu_spreadinterponly = 0; + t2opts.gpu_method = 1; + // Safe to ignore the return value here? 
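+    // The type-3 transform is realized as: prephase + spread onto the fine grid,
+    // then this inner type-2 plan evaluated at the rescaled targets
+    // d_Sp/d_Tp/d_Up, then a pointwise multiply by the deconv array.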
+ if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan); + // check that maxbatchsize is correct + if (cufinufft_makeplan_impl(2, dim, t2modes, d_plan->iflag, d_plan->batchsize, + d_plan->tol, &d_plan->t2_plan, &t2opts)) { + fprintf(stderr, "[%s] inner t2 plan cufinufft_makeplan failed\n", __func__); + goto finalize; + } + if (cufinufft_setpts_12_impl(N, d_plan->d_Sp, d_plan->d_Tp, d_plan->d_Up, + d_plan->t2_plan)) { + fprintf(stderr, "[%s] inner t2 plan cufinufft_setpts_12 failed\n", __func__); + goto finalize; + } + if (d_plan->t2_plan->spopts.spread_direction != 2) { + fprintf(stderr, "[%s] inner t2 plan cufinufft_setpts_12 wrong direction\n", + __func__); + goto finalize; + } + } + return 0; +finalize: + cufinufft_destroy_impl(d_plan); + return FINUFFT_ERR_CUDA_FAILURE; +} + template int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) @@ -413,26 +840,17 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, case 1: { if (type == 1) ier = cufinufft1d1_exec(d_c, d_fk, d_plan); if (type == 2) ier = cufinufft1d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } + if (type == 3) ier = cufinufft1d3_exec(d_c, d_fk, d_plan); } break; case 2: { if (type == 1) ier = cufinufft2d1_exec(d_c, d_fk, d_plan); if (type == 2) ier = cufinufft2d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } + if (type == 3) ier = cufinufft2d3_exec(d_c, d_fk, d_plan); } break; case 3: { if (type == 1) ier = cufinufft3d1_exec(d_c, d_fk, d_plan); if (type == 2) ier = cufinufft3d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } + if (type == 3) ier = cufinufft3d3_exec(d_c, d_fk, d_plan); } break; } @@ -447,20 +865,21 @@ int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) In this stage, we (1) free all the memories that have been allocated on gpu (2) delete the cuFFT plan - - Also see ../docs/cppdoc.md for main user-facing documentation. */ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Can't destroy a null pointer. 
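+  // Note: the WithCudaDevice guard below is created only after this check, since
+  // constructing it reads d_plan->opts.gpu_device_id (a null plan would crash).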
   if (!d_plan) return FINUFFT_ERR_PLAN_NOTVALID;
 
+  cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+
   using namespace cufinufft::memtransfer;
   freegpumemory(d_plan);
 
   if (d_plan->fftplan) cufftDestroy(d_plan->fftplan);
 
+  if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan);
+
   /* free/destruct the plan */
   delete d_plan;
 
diff --git a/include/cufinufft/precision_independent.h b/include/cufinufft/precision_independent.h
index 9fa48a07e..e8ef209c3 100644
--- a/include/cufinufft/precision_independent.h
+++ b/include/cufinufft/precision_independent.h
@@ -41,8 +41,6 @@ __global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobs
 __global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts,
                                       int *d_numsubprob, int numbins);
 
-__global__ void trivial_global_sort_index_2d(int M, int *index);
-
 /* spreadinterp3d */
 __global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize,
                                    int numbins);
@@ -57,8 +55,6 @@ __global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsp
 __global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts,
                                          int *d_numsubprob, int numbins);
 
-__global__ void trivial_global_sort_index_3d(int M, int *index);
-
 __global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz,
                                 int nobinx, int nobiny, int nobinz, int *binsize);
 
diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
index 2963d381d..9efd094c8 100644
--- a/include/cufinufft/spreadinterp.h
+++ b/include/cufinufft/spreadinterp.h
@@ -10,7 +10,7 @@ namespace cufinufft {
 namespace spreadinterp {
 
 template<typename T>
-static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) {
+static __forceinline__ __device__ constexpr T cudaFMA(const T a, const T b, const T c) {
   if constexpr (std::is_same_v<T, float>) {
     // fused multiply-add, round to nearest even
     return __fmaf_rn(a, b, c);
@@ -20,9 +20,13 @@ static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T
   }
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                 "Only float and double are supported.");
-  return T{0};
+  return std::fma(a, b, c);
 }
 
+/**
+ * local NU coord fold+rescale: does the following affine transform to x:
+ * (x+PI) mod 2PI each to [0,N)
+ */
 template<typename T>
 constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
   constexpr auto x2pi = T(0.159154943091895345554011992339482617);
@@ -30,14 +34,14 @@ constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
 #if defined(__CUDA_ARCH__)
   if constexpr (std::is_same_v<T, float>) {
     // fused multiply-add, round to nearest even
-    auto result = __fmaf_rn(x, x2pi, half);
+    auto result = cudaFMA(x, x2pi, half);
     // subtract, round down
     result = __fsub_rd(result, floorf(result));
     // multiply, round down
     return __fmul_rd(result, static_cast<T>(N));
   } else if constexpr (std::is_same_v<T, double>) {
     // fused multiply-add, round to nearest even
-    auto result = __fma_rn(x, x2pi, half);
+    auto result = cudaFMA(x, x2pi, half);
     // subtract, round down
     result = __dsub_rd(result, floor(result));
     // multiply, round down
@@ -85,15 +89,15 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int
 }
 
 template<typename T>
-static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
-                                                         const double upsampfac)
+static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
+                                              const double upsampfac)
 /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at 
x_j = x + j,  for j=0,..,w-1.  Thus x in [-w/2,-w/2+1].   w is aka ns.
   This is the current evaluation method, since it's faster (except i7 w=16).
   Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
 {
-  const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1]
-  // T z = 2 * x + w - 1.0;
+  // const T z = T(2) * x + T(w - 1);
+  const auto z = cudaFMA(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1]
   // insert the auto-generated code which expects z, w args, writes to ker...
   if (upsampfac == 2.0) { // floating point equality is fine here
     using FLT = T;
diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h
index 16046c8ef..5b2fba790 100644
--- a/include/cufinufft/types.h
+++ b/include/cufinufft/types.h
@@ -8,20 +8,24 @@
 #include
 #include
 
-#include
+#include
 
 #define CUFINUFFT_BIGINT int
 
-// Ugly trick to map a template to a fixed type, here cuda_complex<T>
-template<typename T> struct cuda_complex_impl;
-template<> struct cuda_complex_impl<float> {
-  using type = cuFloatComplex;
-};
-template<> struct cuda_complex_impl<double> {
-  using type = cuDoubleComplex;
-};
-
-template<typename T> using cuda_complex = typename cuda_complex_impl<T>::type;
+// Marco Barbone 8/5/2024: replaced the ugly trick with std::conditional.
+// Using std::conditional and std::is_same we can define cuda_complex<T>:
+// if T is float, cuda_complex<T> is cuFloatComplex;
+// if T is double, cuda_complex<T> is cuDoubleComplex;
+// cuFloatComplex and cuDoubleComplex are defined in cuComplex.h.
+// TODO: migrate to cuda/std/complex and remove this.
+// Issue: cufft seems not to support cuda::std::complex;
+// a reinterpret_cast should be enough.
+template<typename T>
+using cuda_complex = typename std::conditional<
+    std::is_same<T, float>::value, cuFloatComplex,
+    typename std::conditional<std::is_same<T, double>::value, cuDoubleComplex,
+                              void>::type>::type;
 
 template<typename T> struct cufinufft_plan_t {
   cufinufft_opts opts;
@@ -37,7 +41,7 @@ template<typename T> struct cufinufft_plan_t {
   CUFINUFFT_BIGINT mt;
   CUFINUFFT_BIGINT mu;
   int ntransf;
-  int maxbatchsize;
+  int batchsize;
   int iflag;
 
   int supports_pools;
@@ -46,13 +50,39 @@ template<typename T> struct cufinufft_plan_t {
   T *fwkerhalf2;
   T *fwkerhalf3;
 
+  // for type 1,2 these point to the user's kx, ky, kz (no new allocs);
+  // for t3 they are allocated as the "primed" (scaled) src pts x'_j, etc.
   T *kx;
   T *ky;
   T *kz;
+  cuda_complex<T> *CpBatch; // working array of prephased strengths
+  cuda_complex<T> *fwbatch;
+
+  // no allocs here
   cuda_complex<T> *c;
   cuda_complex<T> *fw;
   cuda_complex<T> *fk;
 
+  // Type 3 specific
+  struct {
+    T X1, C1, S1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale,
+                                // s=interval
+    T X2, C2, S2, D2, h2, gam2; // y
+    T X3, C3, S3, D3, h3, gam3; // z
+  } type3_params;
+  int N; // number of NU freq pts (type 3 only)
+  CUFINUFFT_BIGINT nf;
+  T *d_Sp;
+  T *d_Tp;
+  T *d_Up;
+  T tol;
+  // inner type 2 plan for type 3
+  cufinufft_plan_t<T> *t2_plan;
+  // new allocs.
+  // FIXME: convert to device vectors to use resize
+  cuda_complex<T> *prephase; // pre-phase, for all input NU pts
+  cuda_complex<T> *deconv;   // reciprocal of kernel FT, phase, all output NU pts
+
   // Arrays that used in subprob method
   int *idxnupts; // length: #nupts, index of the nupts in the bin-sorted order
   int *sortidx;  // length: #nupts, order inside the bin the nupt belongs to
@@ -66,18 +96,14 @@ template<typename T> struct cufinufft_plan_t {
   int *numnupts;
   int *subprob_to_nupts;
 
-  // Temporary variables to do fseries precomputation
-  std::complex<double> fseries_precomp_a[3 * MAX_NQUAD];
-  T fseries_precomp_f[3 * MAX_NQUAD];
-
   cufftHandle fftplan;
   cudaStream_t stream;
 };
 
-template<typename T> static cufftType_t cufft_type();
-template<> inline cufftType_t cufft_type<float>() { return CUFFT_C2C; }
+template<typename T> constexpr static inline cufftType_t cufft_type();
+template<> constexpr inline cufftType_t cufft_type<float>() { return CUFFT_C2C; }
 
-template<> inline cufftType_t cufft_type<double>() { return CUFFT_Z2Z; }
+template<> constexpr inline cufftType_t cufft_type<double>() { return CUFFT_Z2Z; }
 
 static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata,
                                    cufftComplex *odata, int direction) {
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index 4bfaa801d..432711aae 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -3,7 +3,6 @@
 // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is!
 #include
-#include
 #include
 #include
 
@@ -15,6 +14,8 @@
 #include
 #include
 
+#include
+
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
 #else
 __inline__ __device__ double atomicAdd(double *address, double val) {
@@ -34,19 +35,56 @@ __inline__ __device__ double atomicAdd(double *address, double val) {
 }
 #endif
 
+/**
+ * It computes the start and end point of the spreading window given the center x and the
+ * width ns.
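+ * For example, ns=4 and x=10.3 give xstart=ceil(8.3)=9 and xend=floor(12.3)=12,
+ * i.e. the four grid points 9,10,11,12 nearest to x.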
+ */ +template __forceinline__ __device__ auto interval(const int ns, const T x) { + const auto xstart = int(std::ceil(x - T(ns) * T(.5))); + const auto xend = int(std::floor(x + T(ns) * T(.5))); + return int2{xstart, xend}; +} + +// Define a macro to check if NVCC version is >= 11.3 +#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) +#if (__CUDACC_VER_MAJOR__ > 11) || \ + (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) +#define ALLOCA_SUPPORTED 1 +// windows compatibility +#if __has_include() +#include +#endif +#endif +#endif + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 900 +#define COMPUTE_CAPABILITY_90_OR_HIGHER 1 +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif + namespace cufinufft { namespace utils { class WithCudaDevice { public: - WithCudaDevice(int device) { - cudaGetDevice(&orig_device_); + explicit WithCudaDevice(const int device) : orig_device_{get_orig_device()} { cudaSetDevice(device); } ~WithCudaDevice() { cudaSetDevice(orig_device_); } private: - int orig_device_; + const int orig_device_; + + static int get_orig_device() noexcept { + int device{}; + cudaGetDevice(&device); + return device; + } }; // jfm timer class @@ -72,49 +110,6 @@ template T infnorm(int n, std::complex *a) { return sqrt(nrm); } -#ifdef __CUDA_ARCH__ -__forceinline__ __device__ auto interval(const int ns, const float x) { - // float to int round up and fused multiply-add to round up - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); - // float to int round down and fused multiply-add to round down - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); - return int2{xstart, xend}; -} -__forceinline__ __device__ auto interval(const int ns, const double x) { - // same as above - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); - return int2{xstart, xend}; -} -#endif - -// Define a macro to check if NVCC version is >= 11.3 -#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) -#if (__CUDACC_VER_MAJOR__ > 11) || \ - (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) - -#define ALLOCA_SUPPORTED 1 -// windows compatibility -#if __has_include() -#include -#endif -#else -#define ALLOCA_SUPPORTED 0 -#endif -#else -#define ALLOCA_SUPPORTED 0 -#endif - -#if defined(__CUDA_ARCH__) -#if __CUDA_ARCH__ >= 900 -#define COMPUTE_CAPABILITY_90_OR_HIGHER 1 -#else -#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 -#endif -#else -#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 -#endif - /** * does a complex atomic add on a shared memory address * it adds the real and imaginary parts separately @@ -123,8 +118,8 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { */ template -static __forceinline__ __device__ void atomicAddComplexShared( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, + cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); @@ -136,8 +131,8 @@ static __forceinline__ __device__ void atomicAddComplexShared( * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, + cuda_complex res) { if 
constexpr (
      std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
    atomicAdd(address, res);
@@ -146,6 +141,68 @@ static __forceinline__ __device__ void atomicAddComplexGlobal(
   }
 }

+template<typename T> auto arrayrange(int n, T *a, cudaStream_t stream) {
+  const auto d_min_max = thrust::minmax_element(thrust::cuda::par.on(stream), a, a + n);
+
+  // copy d_min and d_max to host
+  T min{}, max{};
+  checkCudaErrors(cudaMemcpyAsync(&min, thrust::raw_pointer_cast(d_min_max.first),
+                                  sizeof(T), cudaMemcpyDeviceToHost, stream));
+  checkCudaErrors(cudaMemcpyAsync(&max, thrust::raw_pointer_cast(d_min_max.second),
+                                  sizeof(T), cudaMemcpyDeviceToHost, stream));
+  // the copies above are asynchronous: wait for them before reading min/max on the host
+  checkCudaErrors(cudaStreamSynchronize(stream));
+  return std::make_tuple(min, max);
+}
+
+// Writes out w = half-width and c = center of an interval enclosing all a[n]'s
+// Only chooses a nonzero center if this increases w by less than fraction
+// ARRAYWIDCEN_GROWFRAC defined in defs.h.
+// This prevents rephasings which don't grow nf by much. 6/8/17
+// If n==0, w and c are not finite.
+template<typename T> auto arraywidcen(int n, T *a, cudaStream_t stream) {
+  const auto [lo, hi] = arrayrange(n, a, stream);
+  auto w = (hi - lo) / 2;
+  auto c = (hi + lo) / 2;
+  if (std::abs(c) < ARRAYWIDCEN_GROWFRAC * w) {
+    w += std::abs(c);
+    c = 0.0;
+  }
+  return std::make_tuple(w, c);
+}
+
+template<typename T>
+auto set_nhg_type3(T S, T X, const cufinufft_opts &opts,
+                   const finufft_spread_opts &spopts)
+/*
+ * Implements the same logic as set_nhg_type3 in finufft.cpp;
+ * see that function for the full documentation.
+ */
+{
+  int nss = spopts.nspread + 1; // since ns may be odd
+  T Xsafe = X, Ssafe = S;       // may be tweaked locally
+  if (X == 0.0)                 // logic ensures XS>=1, handle X=0 a/o S=0
+    if (S == 0.0) {
+      Xsafe = 1.0;
+      Ssafe = 1.0;
+    } else
+      Xsafe = std::max(Xsafe, T(1) / S);
+  else
+    Ssafe = std::max(Ssafe, T(1) / X);
+  // use the safe X and S...
+  T nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / M_PI + nss;
+  if (!std::isfinite(nfd)) nfd = 0.0; // floating-point test catches nan and inf
+  auto nf = (int)nfd;
+  // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread);
+  // catch too small nf, and nan or +-inf, otherwise spread fails...
+  if (nf < 2 * spopts.nspread) nf = 2 * spopts.nspread;
+  if (nf < MAX_NF)                   // otherwise will fail anyway
+    nf = utils::next235beven(nf, 1); // expensive at huge nf
+  // Note: b is 1 because type 3 uses a type 2 plan, so it does not need the extra
+  // condition that Block Gather seems to require; type 2 only uses GM-sort
+  auto h   = 2 * T(M_PI) / nf;                       // upsampled grid spacing
+  auto gam = T(nf) / (2.0 * opts.upsampfac * Ssafe); // x scale fac to x'
+  return std::make_tuple(nf, h, gam);
+}
+
 } // namespace utils
 } // namespace cufinufft
diff --git a/include/cufinufft_opts.h b/include/cufinufft_opts.h
index c9898f3b7..743b3cf5c 100644
--- a/include/cufinufft_opts.h
+++ b/include/cufinufft_opts.h
@@ -29,6 +29,8 @@ typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults

   int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order
                //                  1 FFT-style mode order
+
+  int debug; // 0: no debug output, 1: debug output
 } cufinufft_opts;

 #endif
diff --git a/include/finufft_errors.h b/include/finufft_errors.h
index 245e6d339..fb827557c 100644
--- a/include/finufft_errors.h
+++ b/include/finufft_errors.h
@@ -23,6 +23,7 @@ enum {
   FINUFFT_ERR_METHOD_NOTVALID    = 17,
   FINUFFT_ERR_BINSIZE_NOTVALID   = 18,
   FINUFFT_ERR_INSUFFICIENT_SHMEM = 19,
-  FINUFFT_ERR_NUM_NU_PTS_INVALID = 20
+  FINUFFT_ERR_NUM_NU_PTS_INVALID = 20,
+  FINUFFT_ERR_INVALID_ARGUMENT   = 21
 };
 #endif
diff --git a/matlab/finufft.mw b/matlab/finufft.mw
index 4758157a1..73c362e6a 100644
--- a/matlab/finufft.mw
+++ b/matlab/finufft.mw
@@ -150,7 +150,7 @@ classdef finufft_plan < handle
         plan.floatprec = opts.floatprec;
       end
     end
-
+
     n_modes = ones(3,1);         % is dummy for type 3
     if type==3
       if length(n_modes_or_dim)~=1
@@ -179,7 +179,7 @@ classdef finufft_plan < handle
     plan.n_trans = n_trans;
     % Note the peculiarity that mwrap only accepts a double for n_trans, even
     % though it's declared int. It complains, also with int64 for nj, etc :(
-
+
     % replace in finufft_opts struct whichever fields are in incoming opts...
# copy_finufft_opts(mxArray opts, finufft_opts* o); if strcmp(plan.floatprec,'double') diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 8b1ad1c9b..92c870cc5 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -2,6 +2,8 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) target_compile_features(cuperftest PRIVATE cxx_std_17) +target_compile_options(cuperftest + PRIVATE $<$:--extended-lambda>) set_target_properties( cuperftest PROPERTIES LINKER_LANGUAGE CUDA diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index a17b6f044..06389ef75 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -1,6 +1,9 @@ #include #include +#include + #include +#include #include #include @@ -34,17 +37,16 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = - std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = std::min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms; + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; // this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex), + d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex), stream)))) return ier; @@ -88,11 +90,10 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = - std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = std::min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms; d_plan->c = d_cstart; d_plan->fk = d_fkstart; @@ -116,6 +117,65 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, return 0; } +template +int cufinufft1d3_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) { + /* + 1D Type-3 NUFFT + + This function is called in "exec" stage (See ../cufinufft.cu). 
+    It includes the following steps (doc copied from the finufft library):
+    Step 0: pre-phase the input strengths
+    Step 1: spread data
+    Step 2: Type 2 NUFFT
+    Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff
+
+    Marco Barbone 08/14/2024
+  */
+  int ier;
+  cuda_complex<T> *d_cstart;
+  cuda_complex<T> *d_fkstart;
+  const auto stream = d_plan->stream;
+  for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize);
+    d_cstart    = d_c + i * d_plan->batchsize * d_plan->M;
+    d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->N;
+    // setting input for spreader
+    d_plan->c = d_plan->CpBatch;
+    // setting output for spreader
+    d_plan->fk = d_plan->fw;
+    if ((ier = checkCudaErrors(cudaMemsetAsync(
+             d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
+             stream))))
+      return ier;
+    // NOTE: fw might need to be set to 0
+    // Step 0: pre-phase the input strengths
+    for (int block = 0; block < blksize; block++) {
+      thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase,
+                        d_plan->prephase + d_plan->M, d_cstart + block * d_plan->M,
+                        d_plan->c + block * d_plan->M,
+                        thrust::multiplies<cuda_complex<T>>());
+    }
+    // Step 1: Spread
+    if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
+    // now d_plan->fk = d_plan->fw contains the spread values
+    // Step 2: Type 2 NUFFT
+    // type 2 goes from fk to c
+    // saving the results directly in the user output array d_fk
+    // it needs to do blksize transforms
+    d_plan->t2_plan->ntransf = blksize;
+    if ((ier = cufinufft1d2_exec<T>(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier;
+    // Step 3: deconvolve
+    // now compute d_fk = d_fk * d_plan->deconv
+    for (int i = 0; i < blksize; i++) {
+      thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv,
+                        d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N,
+                        d_fkstart + i * d_plan->N, thrust::multiplies<cuda_complex<T>>());
+    }
+  }
+  return 0;
+}
+
 template int cufinufft1d1_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
 template int cufinufft1d1_exec<double>(cuda_complex<double> *d_c,
@@ -126,3 +186,8 @@ template int cufinufft1d2_exec<float>(cuda_complex<float> *d_c, cuda_complex<fl
 template int cufinufft1d2_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
+template int cufinufft1d3_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
+                                      cufinufft_plan_t<float> *d_plan);
+template int cufinufft1d3_exec<double>(cuda_complex<double> *d_c,
+                                       cuda_complex<double> *d_fk,
+                                       cufinufft_plan_t<double> *d_plan);
diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu
index f7f7b1559..8c165edbf 100644
--- a/src/cuda/2d/cufinufft2d.cu
+++ b/src/cuda/2d/cufinufft2d.cu
@@ -2,7 +2,11 @@
 #include
 #include
 #include
+
+#include
+
 #include
+#include
 #include
 #include
@@ -34,17 +38,17 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
   cuda_complex<T> *d_cstart;
   auto &stream = d_plan->stream;

-  for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
-    int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
-    d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-    d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt;
+  for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize);
+    d_cstart = d_c + i * d_plan->batchsize * d_plan->M;
+    d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
     d_plan->c = d_cstart;
     d_plan->fk = d_fkstart;

     // this is needed
     if ((ier = checkCudaErrors(cudaMemsetAsync(
              d_plan->fw, 0,
-            d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2
* sizeof(cuda_complex<T>),
+            d_plan->batchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex<T>),
             stream))))
       return ier;
@@ -88,10 +92,10 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
   int ier;
   cuda_complex<T> *d_fkstart;
   cuda_complex<T> *d_cstart;
-  for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
-    int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
-    d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-    d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt;
+  for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize);
+    d_cstart = d_c + i * d_plan->batchsize * d_plan->M;
+    d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
     d_plan->c = d_cstart;
     d_plan->fk = d_fkstart;
@@ -115,6 +119,64 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
   return 0;
 }

+template<typename T>
+int cufinufft2d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan) {
+  /*
+    2D Type-3 NUFFT
+
+    This function is called in "exec" stage (See ../cufinufft.cu).
+    It includes the following steps (doc copied from the finufft library):
+    Step 0: pre-phase the input strengths
+    Step 1: spread data
+    Step 2: Type 2 NUFFT
+    Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff
+
+    Marco Barbone 08/14/2024
+  */
+  int ier;
+  cuda_complex<T> *d_cstart;
+  cuda_complex<T> *d_fkstart;
+  const auto stream = d_plan->stream;
+  for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize);
+    d_cstart    = d_c + i * d_plan->batchsize * d_plan->M;
+    d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->N;
+    // setting input for spreader
+    d_plan->c = d_plan->CpBatch;
+    // setting output for spreader
+    d_plan->fk = d_plan->fw;
+    if ((ier = checkCudaErrors(cudaMemsetAsync(
+             d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
+             stream))))
+      return ier;
+    // NOTE: fw might need to be set to 0
+    // Step 0: pre-phase the input strengths
+    for (int i = 0; i < blksize; i++) {
+      thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase,
+                        d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M,
+                        d_plan->c + i * d_plan->M, thrust::multiplies<cuda_complex<T>>());
+    }
+    // Step 1: Spread
+    if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;
+    // now d_plan->fk = d_plan->fw contains the spread values
+    // Step 2: Type 2 NUFFT
+    // type 2 goes from fk to c
+    // saving the results directly in the user output array d_fk
+    // it needs to do blksize transforms
+    d_plan->t2_plan->ntransf = blksize;
+    if ((ier = cufinufft2d2_exec<T>(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier;
+    // Step 3: deconvolve
+    // now compute d_fk = d_fk * d_plan->deconv
+    for (int i = 0; i < blksize; i++) {
+      thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv,
+                        d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N,
+                        d_fkstart + i * d_plan->N, thrust::multiplies<cuda_complex<T>>());
+    }
+  }
+  return 0;
+}
+
 template int cufinufft2d1_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
 template int cufinufft2d1_exec<double>(cuda_complex<double> *d_c,
@@ -125,3 +187,8 @@ template int cufinufft2d2_exec<float>(cuda_complex<float> *d_c, cuda_complex<fl
 template int cufinufft2d2_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
+template int cufinufft2d3_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
+                                      cufinufft_plan_t<float> *d_plan);
+template int cufinufft2d3_exec<double>(cuda_complex<double> *d_c,
+                                       cuda_complex<double> *d_fk,
+                                       cufinufft_plan_t<double> *d_plan);
diff
--git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 80cf9f8e9..490c8eed1 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -96,9 +97,7 @@ int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_ RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; - - trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, - d_idxnupts); + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); RETURN_IF_CUDA_ERROR } diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 53a243e7e..805e921aa 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -47,9 +47,9 @@ __global__ void spread_2d_nupts_driven( } for (auto yy = ystart; yy <= yend; yy++) { + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); for (auto xx = xstart; xx <= xend; xx++) { const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); const auto outidx = ix + iy * nf1; const auto kervalue1 = ker1[xx - xstart]; const auto kervalue2 = ker2[yy - ystart]; @@ -240,9 +240,9 @@ __global__ void interp_2d_nupts_driven( cuda_complex cnow{0, 0}; for (int yy = ystart; yy <= yend; yy++) { const T kervalue2 = ker2[yy - ystart]; + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); for (int xx = xstart; xx <= xend; xx++) { const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); const auto inidx = ix + iy * nf1; const auto kervalue1 = ker1[xx - xstart]; cnow.x += fw[inidx].x * kervalue1 * kervalue2; diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 5977e6d5f..ce1f37c0e 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -3,11 +3,14 @@ #include #include +#include #include #include #include +#include + using namespace cufinufft::deconvolve; using namespace cufinufft::spreadinterp; using std::min; @@ -32,19 +35,17 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; + for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize); + d_cstart = d_c + i * d_plan->batchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt * d_plan->mu; d_plan->c = d_cstart; d_plan->fk = d_fkstart; - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_plan->fw, 0, - d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * - d_plan->nf3 * sizeof(cuda_complex), - stream)))) + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex), + stream)))) return ier; // Step 1: Spread @@ -85,10 +86,10 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - 
d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-    d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu;
+  for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize);
+    d_cstart = d_c + i * d_plan->batchsize * d_plan->M;
+    d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt * d_plan->mu;
     d_plan->c = d_cstart;
     d_plan->fk = d_fkstart;
@@ -113,6 +114,66 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
   return 0;
 }

+// TODO: in case data is centered, we could save GPU memory
+template<typename T>
+int cufinufft3d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan) {
+  /*
+    3D Type-3 NUFFT
+
+    This function is called in "exec" stage (See ../cufinufft.cu).
+    It includes the following steps (doc copied from the finufft library):
+    Step 0: pre-phase the input strengths
+    Step 1: spread data
+    Step 2: Type 2 NUFFT
+    Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff
+
+    Marco Barbone 08/14/2024
+  */
+  int ier;
+  cuda_complex<T> *d_cstart;
+  cuda_complex<T> *d_fkstart;
+  const auto stream = d_plan->stream;
+  for (int i = 0; i * d_plan->batchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->batchsize, d_plan->batchsize);
+    d_cstart    = d_c + i * d_plan->batchsize * d_plan->M;
+    d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->N;
+    // setting input for spreader
+    d_plan->c = d_plan->CpBatch;
+    // setting output for spreader
+    d_plan->fk = d_plan->fw;
+    // NOTE: fw might need to be set to 0
+    if ((ier = checkCudaErrors(cudaMemsetAsync(
+             d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
+             stream))))
+      return ier;
+    // Step 0: pre-phase the input strengths
+    for (int i = 0; i < blksize; i++) {
+      thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase,
+                        d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M,
+                        d_plan->c + i * d_plan->M, thrust::multiplies<cuda_complex<T>>());
+    }
+    // Step 1: Spread
+    if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;
+    // now d_plan->fk = d_plan->fw contains the spread values
+    // Step 2: Type 2 NUFFT
+    // type 2 goes from fk to c
+    // saving the results directly in the user output array d_fk
+    // it needs to do blksize transforms
+    d_plan->t2_plan->ntransf = blksize;
+    if ((ier = cufinufft3d2_exec<T>(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier;
+    // Step 3: deconvolve
+    // now compute d_fk = d_fk * d_plan->deconv
+    for (int i = 0; i < blksize; i++) {
+      thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv,
+                        d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N,
+                        d_fkstart + i * d_plan->N, thrust::multiplies<cuda_complex<T>>());
+    }
+  }
+  return 0;
+}
+
 template int cufinufft3d1_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
 template int cufinufft3d1_exec<double>(cuda_complex<double> *d_c,
@@ -124,3 +185,8 @@ template int cufinufft3d2_exec<float>(cuda_complex<float> *d_c, cuda_complex<fl
 template int cufinufft3d2_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
+
+template int cufinufft3d3_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
+                                      cufinufft_plan_t<float> *d_plan);
+template int cufinufft3d3_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
+                                       cufinufft_plan_t<double> *d_plan);
diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu
index 91379d3ae..51c620756 100644
--- a/src/cuda/3d/interp3d_wrapper.cu
+++ b/src/cuda/3d/interp3d_wrapper.cu
@@ -50,10 +50,7 @@ int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize)
template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; + const auto stream = d_plan->stream; int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells T es_c = d_plan->spopts.ES_c; @@ -68,10 +65,8 @@ int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t cuda_complex *d_c = d_plan->c; cuda_complex *d_fw = d_plan->fw; - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; + const dim3 threadsPerBlock{16, 1, 1}; + const dim3 blocks{(M + threadsPerBlock.x - 1) / threadsPerBlock.x, 1, 1}; if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index 475a888ac..a0411c2b1 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -72,9 +73,9 @@ int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, } int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); + numbins[0] = (nf1 + bin_size_x - 1) / bin_size_x; + numbins[1] = (nf2 + bin_size_y - 1) / bin_size_y; + numbins[2] = (nf3 + bin_size_z - 1) / bin_size_z; T *d_kx = d_plan->kx; T *d_ky = d_plan->ky; @@ -105,9 +106,7 @@ int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; - - trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, - d_idxnupts); + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); RETURN_IF_CUDA_ERROR } diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 59b4661ff..298ae4a43 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -445,6 +445,7 @@ __global__ void interp_3d_nupts_driven( T ker2[MAX_NSPREAD]; T ker3[MAX_NSPREAD]; #endif + cuda_complex cnow{}; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -455,11 +456,14 @@ __global__ void interp_3d_nupts_driven( const auto [ystart, yend] = interval(ns, y_rescaled); const auto [zstart, zend] = interval(ns, z_rescaled); - const auto x1 = T(xstart) - x_rescaled; - const auto y1 = T(ystart) - y_rescaled; - const auto z1 = T(zstart) - z_rescaled; + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; - cuda_complex cnow{0, 0}; + // having cnow allocated to 0 inside the loop breaks type 3 spread + // are we doing a buffer overflow somewhere? + cnow.x = T(0); + cnow.y = T(0); if constexpr (KEREVALMETH == 1) { eval_kernel_vec_horner(ker1, x1, ns, sigma); @@ -473,13 +477,13 @@ __global__ void interp_3d_nupts_driven( for (int zz = zstart; zz <= zend; zz++) { const auto kervalue3 = ker3[zz - zstart]; - int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); + const auto iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); for (int yy = ystart; yy <= yend; yy++) { const auto kervalue2 = ker2[yy - ystart]; int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); for (int xx = xstart; xx <= xend; xx++) { - const int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); - const int inidx = ix + iy * nf1 + iz * nf2 * nf1; + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto inidx = ix + iy * nf1 + iz * nf2 * nf1; const auto kervalue1 = ker1[xx - xstart]; cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3; diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 6c78bab09..9f8d1344c 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,6 +1,7 @@ set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp) + set(PRECISION_DEPENDENT_SRC spreadinterp.cpp 1d/cufinufft1d.cu @@ -23,19 +24,24 @@ set(CUFINUFFT_INCLUDE_DIRS $ $ $) + set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) # flush denormals to zero and enable verbose PTXAS output set(FINUFFT_CUDA_FLAGS + $<$: + --extended-lambda -ftz=true -fmad=true -restrict --extra-device-vectorization $<$:-G -maxrregcount - 32>) + 64 + > + >) add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects @@ -48,12 +54,7 @@ set_target_properties( CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON) target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17) -target_compile_options( - cufinufft_common_objects - PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) -target_compile_options( - cufinufft_common_objects - PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) +target_compile_options(cufinufft_common_objects PRIVATE ${FINUFFT_CUDA_FLAGS}) add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC}) target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) @@ -65,8 +66,7 @@ set_target_properties( CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON) target_compile_features(cufinufft_objects PRIVATE cxx_std_17) -target_compile_options( - cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) +target_compile_options(cufinufft_objects PRIVATE ${FINUFFT_CUDA_FLAGS}) if(FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED $ @@ -87,7 +87,7 @@ set_target_properties( CUDA_STANDARD_REQUIRED ON ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") target_compile_features(cufinufft PRIVATE cxx_std_17) - +target_compile_options(cufinufft PRIVATE ${FINUFFT_CUDA_FLAGS}) if(WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) diff --git a/src/cuda/common.cu b/src/cuda/common.cu index b19986520..31f0418e2 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -21,20 +21,26 @@ namespace common { using namespace cufinufft::spreadinterp; using std::max; -/* Kernel for computing approximations of exact Fourier series coeffs of - cnufftspread's real symmetric kernel. */ -// a , f are intermediate results from function onedim_fseries_kernel_precomp() -// (see cufinufft/contrib/common.cpp for description) +/** Kernel for computing approximations of exact Fourier series coeffs of + * cnufftspread's real symmetric kernel. 
+ * phase, f are intermediate results from function onedim_fseries_kernel_precomp()
+ * (see cufinufft/contrib/common.cpp for description)
+ * this is the equispaced frequency case, used by type 1 & 2, matching
+ * onedim_fseries_kernel in the CPU code
+ */
 template<typename T>
-__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f,
-                                       cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2,
-                                       T *fwkerhalf3, int ns) {
+__global__ void cu_fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *phase,
+                                          T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3,
+                                          int ns) {
   T J2 = ns / 2.0;
   int q = (int)(2 + 3.0 * J2);
   int nf;
-  cuDoubleComplex *at = a + threadIdx.y * MAX_NQUAD;
-  T *ft = f + threadIdx.y * MAX_NQUAD;
+  T *phaset = phase + threadIdx.y * MAX_NQUAD;
+  T *ft     = f + threadIdx.y * MAX_NQUAD;
   T *oarr;
+  // standard CUDA parallelism pattern: a 2D grid lets us use more threads,
+  // since the available parallelism is x*y*z;
+  // each thread checks its y index to determine which output array to use
   if (threadIdx.y == 0) {
     oarr = fwkerhalf1;
     nf = nf1;
@@ -48,19 +54,62 @@

   for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf / 2 + 1;
        i += blockDim.x * gridDim.x) {
-    int brk = 0.5 + i;
-    T x = 0.0;
+    T x = 0.0;
+    for (int n = 0; n < q; n++) {
+      // in type 1/2 the frequency is i*2*PI/nf, playing the role of k[i] in type 3
+      x += ft[n] * T(2) * std::cos(T(i) * phaset[n]);
+    }
+    oarr[i] = x * T(i % 2 ? -1 : 1); // sign flip since the kernel origin is at PI
+  }
+}
+
+/** Kernel for computing approximations of the exact Fourier transform of
+ * cnufftspread's real symmetric kernel.
+ * a, f are intermediate results from function onedim_nuft_kernel_precomp()
+ * (see cufinufft/contrib/common.cpp for description)
+ * this is the arbitrary frequency case (hence the extra kx, ky, kz arguments), used by
+ * type 3, matching onedim_nuft_kernel in the CPU code
+ */
+template<typename T>
+__global__ void cu_nuft_kernel_compute(int nf1, int nf2, int nf3, T *f, T *z, T *kx,
+                                       T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2,
+                                       T *fwkerhalf3, int ns) {
+  T J2 = ns / 2.0;
+  int q = (int)(2 + 2.0 * J2);
+  int nf;
+  T *at = z + threadIdx.y * MAX_NQUAD;
+  T *ft = f + threadIdx.y * MAX_NQUAD;
+  T *oarr, *k;
+  // standard CUDA parallelism pattern: a 2D grid lets us use more threads,
+  // since the available parallelism is x*y*z;
+  // each thread checks its y index to determine which output array to use
+  if (threadIdx.y == 0) {
+    k    = kx;
+    oarr = fwkerhalf1;
+    nf   = nf1;
+  } else if (threadIdx.y == 1) {
+    k    = ky;
+    oarr = fwkerhalf2;
+    nf   = nf2;
+  } else {
+    k    = kz;
+    oarr = fwkerhalf3;
+    nf   = nf3;
+  }
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf;
+       i += blockDim.x * gridDim.x) {
+    T x = 0.0;
     for (int n = 0; n < q; n++) {
-      x += ft[n] * 2 * (pow(cabs(at[n]), brk) * cos(brk * carg(at[n])));
+      x += ft[n] * T(2) * std::cos(k[i] * at[n]);
     }
     oarr[i] = x;
   }
 }

 template<typename T>
-int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f,
-                           cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2,
-                           T *d_fwkerhalf3, int ns, cudaStream_t stream)
+int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_phase,
+                           T *d_fwkerhalf1, T *d_fwkerhalf2, T *d_fwkerhalf3, int ns,
+                           cudaStream_t stream)
 /* wrapper for approximation of Fourier series of real symmetric spreading kernel.
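[Editor's note] To make the quadrature above concrete: both GPU kernels reduce to the
same cosine sum over precomputed nodes. Below is a minimal host-side reference sketch
of that sum for the equispaced (type 1-2) case, assuming f[] holds quadrature weights
times kernel values and phase[] the per-node phase increments, as produced by
onedim_fseries_kernel_precomp(); the helper name is hypothetical and the code is not
part of this PR.

#include <cmath>

// Reference evaluation of fwkerhalf[i] = (-1)^i * sum_n 2*f[n]*cos(i*phase[n]),
// for i = 0..nf/2, mirroring cu_fseries_kernel_compute for a single dimension.
template<typename T>
void fseries_reference(int nf, int q, const T *f, const T *phase, T *fwkerhalf) {
  for (int i = 0; i <= nf / 2; ++i) {               // one output per regular freq
    T x = 0;
    for (int n = 0; n < q; ++n)                     // q quadrature nodes
      x += f[n] * T(2) * std::cos(T(i) * phase[n]); // 2*cos() pairs the +/- nodes
    fwkerhalf[i] = (i % 2) ? -x : x;                // sign flip: kernel origin at pi
  }
}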
@@ -73,8 +122,37 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f,
   dim3 threadsPerBlock(16, dim);
   dim3 numBlocks((nout + 16 - 1) / 16, 1);

-  fseries_kernel_compute<<<numBlocks, threadsPerBlock, 0, stream>>>(
-      nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns);
+  cu_fseries_kernel_compute<<<numBlocks, threadsPerBlock, 0, stream>>>(
+      nf1, nf2, nf3, d_f, d_phase, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns);
+  RETURN_IF_CUDA_ERROR
+
+  return 0;
+}
+
+template<typename T>
+int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_z, T *d_kx,
+                        T *d_ky, T *d_kz, T *d_fwkerhalf1, T *d_fwkerhalf2,
+                        T *d_fwkerhalf3, int ns, cudaStream_t stream)
+/*
+  Approximates exact Fourier transform of cnufftspread's real symmetric
+  kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting
+  narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi),
+  for a kernel with x measured in grid-spacings. (See previous routine for
+  FT definition).
+  It implements onedim_nuft_kernel from the CPU code, except that it combines up to
+  three one-dimensional kernel evaluations at once (for efficiency).
+
+  Marco Barbone 08/28/2024
+*/
+{
+  int nout = max(max(nf1, nf2), nf3);
+
+  dim3 threadsPerBlock(16, dim);
+  dim3 numBlocks((nout + 16 - 1) / 16, 1);
+
+  cu_nuft_kernel_compute<<<numBlocks, threadsPerBlock, 0, stream>>>(
+      nf1, nf2, nf3, d_f, d_z, d_kx, d_ky, d_kz, d_fwkerhalf1, d_fwkerhalf2,
+      d_fwkerhalf3, ns);
   RETURN_IF_CUDA_ERROR

   return 0;
@@ -104,39 +182,6 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts
   }
 }

-template<typename T>
-void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts)
-/*
-  Approximates exact Fourier series coeffs of cnufftspread's real symmetric
-  kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting
-  narrowness of kernel. Uses phase winding for cheap eval on the regular freq
-  grid. Note that this is also the Fourier transform of the non-periodized
-  kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an
-  overall prefactor of 1/h, which is needed anyway for the correction, and
-  arises because the quadrature weights are scaled for grid units not x units.
-
-  Inputs:
-  nf - size of 1d uniform spread grid, must be even.
-  opts - spreading opts object, needed to eval kernel (must be already set up)
-
-  Outputs:
-  fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive,
-              divided by h = 2pi/n.
-              (should be allocated for at least nf/2+1 Ts)
-
-  Compare onedim_dct_kernel which has same interface, but computes DFT of
-  sampled kernel, not quite the same object.
-
-  Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18
-  Melody 2/20/22 separate into precomp & comp functions defined below.
- */
-{
-  T f[MAX_NQUAD];
-  std::complex<double> a[MAX_NQUAD];
-  onedim_fseries_kernel_precomp(nf, f, a, opts);
-  onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts);
-}
-
 /* Precomputation of approximations of exact Fourier series coeffs of
   cnufftspread's real symmetric kernel.

   Inputs:
   nf - size of 1d uniform spread grid, must be even.
opts - spreading opts object, needed to eval kernel (must be already set up)
+  Note: this covers the equispaced-frequency (type 1-2) case; the arbitrary-frequency
+        (type 3) case is handled by onedim_nuft_kernel_precomp() below.

   Outputs:
-  a - phase winding rates
-  f - funciton values at quadrature nodes multiplied with quadrature weights
-  (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below)
+  phase - vector of phases to be used for cosines on the GPU;
+  f - function values at quadrature nodes multiplied with quadrature weights (phase, f
+      are the inputs of the GPU kernel launched by fseries_kernel_compute() above)
 */
+
 template<typename T>
-void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
+void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *phase,
                                    finufft_spread_opts opts) {
   T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support
   // # quadr nodes in z (from 0 to J/2; reflections will be added)...
-  int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD
+  const auto q = (int)(2 + 3.0 * J2); // matches CPU code
   double z[2 * MAX_NQUAD];
   double w[2 * MAX_NQUAD];
   finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used,
-                                                          // eg on (0,1)
-  for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n
-    z[n] *= J2;                 // rescale nodes
-    f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei
-    a[n] = exp((T)(2.0 * M_PI) * std::complex<T>(0.0, 1.0) * (T)(nf / 2 - z[n]) /
-           (T)nf); // phase winding rates
-  }
+                                                          // eg on (0,1)
+  for (int n = 0; n < q; ++n) {                        // set up nodes z_n and vals f_n
+    z[n] *= J2;                                        // rescale nodes
+    f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei
+    phase[n] = T(2.0 * M_PI * z[n] / T(nf));           // phase winding rates
+  }
 }

 template<typename T>
-void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
-                                   T *fwkerhalf, finufft_spread_opts opts) {
-  T J2 = opts.nspread / 2.0;   // J/2, half-width of ker z-support
-  int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD
-  CUFINUFFT_BIGINT nout = nf / 2 + 1;                // how many values we're writing to
-  int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks
-  std::vector<CUFINUFFT_BIGINT> brk(nt + 1);         // start indices for each thread
-  for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads
-    brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt);
-#pragma omp parallel
-  {
-    int t = MY_OMP_GET_THREAD_NUM();
-    if (t < nt) { // could be nt < actual # threads
-      std::complex<double> aj[MAX_NQUAD]; // phase rotator for this thread
-      for (int n = 0; n < q; ++n)
-        aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk
-      for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output
-                                                               // array
-        T x = 0.0; // accumulator for answer at this j
-        for (int n = 0; n < q; ++n) {
-          x += f[n] * 2 * real(aj[n]); // include the negative freq
-          aj[n] *= a[n];               // wind the phases
-        }
-        fwkerhalf[j] = x;
-      }
-    }
+void onedim_nuft_kernel_precomp(T *f, T *z, finufft_spread_opts opts) {
+  // it implements the first half of onedim_nuft_kernel in the CPU code
+  T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support
+  // # quadr nodes in z (from 0 to J/2; reflections will be added)...
+ int q = (int)(2 + 2.0 * J2); // matches CPU code + double z_local[2 * MAX_NQUAD]; + double w_local[2 * MAX_NQUAD]; + finufft::quadrature::legendre_compute_glr(2 * q, z_local, w_local); // only half the + // nodes used, eg on + // (0,1) + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] = J2 * T(z_local[n]); // rescale nodes + f[n] = J2 * w_local[n] * evaluate_kernel(z[n], opts); // vals & quadr wei } } @@ -312,34 +345,32 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { } } -template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, - std::complex *a, float *fwkerhalf, - finufft_spread_opts opts); -template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, - std::complex *a, double *fwkerhalf, - finufft_spread_opts opts); - template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts); template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, cufinufft_opts opts); -template void onedim_fseries_kernel_precomp( - CUFINUFFT_BIGINT nf, float *f, std::complex *a, finufft_spread_opts opts); -template void onedim_fseries_kernel_precomp( - CUFINUFFT_BIGINT nf, double *f, std::complex *a, finufft_spread_opts opts); -template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, - cuDoubleComplex *d_a, float *d_fwkerhalf1, - float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, - cudaStream_t stream); -template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, double *d_f, - cuDoubleComplex *d_a, double *d_fwkerhalf1, - double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, - cudaStream_t stream); - -template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, - finufft_spread_opts opts); -template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, - finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, float *f, + float *a, finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, double *f, + double *a, finufft_spread_opts opts); +template void onedim_nuft_kernel_precomp(float *f, float *a, + finufft_spread_opts opts); +template void onedim_nuft_kernel_precomp(double *f, double *a, + finufft_spread_opts opts); +template int fseries_kernel_compute(int dim, int nf1, int nf2, int nf3, float *d_f, + float *d_a, float *d_fwkerhalf1, float *d_fwkerhalf2, + float *d_fwkerhalf3, int ns, cudaStream_t stream); +template int fseries_kernel_compute( + int dim, int nf1, int nf2, int nf3, double *d_f, double *d_a, double *d_fwkerhalf1, + double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, cudaStream_t stream); +template int nuft_kernel_compute(int dim, int nf1, int nf2, int nf3, float *d_f, + float *d_a, float *d_kx, float *d_ky, float *d_kz, + float *d_fwkerhalf1, float *d_fwkerhalf2, + float *d_fwkerhalf3, int ns, cudaStream_t stream); +template int nuft_kernel_compute( + int dim, int nf1, int nf2, int nf3, double *d_f, double *d_a, double *d_kx, + double *d_ky, double *d_kz, double *d_fwkerhalf1, double *d_fwkerhalf2, + double *d_fwkerhalf3, int ns, cudaStream_t stream); template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z); diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index c00bf8eba..534fa5358 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -112,7 +112,7 @@ void cufinufft_default_opts(cufinufft_opts *opts) opts->gpu_method = 0; opts->gpu_sort = 1; 
opts->gpu_kerevalmeth = 1; - opts->upsampfac = 2.0; + opts->upsampfac = 0; opts->gpu_maxsubprobsize = 1024; opts->gpu_obinsizex = 0; opts->gpu_obinsizey = 0; @@ -121,6 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts) opts->gpu_binsizey = 0; opts->gpu_binsizez = 0; opts->gpu_maxbatchsize = 0; + opts->debug = 0; opts->gpu_stream = cudaStreamDefault; // sphinx tag (don't remove): @gpu_defopts_end } diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index 94eb6b4c8..38a4f0da9 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -235,7 +235,7 @@ int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize) int ms = d_plan->ms; int nf1 = d_plan->nf1; int nmodes = ms; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { @@ -268,7 +268,7 @@ int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize) int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; int nmodes = ms * mt; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { @@ -305,7 +305,7 @@ int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize) int nf2 = d_plan->nf2; int nf3 = d_plan->nf3; int nmodes = ms * mt * mu; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index b83b9d2d5..e5308e8bc 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -20,11 +20,11 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; + const auto stream = d_plan->stream; - int ier; + int ier{0}; int nf1 = d_plan->nf1; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; switch (d_plan->opts.gpu_method) { case 1: { @@ -59,6 +59,7 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "err: invalid method " << std::endl; } @@ -90,8 +91,8 @@ int allocgpumem1d_nupts(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; int M = d_plan->M; CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); @@ -135,12 +136,12 @@ int allocgpumem2d_plan(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; switch (d_plan->opts.gpu_method) { case 1: { @@ -180,6 +181,7 @@ int allocgpumem2d_plan(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "[allocgpumem2d_plan] error: invalid method\n"; } @@ -213,8 +215,8 @@ int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; const int M = d_plan->M; @@ -240,6 
+242,7 @@ int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "[allocgpumem2d_nupts] error: invalid method\n"; } @@ -258,13 +261,13 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; int nf3 = d_plan->nf3; - int maxbatchsize = d_plan->maxbatchsize; + int maxbatchsize = d_plan->batchsize; switch (d_plan->opts.gpu_method) { case 1: { @@ -337,6 +340,7 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "[allocgpumem3d_plan] error: invalid method\n"; } @@ -360,7 +364,11 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) } finalize: - if (ier) freegpumemory(d_plan); + if (ier) { + std::cerr << "[allocgpumem3d_plan] error:" + << cudaGetErrorString(static_cast(ier)) << std::endl; + freegpumemory(d_plan); + } return ier; } @@ -374,8 +382,8 @@ int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) */ { utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; + const auto stream = d_plan->stream; + int ier{0}; int M = d_plan->M; CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); @@ -405,6 +413,7 @@ int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) goto finalize; } break; default: + ier = FINUFFT_ERR_METHOD_NOTVALID; std::cerr << "[allocgpumem3d_nupts] error: invalid method\n"; } @@ -441,6 +450,21 @@ void freegpumemory(cufinufft_plan_t *d_plan) CUDA_FREE_AND_NULL(d_plan->numnupts, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools); + + if (d_plan->type != 3) { + return; + } + + CUDA_FREE_AND_NULL(d_plan->kx, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_Sp, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->ky, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_Tp, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->kz, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->d_Up, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->prephase, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->deconv, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwbatch, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->CpBatch, stream, d_plan->supports_pools); } template int allocgpumem1d_plan(cufinufft_plan_t *d_plan); diff --git a/src/cuda/precision_independent.cu b/src/cuda/precision_independent.cu index b2c0c292f..7b199220a 100644 --- a/src/cuda/precision_independent.cu +++ b/src/cuda/precision_independent.cu @@ -71,13 +71,6 @@ __global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstart } } -__global__ void trivial_global_sort_index_2d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; - i += gridDim.x * blockDim.x) { - index[i] = i; - } -} - /* spreadinterp3d */ __global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { @@ -121,13 +114,6 @@ __global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobs } } -__global__ void trivial_global_sort_index_3d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; - i += gridDim.x * blockDim.x) { - index[i] = i; - } -} - __global__ void 
fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, int nobinz, int *binsize) { int binx = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 98b5382bc..646ffa434 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -1,10 +1,7 @@ #include #include -#include -#include #include -#include #include #include @@ -44,8 +41,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet opts.upsampfac = upsampfac; // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach... - int ier = 0; - + int ier = 0; constexpr T EPSILON = std::numeric_limits::epsilon(); if (eps < EPSILON) { fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n", diff --git a/src/finufft.cpp b/src/finufft.cpp index ce4ada36d..6c592396f 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -267,7 +267,6 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts for (int n = 0; n < q; ++n) { z[n] *= (FLT)J2; // quadr nodes for [0,J/2] f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights - // printf("f[%d] = %.3g\n",n,f[n]); } #pragma omp parallel for num_threads(opts.nthreads) for (BIGINT j = 0; j < nk; ++j) { // loop along output array @@ -723,6 +722,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, fprintf(stderr, "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", __func__); + // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 return FINUFFT_ERR_MAXNALLOC; } @@ -751,12 +751,12 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, // idist, ot, onembed, ostride, odist, sign, flags { std::lock_guard lock(fftw_lock); - // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads. // FIXME: Since this might override what the user wants for fftw, we'd like to // set it just for our one plan and then revert to the user value. // Unfortunately fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and // there isn't a convenient mechanism to probe the version + // there is fftw_version which returns a string, but that's not compile time FFTW_PLAN_TH(nthr_fft); p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, (FFTW_CPX *)p->fwBatch, NULL, 1, p->nf, (FFTW_CPX *)p->fwBatch, NULL, 1, @@ -874,14 +874,14 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT if (p->opts.debug) { // report on choices of shifts, centers, etc... 
printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", p->t3P.X1, - p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, + p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); if (d > 1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n", p->t3P.X2, - p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2); + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, + p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); if (d > 2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", p->t3P.X3, - p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3); + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, + p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); } p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { @@ -913,6 +913,7 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... + // FIXME: should use realloc if (p->X) free(p->X); if (p->Sp) free(p->Sp); p->X = (FLT *)malloc(sizeof(FLT) * nj); @@ -970,7 +971,6 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < // pi/R } - // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... // (exploits that FT separates because kernel is prod of 1D funcs) if (p->deconv) free(p->deconv); @@ -1158,8 +1158,9 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { #pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? for (int i = 0; i < thisBatchSize; i++) { BIGINT ioff = i * p->nj; - for (BIGINT j = 0; j < p->nj; ++j) + for (BIGINT j = 0; j < p->nj; ++j) { p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; + } } t_pre += timer.elapsedsec(); @@ -1169,10 +1170,6 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed t_spr += timer.elapsedsec(); - // for (int j=0;jnf1;++j) - // printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // - // debug - // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... timer.restart(); // illegal possible shrink of ntrans *after* plan for smaller last batch: @@ -1181,7 +1178,6 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { still the same size, as Andrea explained; just wastes a few flops) */ FINUFFT_EXECUTE(p->innerT2plan, fkb, p->fwBatch); t_t2 += timer.elapsedsec(); - // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... 
timer.restart(); #pragma omp parallel for num_threads(p->opts.nthreads) diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 87cbe8205..1cadb7569 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -5,14 +5,16 @@ foreach(srcfile ${test_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) + target_compile_options(${executable} + PUBLIC $<$:--extended-lambda>) find_library(MathLib m) if(MathLib) target_link_libraries(${executable} PUBLIC cufinufft ${MathLib}) endif() target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( - ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES - "${FINUFFT_CUDA_ARCHITECTURES}") + ${executable} PROPERTIES LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}") message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") @@ -30,6 +32,9 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 1 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} @@ -39,6 +44,14 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d1many_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -55,6 +68,14 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft2d3many_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft2dmany_test 1 3 1e2 2e2 5 0 2e4 ${REQ_TOL} + ${CHECK_TOL} ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft2d3many_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL} + ${CHECK_TOL} ${PREC} ${UPSAMP}) + add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) @@ -71,17 +92,34 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) add_test(NAME cufinufft3d2_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft3d3_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 2 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) endif() add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP}) + + add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100 + ${PREC} ${UPSAMP}) endfunction() +add_test(NAME cufinufft_public_api COMMAND public_api_test) +add_test(NAME cufinufft_makeplan COMMAND test_makeplan) +add_test(NAME cufinufft_math_test COMMAND cufinufft_math_test) + 
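[Editor's note] The tests registered above ultimately exercise the new GPU type-3
call sequence (makeplan with type 3, setpts with target frequencies, execute). A
minimal usage sketch through the public C API follows; it is not part of this PR,
error handling and data setup are elided, the signatures are as in the public
cufinufft.h header at the time of writing, and all pointers are assumed to be
device pointers (e.g. from cudaMalloc or thrust::device_vector).

#include <cufinufft.h>

// 1D type 3: strengths c at nonuniform x_j -> values fk at nonuniform freqs s_k.
int type3_example(int M, int N, double *d_x, double *d_s,
                  cuDoubleComplex *d_c, cuDoubleComplex *d_fk) {
  cufinufft_plan plan;
  int64_t dummy_nmodes[3] = {1, 1, 1}; // mode counts are ignored for type 3
  int ier = cufinufft_makeplan(3, 1, dummy_nmodes, /*iflag*/ 1, /*ntransf*/ 1,
                               /*tol*/ 1e-9, &plan, /*opts*/ NULL);
  if (ier) return ier;
  // for type 3, N and d_s (targets) are used; unused dims are passed as NULL
  ier = cufinufft_setpts(plan, M, d_x, NULL, NULL, N, d_s, NULL, NULL);
  if (!ier) ier = cufinufft_execute(plan, d_c, d_fk);
  cufinufft_destroy(plan);
  return ier;
}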
diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu
index dbd6260ac..52d40ca0e 100644
--- a/test/cuda/cufinufft1d_test.cu
+++ b/test/cuda/cufinufft1d_test.cu
@@ -1,12 +1,12 @@
 #include
 #include

-#include
 #include
 #include
 #include
 #include
+#include

 #include
 #include
@@ -22,11 +22,11 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
   std::cout << std::scientific << std::setprecision(3);
   int ier;

-  thrust::host_vector<T> x(M);
+  thrust::host_vector<T> x(M), s{};
   thrust::host_vector<thrust::complex<T>> c(M);
   thrust::host_vector<thrust::complex<T>> fk(N1);

-  thrust::device_vector<T> d_x(M);
+  thrust::device_vector<T> d_x(M), d_s{};
   thrust::device_vector<thrust::complex<T>> d_c(M);
   thrust::device_vector<thrust::complex<T>> d_fk(N1);
@@ -40,6 +40,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
   for (int i = 0; i < M; i++) {
     x[i] = M_PI * randm11(); // x in [-pi,pi)
   }
+
   if (type == 1) {
     for (int i = 0; i < M; i++) {
       c[i].real(randm11());
@@ -50,6 +51,16 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
       fk[i].real(randm11());
       fk[i].imag(randm11());
     }
+  } else if (type == 3) {
+    for (int i = 0; i < M; i++) {
+      c[i].real(randm11());
+      c[i].imag(randm11());
+    }
+    s.resize(N1);
+    for (int i = 0; i < N1; i++) {
+      s[i] = N1 / 2 * randm11();
+    }
+    d_s = s;
   } else {
     std::cerr << "Invalid type " << type << " supplied\n";
     return 1;
@@ -60,6 +71,8 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
     d_c = c;
   else if (type == 2)
     d_fk = fk;
+  else if (type == 3)
+    d_c = c;

   cudaEvent_t start, stop;
   float milliseconds = 0;
@@ -107,8 +120,8 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
   printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);

   cudaEventRecord(start);
-  ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, 0, NULL, NULL, NULL,
-                              dplan);
+  ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, N1, d_s.data().get(),
+                              NULL, NULL, dplan);

   if (ier != 0) {
     printf("err: cufinufft_setpts\n");
@@ -153,9 +166,15 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
          opts.gpu_method, N1, M, totaltime / 1000, M / totaltime * 1000);
   printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000);

+  if (type == 1)
+    fk = d_fk;
+  else if (type == 2)
+    c = d_c;
+  else if (type == 3)
+    fk = d_fk;
+
   T rel_error = std::numeric_limits<T>::max();
   if (type == 1) {
-    fk = d_fk;
     int nt1 = 0.37 * N1; // choose some mode index to check
     thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
     for (int j = 0; j < M; ++j) Ft += c[j] * exp(J * (nt1 * x[j])); // crude direct
@@ -164,8 +183,6 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
     rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex<T> *)fk.data());
     printf("[gpu   ] one mode: rel err in F[%d] is %.3g\n", nt1, rel_error);
   } else if (type == 2) {
-    c = d_c;
-
     int jt = M / 2; // check arbitrary choice of one targ pt
     thrust::complex<T> J = thrust::complex<T>(0, iflag);
     thrust::complex<T> ct = thrust::complex<T>(0, 0);
@@ -174,6 +191,16 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
     ct += fk[m++] * exp(J * (m1 * x[jt])); // crude direct
     rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
     printf("[gpu   ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error);
+  } else if (type == 3) {
+    int jt = (N1) / 2; // check arbitrary choice of one targ pt
+    thrust::complex<T> J = thrust::complex<T>(0, iflag);
+    thrust::complex<T> Ft = thrust::complex<T>(0, 0);
+
+    for (int j = 0; j < M; ++j) {
+      Ft += c[j] * exp(J * (x[j] * s[jt]));
+    }
+    rel_error = abs(Ft - fk[jt]) / infnorm(N1, (std::complex<T> *)fk.data());
+    printf("[gpu   ] one mode: rel err in F[%d] is %.3g\n", jt, rel_error);
   }

   return std::isnan(rel_error) || rel_error > checktol;
@@ -185,7 +212,7 @@ int main(int argc, char *argv[]) {
            "Arguments:\n"
            "  method: One of\n"
            "    1: nupts driven\n"
-           "  type: Type of transform (1, 2)\n"
+           "  type: Type of transform (1, 2, 3)\n"
            "  N1: Number of fourier modes\n"
            "  M: The number of non-uniform points\n"
            "  tol: NUFFT tolerance\n"
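In outline, driving the new GPU type-3 path from user code mirrors the test
above. A sketch against the public C API (the makeplan/setpts/execute/destroy
names exist in cufinufft.h; exact opts handling and argument types vary by
version, so treat the details as indicative rather than definitive):

    #include <cufinufft.h>

    // Sketch: 1D type 3, double precision; d_x, d_s, d_c, d_fk are device
    // arrays of sources, target freqs, strengths, and outputs. Error
    // reporting elided.
    int type3_sketch(int M, int N, double *d_x, double *d_s,
                     cuDoubleComplex *d_c, cuDoubleComplex *d_fk) {
      cufinufft_plan plan;
      int64_t nmodes[3] = {1, 1, 1};                      // ignored for type 3
      int ier = cufinufft_makeplan(3, 1, nmodes, +1, 1, 1e-9, &plan, NULL);
      if (ier) return ier;
      ier = cufinufft_setpts(plan, M, d_x, NULL, NULL,    // sources x_j
                             N, d_s, NULL, NULL);         // target freqs s_k
      if (!ier) ier = cufinufft_execute(plan, d_c, d_fk); // fk_k = sum_j c_j e^{+i s_k x_j}
      cufinufft_destroy(plan);
      return ier;
    }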
diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu
index f3b767f2e..549508d26 100644
--- a/test/cuda/cufinufft2d_test.cu
+++ b/test/cuda/cufinufft2d_test.cu
@@ -22,10 +22,10 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int
              double upsampfac) {
   std::cout << std::scientific << std::setprecision(3);

-  thrust::host_vector<T> x(M), y(M);
+  thrust::host_vector<T> x(M), y(M), s{}, t{};
   thrust::host_vector<thrust::complex<T>> c(M), fk(N1 * N2);

-  thrust::device_vector<T> d_x(M), d_y(M);
+  thrust::device_vector<T> d_x(M), d_y(M), d_s{}, d_t{};
   thrust::device_vector<thrust::complex<T>> d_c(M), d_fk(N1 * N2);

   std::default_random_engine eng(1);
@@ -49,6 +49,19 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int
       fk[i].real(randm11());
       fk[i].imag(randm11());
     }
+  } else if (type == 3) {
+    for (int i = 0; i < M; i++) {
+      c[i].real(randm11());
+      c[i].imag(randm11());
+    }
+    s.resize(N1 * N2);
+    t.resize(N1 * N2);
+    for (int i = 0; i < N1 * N2; i++) {
+      s[i] = (N1 / 2) * randm11();
+      t[i] = (N2 / 2) * randm11();
+    }
+    d_s = s;
+    d_t = t;
   } else {
     std::cerr << "Invalid type " << type << " supplied\n";
     return 1;
@@ -60,6 +73,8 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int
     d_c = c;
   else if (type == 2)
     d_fk = fk;
+  else if (type == 3)
+    d_c = c;

   cudaEvent_t start, stop;
   float milliseconds = 0;
@@ -106,8 +121,8 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int
   printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);

   cudaEventRecord(start);
-  ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, 0,
-                              nullptr, nullptr, nullptr, dplan);
+  ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, N1 * N2,
+                              d_s.data().get(), d_t.data().get(), nullptr, dplan);
   if (ier != 0) {
     printf("err: cufinufft_setpts\n");
     return ier;
@@ -144,6 +159,8 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int
     fk = d_fk;
   else if (type == 2)
     c = d_c;
+  else if (type == 3)
+    fk = d_fk;

   printf("[Method %d] %d NU pts to %d U pts in %.3g s: %.3g NU pts/s\n",
          opts.gpu_method, M, N1 * N2, totaltime / 1000, M / totaltime * 1000);
@@ -173,8 +190,17 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int
     rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
     printf("[gpu   ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error);
-  }
+  } else if (type == 3) {
+    int jt = (N1 * N2) / 2; // check arbitrary choice of one targ pt
+    thrust::complex<T> J = thrust::complex<T>(0, iflag);
+    thrust::complex<T> Ft = thrust::complex<T>(0, 0);
+    for (int j = 0; j < M; ++j) {
+      Ft += c[j] * exp(J * (x[j] * s[jt] + y[j] * t[jt]));
+    }
+    rel_error = abs(Ft - fk[jt]) / infnorm(N1 * N2, (std::complex<T> *)fk.data());
+    printf("[gpu   ] one mode: rel err in F[%d] is %.3g\n", jt, rel_error);
+  }

   return std::isnan(rel_error) || rel_error > checktol;
 }
@@ -185,7 +211,7 @@ int main(int argc, char *argv[]) {
            "  method: One of\n"
            "    1: nupts driven,\n"
            "    2: sub-problem, or\n"
-           "  type: Type of transform (1, 2)"
+           "  type: Type of transform (1, 2, 3)"
            "  N1, N2: The size of the 2D array\n"
            "  M: The number of non-uniform points\n"
            "  tol: NUFFT tolerance\n"
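All three type-3 accuracy checks follow the same pattern: pick one target
index jt and compare the library output against a crude direct sum. In 2D,
with sigma the iflag sign, the test above computes

    F(s_k, t_k) = \sum_{j=1}^{M} c_j \, e^{i \sigma (s_k x_j + t_k y_j)},
    \mathrm{relerr} = |F(s_{jt}, t_{jt}) - fk_{jt}| / \|fk\|_\infty,

and the run fails if relerr exceeds checktol (or is NaN).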
diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu
index 4afcd97dd..02658b671 100644
--- a/test/cuda/cufinufft2dmany_test.cu
+++ b/test/cuda/cufinufft2dmany_test.cu
@@ -26,10 +26,10 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
   const int N = N1 * N2;
   printf("#modes = %d, #inputs = %d, #NUpts = %d\n", N, ntransf, M);

-  thrust::host_vector<T> x(M), y(M);
+  thrust::host_vector<T> x(M), y(M), s{}, t{};
   thrust::host_vector<thrust::complex<T>> c(M * ntransf), fk(ntransf * N1 * N2);

-  thrust::device_vector<T> d_x(M), d_y(M);
+  thrust::device_vector<T> d_x(M), d_y(M), d_s{}, d_t{};
   thrust::device_vector<thrust::complex<T>> d_c(M * ntransf), d_fk(ntransf * N1 * N2);

   std::default_random_engine eng(1);
@@ -53,6 +53,19 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
       fk[i].real(randm11());
       fk[i].imag(randm11());
     }
+  } else if (type == 3) {
+    for (int i = 0; i < ntransf * M; i++) {
+      c[i].real(randm11());
+      c[i].imag(randm11());
+    }
+    s.resize(N1 * N2);
+    t.resize(N1 * N2);
+    for (int i = 0; i < N1 * N2; i++) {
+      s[i] = M_PI * randm11();
+      t[i] = M_PI * randm11();
+    }
+    d_s = s;
+    d_t = t;
   } else {
     std::cerr << "Invalid type " << type << " supplied\n";
     return 1;
@@ -64,6 +77,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
     d_c = c;
   else if (type == 2)
     d_fk = fk;
+  else if (type == 3)
+    d_c = c;

   cudaEvent_t start, stop;
   float milliseconds = 0;
@@ -109,8 +124,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
   printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);

   cudaEventRecord(start);
-  ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), NULL, 0, NULL,
-                              NULL, NULL, dplan);
+  ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, N1 * N2,
+                              d_s.data().get(), d_t.data().get(), nullptr, dplan);
   if (ier != 0) {
     printf("err: cufinufft_setpts\n");
     return ier;
@@ -137,6 +152,10 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
   cudaEventRecord(start);
   ier = cufinufft_destroy_impl(dplan);
+  if (ier != 0) {
+    printf("err: cufinufft_destroy\n");
+    return ier;
+  }
   cudaEventRecord(stop);
   cudaEventSynchronize(stop);
   cudaEventElapsedTime(&milliseconds, start, stop);
@@ -147,6 +166,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
     fk = d_fk;
   else if (type == 2)
     c = d_c;
+  else if (type == 3)
+    fk = d_fk;

   T rel_error = std::numeric_limits<T>::max();
   if (type == 1) {
@@ -175,6 +196,18 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
     rel_error =
         abs(cstart[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
     printf("[gpu   ] %dth data one targ: rel err in c[%d] is %.3g\n", t, jt, rel_error);
+  } else if (type == 3) {
+    int jt = (N1 * N2) / 2; // check arbitrary choice of one targ pt
+    thrust::complex<T> J = thrust::complex<T>(0, iflag);
+    thrust::complex<T> Ft = thrust::complex<T>(0, 0);
+    thrust::complex<T> *fkstart = fk.data() + (ntransf - 1) * N1 * N2;
+    const thrust::complex<T> *cstart = c.data() + (ntransf - 1) * M;
+
+    for (int j = 0; j < M; ++j) {
+      Ft += cstart[j] * exp(J * (x[j] * s[jt] + y[j] * t[jt]));
+    }
+    rel_error = abs(Ft - fkstart[jt]) / infnorm(N1 * N2, (std::complex<T> *)fk.data());
+    printf("[gpu   ] one mode: rel err in F[%d] is %.3g\n", jt, rel_error);
   }

   printf("[totaltime] %.3g us, speed %.3g NUpts/s\n", totaltime * 1000,
@@ -193,7 +226,7 @@ int main(int argc, char *argv[]) {
            "  method: One of\n"
            "    1: nupts driven,\n"
            "    2: sub-problem, or\n"
-           "  type: Type of transform (1, 2)\n"
+           "  type: Type of transform (1, 2, 3)\n"
            "  N1, N2: The size of the 2D array\n"
            "  ntransf: Number of inputs\n"
            "  maxbatchsize: Number of simultaneous transforms (or 0 for default)\n"
diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu
index 67818c2b2..65b0d7a0c 100644
--- a/test/cuda/cufinufft3d_test.cu
+++ b/test/cuda/cufinufft3d_test.cu
@@ -23,10 +23,10 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check
   std::cout << std::scientific << std::setprecision(3);
   int ier;

-  thrust::host_vector<T> x(M), y(M), z(M);
+  thrust::host_vector<T> x(M), y(M), z(M), s{}, t{}, u{};
   thrust::host_vector<thrust::complex<T>> c(M), fk(N1 * N2 * N3);

-  thrust::device_vector<T> d_x(M), d_y(M), d_z(M);
+  thrust::device_vector<T> d_x(M), d_y(M), d_z(M), d_s{}, d_t{}, d_u{};
   thrust::device_vector<thrust::complex<T>> d_c(M), d_fk(N1 * N2 * N3);

   std::default_random_engine eng(1);
@@ -51,6 +51,22 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check
       fk[i].real(randm11());
       fk[i].imag(randm11());
     }
+  } else if (type == 3) {
+    for (int i = 0; i < M; i++) {
+      c[i].real(randm11());
+      c[i].imag(randm11());
+    }
+    s.resize(N1 * N2 * N3);
+    t.resize(N1 * N2 * N3);
+    u.resize(N1 * N2 * N3);
+    for (int i = 0; i < N1 * N2 * N3; i++) {
+      s[i] = (N1 / 2) * randm11();
+      t[i] = (N2 / 2) * randm11();
+      u[i] = (N3 / 2) * randm11();
+    }
+    d_s = s;
+    d_t = t;
+    d_u = u;
   } else {
     std::cerr << "Invalid type " << type << " supplied\n";
     return 1;
@@ -64,6 +80,8 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check
     d_c = c;
   else if (type == 2)
     d_fk = fk;
+  else if (type == 3)
+    d_c = c;

   cudaEvent_t start, stop;
   float milliseconds = 0;
@@ -112,7 +130,8 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check
   cudaEventRecord(start);
   ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(),
-                              0, nullptr, nullptr, nullptr, dplan);
+                              N1 * N2 * N3, d_s.data().get(), d_t.data().get(),
+                              d_u.data().get(), dplan);
   if (ier != 0) {
     printf("err: cufinufft_setpts\n");
     return ier;
@@ -149,6 +168,8 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check
     fk = d_fk;
   else if (type == 2)
     c = d_c;
+  else if (type == 3)
+    fk = d_fk;

   printf("[Method %d] %d NU pts to %d U pts in %.3g s:\t%.3g NU pts/s\n",
          opts.gpu_method, M, N1 * N2 * N3, totaltime / 1000, M / totaltime * 1000);
@@ -184,6 +205,17 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check
     rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
     printf("[gpu   ] one targ: rel err in c[%ld] is %.3g\n", (int64_t)jt, rel_error);
+  } else if (type == 3) {
+
+    int jt = (N1 * N2 * N3) / 2; // check arbitrary choice of one targ pt
+    thrust::complex<T> J = thrust::complex<T>(0, iflag);
+    thrust::complex<T> Ft = thrust::complex<T>(0, 0);
+
+    for (int j = 0; j < M; ++j) {
+      Ft += c[j] * exp(J * (x[j] * s[jt] + y[j] * t[jt] + z[j] * u[jt]));
+    }
+    rel_error = abs(Ft - fk[jt]) / infnorm(N1 * N2 * N3, (std::complex<T> *)fk.data());
+    printf("[gpu   ] one mode: rel err in F[%d] is %.3g\n", jt, rel_error);
   }

   return std::isnan(rel_error) || rel_error > checktol;
@@ -198,7 +230,7 @@ int main(int argc, char *argv[]) {
            "    1: nupts driven,\n"
            "    2: sub-problem, or\n"
            "    4: block gather.\n"
-           "  type: Type of transform (1, 2)"
+           "  type: Type of transform (1, 2, 3)"
            "  N1, N2, N3: The size of the 3D array\n"
            "  M: The number of non-uniform points\n"
            "  tol: NUFFT tolerance\n"
%.3g\n", (int64_t)jt, rel_error); + } else if (type == 3) { + + int jt = (N1 * N2 * N3) / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex Ft = thrust::complex(0, 0); + + for (int j = 0; j < M; ++j) { + Ft += c[j] * exp(J * (x[j] * s[jt] + y[j] * t[jt] + z[j] * u[jt])); + } + rel_error = abs(Ft - fk[jt]) / infnorm(N1 * N2 * N3, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", jt, rel_error); } return std::isnan(rel_error) || rel_error > checktol; @@ -198,7 +230,7 @@ int main(int argc, char *argv[]) { " 1: nupts driven,\n" " 2: sub-problem, or\n" " 4: block gather.\n" - " type: Type of transform (1, 2)" + " type: Type of transform (1, 2, 3)" " N1, N2, N3: The size of the 3D array\n" " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" diff --git a/test/cuda/cufinufft_math_test.cu b/test/cuda/cufinufft_math_test.cu new file mode 100644 index 000000000..1588abb23 --- /dev/null +++ b/test/cuda/cufinufft_math_test.cu @@ -0,0 +1,137 @@ +#include +#include +#include +#include + +// Include the custom operators for cuComplex +#include +#include + +// Helper function to create cuComplex +template cuda_complex make_cuda_complex(T real, T imag) { + return cuda_complex{real, imag}; +} + +// Helper function to compare cuComplex with std::complex using 1 - ratio as error +template +bool compareComplexRel(const cuda_complex a, const std::complex b, + const std::string &operation, + T epsilon = std::numeric_limits::epsilon()) { + const auto std_a = std::complex(a.x, a.y); + const auto err = std::abs(std_a - b) / std::abs(std_a); + const auto tol = epsilon * T(10); // factor to allow for rounding error + if (err > tol) { + std::cout << "Comparison failed in operation: " << operation << "\n"; + std::cout << "cuComplex: (" << a.x << ", " << a.y << ")\n"; + std::cout << "std::complex: (" << b.real() << ", " << b.imag() << ")\n"; + std::cout << "RelError: " << err << "\n"; + } + return err <= tol; +} + +template +bool compareComplexAbs(const cuda_complex a, const std::complex b, + const std::string &operation, + T epsilon = std::numeric_limits::epsilon()) { + const auto std_a = std::complex(a.x, a.y); + const auto err = std::abs(std_a - b); + const auto tol = epsilon * T(10); // factor to allow for rounding error + if (err > tol) { + std::cout << "Comparison failed in operation: " << operation << "\n"; + std::cout << "cuComplex: (" << a.x << ", " << a.y << ")\n"; + std::cout << "std::complex: (" << b.real() << ", " << b.imag() << ")\n"; + std::cout << "AbsError: " << err << "\n"; + } + return err <= tol; +} + +template int testRandomOperations() { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(-1.0, 1.0); + + for (int i = 0; i < 1000; ++i) { + T real1 = dis(gen); + T imag1 = dis(gen); + T real2 = dis(gen); + T imag2 = dis(gen); + T scalar = dis(gen); + + cuda_complex a = make_cuda_complex(real1, imag1); + cuda_complex b = make_cuda_complex(real2, imag2); + std::complex std_a(real1, imag1); + std::complex std_b(real2, imag2); + + // Test addition + cuda_complex result_add = a + b; + std::complex expected_add = std_a + std_b; + if (!compareComplexAbs(result_add, expected_add, + "add complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test subtraction + cuda_complex result_sub = a - b; + std::complex expected_sub = std_a - std_b; + if (!compareComplexAbs(result_sub, expected_sub, + "sub complex<" + 
std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test multiplication + cuda_complex result_mul = a * b; + std::complex expected_mul = std_a * std_b; + if (!compareComplexRel(result_mul, expected_mul, + "mul complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test division + cuda_complex result_div = a / b; + std::complex expected_div = std_a / std_b; + if (!compareComplexRel(result_div, expected_div, + "div complex<" + std::string(typeid(T).name()) + "> complex<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test addition with scalar + cuda_complex result_add_scalar = a + scalar; + std::complex expected_add_scalar = std_a + scalar; + if (!compareComplexRel(result_add_scalar, expected_add_scalar, + "add complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test subtraction with scalar + cuda_complex result_sub_scalar = a - scalar; + std::complex expected_sub_scalar = std_a - scalar; + if (!compareComplexRel(result_sub_scalar, expected_sub_scalar, + "sub complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; + + // Test multiplication with scalar + cuda_complex result_mul_scalar = a * scalar; + std::complex expected_mul_scalar = std_a * scalar; + if (!compareComplexRel(result_mul_scalar, expected_mul_scalar, + "mul complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; + + cuda_complex result_div_scalar = a / scalar; + std::complex expected_div_scalar = std_a / scalar; + if (!compareComplexRel(result_div_scalar, expected_div_scalar, + "div complex<" + std::string(typeid(T).name()) + "> scalar<" + + std::string(typeid(T).name()) + ">")) + return 1; + } + return 0; +} + +int main() { + if (testRandomOperations()) return 1; + if (testRandomOperations()) return 1; + + std::cout << "All tests passed!" 
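The operators exercised above come from the helper_math.h header this PR adds;
they make cuda_complex<T> usable as a value type on host and device. A minimal
sketch (include paths assumed from the test above; cuda_complex<T> is
cuFINUFFT's complex typedef):

    #include <cufinufft/contrib/helper_math.h>  // operator overloads (this PR)
    #include <cufinufft/types.h>                // assumed home of cuda_complex<T>

    // Each operator matches std::complex semantics, as verified above.
    cuda_complex<double> demo() {
      cuda_complex<double> a{1.0, 2.0}, b{3.0, -1.0};
      return (a * b + a / b - 0.5) * 2.0;
    }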
<< std::endl; + return 0; +} diff --git a/test/cuda/fseries_kernel_test.cu b/test/cuda/fseries_kernel_test.cu deleted file mode 100644 index 7f18ee21c..000000000 --- a/test/cuda/fseries_kernel_test.cu +++ /dev/null @@ -1,158 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -using namespace cufinufft::common; -using namespace cufinufft::spreadinterp; -using namespace cufinufft::utils; - -template int run_test(int nf1, int dim, T eps, int gpu, int nf2, int nf3) { - - finufft_spread_opts opts; - T *fwkerhalf1, *fwkerhalf2, *fwkerhalf3; - T *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3; - checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1))); - if (dim > 1) checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1))); - if (dim > 2) checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1))); - - int ier = setup_spreader(opts, (T)eps, (T)2.0, 0); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - float milliseconds = 0; - float gputime = 0; - float cputime = 0; - - CNTime timer; - if (!gpu) { - timer.start(); - fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); - if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - - onedim_fseries_kernel(nf1, fwkerhalf1, opts); - if (dim > 1) onedim_fseries_kernel(nf2, fwkerhalf2, opts); - if (dim > 2) onedim_fseries_kernel(nf3, fwkerhalf3, opts); - cputime = timer.elapsedsec(); - cudaEventRecord(start); - { - checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), - cudaMemcpyHostToDevice)); - if (dim > 1) - checkCudaErrors(cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), - cudaMemcpyHostToDevice)); - if (dim > 2) - checkCudaErrors(cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), - cudaMemcpyHostToDevice)); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - gputime = milliseconds; - printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, opts.nspread, - gputime + cputime * 1000); - free(fwkerhalf1); - if (dim > 1) free(fwkerhalf2); - if (dim > 2) free(fwkerhalf3); - } else { - timer.start(); - std::complex a[dim * MAX_NQUAD]; - T f[dim * MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf1, f, a, opts); - if (dim > 1) onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); - if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts); - cputime = timer.elapsedsec(); - - cuDoubleComplex *d_a; - T *d_f; - cudaEventRecord(start); - { - checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))); - checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), - cudaMemcpyHostToDevice)); - checkCudaErrors( - cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); - ier = - cufserieskernelcompute(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, - d_fwkerhalf3, opts.nspread, cudaStreamDefault); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - gputime = milliseconds; - printf("[time ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", dim, nf1, opts.nspread, - gputime + cputime * 1000); - cudaFree(d_a); - cudaFree(d_f); - } - - fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - if (dim > 1) fwkerhalf2 = (T 
*)malloc(sizeof(T) * (nf2 / 2 + 1)); - if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - - checkCudaErrors(cudaMemcpy(fwkerhalf1, d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), - cudaMemcpyDeviceToHost)); - if (dim > 1) - checkCudaErrors(cudaMemcpy(fwkerhalf2, d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), - cudaMemcpyDeviceToHost)); - if (dim > 2) - checkCudaErrors(cudaMemcpy(fwkerhalf3, d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), - cudaMemcpyDeviceToHost)); - for (int i = 0; i < nf1 / 2 + 1; i++) printf("%10.8e ", fwkerhalf1[i]); - printf("\n"); - if (dim > 1) - for (int i = 0; i < nf2 / 2 + 1; i++) printf("%10.8e ", fwkerhalf2[i]); - printf("\n"); - if (dim > 2) - for (int i = 0; i < nf3 / 2 + 1; i++) printf("%10.8e ", fwkerhalf3[i]); - printf("\n"); - - return 0; -} - -int main(int argc, char *argv[]) { - if (argc < 3) { - fprintf(stderr, - "Usage: onedim_fseries_kernel_test prec nf1 [dim [tol [gpuversion [nf2 " - "[nf3]]]]]\n" - "Arguments:\n" - " prec: 'f' or 'd' (float/double)\n" - " nf1: The size of the upsampled fine grid size in x.\n" - " dim: Dimension of the nuFFT.\n" - " tol: NUFFT tolerance (default 1e-6).\n" - " gpuversion: Use gpu version or not (default True).\n" - " nf2: The size of the upsampled fine grid size in y. (default nf1)\n" - " nf3: The size of the upsampled fine grid size in z. (default nf3)\n"); - return 1; - } - char prec = argv[1][0]; - int nf1 = std::atof(argv[2]); - int dim = 1; - double eps = 1e-6; - int gpu = 1; - int nf2 = nf1; - int nf3 = nf1; - if (argc > 3) dim = std::atoi(argv[3]); - if (argc > 4) eps = std::atof(argv[4]); - if (argc > 5) gpu = std::atoi(argv[5]); - if (argc > 6) nf2 = std::atoi(argv[6]); - if (argc > 7) nf3 = std::atoi(argv[7]); - - if (prec == 'f') - return run_test(nf1, dim, eps, gpu, nf2, nf3); - else if (prec == 'd') - return run_test(nf1, dim, eps, gpu, nf2, nf3); - else - return -1; -} diff --git a/test/cuda/fseriesperf.sh b/test/cuda/fseriesperf.sh deleted file mode 100755 index 36af42276..000000000 --- a/test/cuda/fseriesperf.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# basic perf test of compute fseries for 1d, single/double -# Melody 02/20/22 - -BINDIR=./ - -BIN=$BINDIR/fseries_kernel_test -DIM=1 - -echo "Double.............................................." -for N in 1e2 5e2 1e3 2e3 5e3 1e4 5e4 1e5 5e5 -do - for TOL in 1e-8 - do - $BIN $N $DIM $TOL 0 - $BIN $N $DIM $TOL 1 - done -done - -BIN=$BINDIR/fseries_kernel_testf -echo "Single.............................................." -for N in 1e2 5e2 1e3 2e3 5e3 1e4 5e4 1e5 5e5 -do - for TOL in 1e-6 - do - $BIN $N $DIM $TOL 0 - $BIN $N $DIM $TOL 1 - done -done