From 2726bc383a12d14411b8c5e1f63d97c736a894c8 Mon Sep 17 00:00:00 2001
From: Martin Reinecke <martin@mpa-garching.mpg.de>
Date: Tue, 22 Oct 2024 19:20:51 +0200
Subject: [PATCH] fix PR

---
 CMakeLists.txt                                |  10 +-
 fortran/finufftfort.cpp                       | 221 +++++------
 include/finufft.h                             |   3 +-
 include/finufft/defs.h                        | 137 -------
 include/finufft/dirft.h                       |  42 ++-
 include/finufft/fft.h                         |   9 +
 include/finufft/finufft_core.h                |  22 +-
 include/finufft/spreadinterp.h                |   3 +-
 include/finufft/test_defs.h                   | 113 +++++-
 include/finufft/utils.h                       |  35 +-
 include/finufft_eitherprec.h                  |   7 +-
 makefile                                      |   8 +-
 perftest/manysmallprobs.cpp                   |   2 +-
 perftest/spreadtestnd.cpp                     |   2 +-
 perftest/spreadtestndall.cpp                  |   2 +-
 src/{simpleinterfaces.cpp => c_interface.cpp} | 248 ++++--------
 src/{finufft_core.cpp => finufft.cpp}         | 357 ++++++++----------
 src/spreadinterp.cpp                          | 125 +++---
 src/utils.cpp                                 |   2 +-
 test/directft/dirft1d.cpp                     |  38 +-
 test/directft/dirft2d.cpp                     |  66 ++--
 test/directft/dirft3d.cpp                     |  79 ++--
 test/dumbinputs.cpp                           |   2 +-
 23 files changed, 696 insertions(+), 837 deletions(-)
 delete mode 100644 include/finufft/defs.h
 rename src/{simpleinterfaces.cpp => c_interface.cpp} (55%)
 rename src/{finufft_core.cpp => finufft.cpp} (82%)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e5e2cf5d..3ea8b76c7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -169,7 +169,7 @@ if(FINUFFT_USE_CPU)
   set(XTL_VERSION 0.7.7)
   set(XSIMD_VERSION 13.0.0)
   # using latest ducc0 version for now as it fixes MacOS GCC build
-  set(DUCC0_VERSION b0beb85e03982344a31ebb119758d7aa3ef5d362)
+  set(DUCC0_VERSION ducc0_0_35_0)
   set(FINUFFT_FFTW_LIBRARIES)
   include(cmake/setupXSIMD.cmake)
   if(FINUFFT_USE_DUCC0)
@@ -257,8 +257,8 @@ if(FINUFFT_USE_CPU)
       src/utils.cpp
       contrib/legendre_rule_fast.cpp
       src/fft.cpp
-      src/finufft_core.cpp
-      src/simpleinterfaces.cpp
+      src/finufft.cpp
+      src/c_interface.cpp
       fortran/finufftfort.cpp)
   else()
     add_library(
@@ -267,8 +267,8 @@ if(FINUFFT_USE_CPU)
       src/utils.cpp
       contrib/legendre_rule_fast.cpp
       src/fft.cpp
-      src/finufft_core.cpp
-      src/simpleinterfaces.cpp
+      src/finufft.cpp
+      src/c_interface.cpp
       fortran/finufftfort.cpp)
   endif()
   set_finufft_options(finufft)
diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp
index 400ff0985..059b44c4c 100644
--- a/fortran/finufftfort.cpp
+++ b/fortran/finufftfort.cpp
@@ -21,13 +21,19 @@
 #include <finufft.h>
 #include <finufft/finufft_core.h>
 
+using f32  = float;
+using f64  = double;
+using c64  = std::complex<float>;
+using c128 = std::complex<double>;
+using i64  = BIGINT;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 // --------------------- guru interface from fortran ------------------------
-void finufft_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf,
-                       double *tol, finufft_plan *plan, finufft_opts *o, int *ier) {
+void finufft_makeplan_(int *type, int *n_dims, i64 *n_modes, int *iflag, int *n_transf,
+                       f64 *tol, finufft_plan *plan, finufft_opts *o, int *ier) {
   if (!plan)
     fprintf(stderr,
             "%s fortran: plan must be allocated as at least the size of a C pointer "
@@ -39,8 +45,8 @@ void finufft_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int
   }
 }
 
-void finufft_setpts_(finufft_plan *plan, BIGINT *M, double *xj, double *yj, double *zj,
-                     BIGINT *nk, double *s, double *t, double *u, int *ier) {
+void finufft_setpts_(finufft_plan *plan, i64 *M, f64 *xj, f64 *yj, f64 *zj, i64 *nk,
+                     f64 *s, f64 *t, f64 *u, int *ier) {
   if (!*plan) {
     fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__);
     return;
@@ -50,8 +56,7 @@ void finufft_setpts_(finufft_plan *plan, BIGINT *M, double *xj, double *yj, doub
   *ier = finufft_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u);
 }
 
-void finufft_execute_(finufft_plan *plan, std::complex<double> *weights,
-                      std::complex<double> *result, int *ier) {
+void finufft_execute_(finufft_plan *plan, c128 *weights, c128 *result, int *ier) {
   if (!plan)
     fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__);
   else
@@ -77,124 +82,104 @@ void finufft_default_opts_(finufft_opts *o) {
 
 // -------------- simple and many-vector interfaces --------------------
 // --- 1D ---
-void finufft1d1_(BIGINT *nj, double *xj, std::complex<double> *cj, int *iflag,
-                 double *eps, BIGINT *ms, std::complex<double> *fk, finufft_opts *o,
-                 int *ier) {
+void finufft1d1_(i64 *nj, f64 *xj, c128 *cj, int *iflag, f64 *eps, i64 *ms, c128 *fk,
+                 finufft_opts *o, int *ier) {
   *ier = finufft1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void finufft1d1many_(int *ntransf, BIGINT *nj, double *xj, std::complex<double> *cj,
-                     int *iflag, double *eps, BIGINT *ms, std::complex<double> *fk,
-                     finufft_opts *o, int *ier) {
+void finufft1d1many_(int *ntransf, i64 *nj, f64 *xj, c128 *cj, int *iflag, f64 *eps,
+                     i64 *ms, c128 *fk, finufft_opts *o, int *ier) {
   *ier = finufft1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void finufft1d2_(BIGINT *nj, double *xj, std::complex<double> *cj, int *iflag,
-                 double *eps, BIGINT *ms, std::complex<double> *fk, finufft_opts *o,
-                 int *ier) {
+void finufft1d2_(i64 *nj, f64 *xj, c128 *cj, int *iflag, f64 *eps, i64 *ms, c128 *fk,
+                 finufft_opts *o, int *ier) {
   *ier = finufft1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void finufft1d2many_(int *ntransf, BIGINT *nj, double *xj, std::complex<double> *cj,
-                     int *iflag, double *eps, BIGINT *ms, std::complex<double> *fk,
-                     finufft_opts *o, int *ier) {
+void finufft1d2many_(int *ntransf, i64 *nj, f64 *xj, c128 *cj, int *iflag, f64 *eps,
+                     i64 *ms, c128 *fk, finufft_opts *o, int *ier) {
   *ier = finufft1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void finufft1d3_(BIGINT *nj, double *x, std::complex<double> *c, int *iflag, double *eps,
-                 BIGINT *nk, double *s, std::complex<double> *f, finufft_opts *o,
-                 int *ier) {
+void finufft1d3_(i64 *nj, f64 *x, c128 *c, int *iflag, f64 *eps, i64 *nk, f64 *s, c128 *f,
+                 finufft_opts *o, int *ier) {
   *ier = finufft1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o);
 }
 
-void finufft1d3many_(int *ntransf, BIGINT *nj, double *x, std::complex<double> *c,
-                     int *iflag, double *eps, BIGINT *nk, double *s,
-                     std::complex<double> *f, finufft_opts *o, int *ier) {
+void finufft1d3many_(int *ntransf, i64 *nj, f64 *x, c128 *c, int *iflag, f64 *eps,
+                     i64 *nk, f64 *s, c128 *f, finufft_opts *o, int *ier) {
   *ier = finufft1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o);
 }
 
 // --- 2D ---
-void finufft2d1_(BIGINT *nj, double *xj, double *yj, std::complex<double> *cj, int *iflag,
-                 double *eps, BIGINT *ms, BIGINT *mt, std::complex<double> *fk,
-                 finufft_opts *o, int *ier) {
+void finufft2d1_(i64 *nj, f64 *xj, f64 *yj, c128 *cj, int *iflag, f64 *eps, i64 *ms,
+                 i64 *mt, c128 *fk, finufft_opts *o, int *ier) {
   *ier = finufft2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
-void finufft2d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj,
-                     std::complex<double> *cj, int *iflag, double *eps, BIGINT *ms,
-                     BIGINT *mt, std::complex<double> *fk, finufft_opts *o, int *ier) {
+void finufft2d1many_(int *ntransf, i64 *nj, f64 *xj, f64 *yj, c128 *cj, int *iflag,
+                     f64 *eps, i64 *ms, i64 *mt, c128 *fk, finufft_opts *o, int *ier) {
   *ier = finufft2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
 
-void finufft2d2_(BIGINT *nj, double *xj, double *yj, std::complex<double> *cj, int *iflag,
-                 double *eps, BIGINT *ms, BIGINT *mt, std::complex<double> *fk,
-                 finufft_opts *o, int *ier) {
+void finufft2d2_(i64 *nj, f64 *xj, f64 *yj, c128 *cj, int *iflag, f64 *eps, i64 *ms,
+                 i64 *mt, c128 *fk, finufft_opts *o, int *ier) {
   *ier = finufft2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
-void finufft2d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj,
-                     std::complex<double> *cj, int *iflag, double *eps, BIGINT *ms,
-                     BIGINT *mt, std::complex<double> *fk, finufft_opts *o, int *ier) {
+void finufft2d2many_(int *ntransf, i64 *nj, f64 *xj, f64 *yj, c128 *cj, int *iflag,
+                     f64 *eps, i64 *ms, i64 *mt, c128 *fk, finufft_opts *o, int *ier) {
   *ier = finufft2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
 
-void finufft2d3_(BIGINT *nj, double *x, double *y, std::complex<double> *c, int *iflag,
-                 double *eps, BIGINT *nk, double *s, double *t, std::complex<double> *f,
-                 finufft_opts *o, int *ier) {
+void finufft2d3_(i64 *nj, f64 *x, f64 *y, c128 *c, int *iflag, f64 *eps, i64 *nk, f64 *s,
+                 f64 *t, c128 *f, finufft_opts *o, int *ier) {
   *ier = finufft2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o);
 }
 
-void finufft2d3many_(int *ntransf, BIGINT *nj, double *x, double *y,
-                     std::complex<double> *c, int *iflag, double *eps, BIGINT *nk,
-                     double *s, double *t, std::complex<double> *f, finufft_opts *o,
-                     int *ier) {
+void finufft2d3many_(int *ntransf, i64 *nj, f64 *x, f64 *y, c128 *c, int *iflag, f64 *eps,
+                     i64 *nk, f64 *s, f64 *t, c128 *f, finufft_opts *o, int *ier) {
   *ier = finufft2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o);
 }
 
 // --- 3D ---
-void finufft3d1_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex<double> *cj,
-                 int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu,
-                 std::complex<double> *fk, finufft_opts *o, int *ier) {
+void finufft3d1_(i64 *nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int *iflag, f64 *eps,
+                 i64 *ms, i64 *mt, i64 *mu, c128 *fk, finufft_opts *o, int *ier) {
   *ier = finufft3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void finufft3d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj,
-                     std::complex<double> *cj, int *iflag, double *eps, BIGINT *ms,
-                     BIGINT *mt, BIGINT *mu, std::complex<double> *fk, finufft_opts *o,
-                     int *ier) {
+void finufft3d1many_(int *ntransf, i64 *nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj,
+                     int *iflag, f64 *eps, i64 *ms, i64 *mt, i64 *mu, c128 *fk,
+                     finufft_opts *o, int *ier) {
   *ier =
       finufft3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void finufft3d2_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex<double> *cj,
-                 int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu,
-                 std::complex<double> *fk, finufft_opts *o, int *ier) {
+void finufft3d2_(i64 *nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int *iflag, f64 *eps,
+                 i64 *ms, i64 *mt, i64 *mu, c128 *fk, finufft_opts *o, int *ier) {
   *ier = finufft3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void finufft3d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj,
-                     std::complex<double> *cj, int *iflag, double *eps, BIGINT *ms,
-                     BIGINT *mt, BIGINT *mu, std::complex<double> *fk, finufft_opts *o,
-                     int *ier) {
+void finufft3d2many_(int *ntransf, i64 *nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj,
+                     int *iflag, f64 *eps, i64 *ms, i64 *mt, i64 *mu, c128 *fk,
+                     finufft_opts *o, int *ier) {
   *ier =
       finufft3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void finufft3d3_(BIGINT *nj, double *x, double *y, double *z, std::complex<double> *c,
-                 int *iflag, double *eps, BIGINT *nk, double *s, double *t, double *u,
-                 std::complex<double> *f, finufft_opts *o, int *ier) {
+void finufft3d3_(i64 *nj, f64 *x, f64 *y, f64 *z, c128 *c, int *iflag, f64 *eps, i64 *nk,
+                 f64 *s, f64 *t, f64 *u, c128 *f, finufft_opts *o, int *ier) {
   *ier = finufft3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o);
 }
 
-void finufft3d3many_(int *ntransf, BIGINT *nj, double *x, double *y, double *z,
-                     std::complex<double> *c, int *iflag, double *eps, BIGINT *nk,
-                     double *s, double *t, double *u, std::complex<double> *f,
-                     finufft_opts *o, int *ier) {
+void finufft3d3many_(int *ntransf, i64 *nj, f64 *x, f64 *y, f64 *z, c128 *c, int *iflag,
+                     f64 *eps, i64 *nk, f64 *s, f64 *t, f64 *u, c128 *f, finufft_opts *o,
+                     int *ier) {
   *ier = finufft3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o);
 }
 
 // --------------------- guru interface from fortran ------------------------
-void finufftf_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag,
-                        int *n_transf, float *tol, finufftf_plan *plan, finufft_opts *o,
-                        int *ier) {
+void finufftf_makeplan_(int *type, int *n_dims, i64 *n_modes, int *iflag, int *n_transf,
+                        f32 *tol, finufftf_plan *plan, finufft_opts *o, int *ier) {
   if (!plan)
     fprintf(stderr,
             "%s fortran: plan must be allocated as at least the size of a C pointer "
@@ -206,8 +191,8 @@ void finufftf_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag,
   }
 }
 
-void finufftf_setpts_(finufftf_plan *plan, BIGINT *M, float *xj, float *yj, float *zj,
-                      BIGINT *nk, float *s, float *t, float *u, int *ier) {
+void finufftf_setpts_(finufftf_plan *plan, i64 *M, f32 *xj, f32 *yj, f32 *zj, i64 *nk,
+                      f32 *s, f32 *t, f32 *u, int *ier) {
   if (!*plan) {
     fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__);
     return;
@@ -217,8 +202,7 @@ void finufftf_setpts_(finufftf_plan *plan, BIGINT *M, float *xj, float *yj, floa
   *ier = finufftf_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u);
 }
 
-void finufftf_execute_(finufftf_plan *plan, std::complex<float> *weights,
-                       std::complex<float> *result, int *ier) {
+void finufftf_execute_(finufftf_plan *plan, c64 *weights, c64 *result, int *ier) {
   if (!plan)
     fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__);
   else
@@ -244,115 +228,98 @@ void finufftf_default_opts_(finufft_opts *o) {
 
 // -------------- simple and many-vector interfaces --------------------
 // --- 1D ---
-void finufftf1d1_(BIGINT *nj, float *xj, std::complex<float> *cj, int *iflag, float *eps,
-                  BIGINT *ms, std::complex<float> *fk, finufft_opts *o, int *ier) {
+void finufftf1d1_(i64 *nj, f32 *xj, c64 *cj, int *iflag, f32 *eps, i64 *ms, c64 *fk,
+                  finufft_opts *o, int *ier) {
   *ier = finufftf1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void finufftf1d1many_(int *ntransf, BIGINT *nj, float *xj, std::complex<float> *cj,
-                      int *iflag, float *eps, BIGINT *ms, std::complex<float> *fk,
-                      finufft_opts *o, int *ier) {
+void finufftf1d1many_(int *ntransf, i64 *nj, f32 *xj, c64 *cj, int *iflag, f32 *eps,
+                      i64 *ms, c64 *fk, finufft_opts *o, int *ier) {
   *ier = finufftf1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void finufftf1d2_(BIGINT *nj, float *xj, std::complex<float> *cj, int *iflag, float *eps,
-                  BIGINT *ms, std::complex<float> *fk, finufft_opts *o, int *ier) {
+void finufftf1d2_(i64 *nj, f32 *xj, c64 *cj, int *iflag, f32 *eps, i64 *ms, c64 *fk,
+                  finufft_opts *o, int *ier) {
   *ier = finufftf1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void finufftf1d2many_(int *ntransf, BIGINT *nj, float *xj, std::complex<float> *cj,
-                      int *iflag, float *eps, BIGINT *ms, std::complex<float> *fk,
-                      finufft_opts *o, int *ier) {
+void finufftf1d2many_(int *ntransf, i64 *nj, f32 *xj, c64 *cj, int *iflag, f32 *eps,
+                      i64 *ms, c64 *fk, finufft_opts *o, int *ier) {
   *ier = finufftf1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void finufftf1d3_(BIGINT *nj, float *x, std::complex<float> *c, int *iflag, float *eps,
-                  BIGINT *nk, float *s, std::complex<float> *f, finufft_opts *o,
-                  int *ier) {
+void finufftf1d3_(i64 *nj, f32 *x, c64 *c, int *iflag, f32 *eps, i64 *nk, f32 *s, c64 *f,
+                  finufft_opts *o, int *ier) {
   *ier = finufftf1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o);
 }
 
-void finufftf1d3many_(int *ntransf, BIGINT *nj, float *x, std::complex<float> *c,
-                      int *iflag, float *eps, BIGINT *nk, float *s,
-                      std::complex<float> *f, finufft_opts *o, int *ier) {
+void finufftf1d3many_(int *ntransf, i64 *nj, f32 *x, c64 *c, int *iflag, f32 *eps,
+                      i64 *nk, f32 *s, c64 *f, finufft_opts *o, int *ier) {
   *ier = finufftf1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o);
 }
 
 // --- 2D ---
-void finufftf2d1_(BIGINT *nj, float *xj, float *yj, std::complex<float> *cj, int *iflag,
-                  float *eps, BIGINT *ms, BIGINT *mt, std::complex<float> *fk,
-                  finufft_opts *o, int *ier) {
+void finufftf2d1_(i64 *nj, f32 *xj, f32 *yj, c64 *cj, int *iflag, f32 *eps, i64 *ms,
+                  i64 *mt, c64 *fk, finufft_opts *o, int *ier) {
   *ier = finufftf2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
-void finufftf2d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj,
-                      std::complex<float> *cj, int *iflag, float *eps, BIGINT *ms,
-                      BIGINT *mt, std::complex<float> *fk, finufft_opts *o, int *ier) {
+void finufftf2d1many_(int *ntransf, i64 *nj, f32 *xj, f32 *yj, c64 *cj, int *iflag,
+                      f32 *eps, i64 *ms, i64 *mt, c64 *fk, finufft_opts *o, int *ier) {
   *ier = finufftf2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
 
-void finufftf2d2_(BIGINT *nj, float *xj, float *yj, std::complex<float> *cj, int *iflag,
-                  float *eps, BIGINT *ms, BIGINT *mt, std::complex<float> *fk,
-                  finufft_opts *o, int *ier) {
+void finufftf2d2_(i64 *nj, f32 *xj, f32 *yj, c64 *cj, int *iflag, f32 *eps, i64 *ms,
+                  i64 *mt, c64 *fk, finufft_opts *o, int *ier) {
   *ier = finufftf2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
-void finufftf2d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj,
-                      std::complex<float> *cj, int *iflag, float *eps, BIGINT *ms,
-                      BIGINT *mt, std::complex<float> *fk, finufft_opts *o, int *ier) {
+void finufftf2d2many_(int *ntransf, i64 *nj, f32 *xj, f32 *yj, c64 *cj, int *iflag,
+                      f32 *eps, i64 *ms, i64 *mt, c64 *fk, finufft_opts *o, int *ier) {
   *ier = finufftf2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
 
-void finufftf2d3_(BIGINT *nj, float *x, float *y, std::complex<float> *c, int *iflag,
-                  float *eps, BIGINT *nk, float *s, float *t, std::complex<float> *f,
-                  finufft_opts *o, int *ier) {
+void finufftf2d3_(i64 *nj, f32 *x, f32 *y, c64 *c, int *iflag, f32 *eps, i64 *nk, f32 *s,
+                  f32 *t, c64 *f, finufft_opts *o, int *ier) {
   *ier = finufftf2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o);
 }
 
-void finufftf2d3many_(int *ntransf, BIGINT *nj, float *x, float *y,
-                      std::complex<float> *c, int *iflag, float *eps, BIGINT *nk,
-                      float *s, float *t, std::complex<float> *f, finufft_opts *o,
-                      int *ier) {
+void finufftf2d3many_(int *ntransf, i64 *nj, f32 *x, f32 *y, c64 *c, int *iflag, f32 *eps,
+                      i64 *nk, f32 *s, f32 *t, c64 *f, finufft_opts *o, int *ier) {
   *ier = finufftf2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o);
 }
 
 // --- 3D ---
-void finufftf3d1_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex<float> *cj,
-                  int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu,
-                  std::complex<float> *fk, finufft_opts *o, int *ier) {
+void finufftf3d1_(i64 *nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int *iflag, f32 *eps,
+                  i64 *ms, i64 *mt, i64 *mu, c64 *fk, finufft_opts *o, int *ier) {
   *ier = finufftf3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void finufftf3d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj,
-                      std::complex<float> *cj, int *iflag, float *eps, BIGINT *ms,
-                      BIGINT *mt, BIGINT *mu, std::complex<float> *fk, finufft_opts *o,
-                      int *ier) {
+void finufftf3d1many_(int *ntransf, i64 *nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj,
+                      int *iflag, f32 *eps, i64 *ms, i64 *mt, i64 *mu, c64 *fk,
+                      finufft_opts *o, int *ier) {
   *ier =
       finufftf3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void finufftf3d2_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex<float> *cj,
-                  int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu,
-                  std::complex<float> *fk, finufft_opts *o, int *ier) {
+void finufftf3d2_(i64 *nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int *iflag, f32 *eps,
+                  i64 *ms, i64 *mt, i64 *mu, c64 *fk, finufft_opts *o, int *ier) {
   *ier = finufftf3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void finufftf3d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj,
-                      std::complex<float> *cj, int *iflag, float *eps, BIGINT *ms,
-                      BIGINT *mt, BIGINT *mu, std::complex<float> *fk, finufft_opts *o,
-                      int *ier) {
+void finufftf3d2many_(int *ntransf, i64 *nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj,
+                      int *iflag, f32 *eps, i64 *ms, i64 *mt, i64 *mu, c64 *fk,
+                      finufft_opts *o, int *ier) {
   *ier =
       finufftf3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void finufftf3d3_(BIGINT *nj, float *x, float *y, float *z, std::complex<float> *c,
-                  int *iflag, float *eps, BIGINT *nk, float *s, float *t, float *u,
-                  std::complex<float> *f, finufft_opts *o, int *ier) {
+void finufftf3d3_(i64 *nj, f32 *x, f32 *y, f32 *z, c64 *c, int *iflag, f32 *eps, i64 *nk,
+                  f32 *s, f32 *t, f32 *u, c64 *f, finufft_opts *o, int *ier) {
   *ier = finufftf3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o);
 }
 
-void finufftf3d3many_(int *ntransf, BIGINT *nj, float *x, float *y, float *z,
-                      std::complex<float> *c, int *iflag, float *eps, BIGINT *nk,
-                      float *s, float *t, float *u, std::complex<float> *f,
-                      finufft_opts *o, int *ier) {
+void finufftf3d3many_(int *ntransf, i64 *nj, f32 *x, f32 *y, f32 *z, c64 *c, int *iflag,
+                      f32 *eps, i64 *nk, f32 *s, f32 *t, f32 *u, c64 *f, finufft_opts *o,
+                      int *ier) {
   *ier = finufftf3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o);
 }
 
diff --git a/include/finufft.h b/include/finufft.h
index 18eab5921..ab1da910d 100644
--- a/include/finufft.h
+++ b/include/finufft.h
@@ -2,8 +2,7 @@
 
 // This contains both single and double precision user-facing commands.
 // "macro-safe" rewrite, including the plan object, Barnett 5/21/22-6/7/22.
-// They will clobber any prior macros starting FINUFFT*, so in the lib/test
-// sources finufft.h must be included before defs.h
+// They will clobber any prior macros starting FINUFFT*.
 
 /* Devnotes.
    A) Two precisions done by including the "either precision" headers twice.
diff --git a/include/finufft/defs.h b/include/finufft/defs.h
deleted file mode 100644
index 633a7c1f3..000000000
--- a/include/finufft/defs.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// Library private definitions & macros; also used by some test routines.
-// If SINGLE defined, chooses single prec, otherwise double prec.
-// Must be #included *after* finufft.h which clobbers FINUFFT* macros
-// (see discussion near line 145 of this file).
-
-// Split out by Joakim Anden, Alex Barnett 9/20/18-9/24/18.
-// Merged in dataTypes, private/public header split, clean. Barnett 6/7/22.
-// finufft_plan_s made private, Wenda's idea.  Barnett 8/8/22.
-
-/* Devnotes:
-   1) Only need work for C++ since that's how compiled, including f_plan_s.
-   (But we use C-style templating, following fftw, etc.)
-*/
-
-#ifndef DEFS_H
-#define DEFS_H
-
-// public header gives access to f_opts, f_spread_opts, f_plan...
-// (and clobbers FINUFFT* macros; watch out!)
-#include <finufft.h>
-#include <finufft/finufft_core.h>
-#include <memory>
-
-// --------------- Private data types for compilation in either prec ---------
-// Devnote: must match those in relevant prec of public finufft.h interface!
-
-// All indexing in library that potentially can exceed 2^31 uses 64-bit signed.
-// This includes all calling arguments (eg M,N) that could be huge someday.
-// Precision-independent real and complex types, for private lib/test compile
-#ifdef SINGLE
-using FLT = float;
-#else
-using FLT = double;
-#endif
-#include <complex> // we define C++ complex type only
-using CPX = std::complex<FLT>;
-
-// -------------- Math consts (not in math.h) and useful math macros ----------
-#include <cmath>
-
-// either-precision unit imaginary number...
-#define IMA (CPX(0.0, 1.0))
-
-// MR: In the longer term I suggest to move
-// away from M_PI, which was never part of the standard.
-// Perhaps a constexpr pi in the namespace finufft, or a constexpr finufft_pi
-// if no namespaces are used?
-// In C++20 these constants will be part of the language, and the problem will go away.
-#ifndef M_PI // Windows apparently doesn't have this const
-#define M_PI 3.14159265358979329
-#endif
-#define M_1_2PI 0.159154943091895336
-#define M_2PI   6.28318530717958648
-// to avoid mixed precision operators in eg i*pi, an either-prec PI...
-#define PI      FLT(M_PI)
-
-// Random numbers: crappy unif random number generator in [0,1).
-// These macros should probably be replaced by modern C++ std lib or random123.
-// (RAND_MAX is in stdlib.h)
-#include <stdlib.h>
-static inline FLT rand01 [[maybe_unused]] () { return FLT(rand()) / FLT(RAND_MAX); }
-// unif[-1,1]:
-static inline FLT randm11 [[maybe_unused]] () { return 2 * rand01() - FLT(1); }
-// complex unif[-1,1] for Re and Im:
-static inline CPX crandm11 [[maybe_unused]] () { return randm11() + IMA * randm11(); }
-
-// Thread-safe seed-carrying versions of above (x is ptr to seed)...
-// MR: we have to leave those as macros for now, as "rand_r" is deprecated
-// and apparently no longer available on Windows.
-#if 1
-#define rand01r(x)   ((FLT)rand_r(x) / (FLT)RAND_MAX)
-// unif[-1,1]:
-#define randm11r(x)  (2 * rand01r(x) - (FLT)1.0)
-// complex unif[-1,1] for Re and Im:
-#define crandm11r(x) (randm11r(x) + IMA * randm11r(x))
-#else
-static inline FLT rand01r [[maybe_unused]] (unsigned int *x) {
-  return FLT(rand_r(x)) / FLT(RAND_MAX);
-}
-// unif[-1,1]:
-static inline FLT randm11r [[maybe_unused]] (unsigned int *x) {
-  return 2 * rand01r(x) - FLT(1);
-}
-// complex unif[-1,1] for Re and Im:
-static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) {
-  return randm11r(x) + IMA * randm11r(x);
-}
-#endif
-
-// Prec-switching name macros (respond to SINGLE), used in lib & test sources
-// and the plan object below.
-// Note: crucially, these are now indep of macros used to gen public finufft.h!
-// However, some overlap in name (FINUFFTIFY*, FINUFFT_PLAN*), meaning
-// finufft.h cannot be included after defs.h since it undefs these overlaps :(
-#ifdef SINGLE
-// a macro to prepend finufft or finufftf to a string without a space.
-// The 2nd level of indirection is needed for safety, see:
-// https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting
-#define FINUFFTIFY_UNSAFE(x) finufftf##x
-#else
-#define FINUFFTIFY_UNSAFE(x) finufft##x
-#endif
-#define FINUFFTIFY(x)        FINUFFTIFY_UNSAFE(x)
-// Now use the above tool to set up 2020-style macros used in tester source...
-#define FINUFFT_PLAN         FINUFFTIFY(_plan)
-#define FINUFFT_PLAN_S       FINUFFTIFY(_plan_s)
-#define FINUFFT_DEFAULT_OPTS FINUFFTIFY(_default_opts)
-#define FINUFFT_MAKEPLAN     FINUFFTIFY(_makeplan)
-#define FINUFFT_SETPTS       FINUFFTIFY(_setpts)
-#define FINUFFT_EXECUTE      FINUFFTIFY(_execute)
-#define FINUFFT_DESTROY      FINUFFTIFY(_destroy)
-#define FINUFFT1D1           FINUFFTIFY(1d1)
-#define FINUFFT1D2           FINUFFTIFY(1d2)
-#define FINUFFT1D3           FINUFFTIFY(1d3)
-#define FINUFFT2D1           FINUFFTIFY(2d1)
-#define FINUFFT2D2           FINUFFTIFY(2d2)
-#define FINUFFT2D3           FINUFFTIFY(2d3)
-#define FINUFFT3D1           FINUFFTIFY(3d1)
-#define FINUFFT3D2           FINUFFTIFY(3d2)
-#define FINUFFT3D3           FINUFFTIFY(3d3)
-#define FINUFFT1D1MANY       FINUFFTIFY(1d1many)
-#define FINUFFT1D2MANY       FINUFFTIFY(1d2many)
-#define FINUFFT1D3MANY       FINUFFTIFY(1d3many)
-#define FINUFFT2D1MANY       FINUFFTIFY(2d1many)
-#define FINUFFT2D2MANY       FINUFFTIFY(2d2many)
-#define FINUFFT2D3MANY       FINUFFTIFY(2d3many)
-#define FINUFFT3D1MANY       FINUFFTIFY(3d1many)
-#define FINUFFT3D2MANY       FINUFFTIFY(3d2many)
-#define FINUFFT3D3MANY       FINUFFTIFY(3d3many)
-
-// --------  FINUFFT's plan object, prec-switching version ------------------
-// NB: now private (the public C++ or C etc user sees an opaque pointer to it)
-
-#include <finufft/fft.h> // (must come after complex.h)
-struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T<FLT> {};
-
-#endif // DEFS_H
diff --git a/include/finufft/dirft.h b/include/finufft/dirft.h
index 5d13265a4..2449d864e 100644
--- a/include/finufft/dirft.h
+++ b/include/finufft/dirft.h
@@ -1,22 +1,36 @@
 #ifndef DIRFT_H
 #define DIRFT_H
 
-#include <finufft/defs.h>
+#include <finufft/finufft_core.h>
 
-void dirft1d1(BIGINT nj, FLT *x, CPX *c, int isign, BIGINT ms, CPX *f);
-void dirft1d2(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f);
-void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f);
+template<typename T>
+void dirft1d1(BIGINT nj, T *x, std::complex<T> *c, int isign, BIGINT ms,
+              std::complex<T> *f);
+template<typename T>
+void dirft1d2(BIGINT nj, T *x, std::complex<T> *c, int iflag, BIGINT ms,
+              std::complex<T> *f);
+template<typename T>
+void dirft1d3(BIGINT nj, T *x, std::complex<T> *c, int iflag, BIGINT nk, T *s,
+              std::complex<T> *f);
 
-void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f);
-void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f);
-void dirft2d3(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT nk, FLT *s, FLT *t,
-              CPX *f);
+template<typename T>
+void dirft2d1(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT ms, BIGINT mt,
+              std::complex<T> *f);
+template<typename T>
+void dirft2d2(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT ms, BIGINT mt,
+              std::complex<T> *f);
+template<typename T>
+void dirft2d3(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT nk, T *s, T *t,
+              std::complex<T> *f);
 
-void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt,
-              BIGINT mu, CPX *f);
-void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt,
-              BIGINT mu, CPX *f);
-void dirft3d3(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT nk, FLT *s,
-              FLT *t, FLT *u, CPX *f);
+template<typename T>
+void dirft3d1(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT ms,
+              BIGINT mt, BIGINT mu, std::complex<T> *f);
+template<typename T>
+void dirft3d2(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT ms,
+              BIGINT mt, BIGINT mu, std::complex<T> *f);
+template<typename T>
+void dirft3d3(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT nk, T *s,
+              T *t, T *u, std::complex<T> *f);
 
 #endif
diff --git a/include/finufft/fft.h b/include/finufft/fft.h
index c6d5de7a5..1fd8d1635 100644
--- a/include/finufft/fft.h
+++ b/include/finufft/fft.h
@@ -10,6 +10,9 @@ template<typename T> class Finufft_FFT_plan {
 public:
   [[maybe_unused]] Finufft_FFT_plan(void (*)(void *) = nullptr,
                                     void (*)(void *) = nullptr, void * = nullptr) {}
+  // deleting these operations to be consistent with the FFTW plans (seel below)
+  Finufft_FFT_plan(const Finufft_FFT_plan &)            = delete;
+  Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete;
   [[maybe_unused]] void plan(const std::vector<int> & /*dims*/, size_t /*batchSize*/,
                              std::complex<T> * /*ptr*/, int /*sign*/, int /*options*/,
                              int /*nthreads*/) {}
@@ -63,11 +66,15 @@ template<> struct Finufft_FFT_plan<float> {
 #endif
     unlock();
   }
+  // we have raw pointers in the object (the FFTW plan).
+  // If we allow copying those, we end up destroying the plans multiple times.
+  Finufft_FFT_plan(const Finufft_FFT_plan &) = delete;
   [[maybe_unused]] ~Finufft_FFT_plan() {
     lock();
     fftwf_destroy_plan(plan_);
     unlock();
   }
+  Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete;
 
   void plan
       [[maybe_unused]] (const std::vector<int> &dims, size_t batchSize,
@@ -131,11 +138,13 @@ template<> struct Finufft_FFT_plan<double> {
 #endif
     unlock();
   }
+  Finufft_FFT_plan(const Finufft_FFT_plan &) = delete;
   [[maybe_unused]] ~Finufft_FFT_plan() {
     lock();
     fftw_destroy_plan(plan_);
     unlock();
   }
+  Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete;
 
   void plan
       [[maybe_unused]] (const std::vector<int> &dims, size_t batchSize,
diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h
index de2f2dab9..cdfa4ab33 100644
--- a/include/finufft/finufft_core.h
+++ b/include/finufft/finufft_core.h
@@ -95,6 +95,17 @@ inline constexpr BIGINT MAX_NF = BIGINT(1e12);
 // values for M = nj (also nk in type 3)...
 inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14);
 
+// MR: In the longer term I suggest to move
+// away from M_PI, which was never part of the standard.
+// Perhaps a constexpr pi in the namespace finufft, or a constexpr finufft_pi
+// if no namespaces are used?
+// In C++20 these constants will be part of the language, and the problem will go away.
+#ifndef M_PI // Windows apparently doesn't have this const
+#define M_PI 3.14159265358979329
+#endif
+#define M_1_2PI 0.159154943091895336
+#define M_2PI   6.28318530717958648
+
 // ----- OpenMP macros which also work when omp not present -----
 // Allows compile-time switch off of openmp, so compilation without any openmp
 // is done (Note: _OPENMP is automatically set by -fopenmp compile flag)
@@ -138,7 +149,8 @@ template<typename TF> struct FINUFFT_PLAN_T { // the main plan object, fully C++
 
   // These default and delete specifications just state the obvious,
   // but are here to silence compiler warnings.
-  FINUFFT_PLAN_T() = default;
+  FINUFFT_PLAN_T(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, TF tol,
+                 finufft_opts *opts, int &ier);
   // Copy construction and assignent are already deleted implicitly
   // because of the unique_ptr member.
   FINUFFT_PLAN_T(const FINUFFT_PLAN_T &)            = delete;
@@ -189,7 +201,8 @@ template<typename TF> struct FINUFFT_PLAN_T { // the main plan object, fully C++
   std::vector<TF> Sp, Tp, Up; // internal primed targs (s'_k, etc),
                               // allocated
   type3params<TF> t3P;        // groups together type 3 shift, scale, phase, parameters
-  FINUFFT_PLAN_T<TF> *innerT2plan = nullptr; // ptr used for type 2 in step 2 of type 3
+  std::unique_ptr<FINUFFT_PLAN_T<TF>> innerT2plan; // ptr used for type 2 in step 2 of
+                                                   // type 3
 
   // other internal structs
   std::unique_ptr<Finufft_FFT_plan<TF>> fftPlan;
@@ -204,10 +217,5 @@ void finufft_default_opts_t(finufft_opts *o);
 template<typename TF>
 int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
                        TF tol, FINUFFT_PLAN_T<TF> **pp, finufft_opts *opts);
-template<typename TF>
-int finufft_setpts_t(FINUFFT_PLAN_T<TF> *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk,
-                     TF *s, TF *t, TF *u);
-template<typename TF>
-int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<TF> *fk);
 
 #endif // FINUFFT_CORE_H
diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
index 8a83af3ce..04c4734ba 100644
--- a/include/finufft/spreadinterp.h
+++ b/include/finufft/spreadinterp.h
@@ -1,12 +1,13 @@
 // Defines interface to spreading/interpolation code.
 
-// Devnotes: see defs.h for definition of MAX_NSPREAD (as of 9/24/18).
+// Devnotes: see finufft_core.h for definition of MAX_NSPREAD
 // RESCALE macro moved to spreadinterp.cpp, 7/15/20.
 // finufft_spread_opts renamed 6/7/22.
 
 #ifndef SPREADINTERP_H
 #define SPREADINTERP_H
 
+#include <finufft/finufft_core.h>
 #include <finufft_spread_opts.h>
 
 /* Bitwise debugging timing flag (TF) defs; see finufft_spread_opts.flags.
diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h
index bdd4cf147..e8e80e9f8 100644
--- a/include/finufft/test_defs.h
+++ b/include/finufft/test_defs.h
@@ -11,21 +11,120 @@
 // for omp rand filling
 #define TEST_RANDCHUNK 1000000
 
-// the public interface: since this clobbers FINUFFT* macros, must be included
-// *before* private defs.h...
+// the public interface
 #include <finufft.h>
 
-// convenient private finufft internals (must come after finufft.h)
+// convenient private finufft internals
+#include <finufft/finufft_core.h>
 #include <finufft/utils.h>
-// prec-switching (via SINGLE) to set up FLT, CPX, BIGINT, FINUFFT1D1, etc...
-#include <finufft/defs.h>
+#include <memory>
+
+// --------------- Private data types for compilation in either prec ---------
+// Devnote: must match those in relevant prec of public finufft.h interface!
+
+// All indexing in library that potentially can exceed 2^31 uses 64-bit signed.
+// This includes all calling arguments (eg M,N) that could be huge someday.
+
+// Precision-independent real and complex types, for private lib/test compile
+#ifdef SINGLE
+using FLT = float;
+#else
+using FLT = double;
+#endif
+#include <complex> // we define C++ complex type only
+using CPX = std::complex<FLT>;
+
+// -------------- Math consts (not in math.h) and useful math macros ----------
+#include <cmath>
+
+// either-precision unit imaginary number...
+#define IMA (CPX(0.0, 1.0))
+
+// to avoid mixed precision operators in eg i*pi, an either-prec PI...
+#define PI  FLT(M_PI)
+
+// Random numbers: crappy unif random number generator in [0,1).
+// These macros should probably be replaced by modern C++ std lib or random123.
+// (RAND_MAX is in stdlib.h)
+#include <cstdlib>
+static inline FLT rand01 [[maybe_unused]] () { return FLT(rand()) / FLT(RAND_MAX); }
+// unif[-1,1]:
+static inline FLT randm11 [[maybe_unused]] () { return 2 * rand01() - FLT(1); }
+// complex unif[-1,1] for Re and Im:
+static inline CPX crandm11 [[maybe_unused]] () { return randm11() + IMA * randm11(); }
+
+// Thread-safe seed-carrying versions of above (x is ptr to seed)...
+// MR: we have to leave those as macros for now, as "rand_r" is deprecated
+// and apparently no longer available on Windows.
+#if 1
+#define rand01r(x)   ((FLT)rand_r(x) / (FLT)RAND_MAX)
+// unif[-1,1]:
+#define randm11r(x)  (2 * rand01r(x) - (FLT)1.0)
+// complex unif[-1,1] for Re and Im:
+#define crandm11r(x) (randm11r(x) + IMA * randm11r(x))
+#else
+static inline FLT rand01r [[maybe_unused]] (unsigned int *x) {
+  return FLT(rand_r(x)) / FLT(RAND_MAX);
+}
+// unif[-1,1]:
+static inline FLT randm11r [[maybe_unused]] (unsigned int *x) {
+  return 2 * rand01r(x) - FLT(1);
+}
+// complex unif[-1,1] for Re and Im:
+static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) {
+  return randm11r(x) + IMA * randm11r(x);
+}
+#endif
+
+// Prec-switching name macros (respond to SINGLE), used in lib & test sources
+// and the plan object below.
+// Note: crucially, these are now indep of macros used to gen public finufft.h!
+#ifdef SINGLE
+// a macro to prepend finufft or finufftf to a string without a space.
+// The 2nd level of indirection is needed for safety, see:
+// https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting
+#define FINUFFTIFY_UNSAFE(x) finufftf##x
+#else
+#define FINUFFTIFY_UNSAFE(x) finufft##x
+#endif
+#define FINUFFTIFY(x)        FINUFFTIFY_UNSAFE(x)
+// Now use the above tool to set up 2020-style macros used in tester source...
+#define FINUFFT_PLAN         FINUFFTIFY(_plan)
+#define FINUFFT_PLAN_S       FINUFFTIFY(_plan_s)
+#define FINUFFT_DEFAULT_OPTS FINUFFTIFY(_default_opts)
+#define FINUFFT_MAKEPLAN     FINUFFTIFY(_makeplan)
+#define FINUFFT_SETPTS       FINUFFTIFY(_setpts)
+#define FINUFFT_EXECUTE      FINUFFTIFY(_execute)
+#define FINUFFT_DESTROY      FINUFFTIFY(_destroy)
+#define FINUFFT1D1           FINUFFTIFY(1d1)
+#define FINUFFT1D2           FINUFFTIFY(1d2)
+#define FINUFFT1D3           FINUFFTIFY(1d3)
+#define FINUFFT2D1           FINUFFTIFY(2d1)
+#define FINUFFT2D2           FINUFFTIFY(2d2)
+#define FINUFFT2D3           FINUFFTIFY(2d3)
+#define FINUFFT3D1           FINUFFTIFY(3d1)
+#define FINUFFT3D2           FINUFFTIFY(3d2)
+#define FINUFFT3D3           FINUFFTIFY(3d3)
+#define FINUFFT1D1MANY       FINUFFTIFY(1d1many)
+#define FINUFFT1D2MANY       FINUFFTIFY(1d2many)
+#define FINUFFT1D3MANY       FINUFFTIFY(1d3many)
+#define FINUFFT2D1MANY       FINUFFTIFY(2d1many)
+#define FINUFFT2D2MANY       FINUFFTIFY(2d2many)
+#define FINUFFT2D3MANY       FINUFFTIFY(2d3many)
+#define FINUFFT3D1MANY       FINUFFTIFY(3d1many)
+#define FINUFFT3D2MANY       FINUFFTIFY(3d2many)
+#define FINUFFT3D3MANY       FINUFFTIFY(3d3many)
+
+// --------  FINUFFT's plan object, prec-switching version ------------------
+// NB: now private (the public C++ or C etc user sees an opaque pointer to it)
+
+#include <finufft/fft.h>
+struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T<FLT> {};
 
 // std stuff for tester src
 #include <cstdio>
 #include <iomanip>
 #include <iostream>
-#include <math.h>
-#include <stdlib.h>
 #include <vector>
 
 #endif // TEST_DEFS_H
diff --git a/include/finufft/utils.h b/include/finufft/utils.h
index 040f60543..c550a8bb9 100644
--- a/include/finufft/utils.h
+++ b/include/finufft/utils.h
@@ -14,51 +14,44 @@ namespace utils {
 
 // ahb's low-level array helpers
 template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, std::complex<T> *a,
-                                             std::complex<T> *b)
+FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, const std::complex<T> *a,
+                                             const std::complex<T> *b)
 // ||a-b||_2 / ||a||_2
 {
   T err = 0.0, nrm = 0.0;
   for (BIGINT m = 0; m < n; ++m) {
-    nrm += real(conj(a[m]) * a[m]);
-    std::complex<T> diff = a[m] - b[m];
-    err += real(conj(diff) * diff);
+    nrm += std::norm(a[m]);
+    err += std::norm(a[m] - b[m]);
   }
   return sqrt(err / nrm);
 }
 template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, std::complex<T> *a,
-                                          std::complex<T> *b)
+FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, const std::complex<T> *a,
+                                          const std::complex<T> *b)
 // ||a-b||_2
 {
   T err = 0.0; // compute error 2-norm
-  for (BIGINT m = 0; m < n; ++m) {
-    std::complex<T> diff = a[m] - b[m];
-    err += real(conj(diff) * diff);
-  }
+  for (BIGINT m = 0; m < n; ++m) err += std::norm(a[m] - b[m]);
   return sqrt(err);
 }
 template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, std::complex<T> *a)
+FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, const std::complex<T> *a)
 // ||a||_2
 {
   T nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]);
+  for (BIGINT m = 0; m < n; ++m) nrm += std::norm(a[m]);
   return sqrt(nrm);
 }
 template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, std::complex<T> *a)
+FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, const std::complex<T> *a)
 // ||a||_infty
 {
   T nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) {
-    T aa = real(conj(a[m]) * a[m]);
-    if (aa > nrm) nrm = aa;
-  }
+  for (BIGINT m = 0; m < n; ++m) nrm = std::max(nrm, std::norm(a[m]));
   return sqrt(nrm);
 }
 template<typename T>
-FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, T *a, T *lo, T *hi)
+FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, const T *a, T *lo, T *hi)
 // With a a length-n array, writes out min(a) to lo and max(a) to hi,
 // so that all a values lie in [lo,hi].
 // If n==0, lo and hi are not finite.
@@ -71,10 +64,10 @@ FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, T *a, T *lo, T *hi)
   }
 }
 template<typename T>
-FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, T *a, T *w, T *c)
+FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, const T *a, T *w, T *c)
 // Writes out w = half-width and c = center of an interval enclosing all a[n]'s
 // Only chooses a nonzero center if this increases w by less than fraction
-// ARRAYWIDCEN_GROWFRAC defined in defs.h.
+// ARRAYWIDCEN_GROWFRAC defined in finufft_core.h.
 // This prevents rephasings which don't grow nf by much. 6/8/17
 // If n==0, w and c are not finite.
 {
diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h
index 3f0a7d95c..fa698cec2 100644
--- a/include/finufft_eitherprec.h
+++ b/include/finufft_eitherprec.h
@@ -4,8 +4,7 @@
 /* Devnotes.
  1)  Since everything here is exposed to the public interface, macros must be
  safe, eg FINUFFTsomething.
- 2)  They will clobber any prior macros starting FINUFFT*, so in the lib/test
- sources finufft.h must be included before defs.h
+ 2)  They will clobber any prior macros starting FINUFFT*
  3) for debug of header macros, see finufft.h
 */
 
@@ -82,7 +81,7 @@ extern "C" {
 typedef struct FINUFFT_PLAN_S *FINUFFT_PLAN;
 
 // ------------------ the guru interface ------------------------------------
-// (sources in finufft.cpp)
+// (sources in c_interface.cpp)
 
 FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o);
 FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)(
@@ -96,7 +95,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_execute)(
 FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_destroy)(FINUFFT_PLAN plan);
 
 // ----------------- the 18 simple interfaces -------------------------------
-// (sources in simpleinterfaces.cpp)
+// (sources in c_interface.cpp)
 
 FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1)(
     FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps,
diff --git a/makefile b/makefile
index c9754a376..7a715a7d5 100644
--- a/makefile
+++ b/makefile
@@ -66,7 +66,7 @@ XSIMD_DIR := $(DEPS_ROOT)/xsimd
 
 # DUCC sources optional dependency repo
 DUCC_URL := https://gitlab.mpcdf.mpg.de/mtr/ducc.git
-DUCC_VERSION := ducc0_0_34_0
+DUCC_VERSION := ducc0_0_35_0
 DUCC_DIR := $(DEPS_ROOT)/ducc
 # this dummy file used as empty target by make...
 DUCC_COOKIE := $(DUCC_DIR)/.finufft_has_ducc
@@ -137,7 +137,7 @@ ABSDYNLIB = $(FINUFFT)$(DYNLIB)
 SOBJS = src/utils.o src/spreadinterp.o
 
 # all lib dual-precision objs (note DUCC_OBJS empty if unused)
-OBJS = $(SOBJS) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/simpleinterfaces.o fortran/finufftfort.o $(DUCC_OBJS)
+OBJS = $(SOBJS) contrib/legendre_rule_fast.o src/fft.o src/finufft.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS)
 
 .PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean
 
@@ -181,12 +181,10 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS)
 	$(CC) -c $(CFLAGS) $< -o $@
 %.o: %.f
 	$(FC) -c $(FFLAGS) $< -o $@
-%_32.o: %.f
-	$(FC) -DSINGLE -c $(FFLAGS) $< -o $@
 
 # spreadinterp include auto-generated code, xsimd header-only dependency;
 # if FFT=DUCC also setup ducc with fft.h dependency on $(DUCC_SETUP)...
-# Note src/spreadinterp.cpp includes finufft/defs.h which includes finufft/fft.h
+# Note src/spreadinterp.cpp includes finufft/finufft_core.h which includes finufft/fft.h
 # so fftw/ducc header needed for spreadinterp, though spreadinterp should not
 # depend on fftw/ducc directly?
 include/finufft/fft.h: $(DUCC_SETUP)
diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp
index 5e27289d8..ded866ba0 100644
--- a/perftest/manysmallprobs.cpp
+++ b/perftest/manysmallprobs.cpp
@@ -1,6 +1,6 @@
 // public header
 #include "finufft.h"
-#include "finufft/defs.h"
+#include "finufft/test_defs.h"
 
 // private access to timer
 #include "finufft/utils.h"
diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp
index d30626007..8c35828bd 100644
--- a/perftest/spreadtestnd.cpp
+++ b/perftest/spreadtestnd.cpp
@@ -1,5 +1,5 @@
-#include <finufft/defs.h>
 #include <finufft/spreadinterp.h>
+#include <finufft/test_defs.h>
 #include <finufft/utils.h>
 
 #include <math.h>
diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp
index 14aad3420..9787d86b0 100644
--- a/perftest/spreadtestndall.cpp
+++ b/perftest/spreadtestndall.cpp
@@ -1,5 +1,5 @@
-#include <finufft/defs.h>
 #include <finufft/spreadinterp.h>
+#include <finufft/test_defs.h>
 #include <finufft/utils.h>
 
 #include <cmath>
diff --git a/src/simpleinterfaces.cpp b/src/c_interface.cpp
similarity index 55%
rename from src/simpleinterfaces.cpp
rename to src/c_interface.cpp
index d2606ee6b..6f7deb61b 100644
--- a/src/simpleinterfaces.cpp
+++ b/src/c_interface.cpp
@@ -12,8 +12,6 @@ using namespace std;
    As of v1.2 these simply invoke the guru interface, through a helper layer.
    See ../docs/usage.rst or http://finufft.readthedocs.io for documentation
    all routines here.
-   This compiles in either double or single precision (based on -DSINGLE),
-   producing functions finufft?d?{many} or finufftf?1?{many} respectively.
 
    Authors: Andrea Malleo and Alex Barnett, 2019-2020.
    Safe namespacing, Barnett, May 2022.
@@ -42,20 +40,18 @@ int finufftf_makeplan(int type, int dim, const i64 *n_modes, int iflag, int ntra
 
 int finufft_setpts(finufft_plan p, i64 nj, f64 *xj, f64 *yj, f64 *zj, i64 nk, f64 *s,
                    f64 *t, f64 *u) {
-  return finufft_setpts_t<f64>(reinterpret_cast<FINUFFT_PLAN_T<f64> *>(p), nj, xj, yj, zj,
-                               nk, s, t, u);
+  return reinterpret_cast<FINUFFT_PLAN_T<f64> *>(p)->setpts(nj, xj, yj, zj, nk, s, t, u);
 }
 int finufftf_setpts(finufftf_plan p, i64 nj, f32 *xj, f32 *yj, f32 *zj, i64 nk, f32 *s,
                     f32 *t, f32 *u) {
-  return finufft_setpts_t<f32>(reinterpret_cast<FINUFFT_PLAN_T<f32> *>(p), nj, xj, yj, zj,
-                               nk, s, t, u);
+  return reinterpret_cast<FINUFFT_PLAN_T<f32> *>(p)->setpts(nj, xj, yj, zj, nk, s, t, u);
 }
 
 int finufft_execute(finufft_plan p, c128 *cj, c128 *fk) {
-  return finufft_execute_t<f64>(reinterpret_cast<FINUFFT_PLAN_T<f64> *>(p), cj, fk);
+  return reinterpret_cast<FINUFFT_PLAN_T<f64> *>(p)->execute(cj, fk);
 }
 int finufftf_execute(finufftf_plan p, c64 *cj, c64 *fk) {
-  return finufft_execute_t<f32>(reinterpret_cast<FINUFFT_PLAN_T<f32> *>(p), cj, fk);
+  return reinterpret_cast<FINUFFT_PLAN_T<f32> *>(p)->execute(cj, fk);
 }
 
 int finufft_destroy(finufft_plan p)
@@ -68,7 +64,6 @@ int finufft_destroy(finufft_plan p)
     return 1;
 
   delete reinterpret_cast<FINUFFT_PLAN_T<f64> *>(p);
-  p = nullptr;
   return 0; // success
 }
 int finufftf_destroy(finufftf_plan p)
@@ -81,19 +76,15 @@ int finufftf_destroy(finufftf_plan p)
     return 1;
 
   delete reinterpret_cast<FINUFFT_PLAN_T<f32> *>(p);
-  p = nullptr;
   return 0; // success
 }
 // Helper layer ...........................................................
 
-namespace finufft {
-namespace common {
-
 template<typename T>
-static int guruCall(int n_dims, int type, int n_transf, i64 nj, T *xj, T *yj, T *zj,
-                    std::complex<T> *cj, int iflag, T eps,
-                    const std::array<i64, 3> &n_modes, i64 nk, T *s, T *t, T *u,
-                    std::complex<T> *fk, finufft_opts *popts)
+static int guru(int n_dims, int type, int n_transf, i64 nj, const std::array<T *, 3> &xyz,
+                std::complex<T> *cj, int iflag, T eps, const std::array<i64, 3> &n_modes,
+                i64 nk, const std::array<T *, 3> &stu, std::complex<T> *fk,
+                finufft_opts *popts)
 // Helper layer between simple interfaces (with opts) and the guru functions.
 // Author: Andrea Malleo, 2019.
 {
@@ -107,14 +98,14 @@ static int guruCall(int n_dims, int type, int n_transf, i64 nj, T *xj, T *yj, T
     return ier;
   }
 
-  int ier2 = finufft_setpts_t<T>(plan, nj, xj, yj, zj, nk, s, t, u);
+  int ier2 = plan->setpts(nj, xyz[0], xyz[1], xyz[2], nk, stu[0], stu[1], stu[2]);
   if (ier2 > 1) {
     fprintf(stderr, "FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2);
     delete plan;
     return ier2;
   }
 
-  int ier3 = finufft_execute_t<T>(plan, cj, fk);
+  int ier3 = plan->execute(cj, fk);
   if (ier3 > 1) {
     fprintf(stderr, "FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3);
     delete plan;
@@ -125,258 +116,179 @@ static int guruCall(int n_dims, int type, int n_transf, i64 nj, T *xj, T *yj, T
   return max(max(ier, ier2), ier3); // in case any one gave a (positive!) warning
 }
 
-} // namespace common
-} // namespace finufft
-
-using namespace finufft::common;
-
 // Dimension 1111111111111111111111111111111111111111111111111111111111111111
 
 int finufft1d1many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms,
-                   c128 *fk, finufft_opts *opts)
-// Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst
-{
-  return guruCall<f64>(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps,
-                       {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts);
+                   c128 *fk, finufft_opts *opts) {
+  return guru<f64>(1, 1, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, opts);
 }
 int finufftf1d1many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms,
-                    c64 *fk, finufft_opts *opts)
-// Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst
-{
-  return guruCall<f32>(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps,
-                       {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts);
+                    c64 *fk, finufft_opts *opts) {
+  return guru<f32>(1, 1, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, opts);
 }
 
 int finufft1d1(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, c128 *fk,
-               finufft_opts *opts)
-//  Type-1 1D complex nonuniform FFT. See ../docs/usage.rst
-{
+               finufft_opts *opts) {
   return finufft1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts);
 }
 int finufftf1d1(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, c64 *fk,
-                finufft_opts *opts)
-//  Type-1 1D complex nonuniform FFT. See ../docs/usage.rst
-{
+                finufft_opts *opts) {
   return finufftf1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts);
 }
 
 int finufft1d2many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms,
-                   c128 *fk, finufft_opts *opts)
-//  Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst
-{
-  return guruCall<f64>(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps,
-                       {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts);
+                   c128 *fk, finufft_opts *opts) {
+  return guru<f64>(1, 2, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, opts);
 }
 int finufftf1d2many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms,
-                    c64 *fk, finufft_opts *opts)
-//  Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst
-{
-  return guruCall<f32>(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps,
-                       {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts);
+                    c64 *fk, finufft_opts *opts) {
+  return guru<f32>(1, 2, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, opts);
 }
 
 int finufft1d2(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, c128 *fk,
-               finufft_opts *opts)
-//  Type-2 1D complex nonuniform FFT. See ../docs/usage.rst
-{
+               finufft_opts *opts) {
   return finufft1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts);
 }
 int finufftf1d2(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, c64 *fk,
-                finufft_opts *opts)
-//  Type-2 1D complex nonuniform FFT. See ../docs/usage.rst
-{
+                finufft_opts *opts) {
   return finufftf1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts);
 }
 
 int finufft1d3many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 nk,
-                   f64 *s, c128 *fk, finufft_opts *opts)
-// Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst
-{
-  return guruCall<f64>(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps,
-                       {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts);
+                   f64 *s, c128 *fk, finufft_opts *opts) {
+  return guru<f64>(1, 3, n_transf, nj, {xj}, cj, iflag, eps, {0, 0, 0}, nk, {s}, fk,
+                   opts);
 }
 int finufftf1d3many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 nk,
-                    f32 *s, c64 *fk, finufft_opts *opts)
-// Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst
-{
-  return guruCall<f32>(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps,
-                       {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts);
+                    f32 *s, c64 *fk, finufft_opts *opts) {
+  return guru<f32>(1, 3, n_transf, nj, {xj}, cj, iflag, eps, {0, 0, 0}, nk, {s}, fk,
+                   opts);
 }
 int finufft1d3(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 nk, f64 *s, c128 *fk,
-               finufft_opts *opts)
-// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst
-{
+               finufft_opts *opts) {
   return finufft1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts);
 }
 int finufftf1d3(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 nk, f32 *s, c64 *fk,
-                finufft_opts *opts)
-// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst
-{
+                finufft_opts *opts) {
   return finufftf1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts);
 }
 
 // Dimension 22222222222222222222222222222222222222222222222222222222222222222
 
 int finufft2d1many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *c, int iflag, f64 eps,
-                   i64 ms, i64 mt, c128 *fk, finufft_opts *opts)
-//  Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
-{
-  return guruCall<f64>(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0,
-                       nullptr, nullptr, nullptr, fk, opts);
+                   i64 ms, i64 mt, c128 *fk, finufft_opts *opts) {
+  return guru<f64>(2, 1, n_transf, nj, {xj, yj}, c, iflag, eps, {ms, mt, 1}, 0, {}, fk,
+                   opts);
 }
 int finufftf2d1many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *c, int iflag, f32 eps,
-                    i64 ms, i64 mt, c64 *fk, finufft_opts *opts)
-//  Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
-{
-  return guruCall<f32>(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0,
-                       nullptr, nullptr, nullptr, fk, opts);
+                    i64 ms, i64 mt, c64 *fk, finufft_opts *opts) {
+  return guru<f32>(2, 1, n_transf, nj, {xj, yj}, c, iflag, eps, {ms, mt, 1}, 0, {}, fk,
+                   opts);
 }
 int finufft2d1(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 ms, i64 mt,
-               c128 *fk, finufft_opts *opts)
-//  Type-1 2D complex nonuniform FFT. See ../docs/usage.rst
-{
+               c128 *fk, finufft_opts *opts) {
   return finufft2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
 }
 int finufftf2d1(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 ms, i64 mt,
-                c64 *fk, finufft_opts *opts)
-//  Type-1 2D complex nonuniform FFT. See ../docs/usage.rst
-{
+                c64 *fk, finufft_opts *opts) {
   return finufftf2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
 }
 
 int finufft2d2many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *c, int iflag, f64 eps,
-                   i64 ms, i64 mt, c128 *fk, finufft_opts *opts)
-//  Type-2 2D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
-{
-  return guruCall<f64>(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0,
-                       nullptr, nullptr, nullptr, fk, opts);
+                   i64 ms, i64 mt, c128 *fk, finufft_opts *opts) {
+  return guru<f64>(2, 2, n_transf, nj, {xj, yj}, c, iflag, eps, {ms, mt, 1}, 0, {}, fk,
+                   opts);
 }
 int finufftf2d2many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *c, int iflag, f32 eps,
-                    i64 ms, i64 mt, c64 *fk, finufft_opts *opts)
-//  Type-2 2D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
-{
-  return guruCall<f32>(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0,
-                       nullptr, nullptr, nullptr, fk, opts);
+                    i64 ms, i64 mt, c64 *fk, finufft_opts *opts) {
+  return guru<f32>(2, 2, n_transf, nj, {xj, yj}, c, iflag, eps, {ms, mt, 1}, 0, {}, fk,
+                   opts);
 }
 int finufft2d2(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 ms, i64 mt,
-               c128 *fk, finufft_opts *opts)
-//  Type-2 2D complex nonuniform FFT.  See ../docs/usage.rst
-{
+               c128 *fk, finufft_opts *opts) {
   return finufft2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
 }
 int finufftf2d2(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 ms, i64 mt,
-                c64 *fk, finufft_opts *opts)
-//  Type-2 2D complex nonuniform FFT.  See ../docs/usage.rst
-{
+                c64 *fk, finufft_opts *opts) {
   return finufftf2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
 }
 
 int finufft2d3many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps,
-                   i64 nk, f64 *s, f64 *t, c128 *fk, finufft_opts *opts)
-// Type-3 2D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
-{
-  return guruCall<f64>(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, {0, 0, 0}, nk,
-                       s, t, nullptr, fk, opts);
+                   i64 nk, f64 *s, f64 *t, c128 *fk, finufft_opts *opts) {
+  return guru<f64>(2, 3, n_transf, nj, {xj, yj}, cj, iflag, eps, {0, 0, 0}, nk, {s, t},
+                   fk, opts);
 }
 int finufftf2d3many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps,
-                    i64 nk, f32 *s, f32 *t, c64 *fk, finufft_opts *opts)
-// Type-3 2D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
-{
-  return guruCall<f32>(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, {0, 0, 0}, nk,
-                       s, t, nullptr, fk, opts);
+                    i64 nk, f32 *s, f32 *t, c64 *fk, finufft_opts *opts) {
+  return guru<f32>(2, 3, n_transf, nj, {xj, yj}, cj, iflag, eps, {0, 0, 0}, nk, {s, t},
+                   fk, opts);
 }
 int finufft2d3(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 nk, f64 *s,
-               f64 *t, c128 *fk, finufft_opts *opts)
-// Type-3 2D complex nonuniform FFT.  See ../docs/usage.rst
-{
+               f64 *t, c128 *fk, finufft_opts *opts) {
   return finufft2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts);
 }
 int finufftf2d3(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 nk, f32 *s,
-                f32 *t, c64 *fk, finufft_opts *opts)
-// Type-3 2D complex nonuniform FFT.  See ../docs/usage.rst
-{
+                f32 *t, c64 *fk, finufft_opts *opts) {
   return finufftf2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts);
 }
 
 // Dimension 3333333333333333333333333333333333333333333333333333333333333333
 
 int finufft3d1many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag,
-                   f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts)
-// Type-1 3D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
-{
-  return guruCall<f64>(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0,
-                       nullptr, nullptr, nullptr, fk, opts);
+                   f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts) {
+  return guru<f64>(3, 1, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, {},
+                   fk, opts);
 }
 int finufftf3d1many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag,
-                    f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts)
-// Type-1 3D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
-{
-  return guruCall<f32>(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0,
-                       nullptr, nullptr, nullptr, fk, opts);
+                    f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts) {
+  return guru<f32>(3, 1, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, {},
+                   fk, opts);
 }
 int finufft3d1(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 ms,
-               i64 mt, i64 mu, c128 *fk, finufft_opts *opts)
-//  Type-1 3D complex nonuniform FFT.   See ../docs/usage.rst
-{
+               i64 mt, i64 mu, c128 *fk, finufft_opts *opts) {
   return finufft3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
 }
 int finufftf3d1(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 ms,
-                i64 mt, i64 mu, c64 *fk, finufft_opts *opts)
-//  Type-1 3D complex nonuniform FFT.   See ../docs/usage.rst
-{
+                i64 mt, i64 mu, c64 *fk, finufft_opts *opts) {
   return finufftf3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
 }
 
 int finufft3d2many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag,
-                   f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts)
-// Type-2 3D complex nonuniform FFT, many vectors.   See ../docs/usage.rst
-{
-  return guruCall<f64>(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0,
-                       nullptr, nullptr, nullptr, fk, opts);
+                   f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts) {
+  return guru<f64>(3, 2, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, {},
+                   fk, opts);
 }
 int finufftf3d2many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag,
-                    f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts)
-// Type-2 3D complex nonuniform FFT, many vectors.   See ../docs/usage.rst
-{
-  return guruCall<f32>(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0,
-                       nullptr, nullptr, nullptr, fk, opts);
+                    f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts) {
+  return guru<f32>(3, 2, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, {},
+                   fk, opts);
 }
 int finufft3d2(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 ms,
-               i64 mt, i64 mu, c128 *fk, finufft_opts *opts)
-// Type-2 3D complex nonuniform FFT.   See ../docs/usage.rst
-{
+               i64 mt, i64 mu, c128 *fk, finufft_opts *opts) {
   return finufft3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
 }
 int finufftf3d2(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 ms,
-                i64 mt, i64 mu, c64 *fk, finufft_opts *opts)
-// Type-2 3D complex nonuniform FFT.   See ../docs/usage.rst
-{
+                i64 mt, i64 mu, c64 *fk, finufft_opts *opts) {
   return finufftf3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
 }
 
 int finufft3d3many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag,
-                   f64 eps, i64 nk, f64 *s, f64 *t, f64 *u, c128 *fk, finufft_opts *opts)
-//  Type-3 3D complex nonuniform FFT, many vectors.   See ../docs/usage.rst
-{
-  return guruCall<f64>(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, {0, 0, 0}, nk, s,
-                       t, u, fk, opts);
+                   f64 eps, i64 nk, f64 *s, f64 *t, f64 *u, c128 *fk,
+                   finufft_opts *opts) {
+  return guru<f64>(3, 3, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {0, 0, 0}, nk,
+                   {s, t, u}, fk, opts);
 }
 int finufftf3d3many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag,
-                    f32 eps, i64 nk, f32 *s, f32 *t, f32 *u, c64 *fk, finufft_opts *opts)
-//  Type-3 3D complex nonuniform FFT, many vectors.   See ../docs/usage.rst
-{
-  return guruCall<f32>(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, {0, 0, 0}, nk, s,
-                       t, u, fk, opts);
+                    f32 eps, i64 nk, f32 *s, f32 *t, f32 *u, c64 *fk,
+                    finufft_opts *opts) {
+  return guru<f32>(3, 3, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {0, 0, 0}, nk,
+                   {s, t, u}, fk, opts);
 }
 int finufft3d3(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 nk,
-               f64 *s, f64 *t, f64 *u, c128 *fk, finufft_opts *opts)
-//  Type-3 3D complex nonuniform FFT.   See ../docs/usage.rst
-{
+               f64 *s, f64 *t, f64 *u, c128 *fk, finufft_opts *opts) {
   return finufft3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts);
 }
 int finufftf3d3(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 nk,
-                f32 *s, f32 *t, f32 *u, c64 *fk, finufft_opts *opts)
-//  Type-3 3D complex nonuniform FFT.   See ../docs/usage.rst
-{
+                f32 *s, f32 *t, f32 *u, c64 *fk, finufft_opts *opts) {
   return finufftf3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts);
 }
diff --git a/src/finufft_core.cpp b/src/finufft.cpp
similarity index 82%
rename from src/finufft_core.cpp
rename to src/finufft.cpp
index e950e2160..2130cfd8a 100644
--- a/src/finufft_core.cpp
+++ b/src/finufft.cpp
@@ -26,7 +26,7 @@ using namespace finufft::quadrature;
 
    As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions
    and the two finufft2d?many() functions. The (now 18) simple C++ interfaces
-   are in simpleinterfaces.cpp.
+   are in c_interface.cpp.
 
 Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017:
 
@@ -65,10 +65,6 @@ Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017:
 
 Design notes for guru interface implementation:
 
-* Since finufft_plan is C-compatible, we need to use malloc/free for its
-  allocatable arrays, keeping it quite low-level. We can't use std::vector
-  since that would only survive in the scope of each function.
-
 * Thread-safety: FINUFFT plans are passed as pointers, so it has no global
   state apart from that associated with FFTW (and the did_fftw_init).
 */
@@ -93,8 +89,8 @@ static int set_nf_type12(BIGINT ms, const finufft_opts &opts,
     return 0;
   } else {
     fprintf(stderr,
-            "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a "
-            "malloc\n",
+            "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting "
+            "memory allocation\n",
             __func__, (double)*nf, (double)MAX_NF);
     return FINUFFT_ERR_MAXNALLOC;
   }
@@ -540,64 +536,56 @@ void finufft_default_opts_t(finufft_opts *o)
 
 // PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
 template<typename TF>
-int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
-                       TF tol, FINUFFT_PLAN_T<TF> **pp, finufft_opts *opts)
+FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, int iflag,
+                                   int ntrans_, TF tol_, finufft_opts *opts_, int &ier)
+    : type(type_), dim(dim_), ntrans(ntrans_), tol(tol_)
 // Populates the fields of finufft_plan which is pointed to by "pp".
 // opts is ptr to a finufft_opts to set options, or nullptr to use defaults.
 // For some of the fields (if "auto" selected) here choose the actual setting.
 // For types 1,2 allocates memory for internal working arrays,
 // evaluates spreading kernel coefficients, and instantiates the fftw_plan
 {
-  FINUFFT_PLAN_T<TF> *p;
-  p   = new FINUFFT_PLAN_T<TF>; // allocate fresh plan struct
-  *pp = p;                      // pass out plan as ptr to plan struct
-
-  if (!opts)                    // use default opts
-    finufft_default_opts_t(&(p->opts));
-  else                          // or read from what's passed in
-    p->opts = *opts;            // keep a deep copy; changing *opts now has no effect
+  if (!opts_)      // use default opts
+    finufft_default_opts_t(&opts);
+  else             // or read from what's passed in
+    opts = *opts_; // keep a deep copy; changing *opts now has no effect
 
-  if (p->opts.debug)            // do a hello world
+  if (opts.debug)  // do a hello world
     printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",
            __func__);
 
-  p->fftPlan = std::make_unique<Finufft_FFT_plan<TF>>(
-      p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data);
+  fftPlan = std::make_unique<Finufft_FFT_plan<TF>>(
+      opts.fftw_lock_fun, opts.fftw_unlock_fun, opts.fftw_lock_data);
 
   if ((type != 1) && (type != 2) && (type != 3)) {
     fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type);
-    return FINUFFT_ERR_TYPE_NOTVALID;
+    throw int(FINUFFT_ERR_TYPE_NOTVALID);
   }
   if ((dim != 1) && (dim != 2) && (dim != 3)) {
     fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim);
-    return FINUFFT_ERR_DIM_NOTVALID;
+    throw int(FINUFFT_ERR_DIM_NOTVALID);
   }
   if (ntrans < 1) {
     fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans);
-    return FINUFFT_ERR_NTRANS_NOTVALID;
+    throw int(FINUFFT_ERR_NTRANS_NOTVALID);
   }
-  if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) {
+  if (!opts.fftw_lock_fun != !opts.fftw_unlock_fun) {
     fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n",
             __func__);
-    return FINUFFT_ERR_LOCK_FUNS_INVALID;
-    ;
+    throw int(FINUFFT_ERR_LOCK_FUNS_INVALID);
   }
 
   // get stuff from args...
-  p->type    = type;
-  p->dim     = dim;
-  p->ntrans  = ntrans;
-  p->tol     = tol;
-  p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input
+  fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input
 
-                                      // choose overall # threads...
+                                   // choose overall # threads...
 #ifdef _OPENMP
   int ompmaxnthr = MY_OMP_GET_MAX_THREADS();
   int nthr       = ompmaxnthr; // default: use as many as OMP gives us
   // (the above could be set, or suggested set, to 1 for small enough problems...)
-  if (p->opts.nthreads > 0) {
-    nthr = p->opts.nthreads; // user override, now without limit
-    if (p->opts.showwarn && (nthr > ompmaxnthr))
+  if (opts.nthreads > 0) {
+    nthr = opts.nthreads; // user override, now without limit
+    if (opts.showwarn && (nthr > ompmaxnthr))
       fprintf(stderr,
               "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims "
               "available; note large nthreads can be slower.\n",
@@ -605,61 +593,61 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int
   }
 #else
   int nthr = 1; // always 1 thread (avoid segfault)
-  if (p->opts.nthreads > 1)
+  if (opts.nthreads > 1)
     fprintf(stderr,
             "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",
-            __func__, p->opts.nthreads);
+            __func__, opts.nthreads);
 #endif
-  p->opts.nthreads = nthr; // store actual # thr planned for
+  opts.nthreads = nthr; // store actual # thr planned for
   // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
 
   // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
-  if (p->opts.maxbatchsize == 0) {                  // logic to auto-set best batchsize
-    p->nbatch    = 1 + (ntrans - 1) / nthr;         // min # batches poss
-    p->batchSize = 1 + (ntrans - 1) / p->nbatch;    // then cut # thr in each b
-  } else {                                          // batchSize override by user
-    p->batchSize = std::min(p->opts.maxbatchsize, ntrans);
-    p->nbatch    = 1 + (ntrans - 1) / p->batchSize; // resulting # batches
+  if (opts.maxbatchsize == 0) {                        // logic to auto-set best batchsize
+    nbatch    = 1 + (ntrans - 1) / nthr;               // min # batches poss
+    batchSize = 1 + (ntrans - 1) / nbatch;             // then cut # thr in each b
+  } else {                                             // batchSize override by user
+    batchSize = std::min(opts.maxbatchsize, ntrans);
+    nbatch    = 1 + (ntrans - 1) / batchSize;          // resulting # batches
   }
-  if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice
-  if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) {
+  if (opts.spread_thread == 0) opts.spread_thread = 2; // our auto choice
+  if (opts.spread_thread != 1 && opts.spread_thread != 2) {
     fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__);
-    return FINUFFT_ERR_SPREAD_THREAD_NOTVALID;
+    throw int(FINUFFT_ERR_SPREAD_THREAD_NOTVALID);
   }
 
-  if (type != 3) {                      // read in user Fourier mode array sizes...
-    p->ms = n_modes[0];
-    p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims
-    p->mu = (dim > 2) ? n_modes[2] : 1;
-    p->N  = p->ms * p->mt * p->mu;      // N = total # modes
+  if (type != 3) {                   // read in user Fourier mode array sizes...
+    ms = n_modes[0];
+    mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims
+    mu = (dim > 2) ? n_modes[2] : 1;
+    N  = ms * mt * mu;               // N = total # modes
   }
 
   // heuristic to choose default upsampfac... (currently two poss)
-  if (p->opts.upsampfac == 0.0) {            // indicates auto-choose
-    p->opts.upsampfac = 2.0;                 // default, and need for tol small
-    if (tol >= (TF)1E-9) {                   // the tol sigma=5/4 can reach
-      if (type == 3)                         // could move to setpts, more known?
-        p->opts.upsampfac = 1.25;            // faster b/c smaller RAM & FFT
-      else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) ||
-               (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double,
-                                             // typ tol, 12-core xeon
-        p->opts.upsampfac = 1.25;
+  if (opts.upsampfac == 0.0) {            // indicates auto-choose
+    opts.upsampfac = 2.0;                 // default, and need for tol small
+    if (tol >= (TF)1E-9) {                // the tol sigma=5/4 can reach
+      if (type == 3)                      // could move to setpts, more known?
+        opts.upsampfac = 1.25;            // faster b/c smaller RAM & FFT
+      else if ((dim == 1 && N > 10000000) || (dim == 2 && N > 300000) ||
+               (dim == 3 && N > 3000000)) // type 1,2 heuristic cutoffs, double,
+                                          // typ tol, 12-core xeon
+        opts.upsampfac = 1.25;
     }
-    if (p->opts.debug > 1)
-      printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac);
+    if (opts.debug > 1)
+      printf("[%s] set auto upsampfac=%.2f\n", __func__, opts.upsampfac);
   }
   // use opts to choose and write into plan's spread options...
-  int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim);
+  ier = setup_spreader_for_nufft(spopts, tol, opts, dim);
   if (ier > 1) // proceed if success or warning
-    return ier;
+    throw int(ier);
 
   // set others as defaults (or unallocated for arrays)...
-  p->X   = nullptr;
-  p->Y   = nullptr;
-  p->Z   = nullptr;
-  p->nf1 = 1;
-  p->nf2 = 1;
-  p->nf3 = 1; // crucial to leave as 1 for unused dims
+  X   = nullptr;
+  Y   = nullptr;
+  Z   = nullptr;
+  nf1 = 1;
+  nf2 = 1;
+  nf3 = 1; // crucial to leave as 1 for unused dims
 
   //  ------------------------ types 1,2: planning needed ---------------------
   if (type == 1 || type == 2) {
@@ -667,97 +655,115 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int
     int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?)
                          // Note: batchSize not used since might be only 1.
 
-    p->spopts.spread_direction = type;
+    spopts.spread_direction = type;
 
     constexpr TF EPSILON = std::numeric_limits<TF>::epsilon();
-    if (p->opts.showwarn) { // user warn round-off error...
-      if (EPSILON * p->ms > 1.0)
+    if (opts.showwarn) { // user warn round-off error...
+      if (EPSILON * ms > 1.0)
         fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n",
-                __func__, (double)(EPSILON * p->ms));
-      if (EPSILON * p->mt > 1.0)
+                __func__, (double)(EPSILON * ms));
+      if (EPSILON * mt > 1.0)
         fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n",
-                __func__, (double)(EPSILON * p->mt));
-      if (EPSILON * p->mu > 1.0)
+                __func__, (double)(EPSILON * mt));
+      if (EPSILON * mu > 1.0)
         fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n",
-                __func__, (double)(EPSILON * p->mu));
+                __func__, (double)(EPSILON * mu));
     }
 
     // determine fine grid sizes, sanity check..
-    int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1));
-    if (nfier) return nfier; // nf too big; we're done
-    p->phiHat1.resize(p->nf1 / 2 + 1);
+    int nfier = set_nf_type12(ms, opts, spopts, &nf1);
+    if (nfier) throw nfier; // nf too big; we're done
+    phiHat1.resize(nf1 / 2 + 1);
     if (dim > 1) {
-      nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2));
-      if (nfier) return nfier;
-      p->phiHat2.resize(p->nf2 / 2 + 1);
+      nfier = set_nf_type12(mt, opts, spopts, &nf2);
+      if (nfier) throw nfier;
+      phiHat2.resize(nf2 / 2 + 1);
     }
     if (dim > 2) {
-      nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3));
-      if (nfier) return nfier;
-      p->phiHat3.resize(p->nf3 / 2 + 1);
+      nfier = set_nf_type12(mu, opts, spopts, &nf3);
+      if (nfier) throw nfier;
+      phiHat3.resize(nf3 / 2 + 1);
     }
 
-    if (p->opts.debug) { // "long long" here is to avoid warnings with printf...
+    if (opts.debug) { // "long long" here is to avoid warnings with printf...
       printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) "
              "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n               ntrans=%d nthr=%d "
              "batchSize=%d ",
-             __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu,
-             (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr,
-             p->batchSize);
-      if (p->batchSize == 1) // spread_thread has no effect in this case
+             __func__, dim, type, (long long)ms, (long long)mt, (long long)mu,
+             (long long)nf1, (long long)nf2, (long long)nf3, ntrans, nthr, batchSize);
+      if (batchSize == 1) // spread_thread has no effect in this case
         printf("\n");
       else
-        printf(" spread_thread=%d\n", p->opts.spread_thread);
+        printf(" spread_thread=%d\n", opts.spread_thread);
     }
 
     // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim
     CNTime timer;
     timer.start();
-    onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts);
-    if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts);
-    if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts);
-    if (p->opts.debug)
-      printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread,
+    onedim_fseries_kernel(nf1, phiHat1, spopts);
+    if (dim > 1) onedim_fseries_kernel(nf2, phiHat2, spopts);
+    if (dim > 2) onedim_fseries_kernel(nf3, phiHat3, spopts);
+    if (opts.debug)
+      printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, spopts.nspread,
              timer.elapsedsec());
 
-    p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points
-    if (p->nf * p->batchSize > MAX_NF) {
-      fprintf(stderr,
-              "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",
-              __func__);
-      // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3
-      return FINUFFT_ERR_MAXNALLOC;
+    nf = nf1 * nf2 * nf3; // fine grid total number of points
+    if (nf * batchSize > MAX_NF) {
+      fprintf(
+          stderr,
+          "[%s] fwBatch would be bigger than MAX_NF, not attempting memory allocation!\n",
+          __func__);
+      throw int(FINUFFT_ERR_MAXNALLOC);
     }
 
     timer.restart();
-    p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace
-    if (p->opts.debug)
+    fwBatch = fftPlan->alloc_complex(nf * batchSize); // the big workspace
+    if (opts.debug)
       printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,
-             (double)1E-09 * sizeof(std::complex<TF>) * p->nf * p->batchSize,
+             (double)1E-09 * sizeof(std::complex<TF>) * nf * batchSize,
              timer.elapsedsec());
-    if (!p->fwBatch) { // we don't catch all such mallocs, just this big one
-      fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n",
+    if (!fwBatch) { // we don't catch all such allocs, just this big one
+      fprintf(stderr, "[%s] FFT allocation failed for fwBatch (working fine grids)!\n",
               __func__);
-      return FINUFFT_ERR_ALLOC;
+      throw int(FINUFFT_ERR_ALLOC);
     }
 
     timer.restart(); // plan the FFTW
-    const auto ns = gridsize_for_fft(p);
-    p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft);
-    if (p->opts.debug)
-      printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw,
-             nthr_fft, timer.elapsedsec());
+    const auto ns = gridsize_for_fft(this);
+    fftPlan->plan(ns, batchSize, fwBatch, fftSign, opts.fftw, nthr_fft);
+    if (opts.debug)
+      printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft,
+             timer.elapsedsec());
 
   } else { // -------------------------- type 3 (no planning) ------------
 
-    if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans);
+    if (opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans);
     // in case destroy occurs before setpts, need safe dummy ptrs/plans...
-    p->fwBatch     = nullptr;
-    p->innerT2plan = nullptr;
+    fwBatch     = nullptr;
+    innerT2plan = nullptr;
     // Type 3 will call finufft_makeplan for type 2; no need to init FFTW
     // Note we don't even know nj or nk yet, so can't do anything else!
   }
-  return ier; // report setup_spreader status (could be warning)
+  if (ier > 1) throw int(ier); // report setup_spreader status (could be warning)
+}
+
+template<typename TF>
+int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
+                       TF tol, FINUFFT_PLAN_T<TF> **pp, finufft_opts *opts)
+// Populates the fields of finufft_plan which is pointed to by "pp".
+// opts is ptr to a finufft_opts to set options, or nullptr to use defaults.
+// For some of the fields (if "auto" selected) here choose the actual setting.
+// For types 1,2 allocates memory for internal working arrays,
+// evaluates spreading kernel coefficients, and instantiates the fftw_plan
+{
+  *pp     = nullptr;
+  int ier = 0;
+  try {
+    *pp = new FINUFFT_PLAN_T<TF>(type, dim, n_modes, iflag, ntrans, tol, opts, ier);
+  } catch (int errcode) {
+    return errcode;
+  }
+  return ier;
 }
 
 // For this function and the following ones (i.e. everything that is accessible
@@ -824,7 +830,7 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF
     U        = u;
 
     // pick x, s intervals & shifts & # fine grid pts (nf) in each dim...
-    TF S1, S2, S3; // get half-width X, center C, which contains {x_j}...
+    TF S1 = 0, S2 = 0, S3 = 0; // get half-width X, center C, which contains {x_j}...
     arraywidcen(nj, xj, &(t3P.X1), &(t3P.C1));
     arraywidcen(nk, s, &S1, &(t3P.D1)); // same D, S, but for {s_k}
     set_nhg_type3(S1, t3P.X1, opts, spopts, &(nf1), &(t3P.h1),
@@ -858,7 +864,8 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF
     nf = nf1 * nf2 * nf3; // fine grid total number of points
     if (nf * batchSize > MAX_NF) {
       fprintf(stderr,
-              "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",
+              "[%s t3] fwBatch would be bigger than MAX_NF, not attempting memory "
+              "allocation!\n",
               __func__);
       return FINUFFT_ERR_MAXNALLOC;
     }
@@ -872,24 +879,24 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF
              (double)1E-09 * sizeof(std::complex<TF>) * (nf + nj) * batchSize,
              timer.elapsedsec());
     if (!fwBatch) {
-      fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__);
+      fprintf(stderr, "[%s t3] allocation fail for fwBatch or CpBatch!\n", __func__);
       return FINUFFT_ERR_ALLOC;
     }
     // printf("fwbatch, cpbatch ptrs: %llx %llx\n",fwBatch,CpBatch);
 
     // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ...
     // FIXME: should use realloc
-    if (X) free(X);
-    X = (TF *)malloc(sizeof(TF) * nj);
+    if (X) delete[] X;
+    X = new TF[nj];
     Sp.resize(nk);
     if (d > 1) {
-      if (Y) free(Y);
-      Y = (TF *)malloc(sizeof(TF) * nj);
+      if (Y) delete[] Y;
+      Y = new TF[nj];
       Tp.resize(nk);
     }
     if (d > 2) {
-      if (Z) free(Z);
-      Z = (TF *)malloc(sizeof(TF) * nj);
+      if (Z) delete[] Z;
+      Z = new TF[nj];
       Up.resize(nk);
     }
 
@@ -920,9 +927,9 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF
       }
     } else
       for (BIGINT j = 0; j < nj; ++j)
-        prephase[j] = (std::complex<TF>)1.0; // *** or keep flag so no mult in exec??
+        prephase[j] = {1, 0}; // *** or keep flag so no mult in exec??
 
-                                             // rescale the target s_k etc to s'_k etc...
+                              // rescale the target s_k etc to s'_k etc...
 #pragma omp parallel for num_threads(opts.nthreads) schedule(static)
     for (BIGINT k = 0; k < nk; ++k) {
       Sp[k] = t3P.h1 * t3P.gam1 * (s[k] - t3P.D1);   // so |s'_k| < pi/R
@@ -947,16 +954,9 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF
       phiHatk3.resize(nk);
       onedim_nuft_kernel(nk, Up, phiHatk3, spopts); // fill phiHat3
     }
-    int Cfinite =
-        std::isfinite(t3P.C1) && std::isfinite(t3P.C2) && std::isfinite(t3P.C3); // C can
-                                                                                 // be nan
-                                                                                 // or inf
-                                                                                 // if
-                                                                                 // M=0,
-                                                                                 // no
-                                                                                 // input
-                                                                                 // NU pts
-    int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0;              // cen
+    // C can be nan or inf if M=0, no input NU pts
+    int Cfinite = std::isfinite(t3P.C1) && std::isfinite(t3P.C2) && std::isfinite(t3P.C3);
+    int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0; // cen
 #pragma omp parallel for num_threads(opts.nthreads) schedule(static)
     for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs
       TF phiHat = phiHatk1[k];
@@ -990,20 +990,18 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF
     t2opts.spread_debug = std::max(0, opts.spread_debug - 1);
     t2opts.showwarn     = 0;                           // so don't see warnings 2x
     // (...could vary other t2opts here?)
-    if (innerT2plan) {
-      delete innerT2plan;
-      innerT2plan = nullptr;
-    }
-    int ier = finufft_makeplan_t<TF>(2, d, t2nmodes, fftSign, batchSize, tol,
-                                     &innerT2plan, &t2opts);
+    // MR: temporary hack, until we have figured out the C++ interface.
+    FINUFFT_PLAN_T<TF> *tmpplan;
+    int ier = finufft_makeplan_t<TF>(2, d, t2nmodes, fftSign, batchSize, tol, &tmpplan,
+                                     &t2opts);
+    innerT2plan.reset(tmpplan);
     if (ier > 1) { // if merely warning, still proceed
       fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n",
               __func__, ier);
       return ier;
     }
-    ier = finufft_setpts_t<TF>(innerT2plan, nk, Sp.data(), Tp.data(), Up.data(), 0,
-                               nullptr, nullptr,
-                               nullptr); // note nk = # output points (not nj)
+    ier = innerT2plan->setpts(nk, Sp.data(), Tp.data(), Up.data(), 0, nullptr, nullptr,
+                              nullptr); // note nk = # output points (not nj)
     if (ier > 1) {
       fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier);
       return ier;
@@ -1013,23 +1011,10 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF
   }
   return 0;
 }
-template<typename TF>
-int finufft_setpts_t(FINUFFT_PLAN_T<TF> *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk,
-                     TF *s, TF *t, TF *u)
-/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for
-   spreading. (The last 4 arguments are ignored.)
-   For type 3: allocates internal working arrays, scales/centers the NU points
-   and NU target freqs (stu), evaluates spreading kernel FT at all target freqs.
-*/
-{
-  return p->setpts(nj, xj, yj, zj, nk, s, t, u);
-}
-template int finufft_setpts_t<float>(FINUFFT_PLAN_T<float> *p, BIGINT nj, float *xj,
-                                     float *yj, float *zj, BIGINT nk, float *s, float *t,
-                                     float *u);
-template int finufft_setpts_t<double>(FINUFFT_PLAN_T<double> *p, BIGINT nj, double *xj,
-                                      double *yj, double *zj, BIGINT nk, double *s,
-                                      double *t, double *u);
+template int FINUFFT_PLAN_T<float>::setpts(BIGINT nj, float *xj, float *yj, float *zj,
+                                           BIGINT nk, float *s, float *t, float *u);
+template int FINUFFT_PLAN_T<double>::setpts(BIGINT nj, double *xj, double *yj, double *zj,
+                                            BIGINT nk, double *s, double *t, double *u);
 
 // ............ end setpts ..................................................
 
@@ -1152,7 +1137,7 @@ int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
       innerT2plan->ntrans = thisBatchSize; // do not try this at home!
       /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array
      still the same size, as Andrea explained; just wastes a few flops) */
-      finufft_execute_t(innerT2plan, fkb, fwBatch);
+      innerT2plan->execute(fkb, fwBatch);
       t_t2 += timer.elapsedsec();
       // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)...
       timer.restart();
@@ -1176,26 +1161,10 @@ int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
 
   return 0;
 }
-template<typename TF>
-int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<TF> *fk) {
-  /* See ../docs/cguru.doc for current documentation.
-
-   For given (stack of) weights cj or coefficients fk, performs NUFFTs with
-   existing (sorted) NU pts and existing plan.
-   For type 1 and 3: cj is input, fk is output.
-   For type 2: fk is input, cj is output.
-   Performs spread/interp, pre/post deconvolve, and FFT as appropriate
-   for each of the 3 types.
-   For cases of ntrans>1, performs work in blocks of size up to batchSize.
-   Return value 0 (no error diagnosis yet).
-   Barnett 5/20/20, based on Malleo 2019.
-*/
-  return p->execute(cj, fk);
-}
-template int finufft_execute_t<float>(FINUFFT_PLAN_T<float> *p, std::complex<float> *cj,
-                                      std::complex<float> *fk);
-template int finufft_execute_t<double>(
-    FINUFFT_PLAN_T<double> *p, std::complex<double> *cj, std::complex<double> *fk);
+template int FINUFFT_PLAN_T<float>::execute(std::complex<float> *cj,
+                                            std::complex<float> *fk);
+template int FINUFFT_PLAN_T<double>::execute(std::complex<double> *cj,
+                                             std::complex<double> *fk);
 
 // DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
 template<typename TF> FINUFFT_PLAN_T<TF>::~FINUFFT_PLAN_T() {
@@ -1203,13 +1172,11 @@ template<typename TF> FINUFFT_PLAN_T<TF>::~FINUFFT_PLAN_T() {
   // Also must not crash if called immediately after finufft_makeplan.
   // Thus either each thing free'd here is guaranteed to be nullptr or correctly
   // allocated.
-  if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array
+  if (fftPlan) fftPlan->free(fwBatch); // free the big FFT (or t3 spread) working array
   if (type == 3) {
-    delete innerT2plan;
-    innerT2plan = nullptr;
-    free(X);
-    free(Y);
-    free(Z);
+    delete[] X;
+    delete[] Y;
+    delete[] Z;
   }
 }
 template FINUFFT_PLAN_T<float>::~FINUFFT_PLAN_T();
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 7c7309de2..a0341a0b5 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1,6 +1,5 @@
 // Spreading/interpolating module within FINUFFT.
 
-#include <finufft/defs.h>
 #include <finufft/spreadinterp.h>
 #include <finufft/utils.h>
 
@@ -9,6 +8,7 @@
 
 #include <xsimd/xsimd.hpp>
 
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
@@ -164,7 +164,7 @@ constexpr std::array<std::array<T, PaddedM>, N> pad_2D_array_with_zeros(
     const std::array<std::array<T, M>, N> &input) noexcept {
   constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept {
     std::array<T, PaddedM> padded{0};
-    for (auto i = 0; i < input.size(); ++i) {
+    for (size_t i = 0; i < input.size(); ++i) {
       padded[i] = input[i];
     }
     return padded;
@@ -187,20 +187,21 @@ FINUFFT_NEVER_INLINE
 void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
                         UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3,
                         UBIGINT M0) {
-  printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1);
+  printf("size1 %" PRIu64 ", padded_size1 %" PRIu64 "\n", size1, padded_size1);
   switch (ndims) {
   case 1:
-    printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1,
-           (long long)padded_size1, (long long)M0);
+    printf("\tsubgrid: off %" PRId64 "\t siz %" PRIu64 "\t #NU %" PRIu64 "\n", offset1,
+           padded_size1, M0);
     break;
   case 2:
-    printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1,
-           (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0);
+    printf("\tsubgrid: off %" PRId64 ",%" PRId64 "\t siz %" PRIu64 ",%" PRIu64
+           "\t #NU %" PRIu64 "\n",
+           offset1, offset2, padded_size1, size2, M0);
     break;
   case 3:
-    printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n",
-           (long long)offset1, (long long)offset2, (long long)offset3,
-           (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0);
+    printf("\tsubgrid: off %" PRId64 ",%" PRId64 ",%" PRId64 "\t siz %" PRIu64 ",%" PRIu64
+           ",%" PRIu64 "\t #NU %" PRIu64 "\n",
+           offset1, offset2, offset3, padded_size1, size2, size3, M0);
     break;
   default:
     printf("Invalid number of dimensions: %d\n", ndims);
@@ -223,13 +224,6 @@ static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept
   return (result - floor(result)) * T(N);
 }
 
-template<typename T, class simd_type>
-static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x,
-                                                    const BIGINT N) noexcept {
-  const simd_type x2pi   = T(M_1_2PI);
-  const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5));
-  return (result - xsimd::floor(result)) * simd_type(T(N));
-}
 template<typename T, uint8_t ns>
 static void set_kernel_args(T *args, T x) noexcept
 // Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1.
@@ -400,7 +394,7 @@ static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *k
       out[0] = std::fma(du[2 * j], ker[dx], out[0]);
       out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]);
     }
-  } else if (i1 + ns >= N1) { // wraps at right
+  } else if (i1 + ns >= BIGINT(N1)) { // wraps at right
     for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) {
       out[0] = std::fma(du[2 * j], ker[dx], out[0]);
       out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]);
@@ -446,14 +440,13 @@ static void interp_line(T *FINUFFT_RESTRICT target, const T *du, const T *ker, B
 */
   using arch_t                       = typename simd_type::arch_type;
   static constexpr auto padding      = get_padding<T, 2 * ns>();
-  static constexpr auto alignment    = arch_t::alignment();
   static constexpr auto simd_size    = simd_type::size;
   static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size));
   std::array<T, 2> out{0};
   const auto j = i1;
   // removing the wrapping leads up to 10% speedup in certain cases
   // moved the wrapping to another function to reduce instruction cache pressure
-  if (i1 < 0 || i1 + ns >= N1 || i1 + ns + (padding + 1) / 2 >= N1) {
+  if (i1 < 0 || i1 + ns >= BIGINT(N1) || i1 + ns + (padding + 1) / 2 >= BIGINT(N1)) {
     return interp_line_wrap<T, ns>(target, du, ker, i1, N1);
   } else { // doesn't wrap
     // logic largely similar to spread 1D kernel, please see the explanation there
@@ -519,7 +512,7 @@ static void interp_square_wrap(T *FINUFFT_RESTRICT target, const T *du, const T
   std::array<T, 2> out{0};
   using arch_t                    = typename simd_type::arch_type;
   static constexpr auto alignment = arch_t::alignment();
-  if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) {
+  if (i1 >= 0 && i1 + ns <= BIGINT(N1) && i2 >= 0 && i2 + ns <= BIGINT(N2)) {
     // store a horiz line (interleaved real,imag)
     alignas(alignment) std::array<T, 2 * ns> line{0};
     // add remaining const-y lines to the line (expensive inner loop)
@@ -539,10 +532,10 @@ static void interp_square_wrap(T *FINUFFT_RESTRICT target, const T *du, const T
     auto x = i1, y = i2;                // initialize coords
     for (uint8_t d{0}; d < ns; d++) {   // set up ptr lists
       if (x < 0) x += BIGINT(N1);
-      if (x >= N1) x -= BIGINT(N1);
+      if (x >= BIGINT(N1)) x -= BIGINT(N1);
       j1[d] = x++;
       if (y < 0) y += BIGINT(N2);
-      if (y >= N2) y -= BIGINT(N2);
+      if (y >= BIGINT(N2)) y -= BIGINT(N2);
       j2[d] = y++;
     }
     for (uint8_t dy{0}; dy < ns; dy++) { // use the pts lists
@@ -598,11 +591,10 @@ static void interp_square(T *FINUFFT_RESTRICT target, const T *du, const T *ker1
   // no wrapping: avoid ptrs
   using arch_t                          = typename simd_type::arch_type;
   static constexpr auto padding         = get_padding<T, 2 * ns>();
-  static constexpr auto alignment       = arch_t::alignment();
   static constexpr auto simd_size       = simd_type::size;
   static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
-  if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2 &&
-      (i1 + ns + (padding + 1) / 2 < N1)) {
+  if (i1 >= 0 && i1 + ns <= BIGINT(N1) && i2 >= 0 && i2 + ns <= BIGINT(N2) &&
+      (i1 + ns + (padding + 1) / 2 < BIGINT(N1))) {
     const auto line = [du, N1, i1 = UBIGINT(i1), i2 = UBIGINT(i2),
                        ker2]() constexpr noexcept {
       // new array du_pts to store the du values for the current y line
@@ -666,9 +658,9 @@ static void interp_cube_wrapped(T *FINUFFT_RESTRICT target, const T *du, const T
    */
   using arch_t                    = typename simd_type::arch_type;
   static constexpr auto alignment = arch_t::alignment();
-  const auto in_bounds_1          = (i1 >= 0) & (i1 + ns <= N1);
-  const auto in_bounds_2          = (i2 >= 0) & (i2 + ns <= N2);
-  const auto in_bounds_3          = (i3 >= 0) & (i3 + ns <= N3);
+  const auto in_bounds_1          = (i1 >= 0) & (i1 + ns <= BIGINT(N1));
+  const auto in_bounds_2          = (i2 >= 0) & (i2 + ns <= BIGINT(N2));
+  const auto in_bounds_3          = (i3 >= 0) & (i3 + ns <= BIGINT(N3));
   std::array<T, 2> out{0};
   // case no wrapping needed but padding spills over du array.
   // Hence, no explicit vectorization but the code is still faster
@@ -701,13 +693,13 @@ static void interp_cube_wrapped(T *FINUFFT_RESTRICT target, const T *du, const T
     auto x = i1, y = i2, z = i3;                                 // initialize coords
     for (uint8_t d{0}; d < ns; d++) {                            // set up ptr lists
       if (x < 0) x += BIGINT(N1);
-      if (x >= N1) x -= BIGINT(N1);
+      if (x >= BIGINT(N1)) x -= BIGINT(N1);
       j1[d] = x++;
       if (y < 0) y += BIGINT(N2);
-      if (y >= N2) y -= BIGINT(N2);
+      if (y >= BIGINT(N2)) y -= BIGINT(N2);
       j2[d] = y++;
       if (z < 0) z += BIGINT(N3);
-      if (z >= N3) z -= BIGINT(N3);
+      if (z >= BIGINT(N3)) z -= BIGINT(N3);
       j3[d] = z++;
     }
     for (uint8_t dz{0}; dz < ns; dz++) {     // use the pts lists
@@ -763,15 +755,14 @@ static void interp_cube(T *FINUFFT_RESTRICT target, const T *du, const T *ker1,
 {
   using arch_t                          = typename simd_type::arch_type;
   static constexpr auto padding         = get_padding<T, 2 * ns>();
-  static constexpr auto alignment       = arch_t::alignment();
   static constexpr auto simd_size       = simd_type::size;
-  static constexpr auto ker23_size      = (ns + simd_size - 1) & -simd_size;
   static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size;
-  const auto in_bounds_1                = (i1 >= 0) & (i1 + ns <= N1);
-  const auto in_bounds_2                = (i2 >= 0) & (i2 + ns <= N2);
-  const auto in_bounds_3                = (i3 >= 0) & (i3 + ns <= N3);
+  const auto in_bounds_1                = (i1 >= 0) & (i1 + ns <= BIGINT(N1));
+  const auto in_bounds_2                = (i2 >= 0) & (i2 + ns <= BIGINT(N2));
+  const auto in_bounds_3                = (i3 >= 0) & (i3 + ns <= BIGINT(N3));
   std::array<T, 2> out{0};
-  if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1) / 2 < N1)) {
+  if (in_bounds_1 && in_bounds_2 && in_bounds_3 &&
+      (i1 + ns + (padding + 1) / 2 < BIGINT(N1))) {
     const auto line = [N1, N2, i1 = UBIGINT(i1), i2 = UBIGINT(i2), i3 = UBIGINT(i3), ker2,
                        ker3, du]() constexpr noexcept {
       std::array<simd_type, line_vectors> line{0};
@@ -844,7 +835,7 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval(
    */
   const std::array inputs{elems...};
   // compile time loop, no performance overhead
-  for (auto i = 0; i < sizeof...(elems); ++i) {
+  for (size_t i = 0; i < sizeof...(elems); ++i) {
     // compile time branch no performance overhead
     if constexpr (kerevalmeth == 1) {
       if (opts.upsampfac == 2.0) {
@@ -1365,27 +1356,27 @@ static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
   };
 
   BIGINT y = offset2, z = offset3; // fill wrapped ptr lists in slower dims y,z...
-  for (int i = 0; i < size2; ++i) {
+  for (UBIGINT i = 0; i < size2; ++i) {
     if (y < 0) y += BIGINT(N2);
-    if (y >= N2) y -= BIGINT(N2);
+    if (y >= BIGINT(N2)) y -= BIGINT(N2);
     o2[i] = y++;
   }
-  for (int i = 0; i < size3; ++i) {
+  for (UBIGINT i = 0; i < size3; ++i) {
     if (z < 0) z += BIGINT(N3);
-    if (z >= N3) z -= BIGINT(N3);
+    if (z >= BIGINT(N3)) z -= BIGINT(N3);
     o3[i] = z++;
   }
   UBIGINT nlo = (offset1 < 0) ? -offset1 : 0; // # wrapping below in x
   UBIGINT nhi = (offset1 + size1 > N1) ? offset1 + size1 - N1 : 0; // " above in x
   // this triple loop works in all dims
-  for (int dz = 0; dz < size3; dz++) {               // use ptr lists in each axis
+  for (UBIGINT dz = 0; dz < size3; dz++) {           // use ptr lists in each axis
     const auto oz = N1 * N2 * o3[dz];                // offset due to z (0 in <3D)
-    for (int dy = 0; dy < size2; dy++) {
+    for (UBIGINT dy = 0; dy < size2; dy++) {
       const auto oy              = N1 * o2[dy] + oz; // off due to y & z (0 in 1D)
       auto *FINUFFT_RESTRICT out = data_uniform + 2 * oy;
       const auto in = du0 + 2 * padded_size1 * (dy + size2 * dz); // ptr to subgrid array
       auto o        = 2 * (offset1 + N1);                         // 1d offset for output
-      for (auto j = 0; j < 2 * nlo; j++) { // j is really dx/2 (since re,im parts)
+      for (UBIGINT j = 0; j < 2 * nlo; j++) { // j is really dx/2 (since re,im parts)
         accumulate(out[j + o], in[j]);
       }
       o = 2 * offset1;
@@ -1447,7 +1438,7 @@ static void bin_sort_singlethread(std::vector<BIGINT> &ret, UBIGINT M, const T *
   // count how many pts in each bin
   std::vector<BIGINT> counts(nbins, 0);
 
-  for (auto i = 0; i < M; i++) {
+  for (UBIGINT i = 0; i < M; i++) {
     // find the bin index in however many dims are needed
     const auto i1  = BIGINT(fold_rescale<T>(kx[i], N1) * inv_bin_size_x);
     const auto i2  = isky ? BIGINT(fold_rescale<T>(ky[i], N2) * inv_bin_size_y) : 0;
@@ -1464,7 +1455,7 @@ static void bin_sort_singlethread(std::vector<BIGINT> &ret, UBIGINT M, const T *
     current_offset += tmp;
   } // (counts now contains the index offsets for each bin)
 
-  for (auto i = 0; i < M; i++) {
+  for (UBIGINT i = 0; i < M; i++) {
     // find the bin index (again! but better than using RAM)
     const auto i1    = BIGINT(fold_rescale<T>(kx[i], N1) * inv_bin_size_x);
     const auto i2    = isky ? BIGINT(fold_rescale<T>(ky[i], N2) * inv_bin_size_y) : 0;
@@ -1726,7 +1717,7 @@ int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, T *ky, T *
 */
 {
   // INPUT CHECKING & REPORTING .... cuboid not too small for spreading?
-  int minN = 2 * opts.nspread;
+  UBIGINT minN = UBIGINT(2 * opts.nspread);
   if (N1 < minN || (N2 > 1 && N2 < minN) || (N3 > 1 && N3 < minN)) {
     fprintf(stderr,
             "%s error: one or more non-trivial box dims is less than 2.nspread!\n",
@@ -1815,8 +1806,8 @@ int indexSort(std::vector<BIGINT> &sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT
     did_sort = 1;
   } else {
 #pragma omp parallel for num_threads(maxnthr) schedule(static, 1000000)
-    for (BIGINT i = 0; i < M; i++) // here omp helps xeon, hinders i7
-      sort_indices[i] = i;         // the identity permutation
+    for (BIGINT i = 0; i < BIGINT(M); i++) // here omp helps xeon, hinders i7
+      sort_indices[i] = i;                 // the identity permutation
     if (opts.debug)
       printf("\tnot sorted (sort=%d): \t%.3g s\n", (int)opts.sort, timer.elapsedsec());
   }
@@ -1887,21 +1878,22 @@ static int spreadSorted(const std::vector<BIGINT> &sort_indices, UBIGINT N1, UBI
       printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n");
 
     std::vector<UBIGINT> brk(nb + 1); // NU index breakpoints defining nb subproblems
-    for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb;
+    for (UBIGINT p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb;
 
 #pragma omp parallel num_threads(nthr)
     {
       // local copies of NU pts and data for each subproblem
       std::vector<T> kx0{}, ky0{}, kz0{}, dd0{}, du0{};
-#pragma omp for schedule(dynamic, 1)               // each is big
-      for (int isub = 0; isub < nb; isub++) {      // Main loop through the subproblems
-        const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem
+#pragma omp for schedule(dynamic, 1)                     // each is big
+      for (BIGINT isub = 0; isub < BIGINT(nb); isub++) { // Main loop through the
+                                                         // subproblems
+        const auto M0 = brk[isub + 1] - brk[isub];       // # NU pts in this subproblem
         // copy the location and data vectors for the nonuniform points
         kx0.resize(M0);
         ky0.resize(M0 * (N2 > 1));
         kz0.resize(M0 * (N3 > 1));
         dd0.resize(2 * M0);                            // complex strength data
-        for (auto j = 0; j < M0; j++) {                // todo: can avoid this copying?
+        for (UBIGINT j = 0; j < M0; j++) {             // todo: can avoid this copying?
           const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list
           kx0[j]        = fold_rescale<T>(kx[kk], N1);
           if (N2 > 1) ky0[j] = fold_rescale<T>(ky[kk], N2);
@@ -1950,7 +1942,8 @@ static int spreadSorted(const std::vector<BIGINT> &sort_indices, UBIGINT N1, UBI
       } // end main loop over subprobs
     }
     if (opts.debug)
-      printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), nb);
+      printf("\tt1 fancy spread: \t%.3g s (%" PRIu64 " subprobs)\n", timer.elapsedsec(),
+             nb);
   } // end of choice of which t1 spread type to use
   return 0;
 };
@@ -1998,10 +1991,10 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel(
     // main loop over NU trgs, interp each from U
     // (note: windows omp doesn't like unsigned loop vars)
 #pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts:
-    for (BIGINT i = 0; i < M; i += CHUNKSIZE) {
+    for (BIGINT i = 0; i < BIGINT(M); i += CHUNKSIZE) {
       // Setup buffers for this chunk
       const UBIGINT bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE;
-      for (int ibuf = 0; ibuf < bufsize; ibuf++) {
+      for (UBIGINT ibuf = 0; ibuf < bufsize; ibuf++) {
         UBIGINT j    = sort_indices[i + ibuf];
         jlist[ibuf]  = j;
         xjlist[ibuf] = fold_rescale<T>(kx[j], N1);
@@ -2010,7 +2003,7 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel(
       }
 
       // Loop over targets in chunk
-      for (int ibuf = 0; ibuf < bufsize; ibuf++) {
+      for (UBIGINT ibuf = 0; ibuf < bufsize; ibuf++) {
         const auto xj = xjlist[ibuf];
         const auto yj = (ndims > 1) ? yjlist[ibuf] : 0;
         const auto zj = (ndims > 2) ? zjlist[ibuf] : 0;
@@ -2052,7 +2045,7 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel(
       } // end loop over targets in chunk
 
       // Copy result buffer to output array
-      for (int ibuf = 0; ibuf < bufsize; ibuf++) {
+      for (UBIGINT ibuf = 0; ibuf < bufsize; ibuf++) {
         const UBIGINT j            = jlist[ibuf];
         data_nonuniform[2 * j]     = outbuf[2 * ibuf];
         data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1];
@@ -2154,8 +2147,9 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps
    opts. dim is spatial dimension (1,2, or 3). See finufft.cpp:finufft_plan() for where
    upsampfac is set. Must call this before any kernel evals done, otherwise segfault
    likely. Returns: 0  : success FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be
-   achieved, but proceed with best possible eps otherwise : failure (see codes in defs.h);
-   spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20.
+   achieved, but proceed with best possible eps otherwise : failure (see codes in
+   finufft_errors.h); spreading must not proceed Barnett 2017. debug, loosened eps logic
+   6/14/20.
 */
 {
   constexpr T EPSILON = std::numeric_limits<T>::epsilon();
@@ -2206,7 +2200,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps
   if (upsampfac == 2.0)                    // standard sigma (see SISC paper)
     ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of 10
   else                                     // custom sigma
-    ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1
+    ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1
   ns = max(2, ns);        // (we don't have ns=1 version yet)
   if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules
     if (showwarn)
@@ -2228,7 +2222,8 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps
   if (ns == 4) betaoverns = 2.38;
   if (upsampfac != 2.0) { // again, override beta for custom sigma
     T gamma    = 0.97;    // must match devel/gen_all_horner_C_code.m !
-    betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff
+    betaoverns = gamma * T(M_PI) * (1.0 - 1.0 / (2 * upsampfac)); // formula based on
+                                                                  // cutoff
   }
   opts.ES_beta = betaoverns * ns; // set the kernel beta parameter
   if (debug)
diff --git a/src/utils.cpp b/src/utils.cpp
index f64009132..2627a179a 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -73,7 +73,7 @@ int get_num_threads_parallel_block()
 }
 
 // ---------- thread-safe rand number generator for Windows platform ---------
-// (note this is used by macros in defs.h, and supplied in linux/macosx)
+// (note this is used by macros in test_defs.h, and supplied in linux/macosx)
 #ifdef _WIN32
 int rand_r(unsigned int * /*seedp*/)
 // Libin Lu, 6/18/20
diff --git a/test/directft/dirft1d.cpp b/test/directft/dirft1d.cpp
index 5f36d76d7..c80299b47 100644
--- a/test/directft/dirft1d.cpp
+++ b/test/directft/dirft1d.cpp
@@ -1,11 +1,13 @@
-#include <finufft/defs.h>
 #include <finufft/dirft.h>
+#include <finufft/finufft_core.h>
 #include <iostream>
 
 // This is basically a port of dirft1d.f from CMCL package, except with
 // the 1/nj prefactors for type-1 removed.
 
-void dirft1d1(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f)
+template<typename T>
+void dirft1d1(BIGINT nj, T *x, std::complex<T> *c, int iflag, BIGINT ms,
+              std::complex<T> *f)
 /* Direct computation of 1D type-1 nonuniform FFT. Interface same as finufft1d1.
 c                  nj-1
 c     f[k1]    =   SUM  c[j] exp(+-i k1 x[j])
@@ -17,12 +19,14 @@ c     used, otherwise the - sign is used, in the exponential.
 *  Uses C++ complex type and winding trick.  Barnett 1/25/17
 */
 {
-  BIGINT kmin = -(ms / 2);                          // integer divide
-  for (BIGINT m = 0; m < ms; ++m) f[m] = CPX(0, 0); // it knows f is complex type
+  BIGINT kmin = -(ms / 2);        // integer divide
+  for (BIGINT m = 0; m < ms; ++m)
+    f[m] = std::complex<T>(0, 0); // it knows f is complex type
   for (BIGINT j = 0; j < nj; ++j) {
-    CPX a  = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
-    CPX p  = pow(a, (FLT)kmin); // starting phase for most neg freq
-    CPX cc = c[j];              // no 1/nj prefac
+    std::complex<T> a  = (iflag > 0) ? exp(std::complex<T>(0, 1) * x[j])
+                                     : exp(-std::complex<T>(0, 1) * x[j]);
+    std::complex<T> p  = pow(a, (T)kmin); // starting phase for most neg freq
+    std::complex<T> cc = c[j];            // no 1/nj prefac
     for (BIGINT m = 0; m < ms; ++m) {
       f[m] += cc * p;
       p *= a;
@@ -30,7 +34,9 @@ c     used, otherwise the - sign is used, in the exponential.
   }
 }
 
-void dirft1d2(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f)
+template<typename T>
+void dirft1d2(BIGINT nj, T *x, std::complex<T> *c, int iflag, BIGINT ms,
+              std::complex<T> *f)
 /* Direct computation of 1D type-2 nonuniform FFT. Interface same as finufft1d2
 c
 c     c[j] = SUM   f[k1] exp(+-i k1 x[j])
@@ -44,9 +50,10 @@ c     used, otherwise the - sign is used, in the exponential.
 {
   BIGINT kmin = -(ms / 2); // integer divide
   for (BIGINT j = 0; j < nj; ++j) {
-    CPX a  = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
-    CPX p  = pow(a, (FLT)kmin); // starting phase for most neg freq
-    CPX cc = CPX(0, 0);
+    std::complex<T> a  = (iflag > 0) ? exp(std::complex<T>(0, 1) * x[j])
+                                     : exp(-std::complex<T>(0, 1) * x[j]);
+    std::complex<T> p  = pow(a, (T)kmin); // starting phase for most neg freq
+    std::complex<T> cc = std::complex<T>(0, 0);
     for (BIGINT m = 0; m < ms; ++m) {
       cc += f[m] * p;
       p *= a;
@@ -55,7 +62,9 @@ c     used, otherwise the - sign is used, in the exponential.
   }
 }
 
-void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f)
+template<typename T>
+void dirft1d3(BIGINT nj, T *x, std::complex<T> *c, int iflag, BIGINT nk, T *s,
+              std::complex<T> *f)
 /* Direct computation of 1D type-3 nonuniform FFT. Interface same as finufft1d3
 c              nj-1
 c     f[k]  =  SUM   c[j] exp(+-i s[k] x[j])
@@ -66,8 +75,9 @@ c  exponential. Uses C++ complex type. Simple brute force.  Barnett 1/25/17
 */
 {
   for (BIGINT k = 0; k < nk; ++k) {
-    CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k];
-    f[k]   = CPX(0, 0);
+    std::complex<T> ss =
+        (iflag > 0) ? std::complex<T>(0, 1) * s[k] : -std::complex<T>(0, 1) * s[k];
+    f[k] = std::complex<T>(0, 0);
     for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j]);
   }
 }
diff --git a/test/directft/dirft2d.cpp b/test/directft/dirft2d.cpp
index c13661549..62f126c15 100644
--- a/test/directft/dirft2d.cpp
+++ b/test/directft/dirft2d.cpp
@@ -1,11 +1,13 @@
-#include <finufft/defs.h>
 #include <finufft/dirft.h>
+#include <finufft/finufft_core.h>
 #include <iostream>
 
 // This is basically a port of dirft2d.f from CMCL package, except with
 // the 1/nj prefactors for type-1 removed.
 
-void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f)
+template<typename T>
+void dirft2d1(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT ms, BIGINT mt,
+              std::complex<T> *f)
 /* Direct computation of 2D type-1 nonuniform FFT. Interface same as finufft2d1.
 c                  nj-1
 c     f[k1,k2] =   SUM  c[j] exp(+-i (k1 x[j] + k2 y[j]))
@@ -18,19 +20,22 @@ c     used, otherwise the - sign is used, in the exponential.
 *  Uses C++ complex type and winding trick.  Barnett 1/26/17
 */
 {
-  BIGINT k1min = -(ms / 2), k2min = -(mt / 2);     // integer divide
-  BIGINT N = ms * mt;                              // total # output modes
-  for (BIGINT m = 0; m < N; ++m) f[m] = CPX(0, 0); // it knows f is complex type
-  for (BIGINT j = 0; j < nj; ++j) {                // src pts
-    CPX a1   = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
-    CPX a2   = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]);
-    CPX sp1  = pow(a1, (FLT)k1min);        // starting phase for most neg k1 freq
-    CPX p2   = pow(a2, (FLT)k2min);
-    CPX cc   = c[j];                       // no 1/nj norm
-    BIGINT m = 0;                          // output pointer
+  BIGINT k1min = -(ms / 2), k2min = -(mt / 2); // integer divide
+  BIGINT N = ms * mt;                          // total # output modes
+  for (BIGINT m = 0; m < N; ++m)
+    f[m] = std::complex<T>(0, 0);              // it knows f is complex type
+  for (BIGINT j = 0; j < nj; ++j) {            // src pts
+    std::complex<T> a1  = (iflag > 0) ? exp(std::complex<T>(0, 1) * x[j])
+                                      : exp(-std::complex<T>(0, 1) * x[j]);
+    std::complex<T> a2  = (iflag > 0) ? exp(std::complex<T>(0, 1) * y[j])
+                                      : exp(-std::complex<T>(0, 1) * y[j]);
+    std::complex<T> sp1 = pow(a1, (T)k1min); // starting phase for most neg k1 freq
+    std::complex<T> p2  = pow(a2, (T)k2min);
+    std::complex<T> cc  = c[j];              // no 1/nj norm
+    BIGINT m            = 0;                 // output pointer
     for (BIGINT m2 = 0; m2 < mt; ++m2) {
-      CPX p1 = sp1;                        // must reset p1 for each inner loop
-      for (BIGINT m1 = 0; m1 < ms; ++m1) { // ms is fast, mt slow
+      std::complex<T> p1 = sp1;              // must reset p1 for each inner loop
+      for (BIGINT m1 = 0; m1 < ms; ++m1) {   // ms is fast, mt slow
         f[m++] += cc * p1 * p2;
         p1 *= a1;
       }
@@ -39,7 +44,9 @@ c     used, otherwise the - sign is used, in the exponential.
   }
 }
 
-void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f)
+template<typename T>
+void dirft2d2(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT ms, BIGINT mt,
+              std::complex<T> *f)
 /* Direct computation of 2D type-2 nonuniform FFT. Interface same as finufft2d2
 
      c[j] = SUM   f[k1,k2] exp(+-i (k1 x[j] + k2 y[j]))
@@ -56,14 +63,16 @@ void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt
 {
   BIGINT k1min = -(ms / 2), k2min = -(mt / 2); // integer divide
   for (BIGINT j = 0; j < nj; ++j) {
-    CPX a1   = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
-    CPX a2   = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]);
-    CPX sp1  = pow(a1, (FLT)k1min);
-    CPX p2   = pow(a2, (FLT)k2min);
-    CPX cc   = CPX(0, 0);
-    BIGINT m = 0; // input pointer
+    std::complex<T> a1  = (iflag > 0) ? exp(std::complex<T>(0, 1) * x[j])
+                                      : exp(-std::complex<T>(0, 1) * x[j]);
+    std::complex<T> a2  = (iflag > 0) ? exp(std::complex<T>(0, 1) * y[j])
+                                      : exp(-std::complex<T>(0, 1) * y[j]);
+    std::complex<T> sp1 = pow(a1, (T)k1min);
+    std::complex<T> p2  = pow(a2, (T)k2min);
+    std::complex<T> cc  = std::complex<T>(0, 0);
+    BIGINT m            = 0; // input pointer
     for (BIGINT m2 = 0; m2 < mt; ++m2) {
-      CPX p1 = sp1;
+      std::complex<T> p1 = sp1;
       for (BIGINT m1 = 0; m1 < ms; ++m1) {
         cc += f[m++] * p1 * p2;
         p1 *= a1;
@@ -74,8 +83,9 @@ void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt
   }
 }
 
-void dirft2d3(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT nk, FLT *s, FLT *t,
-              CPX *f)
+template<typename T>
+void dirft2d3(BIGINT nj, T *x, T *y, std::complex<T> *c, int iflag, BIGINT nk, T *s, T *t,
+              std::complex<T> *f)
 /* Direct computation of 2D type-3 nonuniform FFT. Interface same as finufft2d3
 c               nj-1
 c     f[k]  =   SUM   c[j] exp(+-i (s[k] x[j] + t[k] y[j]))
@@ -86,9 +96,11 @@ c  exponential. Uses C++ complex type. Simple brute force.  Barnett 1/26/17
 */
 {
   for (BIGINT k = 0; k < nk; ++k) {
-    CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k];
-    CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k];
-    f[k]   = CPX(0, 0);
+    std::complex<T> ss =
+        (iflag > 0) ? std::complex<T>(0, 1) * s[k] : -std::complex<T>(0, 1) * s[k];
+    std::complex<T> tt =
+        (iflag > 0) ? std::complex<T>(0, 1) * t[k] : -std::complex<T>(0, 1) * t[k];
+    f[k] = std::complex<T>(0, 0);
     for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j]);
   }
 }
diff --git a/test/directft/dirft3d.cpp b/test/directft/dirft3d.cpp
index 452b62471..b77111257 100644
--- a/test/directft/dirft3d.cpp
+++ b/test/directft/dirft3d.cpp
@@ -1,12 +1,13 @@
-#include <finufft/defs.h>
 #include <finufft/dirft.h>
+#include <finufft/finufft_core.h>
 #include <iostream>
 
 // This is basically a port of dirft2d.f from CMCL package, except with
 // the 1/nj prefactors for type-1 removed.
 
-void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt,
-              BIGINT mu, CPX *f)
+template<typename T>
+void dirft3d1(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT ms,
+              BIGINT mt, BIGINT mu, std::complex<T> *f)
 /* Direct computation of 3D type-1 nonuniform FFT. Interface same as finufft3d1.
 c                     nj-1
 c     f[k1,k2,k3] =   SUM  c[j] exp(+-i (k1 x[j] + k2 y[j] + k2 z[j]))
@@ -22,20 +23,24 @@ c     used, otherwise the - sign is used, in the exponential.
 {
   BIGINT k1min = -(ms / 2), k2min = -(mt / 2), k3min = -(mu / 2); // integer divide
   BIGINT N = ms * mt * mu;                                        // total # output modes
-  for (BIGINT m = 0; m < N; ++m) f[m] = CPX(0, 0); // it knows f is complex type
-  for (BIGINT j = 0; j < nj; ++j) {                // src pts
-    CPX a1   = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
-    CPX a2   = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]);
-    CPX a3   = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]);
-    CPX sp1  = pow(a1, (FLT)k1min); // starting phase for most neg k1 freq
-    CPX sp2  = pow(a2, (FLT)k2min);
-    CPX p3   = pow(a3, (FLT)k3min);
-    CPX cc   = c[j]; // no 1/nj norm
-    BIGINT m = 0;    // output pointer
+  for (BIGINT m = 0; m < N; ++m)
+    f[m] = std::complex<T>(0, 0);   // it knows f is complex type
+  for (BIGINT j = 0; j < nj; ++j) { // src pts
+    std::complex<T> a1  = (iflag > 0) ? exp(std::complex<T>(0, 1) * x[j])
+                                      : exp(-std::complex<T>(0, 1) * x[j]);
+    std::complex<T> a2  = (iflag > 0) ? exp(std::complex<T>(0, 1) * y[j])
+                                      : exp(-std::complex<T>(0, 1) * y[j]);
+    std::complex<T> a3  = (iflag > 0) ? exp(std::complex<T>(0, 1) * z[j])
+                                      : exp(-std::complex<T>(0, 1) * z[j]);
+    std::complex<T> sp1 = pow(a1, (T)k1min); // starting phase for most neg k1 freq
+    std::complex<T> sp2 = pow(a2, (T)k2min);
+    std::complex<T> p3  = pow(a3, (T)k3min);
+    std::complex<T> cc  = c[j]; // no 1/nj norm
+    BIGINT m            = 0;    // output pointer
     for (BIGINT m3 = 0; m3 < mu; ++m3) {
-      CPX p2 = sp2;
+      std::complex<T> p2 = sp2;
       for (BIGINT m2 = 0; m2 < mt; ++m2) {
-        CPX p1 = sp1;
+        std::complex<T> p1 = sp1;
         for (BIGINT m1 = 0; m1 < ms; ++m1) {
           f[m++] += cc * p1 * p2 * p3;
           p1 *= a1;
@@ -47,8 +52,9 @@ c     used, otherwise the - sign is used, in the exponential.
   }
 }
 
-void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt,
-              BIGINT mu, CPX *f)
+template<typename T>
+void dirft3d2(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT ms,
+              BIGINT mt, BIGINT mu, std::complex<T> *f)
 /* Direct computation of 3D type-2 nonuniform FFT. Interface same as finufft3d2
 
      c[j] =   SUM    f[k1,k2,k3] exp(+-i (k1 x[j] + k2 y[j] + k3 z[j]))
@@ -66,18 +72,21 @@ void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, B
 {
   BIGINT k1min = -(ms / 2), k2min = -(mt / 2), k3min = -(mu / 2); // integer divide
   for (BIGINT j = 0; j < nj; ++j) {
-    CPX a1   = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
-    CPX a2   = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]);
-    CPX a3   = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]);
-    CPX sp1  = pow(a1, (FLT)k1min);
-    CPX sp2  = pow(a2, (FLT)k2min);
-    CPX p3   = pow(a3, (FLT)k3min);
-    CPX cc   = CPX(0, 0);
-    BIGINT m = 0; // input pointer
+    std::complex<T> a1  = (iflag > 0) ? exp(std::complex<T>(0, 1) * x[j])
+                                      : exp(-std::complex<T>(0, 1) * x[j]);
+    std::complex<T> a2  = (iflag > 0) ? exp(std::complex<T>(0, 1) * y[j])
+                                      : exp(-std::complex<T>(0, 1) * y[j]);
+    std::complex<T> a3  = (iflag > 0) ? exp(std::complex<T>(0, 1) * z[j])
+                                      : exp(-std::complex<T>(0, 1) * z[j]);
+    std::complex<T> sp1 = pow(a1, (T)k1min);
+    std::complex<T> sp2 = pow(a2, (T)k2min);
+    std::complex<T> p3  = pow(a3, (T)k3min);
+    std::complex<T> cc  = std::complex<T>(0, 0);
+    BIGINT m            = 0; // input pointer
     for (BIGINT m3 = 0; m3 < mu; ++m3) {
-      CPX p2 = sp2;
+      std::complex<T> p2 = sp2;
       for (BIGINT m2 = 0; m2 < mt; ++m2) {
-        CPX p1 = sp1;
+        std::complex<T> p1 = sp1;
         for (BIGINT m1 = 0; m1 < ms; ++m1) {
           cc += f[m++] * p1 * p2 * p3;
           p1 *= a1;
@@ -90,8 +99,9 @@ void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, B
   }
 }
 
-void dirft3d3(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT nk, FLT *s,
-              FLT *t, FLT *u, CPX *f)
+template<typename T>
+void dirft3d3(BIGINT nj, T *x, T *y, T *z, std::complex<T> *c, int iflag, BIGINT nk, T *s,
+              T *t, T *u, std::complex<T> *f)
 /* Direct computation of 3D type-3 nonuniform FFT. Interface same as finufft3d3
 c               nj-1
 c     f[k]  =   SUM   c[j] exp(+-i (s[k] x[j] + t[k] y[j] + u[k] z[j]))
@@ -102,10 +112,13 @@ c  exponential. Uses C++ complex type. Simple brute force.  Barnett 2/1/17
 */
 {
   for (BIGINT k = 0; k < nk; ++k) {
-    CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k];
-    CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k];
-    CPX uu = (iflag > 0) ? IMA * u[k] : -IMA * u[k];
-    f[k]   = CPX(0, 0);
+    std::complex<T> ss =
+        (iflag > 0) ? std::complex<T>(0, 1) * s[k] : -std::complex<T>(0, 1) * s[k];
+    std::complex<T> tt =
+        (iflag > 0) ? std::complex<T>(0, 1) * t[k] : -std::complex<T>(0, 1) * t[k];
+    std::complex<T> uu =
+        (iflag > 0) ? std::complex<T>(0, 1) * u[k] : -std::complex<T>(0, 1) * u[k];
+    f[k] = std::complex<T>(0, 0);
     for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j] + uu * z[j]);
   }
 }
diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp
index 5e74d6d30..e182cfe5f 100644
--- a/test/dumbinputs.cpp
+++ b/test/dumbinputs.cpp
@@ -103,7 +103,7 @@ int main(int argc, char *argv[]) {
     printf("1d1 M<0:\twrong err code %d\n", ier);
     return 1;
   }
-  int64_t Mhuge = (int64_t)(1e16); // cf defs.h MAX_NU_PTS
+  int64_t Mhuge = (int64_t)(1e16); // cf finufft_core.h MAX_NU_PTS
   ier           = FINUFFT1D1(Mhuge, x, c, +1, acc, 0, F, &opts);
   if (ier != FINUFFT_ERR_NUM_NU_PTS_INVALID) {
     printf("1d1 M huge:\twrong err code %d\n", ier);