diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bae95ad5..435bcd8c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,7 @@ endif() # This set of sources is compiled twice, once in single precision and once in # double precision The single precision compilation is done with -DSINGLE -set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/fft.cpp +set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/simpleinterfaces.cpp) # If we're building for Fortran, make sure we also include the translation @@ -258,11 +258,15 @@ if(FINUFFT_USE_CPU) add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) set_finufft_options(finufft_f64) if(NOT FINUFFT_STATIC_LINKING) - add_library(finufft SHARED src/spreadinterp.cpp src/utils_precindep.cpp - contrib/legendre_rule_fast.cpp) + add_library( + finufft SHARED + src/spreadinterp.cpp src/utils_precindep.cpp + contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp) else() - add_library(finufft STATIC src/spreadinterp.cpp src/utils_precindep.cpp - contrib/legendre_rule_fast.cpp) + add_library( + finufft STATIC + src/spreadinterp.cpp src/utils_precindep.cpp + contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp) endif() target_link_libraries(finufft PRIVATE finufft_f32 finufft_f64) set_finufft_options(finufft) diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 4265ced01..084ffa41c 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -18,6 +18,7 @@ // public header gives access to f_opts, f_spread_opts, f_plan... // (and clobbers FINUFFT* macros; watch out!) #include +#include #include // --------------- Private data types for compilation in either prec --------- @@ -25,8 +26,8 @@ // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. 
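Note (illustrative, not part of the diff): the CMake change above moves src/fft.cpp and src/finufft_core.cpp out of the twice-compiled (-DSINGLE) source list and into the precision-independent library. A minimal sketch of the underlying pattern, using made-up names: templated code is compiled once, and both precisions are produced by explicit instantiation in that single translation unit.

// sketch_core.cpp -- illustrative only; names are not the library's.
// The function template is compiled once; the explicit instantiations below
// emit both the float and the double versions, so the file no longer needs to
// appear in FINUFFT_PRECISION_DEPENDENT_SOURCES.
#include <cstdio>

template<typename TF> int core_work(TF tol) {
  std::printf("running with tol = %g\n", static_cast<double>(tol));
  return 0;
}

template int core_work<float>(float);    // single precision
template int core_work<double>(double);  // double precision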
-using BIGINT = int64_t; -using UBIGINT = uint64_t; +// using BIGINT = int64_t; +// using UBIGINT = uint64_t; // Precision-independent real and complex types, for private lib/test compile #ifdef SINGLE using FLT = float; @@ -36,59 +37,6 @@ using FLT = double; #include // we define C++ complex type only using CPX = std::complex; -// inline macro, to force inlining of small functions -// this avoids the use of macros to implement functions -#if defined(_MSC_VER) -#define FINUFFT_ALWAYS_INLINE __forceinline inline -#define FINUFFT_NEVER_INLINE __declspec(noinline) -#define FINUFFT_RESTRICT __restrict -#define FINUFFT_UNREACHABLE __assume(0) -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#elif defined(__GNUC__) || defined(__clang__) -#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline -#define FINUFFT_NEVER_INLINE __attribute__((noinline)) -#define FINUFFT_RESTRICT __restrict__ -#define FINUFFT_UNREACHABLE __builtin_unreachable() -#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) -#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) -#else -#define FINUFFT_ALWAYS_INLINE inline -#define FINUFFT_NEVER_INLINE -#define FINUFFT_RESTRICT -#define FINUFFT_UNREACHABLE -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#endif - -// ------------- Library-wide algorithm parameter settings ---------------- - -// Library version (is a string) -#define FINUFFT_VER "2.3.0" - -// Smallest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MIN_NSPREAD = 2; - -// Largest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MAX_NSPREAD = 16; - -// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 -inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; - -// Max number of positive quadr nodes for kernel FT (used only in common.cpp) -inline constexpr int MAX_NQUAD = 100; - -// Internal (nf1 etc) array allocation size that immediately raises error. -// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) -// Increase this if you need >10TB (!) RAM... -inline constexpr BIGINT MAX_NF = BIGINT(1e12); - -// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 -// values for M = nj (also nk in type 3)... -inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); - // -------------- Math consts (not in math.h) and useful math macros ---------- #include @@ -108,13 +56,6 @@ inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); // to avoid mixed precision operators in eg i*pi, an either-prec PI... #define PI FLT(M_PI) -// machine epsilon for decisions of achievable tolerance... -#ifdef SINGLE -#define EPSILON (float)6e-08 -#else -#define EPSILON (double)1.1e-16 -#endif - // Random numbers: crappy unif random number generator in [0,1). // These macros should probably be replaced by modern C++ std lib or random123. 
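Note (illustrative, not part of the diff): the EPSILON macro removed above hard-coded the unit roundoff per precision (6e-08 for float, 1.1e-16 for double). In templated code the same quantity can be derived from std::numeric_limits; the sketch below is one way to do it, not necessarily the library's actual replacement.

#include <cstdio>
#include <limits>

// The removed macro stored the unit roundoff (~eps/2); numeric_limits<T>::epsilon()
// is the spacing at 1.0, so halve it to match the old values.
template<typename TF> constexpr TF unit_roundoff() {
  return std::numeric_limits<TF>::epsilon() / TF(2);
}

int main() {
  std::printf("float: %g  double: %g\n",
              static_cast<double>(unit_roundoff<float>()),
              unit_roundoff<double>());   // ~6e-08 and ~1.1e-16
}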
// (RAND_MAX is in stdlib.h) @@ -148,32 +89,6 @@ static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) { } #endif -// ----- OpenMP macros which also work when omp not present ----- -// Allows compile-time switch off of openmp, so compilation without any openmp -// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) -#ifdef _OPENMP -#include -// point to actual omp utils -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { - return omp_get_num_threads(); -} -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { - return omp_get_max_threads(); -} -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { - return omp_get_thread_num(); -} -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { - omp_set_num_threads(x); -} -#else -// non-omp safe dummy versions of omp utils... -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; } -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} -#endif - // Prec-switching name macros (respond to SINGLE), used in lib & test sources // and the plan object below. // Note: crucially, these are now indep of macros used to gen public finufft.h! @@ -219,70 +134,6 @@ static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} // NB: now private (the public C++ or C etc user sees an opaque pointer to it) #include // (must come after complex.h) - -// group together a bunch of type 3 rescaling/centering/phasing parameters: -template struct type3params { - T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale - T X2, C2, D2, h2, gam2; // y - T X3, C3, D3, h3, gam3; // z -}; - -struct FINUFFT_PLAN_S { // the main plan object, fully C++ - // These default and delete specifications just state the obvious, - // but are here to silence compiler warnings. - FINUFFT_PLAN_S() = default; - // Copy construction and assignent are already deleted implicitly - // because of the unique_ptr member. - FINUFFT_PLAN_S(const FINUFFT_PLAN_S &) = delete; - FINUFFT_PLAN_S &operator=(const FINUFFT_PLAN_S &) = delete; - - int type; // transform type (Rokhlin naming): 1,2 or 3 - int dim; // overall dimension: 1,2 or 3 - int ntrans; // how many transforms to do at once (vector or "many" mode) - BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) - BIGINT nk; // number of NU freq pts (type 3 only) - FLT tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc - int nbatch; // how many batches done to cover all ntrans vectors - - BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 - BIGINT mt; // number of modes in y (2) direction = N2 - BIGINT mu; // number of modes in z (3) direction = N3 - BIGINT N; // total # modes (prod of above three) - - BIGINT nf1; // size of internal fine grid in x (1) direction - BIGINT nf2; // " y (2) - BIGINT nf3; // " z (3) - BIGINT nf; // total # fine grid points (product of the above three) - - int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 - - FLT *phiHat1; // FT of kernel in t1,2, on x-axis mode grid - FLT *phiHat2; // " y-axis. - FLT *phiHat3; // " z-axis. - - CPX *fwBatch; // (batches of) fine grid(s) for FFTW to plan - // & act on. 
Usually the largest working array - - BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp - bool didSort; // whether binsorting used (false: identity perm used) - - FLT *X, *Y, *Z; // for t1,2: ptr to user-supplied NU pts (no new allocs). - // for t3: allocated as "primed" (scaled) src pts x'_j, etc - - // type 3 specific - FLT *S, *T, *U; // pointers to user's target NU pts arrays (no new allocs) - CPX *prephase; // pre-phase, for all input NU pts - CPX *deconv; // reciprocal of kernel FT, phase, all output NU pts - CPX *CpBatch; // working array of prephased strengths - FLT *Sp, *Tp, *Up; // internal primed targs (s'_k, etc), allocated - type3params t3P; // groups together type 3 shift, scale, phase, parameters - FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3 - - // other internal structs - std::unique_ptr> fftPlan; - finufft_opts opts; // this and spopts could be made ptrs - finufft_spread_opts spopts; -}; +struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T {}; #endif // DEFS_H diff --git a/include/finufft/fft.h b/include/finufft/fft.h index bab43966c..c6d5de7a5 100644 --- a/include/finufft/fft.h +++ b/include/finufft/fft.h @@ -171,19 +171,22 @@ template<> struct Finufft_FFT_plan { #endif -#include +#include static inline void finufft_fft_forget_wisdom [[maybe_unused]] () { - Finufft_FFT_plan::forget_wisdom(); + Finufft_FFT_plan::forget_wisdom(); + Finufft_FFT_plan::forget_wisdom(); } static inline void finufft_fft_cleanup [[maybe_unused]] () { - Finufft_FFT_plan::cleanup(); + Finufft_FFT_plan::cleanup(); + Finufft_FFT_plan::cleanup(); } static inline void finufft_fft_cleanup_threads [[maybe_unused]] () { - Finufft_FFT_plan::cleanup_threads(); + Finufft_FFT_plan::cleanup_threads(); + Finufft_FFT_plan::cleanup_threads(); } - -std::vector gridsize_for_fft(FINUFFT_PLAN p); -void do_fft(FINUFFT_PLAN p); +template struct FINUFFT_PLAN_T; +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); +template void do_fft(FINUFFT_PLAN_T *p); #endif // FINUFFT_INCLUDE_FINUFFT_FFT_H diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h new file mode 100644 index 000000000..afc6ef864 --- /dev/null +++ b/include/finufft/finufft_core.h @@ -0,0 +1,210 @@ +#ifndef FINUFFT_CORE_H +#define FINUFFT_CORE_H + +/* IMPORTANT: for Windows compilers, you should add a line + #define FINUFFT_DLL + here if you are compiling/using FINUFFT as a DLL, + in order to do the proper importing/exporting, or + alternatively compile with -DFINUFFT_DLL or the equivalent + command-line flag. This is not necessary under MinGW/Cygwin, where + libtool does the imports/exports automatically. + Alternatively use include(GenerateExportHeader) and + generate_export_header(finufft) to auto generate an header containing + these defines.The main reason is that if msvc changes the way it deals + with it in the future we just need to update cmake for it to work + instead of having a check on the msvc version. */ +#if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__)) +#if defined(dll_EXPORTS) +#define FINUFFT_EXPORT __declspec(dllexport) +#else +#define FINUFFT_EXPORT __declspec(dllimport) +#endif +#else +#define FINUFFT_EXPORT +#endif + +/* specify calling convention (Windows only) + The cdecl calling convention is actually not the default in all but a very + few C/C++ compilers. + If the user code changes the default compiler calling convention, may need + this when generating DLL. 
*/ +#if defined(_WIN32) || defined(__WIN32__) +#define FINUFFT_CDECL __cdecl +#else +#define FINUFFT_CDECL +#endif + +// inline macro, to force inlining of small functions +// this avoids the use of macros to implement functions +#if defined(_MSC_VER) +#define FINUFFT_ALWAYS_INLINE __forceinline inline +#define FINUFFT_NEVER_INLINE __declspec(noinline) +#define FINUFFT_RESTRICT __restrict +#define FINUFFT_UNREACHABLE __assume(0) +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#elif defined(__GNUC__) || defined(__clang__) +#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#define FINUFFT_NEVER_INLINE __attribute__((noinline)) +#define FINUFFT_RESTRICT __restrict__ +#define FINUFFT_UNREACHABLE __builtin_unreachable() +#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) +#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define FINUFFT_ALWAYS_INLINE inline +#define FINUFFT_NEVER_INLINE +#define FINUFFT_RESTRICT +#define FINUFFT_UNREACHABLE +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#endif + +#include +#include + +// All indexing in library that potentially can exceed 2^31 uses 64-bit signed. +// This includes all calling arguments (eg M,N) that could be huge someday. +using BIGINT = int64_t; +using UBIGINT = uint64_t; + +// ------------- Library-wide algorithm parameter settings ---------------- + +// Library version (is a string) +#define FINUFFT_VER "2.3.0" + +// Smallest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MIN_NSPREAD = 2; + +// Largest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MAX_NSPREAD = 16; + +// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 +inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; + +// Max number of positive quadr nodes for kernel FT (used only in common.cpp) +inline constexpr int MAX_NQUAD = 100; + +// Internal (nf1 etc) array allocation size that immediately raises error. +// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) +// Increase this if you need >10TB (!) RAM... +inline constexpr BIGINT MAX_NF = BIGINT(1e12); + +// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 +// values for M = nj (also nk in type 3)... +inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); + +// ----- OpenMP macros which also work when omp not present ----- +// Allows compile-time switch off of openmp, so compilation without any openmp +// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) +#ifdef _OPENMP +#include +// point to actual omp utils +static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { + return omp_get_num_threads(); +} +static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { + return omp_get_max_threads(); +} +static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { + return omp_get_thread_num(); +} +static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { + omp_set_num_threads(x); +} +#else +// non-omp safe dummy versions of omp utils... 
+static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; } +static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; } +static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; } +static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} +#endif + +#include // (must come after complex.h) +#include +#include + +// group together a bunch of type 3 rescaling/centering/phasing parameters: +template struct type3params { + T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale + T X2, C2, D2, h2, gam2; // y + T X3, C3, D3, h3, gam3; // z +}; + +template struct FINUFFT_PLAN_T { // the main plan object, fully C++ + + using TC = std::complex; + + // These default and delete specifications just state the obvious, + // but are here to silence compiler warnings. + FINUFFT_PLAN_T() = default; + // Copy construction and assignent are already deleted implicitly + // because of the unique_ptr member. + FINUFFT_PLAN_T(const FINUFFT_PLAN_T &) = delete; + FINUFFT_PLAN_T &operator=(const FINUFFT_PLAN_T &) = delete; + ~FINUFFT_PLAN_T(); + + int type; // transform type (Rokhlin naming): 1,2 or 3 + int dim; // overall dimension: 1,2 or 3 + int ntrans; // how many transforms to do at once (vector or "many" mode) + BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) + BIGINT nk; // number of NU freq pts (type 3 only) + TF tol; // relative user tolerance + int batchSize; // # strength vectors to group together for FFTW, etc + int nbatch; // how many batches done to cover all ntrans vectors + + BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 + BIGINT mt; // number of modes in y (2) direction = N2 + BIGINT mu; // number of modes in z (3) direction = N3 + BIGINT N; // total # modes (prod of above three) + + BIGINT nf1; // size of internal fine grid in x (1) direction + BIGINT nf2; // " y (2) + BIGINT nf3; // " z (3) + BIGINT nf; // total # fine grid points (product of the above three) + + int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 + + TF *phiHat1 = nullptr; // FT of kernel in t1,2, on x-axis mode grid + TF *phiHat2 = nullptr; // " y-axis. + TF *phiHat3 = nullptr; // " z-axis. + + TC *fwBatch = nullptr; // (batches of) fine grid(s) for FFTW to plan + // & act on. Usually the largest working array + + BIGINT *sortIndices = nullptr; // precomputed NU pt permutation, speeds spread/interp + bool didSort; // whether binsorting used (false: identity perm used) + + TF *X = nullptr, *Y = nullptr, *Z = nullptr; // for t1,2: ptr to user-supplied NU pts + // (no new allocs). 
for t3: allocated as + // "primed" (scaled) src pts x'_j, etc + + // type 3 specific + TF *S = nullptr, *T = nullptr, *U = nullptr; // pointers to user's target NU pts arrays + // (no new allocs) + TC *prephase = nullptr; // pre-phase, for all input NU pts + TC *deconv = nullptr; // reciprocal of kernel FT, phase, all output NU pts + TC *CpBatch = nullptr; // working array of prephased strengths + TF *Sp = nullptr, *Tp = nullptr, *Up = nullptr; // internal primed targs (s'_k, etc), + // allocated + type3params t3P; // groups together type 3 shift, scale, phase, parameters + FINUFFT_PLAN_T *innerT2plan = nullptr; // ptr used for type 2 in step 2 of type 3 + + // other internal structs + std::unique_ptr> fftPlan; + finufft_opts opts; // this and spopts could be made ptrs + finufft_spread_opts spopts; +}; + +void finufft_default_opts_t(finufft_opts *o); +template +int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts); +template +int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, + TF *s, TF *t, TF *u); +template +int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk); + +#endif // FINUFFT_CORE_H diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 56d705563..779101be8 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -7,7 +7,6 @@ #ifndef SPREADINTERP_H #define SPREADINTERP_H -#include #include /* Bitwise debugging timing flag (TF) defs; see finufft_spread_opts.flags. diff --git a/include/finufft/utils.h b/include/finufft/utils.h index b4fe64681..132fafb53 100644 --- a/include/finufft/utils.h +++ b/include/finufft/utils.h @@ -4,7 +4,7 @@ #ifndef UTILS_H #define UTILS_H -#include "finufft/defs.h" +#include "finufft/finufft_core.h" namespace finufft { namespace utils { diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h index 0504bb8df..8dd3839d8 100644 --- a/include/finufft/utils_precindep.h +++ b/include/finufft/utils_precindep.h @@ -4,9 +4,9 @@ #ifndef UTILS_PRECINDEP_H #define UTILS_PRECINDEP_H -#include "defs.h" -// for CNTime... -// using chrono since the interface is portable between linux and windows +// #include "defs.h" +// for CNTime... +// using chrono since the interface is portable between linux and windows #include namespace finufft { diff --git a/makefile b/makefile index 85a6a9a78..9d3e4c29c 100644 --- a/makefile +++ b/makefile @@ -31,7 +31,7 @@ PYTHON = python3 # they allow gcc to vectorize the code more effectively CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast\ -fno-math-errno -fno-signed-zeros -fno-trapping-math -fassociative-math\ - -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) + -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) -Wfatal-errors FFLAGS := $(CFLAGS) $(FFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) # FFTW base name, and math linking... @@ -138,17 +138,17 @@ ABSDYNLIB = $(FINUFFT)$(DYNLIB) SOBJS = # their single-prec versions SOBJSF = $(SOBJS:%.o=%_32.o) -# precision-dependent spreader object files (compiled & linked only once)... +# precision-independent spreader object files (compiled & linked only once)... SOBJS_PI = src/utils_precindep.o src/spreadinterp.o # spreader dual-precision objs SOBJSD = $(SOBJS) $(SOBJSF) $(SOBJS_PI) # double-prec library object files that also need single precision... 
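Note (illustrative, not part of the diff): taken together with the defs.h change earlier (FINUFFT_PLAN_S now derives from the templated plan), the new finufft_core.h makes the templated plan struct the single owner of all plan state. A self-contained sketch of that opaque-handle pattern, with simplified stand-in names:

#include <cstdio>

// All state lives in the templated base; the per-precision struct is empty, so
// code holding a pointer to it (the public opaque handle) keeps compiling while
// both precisions share one implementation. Names below are illustrative only.
template<typename TF> struct plan_t {
  int dim = 1;
  TF tol = TF(1e-6);
};
struct plan_double : plan_t<double> {};   // plays the role of FINUFFT_PLAN_S

int main() {
  plan_double handle;               // what a user-facing pointer refers to
  plan_t<double> *core = &handle;   // what the templated core works with
  core->tol = 1e-9;
  std::printf("dim=%d tol=%g\n", handle.dim, handle.tol);
}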
-OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o src/fft.o +OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o # their single-prec versions OBJSF = $(OBJS:%.o=%_32.o) -# precision-dependent library object files (compiled & linked only once)... -OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o +# precision-independent library object files (compiled & linked only once)... +OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) $(DUCC_OBJS) @@ -435,7 +435,7 @@ endif # python --------------------------------------------------------------------- python: $(STATICLIB) $(DYNLIB) - FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install python/finufft + FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install --break-system-packages python/finufft # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps $(PYTHON) python/finufft/test/run_accuracy_tests.py $(PYTHON) python/finufft/examples/simple1d1.py diff --git a/src/fft.cpp b/src/fft.cpp index bb7e32442..3a7fbf2f6 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -7,7 +7,7 @@ using namespace std; #include "ducc0/fft/fftnd_impl.h" #endif -std::vector gridsize_for_fft(FINUFFT_PLAN p) { +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p) { // local helper func returns a new int array of length dim, extracted from // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. if (p->dim == 1) return {(int)p->nf1}; @@ -15,8 +15,10 @@ std::vector gridsize_for_fft(FINUFFT_PLAN p) { // if (p->dim == 3) return {(int)p->nf3, (int)p->nf2, (int)p->nf1}; } +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); -void do_fft(FINUFFT_PLAN p) { +template void do_fft(FINUFFT_PLAN_T *p) { #ifdef FINUFFT_USE_DUCC0 size_t nthreads = min(MY_OMP_GET_MAX_THREADS(), p->opts.nthreads); const auto ns = gridsize_for_fft(p); @@ -106,3 +108,5 @@ void do_fft(FINUFFT_PLAN p) { p->fftPlan->execute(); // if thisBatchSize(FINUFFT_PLAN_T *p); +template void do_fft(FINUFFT_PLAN_T *p); diff --git a/src/finufft.cpp b/src/finufft.cpp index 96e986a5d..fddb3fb6e 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -4,1182 +4,25 @@ // private headers for lib build // (must come after finufft.h which clobbers FINUFFT* macros) #include -#include -#include -#include -#include -#include "../contrib/legendre_rule_fast.h" -#include -#include -#include -#include -#include -#include -#include +void FINUFFT_DEFAULT_OPTS(finufft_opts *o) { finufft_default_opts_t(o); } -using namespace std; -using namespace finufft; -using namespace finufft::utils; -using namespace finufft::spreadinterp; -using namespace finufft::quadrature; - -/* Computational core for FINUFFT. - - Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus - 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. - Original guru interface written by Andrea Malleo, summer 2019, mentored - by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. - - As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions - and the two finufft2d?many() functions. The (now 18) simple C++ interfaces - are in simpleinterfaces.cpp. - -Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: - - TYPE 1: - The type 1 NUFFT proceeds in three main steps: - 1) spread data to oversampled regular mesh using kernel. 
- 2) compute FFT on uniform mesh - 3) deconvolve by division of each Fourier mode independently by the kernel - Fourier series coeffs (not merely FFT of kernel), shuffle to output. - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 2: - The type 2 algorithm proceeds in three main steps: - 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff - 2) compute inverse FFT on uniform fine grid - 3) spread (dir=2, ie interpolate) data to regular mesh - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 3: - The type 3 algorithm is basically a type 2 (which is implemented precisely - as call to type 2) replacing the middle FFT (Step 2) of a type 1. - Beyond this, the new twists are: - i) nf1, number of upsampled points for the type-1, depends on the product - of interval widths containing input and output points (X*S). - ii) The deconvolve (post-amplify) step is division by the Fourier transform - of the scaled kernel, evaluated on the *nonuniform* output frequency - grid; this is done by direct approximation of the Fourier integral - using quadrature of the kernel function times exponentials. - iii) Shifts in x (real) and s (Fourier) are done to minimize the interval - half-widths X and S, hence nf1. - - MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): - maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so - this is good only for small problems. - - -Design notes for guru interface implementation: - -* Since finufft_plan is C-compatible, we need to use malloc/free for its - allocatable arrays, keeping it quite low-level. We can't use std::vector - since that would only survive in the scope of each function. - -* Thread-safety: FINUFFT plans are passed as pointers, so it has no global - state apart from that associated with FFTW (and the did_fftw_init). -*/ - -// ---------- local math routines (were in common.cpp; no need now): -------- - -namespace finufft { -namespace common { - -static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf) -// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts -// and requested number of Fourier modes ms. Returns 0 if success, else an -// error code if nf was unreasonably big (& tell the world). -{ - *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial - if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails - if (*nf < MAX_NF) { - *nf = next235even(*nf); // expensive at huge nf - return 0; - } else { - fprintf(stderr, - "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " - "malloc\n", - __func__, (double)*nf, (double)MAX_NF); - return FINUFFT_ERR_MAXNALLOC; - } -} - -template -static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, - int dim) -// Set up the spreader parameters given eps, and pass across various nufft -// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 -{ - // this calls spreadinterp.cpp... - int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, - opts.spread_debug, opts.showwarn, dim); - // override various spread opts from their defaults... - spopts.debug = opts.spread_debug; - spopts.sort = opts.spread_sort; // could make dim or CPU choices here? 
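Note (illustrative, not part of the diff): a worked example of the set_nf_type12 rule shown above -- nf = upsampfac*ms, clamped to at least 2*nspread, then rounded up to the next even 2,3,5-smooth integer. The brute-force next235even here is only a stand-in for the library's version in utils, and the inputs are made up.

#include <cstdint>
#include <cstdio>
using BIGINT = std::int64_t;

// Brute-force stand-in for the library's next235even; illustration only.
static bool is235(BIGINT n) {
  for (BIGINT p : {2, 3, 5})
    while (n % p == 0) n /= p;
  return n == 1;
}
static BIGINT next235even(BIGINT n) {
  if (n % 2) ++n;                 // must be even
  while (!is235(n)) n += 2;       // smallest even 2,3,5-smooth value >= n
  return n;
}

int main() {
  BIGINT ms = 1000; double upsampfac = 2.0; int nspread = 8; // made-up inputs
  BIGINT nf = BIGINT(upsampfac * double(ms));   // 2000
  if (nf < 2 * nspread) nf = 2 * nspread;       // never below kernel support
  nf = next235even(nf);                         // 2000 = 2^4 * 5^3, already smooth
  std::printf("nf = %lld\n", static_cast<long long>(nf));
}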
- spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) - spopts.chkbnds = opts.chkbnds; - spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here - if (opts.spread_nthr_atomic >= 0) // overrides - spopts.atomic_threshold = opts.spread_nthr_atomic; - if (opts.spread_max_sp_size > 0) // overrides - spopts.max_subproblem_size = opts.spread_max_sp_size; - if (opts.chkbnds != 1) // deprecated default value hardcoded here - fprintf(stderr, - "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", - __func__); - return ier; -} - -template -static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, T *h, T *gam) -/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), - for type 3 only. - Inputs: - X and S are the xj and sk interval half-widths respectively. - opts and spopts are the NUFFT and spreader opts strucs, respectively. - Outputs: - nf is the size of upsampled grid for a given single dimension. - h is the grid spacing = 2pi/nf - gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). - Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 - New logic 6/12/17 -*/ -{ - int nss = spopts.nspread + 1; // since ns may be odd - T Xsafe = X, Ssafe = S; // may be tweaked locally - if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 - if (S == 0.0) { - Xsafe = 1.0; - Ssafe = 1.0; - } else - Xsafe = max(Xsafe, 1 / S); - else - Ssafe = max(Ssafe, 1 / X); - // use the safe X and S... - auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); - if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf - *nf = (BIGINT)nfd; - // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); - // catch too small nf, and nan or +-inf, otherwise spread fails... - if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; - if (*nf < MAX_NF) // otherwise will fail anyway - *nf = next235even(*nf); // expensive at huge nf - *h = T(2.0 * PI / *nf); // upsampled grid spacing - *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' -} - -template -static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) -/* - Approximates exact Fourier series coeffs of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Uses phase winding for cheap eval on the regular freq - grid. Note that this is also the Fourier transform of the non-periodized - kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an - overall prefactor of 1/h, which is needed anyway for the correction, and - arises because the quadrature weights are scaled for grid units not x units. - The kernel is actually centered at nf/2, related to the centering of the grid; - this is now achieved by the sign flip in a[n] below. - - Inputs: - nf - size of 1d uniform spread grid, must be even. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, - divided by h = 2pi/n. - (should be allocated for at least nf/2+1 Ts) - - Compare onedim_dct_kernel which has same interface, but computes DFT of - sampled kernel, not quite the same object. - - Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. - Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. 
- */ -{ - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - T f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - std::complex a[MAX_NQUAD]; - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = -exp(2 * PI * IMA * (T)z[n] / (T)nf); // phase winding rates - } - BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads - brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); -#pragma omp parallel num_threads(nt) - { // each thread gets own chunk to do - int t = MY_OMP_GET_THREAD_NUM(); - std::complex aj[MAX_NQUAD]; // phase rotator for this thread - for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk - for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array - T x = 0.0; // accumulator for answer at this j - for (int n = 0; n < q; ++n) { - x += f[n] * 2 * real(aj[n]); // include the negative freq - aj[n] *= a[n]; // wind the phases - } - fwkerhalf[j] = x; - } - } -} - -template -static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts opts) -/* - Approximates exact 1D Fourier transform of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), - for a kernel with x measured in grid-spacings. (See previous routine for - FT definition). - - Inputs: - nk - number of freqs - k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) - - Barnett 2/8/17. openmp since cos slow 2/9/17 - */ -{ - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); - T f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - for (int n = 0; n < q; ++n) { - z[n] *= (T)J2; // quadr nodes for [0,J/2] - f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights - } -#pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j = 0; j < nk; ++j) { // loop along output array - T x = 0.0; // register - for (int n = 0; n < q; ++n) - x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! - phihat[j] = x; - } -} - -template -static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIGINT nf1, - std::complex *fw, int modeord) -/* - if dir==1: copies fw to fk with amplification by prefac/ker - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. 
- - modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) - 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). - - fk is a size-ms T complex array (2*ms Ts alternating re,im parts) - fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts) - ker is real-valued T array of length nf1/2+1. - - Single thread only, but shouldn't matter since mostly data movement. - - It has been tested that the repeated floating division in this inner loop - only contributes at the <3% level in 3D relative to the FFT cost (8 threads). - This could be removed by passing in an inverse kernel and doing mults. - - todo: rewrite w/ C++-complex I/O, check complex divide not slower than - real divide, or is there a way to force a real divide? - - Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 -*/ -{ - BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices - if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (kmax + 1); - } // or, instead, FFT ordering - if (dir == 1) { // read fw, write out to fk... - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fk[pp++] = prefac * fw[k].real() / ker[k]; // re - fk[pp++] = prefac * fw[k].imag() / ker[k]; // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re - fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im - } - } else { // read fk, write out to fw w/ zero padding... - for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where - // needed - fw[k] = 0.0; - } - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fw[k].real(prefac * fk[pp++] / ker[k]); // re - fw[k].imag(prefac * fk[pp++] / ker[k]); // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re - fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im - } - } -} - -template -static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, BIGINT mt, - T *fk, BIGINT nf1, BIGINT nf2, std::complex *fw, - int modeord) -/* - 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. - - if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with - ms looped over fast and mt slow. - fw is a complex array stored as 2*nf1*nf2] Ts alternating re,im parts, with - nf1 looped over fast and nf2 slow. - ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1 - respectively. - - Barnett 2/1/17, Fixed mt=0 case 3/14/17. 
modeord 10/25/17 -*/ -{ - BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices - if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k2max + 1) * ms; - } // or, instead, FFT ordering - if (dir == 2) // zero pad needed x-lines (contiguous in memory) - for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all - // dims - fw[j] = 0.0; - for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs - // point fk and fw to the start of this y value's row (2* is for complex): - common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, - &fw[nf1 * k2], modeord); - for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs - common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, - &fw[nf1 * (nf2 + k2)], modeord); -} - -template -static void deconvolveshuffle3d(int dir, T prefac, T *ker1, T *ker2, T *ker3, BIGINT ms, - BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, - BIGINT nf3, std::complex *fw, int modeord) -/* - 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. - - if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with - ms looped over fastest and mu slowest. - fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with - nf1 looped over fastest and nf3 slowest. - ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, - and nf3/2+1 respectively. - - Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 -*/ -{ - BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices - if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k3max + 1) * ms * mt; - } // or FFT ordering - BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane - if (dir == 2) // zero pad needed xy-planes (contiguous in memory) - for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims - fw[j] = 0.0; - for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs - // point fk and fw to the start of this z value's plane (2* is for complex): - common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, - nf2, &fw[np * k3], modeord); - for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs - common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, - nf2, &fw[np * (nf3 + k3)], modeord); -} - -// --------- batch helper functions for t1,2 exec: --------------------------- - -template -static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, std::complex *cBatch) -/* - Spreads (or interpolates) a batch of batchSize strength vectors in cBatch - to (or from) the batch of fine working grids p->fwBatch, using the same set of - (index-sorted) NU points p->X,Y,Z for each vector in the batch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - Returns 0 (no error reporting for now). - Notes: - 1) cBatch is already assumed to have the correct offset, ie here we - read from the start of cBatch (unlike Malleo). fwBatch also has zero offset - 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp - Barnett 5/19/20, based on Malleo 2019. -*/ -{ - // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. - // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. - // But when nthr_outer=1 here, omp par inside the loop sees all threads... -#ifdef _OPENMP - int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; -#endif -#pragma omp parallel for num_threads(nthr_outer) - for (int i = 0; i < batchSize; i++) { - std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch - spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, - p->Y, p->Z, (T *)ci, p->spopts, p->didSort); - } - return 0; -} - -template -static int deconvolveBatch(int batchSize, FINUFFT_PLAN p, std::complex *fkBatch) -/* - Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch - into each output array fk in fkBatch. - Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, - again looping over fk in fkBatch and fw in p->fwBatch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize - times. - Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) -*/ -{ - // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
-#pragma omp parallel for num_threads(batchSize) - for (int i = 0; i < batchSize; i++) { - std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch - - // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... - if (p->dim == 1) - deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki, - p->nf1, fwi, p->opts.modeord); - else if (p->dim == 2) - deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms, - p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); - else - deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, - p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2, - p->nf3, fwi, p->opts.modeord); - } - return 0; -} - -} // namespace common -} // namespace finufft - -// --------------- rest is the 5 user guru (plan) interface drivers: --------- -// (not namespaced since have safe names finufft{f}_* ) -using namespace finufft::common; // accesses routines defined above - -// Marco Barbone: 5.8.2024 -// These are user-facing. -// The various options could be macros to follow c standard library conventions. -// Question: would these be enums? - -// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO -void FINUFFT_DEFAULT_OPTS(finufft_opts *o) -// Sets default nufft opts (referenced by all language interfaces too). -// See finufft_opts.h for meanings. -// This was created to avoid uncertainty about C++11 style static initialization -// when called from MEX, but now is generally used. Barnett 10/30/17 onwards. -// Sphinx sucks the below code block into the web docs, hence keep it clean... -{ - // sphinx tag (don't remove): @defopts_start - o->modeord = 0; - o->chkbnds = 1; - - o->debug = 0; - o->spread_debug = 0; - o->showwarn = 1; - - o->nthreads = 0; -#ifdef FINUFFT_USE_DUCC0 - o->fftw = 0; -#else - o->fftw = FFTW_ESTIMATE; -#endif - o->spread_sort = 2; - o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; - o->spread_nthr_atomic = -1; - o->spread_max_sp_size = 0; - o->fftw_lock_fun = nullptr; - o->fftw_unlock_fun = nullptr; - o->fftw_lock_data = nullptr; - // sphinx tag (don't remove): @defopts_end -} - -// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP int FINUFFT_MAKEPLAN(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, - FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) -// Populates the fields of finufft_plan which is pointed to by "pp". -// opts is ptr to a finufft_opts to set options, or NULL to use defaults. -// For some of the fields (if "auto" selected) here choose the actual setting. 
-// For types 1,2 allocates memory for internal working arrays, -// evaluates spreading kernel coefficients, and instantiates the fftw_plan -{ - FINUFFT_PLAN p; - p = new FINUFFT_PLAN_S; // allocate fresh plan struct - *pp = p; // pass out plan as ptr to plan struct - - if (opts == NULL) // use default opts - FINUFFT_DEFAULT_OPTS(&(p->opts)); - else // or read from what's passed in - p->opts = *opts; // keep a deep copy; changing *opts now has no effect - - if (p->opts.debug) // do a hello world - printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", - __func__); - - p->fftPlan = std::make_unique>( - p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); - - if ((type != 1) && (type != 2) && (type != 3)) { - fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); - return FINUFFT_ERR_TYPE_NOTVALID; - } - if ((dim != 1) && (dim != 2) && (dim != 3)) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } - if (ntrans < 1) { - fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); - return FINUFFT_ERR_NTRANS_NOTVALID; - } - if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { - fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", - __func__); - return FINUFFT_ERR_LOCK_FUNS_INVALID; - ; - } - - // get stuff from args... - p->type = type; - p->dim = dim; - p->ntrans = ntrans; - p->tol = tol; - p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - - // choose overall # threads... -#ifdef _OPENMP - int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = ompmaxnthr; // default: use as many as OMP gives us - // (the above could be set, or suggested set, to 1 for small enough problems...) - if (p->opts.nthreads > 0) { - nthr = p->opts.nthreads; // user override, now without limit - if (p->opts.showwarn && (nthr > ompmaxnthr)) - fprintf(stderr, - "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " - "available; note large nthreads can be slower.\n", - __func__, nthr, ompmaxnthr); - } -#else - int nthr = 1; // always 1 thread (avoid segfault) - if (p->opts.nthreads > 1) - fprintf(stderr, - "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", - __func__, p->opts.nthreads); -#endif - p->opts.nthreads = nthr; // store actual # thr planned for - // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) - - // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) - if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize - p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss - p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b - } else { // batchSize override by user - p->batchSize = min(p->opts.maxbatchsize, ntrans); - p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches - } - if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice - if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { - fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); - return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; - } - - if (type != 3) { // read in user Fourier mode array sizes... - p->ms = n_modes[0]; - p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims - p->mu = (dim > 2) ? n_modes[2] : 1; - p->N = p->ms * p->mt * p->mu; // N = total # modes - } - - // heuristic to choose default upsampfac... 
(currently two poss) - if (p->opts.upsampfac == 0.0) { // indicates auto-choose - p->opts.upsampfac = 2.0; // default, and need for tol small - if (tol >= (FLT)1E-9) { // the tol sigma=5/4 can reach - if (type == 3) // could move to setpts, more known? - p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT - else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || - (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, - // typ tol, 12-core xeon - p->opts.upsampfac = 1.25; - } - if (p->opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); - } - // use opts to choose and write into plan's spread options... - int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); - if (ier > 1) // proceed if success or warning - return ier; - - // set others as defaults (or unallocated for arrays)... - p->X = NULL; - p->Y = NULL; - p->Z = NULL; - p->phiHat1 = NULL; - p->phiHat2 = NULL; - p->phiHat3 = NULL; - p->nf1 = 1; - p->nf2 = 1; - p->nf3 = 1; // crucial to leave as 1 for unused dims - p->sortIndices = NULL; // used in all three types - - // ------------------------ types 1,2: planning needed --------------------- - if (type == 1 || type == 2) { - - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. - - p->spopts.spread_direction = type; - - if (p->opts.showwarn) { // user warn round-off error... - if (EPSILON * p->ms > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->ms)); - if (EPSILON * p->mt > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mt)); - if (EPSILON * p->mu > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mu)); - } - - // determine fine grid sizes, sanity check.. - int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); - if (nfier) return nfier; // nf too big; we're done - p->phiHat1 = (FLT *)malloc(sizeof(FLT) * (p->nf1 / 2 + 1)); - if (dim > 1) { - nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); - if (nfier) return nfier; - p->phiHat2 = (FLT *)malloc(sizeof(FLT) * (p->nf2 / 2 + 1)); - } - if (dim > 2) { - nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); - if (nfier) return nfier; - p->phiHat3 = (FLT *)malloc(sizeof(FLT) * (p->nf3 / 2 + 1)); - } - - if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
- printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " - "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " - "batchSize=%d ", - __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, - (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, - p->batchSize); - if (p->batchSize == 1) // spread_thread has no effect in this case - printf("\n"); - else - printf(" spread_thread=%d\n", p->opts.spread_thread); - } - - // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim - CNTime timer; - timer.start(); - onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); - if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); - if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); - if (p->opts.debug) - printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, - timer.elapsedsec()); - - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 - return FINUFFT_ERR_MAXNALLOC; - } - - timer.restart(); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) - printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec()); - if (!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", - __func__); - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - return FINUFFT_ERR_ALLOC; - } - - timer.restart(); // plan the FFTW - const auto ns = gridsize_for_fft(p); - p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); - if (p->opts.debug) - printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, - nthr_fft, timer.elapsedsec()); - - } else { // -------------------------- type 3 (no planning) ------------ - - if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); - // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; - p->Sp = NULL; - p->Tp = NULL; - p->Up = NULL; - p->prephase = NULL; - p->deconv = NULL; - p->innerT2plan = NULL; - // Type 3 will call finufft_makeplan for type 2; no need to init FFTW - // Note we don't even know nj or nk yet, so can't do anything else! - } - return ier; // report setup_spreader status (could be warning) + FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); } -// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk, - FLT *s, FLT *t, FLT *u) -/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for - spreading. (The last 4 arguments are ignored.) - For type 3: allocates internal working arrays, scales/centers the NU points - and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
-*/ -{ - int d = p->dim; // abbrev for spatial dim - CNTime timer; - timer.start(); - p->nj = nj; // the user only now chooses how many NU (x,y,z) pts - if (nj < 0) { - fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nj > MAX_NU_PTS) { - fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - - if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - p->X = xj; // plan must keep pointers to user's fixed NU pts - p->Y = yj; - p->Z = zj; - int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug > 1) - printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, - timer.elapsedsec()); - if (ier) // no warnings allowed here - return ier; - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = - indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug) - printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - } else { // ------------------------- TYPE 3 SETPTS ----------------------- - // (here we can precompute pre/post-phase factors and plan the t2) - - if (nk < 0) { - fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nk > MAX_NU_PTS) { - fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - p->nk = nk; // user set # targ freq pts - p->S = s; // keep pointers to user's input target pts - p->T = t; - p->U = u; - - // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... - FLT S1, S2, S3; // get half-width X, center C, which contains {x_j}... - arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); - arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} - set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), - &(p->t3P.gam1)); // applies twist i) - p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc - p->t3P.D2 = 0.0; - if (d > 1) { - arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} - arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} - set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), - &(p->t3P.gam2)); - } - p->t3P.C3 = 0.0; - p->t3P.D3 = 0.0; - if (d > 2) { - arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} - arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} - set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), - &(p->t3P.gam3)); - } - - if (p->opts.debug) { // report on choices of shifts, centers, etc... 
- printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, - p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); - if (d > 1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, - p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); - if (d > 2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, - p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); - } - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - return FINUFFT_ERR_MAXNALLOC; - } - p->fftPlan->free(p->fwBatch); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace - - // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if (p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX *)malloc(sizeof(CPX) * nj * p->batchSize); // batch c' work - - if (p->opts.debug) - printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * (p->nf + nj) * p->batchSize, - timer.elapsedsec()); - if (!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); - return FINUFFT_ERR_ALLOC; - } - // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); - - // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... - // FIXME: should use realloc - if (p->X) free(p->X); - if (p->Sp) free(p->Sp); - p->X = (FLT *)malloc(sizeof(FLT) * nj); - p->Sp = (FLT *)malloc(sizeof(FLT) * nk); - if (d > 1) { - if (p->Y) free(p->Y); - if (p->Tp) free(p->Tp); - p->Y = (FLT *)malloc(sizeof(FLT) * nj); - p->Tp = (FLT *)malloc(sizeof(FLT) * nk); - } - if (d > 2) { - if (p->Z) free(p->Z); - if (p->Up) free(p->Up); - p->Z = (FLT *)malloc(sizeof(FLT) * nj); - p->Up = (FLT *)malloc(sizeof(FLT) * nk); - } - - // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... - FLT ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim - if (d > 1) ig2 = 1.0 / p->t3P.gam2; - if (d > 2) ig3 = 1.0 / p->t3P.gam3; -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { - p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j - if (d > 1) // (ok to do inside loop because of branch predict) - p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j - if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j - } - - // set up prephase array... - CPX imasign = (p->fftSign >= 0) ? IMA : -IMA; // +-i - if (p->prephase) free(p->prephase); - p->prephase = (CPX *)malloc(sizeof(CPX) * nj); - if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs - FLT phase = p->t3P.D1 * xj[j]; - if (d > 1) phase += p->t3P.D2 * yj[j]; - if (d > 2) phase += p->t3P.D3 * zj[j]; - p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler - // e^{+-i.phase} - } - } else - for (BIGINT j = 0; j < nj; ++j) - p->prephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? - - // rescale the target s_k etc to s'_k etc... 
-#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { - p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R - if (d > 1) - p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < - // pi/R - if (d > 2) - p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < - // pi/R - } - // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... - // (exploits that FT separates because kernel is prod of 1D funcs) - if (p->deconv) free(p->deconv); - p->deconv = (CPX *)malloc(sizeof(CPX) * nk); - FLT *phiHatk1 = (FLT *)malloc(sizeof(FLT) * nk); // don't confuse w/ p->phiHat - onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 - FLT *phiHatk2 = NULL, *phiHatk3 = NULL; - if (d > 1) { - phiHatk2 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 - } - if (d > 2) { - phiHatk3 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 - } - int Cfinite = - isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan - // or inf if - // M=0, no - // input NU pts - int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs - FLT phiHat = phiHatk1[k]; - if (d > 1) phiHat *= phiHatk2[k]; - if (d > 2) phiHat *= phiHatk3[k]; - p->deconv[k] = (CPX)(1.0 / phiHat); - if (Cfinite && Cnonzero) { - FLT phase = (s[k] - p->t3P.D1) * p->t3P.C1; - if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; - if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; - p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} - } - } - free(phiHatk1); - free(phiHatk2); - free(phiHatk3); // done w/ deconv fill - if (p->opts.debug) - printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); - - // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, - p->Z, p->spopts); - if (p->opts.debug) - printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - // Plan and setpts once, for the (repeated) inner type 2 finufft call... - timer.restart(); - BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw - finufft_opts t2opts = p->opts; // deep copy, since not ptrs - t2opts.modeord = 0; // needed for correct t3! - t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail - t2opts.spread_debug = max(0, p->opts.spread_debug - 1); - t2opts.showwarn = 0; // so don't see warnings 2x - // (...could vary other t2opts here?) 
- if (p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); - int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, - &p->innerT2plan, &t2opts); - if (ier > 1) { // if merely warning, still proceed - fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", - __func__, ier); - return ier; - } - ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, - NULL); // note nk = # output points (not nj) - if (ier > 1) { - fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); - return ier; - } - if (p->opts.debug) - printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); - } - return 0; + FLT *s, FLT *t, FLT *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, + nk, s, t, u); } -// ............ end setpts .................................................. -// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { - /* See ../docs/cguru.doc for current documentation. - - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and FFT as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. -*/ - CNTime timer; - timer.start(); - - if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ - - double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing - if (p->opts.debug) - printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // current batch is either batchSize, or possibly truncated if last one - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; // index of vector, since batchsizes same - CPX *cjb = cj + bB * p->nj; // point to batch of weights - CPX *fkb = fk + bB * p->N; // point to batch of mode coeffs - if (p->opts.debug > 1) - printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 1: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } else { // type 2: amplify Fourier coeffs fk into 0-padded fw - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } - - // STEP 2: call the FFT on this batch - timer.restart(); - do_fft(p); - t_fft += timer.elapsedsec(); - if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec()); - - // STEP 3: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } else { // type 2: interpolate unif fw grid to NU target pts - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - if (p->type == 1) { - printf("[%s] done. 
tot spread:\t\t%.3g s\n", __func__, t_sprint); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); - } else { - printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot interp:\t\t\t%.3g s\n", t_sprint); - } - } - } - - else { // ----------------------------- TYPE 3 EXEC --------------------- - - // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long - // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - - double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, - t_deconv = 0.0; // accumulated timings - if (p->opts.debug) - printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // batching and pointers to this batch, identical to t1,2 above... - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; - CPX *cjb = cj + bB * p->nj; // batch of input strengths - CPX *fkb = fk + bB * p->nk; // batch of output strengths - if (p->opts.debug > 1) - printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nj; - for (BIGINT j = 0; j < p->nj; ++j) { - p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; - } - } - t_pre += timer.elapsedsec(); - - // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... - timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed - t_spr += timer.elapsedsec(); - - // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... - timer.restart(); - // illegal possible shrink of ntrans *after* plan for smaller last batch: - p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! - /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array - still the same size, as Andrea explained; just wastes a few flops) */ - FINUFFT_EXECUTE(p->innerT2plan, fkb, p->fwBatch); - t_t2 += timer.elapsedsec(); - // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nk; - for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k]; - } - t_deconv += timer.elapsedsec(); - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); - printf(" tot spread:\t\t\t%.3g s\n", t_spr); - printf(" tot type 2:\t\t\t%.3g s\n", t_t2); - printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); - } - } - // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long - // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug - - return 0; + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); } -// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD int FINUFFT_DESTROY(FINUFFT_PLAN p) // Free everything we allocated inside of finufft_plan pointed to by p. // Also must not crash if called immediately after finufft_makeplan. 
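These thin wrappers keep the C-callable guru API stable while the actual work moves into the templated finufft_*_t core. For orientation, a minimal double-precision caller of that API could look like the sketch below; it assumes the public header finufft.h and the documented finufft_makeplan/setpts/execute/destroy entry points, with sizes, tolerance, and error handling kept illustrative.

#include <finufft.h>
#include <complex>
#include <cstdint>
#include <vector>

// Type 1, 1D, double precision: strengths c_j at NU points x_j -> N1 Fourier modes f_k.
int demo_type1_1d() {
  const int64_t M = 1000, N1 = 256;                    // # NU points, # modes
  std::vector<double> x(M, 0.5);                       // NU points, e.g. in [-pi, pi)
  std::vector<std::complex<double>> c(M, std::complex<double>(1.0, 0.0)), f(N1);
  int64_t n_modes[3] = {N1, 1, 1};
  finufft_opts opts;
  finufft_default_opts(&opts);                         // then tweak, e.g. opts.nthreads = 4
  finufft_plan plan = nullptr;
  int ier = finufft_makeplan(1, 1, n_modes, +1, 1, 1e-9, &plan, &opts);
  if (ier > 1) return ier;                             // >1 is an error; 1 is only a warning
  ier = finufft_setpts(plan, M, x.data(), nullptr, nullptr, 0, nullptr, nullptr, nullptr);
  if (ier > 1) return ier;
  ier = finufft_execute(plan, c.data(), f.data());     // type 1: c is input, f is output
  finufft_destroy(plan);
  return ier;
}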
@@ -1189,24 +32,7 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) if (!p) // NULL ptr, so not a ptr to a plan, report error return 1; - p->fftPlan->free(p->fwBatch); // free the big FFTW (or t3 spread) working array - free(p->sortIndices); - if (p->type == 1 || p->type == 2) { - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - } else { // free the stuff alloc for type 3 only - FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code - free(p->CpBatch); - free(p->Sp); - free(p->Tp); - free(p->Up); - free(p->X); - free(p->Y); - free(p->Z); - free(p->prephase); - free(p->deconv); - } - delete p; + delete reinterpret_cast *>(p); + p = nullptr; return 0; // success } diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp new file mode 100644 index 000000000..70a52afa8 --- /dev/null +++ b/src/finufft_core.cpp @@ -0,0 +1,1237 @@ +#include +#include +#include +#include +#include + +#include "../contrib/legendre_rule_fast.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace finufft; +using namespace finufft::utils; +using namespace finufft::spreadinterp; +using namespace finufft::quadrature; + +/* Computational core for FINUFFT. + + Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus + 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. + Original guru interface written by Andrea Malleo, summer 2019, mentored + by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. + + As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions + and the two finufft2d?many() functions. The (now 18) simple C++ interfaces + are in simpleinterfaces.cpp. + +Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: + + TYPE 1: + The type 1 NUFFT proceeds in three main steps: + 1) spread data to oversampled regular mesh using kernel. + 2) compute FFT on uniform mesh + 3) deconvolve by division of each Fourier mode independently by the kernel + Fourier series coeffs (not merely FFT of kernel), shuffle to output. + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 2: + The type 2 algorithm proceeds in three main steps: + 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff + 2) compute inverse FFT on uniform fine grid + 3) spread (dir=2, ie interpolate) data to regular mesh + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 3: + The type 3 algorithm is basically a type 2 (which is implemented precisely + as call to type 2) replacing the middle FFT (Step 2) of a type 1. + Beyond this, the new twists are: + i) nf1, number of upsampled points for the type-1, depends on the product + of interval widths containing input and output points (X*S). + ii) The deconvolve (post-amplify) step is division by the Fourier transform + of the scaled kernel, evaluated on the *nonuniform* output frequency + grid; this is done by direct approximation of the Fourier integral + using quadrature of the kernel function times exponentials. + iii) Shifts in x (real) and s (Fourier) are done to minimize the interval + half-widths X and S, hence nf1. + + MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): + maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so + this is good only for small problems. + + +Design notes for guru interface implementation: + +* Since finufft_plan is C-compatible, we need to use malloc/free for its + allocatable arrays, keeping it quite low-level. 
We can't use std::vector + since that would only survive in the scope of each function. + +* Thread-safety: FINUFFT plans are passed as pointers, so it has no global + state apart from that associated with FFTW (and the did_fftw_init). +*/ + +// ---------- local math routines (were in common.cpp; no need now): -------- + +namespace finufft { +namespace common { + +static constexpr double PI = 3.14159265358979329; + +static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf) +// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts +// and requested number of Fourier modes ms. Returns 0 if success, else an +// error code if nf was unreasonably big (& tell the world). +{ + *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails + if (*nf < MAX_NF) { + *nf = next235even(*nf); // expensive at huge nf + return 0; + } else { + fprintf(stderr, + "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " + "malloc\n", + __func__, (double)*nf, (double)MAX_NF); + return FINUFFT_ERR_MAXNALLOC; + } +} + +template +static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, + int dim) +// Set up the spreader parameters given eps, and pass across various nufft +// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 +{ + // this calls spreadinterp.cpp... + int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, + opts.spread_debug, opts.showwarn, dim); + // override various spread opts from their defaults... + spopts.debug = opts.spread_debug; + spopts.sort = opts.spread_sort; // could make dim or CPU choices here? + spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) + spopts.chkbnds = opts.chkbnds; + spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here + if (opts.spread_nthr_atomic >= 0) // overrides + spopts.atomic_threshold = opts.spread_nthr_atomic; + if (opts.spread_max_sp_size > 0) // overrides + spopts.max_subproblem_size = opts.spread_max_sp_size; + if (opts.chkbnds != 1) // deprecated default value hardcoded here + fprintf(stderr, + "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", + __func__); + return ier; +} + +template +static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf, T *h, T *gam) +/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), + for type 3 only. + Inputs: + X and S are the xj and sk interval half-widths respectively. + opts and spopts are the NUFFT and spreader opts strucs, respectively. + Outputs: + nf is the size of upsampled grid for a given single dimension. + h is the grid spacing = 2pi/nf + gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). + Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 + New logic 6/12/17 +*/ +{ + int nss = spopts.nspread + 1; // since ns may be odd + T Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + if (S == 0.0) { + Xsafe = 1.0; + Ssafe = 1.0; + } else + Xsafe = max(Xsafe, 1 / S); + else + Ssafe = max(Ssafe, 1 / X); + // use the safe X and S... 
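Stepping back to set_nf_type12 above: the type-1/2 fine-grid size starts from upsampfac*ms, is clamped to at least 2*nspread so the spreading kernel fits on the grid, and is then rounded up to an even 2,3,5-smooth integer so the FFT stays fast. A self-contained sketch of that recipe follows; next235even here is a naive stand-in for the library routine of the same name, and the numbers are illustrative.

#include <cstdint>
#include <cstdio>

static bool smooth235(int64_t n) {          // true if n has only prime factors 2, 3, 5
  for (int p : {2, 3, 5})
    while (n % p == 0) n /= p;
  return n == 1;
}

static int64_t next235even(int64_t n) {     // naive stand-in: next even 2,3,5-smooth value >= n
  if (n % 2) ++n;
  while (!smooth235(n)) n += 2;
  return n;
}

int main() {
  const int64_t ms = 1000;                  // requested # modes in one dimension
  const double sigma = 2.0;                 // opts.upsampfac
  const int nspread = 8;                    // kernel width (assumed)
  int64_t nf = (int64_t)(sigma * ms);
  if (nf < 2 * nspread) nf = 2 * nspread;
  nf = next235even(nf);
  std::printf("ms=%lld -> nf=%lld\n", (long long)ms, (long long)nf);   // prints nf=2000
}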
+ auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); + if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf + *nf = (BIGINT)nfd; + // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); + // catch too small nf, and nan or +-inf, otherwise spread fails... + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; + if (*nf < MAX_NF) // otherwise will fail anyway + *nf = next235even(*nf); // expensive at huge nf + *h = T(2.0 * PI / *nf); // upsampled grid spacing + *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' +} + +template +static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) +/* + Approximates exact Fourier series coeffs of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Uses phase winding for cheap eval on the regular freq + grid. Note that this is also the Fourier transform of the non-periodized + kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an + overall prefactor of 1/h, which is needed anyway for the correction, and + arises because the quadrature weights are scaled for grid units not x units. + The kernel is actually centered at nf/2, related to the centering of the grid; + this is now achieved by the sign flip in a[n] below. + + Inputs: + nf - size of 1d uniform spread grid, must be even. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, + divided by h = 2pi/n. + (should be allocated for at least nf/2+1 Ts) + + Compare onedim_dct_kernel which has same interface, but computes DFT of + sampled kernel, not quite the same object. + + Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. + Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + std::complex a[MAX_NQUAD]; + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = -exp(2 * PI * std::complex(0, 1) * z[n] / double(nf)); // phase winding + // rates + } + BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); +#pragma omp parallel num_threads(nt) + { // each thread gets own chunk to do + int t = MY_OMP_GET_THREAD_NUM(); + std::complex aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk + for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array + T x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases + } + fwkerhalf[j] = x; + } + } +} + +template +static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts opts) +/* + Approximates exact 1D Fourier transform of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), + for a kernel with x measured in grid-spacings. (See previous routine for + FT definition). + + Inputs: + nk - number of freqs + k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) + Note, z is in grid-point units, and k values must be in [-pi, pi) for + accuracy. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) + + Barnett 2/8/17. openmp since cos slow 2/9/17 + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + for (int n = 0; n < q; ++n) { + z[n] *= (T)J2; // quadr nodes for [0,J/2] + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights + } +#pragma omp parallel for num_threads(opts.nthreads) + for (BIGINT j = 0; j < nk; ++j) { // loop along output array + T x = 0.0; // register + for (int n = 0; n < q; ++n) + x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! + phihat[j] = x; + } +} + +template +static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIGINT nf1, + std::complex *fw, int modeord) +/* + if dir==1: copies fw to fk with amplification by prefac/ker + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) + 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). 
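Spelled out for a small even ms, the two orderings just described differ only in where the block of negative frequencies sits; the snippet below simply prints both (illustrative).

#include <cstdio>

int main() {
  const int ms = 6, kmin = -ms / 2, kmax = (ms - 1) / 2;
  std::printf("modeord=0 (CMCL): ");
  for (int k = kmin; k <= kmax; ++k) std::printf("%d ", k);   // -3 -2 -1 0 1 2
  std::printf("\nmodeord=1 (FFT) : ");
  for (int k = 0; k <= kmax; ++k) std::printf("%d ", k);      // 0 1 2
  for (int k = kmin; k < 0; ++k) std::printf("%d ", k);       // -3 -2 -1
  std::printf("\n");
}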
+ + fk is a size-ms T complex array (2*ms Ts alternating re,im parts) + fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts) + ker is real-valued T array of length nf1/2+1. + + Single thread only, but shouldn't matter since mostly data movement. + + It has been tested that the repeated floating division in this inner loop + only contributes at the <3% level in 3D relative to the FFT cost (8 threads). + This could be removed by passing in an inverse kernel and doing mults. + + todo: rewrite w/ C++-complex I/O, check complex divide not slower than + real divide, or is there a way to force a real divide? + + Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 +*/ +{ + BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices + if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (kmax + 1); + } // or, instead, FFT ordering + if (dir == 1) { // read fw, write out to fk... + for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k + fk[pp++] = prefac * fw[k].real() / ker[k]; // re + fk[pp++] = prefac * fw[k].imag() / ker[k]; // im + } + for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k + fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re + fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im + } + } else { // read fk, write out to fw w/ zero padding... + for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where + // needed + fw[k] = 0.0; + } + for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k + fw[k].real(prefac * fk[pp++] / ker[k]); // re + fw[k].imag(prefac * fk[pp++] / ker[k]); // im + } + for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k + fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re + fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im + } + } +} + +template +static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, BIGINT mt, + T *fk, BIGINT nf1, BIGINT nf2, std::complex *fw, + int modeord) +/* + 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. + + if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)). + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) + 1: use FFT-style (pos then negative, on each dim) + + fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with + ms looped over fast and mt slow. + fw is a complex array stored as 2*nf1*nf2] Ts alternating re,im parts, with + nf1 looped over fast and nf2 slow. + ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1 + respectively. + + Barnett 2/1/17, Fixed mt=0 case 3/14/17. 
modeord 10/25/17 +*/ +{ + BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices + if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k2max + 1) * ms; + } // or, instead, FFT ordering + if (dir == 2) // zero pad needed x-lines (contiguous in memory) + for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all + // dims + fw[j] = 0.0; + for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs + // point fk and fw to the start of this y value's row (2* is for complex): + common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, + &fw[nf1 * k2], modeord); + for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs + common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, + &fw[nf1 * (nf2 + k2)], modeord); +} + +template +static void deconvolveshuffle3d(int dir, T prefac, T *ker1, T *ker2, T *ker3, BIGINT ms, + BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, + BIGINT nf3, std::complex *fw, int modeord) +/* + 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. + + if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) + 1: use FFT-style (pos then negative, on each dim) + + fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with + ms looped over fastest and mu slowest. + fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with + nf1 looped over fastest and nf3 slowest. + ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, + and nf3/2+1 respectively. + + Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 +*/ +{ + BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices + if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k3max + 1) * ms * mt; + } // or FFT ordering + BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane + if (dir == 2) // zero pad needed xy-planes (contiguous in memory) + for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims + fw[j] = 0.0; + for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs + // point fk and fw to the start of this z value's plane (2* is for complex): + common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, + nf2, &fw[np * k3], modeord); + for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs + common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, + nf2, &fw[np * (nf3 + k3)], modeord); +} + +// --------- batch helper functions for t1,2 exec: --------------------------- + +template +static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T *p, + std::complex *cBatch) +/* + Spreads (or interpolates) a batch of batchSize strength vectors in cBatch + to (or from) the batch of fine working grids p->fwBatch, using the same set of + (index-sorted) NU points p->X,Y,Z for each vector in the batch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + Returns 0 (no error reporting for now). + Notes: + 1) cBatch is already assumed to have the correct offset, ie here we + read from the start of cBatch (unlike Malleo). fwBatch also has zero offset + 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp + Barnett 5/19/20, based on Malleo 2019. +*/ +{ + // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. + // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. + // But when nthr_outer=1 here, omp par inside the loop sees all threads... +#ifdef _OPENMP + int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; +#endif +#pragma omp parallel for num_threads(nthr_outer) + for (int i = 0; i < batchSize; i++) { + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, + p->Y, p->Z, (T *)ci, p->spopts, p->didSort); + } + return 0; +} + +template +static int deconvolveBatch(int batchSize, FINUFFT_PLAN_T *p, std::complex *fkBatch) +/* + Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch + into each output array fk in fkBatch. + Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, + again looping over fk in fkBatch and fw in p->fwBatch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize + times. + Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) +*/ +{ + // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
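Before the deconvolve loop below, an aside on the spread_thread strategies used in spreadinterpSortedBatch above: strategy 1 walks the batch serially and lets each spread/interp call use all threads, while strategy 2 runs the batch loop in parallel with one single-threaded call per vector. A minimal OpenMP sketch of that dispatch, with spread_one standing in for a spreadinterpSorted call (illustrative; requires OpenMP):

#include <omp.h>
#include <cstdio>

static void spread_one(int i, int nthr_inner) {
  // stand-in for one spreadinterpSorted call; in the library the call itself
  // parallelises internally when nthr_inner > 1
  std::printf("vector %d spread with %d thread(s)\n", i, nthr_inner);
}

int main() {
  const int batchSize = 4, spread_thread = 2;      // assumed settings
  const int nthr_outer = (spread_thread == 1) ? 1 : batchSize;
#pragma omp parallel for num_threads(nthr_outer)
  for (int i = 0; i < batchSize; i++)
    spread_one(i, (spread_thread == 1) ? omp_get_max_threads() : 1);
}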
+#pragma omp parallel for num_threads(batchSize) + for (int i = 0; i < batchSize; i++) { + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch + + // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... + if (p->dim == 1) + deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki, + p->nf1, fwi, p->opts.modeord); + else if (p->dim == 2) + deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms, + p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); + else + deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, + p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2, + p->nf3, fwi, p->opts.modeord); + } + return 0; +} + +} // namespace common +} // namespace finufft + +// --------------- rest is the 5 user guru (plan) interface drivers: --------- +// (not namespaced since have safe names finufft{f}_* ) +using namespace finufft::common; // accesses routines defined above + +// Marco Barbone: 5.8.2024 +// These are user-facing. +// The various options could be macros to follow c standard library conventions. +// Question: would these be enums? + +// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO +void finufft_default_opts_t(finufft_opts *o) +// Sets default nufft opts (referenced by all language interfaces too). +// See finufft_opts.h for meanings. +// This was created to avoid uncertainty about C++11 style static initialization +// when called from MEX, but now is generally used. Barnett 10/30/17 onwards. +// Sphinx sucks the below code block into the web docs, hence keep it clean... +{ + // sphinx tag (don't remove): @defopts_start + o->modeord = 0; + o->chkbnds = 1; + + o->debug = 0; + o->spread_debug = 0; + o->showwarn = 1; + + o->nthreads = 0; +#ifdef FINUFFT_USE_DUCC0 + o->fftw = 0; +#else + o->fftw = FFTW_ESTIMATE; +#endif + o->spread_sort = 2; + o->spread_kerevalmeth = 1; + o->spread_kerpad = 1; + o->upsampfac = 0.0; + o->spread_thread = 0; + o->maxbatchsize = 0; + o->spread_nthr_atomic = -1; + o->spread_max_sp_size = 0; + o->fftw_lock_fun = nullptr; + o->fftw_unlock_fun = nullptr; + o->fftw_lock_data = nullptr; + // sphinx tag (don't remove): @defopts_end +} + +// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP +template +int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts) +// Populates the fields of finufft_plan which is pointed to by "pp". +// opts is ptr to a finufft_opts to set options, or NULL to use defaults. +// For some of the fields (if "auto" selected) here choose the actual setting. 
+// For types 1,2 allocates memory for internal working arrays, +// evaluates spreading kernel coefficients, and instantiates the fftw_plan +{ + FINUFFT_PLAN_T *p; + p = new FINUFFT_PLAN_T; // allocate fresh plan struct + *pp = p; // pass out plan as ptr to plan struct + + if (opts == NULL) // use default opts + finufft_default_opts_t(&(p->opts)); + else // or read from what's passed in + p->opts = *opts; // keep a deep copy; changing *opts now has no effect + + if (p->opts.debug) // do a hello world + printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", + __func__); + + p->fftPlan = std::make_unique>( + p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); + + if ((type != 1) && (type != 2) && (type != 3)) { + fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); + return FINUFFT_ERR_TYPE_NOTVALID; + } + if ((dim != 1) && (dim != 2) && (dim != 3)) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } + if (ntrans < 1) { + fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); + return FINUFFT_ERR_NTRANS_NOTVALID; + } + if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { + fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", + __func__); + return FINUFFT_ERR_LOCK_FUNS_INVALID; + ; + } + + // get stuff from args... + p->type = type; + p->dim = dim; + p->ntrans = ntrans; + p->tol = tol; + p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input + + // choose overall # threads... +#ifdef _OPENMP + int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); + int nthr = ompmaxnthr; // default: use as many as OMP gives us + // (the above could be set, or suggested set, to 1 for small enough problems...) + if (p->opts.nthreads > 0) { + nthr = p->opts.nthreads; // user override, now without limit + if (p->opts.showwarn && (nthr > ompmaxnthr)) + fprintf(stderr, + "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " + "available; note large nthreads can be slower.\n", + __func__, nthr, ompmaxnthr); + } +#else + int nthr = 1; // always 1 thread (avoid segfault) + if (p->opts.nthreads > 1) + fprintf(stderr, + "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", + __func__, p->opts.nthreads); +#endif + p->opts.nthreads = nthr; // store actual # thr planned for + // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) + + // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) + if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize + p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss + p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b + } else { // batchSize override by user + p->batchSize = min(p->opts.maxbatchsize, ntrans); + p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches + } + if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice + if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { + fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); + return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; + } + + if (type != 3) { // read in user Fourier mode array sizes... + p->ms = n_modes[0]; + p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims + p->mu = (dim > 2) ? n_modes[2] : 1; + p->N = p->ms * p->mt * p->mu; // N = total # modes + } + + // heuristic to choose default upsampfac... 
(currently two poss) + if (p->opts.upsampfac == 0.0) { // indicates auto-choose + p->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (TF)1E-9) { // the tol sigma=5/4 can reach + if (type == 3) // could move to setpts, more known? + p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT + else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, + // typ tol, 12-core xeon + p->opts.upsampfac = 1.25; + } + if (p->opts.debug > 1) + printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); + } + // use opts to choose and write into plan's spread options... + int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); + if (ier > 1) // proceed if success or warning + return ier; + + // set others as defaults (or unallocated for arrays)... + p->X = NULL; + p->Y = NULL; + p->Z = NULL; + p->phiHat1 = NULL; + p->phiHat2 = NULL; + p->phiHat3 = NULL; + p->nf1 = 1; + p->nf2 = 1; + p->nf3 = 1; // crucial to leave as 1 for unused dims + p->sortIndices = NULL; // used in all three types + + // ------------------------ types 1,2: planning needed --------------------- + if (type == 1 || type == 2) { + + int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) + // Note: batchSize not used since might be only 1. + + p->spopts.spread_direction = type; + + constexpr TF EPSILON = std::numeric_limits::epsilon(); + if (p->opts.showwarn) { // user warn round-off error... + if (EPSILON * p->ms > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->ms)); + if (EPSILON * p->mt > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mt)); + if (EPSILON * p->mu > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mu)); + } + + // determine fine grid sizes, sanity check.. + int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); + if (nfier) return nfier; // nf too big; we're done + p->phiHat1 = (TF *)malloc(sizeof(TF) * (p->nf1 / 2 + 1)); + if (dim > 1) { + nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); + if (nfier) return nfier; + p->phiHat2 = (TF *)malloc(sizeof(TF) * (p->nf2 / 2 + 1)); + } + if (dim > 2) { + nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); + if (nfier) return nfier; + p->phiHat3 = (TF *)malloc(sizeof(TF) * (p->nf3 / 2 + 1)); + } + + if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
+ printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " + "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " + "batchSize=%d ", + __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, + (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, + p->batchSize); + if (p->batchSize == 1) // spread_thread has no effect in this case + printf("\n"); + else + printf(" spread_thread=%d\n", p->opts.spread_thread); + } + + // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim + CNTime timer; + timer.start(); + onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); + if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); + if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); + if (p->opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, + timer.elapsedsec()); + + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points + if (p->nf * p->batchSize > MAX_NF) { + fprintf(stderr, + "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 + return FINUFFT_ERR_MAXNALLOC; + } + + timer.restart(); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace + if (p->opts.debug) + printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * p->nf * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch) { // we don't catch all such mallocs, just this big one + fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", + __func__); + free(p->phiHat1); + free(p->phiHat2); + free(p->phiHat3); + return FINUFFT_ERR_ALLOC; + } + + timer.restart(); // plan the FFTW + const auto ns = gridsize_for_fft(p); + p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); + if (p->opts.debug) + printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, + nthr_fft, timer.elapsedsec()); + + } else { // -------------------------- type 3 (no planning) ------------ + + if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); + // in case destroy occurs before setpts, need safe dummy ptrs/plans... + p->CpBatch = NULL; + p->fwBatch = NULL; + p->Sp = NULL; + p->Tp = NULL; + p->Up = NULL; + p->prephase = NULL; + p->deconv = NULL; + p->innerT2plan = NULL; + // Type 3 will call finufft_makeplan for type 2; no need to init FFTW + // Note we don't even know nj or nk yet, so can't do anything else! + } + return ier; // report setup_spreader status (could be warning) +} +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, float tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, double tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); + +// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS +template +int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, + TF *s, TF *t, TF *u) +/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for + spreading. (The last 4 arguments are ignored.) + For type 3: allocates internal working arrays, scales/centers the NU points + and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
+*/ +{ + int d = p->dim; // abbrev for spatial dim + CNTime timer; + timer.start(); + p->nj = nj; // the user only now chooses how many NU (x,y,z) pts + if (nj < 0) { + fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nj > MAX_NU_PTS) { + fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + + if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + p->X = xj; // plan must keep pointers to user's fixed NU pts + p->Y = yj; + p->Z = zj; + int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); + if (p->opts.debug > 1) + printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, + timer.elapsedsec()); + if (ier) // no warnings allowed here + return ier; + timer.restart(); + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. + if (p->sortIndices) free(p->sortIndices); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); + if (!p->sortIndices) { + fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); + return FINUFFT_ERR_SPREAD_ALLOC; + } + p->didSort = + indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); + if (p->opts.debug) + printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); + + } else { // ------------------------- TYPE 3 SETPTS ----------------------- + // (here we can precompute pre/post-phase factors and plan the t2) + + if (nk < 0) { + fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nk > MAX_NU_PTS) { + fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + p->nk = nk; // user set # targ freq pts + p->S = s; // keep pointers to user's input target pts + p->T = t; + p->U = u; + + // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... + TF S1, S2, S3; // get half-width X, center C, which contains {x_j}... + arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); + arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} + set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), + &(p->t3P.gam1)); // applies twist i) + p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc + p->t3P.D2 = 0.0; + if (d > 1) { + arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} + arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} + set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), + &(p->t3P.gam2)); + } + p->t3P.C3 = 0.0; + p->t3P.D3 = 0.0; + if (d > 2) { + arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} + arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} + set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), + &(p->t3P.gam3)); + } + + if (p->opts.debug) { // report on choices of shifts, centers, etc... 
+ printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, + p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); + if (d > 1) + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, + p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); + if (d > 2) + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, + p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); + } + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points + if (p->nf * p->batchSize > MAX_NF) { + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + return FINUFFT_ERR_MAXNALLOC; + } + p->fftPlan->free(p->fwBatch); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace + + // (note FFTW_ALLOC is not needed over malloc, but matches its type) + if (p->CpBatch) free(p->CpBatch); + p->CpBatch = + (std::complex *)malloc(sizeof(std::complex) * nj * p->batchSize); // batch + // c' + // work + + if (p->opts.debug) + printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * (p->nf + nj) * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch || !p->CpBatch) { + fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); + return FINUFFT_ERR_ALLOC; + } + // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); + + // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... + // FIXME: should use realloc + if (p->X) free(p->X); + if (p->Sp) free(p->Sp); + p->X = (TF *)malloc(sizeof(TF) * nj); + p->Sp = (TF *)malloc(sizeof(TF) * nk); + if (d > 1) { + if (p->Y) free(p->Y); + if (p->Tp) free(p->Tp); + p->Y = (TF *)malloc(sizeof(TF) * nj); + p->Tp = (TF *)malloc(sizeof(TF) * nk); + } + if (d > 2) { + if (p->Z) free(p->Z); + if (p->Up) free(p->Up); + p->Z = (TF *)malloc(sizeof(TF) * nj); + p->Up = (TF *)malloc(sizeof(TF) * nk); + } + + // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... + TF ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim + if (d > 1) ig2 = 1.0 / p->t3P.gam2; + if (d > 2) ig3 = 1.0 / p->t3P.gam3; +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { + p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j + if (d > 1) // (ok to do inside loop because of branch predict) + p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j + if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j + } + + // set up prephase array... + std::complex imasign = + (p->fftSign >= 0) ? std::complex(0, 1) : std::complex(0, -1); // +-i + if (p->prephase) free(p->prephase); + p->prephase = (std::complex *)malloc(sizeof(std::complex) * nj); + if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs + TF phase = p->t3P.D1 * xj[j]; + if (d > 1) phase += p->t3P.D2 * yj[j]; + if (d > 2) phase += p->t3P.D3 * zj[j]; + p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler + // e^{+-i.phase} + } + } else + for (BIGINT j = 0; j < nj; ++j) + p->prephase[j] = (std::complex)1.0; // *** or keep flag so no mult in exec?? + + // rescale the target s_k etc to s'_k etc... 
+#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { + p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R + if (d > 1) + p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < + // pi/R + if (d > 2) + p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < + // pi/R + } + // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... + // (exploits that FT separates because kernel is prod of 1D funcs) + if (p->deconv) free(p->deconv); + p->deconv = (std::complex *)malloc(sizeof(std::complex) * nk); + TF *phiHatk1 = (TF *)malloc(sizeof(TF) * nk); // don't confuse w/ p->phiHat + onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 + TF *phiHatk2 = NULL, *phiHatk3 = NULL; + if (d > 1) { + phiHatk2 = (TF *)malloc(sizeof(TF) * nk); + onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 + } + if (d > 2) { + phiHatk3 = (TF *)malloc(sizeof(TF) * nk); + onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 + } + int Cfinite = + isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan + // or inf if + // M=0, no + // input NU pts + int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs + TF phiHat = phiHatk1[k]; + if (d > 1) phiHat *= phiHatk2[k]; + if (d > 2) phiHat *= phiHatk3[k]; + p->deconv[k] = (std::complex)(1.0 / phiHat); + if (Cfinite && Cnonzero) { + TF phase = (s[k] - p->t3P.D1) * p->t3P.C1; + if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; + if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; + p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} + } + } + free(phiHatk1); + free(phiHatk2); + free(phiHatk3); // done w/ deconv fill + if (p->opts.debug) + printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); + + // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... + timer.restart(); + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. + if (p->sortIndices) free(p->sortIndices); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); + if (!p->sortIndices) { + fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); + return FINUFFT_ERR_SPREAD_ALLOC; + } + p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, + p->Z, p->spopts); + if (p->opts.debug) + printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); + + // Plan and setpts once, for the (repeated) inner type 2 finufft call... + timer.restart(); + BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw + finufft_opts t2opts = p->opts; // deep copy, since not ptrs + t2opts.modeord = 0; // needed for correct t3! + t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail + t2opts.spread_debug = max(0, p->opts.spread_debug - 1); + t2opts.showwarn = 0; // so don't see warnings 2x + // (...could vary other t2opts here?) 
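For reference, everything assembled in this type-3 setpts (prephase factors, rescaled points, deconvolution factors, and the inner type-2 plan created just below) exists to approximate the plain type-3 sum f_k = sum_j c_j exp(+-i s_k x_j). A brute-force O(M*N) evaluator like the following 1D double-precision sketch is handy for validating small cases against the fast path (illustrative, not part of the library).

#include <complex>
#include <vector>

std::vector<std::complex<double>> type3_direct(const std::vector<double> &x,
                                               const std::vector<std::complex<double>> &c,
                                               const std::vector<double> &s, int iflag) {
  std::vector<std::complex<double>> f(s.size());   // zero-initialized outputs
  for (size_t k = 0; k < s.size(); ++k)
    for (size_t j = 0; j < x.size(); ++j)
      f[k] += c[j] * std::exp(std::complex<double>(0.0, iflag * s[k] * x[j]));
  return f;
}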
+ if (p->innerT2plan) { + delete p->innerT2plan; + p->innerT2plan = nullptr; + } + int ier = finufft_makeplan_t(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, + &p->innerT2plan, &t2opts); + if (ier > 1) { // if merely warning, still proceed + fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", + __func__, ier); + return ier; + } + ier = finufft_setpts_t(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, + NULL); // note nk = # output points (not nj) + if (ier > 1) { + fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); + return ier; + } + if (p->opts.debug) + printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); + } + return 0; +} +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, float *xj, + float *yj, float *zj, BIGINT nk, float *s, float *t, + float *u); +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, double *xj, + double *yj, double *zj, BIGINT nk, double *s, + double *t, double *u); + +// ............ end setpts .................................................. + +// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +template +int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk) { + /* See ../docs/cguru.doc for current documentation. + + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and FFT as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. 
+  */
+  CNTime timer;
+  timer.start();
+
+  if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------
+
+    double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing
+    if (p->opts.debug)
+      printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans,
+             p->nbatch, p->batchSize);
+
+    for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches
+
+      // current batch is either batchSize, or possibly truncated if last one
+      int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize);
+      int bB = b * p->batchSize;               // index of vector, since batchsizes same
+      std::complex<TF> *cjb = cj + bB * p->nj; // point to batch of weights
+      std::complex<TF> *fkb = fk + bB * p->N;  // point to batch of mode coeffs
+      if (p->opts.debug > 1)
+        printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize);
+
+      // STEP 1: (varies by type)
+      timer.restart();
+      if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid
+        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        t_sprint += timer.elapsedsec();
+      } else { // type 2: amplify Fourier coeffs fk into 0-padded fw
+        deconvolveBatch(thisBatchSize, p, fkb);
+        t_deconv += timer.elapsedsec();
+      }
+
+      // STEP 2: call the FFT on this batch
+      timer.restart();
+      do_fft(p);
+      t_fft += timer.elapsedsec();
+      if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
+
+      // STEP 3: (varies by type)
+      timer.restart();
+      if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk
+        deconvolveBatch(thisBatchSize, p, fkb);
+        t_deconv += timer.elapsedsec();
+      } else { // type 2: interpolate unif fw grid to NU target pts
+        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        t_sprint += timer.elapsedsec();
+      }
+    } // ........end b loop
+
+    if (p->opts.debug) { // report total times in their natural order...
+      if (p->type == 1) {
+        printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint);
+        printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft);
+        printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv);
+      } else {
+        printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv);
+        printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft);
+        printf(" tot interp:\t\t\t%.3g s\n", t_sprint);
+      }
+    }
+  }
+
+  else { // ----------------------------- TYPE 3 EXEC ---------------------
+
+    // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long
+    // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug
+
+    double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0,
+           t_deconv = 0.0; // accumulated timings
+    if (p->opts.debug)
+      printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans,
+             p->nbatch, p->batchSize);
+
+    for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches
+
+      // batching and pointers to this batch, identical to t1,2 above...
+      int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize);
+      int bB = b * p->batchSize;
+      std::complex<TF> *cjb = cj + bB * p->nj; // batch of input strengths
+      std::complex<TF> *fkb = fk + bB * p->nk; // batch of output strengths
+      if (p->opts.debug > 1)
+        printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize);
+
+      // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch...
+      timer.restart();
+#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize?
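+      // (writes c'_j = prephase_j * c_j into the CpBatch workspace; the prephase
+      //  factors, filled during setpts, in effect apply the e^{+-i D.x_j} shift
+      //  coming from the target-frequency centers D)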
+      for (int i = 0; i < thisBatchSize; i++) {
+        BIGINT ioff = i * p->nj;
+        for (BIGINT j = 0; j < p->nj; ++j) {
+          p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j];
+        }
+      }
+      t_pre += timer.elapsedsec();
+
+      // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid...
+      timer.restart();
+      p->spopts.spread_direction = 1;                        // spread
+      spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed
+      t_spr += timer.elapsedsec();
+
+      // STEP 2: type 2 NUFFT from fw batch to user output fk array batch...
+      timer.restart();
+      // illegal possible shrink of ntrans *after* plan for smaller last batch:
+      p->innerT2plan->ntrans = thisBatchSize; // do not try this at home!
+      /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array
+         still the same size, as Andrea explained; just wastes a few flops) */
+      finufft_execute_t(p->innerT2plan, fkb, p->fwBatch);
+      t_t2 += timer.elapsedsec();
+      // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)...
+      timer.restart();
+#pragma omp parallel for num_threads(p->opts.nthreads)
+      for (int i = 0; i < thisBatchSize; i++) {
+        BIGINT ioff = i * p->nk;
+        for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k];
+      }
+      t_deconv += timer.elapsedsec();
+    } // ........end b loop
+
+    if (p->opts.debug) { // report total times in their natural order...
+      printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre);
+      printf(" tot spread:\t\t\t%.3g s\n", t_spr);
+      printf(" tot type 2:\t\t\t%.3g s\n", t_t2);
+      printf(" tot deconvolve:\t\t%.3g s\n", t_deconv);
+    }
+  }
+  // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long
+  // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug
+
+  return 0;
+}
+template int finufft_execute_t(FINUFFT_PLAN_T<float> *p, std::complex<float> *cj,
+                               std::complex<float> *fk);
+template int finufft_execute_t(FINUFFT_PLAN_T<double> *p, std::complex<double> *cj,
+                               std::complex<double> *fk);
+
+// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
+template<typename TF> FINUFFT_PLAN_T<TF>::~FINUFFT_PLAN_T() {
+  // Free everything we allocated inside the plan.
+  // This must also not crash if called immediately after finufft_makeplan, so
+  // each thing freed here is guaranteed to be either NULL or correctly allocated.
+  if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array
+  free(sortIndices);
+  if (type == 1 || type == 2) {
+    free(phiHat1);
+    free(phiHat2);
+    free(phiHat3);
+  } else { // free the stuff alloc for type 3 only
+    delete innerT2plan;
+    innerT2plan = nullptr; // if NULL, ignore its error code
+    free(CpBatch);
+    free(Sp);
+    free(Tp);
+    free(Up);
+    free(X);
+    free(Y);
+    free(Z);
+    free(prephase);
+    free(deconv);
+  }
+}
+template FINUFFT_PLAN_T<float>::~FINUFFT_PLAN_T();
+template FINUFFT_PLAN_T<double>::~FINUFFT_PLAN_T();
diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp
index 1fb49db9e..454f22af7 100644
--- a/src/simpleinterfaces.cpp
+++ b/src/simpleinterfaces.cpp
@@ -31,12 +31,12 @@ static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FL
 // Helper layer between simple interfaces (with opts) and the guru functions.
 // Author: Andrea Malleo, 2019.
 {
-  FINUFFT_PLAN plan;
+  FINUFFT_PLAN plan = nullptr;
   int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan,
                              popts); // popts (ptr to opts) can be NULL
   if (ier > 1) { // since 1 (a warning) still allows proceeding...
fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); - delete plan; + FINUFFT_DESTROY(plan); return ier; } diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index b2b5ea8b0..126bcd2d7 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -2162,6 +2162,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. */ { + constexpr T EPSILON = std::numeric_limits::epsilon(); if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma if (kerevalmeth == 1) { fprintf(stderr, diff --git a/test/testutils.cpp b/test/testutils.cpp index 64b5d7a0a..7b550ebff 100644 --- a/test/testutils.cpp +++ b/test/testutils.cpp @@ -57,7 +57,8 @@ int main(int argc, char *argv[]) { a[j] = CPX(1.0, 0.0); b[j] = a[j]; } - FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly + constexpr FLT EPSILON = std::numeric_limits::epsilon(); + FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly if (abs(infnorm(M, &a[0]) - 1.0) > relerr) return 1; if (abs(twonorm(M, &a[0]) - sqrt((FLT)M)) > relerr * sqrt((FLT)M)) return 1; b[0] = CPX(0.0, 0.0); // perturb b from a