Merge remote-tracking branch 'flatiron/master' into HEAD

flatironinstitute · Sep 11, 2024 · a679d6d · a679d6d
2 parents d68a661 + defdd48
commit a679d6d
Show file tree

Hide file tree

Showing 12 changed files with 113 additions and 107 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,6 +1,11 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
+Master (9/10/24)
+
+* reduced roundoff error in a[n] phase calc in CPU onedim_fseries_kernel().
+   #534 (Barnett).
+
 V 2.3.0 (9/5/24)
 
 * Switched C++ standards from C++14 to C++17, allowing various templating

diff --git a/docs/opts.rst b/docs/opts.rst
@@ -20,17 +20,17 @@ to the simple, vectorized, or guru makeplan routines.
 Recall how to do this from C++:
 
 .. code-block:: C++
-                
+
   // (... set up M,x,c,tol,N, and allocate F here...)
-  finufft_opts* opts;
-  finufft_default_opts(opts);
-  opts->debug = 1;
-  int ier = finufft1d1(M,x,c,+1,tol,N,F,opts);
+  finufft_opts opts;
+  finufft_default_opts(&opts);
+  opts.debug = 1;
+  int ier = finufft1d1(M,x,c,+1,tol,N,F,&opts);
 
 This setting produces more timing output to ``stdout``.
 
 .. warning::
-   
+
  In C/C++ and Fortran, don't forget to call the command which sets default options
  (``finufft_default_opts`` or ``finufftf_default_opts``)
  before you start changing them and passing them to FINUFFT.
@@ -51,9 +51,9 @@ Here are their default settings (from ``src/finufft.cpp:finufft_default_opts``):
 .. literalinclude:: ../src/finufft.cpp
    :start-after: @defopts_start
    :end-before: @defopts_end
-  
+
 As for quick advice, the main options you'll want to play with are:
-  
+
 - ``modeord`` to flip ("fftshift") the Fourier mode ordering
 - ``debug`` to look at timing output (to determine if your problem is spread/interpolation dominated, vs FFT dominated)
 - ``nthreads`` to run with a different number of threads than the current maximum available through OpenMP (a large number can sometimes be detrimental, and very small problems can sometimes run faster on 1 thread)
@@ -92,15 +92,15 @@ Data handling options
   .. note:: The index *sets* are the same in the two ``modeord`` choices; their ordering differs only by a cyclic shift. The FFT ordering cyclically shifts the CMCL indices $\mbox{floor}(N/2)$ to the left (often called an "fftshift").
 
 **chkbnds**: [DEPRECATED] has no effect.
-  
+
 
 Diagnostic options
 ~~~~~~~~~~~~~~~~~~~~~~~
 
 **debug**: Controls the amount of overall debug/timing output to stdout.
 
 * ``debug=0`` : silent
-  
+
 * ``debug=1`` : print some information
 
 * ``debug=2`` : prints more information
@@ -113,11 +113,11 @@ Diagnostic options
 
   * ``spread_debug=2`` : prints lots. This can print thousands of lines since it includes one line per *subproblem*.
 
-   
+
 **showwarn**: Whether to print warnings (these go to stderr).
-    
+
 * ``showwarn=0`` : suppresses such warnings
-  
+
 * ``showwarn=1`` : prints warnings
 
 
@@ -173,16 +173,16 @@ for only two settings, as follows. Otherwise, setting it to zero chooses a good
 **spread_thread**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), controls how multithreading is used to spread/interpolate each batch of data.
 
 * ``spread_thread=0`` : makes an automatic choice between the below. Recommended.
-  
+
 * ``spread_thread=1`` : acts on each vector in the batch in sequence, using multithreaded spread/interpolate on that vector. It can be slightly better than ``2`` for large problems.
-    
+
 * ``spread_thread=2`` : acts on all vectors in a batch (of size chosen typically to be the number of threads) simultaneously, assigning each a thread which performs a single-threaded spread/interpolate.  It is much better than ``1`` for all but large problems. (Historical note: this was used by Melody Shih for the original "2dmany" interface in 2018.)
 
   .. note::
-  
+
     Historical note: A former option ``3`` has been removed. This was like ``2`` except allowing nested OMP parallelism, so multi-threaded spread-interpolate was used for each of the vectors in a batch in parallel. This was used by Andrea Malleo in 2019. We have not yet found a case where this beats both ``1`` and ``2``, hence removed it due to complications with changing the OMP nesting state in both old and new OMP versions.
 
-     
+
 **maxbatchsize**:  in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), set the largest batch size of data vectors.
 Here ``0`` makes an automatic choice. If you are unhappy with this, then for small problems it should equal the number of threads, while for large problems it appears that ``1`` is often better (since otherwise too much simultaneous RAM movement occurs). Some further work is needed to optimize this parameter.
 

diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h
@@ -41,8 +41,8 @@ template<typename T>
 void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);
 
 template<typename T, typename V>
-auto cufinufft_set_shared_memory(V *kernel, const int dim,
-                                 const cufinufft_plan_t<T> &d_plan) {
+int cufinufft_set_shared_memory(V *kernel, const int dim,
+                                const cufinufft_plan_t<T> &d_plan) {
   /**
    * WARNING: this function does not handle cuda errors. The caller should check them.
    */

diff --git a/include/finufft.h b/include/finufft.h
@@ -35,10 +35,6 @@
 #include <stdint.h>
 #define FINUFFT_BIGINT int64_t
 
-#ifndef __cplusplus
-#include <stdbool.h> // for bool type in C (needed for item in plan struct)
-#endif
-
 // this macro name has to be safe since exposed to user
 #define FINUFFT_SINGLE
 #include <finufft_eitherprec.h>

diff --git a/include/finufft/defs.h b/include/finufft/defs.h
@@ -24,18 +24,16 @@
 
 // All indexing in library that potentially can exceed 2^31 uses 64-bit signed.
 // This includes all calling arguments (eg M,N) that could be huge someday.
-#define BIGINT  int64_t
-#define UBIGINT uint64_t
+using BIGINT  = int64_t;
+using UBIGINT = uint64_t;
 // Precision-independent real and complex types, for private lib/test compile
 #ifdef SINGLE
-#define FLT float
+using FLT = float;
 #else
-#define FLT double
+using FLT = double;
 #endif
-// next line possibly obsolete...
-#define _USE_MATH_DEFINES
 #include <complex> // we define C++ complex type only
-#define CPX std::complex<FLT>
+using CPX = std::complex<FLT>;
 
 // inline macro, to force inlining of small functions
 // this avoids the use of macros to implement functions
@@ -65,44 +63,49 @@
 // ------------- Library-wide algorithm parameter settings ----------------
 
 // Library version (is a string)
-#define FINUFFT_VER          "2.3.0"
+#define FINUFFT_VER "2.3.0"
 
 // Smallest possible kernel spread width per dimension, in fine grid points
 // (used only in spreadinterp.cpp)
-#define MIN_NSPREAD          2
+inline constexpr int MIN_NSPREAD = 2;
 
 // Largest possible kernel spread width per dimension, in fine grid points
 // (used only in spreadinterp.cpp)
-#define MAX_NSPREAD          16
+inline constexpr int MAX_NSPREAD = 16;
 
 // Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3
-#define ARRAYWIDCEN_GROWFRAC 0.1
+inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1;
 
 // Max number of positive quadr nodes for kernel FT (used only in common.cpp)
-#define MAX_NQUAD            100
+inline constexpr int MAX_NQUAD = 100;
 
 // Internal (nf1 etc) array allocation size that immediately raises error.
 // (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.)
 // Increase this if you need >10TB (!) RAM...
-#define MAX_NF               (BIGINT)1e12
+inline constexpr BIGINT MAX_NF = BIGINT(1e12);
 
 // Maximum allowed number M of NU points; useful to catch incorrectly cast int32
 // values for M = nj (also nk in type 3)...
-#define MAX_NU_PTS           (BIGINT)1e14
+inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14);
 
 // -------------- Math consts (not in math.h) and useful math macros ----------
 #include <math.h>
 
 // either-precision unit imaginary number...
 #define IMA (CPX(0.0, 1.0))
-// using namespace std::complex_literals;  // needs C++14, provides 1i, 1if
+
+// MR: In the longer term I suggest moving
+// away from M_PI, which was never part of the C or C++ standard.
+// Perhaps a constexpr pi in the namespace finufft, or a constexpr finufft_pi
+// if no namespaces are used?
+// From C++20, std::numbers::pi in <numbers> makes this problem go away.
 #ifndef M_PI // Windows apparently doesn't have this const
 #define M_PI 3.14159265358979329
 #endif
 #define M_1_2PI 0.159154943091895336
 #define M_2PI   6.28318530717958648
 // to avoid mixed precision operators in eg i*pi, an either-prec PI...
-#define PI      (FLT) M_PI
+#define PI      FLT(M_PI)
 
 // machine epsilon for decisions of achievable tolerance...
 #ifdef SINGLE
@@ -115,36 +118,45 @@
 // These macros should probably be replaced by modern C++ std lib or random123.
 // (RAND_MAX is in stdlib.h)
 #include <stdlib.h>
-// #define rand01() (((FLT)(rand()%RAND_MAX))/RAND_MAX)
-#define rand01()     ((FLT)rand() / (FLT)RAND_MAX)
+static inline FLT rand01() { return FLT(rand()) / FLT(RAND_MAX); }
 // unif[-1,1]:
-#define randm11()    (2 * rand01() - (FLT)1.0)
+static inline FLT randm11() { return 2 * rand01() - FLT(1); }
 // complex unif[-1,1] for Re and Im:
-#define crandm11()   (randm11() + IMA * randm11())
+static inline CPX crandm11() { return randm11() + IMA * randm11(); }
 
 // Thread-safe seed-carrying versions of above (x is ptr to seed)...
+// MR: we have to leave those as macros for now, as "rand_r" is deprecated
+// and apparently no longer available on Windows.
+#if 1
 #define rand01r(x)   ((FLT)rand_r(x) / (FLT)RAND_MAX)
 // unif[-1,1]:
 #define randm11r(x)  (2 * rand01r(x) - (FLT)1.0)
 // complex unif[-1,1] for Re and Im:
 #define crandm11r(x) (randm11r(x) + IMA * randm11r(x))
+#else
+static inline FLT rand01r(unsigned int *x) { return FLT(rand_r(x)) / FLT(RAND_MAX); }
+// unif[-1,1]:
+static inline FLT randm11r(unsigned int *x) { return 2 * rand01r(x) - FLT(1); }
+// complex unif[-1,1] for Re and Im:
+static inline CPX crandm11r(unsigned int *x) { return randm11r(x) + IMA * randm11r(x); }
+#endif
 
 // ----- OpenMP macros which also work when omp not present -----
 // Allows compile-time switch off of openmp, so compilation without any openmp
 // is done (Note: _OPENMP is automatically set by -fopenmp compile flag)
 #ifdef _OPENMP
 #include <omp.h>
 // point to actual omp utils
-#define MY_OMP_GET_NUM_THREADS()  omp_get_num_threads()
-#define MY_OMP_GET_MAX_THREADS()  omp_get_max_threads()
-#define MY_OMP_GET_THREAD_NUM()   omp_get_thread_num()
-#define MY_OMP_SET_NUM_THREADS(x) omp_set_num_threads(x)
+static inline int MY_OMP_GET_NUM_THREADS() { return omp_get_num_threads(); }
+static inline int MY_OMP_GET_MAX_THREADS() { return omp_get_max_threads(); }
+static inline int MY_OMP_GET_THREAD_NUM() { return omp_get_thread_num(); }
+static inline void MY_OMP_SET_NUM_THREADS(int x) { omp_set_num_threads(x); }
 #else
 // non-omp safe dummy versions of omp utils...
-#define MY_OMP_GET_NUM_THREADS() 1
-#define MY_OMP_GET_MAX_THREADS() 1
-#define MY_OMP_GET_THREAD_NUM()  0
-#define MY_OMP_SET_NUM_THREADS(x)
+static inline int MY_OMP_GET_NUM_THREADS() { return 1; }
+static inline int MY_OMP_GET_MAX_THREADS() { return 1; }
+static inline int MY_OMP_GET_THREAD_NUM() { return 0; }
+static inline void MY_OMP_SET_NUM_THREADS(int) {}
 #endif
 
 // Prec-switching name macros (respond to SINGLE), used in lib & test sources
@@ -194,12 +206,11 @@
 #include <finufft/fft.h> // (must come after complex.h)
 
 // group together a bunch of type 3 rescaling/centering/phasing parameters:
-#define TYPE3PARAMS FINUFFTIFY(_type3Params)
-typedef struct {
-  FLT X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale
-  FLT X2, C2, D2, h2, gam2; // y
-  FLT X3, C3, D3, h3, gam3; // z
-} TYPE3PARAMS;
+template<typename T> struct type3params {
+  T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale
+  T X2, C2, D2, h2, gam2; // y
+  T X3, C3, D3, h3, gam3; // z
+};
 
 typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
 
@@ -243,7 +254,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
   CPX *deconv;              // reciprocal of kernel FT, phase, all output NU pts
   CPX *CpBatch;             // working array of prephased strengths
   FLT *Sp, *Tp, *Up;        // internal primed targs (s'_k, etc), allocated
-  TYPE3PARAMS t3P;          // groups together type 3 shift, scale, phase, parameters
+  type3params<FLT> t3P;     // groups together type 3 shift, scale, phase, parameters
   FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3
 
   // other internal structs; each is C-compatible of course
@@ -255,6 +266,4 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
 
 } FINUFFT_PLAN_S;
 
-#undef TYPE3PARAMS
-
 #endif // DEFS_H
diff --git a/include/finufft/fft.h b/include/finufft/fft.h
@@ -3,10 +3,10 @@
 
 #ifdef FINUFFT_USE_DUCC0
 #include "ducc0/fft/fftnd_impl.h"
-#define FFTW_FORGET_WISDOM()   // temporary hack since some tests call this unconditionally
-#define FFTW_CLEANUP()         // temporary hack since some tests call this unconditionally
-#define FFTW_CLEANUP_THREADS() // temporary hack since some tests call this
-                               // unconditionally
+// temporary hacks to allow compilation of tests that assume FFTW is used
+static inline void FFTW_FORGET_WISDOM() {}
+static inline void FFTW_CLEANUP() {}
+static inline void FFTW_CLEANUP_THREADS() {}
 #else
 #include "fftw_defs.h"
 #endif

diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h
@@ -22,11 +22,7 @@
 // now use this tool (note we replaced typedefs v<=2.0.4, in favor of macros):
 #define FFTW_CPX           FFTWIFY(complex)
 #define FFTW_PLAN          FFTWIFY(plan)
-#define FFTW_ALLOC_RE      FFTWIFY(alloc_real)
 #define FFTW_ALLOC_CPX     FFTWIFY(alloc_complex)
-#define FFTW_PLAN_1D       FFTWIFY(plan_dft_1d)
-#define FFTW_PLAN_2D       FFTWIFY(plan_dft_2d)
-#define FFTW_PLAN_3D       FFTWIFY(plan_dft_3d)
 #define FFTW_PLAN_MANY_DFT FFTWIFY(plan_many_dft)
 #define FFTW_EX            FFTWIFY(execute)
 #define FFTW_DE            FFTWIFY(destroy_plan)

diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
@@ -20,10 +20,12 @@
     NOTE: non-zero values are for experts only, since
     NUMERICAL OUTPUT MAY BE INCORRECT UNLESS finufft_spread_opts.flags=0 !
 */
-#define TF_OMIT_WRITE_TO_GRID        1 // don't add subgrids to out grid (dir=1)
-#define TF_OMIT_EVALUATE_KERNEL      2 // don't evaluate the kernel at all
-#define TF_OMIT_EVALUATE_EXPONENTIAL 4 // omit exp() in kernel (kereval=0 only)
-#define TF_OMIT_SPREADING            8 // don't interp/spread (dir=1: to subgrids)
+enum {
+  TF_OMIT_WRITE_TO_GRID        = 1, // don't add subgrids to out grid (dir=1)
+  TF_OMIT_EVALUATE_KERNEL      = 2, // don't evaluate the kernel at all
+  TF_OMIT_EVALUATE_EXPONENTIAL = 4, // omit exp() in kernel (kereval=0 only)
+  TF_OMIT_SPREADING            = 8  // don't interp/spread (dir=1: to subgrids)
+};
 
 namespace finufft {
 namespace spreadinterp {

diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h
@@ -24,7 +24,6 @@
 
 // decide which kind of complex numbers FINUFFT_CPX is (four options)
 #ifdef __cplusplus
-#define _USE_MATH_DEFINES
 #include <complex> // C++ type
 #define FINUFFT_COMPLEXIFY(X) std::complex<X>
 #else
@@ -183,4 +182,3 @@ FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3many)(
 #undef FINUFFT_CPX
 #undef FINUFFT_PLAN
 #undef FINUFFT_PLAN_S
-#undef FINUFFT_TYPE3PARAMS