diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bae95ad5..435bcd8c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,7 @@ endif() # This set of sources is compiled twice, once in single precision and once in # double precision The single precision compilation is done with -DSINGLE -set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/fft.cpp +set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/simpleinterfaces.cpp) # If we're building for Fortran, make sure we also include the translation @@ -258,11 +258,15 @@ if(FINUFFT_USE_CPU) add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) set_finufft_options(finufft_f64) if(NOT FINUFFT_STATIC_LINKING) - add_library(finufft SHARED src/spreadinterp.cpp src/utils_precindep.cpp - contrib/legendre_rule_fast.cpp) + add_library( + finufft SHARED + src/spreadinterp.cpp src/utils_precindep.cpp + contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp) else() - add_library(finufft STATIC src/spreadinterp.cpp src/utils_precindep.cpp - contrib/legendre_rule_fast.cpp) + add_library( + finufft STATIC + src/spreadinterp.cpp src/utils_precindep.cpp + contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp) endif() target_link_libraries(finufft PRIVATE finufft_f32 finufft_f64) set_finufft_options(finufft) diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 4265ced01..084ffa41c 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -18,6 +18,7 @@ // public header gives access to f_opts, f_spread_opts, f_plan... // (and clobbers FINUFFT* macros; watch out!) #include +#include #include // --------------- Private data types for compilation in either prec --------- @@ -25,8 +26,8 @@ // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. 
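Note (illustrative, not part of the diff): the CMake change above moves src/fft.cpp and src/finufft_core.cpp out of the twice-compiled (-DSINGLE) source list and into the precision-independent library. A minimal sketch of the underlying pattern, using made-up names: templated code is compiled once, and both precisions are produced by explicit instantiation in that single translation unit.

// sketch_core.cpp -- illustrative only; names are not the library's.
// The function template is compiled once; the explicit instantiations below
// emit both the float and the double versions, so the file no longer needs to
// appear in FINUFFT_PRECISION_DEPENDENT_SOURCES.
#include <cstdio>

template<typename TF> int core_work(TF tol) {
  std::printf("running with tol = %g\n", static_cast<double>(tol));
  return 0;
}

template int core_work<float>(float);    // single precision
template int core_work<double>(double);  // double precision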
-using BIGINT = int64_t; -using UBIGINT = uint64_t; +// using BIGINT = int64_t; +// using UBIGINT = uint64_t; // Precision-independent real and complex types, for private lib/test compile #ifdef SINGLE using FLT = float; @@ -36,59 +37,6 @@ using FLT = double; #include // we define C++ complex type only using CPX = std::complex; -// inline macro, to force inlining of small functions -// this avoids the use of macros to implement functions -#if defined(_MSC_VER) -#define FINUFFT_ALWAYS_INLINE __forceinline inline -#define FINUFFT_NEVER_INLINE __declspec(noinline) -#define FINUFFT_RESTRICT __restrict -#define FINUFFT_UNREACHABLE __assume(0) -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#elif defined(__GNUC__) || defined(__clang__) -#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline -#define FINUFFT_NEVER_INLINE __attribute__((noinline)) -#define FINUFFT_RESTRICT __restrict__ -#define FINUFFT_UNREACHABLE __builtin_unreachable() -#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) -#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) -#else -#define FINUFFT_ALWAYS_INLINE inline -#define FINUFFT_NEVER_INLINE -#define FINUFFT_RESTRICT -#define FINUFFT_UNREACHABLE -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#endif - -// ------------- Library-wide algorithm parameter settings ---------------- - -// Library version (is a string) -#define FINUFFT_VER "2.3.0" - -// Smallest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MIN_NSPREAD = 2; - -// Largest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MAX_NSPREAD = 16; - -// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 -inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; - -// Max number of positive quadr nodes for kernel FT (used only in common.cpp) -inline constexpr int MAX_NQUAD = 100; - -// Internal (nf1 etc) array allocation size that immediately raises error. -// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) -// Increase this if you need >10TB (!) RAM... -inline constexpr BIGINT MAX_NF = BIGINT(1e12); - -// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 -// values for M = nj (also nk in type 3)... -inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); - // -------------- Math consts (not in math.h) and useful math macros ---------- #include @@ -108,13 +56,6 @@ inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); // to avoid mixed precision operators in eg i*pi, an either-prec PI... #define PI FLT(M_PI) -// machine epsilon for decisions of achievable tolerance... -#ifdef SINGLE -#define EPSILON (float)6e-08 -#else -#define EPSILON (double)1.1e-16 -#endif - // Random numbers: crappy unif random number generator in [0,1). // These macros should probably be replaced by modern C++ std lib or random123. 
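Note (illustrative, not part of the diff): the EPSILON macro removed above hard-coded the unit roundoff per precision (6e-08 for float, 1.1e-16 for double). In templated code the same quantity can be derived from std::numeric_limits; the sketch below is one way to do it, not necessarily the library's actual replacement.

#include <cstdio>
#include <limits>

// The removed macro stored the unit roundoff (~eps/2); numeric_limits<T>::epsilon()
// is the spacing at 1.0, so halve it to match the old values.
template<typename TF> constexpr TF unit_roundoff() {
  return std::numeric_limits<TF>::epsilon() / TF(2);
}

int main() {
  std::printf("float: %g  double: %g\n",
              static_cast<double>(unit_roundoff<float>()),
              unit_roundoff<double>());   // ~6e-08 and ~1.1e-16
}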
// (RAND_MAX is in stdlib.h) @@ -148,32 +89,6 @@ static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) { } #endif -// ----- OpenMP macros which also work when omp not present ----- -// Allows compile-time switch off of openmp, so compilation without any openmp -// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) -#ifdef _OPENMP -#include -// point to actual omp utils -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { - return omp_get_num_threads(); -} -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { - return omp_get_max_threads(); -} -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { - return omp_get_thread_num(); -} -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { - omp_set_num_threads(x); -} -#else -// non-omp safe dummy versions of omp utils... -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; } -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} -#endif - // Prec-switching name macros (respond to SINGLE), used in lib & test sources // and the plan object below. // Note: crucially, these are now indep of macros used to gen public finufft.h! @@ -219,70 +134,6 @@ static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} // NB: now private (the public C++ or C etc user sees an opaque pointer to it) #include // (must come after complex.h) - -// group together a bunch of type 3 rescaling/centering/phasing parameters: -template struct type3params { - T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale - T X2, C2, D2, h2, gam2; // y - T X3, C3, D3, h3, gam3; // z -}; - -struct FINUFFT_PLAN_S { // the main plan object, fully C++ - // These default and delete specifications just state the obvious, - // but are here to silence compiler warnings. - FINUFFT_PLAN_S() = default; - // Copy construction and assignent are already deleted implicitly - // because of the unique_ptr member. - FINUFFT_PLAN_S(const FINUFFT_PLAN_S &) = delete; - FINUFFT_PLAN_S &operator=(const FINUFFT_PLAN_S &) = delete; - - int type; // transform type (Rokhlin naming): 1,2 or 3 - int dim; // overall dimension: 1,2 or 3 - int ntrans; // how many transforms to do at once (vector or "many" mode) - BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) - BIGINT nk; // number of NU freq pts (type 3 only) - FLT tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc - int nbatch; // how many batches done to cover all ntrans vectors - - BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 - BIGINT mt; // number of modes in y (2) direction = N2 - BIGINT mu; // number of modes in z (3) direction = N3 - BIGINT N; // total # modes (prod of above three) - - BIGINT nf1; // size of internal fine grid in x (1) direction - BIGINT nf2; // " y (2) - BIGINT nf3; // " z (3) - BIGINT nf; // total # fine grid points (product of the above three) - - int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 - - FLT *phiHat1; // FT of kernel in t1,2, on x-axis mode grid - FLT *phiHat2; // " y-axis. - FLT *phiHat3; // " z-axis. - - CPX *fwBatch; // (batches of) fine grid(s) for FFTW to plan - // & act on. 
Usually the largest working array - - BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp - bool didSort; // whether binsorting used (false: identity perm used) - - FLT *X, *Y, *Z; // for t1,2: ptr to user-supplied NU pts (no new allocs). - // for t3: allocated as "primed" (scaled) src pts x'_j, etc - - // type 3 specific - FLT *S, *T, *U; // pointers to user's target NU pts arrays (no new allocs) - CPX *prephase; // pre-phase, for all input NU pts - CPX *deconv; // reciprocal of kernel FT, phase, all output NU pts - CPX *CpBatch; // working array of prephased strengths - FLT *Sp, *Tp, *Up; // internal primed targs (s'_k, etc), allocated - type3params t3P; // groups together type 3 shift, scale, phase, parameters - FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3 - - // other internal structs - std::unique_ptr> fftPlan; - finufft_opts opts; // this and spopts could be made ptrs - finufft_spread_opts spopts; -}; +struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T {}; #endif // DEFS_H diff --git a/include/finufft/fft.h b/include/finufft/fft.h index bab43966c..c6d5de7a5 100644 --- a/include/finufft/fft.h +++ b/include/finufft/fft.h @@ -171,19 +171,22 @@ template<> struct Finufft_FFT_plan { #endif -#include +#include static inline void finufft_fft_forget_wisdom [[maybe_unused]] () { - Finufft_FFT_plan::forget_wisdom(); + Finufft_FFT_plan::forget_wisdom(); + Finufft_FFT_plan::forget_wisdom(); } static inline void finufft_fft_cleanup [[maybe_unused]] () { - Finufft_FFT_plan::cleanup(); + Finufft_FFT_plan::cleanup(); + Finufft_FFT_plan::cleanup(); } static inline void finufft_fft_cleanup_threads [[maybe_unused]] () { - Finufft_FFT_plan::cleanup_threads(); + Finufft_FFT_plan::cleanup_threads(); + Finufft_FFT_plan::cleanup_threads(); } - -std::vector gridsize_for_fft(FINUFFT_PLAN p); -void do_fft(FINUFFT_PLAN p); +template struct FINUFFT_PLAN_T; +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); +template void do_fft(FINUFFT_PLAN_T *p); #endif // FINUFFT_INCLUDE_FINUFFT_FFT_H diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h new file mode 100644 index 000000000..afc6ef864 --- /dev/null +++ b/include/finufft/finufft_core.h @@ -0,0 +1,210 @@ +#ifndef FINUFFT_CORE_H +#define FINUFFT_CORE_H + +/* IMPORTANT: for Windows compilers, you should add a line + #define FINUFFT_DLL + here if you are compiling/using FINUFFT as a DLL, + in order to do the proper importing/exporting, or + alternatively compile with -DFINUFFT_DLL or the equivalent + command-line flag. This is not necessary under MinGW/Cygwin, where + libtool does the imports/exports automatically. + Alternatively use include(GenerateExportHeader) and + generate_export_header(finufft) to auto generate an header containing + these defines.The main reason is that if msvc changes the way it deals + with it in the future we just need to update cmake for it to work + instead of having a check on the msvc version. */ +#if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__)) +#if defined(dll_EXPORTS) +#define FINUFFT_EXPORT __declspec(dllexport) +#else +#define FINUFFT_EXPORT __declspec(dllimport) +#endif +#else +#define FINUFFT_EXPORT +#endif + +/* specify calling convention (Windows only) + The cdecl calling convention is actually not the default in all but a very + few C/C++ compilers. + If the user code changes the default compiler calling convention, may need + this when generating DLL. 
*/ +#if defined(_WIN32) || defined(__WIN32__) +#define FINUFFT_CDECL __cdecl +#else +#define FINUFFT_CDECL +#endif + +// inline macro, to force inlining of small functions +// this avoids the use of macros to implement functions +#if defined(_MSC_VER) +#define FINUFFT_ALWAYS_INLINE __forceinline inline +#define FINUFFT_NEVER_INLINE __declspec(noinline) +#define FINUFFT_RESTRICT __restrict +#define FINUFFT_UNREACHABLE __assume(0) +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#elif defined(__GNUC__) || defined(__clang__) +#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#define FINUFFT_NEVER_INLINE __attribute__((noinline)) +#define FINUFFT_RESTRICT __restrict__ +#define FINUFFT_UNREACHABLE __builtin_unreachable() +#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) +#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define FINUFFT_ALWAYS_INLINE inline +#define FINUFFT_NEVER_INLINE +#define FINUFFT_RESTRICT +#define FINUFFT_UNREACHABLE +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#endif + +#include +#include + +// All indexing in library that potentially can exceed 2^31 uses 64-bit signed. +// This includes all calling arguments (eg M,N) that could be huge someday. +using BIGINT = int64_t; +using UBIGINT = uint64_t; + +// ------------- Library-wide algorithm parameter settings ---------------- + +// Library version (is a string) +#define FINUFFT_VER "2.3.0" + +// Smallest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MIN_NSPREAD = 2; + +// Largest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MAX_NSPREAD = 16; + +// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 +inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; + +// Max number of positive quadr nodes for kernel FT (used only in common.cpp) +inline constexpr int MAX_NQUAD = 100; + +// Internal (nf1 etc) array allocation size that immediately raises error. +// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) +// Increase this if you need >10TB (!) RAM... +inline constexpr BIGINT MAX_NF = BIGINT(1e12); + +// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 +// values for M = nj (also nk in type 3)... +inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); + +// ----- OpenMP macros which also work when omp not present ----- +// Allows compile-time switch off of openmp, so compilation without any openmp +// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) +#ifdef _OPENMP +#include +// point to actual omp utils +static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { + return omp_get_num_threads(); +} +static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { + return omp_get_max_threads(); +} +static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { + return omp_get_thread_num(); +} +static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { + omp_set_num_threads(x); +} +#else +// non-omp safe dummy versions of omp utils... 
+static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; } +static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; } +static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; } +static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} +#endif + +#include // (must come after complex.h) +#include +#include + +// group together a bunch of type 3 rescaling/centering/phasing parameters: +template struct type3params { + T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale + T X2, C2, D2, h2, gam2; // y + T X3, C3, D3, h3, gam3; // z +}; + +template struct FINUFFT_PLAN_T { // the main plan object, fully C++ + + using TC = std::complex; + + // These default and delete specifications just state the obvious, + // but are here to silence compiler warnings. + FINUFFT_PLAN_T() = default; + // Copy construction and assignent are already deleted implicitly + // because of the unique_ptr member. + FINUFFT_PLAN_T(const FINUFFT_PLAN_T &) = delete; + FINUFFT_PLAN_T &operator=(const FINUFFT_PLAN_T &) = delete; + ~FINUFFT_PLAN_T(); + + int type; // transform type (Rokhlin naming): 1,2 or 3 + int dim; // overall dimension: 1,2 or 3 + int ntrans; // how many transforms to do at once (vector or "many" mode) + BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) + BIGINT nk; // number of NU freq pts (type 3 only) + TF tol; // relative user tolerance + int batchSize; // # strength vectors to group together for FFTW, etc + int nbatch; // how many batches done to cover all ntrans vectors + + BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 + BIGINT mt; // number of modes in y (2) direction = N2 + BIGINT mu; // number of modes in z (3) direction = N3 + BIGINT N; // total # modes (prod of above three) + + BIGINT nf1; // size of internal fine grid in x (1) direction + BIGINT nf2; // " y (2) + BIGINT nf3; // " z (3) + BIGINT nf; // total # fine grid points (product of the above three) + + int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 + + TF *phiHat1 = nullptr; // FT of kernel in t1,2, on x-axis mode grid + TF *phiHat2 = nullptr; // " y-axis. + TF *phiHat3 = nullptr; // " z-axis. + + TC *fwBatch = nullptr; // (batches of) fine grid(s) for FFTW to plan + // & act on. Usually the largest working array + + BIGINT *sortIndices = nullptr; // precomputed NU pt permutation, speeds spread/interp + bool didSort; // whether binsorting used (false: identity perm used) + + TF *X = nullptr, *Y = nullptr, *Z = nullptr; // for t1,2: ptr to user-supplied NU pts + // (no new allocs). 
for t3: allocated as + // "primed" (scaled) src pts x'_j, etc + + // type 3 specific + TF *S = nullptr, *T = nullptr, *U = nullptr; // pointers to user's target NU pts arrays + // (no new allocs) + TC *prephase = nullptr; // pre-phase, for all input NU pts + TC *deconv = nullptr; // reciprocal of kernel FT, phase, all output NU pts + TC *CpBatch = nullptr; // working array of prephased strengths + TF *Sp = nullptr, *Tp = nullptr, *Up = nullptr; // internal primed targs (s'_k, etc), + // allocated + type3params t3P; // groups together type 3 shift, scale, phase, parameters + FINUFFT_PLAN_T *innerT2plan = nullptr; // ptr used for type 2 in step 2 of type 3 + + // other internal structs + std::unique_ptr> fftPlan; + finufft_opts opts; // this and spopts could be made ptrs + finufft_spread_opts spopts; +}; + +void finufft_default_opts_t(finufft_opts *o); +template +int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts); +template +int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, + TF *s, TF *t, TF *u); +template +int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk); + +#endif // FINUFFT_CORE_H diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 56d705563..779101be8 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -7,7 +7,6 @@ #ifndef SPREADINTERP_H #define SPREADINTERP_H -#include #include /* Bitwise debugging timing flag (TF) defs; see finufft_spread_opts.flags. diff --git a/include/finufft/utils.h b/include/finufft/utils.h index b4fe64681..132fafb53 100644 --- a/include/finufft/utils.h +++ b/include/finufft/utils.h @@ -4,7 +4,7 @@ #ifndef UTILS_H #define UTILS_H -#include "finufft/defs.h" +#include "finufft/finufft_core.h" namespace finufft { namespace utils { diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h index 0504bb8df..8dd3839d8 100644 --- a/include/finufft/utils_precindep.h +++ b/include/finufft/utils_precindep.h @@ -4,9 +4,9 @@ #ifndef UTILS_PRECINDEP_H #define UTILS_PRECINDEP_H -#include "defs.h" -// for CNTime... -// using chrono since the interface is portable between linux and windows +// #include "defs.h" +// for CNTime... +// using chrono since the interface is portable between linux and windows #include namespace finufft { diff --git a/makefile b/makefile index 85a6a9a78..9d3e4c29c 100644 --- a/makefile +++ b/makefile @@ -31,7 +31,7 @@ PYTHON = python3 # they allow gcc to vectorize the code more effectively CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast\ -fno-math-errno -fno-signed-zeros -fno-trapping-math -fassociative-math\ - -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) + -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) -Wfatal-errors FFLAGS := $(CFLAGS) $(FFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) # FFTW base name, and math linking... @@ -138,17 +138,17 @@ ABSDYNLIB = $(FINUFFT)$(DYNLIB) SOBJS = # their single-prec versions SOBJSF = $(SOBJS:%.o=%_32.o) -# precision-dependent spreader object files (compiled & linked only once)... +# precision-independent spreader object files (compiled & linked only once)... SOBJS_PI = src/utils_precindep.o src/spreadinterp.o # spreader dual-precision objs SOBJSD = $(SOBJS) $(SOBJSF) $(SOBJS_PI) # double-prec library object files that also need single precision... 
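Note (illustrative, not part of the diff): taken together with the defs.h change earlier (FINUFFT_PLAN_S now derives from the templated plan), the new finufft_core.h makes the templated plan struct the single owner of all plan state. A self-contained sketch of that opaque-handle pattern, with simplified stand-in names:

#include <cstdio>

// All state lives in the templated base; the per-precision struct is empty, so
// code holding a pointer to it (the public opaque handle) keeps compiling while
// both precisions share one implementation. Names below are illustrative only.
template<typename TF> struct plan_t {
  int dim = 1;
  TF tol = TF(1e-6);
};
struct plan_double : plan_t<double> {};   // plays the role of FINUFFT_PLAN_S

int main() {
  plan_double handle;               // what a user-facing pointer refers to
  plan_t<double> *core = &handle;   // what the templated core works with
  core->tol = 1e-9;
  std::printf("dim=%d tol=%g\n", handle.dim, handle.tol);
}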
-OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o src/fft.o +OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o # their single-prec versions OBJSF = $(OBJS:%.o=%_32.o) -# precision-dependent library object files (compiled & linked only once)... -OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o +# precision-independent library object files (compiled & linked only once)... +OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) $(DUCC_OBJS) @@ -435,7 +435,7 @@ endif # python --------------------------------------------------------------------- python: $(STATICLIB) $(DYNLIB) - FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install python/finufft + FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install --break-system-packages python/finufft # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps $(PYTHON) python/finufft/test/run_accuracy_tests.py $(PYTHON) python/finufft/examples/simple1d1.py diff --git a/src/fft.cpp b/src/fft.cpp index bb7e32442..3a7fbf2f6 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -7,7 +7,7 @@ using namespace std; #include "ducc0/fft/fftnd_impl.h" #endif -std::vector gridsize_for_fft(FINUFFT_PLAN p) { +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p) { // local helper func returns a new int array of length dim, extracted from // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. if (p->dim == 1) return {(int)p->nf1}; @@ -15,8 +15,10 @@ std::vector gridsize_for_fft(FINUFFT_PLAN p) { // if (p->dim == 3) return {(int)p->nf3, (int)p->nf2, (int)p->nf1}; } +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); -void do_fft(FINUFFT_PLAN p) { +template void do_fft(FINUFFT_PLAN_T *p) { #ifdef FINUFFT_USE_DUCC0 size_t nthreads = min(MY_OMP_GET_MAX_THREADS(), p->opts.nthreads); const auto ns = gridsize_for_fft(p); @@ -106,3 +108,5 @@ void do_fft(FINUFFT_PLAN p) { p->fftPlan->execute(); // if thisBatchSize(FINUFFT_PLAN_T *p); +template void do_fft(FINUFFT_PLAN_T *p); diff --git a/src/finufft.cpp b/src/finufft.cpp index 96e986a5d..fddb3fb6e 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -4,1182 +4,25 @@ // private headers for lib build // (must come after finufft.h which clobbers FINUFFT* macros) #include -#include -#include -#include -#include -#include "../contrib/legendre_rule_fast.h" -#include -#include -#include -#include -#include -#include -#include +void FINUFFT_DEFAULT_OPTS(finufft_opts *o) { finufft_default_opts_t(o); } -using namespace std; -using namespace finufft; -using namespace finufft::utils; -using namespace finufft::spreadinterp; -using namespace finufft::quadrature; - -/* Computational core for FINUFFT. - - Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus - 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. - Original guru interface written by Andrea Malleo, summer 2019, mentored - by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. - - As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions - and the two finufft2d?many() functions. The (now 18) simple C++ interfaces - are in simpleinterfaces.cpp. - -Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: - - TYPE 1: - The type 1 NUFFT proceeds in three main steps: - 1) spread data to oversampled regular mesh using kernel. 
- 2) compute FFT on uniform mesh - 3) deconvolve by division of each Fourier mode independently by the kernel - Fourier series coeffs (not merely FFT of kernel), shuffle to output. - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 2: - The type 2 algorithm proceeds in three main steps: - 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff - 2) compute inverse FFT on uniform fine grid - 3) spread (dir=2, ie interpolate) data to regular mesh - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 3: - The type 3 algorithm is basically a type 2 (which is implemented precisely - as call to type 2) replacing the middle FFT (Step 2) of a type 1. - Beyond this, the new twists are: - i) nf1, number of upsampled points for the type-1, depends on the product - of interval widths containing input and output points (X*S). - ii) The deconvolve (post-amplify) step is division by the Fourier transform - of the scaled kernel, evaluated on the *nonuniform* output frequency - grid; this is done by direct approximation of the Fourier integral - using quadrature of the kernel function times exponentials. - iii) Shifts in x (real) and s (Fourier) are done to minimize the interval - half-widths X and S, hence nf1. - - MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): - maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so - this is good only for small problems. - - -Design notes for guru interface implementation: - -* Since finufft_plan is C-compatible, we need to use malloc/free for its - allocatable arrays, keeping it quite low-level. We can't use std::vector - since that would only survive in the scope of each function. - -* Thread-safety: FINUFFT plans are passed as pointers, so it has no global - state apart from that associated with FFTW (and the did_fftw_init). -*/ - -// ---------- local math routines (were in common.cpp; no need now): -------- - -namespace finufft { -namespace common { - -static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf) -// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts -// and requested number of Fourier modes ms. Returns 0 if success, else an -// error code if nf was unreasonably big (& tell the world). -{ - *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial - if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails - if (*nf < MAX_NF) { - *nf = next235even(*nf); // expensive at huge nf - return 0; - } else { - fprintf(stderr, - "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " - "malloc\n", - __func__, (double)*nf, (double)MAX_NF); - return FINUFFT_ERR_MAXNALLOC; - } -} - -template -static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, - int dim) -// Set up the spreader parameters given eps, and pass across various nufft -// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 -{ - // this calls spreadinterp.cpp... - int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, - opts.spread_debug, opts.showwarn, dim); - // override various spread opts from their defaults... - spopts.debug = opts.spread_debug; - spopts.sort = opts.spread_sort; // could make dim or CPU choices here? 
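Note (illustrative, not part of the diff): a worked example of the set_nf_type12 rule shown above -- nf = upsampfac*ms, clamped to at least 2*nspread, then rounded up to the next even 2,3,5-smooth integer. The brute-force next235even here is only a stand-in for the library's version in utils, and the inputs are made up.

#include <cstdint>
#include <cstdio>
using BIGINT = std::int64_t;

// Brute-force stand-in for the library's next235even; illustration only.
static bool is235(BIGINT n) {
  for (BIGINT p : {2, 3, 5})
    while (n % p == 0) n /= p;
  return n == 1;
}
static BIGINT next235even(BIGINT n) {
  if (n % 2) ++n;                 // must be even
  while (!is235(n)) n += 2;       // smallest even 2,3,5-smooth value >= n
  return n;
}

int main() {
  BIGINT ms = 1000; double upsampfac = 2.0; int nspread = 8; // made-up inputs
  BIGINT nf = BIGINT(upsampfac * double(ms));   // 2000
  if (nf < 2 * nspread) nf = 2 * nspread;       // never below kernel support
  nf = next235even(nf);                         // 2000 = 2^4 * 5^3, already smooth
  std::printf("nf = %lld\n", static_cast<long long>(nf));
}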
- spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) - spopts.chkbnds = opts.chkbnds; - spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here - if (opts.spread_nthr_atomic >= 0) // overrides - spopts.atomic_threshold = opts.spread_nthr_atomic; - if (opts.spread_max_sp_size > 0) // overrides - spopts.max_subproblem_size = opts.spread_max_sp_size; - if (opts.chkbnds != 1) // deprecated default value hardcoded here - fprintf(stderr, - "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", - __func__); - return ier; -} - -template -static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, T *h, T *gam) -/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), - for type 3 only. - Inputs: - X and S are the xj and sk interval half-widths respectively. - opts and spopts are the NUFFT and spreader opts strucs, respectively. - Outputs: - nf is the size of upsampled grid for a given single dimension. - h is the grid spacing = 2pi/nf - gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). - Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 - New logic 6/12/17 -*/ -{ - int nss = spopts.nspread + 1; // since ns may be odd - T Xsafe = X, Ssafe = S; // may be tweaked locally - if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 - if (S == 0.0) { - Xsafe = 1.0; - Ssafe = 1.0; - } else - Xsafe = max(Xsafe, 1 / S); - else - Ssafe = max(Ssafe, 1 / X); - // use the safe X and S... - auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); - if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf - *nf = (BIGINT)nfd; - // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); - // catch too small nf, and nan or +-inf, otherwise spread fails... - if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; - if (*nf < MAX_NF) // otherwise will fail anyway - *nf = next235even(*nf); // expensive at huge nf - *h = T(2.0 * PI / *nf); // upsampled grid spacing - *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' -} - -template -static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) -/* - Approximates exact Fourier series coeffs of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Uses phase winding for cheap eval on the regular freq - grid. Note that this is also the Fourier transform of the non-periodized - kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an - overall prefactor of 1/h, which is needed anyway for the correction, and - arises because the quadrature weights are scaled for grid units not x units. - The kernel is actually centered at nf/2, related to the centering of the grid; - this is now achieved by the sign flip in a[n] below. - - Inputs: - nf - size of 1d uniform spread grid, must be even. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, - divided by h = 2pi/n. - (should be allocated for at least nf/2+1 Ts) - - Compare onedim_dct_kernel which has same interface, but computes DFT of - sampled kernel, not quite the same object. - - Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. - Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. 
- */ -{ - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - T f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - std::complex a[MAX_NQUAD]; - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = -exp(2 * PI * IMA * (T)z[n] / (T)nf); // phase winding rates - } - BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads - brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); -#pragma omp parallel num_threads(nt) - { // each thread gets own chunk to do - int t = MY_OMP_GET_THREAD_NUM(); - std::complex aj[MAX_NQUAD]; // phase rotator for this thread - for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk - for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array - T x = 0.0; // accumulator for answer at this j - for (int n = 0; n < q; ++n) { - x += f[n] * 2 * real(aj[n]); // include the negative freq - aj[n] *= a[n]; // wind the phases - } - fwkerhalf[j] = x; - } - } -} - -template -static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts opts) -/* - Approximates exact 1D Fourier transform of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), - for a kernel with x measured in grid-spacings. (See previous routine for - FT definition). - - Inputs: - nk - number of freqs - k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) - - Barnett 2/8/17. openmp since cos slow 2/9/17 - */ -{ - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); - T f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - for (int n = 0; n < q; ++n) { - z[n] *= (T)J2; // quadr nodes for [0,J/2] - f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights - } -#pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j = 0; j < nk; ++j) { // loop along output array - T x = 0.0; // register - for (int n = 0; n < q; ++n) - x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! - phihat[j] = x; - } -} - -template -static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIGINT nf1, - std::complex *fw, int modeord) -/* - if dir==1: copies fw to fk with amplification by prefac/ker - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. 
- - modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) - 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). - - fk is a size-ms T complex array (2*ms Ts alternating re,im parts) - fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts) - ker is real-valued T array of length nf1/2+1. - - Single thread only, but shouldn't matter since mostly data movement. - - It has been tested that the repeated floating division in this inner loop - only contributes at the <3% level in 3D relative to the FFT cost (8 threads). - This could be removed by passing in an inverse kernel and doing mults. - - todo: rewrite w/ C++-complex I/O, check complex divide not slower than - real divide, or is there a way to force a real divide? - - Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 -*/ -{ - BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices - if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (kmax + 1); - } // or, instead, FFT ordering - if (dir == 1) { // read fw, write out to fk... - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fk[pp++] = prefac * fw[k].real() / ker[k]; // re - fk[pp++] = prefac * fw[k].imag() / ker[k]; // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re - fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im - } - } else { // read fk, write out to fw w/ zero padding... - for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where - // needed - fw[k] = 0.0; - } - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fw[k].real(prefac * fk[pp++] / ker[k]); // re - fw[k].imag(prefac * fk[pp++] / ker[k]); // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re - fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im - } - } -} - -template -static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, BIGINT mt, - T *fk, BIGINT nf1, BIGINT nf2, std::complex *fw, - int modeord) -/* - 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. - - if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with - ms looped over fast and mt slow. - fw is a complex array stored as 2*nf1*nf2] Ts alternating re,im parts, with - nf1 looped over fast and nf2 slow. - ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1 - respectively. - - Barnett 2/1/17, Fixed mt=0 case 3/14/17. 
modeord 10/25/17 -*/ -{ - BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices - if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k2max + 1) * ms; - } // or, instead, FFT ordering - if (dir == 2) // zero pad needed x-lines (contiguous in memory) - for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all - // dims - fw[j] = 0.0; - for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs - // point fk and fw to the start of this y value's row (2* is for complex): - common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, - &fw[nf1 * k2], modeord); - for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs - common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, - &fw[nf1 * (nf2 + k2)], modeord); -} - -template -static void deconvolveshuffle3d(int dir, T prefac, T *ker1, T *ker2, T *ker3, BIGINT ms, - BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, - BIGINT nf3, std::complex *fw, int modeord) -/* - 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. - - if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with - ms looped over fastest and mu slowest. - fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with - nf1 looped over fastest and nf3 slowest. - ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, - and nf3/2+1 respectively. - - Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 -*/ -{ - BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices - if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k3max + 1) * ms * mt; - } // or FFT ordering - BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane - if (dir == 2) // zero pad needed xy-planes (contiguous in memory) - for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims - fw[j] = 0.0; - for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs - // point fk and fw to the start of this z value's plane (2* is for complex): - common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, - nf2, &fw[np * k3], modeord); - for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs - common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, - nf2, &fw[np * (nf3 + k3)], modeord); -} - -// --------- batch helper functions for t1,2 exec: --------------------------- - -template -static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, std::complex *cBatch) -/* - Spreads (or interpolates) a batch of batchSize strength vectors in cBatch - to (or from) the batch of fine working grids p->fwBatch, using the same set of - (index-sorted) NU points p->X,Y,Z for each vector in the batch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - Returns 0 (no error reporting for now). - Notes: - 1) cBatch is already assumed to have the correct offset, ie here we - read from the start of cBatch (unlike Malleo). fwBatch also has zero offset - 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp - Barnett 5/19/20, based on Malleo 2019. -*/ -{ - // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. - // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. - // But when nthr_outer=1 here, omp par inside the loop sees all threads... -#ifdef _OPENMP - int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; -#endif -#pragma omp parallel for num_threads(nthr_outer) - for (int i = 0; i < batchSize; i++) { - std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch - spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, - p->Y, p->Z, (T *)ci, p->spopts, p->didSort); - } - return 0; -} - -template -static int deconvolveBatch(int batchSize, FINUFFT_PLAN p, std::complex *fkBatch) -/* - Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch - into each output array fk in fkBatch. - Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, - again looping over fk in fkBatch and fw in p->fwBatch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize - times. - Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) -*/ -{ - // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
-#pragma omp parallel for num_threads(batchSize) - for (int i = 0; i < batchSize; i++) { - std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch - - // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... - if (p->dim == 1) - deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki, - p->nf1, fwi, p->opts.modeord); - else if (p->dim == 2) - deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms, - p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); - else - deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, - p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2, - p->nf3, fwi, p->opts.modeord); - } - return 0; -} - -} // namespace common -} // namespace finufft - -// --------------- rest is the 5 user guru (plan) interface drivers: --------- -// (not namespaced since have safe names finufft{f}_* ) -using namespace finufft::common; // accesses routines defined above - -// Marco Barbone: 5.8.2024 -// These are user-facing. -// The various options could be macros to follow c standard library conventions. -// Question: would these be enums? - -// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO -void FINUFFT_DEFAULT_OPTS(finufft_opts *o) -// Sets default nufft opts (referenced by all language interfaces too). -// See finufft_opts.h for meanings. -// This was created to avoid uncertainty about C++11 style static initialization -// when called from MEX, but now is generally used. Barnett 10/30/17 onwards. -// Sphinx sucks the below code block into the web docs, hence keep it clean... -{ - // sphinx tag (don't remove): @defopts_start - o->modeord = 0; - o->chkbnds = 1; - - o->debug = 0; - o->spread_debug = 0; - o->showwarn = 1; - - o->nthreads = 0; -#ifdef FINUFFT_USE_DUCC0 - o->fftw = 0; -#else - o->fftw = FFTW_ESTIMATE; -#endif - o->spread_sort = 2; - o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; - o->spread_nthr_atomic = -1; - o->spread_max_sp_size = 0; - o->fftw_lock_fun = nullptr; - o->fftw_unlock_fun = nullptr; - o->fftw_lock_data = nullptr; - // sphinx tag (don't remove): @defopts_end -} - -// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP int FINUFFT_MAKEPLAN(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, - FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) -// Populates the fields of finufft_plan which is pointed to by "pp". -// opts is ptr to a finufft_opts to set options, or NULL to use defaults. -// For some of the fields (if "auto" selected) here choose the actual setting. 
-// For types 1,2 allocates memory for internal working arrays, -// evaluates spreading kernel coefficients, and instantiates the fftw_plan -{ - FINUFFT_PLAN p; - p = new FINUFFT_PLAN_S; // allocate fresh plan struct - *pp = p; // pass out plan as ptr to plan struct - - if (opts == NULL) // use default opts - FINUFFT_DEFAULT_OPTS(&(p->opts)); - else // or read from what's passed in - p->opts = *opts; // keep a deep copy; changing *opts now has no effect - - if (p->opts.debug) // do a hello world - printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", - __func__); - - p->fftPlan = std::make_unique>( - p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); - - if ((type != 1) && (type != 2) && (type != 3)) { - fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); - return FINUFFT_ERR_TYPE_NOTVALID; - } - if ((dim != 1) && (dim != 2) && (dim != 3)) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } - if (ntrans < 1) { - fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); - return FINUFFT_ERR_NTRANS_NOTVALID; - } - if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { - fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", - __func__); - return FINUFFT_ERR_LOCK_FUNS_INVALID; - ; - } - - // get stuff from args... - p->type = type; - p->dim = dim; - p->ntrans = ntrans; - p->tol = tol; - p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - - // choose overall # threads... -#ifdef _OPENMP - int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = ompmaxnthr; // default: use as many as OMP gives us - // (the above could be set, or suggested set, to 1 for small enough problems...) - if (p->opts.nthreads > 0) { - nthr = p->opts.nthreads; // user override, now without limit - if (p->opts.showwarn && (nthr > ompmaxnthr)) - fprintf(stderr, - "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " - "available; note large nthreads can be slower.\n", - __func__, nthr, ompmaxnthr); - } -#else - int nthr = 1; // always 1 thread (avoid segfault) - if (p->opts.nthreads > 1) - fprintf(stderr, - "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", - __func__, p->opts.nthreads); -#endif - p->opts.nthreads = nthr; // store actual # thr planned for - // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) - - // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) - if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize - p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss - p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b - } else { // batchSize override by user - p->batchSize = min(p->opts.maxbatchsize, ntrans); - p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches - } - if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice - if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { - fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); - return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; - } - - if (type != 3) { // read in user Fourier mode array sizes... - p->ms = n_modes[0]; - p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims - p->mu = (dim > 2) ? n_modes[2] : 1; - p->N = p->ms * p->mt * p->mu; // N = total # modes - } - - // heuristic to choose default upsampfac... 
(currently two poss) - if (p->opts.upsampfac == 0.0) { // indicates auto-choose - p->opts.upsampfac = 2.0; // default, and need for tol small - if (tol >= (FLT)1E-9) { // the tol sigma=5/4 can reach - if (type == 3) // could move to setpts, more known? - p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT - else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || - (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, - // typ tol, 12-core xeon - p->opts.upsampfac = 1.25; - } - if (p->opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); - } - // use opts to choose and write into plan's spread options... - int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); - if (ier > 1) // proceed if success or warning - return ier; - - // set others as defaults (or unallocated for arrays)... - p->X = NULL; - p->Y = NULL; - p->Z = NULL; - p->phiHat1 = NULL; - p->phiHat2 = NULL; - p->phiHat3 = NULL; - p->nf1 = 1; - p->nf2 = 1; - p->nf3 = 1; // crucial to leave as 1 for unused dims - p->sortIndices = NULL; // used in all three types - - // ------------------------ types 1,2: planning needed --------------------- - if (type == 1 || type == 2) { - - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. - - p->spopts.spread_direction = type; - - if (p->opts.showwarn) { // user warn round-off error... - if (EPSILON * p->ms > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->ms)); - if (EPSILON * p->mt > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mt)); - if (EPSILON * p->mu > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mu)); - } - - // determine fine grid sizes, sanity check.. - int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); - if (nfier) return nfier; // nf too big; we're done - p->phiHat1 = (FLT *)malloc(sizeof(FLT) * (p->nf1 / 2 + 1)); - if (dim > 1) { - nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); - if (nfier) return nfier; - p->phiHat2 = (FLT *)malloc(sizeof(FLT) * (p->nf2 / 2 + 1)); - } - if (dim > 2) { - nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); - if (nfier) return nfier; - p->phiHat3 = (FLT *)malloc(sizeof(FLT) * (p->nf3 / 2 + 1)); - } - - if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
- printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " - "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " - "batchSize=%d ", - __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, - (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, - p->batchSize); - if (p->batchSize == 1) // spread_thread has no effect in this case - printf("\n"); - else - printf(" spread_thread=%d\n", p->opts.spread_thread); - } - - // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim - CNTime timer; - timer.start(); - onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); - if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); - if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); - if (p->opts.debug) - printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, - timer.elapsedsec()); - - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 - return FINUFFT_ERR_MAXNALLOC; - } - - timer.restart(); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) - printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec()); - if (!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", - __func__); - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - return FINUFFT_ERR_ALLOC; - } - - timer.restart(); // plan the FFTW - const auto ns = gridsize_for_fft(p); - p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); - if (p->opts.debug) - printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, - nthr_fft, timer.elapsedsec()); - - } else { // -------------------------- type 3 (no planning) ------------ - - if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); - // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; - p->Sp = NULL; - p->Tp = NULL; - p->Up = NULL; - p->prephase = NULL; - p->deconv = NULL; - p->innerT2plan = NULL; - // Type 3 will call finufft_makeplan for type 2; no need to init FFTW - // Note we don't even know nj or nk yet, so can't do anything else! - } - return ier; // report setup_spreader status (could be warning) + FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); } -// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk, - FLT *s, FLT *t, FLT *u) -/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for - spreading. (The last 4 arguments are ignored.) - For type 3: allocates internal working arrays, scales/centers the NU points - and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
-*/ -{ - int d = p->dim; // abbrev for spatial dim - CNTime timer; - timer.start(); - p->nj = nj; // the user only now chooses how many NU (x,y,z) pts - if (nj < 0) { - fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nj > MAX_NU_PTS) { - fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - - if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - p->X = xj; // plan must keep pointers to user's fixed NU pts - p->Y = yj; - p->Z = zj; - int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug > 1) - printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, - timer.elapsedsec()); - if (ier) // no warnings allowed here - return ier; - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = - indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug) - printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - } else { // ------------------------- TYPE 3 SETPTS ----------------------- - // (here we can precompute pre/post-phase factors and plan the t2) - - if (nk < 0) { - fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nk > MAX_NU_PTS) { - fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - p->nk = nk; // user set # targ freq pts - p->S = s; // keep pointers to user's input target pts - p->T = t; - p->U = u; - - // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... - FLT S1, S2, S3; // get half-width X, center C, which contains {x_j}... - arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); - arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} - set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), - &(p->t3P.gam1)); // applies twist i) - p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc - p->t3P.D2 = 0.0; - if (d > 1) { - arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} - arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} - set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), - &(p->t3P.gam2)); - } - p->t3P.C3 = 0.0; - p->t3P.D3 = 0.0; - if (d > 2) { - arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} - arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} - set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), - &(p->t3P.gam3)); - } - - if (p->opts.debug) { // report on choices of shifts, centers, etc... 
- printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, - p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); - if (d > 1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, - p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); - if (d > 2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, - p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); - } - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - return FINUFFT_ERR_MAXNALLOC; - } - p->fftPlan->free(p->fwBatch); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace - - // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if (p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX *)malloc(sizeof(CPX) * nj * p->batchSize); // batch c' work - - if (p->opts.debug) - printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * (p->nf + nj) * p->batchSize, - timer.elapsedsec()); - if (!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); - return FINUFFT_ERR_ALLOC; - } - // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); - - // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... - // FIXME: should use realloc - if (p->X) free(p->X); - if (p->Sp) free(p->Sp); - p->X = (FLT *)malloc(sizeof(FLT) * nj); - p->Sp = (FLT *)malloc(sizeof(FLT) * nk); - if (d > 1) { - if (p->Y) free(p->Y); - if (p->Tp) free(p->Tp); - p->Y = (FLT *)malloc(sizeof(FLT) * nj); - p->Tp = (FLT *)malloc(sizeof(FLT) * nk); - } - if (d > 2) { - if (p->Z) free(p->Z); - if (p->Up) free(p->Up); - p->Z = (FLT *)malloc(sizeof(FLT) * nj); - p->Up = (FLT *)malloc(sizeof(FLT) * nk); - } - - // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... - FLT ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim - if (d > 1) ig2 = 1.0 / p->t3P.gam2; - if (d > 2) ig3 = 1.0 / p->t3P.gam3; -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { - p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j - if (d > 1) // (ok to do inside loop because of branch predict) - p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j - if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j - } - - // set up prephase array... - CPX imasign = (p->fftSign >= 0) ? IMA : -IMA; // +-i - if (p->prephase) free(p->prephase); - p->prephase = (CPX *)malloc(sizeof(CPX) * nj); - if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs - FLT phase = p->t3P.D1 * xj[j]; - if (d > 1) phase += p->t3P.D2 * yj[j]; - if (d > 2) phase += p->t3P.D3 * zj[j]; - p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler - // e^{+-i.phase} - } - } else - for (BIGINT j = 0; j < nj; ++j) - p->prephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? - - // rescale the target s_k etc to s'_k etc... 
-#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { - p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R - if (d > 1) - p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < - // pi/R - if (d > 2) - p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < - // pi/R - } - // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... - // (exploits that FT separates because kernel is prod of 1D funcs) - if (p->deconv) free(p->deconv); - p->deconv = (CPX *)malloc(sizeof(CPX) * nk); - FLT *phiHatk1 = (FLT *)malloc(sizeof(FLT) * nk); // don't confuse w/ p->phiHat - onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 - FLT *phiHatk2 = NULL, *phiHatk3 = NULL; - if (d > 1) { - phiHatk2 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 - } - if (d > 2) { - phiHatk3 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 - } - int Cfinite = - isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan - // or inf if - // M=0, no - // input NU pts - int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs - FLT phiHat = phiHatk1[k]; - if (d > 1) phiHat *= phiHatk2[k]; - if (d > 2) phiHat *= phiHatk3[k]; - p->deconv[k] = (CPX)(1.0 / phiHat); - if (Cfinite && Cnonzero) { - FLT phase = (s[k] - p->t3P.D1) * p->t3P.C1; - if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; - if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; - p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} - } - } - free(phiHatk1); - free(phiHatk2); - free(phiHatk3); // done w/ deconv fill - if (p->opts.debug) - printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); - - // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, - p->Z, p->spopts); - if (p->opts.debug) - printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - // Plan and setpts once, for the (repeated) inner type 2 finufft call... - timer.restart(); - BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw - finufft_opts t2opts = p->opts; // deep copy, since not ptrs - t2opts.modeord = 0; // needed for correct t3! - t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail - t2opts.spread_debug = max(0, p->opts.spread_debug - 1); - t2opts.showwarn = 0; // so don't see warnings 2x - // (...could vary other t2opts here?) 
- if (p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); - int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, - &p->innerT2plan, &t2opts); - if (ier > 1) { // if merely warning, still proceed - fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", - __func__, ier); - return ier; - } - ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, - NULL); // note nk = # output points (not nj) - if (ier > 1) { - fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); - return ier; - } - if (p->opts.debug) - printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); - } - return 0; + FLT *s, FLT *t, FLT *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, + nk, s, t, u); } -// ............ end setpts .................................................. -// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { - /* See ../docs/cguru.doc for current documentation. - - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and FFT as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. -*/ - CNTime timer; - timer.start(); - - if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ - - double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing - if (p->opts.debug) - printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // current batch is either batchSize, or possibly truncated if last one - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; // index of vector, since batchsizes same - CPX *cjb = cj + bB * p->nj; // point to batch of weights - CPX *fkb = fk + bB * p->N; // point to batch of mode coeffs - if (p->opts.debug > 1) - printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 1: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } else { // type 2: amplify Fourier coeffs fk into 0-padded fw - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } - - // STEP 2: call the FFT on this batch - timer.restart(); - do_fft(p); - t_fft += timer.elapsedsec(); - if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec()); - - // STEP 3: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } else { // type 2: interpolate unif fw grid to NU target pts - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - if (p->type == 1) { - printf("[%s] done. 
tot spread:\t\t%.3g s\n", __func__, t_sprint); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); - } else { - printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot interp:\t\t\t%.3g s\n", t_sprint); - } - } - } - - else { // ----------------------------- TYPE 3 EXEC --------------------- - - // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long - // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - - double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, - t_deconv = 0.0; // accumulated timings - if (p->opts.debug) - printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // batching and pointers to this batch, identical to t1,2 above... - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; - CPX *cjb = cj + bB * p->nj; // batch of input strengths - CPX *fkb = fk + bB * p->nk; // batch of output strengths - if (p->opts.debug > 1) - printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nj; - for (BIGINT j = 0; j < p->nj; ++j) { - p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; - } - } - t_pre += timer.elapsedsec(); - - // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... - timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed - t_spr += timer.elapsedsec(); - - // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... - timer.restart(); - // illegal possible shrink of ntrans *after* plan for smaller last batch: - p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! - /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array - still the same size, as Andrea explained; just wastes a few flops) */ - FINUFFT_EXECUTE(p->innerT2plan, fkb, p->fwBatch); - t_t2 += timer.elapsedsec(); - // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nk; - for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k]; - } - t_deconv += timer.elapsedsec(); - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); - printf(" tot spread:\t\t\t%.3g s\n", t_spr); - printf(" tot type 2:\t\t\t%.3g s\n", t_t2); - printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); - } - } - // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long - // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug - - return 0; + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); } -// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD int FINUFFT_DESTROY(FINUFFT_PLAN p) // Free everything we allocated inside of finufft_plan pointed to by p. // Also must not crash if called immediately after finufft_makeplan. 
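These thin wrappers keep the C-callable guru API stable while the actual work moves into the templated finufft_*_t core. For orientation, a minimal double-precision caller of that API could look like the sketch below; it assumes the public header finufft.h and the documented finufft_makeplan/setpts/execute/destroy entry points, with sizes, tolerance, and error handling kept illustrative.

#include <finufft.h>
#include <complex>
#include <cstdint>
#include <vector>

// Type 1, 1D, double precision: strengths c_j at NU points x_j -> N1 Fourier modes f_k.
int demo_type1_1d() {
  const int64_t M = 1000, N1 = 256;                    // # NU points, # modes
  std::vector<double> x(M, 0.5);                       // NU points, e.g. in [-pi, pi)
  std::vector<std::complex<double>> c(M, std::complex<double>(1.0, 0.0)), f(N1);
  int64_t n_modes[3] = {N1, 1, 1};
  finufft_opts opts;
  finufft_default_opts(&opts);                         // then tweak, e.g. opts.nthreads = 4
  finufft_plan plan = nullptr;
  int ier = finufft_makeplan(1, 1, n_modes, +1, 1, 1e-9, &plan, &opts);
  if (ier > 1) return ier;                             // >1 is an error; 1 is only a warning
  ier = finufft_setpts(plan, M, x.data(), nullptr, nullptr, 0, nullptr, nullptr, nullptr);
  if (ier > 1) return ier;
  ier = finufft_execute(plan, c.data(), f.data());     // type 1: c is input, f is output
  finufft_destroy(plan);
  return ier;
}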
@@ -1189,24 +32,7 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) if (!p) // NULL ptr, so not a ptr to a plan, report error return 1; - p->fftPlan->free(p->fwBatch); // free the big FFTW (or t3 spread) working array - free(p->sortIndices); - if (p->type == 1 || p->type == 2) { - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - } else { // free the stuff alloc for type 3 only - FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code - free(p->CpBatch); - free(p->Sp); - free(p->Tp); - free(p->Up); - free(p->X); - free(p->Y); - free(p->Z); - free(p->prephase); - free(p->deconv); - } - delete p; + delete reinterpret_cast *>(p); + p = nullptr; return 0; // success } diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp new file mode 100644 index 000000000..70a52afa8 --- /dev/null +++ b/src/finufft_core.cpp @@ -0,0 +1,1237 @@ +#include +#include +#include +#include +#include + +#include "../contrib/legendre_rule_fast.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace finufft; +using namespace finufft::utils; +using namespace finufft::spreadinterp; +using namespace finufft::quadrature; + +/* Computational core for FINUFFT. + + Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus + 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. + Original guru interface written by Andrea Malleo, summer 2019, mentored + by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. + + As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions + and the two finufft2d?many() functions. The (now 18) simple C++ interfaces + are in simpleinterfaces.cpp. + +Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: + + TYPE 1: + The type 1 NUFFT proceeds in three main steps: + 1) spread data to oversampled regular mesh using kernel. + 2) compute FFT on uniform mesh + 3) deconvolve by division of each Fourier mode independently by the kernel + Fourier series coeffs (not merely FFT of kernel), shuffle to output. + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 2: + The type 2 algorithm proceeds in three main steps: + 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff + 2) compute inverse FFT on uniform fine grid + 3) spread (dir=2, ie interpolate) data to regular mesh + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 3: + The type 3 algorithm is basically a type 2 (which is implemented precisely + as call to type 2) replacing the middle FFT (Step 2) of a type 1. + Beyond this, the new twists are: + i) nf1, number of upsampled points for the type-1, depends on the product + of interval widths containing input and output points (X*S). + ii) The deconvolve (post-amplify) step is division by the Fourier transform + of the scaled kernel, evaluated on the *nonuniform* output frequency + grid; this is done by direct approximation of the Fourier integral + using quadrature of the kernel function times exponentials. + iii) Shifts in x (real) and s (Fourier) are done to minimize the interval + half-widths X and S, hence nf1. + + MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): + maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so + this is good only for small problems. + + +Design notes for guru interface implementation: + +* Since finufft_plan is C-compatible, we need to use malloc/free for its + allocatable arrays, keeping it quite low-level. 
We can't use std::vector + since that would only survive in the scope of each function. + +* Thread-safety: FINUFFT plans are passed as pointers, so it has no global + state apart from that associated with FFTW (and the did_fftw_init). +*/ + +// ---------- local math routines (were in common.cpp; no need now): -------- + +namespace finufft { +namespace common { + +static constexpr double PI = 3.14159265358979329; + +static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf) +// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts +// and requested number of Fourier modes ms. Returns 0 if success, else an +// error code if nf was unreasonably big (& tell the world). +{ + *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails + if (*nf < MAX_NF) { + *nf = next235even(*nf); // expensive at huge nf + return 0; + } else { + fprintf(stderr, + "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " + "malloc\n", + __func__, (double)*nf, (double)MAX_NF); + return FINUFFT_ERR_MAXNALLOC; + } +} + +template +static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, + int dim) +// Set up the spreader parameters given eps, and pass across various nufft +// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 +{ + // this calls spreadinterp.cpp... + int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, + opts.spread_debug, opts.showwarn, dim); + // override various spread opts from their defaults... + spopts.debug = opts.spread_debug; + spopts.sort = opts.spread_sort; // could make dim or CPU choices here? + spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) + spopts.chkbnds = opts.chkbnds; + spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here + if (opts.spread_nthr_atomic >= 0) // overrides + spopts.atomic_threshold = opts.spread_nthr_atomic; + if (opts.spread_max_sp_size > 0) // overrides + spopts.max_subproblem_size = opts.spread_max_sp_size; + if (opts.chkbnds != 1) // deprecated default value hardcoded here + fprintf(stderr, + "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", + __func__); + return ier; +} + +template +static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf, T *h, T *gam) +/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), + for type 3 only. + Inputs: + X and S are the xj and sk interval half-widths respectively. + opts and spopts are the NUFFT and spreader opts strucs, respectively. + Outputs: + nf is the size of upsampled grid for a given single dimension. + h is the grid spacing = 2pi/nf + gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). + Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 + New logic 6/12/17 +*/ +{ + int nss = spopts.nspread + 1; // since ns may be odd + T Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + if (S == 0.0) { + Xsafe = 1.0; + Ssafe = 1.0; + } else + Xsafe = max(Xsafe, 1 / S); + else + Ssafe = max(Ssafe, 1 / X); + // use the safe X and S... 
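Stepping back to set_nf_type12 above: the type-1/2 fine-grid size starts from upsampfac*ms, is clamped to at least 2*nspread so the spreading kernel fits on the grid, and is then rounded up to an even 2,3,5-smooth integer so the FFT stays fast. A self-contained sketch of that recipe follows; next235even here is a naive stand-in for the library routine of the same name, and the numbers are illustrative.

#include <cstdint>
#include <cstdio>

static bool smooth235(int64_t n) {          // true if n has only prime factors 2, 3, 5
  for (int p : {2, 3, 5})
    while (n % p == 0) n /= p;
  return n == 1;
}

static int64_t next235even(int64_t n) {     // naive stand-in: next even 2,3,5-smooth value >= n
  if (n % 2) ++n;
  while (!smooth235(n)) n += 2;
  return n;
}

int main() {
  const int64_t ms = 1000;                  // requested # modes in one dimension
  const double sigma = 2.0;                 // opts.upsampfac
  const int nspread = 8;                    // kernel width (assumed)
  int64_t nf = (int64_t)(sigma * ms);
  if (nf < 2 * nspread) nf = 2 * nspread;
  nf = next235even(nf);
  std::printf("ms=%lld -> nf=%lld\n", (long long)ms, (long long)nf);   // prints nf=2000
}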
+ auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); + if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf + *nf = (BIGINT)nfd; + // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); + // catch too small nf, and nan or +-inf, otherwise spread fails... + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; + if (*nf < MAX_NF) // otherwise will fail anyway + *nf = next235even(*nf); // expensive at huge nf + *h = T(2.0 * PI / *nf); // upsampled grid spacing + *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' +} + +template +static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) +/* + Approximates exact Fourier series coeffs of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Uses phase winding for cheap eval on the regular freq + grid. Note that this is also the Fourier transform of the non-periodized + kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an + overall prefactor of 1/h, which is needed anyway for the correction, and + arises because the quadrature weights are scaled for grid units not x units. + The kernel is actually centered at nf/2, related to the centering of the grid; + this is now achieved by the sign flip in a[n] below. + + Inputs: + nf - size of 1d uniform spread grid, must be even. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, + divided by h = 2pi/n. + (should be allocated for at least nf/2+1 Ts) + + Compare onedim_dct_kernel which has same interface, but computes DFT of + sampled kernel, not quite the same object. + + Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. + Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + std::complex a[MAX_NQUAD]; + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = -exp(2 * PI * std::complex(0, 1) * z[n] / double(nf)); // phase winding + // rates + } + BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); +#pragma omp parallel num_threads(nt) + { // each thread gets own chunk to do + int t = MY_OMP_GET_THREAD_NUM(); + std::complex aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk + for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array + T x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases + } + fwkerhalf[j] = x; + } + } +} + +template +static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts opts) +/* + Approximates exact 1D Fourier transform of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), + for a kernel with x measured in grid-spacings. (See previous routine for + FT definition). + + Inputs: + nk - number of freqs + k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) + Note, z is in grid-point units, and k values must be in [-pi, pi) for + accuracy. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) + + Barnett 2/8/17. openmp since cos slow 2/9/17 + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + for (int n = 0; n < q; ++n) { + z[n] *= (T)J2; // quadr nodes for [0,J/2] + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights + } +#pragma omp parallel for num_threads(opts.nthreads) + for (BIGINT j = 0; j < nk; ++j) { // loop along output array + T x = 0.0; // register + for (int n = 0; n < q; ++n) + x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! + phihat[j] = x; + } +} + +template +static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIGINT nf1, + std::complex *fw, int modeord) +/* + if dir==1: copies fw to fk with amplification by prefac/ker + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) + 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). 
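Spelled out for a small even ms, the two orderings just described differ only in where the block of negative frequencies sits; the snippet below simply prints both (illustrative).

#include <cstdio>

int main() {
  const int ms = 6, kmin = -ms / 2, kmax = (ms - 1) / 2;
  std::printf("modeord=0 (CMCL): ");
  for (int k = kmin; k <= kmax; ++k) std::printf("%d ", k);   // -3 -2 -1 0 1 2
  std::printf("\nmodeord=1 (FFT) : ");
  for (int k = 0; k <= kmax; ++k) std::printf("%d ", k);      // 0 1 2
  for (int k = kmin; k < 0; ++k) std::printf("%d ", k);       // -3 -2 -1
  std::printf("\n");
}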
+ + fk is a size-ms T complex array (2*ms Ts alternating re,im parts) + fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts) + ker is real-valued T array of length nf1/2+1. + + Single thread only, but shouldn't matter since mostly data movement. + + It has been tested that the repeated floating division in this inner loop + only contributes at the <3% level in 3D relative to the FFT cost (8 threads). + This could be removed by passing in an inverse kernel and doing mults. + + todo: rewrite w/ C++-complex I/O, check complex divide not slower than + real divide, or is there a way to force a real divide? + + Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 +*/ +{ + BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices + if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (kmax + 1); + } // or, instead, FFT ordering + if (dir == 1) { // read fw, write out to fk... + for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k + fk[pp++] = prefac * fw[k].real() / ker[k]; // re + fk[pp++] = prefac * fw[k].imag() / ker[k]; // im + } + for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k + fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re + fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im + } + } else { // read fk, write out to fw w/ zero padding... + for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where + // needed + fw[k] = 0.0; + } + for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k + fw[k].real(prefac * fk[pp++] / ker[k]); // re + fw[k].imag(prefac * fk[pp++] / ker[k]); // im + } + for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k + fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re + fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im + } + } +} + +template +static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, BIGINT mt, + T *fk, BIGINT nf1, BIGINT nf2, std::complex *fw, + int modeord) +/* + 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. + + if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)). + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) + 1: use FFT-style (pos then negative, on each dim) + + fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with + ms looped over fast and mt slow. + fw is a complex array stored as 2*nf1*nf2] Ts alternating re,im parts, with + nf1 looped over fast and nf2 slow. + ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1 + respectively. + + Barnett 2/1/17, Fixed mt=0 case 3/14/17. 
modeord 10/25/17 +*/ +{ + BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices + if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k2max + 1) * ms; + } // or, instead, FFT ordering + if (dir == 2) // zero pad needed x-lines (contiguous in memory) + for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all + // dims + fw[j] = 0.0; + for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs + // point fk and fw to the start of this y value's row (2* is for complex): + common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, + &fw[nf1 * k2], modeord); + for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs + common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, + &fw[nf1 * (nf2 + k2)], modeord); +} + +template +static void deconvolveshuffle3d(int dir, T prefac, T *ker1, T *ker2, T *ker3, BIGINT ms, + BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, + BIGINT nf3, std::complex *fw, int modeord) +/* + 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. + + if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) + 1: use FFT-style (pos then negative, on each dim) + + fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with + ms looped over fastest and mu slowest. + fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with + nf1 looped over fastest and nf3 slowest. + ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, + and nf3/2+1 respectively. + + Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 +*/ +{ + BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices + if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k3max + 1) * ms * mt; + } // or FFT ordering + BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane + if (dir == 2) // zero pad needed xy-planes (contiguous in memory) + for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims + fw[j] = 0.0; + for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs + // point fk and fw to the start of this z value's plane (2* is for complex): + common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, + nf2, &fw[np * k3], modeord); + for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs + common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, + nf2, &fw[np * (nf3 + k3)], modeord); +} + +// --------- batch helper functions for t1,2 exec: --------------------------- + +template +static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T *p, + std::complex *cBatch) +/* + Spreads (or interpolates) a batch of batchSize strength vectors in cBatch + to (or from) the batch of fine working grids p->fwBatch, using the same set of + (index-sorted) NU points p->X,Y,Z for each vector in the batch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + Returns 0 (no error reporting for now). + Notes: + 1) cBatch is already assumed to have the correct offset, ie here we + read from the start of cBatch (unlike Malleo). fwBatch also has zero offset + 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp + Barnett 5/19/20, based on Malleo 2019. +*/ +{ + // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. + // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. + // But when nthr_outer=1 here, omp par inside the loop sees all threads... +#ifdef _OPENMP + int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; +#endif +#pragma omp parallel for num_threads(nthr_outer) + for (int i = 0; i < batchSize; i++) { + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, + p->Y, p->Z, (T *)ci, p->spopts, p->didSort); + } + return 0; +} + +template +static int deconvolveBatch(int batchSize, FINUFFT_PLAN_T *p, std::complex *fkBatch) +/* + Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch + into each output array fk in fkBatch. + Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, + again looping over fk in fkBatch and fw in p->fwBatch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize + times. + Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) +*/ +{ + // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
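Before the deconvolve loop below, an aside on the spread_thread strategies used in spreadinterpSortedBatch above: strategy 1 walks the batch serially and lets each spread/interp call use all threads, while strategy 2 runs the batch loop in parallel with one single-threaded call per vector. A minimal OpenMP sketch of that dispatch, with spread_one standing in for a spreadinterpSorted call (illustrative; requires OpenMP):

#include <omp.h>
#include <cstdio>

static void spread_one(int i, int nthr_inner) {
  // stand-in for one spreadinterpSorted call; in the library the call itself
  // parallelises internally when nthr_inner > 1
  std::printf("vector %d spread with %d thread(s)\n", i, nthr_inner);
}

int main() {
  const int batchSize = 4, spread_thread = 2;      // assumed settings
  const int nthr_outer = (spread_thread == 1) ? 1 : batchSize;
#pragma omp parallel for num_threads(nthr_outer)
  for (int i = 0; i < batchSize; i++)
    spread_one(i, (spread_thread == 1) ? omp_get_max_threads() : 1);
}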
+#pragma omp parallel for num_threads(batchSize) + for (int i = 0; i < batchSize; i++) { + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch + + // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... + if (p->dim == 1) + deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki, + p->nf1, fwi, p->opts.modeord); + else if (p->dim == 2) + deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms, + p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); + else + deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, + p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2, + p->nf3, fwi, p->opts.modeord); + } + return 0; +} + +} // namespace common +} // namespace finufft + +// --------------- rest is the 5 user guru (plan) interface drivers: --------- +// (not namespaced since have safe names finufft{f}_* ) +using namespace finufft::common; // accesses routines defined above + +// Marco Barbone: 5.8.2024 +// These are user-facing. +// The various options could be macros to follow c standard library conventions. +// Question: would these be enums? + +// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO +void finufft_default_opts_t(finufft_opts *o) +// Sets default nufft opts (referenced by all language interfaces too). +// See finufft_opts.h for meanings. +// This was created to avoid uncertainty about C++11 style static initialization +// when called from MEX, but now is generally used. Barnett 10/30/17 onwards. +// Sphinx sucks the below code block into the web docs, hence keep it clean... +{ + // sphinx tag (don't remove): @defopts_start + o->modeord = 0; + o->chkbnds = 1; + + o->debug = 0; + o->spread_debug = 0; + o->showwarn = 1; + + o->nthreads = 0; +#ifdef FINUFFT_USE_DUCC0 + o->fftw = 0; +#else + o->fftw = FFTW_ESTIMATE; +#endif + o->spread_sort = 2; + o->spread_kerevalmeth = 1; + o->spread_kerpad = 1; + o->upsampfac = 0.0; + o->spread_thread = 0; + o->maxbatchsize = 0; + o->spread_nthr_atomic = -1; + o->spread_max_sp_size = 0; + o->fftw_lock_fun = nullptr; + o->fftw_unlock_fun = nullptr; + o->fftw_lock_data = nullptr; + // sphinx tag (don't remove): @defopts_end +} + +// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP +template +int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts) +// Populates the fields of finufft_plan which is pointed to by "pp". +// opts is ptr to a finufft_opts to set options, or NULL to use defaults. +// For some of the fields (if "auto" selected) here choose the actual setting. 
+// For types 1,2 allocates memory for internal working arrays, +// evaluates spreading kernel coefficients, and instantiates the fftw_plan +{ + FINUFFT_PLAN_T *p; + p = new FINUFFT_PLAN_T; // allocate fresh plan struct + *pp = p; // pass out plan as ptr to plan struct + + if (opts == NULL) // use default opts + finufft_default_opts_t(&(p->opts)); + else // or read from what's passed in + p->opts = *opts; // keep a deep copy; changing *opts now has no effect + + if (p->opts.debug) // do a hello world + printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", + __func__); + + p->fftPlan = std::make_unique>( + p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); + + if ((type != 1) && (type != 2) && (type != 3)) { + fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); + return FINUFFT_ERR_TYPE_NOTVALID; + } + if ((dim != 1) && (dim != 2) && (dim != 3)) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } + if (ntrans < 1) { + fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); + return FINUFFT_ERR_NTRANS_NOTVALID; + } + if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { + fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", + __func__); + return FINUFFT_ERR_LOCK_FUNS_INVALID; + ; + } + + // get stuff from args... + p->type = type; + p->dim = dim; + p->ntrans = ntrans; + p->tol = tol; + p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input + + // choose overall # threads... +#ifdef _OPENMP + int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); + int nthr = ompmaxnthr; // default: use as many as OMP gives us + // (the above could be set, or suggested set, to 1 for small enough problems...) + if (p->opts.nthreads > 0) { + nthr = p->opts.nthreads; // user override, now without limit + if (p->opts.showwarn && (nthr > ompmaxnthr)) + fprintf(stderr, + "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " + "available; note large nthreads can be slower.\n", + __func__, nthr, ompmaxnthr); + } +#else + int nthr = 1; // always 1 thread (avoid segfault) + if (p->opts.nthreads > 1) + fprintf(stderr, + "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", + __func__, p->opts.nthreads); +#endif + p->opts.nthreads = nthr; // store actual # thr planned for + // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) + + // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) + if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize + p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss + p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b + } else { // batchSize override by user + p->batchSize = min(p->opts.maxbatchsize, ntrans); + p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches + } + if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice + if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { + fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); + return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; + } + + if (type != 3) { // read in user Fourier mode array sizes... + p->ms = n_modes[0]; + p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims + p->mu = (dim > 2) ? n_modes[2] : 1; + p->N = p->ms * p->mt * p->mu; // N = total # modes + } + + // heuristic to choose default upsampfac... 
(currently two poss) + if (p->opts.upsampfac == 0.0) { // indicates auto-choose + p->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (TF)1E-9) { // the tol sigma=5/4 can reach + if (type == 3) // could move to setpts, more known? + p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT + else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, + // typ tol, 12-core xeon + p->opts.upsampfac = 1.25; + } + if (p->opts.debug > 1) + printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); + } + // use opts to choose and write into plan's spread options... + int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); + if (ier > 1) // proceed if success or warning + return ier; + + // set others as defaults (or unallocated for arrays)... + p->X = NULL; + p->Y = NULL; + p->Z = NULL; + p->phiHat1 = NULL; + p->phiHat2 = NULL; + p->phiHat3 = NULL; + p->nf1 = 1; + p->nf2 = 1; + p->nf3 = 1; // crucial to leave as 1 for unused dims + p->sortIndices = NULL; // used in all three types + + // ------------------------ types 1,2: planning needed --------------------- + if (type == 1 || type == 2) { + + int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) + // Note: batchSize not used since might be only 1. + + p->spopts.spread_direction = type; + + constexpr TF EPSILON = std::numeric_limits::epsilon(); + if (p->opts.showwarn) { // user warn round-off error... + if (EPSILON * p->ms > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->ms)); + if (EPSILON * p->mt > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mt)); + if (EPSILON * p->mu > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mu)); + } + + // determine fine grid sizes, sanity check.. + int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); + if (nfier) return nfier; // nf too big; we're done + p->phiHat1 = (TF *)malloc(sizeof(TF) * (p->nf1 / 2 + 1)); + if (dim > 1) { + nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); + if (nfier) return nfier; + p->phiHat2 = (TF *)malloc(sizeof(TF) * (p->nf2 / 2 + 1)); + } + if (dim > 2) { + nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); + if (nfier) return nfier; + p->phiHat3 = (TF *)malloc(sizeof(TF) * (p->nf3 / 2 + 1)); + } + + if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
+ printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " + "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " + "batchSize=%d ", + __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, + (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, + p->batchSize); + if (p->batchSize == 1) // spread_thread has no effect in this case + printf("\n"); + else + printf(" spread_thread=%d\n", p->opts.spread_thread); + } + + // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim + CNTime timer; + timer.start(); + onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); + if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); + if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); + if (p->opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, + timer.elapsedsec()); + + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points + if (p->nf * p->batchSize > MAX_NF) { + fprintf(stderr, + "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 + return FINUFFT_ERR_MAXNALLOC; + } + + timer.restart(); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace + if (p->opts.debug) + printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * p->nf * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch) { // we don't catch all such mallocs, just this big one + fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", + __func__); + free(p->phiHat1); + free(p->phiHat2); + free(p->phiHat3); + return FINUFFT_ERR_ALLOC; + } + + timer.restart(); // plan the FFTW + const auto ns = gridsize_for_fft(p); + p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); + if (p->opts.debug) + printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, + nthr_fft, timer.elapsedsec()); + + } else { // -------------------------- type 3 (no planning) ------------ + + if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); + // in case destroy occurs before setpts, need safe dummy ptrs/plans... + p->CpBatch = NULL; + p->fwBatch = NULL; + p->Sp = NULL; + p->Tp = NULL; + p->Up = NULL; + p->prephase = NULL; + p->deconv = NULL; + p->innerT2plan = NULL; + // Type 3 will call finufft_makeplan for type 2; no need to init FFTW + // Note we don't even know nj or nk yet, so can't do anything else! + } + return ier; // report setup_spreader status (could be warning) +} +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, float tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, double tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); + +// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS +template +int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, + TF *s, TF *t, TF *u) +/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for + spreading. (The last 4 arguments are ignored.) + For type 3: allocates internal working arrays, scales/centers the NU points + and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
+*/ +{ + int d = p->dim; // abbrev for spatial dim + CNTime timer; + timer.start(); + p->nj = nj; // the user only now chooses how many NU (x,y,z) pts + if (nj < 0) { + fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nj > MAX_NU_PTS) { + fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + + if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + p->X = xj; // plan must keep pointers to user's fixed NU pts + p->Y = yj; + p->Z = zj; + int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); + if (p->opts.debug > 1) + printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, + timer.elapsedsec()); + if (ier) // no warnings allowed here + return ier; + timer.restart(); + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. + if (p->sortIndices) free(p->sortIndices); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); + if (!p->sortIndices) { + fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); + return FINUFFT_ERR_SPREAD_ALLOC; + } + p->didSort = + indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); + if (p->opts.debug) + printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); + + } else { // ------------------------- TYPE 3 SETPTS ----------------------- + // (here we can precompute pre/post-phase factors and plan the t2) + + if (nk < 0) { + fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nk > MAX_NU_PTS) { + fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + p->nk = nk; // user set # targ freq pts + p->S = s; // keep pointers to user's input target pts + p->T = t; + p->U = u; + + // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... + TF S1, S2, S3; // get half-width X, center C, which contains {x_j}... + arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); + arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} + set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), + &(p->t3P.gam1)); // applies twist i) + p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc + p->t3P.D2 = 0.0; + if (d > 1) { + arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} + arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} + set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), + &(p->t3P.gam2)); + } + p->t3P.C3 = 0.0; + p->t3P.D3 = 0.0; + if (d > 2) { + arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} + arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} + set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), + &(p->t3P.gam3)); + } + + if (p->opts.debug) { // report on choices of shifts, centers, etc... 
+ printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, + p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); + if (d > 1) + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, + p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); + if (d > 2) + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, + p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); + } + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points + if (p->nf * p->batchSize > MAX_NF) { + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + return FINUFFT_ERR_MAXNALLOC; + } + p->fftPlan->free(p->fwBatch); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace + + // (note FFTW_ALLOC is not needed over malloc, but matches its type) + if (p->CpBatch) free(p->CpBatch); + p->CpBatch = + (std::complex *)malloc(sizeof(std::complex) * nj * p->batchSize); // batch + // c' + // work + + if (p->opts.debug) + printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * (p->nf + nj) * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch || !p->CpBatch) { + fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); + return FINUFFT_ERR_ALLOC; + } + // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); + + // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... + // FIXME: should use realloc + if (p->X) free(p->X); + if (p->Sp) free(p->Sp); + p->X = (TF *)malloc(sizeof(TF) * nj); + p->Sp = (TF *)malloc(sizeof(TF) * nk); + if (d > 1) { + if (p->Y) free(p->Y); + if (p->Tp) free(p->Tp); + p->Y = (TF *)malloc(sizeof(TF) * nj); + p->Tp = (TF *)malloc(sizeof(TF) * nk); + } + if (d > 2) { + if (p->Z) free(p->Z); + if (p->Up) free(p->Up); + p->Z = (TF *)malloc(sizeof(TF) * nj); + p->Up = (TF *)malloc(sizeof(TF) * nk); + } + + // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... + TF ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim + if (d > 1) ig2 = 1.0 / p->t3P.gam2; + if (d > 2) ig3 = 1.0 / p->t3P.gam3; +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { + p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j + if (d > 1) // (ok to do inside loop because of branch predict) + p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j + if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j + } + + // set up prephase array... + std::complex imasign = + (p->fftSign >= 0) ? std::complex(0, 1) : std::complex(0, -1); // +-i + if (p->prephase) free(p->prephase); + p->prephase = (std::complex *)malloc(sizeof(std::complex) * nj); + if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs + TF phase = p->t3P.D1 * xj[j]; + if (d > 1) phase += p->t3P.D2 * yj[j]; + if (d > 2) phase += p->t3P.D3 * zj[j]; + p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler + // e^{+-i.phase} + } + } else + for (BIGINT j = 0; j < nj; ++j) + p->prephase[j] = (std::complex)1.0; // *** or keep flag so no mult in exec?? + + // rescale the target s_k etc to s'_k etc... 
+#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { + p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R + if (d > 1) + p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < + // pi/R + if (d > 2) + p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < + // pi/R + } + // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... + // (exploits that FT separates because kernel is prod of 1D funcs) + if (p->deconv) free(p->deconv); + p->deconv = (std::complex *)malloc(sizeof(std::complex) * nk); + TF *phiHatk1 = (TF *)malloc(sizeof(TF) * nk); // don't confuse w/ p->phiHat + onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 + TF *phiHatk2 = NULL, *phiHatk3 = NULL; + if (d > 1) { + phiHatk2 = (TF *)malloc(sizeof(TF) * nk); + onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 + } + if (d > 2) { + phiHatk3 = (TF *)malloc(sizeof(TF) * nk); + onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 + } + int Cfinite = + isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan + // or inf if + // M=0, no + // input NU pts + int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs + TF phiHat = phiHatk1[k]; + if (d > 1) phiHat *= phiHatk2[k]; + if (d > 2) phiHat *= phiHatk3[k]; + p->deconv[k] = (std::complex)(1.0 / phiHat); + if (Cfinite && Cnonzero) { + TF phase = (s[k] - p->t3P.D1) * p->t3P.C1; + if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; + if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; + p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} + } + } + free(phiHatk1); + free(phiHatk2); + free(phiHatk3); // done w/ deconv fill + if (p->opts.debug) + printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); + + // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... + timer.restart(); + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. + if (p->sortIndices) free(p->sortIndices); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); + if (!p->sortIndices) { + fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); + return FINUFFT_ERR_SPREAD_ALLOC; + } + p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, + p->Z, p->spopts); + if (p->opts.debug) + printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); + + // Plan and setpts once, for the (repeated) inner type 2 finufft call... + timer.restart(); + BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw + finufft_opts t2opts = p->opts; // deep copy, since not ptrs + t2opts.modeord = 0; // needed for correct t3! + t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail + t2opts.spread_debug = max(0, p->opts.spread_debug - 1); + t2opts.showwarn = 0; // so don't see warnings 2x + // (...could vary other t2opts here?) 
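For reference, everything assembled in this type-3 setpts (prephase factors, rescaled points, deconvolution factors, and the inner type-2 plan created just below) exists to approximate the plain type-3 sum f_k = sum_j c_j exp(+-i s_k x_j). A brute-force O(M*N) evaluator like the following 1D double-precision sketch is handy for validating small cases against the fast path (illustrative, not part of the library).

#include <complex>
#include <vector>

std::vector<std::complex<double>> type3_direct(const std::vector<double> &x,
                                               const std::vector<std::complex<double>> &c,
                                               const std::vector<double> &s, int iflag) {
  std::vector<std::complex<double>> f(s.size());   // zero-initialized outputs
  for (size_t k = 0; k < s.size(); ++k)
    for (size_t j = 0; j < x.size(); ++j)
      f[k] += c[j] * std::exp(std::complex<double>(0.0, iflag * s[k] * x[j]));
  return f;
}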
+ if (p->innerT2plan) { + delete p->innerT2plan; + p->innerT2plan = nullptr; + } + int ier = finufft_makeplan_t(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, + &p->innerT2plan, &t2opts); + if (ier > 1) { // if merely warning, still proceed + fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", + __func__, ier); + return ier; + } + ier = finufft_setpts_t(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, + NULL); // note nk = # output points (not nj) + if (ier > 1) { + fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); + return ier; + } + if (p->opts.debug) + printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); + } + return 0; +} +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, float *xj, + float *yj, float *zj, BIGINT nk, float *s, float *t, + float *u); +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, double *xj, + double *yj, double *zj, BIGINT nk, double *s, + double *t, double *u); + +// ............ end setpts .................................................. + +// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +template +int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk) { + /* See ../docs/cguru.doc for current documentation. + + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and FFT as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. 
+  */
+  CNTime timer;
+  timer.start();
+
+  if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------
+
+    double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing
+    if (p->opts.debug)
+      printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans,
+             p->nbatch, p->batchSize);
+
+    for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches
+
+      // current batch is either batchSize, or possibly truncated if last one
+      int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize);
+      int bB = b * p->batchSize;               // index of vector, since batchsizes same
+      std::complex<TF> *cjb = cj + bB * p->nj; // point to batch of weights
+      std::complex<TF> *fkb = fk + bB * p->N;  // point to batch of mode coeffs
+      if (p->opts.debug > 1)
+        printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize);
+
+      // STEP 1: (varies by type)
+      timer.restart();
+      if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid
+        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        t_sprint += timer.elapsedsec();
+      } else { // type 2: amplify Fourier coeffs fk into 0-padded fw
+        deconvolveBatch(thisBatchSize, p, fkb);
+        t_deconv += timer.elapsedsec();
+      }
+
+      // STEP 2: call the FFT on this batch
+      timer.restart();
+      do_fft(p);
+      t_fft += timer.elapsedsec();
+      if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
+
+      // STEP 3: (varies by type)
+      timer.restart();
+      if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk
+        deconvolveBatch(thisBatchSize, p, fkb);
+        t_deconv += timer.elapsedsec();
+      } else { // type 2: interpolate unif fw grid to NU target pts
+        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        t_sprint += timer.elapsedsec();
+      }
+    } // ........end b loop
+
+    if (p->opts.debug) { // report total times in their natural order...
+      if (p->type == 1) {
+        printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint);
+        printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft);
+        printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv);
+      } else {
+        printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv);
+        printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft);
+        printf(" tot interp:\t\t\t%.3g s\n", t_sprint);
+      }
+    }
+  }
+
+  else { // ----------------------------- TYPE 3 EXEC ---------------------
+
+    // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long
+    // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug
+
+    double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0,
+           t_deconv = 0.0; // accumulated timings
+    if (p->opts.debug)
+      printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans,
+             p->nbatch, p->batchSize);
+
+    for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches
+
+      // batching and pointers to this batch, identical to t1,2 above...
+      int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize);
+      int bB = b * p->batchSize;
+      std::complex<TF> *cjb = cj + bB * p->nj; // batch of input strengths
+      std::complex<TF> *fkb = fk + bB * p->nk; // batch of output strengths
+      if (p->opts.debug > 1)
+        printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize);
+
+      // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch...
+      timer.restart();
+#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize?
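+      // (writes c'_j = prephase_j * c_j into the CpBatch workspace; the prephase
+      //  factors, filled during setpts, in effect apply the e^{+-i D.x_j} shift
+      //  coming from the target-frequency centers D)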
+      for (int i = 0; i < thisBatchSize; i++) {
+        BIGINT ioff = i * p->nj;
+        for (BIGINT j = 0; j < p->nj; ++j) {
+          p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j];
+        }
+      }
+      t_pre += timer.elapsedsec();
+
+      // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid...
+      timer.restart();
+      p->spopts.spread_direction = 1;                        // spread
+      spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed
+      t_spr += timer.elapsedsec();
+
+      // STEP 2: type 2 NUFFT from fw batch to user output fk array batch...
+      timer.restart();
+      // illegal possible shrink of ntrans *after* plan for smaller last batch:
+      p->innerT2plan->ntrans = thisBatchSize; // do not try this at home!
+      /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array
+         still the same size, as Andrea explained; just wastes a few flops) */
+      finufft_execute_t(p->innerT2plan, fkb, p->fwBatch);
+      t_t2 += timer.elapsedsec();
+      // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)...
+      timer.restart();
+#pragma omp parallel for num_threads(p->opts.nthreads)
+      for (int i = 0; i < thisBatchSize; i++) {
+        BIGINT ioff = i * p->nk;
+        for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k];
+      }
+      t_deconv += timer.elapsedsec();
+    } // ........end b loop
+
+    if (p->opts.debug) { // report total times in their natural order...
+      printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre);
+      printf(" tot spread:\t\t\t%.3g s\n", t_spr);
+      printf(" tot type 2:\t\t\t%.3g s\n", t_t2);
+      printf(" tot deconvolve:\t\t%.3g s\n", t_deconv);
+    }
+  }
+  // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long
+  // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug
+
+  return 0;
+}
+template int finufft_execute_t(FINUFFT_PLAN_T<float> *p, std::complex<float> *cj,
+                               std::complex<float> *fk);
+template int finufft_execute_t(FINUFFT_PLAN_T<double> *p, std::complex<double> *cj,
+                               std::complex<double> *fk);
+
+// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
+template<typename TF> FINUFFT_PLAN_T<TF>::~FINUFFT_PLAN_T() {
+  // Free everything we allocated inside the plan.
+  // This must also not crash if called immediately after finufft_makeplan, so
+  // each thing freed here is guaranteed to be either NULL or correctly allocated.
+  if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array
+  free(sortIndices);
+  if (type == 1 || type == 2) {
+    free(phiHat1);
+    free(phiHat2);
+    free(phiHat3);
+  } else { // free the stuff alloc for type 3 only
+    delete innerT2plan;
+    innerT2plan = nullptr; // if NULL, ignore its error code
+    free(CpBatch);
+    free(Sp);
+    free(Tp);
+    free(Up);
+    free(X);
+    free(Y);
+    free(Z);
+    free(prephase);
+    free(deconv);
+  }
+}
+template FINUFFT_PLAN_T<float>::~FINUFFT_PLAN_T();
+template FINUFFT_PLAN_T<double>::~FINUFFT_PLAN_T();
diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp
index 1fb49db9e..454f22af7 100644
--- a/src/simpleinterfaces.cpp
+++ b/src/simpleinterfaces.cpp
@@ -31,12 +31,12 @@ static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FL
 // Helper layer between simple interfaces (with opts) and the guru functions.
 // Author: Andrea Malleo, 2019.
 {
-  FINUFFT_PLAN plan;
+  FINUFFT_PLAN plan = nullptr;
   int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan,
                              popts); // popts (ptr to opts) can be NULL
   if (ier > 1) { // since 1 (a warning) still allows proceeding...
fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); - delete plan; + FINUFFT_DESTROY(plan); return ier; } diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index b2b5ea8b0..126bcd2d7 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -2162,6 +2162,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. */ { + constexpr T EPSILON = std::numeric_limits::epsilon(); if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma if (kerevalmeth == 1) { fprintf(stderr, diff --git a/test/testutils.cpp b/test/testutils.cpp index 64b5d7a0a..7b550ebff 100644 --- a/test/testutils.cpp +++ b/test/testutils.cpp @@ -57,7 +57,8 @@ int main(int argc, char *argv[]) { a[j] = CPX(1.0, 0.0); b[j] = a[j]; } - FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly + constexpr FLT EPSILON = std::numeric_limits::epsilon(); + FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly if (abs(infnorm(M, &a[0]) - 1.0) > relerr) return 1; if (abs(twonorm(M, &a[0]) - sqrt((FLT)M)) > relerr * sqrt((FLT)M)) return 1; b[0] = CPX(0.0, 0.0); // perturb b from a