From c0845c94a9c3b3078f5d6e195419b85125fe9ff6 Mon Sep 17 00:00:00 2001
From: ahbarnett <alex.h.barnett@gmail.com>
Date: Tue, 7 May 2024 20:33:43 -0400
Subject: [PATCH 1/4] rework nthr logic replacing #431: no cap on user
 o.nthreads override; ifndef _OPENMP hard nthr=1 override; warnings at the
 finufft level only; binsort controlled by spopts.sort_threads matching docs

---
 CHANGELOG            |  5 +++--
 src/finufft.cpp      | 19 +++++++++++++------
 src/spreadinterp.cpp | 30 ++++++++++++++++++++----------
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 14c04dceb..893ea124d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,8 +1,9 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
-* CPU plan stage prevents now caps # threads at omp_get_max_threads (being 1
-  for single-thread build); warns if this cap was activated (PR 431)
+* CPU plan stage allows any # threads, warns if > omp_get_max_threads(); or
+  if single-threaded fixes nthr=1 and warns opts.nthreads>1 attempt.
+  Sort now respects spread_opts.sort_threads not nthreads. Supercedes PR 431.
 * new docs troubleshooting accuracy limitations due to condition number of the
   NUFFT problem.
 * new sanity check on nj and nk (<0 or too big); new err code, tester, doc.
diff --git a/src/finufft.cpp b/src/finufft.cpp
index a05e1c02e..1b2c4ef4d 100644
--- a/src/finufft.cpp
+++ b/src/finufft.cpp
@@ -591,15 +591,22 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag,
   p->fftSign = (iflag>=0) ? 1 : -1;         // clean up flag input
 
   // choose overall # threads...
-  int maxnthr = MY_OMP_GET_MAX_THREADS();
-  int nthr = maxnthr;                       // use as many as OMP gives us
+#ifdef _OPENMP
+  int ompmaxnthr = MY_OMP_GET_MAX_THREADS();
+  int nthr = ompmaxnthr;                    // default: use as many as OMP gives us
+  // (the above could be set, or suggested set, to 1 for small enough problems...)
   if (p->opts.nthreads>0) {
-    nthr = min(maxnthr,p->opts.nthreads);   // user override up to max avail
-    if (p->opts.nthreads > maxnthr)         // if no OMP, maxnthr=1
-      fprintf(stderr,"%s warning: user requested %d threads, but only %d threads available; enforcing nthreads=%d.\n",__func__,p->opts.nthreads,maxnthr,nthr);
+    nthr = p->opts.nthreads;                // user override, now without limit
+    if (p->opts.showwarn && (nthr > ompmaxnthr))
+      fprintf(stderr,"%s warning: using opts.nthreads=%d, more than the %d OpenMP claims available; note large nthreads can be slower.\n",__func__,nthr,ompmaxnthr);
   }
+#else
+  int nthr = 1;                             // always 1 thread (avoid segfault)
+  if (p->opts.nthreads>1)
+    fprintf(stderr,"%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",__func__,p->opts.nthreads);
+#endif
   p->opts.nthreads = nthr;                  // store actual # thr planned for
-  // (this sets all downstream spread/interp, 1dkernel, and FFT thread counts)
+  // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
   
   // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
   if (p->opts.maxbatchsize==0) {            // logic to auto-set best batchsize
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 8d6e34d96..9898b3982 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -267,14 +267,18 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
   timer.start();                 // if needed, sort all the NU pts...
   int did_sort=0;
   int maxnthr = MY_OMP_GET_MAX_THREADS();
-  if (opts.nthreads>0)           // user override up to max avail
-    maxnthr = min(maxnthr,opts.nthreads);
-  
+  if (opts.sort_threads>0)           // user override, now without limit
+    maxnthr = opts.sort_threads;     // maxnthr = the max threads sorting could use
+  // (we don't print warning here, since: no showwarn in spread_opts, and finufft
+  // already warned about it. spreadinterp-only advanced users will miss a warning)
   if (opts.sort==1 || (opts.sort==2 && better_to_sort)) {
     // store a good permutation ordering of all NU pts (dim=1,2 or 3)
     int sort_debug = (opts.debug>=2);    // show timing output?
-    int sort_nthr = opts.sort_threads;   // choose # threads for sorting
-    if (sort_nthr==0)   // use auto choice: when N>>M, one thread is better!
+    int sort_nthr = opts.sort_threads;   // 0, or proposed max # threads for sorting
+#ifndef _OPENMP
+    sort_nthr = 1;                   // if single-threaded lib, override user
+#endif
+    if (sort_nthr==0)   // multithreaded auto choice: when N>>M, one thread is better!
       sort_nthr = (10*M>N) ? maxnthr : 1;      // heuristic
     if (sort_nthr==1)
       bin_sort_singlethread(sort_indices,M,kx,ky,kz,N1,N2,N3,opts.pirange,bin_size_x,bin_size_y,bin_size_z,sort_debug);
@@ -323,9 +327,12 @@ int spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3,
   int ndims = ndims_from_Ns(N1,N2,N3);
   BIGINT N=N1*N2*N3;            // output array size
   int ns=opts.nspread;          // abbrev. for w, kernel width
-  int nthr = MY_OMP_GET_MAX_THREADS();  // # threads to use to spread
+  int nthr = MY_OMP_GET_MAX_THREADS();  // guess # threads to use to spread
   if (opts.nthreads>0)
-    nthr = min(nthr,opts.nthreads);     // user override up to max avail
+    nthr = opts.nthreads;       // user override, now without limit
+#ifndef _OPENMP
+  nthr = 1;                   // if single-threaded lib, override user
+#endif
   if (opts.debug)
     printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld; pir=%d), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,opts.pirange,nthr);
   
@@ -445,9 +452,12 @@ int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3,
   int ndims = ndims_from_Ns(N1,N2,N3);
   int ns=opts.nspread;          // abbrev. for w, kernel width
   FLT ns2 = (FLT)ns/2;          // half spread width, used as stencil shift
-  int nthr = MY_OMP_GET_MAX_THREADS();   // # threads to use to interp
+  int nthr = MY_OMP_GET_MAX_THREADS();   // guess # threads to use to interp
   if (opts.nthreads>0)
-    nthr = min(nthr,opts.nthreads);      // user override up to max avail
+    nthr = opts.nthreads;       // user override, now without limit
+#ifndef _OPENMP
+  nthr = 1;                   // if single-threaded lib, override user
+#endif
   if (opts.debug)
     printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld; pir=%d), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,opts.pirange,nthr);
 
@@ -1292,7 +1302,7 @@ void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz,
   nbins2 = isky ? N2/bin_size_y+1 : 1;
   nbins3 = iskz ? N3/bin_size_z+1 : 1;
   BIGINT nbins = nbins1*nbins2*nbins3;
-  if (nthr==0)
+  if (nthr==0)                      // should never happen in spreadinterp use
     fprintf(stderr,"[%s] nthr (%d) must be positive!\n",__func__,nthr);
   int nt = min(M,(BIGINT)nthr);     // handle case of less points than threads
   std::vector<BIGINT> brk(nt+1);    // list of start NU pt indices per thread

From 4d7d25704afd3c51a0346d47b2b6d1d8d0a90f16 Mon Sep 17 00:00:00 2001
From: ahbarnett <alex.h.barnett@gmail.com>
Date: Tue, 7 May 2024 20:43:06 -0400
Subject: [PATCH 2/4] docs opts.rst to match new logic and advice for nthreads

---
 docs/opts.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/opts.rst b/docs/opts.rst
index 6df982fe0..e56cf5649 100644
--- a/docs/opts.rst
+++ b/docs/opts.rst
@@ -128,7 +128,9 @@ Diagnostic options
 Algorithm performance options
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-**nthreads**: Number of threads to use. This is capped at the number of available threads (eg, to prevent misuse of a single-threaded code). It then sets the number of threads FINUFFT will use in FFTW, bin-sorting, and spreading/interpolation steps. This number of threads also controls the batch size for vectorized transforms (ie ``ntr>1`` :ref:`here <c>`). Setting ``nthreads=0`` uses all threads available, usually recommended. However, for repeated small problems it can be advantageous to use a small number, even as small as 1.
+**nthreads**: (Ignored in single-threaded library builds.) If positive, sets the number of threads to use throughout (multi-threaded build of) library, or if ``0`` uses the maximum number of threads available according to OpenMP. In the positive case, no cap is placed on this number. This number of threads is passed to bin-sorting (which may choose to use less threads), but is adhered to in FFTW and spreading/interpolation steps. This number of threads (or 1 for single-threaded builds) also controls the batch size for vectorized transforms (ie ``ntr>1`` :ref:`here <c>`).
+For medium-to-large transforms, ``0`` is usually recommended.
+However, for (repeated) small transforms it can be advantageous to use a small number, even as small as ``1``.
 
 **fftw**: FFTW planner flags. This number is simply passed to FFTW's planner;
 the flags are documented `here <http://www.fftw.org/fftw3_doc/Planner-Flags.html#Planner-Flags>`_.

From ecb395ce54388cb3dd4c9672a7cb88461e9f1c7b Mon Sep 17 00:00:00 2001
From: ahbarnett <alex.h.barnett@gmail.com>
Date: Tue, 7 May 2024 21:26:56 -0400
Subject: [PATCH 3/4] redo sort logic so spopts.sort_threads is higher-priority
 than spopts.nthreads (which also does not override the heuristic, merely
 overrides omp as the max possible)

---
 src/spreadinterp.cpp | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 9898b3982..e36c8f749 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -251,7 +251,7 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
                    ordering for the x-coords of NU pts, etc.
     returned value - whether a sort was done (1) or not (0).
 
-   Barnett 2017; split out by Melody Shih, Jun 2018.
+   Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024.
 */
 {
   CNTime timer;
@@ -266,23 +266,26 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
 
   timer.start();                 // if needed, sort all the NU pts...
   int did_sort=0;
-  int maxnthr = MY_OMP_GET_MAX_THREADS();
-  if (opts.sort_threads>0)           // user override, now without limit
-    maxnthr = opts.sort_threads;     // maxnthr = the max threads sorting could use
+  int maxnthr = MY_OMP_GET_MAX_THREADS();  // used if both below opts default
+  if (opts.nthreads>0)
+    maxnthr = opts.nthreads;         // user nthreads overrides, without limit
+  if (opts.sort_threads>0)
+    maxnthr = opts.sort_threads;     // high-priority override, also no limit
+  // At this point: maxnthr = the max threads sorting could use
   // (we don't print warning here, since: no showwarn in spread_opts, and finufft
   // already warned about it. spreadinterp-only advanced users will miss a warning)
   if (opts.sort==1 || (opts.sort==2 && better_to_sort)) {
     // store a good permutation ordering of all NU pts (dim=1,2 or 3)
     int sort_debug = (opts.debug>=2);    // show timing output?
-    int sort_nthr = opts.sort_threads;   // 0, or proposed max # threads for sorting
+    int sort_nthr = opts.sort_threads;   // 0, or user max # threads for sort
 #ifndef _OPENMP
-    sort_nthr = 1;                   // if single-threaded lib, override user
+    sort_nthr = 1;                       // if single-threaded lib, override user
 #endif
     if (sort_nthr==0)   // multithreaded auto choice: when N>>M, one thread is better!
-      sort_nthr = (10*M>N) ? maxnthr : 1;      // heuristic
+      sort_nthr = (10*M>N) ? maxnthr : 1;     // heuristic
     if (sort_nthr==1)
       bin_sort_singlethread(sort_indices,M,kx,ky,kz,N1,N2,N3,opts.pirange,bin_size_x,bin_size_y,bin_size_z,sort_debug);
-    else                                      // sort_nthr>1, sets # threads
+    else                                      // sort_nthr>1, user fixes # threads (>=2)
       bin_sort_multithread(sort_indices,M,kx,ky,kz,N1,N2,N3,opts.pirange,bin_size_x,bin_size_y,bin_size_z,sort_debug,sort_nthr);
     if (opts.debug) 
       printf("\tsorted (%d threads):\t%.3g s\n",sort_nthr,timer.elapsedsec());
@@ -331,7 +334,7 @@ int spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3,
   if (opts.nthreads>0)
     nthr = opts.nthreads;       // user override, now without limit
 #ifndef _OPENMP
-  nthr = 1;                   // if single-threaded lib, override user
+  nthr = 1;                   // single-threaded lib must override user
 #endif
   if (opts.debug)
     printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld; pir=%d), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,opts.pirange,nthr);
@@ -456,7 +459,7 @@ int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3,
   if (opts.nthreads>0)
     nthr = opts.nthreads;       // user override, now without limit
 #ifndef _OPENMP
-  nthr = 1;                   // if single-threaded lib, override user
+  nthr = 1;                   // single-threaded lib must override user
 #endif
   if (opts.debug)
     printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld; pir=%d), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,opts.pirange,nthr);

From 2124cbeae17e78d7f55d96e716ad3e1e1c9b5aa0 Mon Sep 17 00:00:00 2001
From: Libin Lu <llu@flatironinstitute.org>
Date: Thu, 9 May 2024 17:11:53 -0400
Subject: [PATCH 4/4] make gh runner image happy

---
 .github/workflows/cmake_ci.yml     | 5 +++++
 .github/workflows/python_wheel.yml | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/cmake_ci.yml b/.github/workflows/cmake_ci.yml
index ed92e1cd5..c0f000e86 100644
--- a/.github/workflows/cmake_ci.yml
+++ b/.github/workflows/cmake_ci.yml
@@ -38,6 +38,11 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
+      - name: Unlink gcc
+        if: runner.os == 'macOS'
+        run: |
+          brew unlink gcc
+
       - name: Setup Cpp
         uses: aminya/setup-cpp@v1
         with:
diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml
index 5a84dd5a9..6843cadf7 100644
--- a/.github/workflows/python_wheel.yml
+++ b/.github/workflows/python_wheel.yml
@@ -46,7 +46,8 @@ jobs:
 
     - name: Install gcc and fftw
       run: |
-        brew install gcc fftw
+        brew unlink gcc
+        brew install gcc@13 fftw
         cp make.inc.macosx_gcc-12 make.inc
         echo "FC=gfortran-13" >> make.inc
         echo "CC=gcc-13" >> make.inc