From c0845c94a9c3b3078f5d6e195419b85125fe9ff6 Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Tue, 7 May 2024 20:33:43 -0400 Subject: [PATCH 1/4] rework nthr logic replacing #431: no cap on user o.nthreads override; ifndef _OPENMP hard nthr=1 override; warnings at the finufft level only; binsort controlled by spopts.sort_threads matching docs --- CHANGELOG | 5 +++-- src/finufft.cpp | 19 +++++++++++++------ src/spreadinterp.cpp | 30 ++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 14c04dceb..893ea124d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,8 +1,9 @@ List of features / changes made / release notes, in reverse chronological order. If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). -* CPU plan stage prevents now caps # threads at omp_get_max_threads (being 1 - for single-thread build); warns if this cap was activated (PR 431) +* CPU plan stage allows any # threads, warns if > omp_get_max_threads(); or + if single-threaded fixes nthr=1 and warns opts.nthreads>1 attempt. + Sort now respects spread_opts.sort_threads not nthreads. Supercedes PR 431. * new docs troubleshooting accuracy limitations due to condition number of the NUFFT problem. * new sanity check on nj and nk (<0 or too big); new err code, tester, doc. diff --git a/src/finufft.cpp b/src/finufft.cpp index a05e1c02e..1b2c4ef4d 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -591,15 +591,22 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, p->fftSign = (iflag>=0) ? 1 : -1; // clean up flag input // choose overall # threads... - int maxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = maxnthr; // use as many as OMP gives us +#ifdef _OPENMP + int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); + int nthr = ompmaxnthr; // default: use as many as OMP gives us + // (the above could be set, or suggested set, to 1 for small enough problems...) if (p->opts.nthreads>0) { - nthr = min(maxnthr,p->opts.nthreads); // user override up to max avail - if (p->opts.nthreads > maxnthr) // if no OMP, maxnthr=1 - fprintf(stderr,"%s warning: user requested %d threads, but only %d threads available; enforcing nthreads=%d.\n",__func__,p->opts.nthreads,maxnthr,nthr); + nthr = p->opts.nthreads; // user override, now without limit + if (p->opts.showwarn && (nthr > ompmaxnthr)) + fprintf(stderr,"%s warning: using opts.nthreads=%d, more than the %d OpenMP claims available; note large nthreads can be slower.\n",__func__,nthr,ompmaxnthr); } +#else + int nthr = 1; // always 1 thread (avoid segfault) + if (p->opts.nthreads>1) + fprintf(stderr,"%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",__func__,p->opts.nthreads); +#endif p->opts.nthreads = nthr; // store actual # thr planned for - // (this sets all downstream spread/interp, 1dkernel, and FFT thread counts) + // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) if (p->opts.maxbatchsize==0) { // logic to auto-set best batchsize diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 8d6e34d96..9898b3982 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -267,14 +267,18 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, timer.start(); // if needed, sort all the NU pts... int did_sort=0; int maxnthr = MY_OMP_GET_MAX_THREADS(); - if (opts.nthreads>0) // user override up to max avail - maxnthr = min(maxnthr,opts.nthreads); - + if (opts.sort_threads>0) // user override, now without limit + maxnthr = opts.sort_threads; // maxnthr = the max threads sorting could use + // (we don't print warning here, since: no showwarn in spread_opts, and finufft + // already warned about it. spreadinterp-only advanced users will miss a warning) if (opts.sort==1 || (opts.sort==2 && better_to_sort)) { // store a good permutation ordering of all NU pts (dim=1,2 or 3) int sort_debug = (opts.debug>=2); // show timing output? - int sort_nthr = opts.sort_threads; // choose # threads for sorting - if (sort_nthr==0) // use auto choice: when N>>M, one thread is better! + int sort_nthr = opts.sort_threads; // 0, or proposed max # threads for sorting +#ifndef _OPENMP + sort_nthr = 1; // if single-threaded lib, override user +#endif + if (sort_nthr==0) // multithreaded auto choice: when N>>M, one thread is better! sort_nthr = (10*M>N) ? maxnthr : 1; // heuristic if (sort_nthr==1) bin_sort_singlethread(sort_indices,M,kx,ky,kz,N1,N2,N3,opts.pirange,bin_size_x,bin_size_y,bin_size_z,sort_debug); @@ -323,9 +327,12 @@ int spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, int ndims = ndims_from_Ns(N1,N2,N3); BIGINT N=N1*N2*N3; // output array size int ns=opts.nspread; // abbrev. for w, kernel width - int nthr = MY_OMP_GET_MAX_THREADS(); // # threads to use to spread + int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread if (opts.nthreads>0) - nthr = min(nthr,opts.nthreads); // user override up to max avail + nthr = opts.nthreads; // user override, now without limit +#ifndef _OPENMP + nthr = 1; // if single-threaded lib, override user +#endif if (opts.debug) printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld; pir=%d), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,opts.pirange,nthr); @@ -445,9 +452,12 @@ int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, int ndims = ndims_from_Ns(N1,N2,N3); int ns=opts.nspread; // abbrev. for w, kernel width FLT ns2 = (FLT)ns/2; // half spread width, used as stencil shift - int nthr = MY_OMP_GET_MAX_THREADS(); // # threads to use to interp + int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp if (opts.nthreads>0) - nthr = min(nthr,opts.nthreads); // user override up to max avail + nthr = opts.nthreads; // user override, now without limit +#ifndef _OPENMP + nthr = 1; // if single-threaded lib, override user +#endif if (opts.debug) printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld; pir=%d), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,opts.pirange,nthr); @@ -1292,7 +1302,7 @@ void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, nbins2 = isky ? N2/bin_size_y+1 : 1; nbins3 = iskz ? N3/bin_size_z+1 : 1; BIGINT nbins = nbins1*nbins2*nbins3; - if (nthr==0) + if (nthr==0) // should never happen in spreadinterp use fprintf(stderr,"[%s] nthr (%d) must be positive!\n",__func__,nthr); int nt = min(M,(BIGINT)nthr); // handle case of less points than threads std::vector brk(nt+1); // list of start NU pt indices per thread From 4d7d25704afd3c51a0346d47b2b6d1d8d0a90f16 Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Tue, 7 May 2024 20:43:06 -0400 Subject: [PATCH 2/4] docs opts.rst to match new logic and advice for nthreads --- docs/opts.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/opts.rst b/docs/opts.rst index 6df982fe0..e56cf5649 100644 --- a/docs/opts.rst +++ b/docs/opts.rst @@ -128,7 +128,9 @@ Diagnostic options Algorithm performance options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -**nthreads**: Number of threads to use. This is capped at the number of available threads (eg, to prevent misuse of a single-threaded code). It then sets the number of threads FINUFFT will use in FFTW, bin-sorting, and spreading/interpolation steps. This number of threads also controls the batch size for vectorized transforms (ie ``ntr>1`` :ref:`here `). Setting ``nthreads=0`` uses all threads available, usually recommended. However, for repeated small problems it can be advantageous to use a small number, even as small as 1. +**nthreads**: (Ignored in single-threaded library builds.) If positive, sets the number of threads to use throughout (multi-threaded build of) library, or if ``0`` uses the maximum number of threads available according to OpenMP. In the positive case, no cap is placed on this number. This number of threads is passed to bin-sorting (which may choose to use less threads), but is adhered to in FFTW and spreading/interpolation steps. This number of threads (or 1 for single-threaded builds) also controls the batch size for vectorized transforms (ie ``ntr>1`` :ref:`here `). +For medium-to-large transforms, ``0`` is usually recommended. +However, for (repeated) small transforms it can be advantageous to use a small number, even as small as ``1``. **fftw**: FFTW planner flags. This number is simply passed to FFTW's planner; the flags are documented `here `_. From ecb395ce54388cb3dd4c9672a7cb88461e9f1c7b Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Tue, 7 May 2024 21:26:56 -0400 Subject: [PATCH 3/4] redo sort logic so spopts.sort_threads is higher-priority than spopts.nthreads (which also does not override the heuristic, merely overrides omp as the max possible) --- src/spreadinterp.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 9898b3982..e36c8f749 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -251,7 +251,7 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, ordering for the x-coords of NU pts, etc. returned value - whether a sort was done (1) or not (0). - Barnett 2017; split out by Melody Shih, Jun 2018. + Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024. */ { CNTime timer; @@ -266,23 +266,26 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, timer.start(); // if needed, sort all the NU pts... int did_sort=0; - int maxnthr = MY_OMP_GET_MAX_THREADS(); - if (opts.sort_threads>0) // user override, now without limit - maxnthr = opts.sort_threads; // maxnthr = the max threads sorting could use + int maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default + if (opts.nthreads>0) + maxnthr = opts.nthreads; // user nthreads overrides, without limit + if (opts.sort_threads>0) + maxnthr = opts.sort_threads; // high-priority override, also no limit + // At this point: maxnthr = the max threads sorting could use // (we don't print warning here, since: no showwarn in spread_opts, and finufft // already warned about it. spreadinterp-only advanced users will miss a warning) if (opts.sort==1 || (opts.sort==2 && better_to_sort)) { // store a good permutation ordering of all NU pts (dim=1,2 or 3) int sort_debug = (opts.debug>=2); // show timing output? - int sort_nthr = opts.sort_threads; // 0, or proposed max # threads for sorting + int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort #ifndef _OPENMP - sort_nthr = 1; // if single-threaded lib, override user + sort_nthr = 1; // if single-threaded lib, override user #endif if (sort_nthr==0) // multithreaded auto choice: when N>>M, one thread is better! - sort_nthr = (10*M>N) ? maxnthr : 1; // heuristic + sort_nthr = (10*M>N) ? maxnthr : 1; // heuristic if (sort_nthr==1) bin_sort_singlethread(sort_indices,M,kx,ky,kz,N1,N2,N3,opts.pirange,bin_size_x,bin_size_y,bin_size_z,sort_debug); - else // sort_nthr>1, sets # threads + else // sort_nthr>1, user fixes # threads (>=2) bin_sort_multithread(sort_indices,M,kx,ky,kz,N1,N2,N3,opts.pirange,bin_size_x,bin_size_y,bin_size_z,sort_debug,sort_nthr); if (opts.debug) printf("\tsorted (%d threads):\t%.3g s\n",sort_nthr,timer.elapsedsec()); @@ -331,7 +334,7 @@ int spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, if (opts.nthreads>0) nthr = opts.nthreads; // user override, now without limit #ifndef _OPENMP - nthr = 1; // if single-threaded lib, override user + nthr = 1; // single-threaded lib must override user #endif if (opts.debug) printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld; pir=%d), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,opts.pirange,nthr); @@ -456,7 +459,7 @@ int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, if (opts.nthreads>0) nthr = opts.nthreads; // user override, now without limit #ifndef _OPENMP - nthr = 1; // if single-threaded lib, override user + nthr = 1; // single-threaded lib must override user #endif if (opts.debug) printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld; pir=%d), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,opts.pirange,nthr); From 2124cbeae17e78d7f55d96e716ad3e1e1c9b5aa0 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Thu, 9 May 2024 17:11:53 -0400 Subject: [PATCH 4/4] make gh runner image happy --- .github/workflows/cmake_ci.yml | 5 +++++ .github/workflows/python_wheel.yml | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cmake_ci.yml b/.github/workflows/cmake_ci.yml index ed92e1cd5..c0f000e86 100644 --- a/.github/workflows/cmake_ci.yml +++ b/.github/workflows/cmake_ci.yml @@ -38,6 +38,11 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Unlink gcc + if: runner.os == 'macOS' + run: | + brew unlink gcc + - name: Setup Cpp uses: aminya/setup-cpp@v1 with: diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml index 5a84dd5a9..6843cadf7 100644 --- a/.github/workflows/python_wheel.yml +++ b/.github/workflows/python_wheel.yml @@ -46,7 +46,8 @@ jobs: - name: Install gcc and fftw run: | - brew install gcc fftw + brew unlink gcc + brew install gcc@13 fftw cp make.inc.macosx_gcc-12 make.inc echo "FC=gfortran-13" >> make.inc echo "CC=gcc-13" >> make.inc