diff --git a/perftest/CMakeLists.txt b/perftest/CMakeLists.txt index 7cb4ac30a..8ea2b3d78 100644 --- a/perftest/CMakeLists.txt +++ b/perftest/CMakeLists.txt @@ -1,5 +1,5 @@ # Each source test file is instantiated in single and double precision -set(PERFTESTS guru_timing_test manysmallprobs spreadtestnd) +set(PERFTESTS guru_timing_test manysmallprobs spreadtestnd spreadtestndall) foreach(TEST ${PERFTESTS}) add_executable(${TEST} ${TEST}.cpp) diff --git a/perftest/spreaderbench.py b/perftest/spreaderbench.py new file mode 100644 index 000000000..e093f16de --- /dev/null +++ b/perftest/spreaderbench.py @@ -0,0 +1,30 @@ +fast = 'new.txt' +slow = 'old.txt' + + +def read_data(filename): + data = [0] * 17 + with open(filename) as f1: + nspread = 0 + speed = 0 + for line in f1: + if 'nspread' in line: + nspread = int(line.split('=')[-1]) + if 'pts/s' in line: + speed = float(line.split(' ')[12]) + data[nspread] = speed + return data + +# compute relative increment in percentage between two numbers + + +vec = read_data(fast)[2:] +old = read_data(slow)[2:] + +# 1 : slow = x : fast +# x = (1 - slow/fast) * 100 +i = 2 +for vec, old in zip(vec, old): + diff = (1 - old/vec)*100 + print(f'nspread={i:02d} delta={diff:.3f}%') + i+=1 diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp new file mode 100644 index 000000000..55da3f978 --- /dev/null +++ b/perftest/spreadtestndall.cpp @@ -0,0 +1,276 @@ +#include +#include +#include + +#include +#include +#include +#include + +using namespace finufft::spreadinterp; +using namespace finufft::utils; // for timer + +void usage() { + printf("usage: spreadtestnd dims [M N [dir [sort [flags [debug [kerpad [kerevalmeth [upsampfac]]]]]]]]\n\twhere " + "dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform pts\n\tdir=spreader direction " + "[spread/interpolate]\n\tsort=0 (don't sort NU pts), 1 (do), or 2 (maybe sort; default)\n\tflags: expert " + "timing flags, 0 is default (see spreadinterp.h)\n\tdebug=0 (less text out), 1 (more), 2 (lots)\n\tkerpad=0 " + "(no pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), 1 (Horner " + "ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestndall 1 1e6 1e6 1 2 0 1\n"); +} + +int main(int argc, char *argv[]) +/* Test executable for the 1D, 2D, or 3D C++ spreader, both directions. + * It checks speed, and basic correctness via the grid sum of the result. + * See usage() for usage. Note it currently tests only pirange=0, which is not + * the use case in finufft, and can differ in speed (see devel/foldrescale*) + * + * Example: spreadtestndall 3 1e7 1e7 1 1 + * + * Compilation (also check ../makefile): + * g++ spreadtestndall.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestndall -fPIC -Ofast -funroll-loops + * -fopenmp + * + */ +{ + int d = 3; // Cmd line args & their defaults: default #dims + double w; + int dir = 1; // default (eg 1e-6 has nspread=7) + BIGINT M = 1e6; // default # NU pts + BIGINT roughNg = 1e6; // default # U pts + int sort = 2; // spread_sort + int flags = 0; // default + int debug = 0; // default + int kerpad = 0; // default + int kerevalmeth = 1; // default: Horner + FLT upsampfac = 2.0; // standard + + if (argc < 2 || argc == 3 || argc > 11) { + usage(); + return (argc > 1); + } + sscanf(argv[1], "%d", &d); + if (d < 1 || d > 3) { + printf("d must be 1, 2 or 3!\n"); + usage(); + return 1; + } + if (argc > 2) { + sscanf(argv[2], "%lf", &w); + M = (BIGINT)w; // to read "1e6" right! + if (M < 1) { + printf("M (# NU pts) must be positive!\n"); + usage(); + return 1; + } + sscanf(argv[3], "%lf", &w); + roughNg = (BIGINT)w; + if (roughNg < 1) { + printf("N (# U pts) must be positive!\n"); + usage(); + return 1; + } + } + if (argc > 4) + sscanf(argv[4], "%d", &dir); + if (argc > 5) { + sscanf(argv[5], "%d", &sort); + if ((sort != 0) && (sort != 1) && (sort != 2)) { + printf("sort must be 0, 1 or 2!\n"); + usage(); + return 1; + } + } + if (argc > 6) + sscanf(argv[6], "%d", &flags); + if (argc > 7) { + sscanf(argv[7], "%d", &debug); + if ((debug < 0) || (debug > 2)) { + printf("debug must be 0, 1 or 2!\n"); + usage(); + return 1; + } + } + if (argc > 8) { + sscanf(argv[8], "%d", &kerpad); + if ((kerpad < 0) || (kerpad > 1)) { + printf("kerpad must be 0 or 1!\n"); + usage(); + return 1; + } + } + if (argc > 9) { + sscanf(argv[9], "%d", &kerevalmeth); + if ((kerevalmeth < 0) || (kerevalmeth > 1)) { + printf("kerevalmeth must be 0 or 1!\n"); + usage(); + return 1; + } + } + if (argc > 10) { + sscanf(argv[10], "%lf", &w); + upsampfac = (FLT)w; + if (upsampfac <= 1.0) { + printf("upsampfac must be >1.0!\n"); + usage(); + return 1; + } + } + + BIGINT N = (BIGINT)round(pow(roughNg, 1.0 / d)); // Fourier grid size per dim + BIGINT Ng = (BIGINT)pow(N, d); // actual total grid points + BIGINT N2 = (d >= 2) ? N : 1, N3 = (d == 3) ? N : 1; // the y and z grid sizes + std::vector kx(M), ky(1), kz(1), d_nonuniform(2 * M); // NU, Re & Im + if (d > 1) + ky.resize(M); // only alloc needed coords + if (d > 2) + kz.resize(M); + std::vector d_uniform(2 * Ng); // Re and Im + + finufft_spread_opts opts; + const auto max_digits = []() { + if (std::is_same::value) { + return 17; + } else { + return 9; + } + }(); + for (int digits = 2; digits < max_digits; digits++) { + const auto tol = 10.0 * pow(10.0, -digits); + printf("digits=%d, tol = %.3g\n", digits, FLT(tol)); + int ier_set = setup_spreader(opts, tol, upsampfac, kerevalmeth, debug, 1, d); + + if (ier_set > 1) { // exit gracefully if can't set up. + printf("error when setting up spreader (ier_set=%d)!\n", ier_set); + return ier_set; + } + opts.debug = debug; // print more diagnostics? + opts.sort = sort; + opts.flags = flags; + opts.kerpad = kerpad; + opts.upsampfac = upsampfac; + opts.nthreads = 0; // max # threads used, or 0 to use what's avail + opts.sort_threads = 0; + opts.kerpad = 0; + // opts.max_subproblem_size = 1e5; + FLT maxerr, ansmod; + + // spread a single source, only for reference accuracy check... + opts.spread_direction = 1; + + d_nonuniform[0] = 1.0; + d_nonuniform[1] = 0.0; // unit strength + kx[0] = ky[0] = kz[0] = M_PI / 2.0; // at center + int ier = spreadinterp(N, N2, N3, d_uniform.data(), 1, kx.data(), ky.data(), kz.data(), d_nonuniform.data(), + opts); // vector::data officially C++11 but works + if (ier != 0) { + printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n", ier); + return ier; + } + FLT kersumre = 0.0, kersumim = 0.0; // sum kernel on uniform grid + for (BIGINT i = 0; i < Ng; ++i) { + kersumre += d_uniform[2 * i]; + kersumim += d_uniform[2 * i + 1]; // in case the kernel isn't real! + } + + // now do the large-scale test w/ random sources.. + printf("making random data...\n"); + FLT strre = 0.0, strim = 0.0; // also sum the strengths +#pragma omp parallel + { + unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s +#pragma omp for schedule(dynamic, 1000000) reduction(+ : strre, strim) + for (BIGINT i = 0; i < M; ++i) { + kx[i] = randm11r(&se) * 3 * M_PI; + // kx[i]=2.0*kx[i] - 50.0; //// to test folding within +-1 period + if (d > 1) + ky[i] = randm11r(&se) * 3 * M_PI; // only fill needed coords + if (d > 2) + kz[i] = randm11r(&se) * 3 * M_PI; + d_nonuniform[i * 2] = randm11r(&se); + d_nonuniform[i * 2 + 1] = randm11r(&se); + strre += d_nonuniform[2 * i]; + strim += d_nonuniform[2 * i + 1]; + } + } + CNTime timer{}; + double t; + if (dir == 1) { // test direction 1 (NU -> U spreading) ...................... + printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", d, (double)Ng, opts.spread_direction, + tol, opts.nspread); + timer.start(); + ier = spreadinterp(N, N2, N3, d_uniform.data(), M, kx.data(), ky.data(), kz.data(), d_nonuniform.data(), + opts); + t = timer.elapsedsec(); + if (ier != 0) { + printf("error (ier=%d)!\n", ier); + return ier; + } else + printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n", (double)M, t, M / t, + pow(opts.nspread, d) * M / t); + + FLT sumre = 0.0, sumim = 0.0; // check spreading accuracy, wrapping +#pragma omp parallel for reduction(+ : sumre, sumim) + for (BIGINT i = 0; i < Ng; ++i) { + sumre += d_uniform[2 * i]; + sumim += d_uniform[2 * i + 1]; + } + FLT pre = kersumre * strre - kersumim * strim; // pred ans, complex mult + FLT pim = kersumim * strre + kersumre * strim; + FLT maxerr = std::max(fabs(sumre - pre), fabs(sumim - pim)); + FLT ansmod = sqrt(sumre * sumre + sumim * sumim); + printf(" rel err in total over grid: %.3g\n", maxerr / ansmod); + // note this is weaker than below dir=2 test, but is good indicator that + // periodic wrapping is correct + } + // test direction 2 (U -> NU interpolation) .............................. + if (dir == 2) { + printf("making more random NU pts...\n"); + for (BIGINT i = 0; i < Ng; ++i) { // unit grid data + d_uniform[2 * i] = 1.0; + d_uniform[2 * i + 1] = 0.0; + } +#pragma omp parallel + { + unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s +#pragma omp for schedule(dynamic, 1000000) + for (BIGINT i = 0; i < M; ++i) { // random target pts + // kx[i]=10+.9*rand01r(&s)*N; // or if want to keep ns away from edges + kx[i] = randm11r(&se) * 3 * M_PI; + if (d > 1) + ky[i] = randm11r(&se) * 3 * M_PI; + if (d > 2) + kz[i] = randm11r(&se) * 3 * M_PI; + } + } + + opts.spread_direction = 2; + printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", d, (double)Ng, opts.spread_direction, + tol, opts.nspread); + timer.restart(); + ier = spreadinterp(N, N2, N3, d_uniform.data(), M, kx.data(), ky.data(), kz.data(), d_nonuniform.data(), + opts); + t = timer.elapsedsec(); + if (ier != 0) { + printf("error (ier=%d)!\n", ier); + return 1; + } else + printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n", (double)M, t, M / t, + pow(opts.nspread, d) * M / t); + + // math test is worst-case error from pred value (kersum) on interp pts: + maxerr = 0.0; + for (BIGINT i = 0; i < M; ++i) { + FLT err = std::max(fabs(d_nonuniform[2 * i] - kersumre), fabs(d_nonuniform[2 * i + 1] - kersumim)); + if (err > maxerr) + maxerr = err; + } + ansmod = sqrt(kersumre * kersumre + kersumim * kersumim); + printf(" max rel err in values at NU pts: %.3g\n", maxerr / ansmod); + // this is stronger test than for dir=1, since it tests sum of kernel for + // each NU pt. However, it cannot detect reading + // from wrong grid pts (they are all unity) + } + } + return 0; +}